In [1]:
# Run this notebook after running Mother's Education and Father's education

In [2]:
# Libraries
import pandas as pd
import numpy as np


import warnings
warnings.filterwarnings('ignore')

In [3]:
# Read the data
df = pd.read_csv('meduc.csv')

In [4]:
# Number of missing values per column
print(df.isnull().sum())

id           0
year         0
wage         0
hours        0
emp          0
treat        0
female       0
IQ           0
KWW          0
educ         0
exper        0
tenure       0
age          0
married      0
black        0
south        0
urban        0
sibs         0
brthord    188
meduc        0
feduc        0
dtype: int64


# Birth Order 

In [5]:
new_df = df.dropna(subset=['brthord'])

In [6]:
print(new_df.isnull().sum())

id         0
year       0
wage       0
hours      0
emp        0
treat      0
female     0
IQ         0
KWW        0
educ       0
exper      0
tenure     0
age        0
married    0
black      0
south      0
urban      0
sibs       0
brthord    0
meduc      0
feduc      0
dtype: int64


## OLS

### Moderate Dimension

In [7]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error

# Seperate independent and dependent variables
x_brthord = new_df[["sibs", "age", "married", "female", "IQ", "black", "south", "urban"]]
y_brthord = new_df[['brthord']]

# Split the data into training and testing sets
x_train, x_test, y_train, y_test = train_test_split(x_brthord, y_brthord, test_size=0.2, random_state=42)

# Create a Linear Regression model
model = LinearRegression()

# Fit the model using the training data
model.fit(x_train, y_train)

# Predict the target variable for the training and testing data
y_train_pred = model.predict(x_train)
y_test_pred = model.predict(x_test)

# Calculate in-sample R2 and MSE
in_sample_r2 = r2_score(y_train, y_train_pred)
in_sample_mse = mean_squared_error(y_train, y_train_pred)

# Calculate out-of-sample R2 and MSE
out_of_sample_r2 = r2_score(y_test, y_test_pred)
out_of_sample_mse = mean_squared_error(y_test, y_test_pred)

print("In-sample R2:", in_sample_r2)
print("In-sample MSE:", in_sample_mse)
print("Out-of-sample R2:", out_of_sample_r2)
print("Out-of-sample MSE:", out_of_sample_mse)


In-sample R2: 0.4047990135880557
In-sample MSE: 1.5458971617665145
Out-of-sample R2: 0.36172683150172047
Out-of-sample MSE: 1.6410282923828774


### High Dimension (up to degree 3)

In [8]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import r2_score, mean_squared_error

# Seperate independent and dependent variables
x_brthord = new_df[["sibs", "age", "married", "female", "IQ", "black", "south", "urban"]]
y_brthord = new_df[['brthord']]

# Split the data into training and testing sets
x_train, x_test, y_train, y_test = train_test_split(x_brthord, y_brthord, test_size=0.2, random_state=42)

# Create polynomial features including all possible interactions up to degree 3
poly_features = PolynomialFeatures(degree=3, include_bias=False)
x_train_poly = poly_features.fit_transform(x_train)
x_test_poly = poly_features.transform(x_test)

# Create a Linear Regression model
model = LinearRegression()

# Fit the model using the training data with polynomial features
model.fit(x_train_poly, y_train)

# Predict the target variable for the training and testing data with polynomial features
y_train_pred = model.predict(x_train_poly)
y_test_pred = model.predict(x_test_poly)

# Calculate in-sample R2 and MSE with polynomial features
in_sample_r2 = r2_score(y_train, y_train_pred)
in_sample_mse = mean_squared_error(y_train, y_train_pred)

# Calculate out-of-sample R2 and MSE with polynomial features
out_of_sample_r2 = r2_score(y_test, y_test_pred)
out_of_sample_mse = mean_squared_error(y_test, y_test_pred)

print("In-sample R2 with polynomial features:", in_sample_r2)
print("In-sample MSE with polynomial features:", in_sample_mse)
print("Out-of-sample R2 with polynomial features:", out_of_sample_r2)
print("Out-of-sample MSE with polynomial features:", out_of_sample_mse)


In-sample R2 with polynomial features: 0.5392618763936858
In-sample MSE with polynomial features: 1.19666091599464
Out-of-sample R2 with polynomial features: 0.342603477277278
Out-of-sample MSE with polynomial features: 1.6901952742903337


## LASSO (alpha = 0.1)

### Moderate Dimension

In [9]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Lasso
from sklearn.metrics import r2_score, mean_squared_error

# Seperate independent and dependent variables
x_brthord = new_df[["sibs", "age", "married", "female", "IQ", "black", "south", "urban"]]
y_brthord = new_df[['brthord']]

# Split the data into training and testing sets
x_train, x_test, y_train, y_test = train_test_split(x_brthord, y_brthord, test_size=0.2, random_state=42)

# Create a LASSO model
model = Lasso(alpha = 0.1)

# Fit the model using the training data
model.fit(x_train, y_train)

# Predict the target variable for the training and testing data
y_train_pred = model.predict(x_train)
y_test_pred = model.predict(x_test)

# Calculate in-sample R2 and MSE
in_sample_r2 = r2_score(y_train, y_train_pred)
in_sample_mse = mean_squared_error(y_train, y_train_pred)

# Calculate out-of-sample R2 and MSE
out_of_sample_r2 = r2_score(y_test, y_test_pred)
out_of_sample_mse = mean_squared_error(y_test, y_test_pred)

print("In-sample R2:", in_sample_r2)
print("In-sample MSE:", in_sample_mse)
print("Out-of-sample R2:", out_of_sample_r2)
print("Out-of-sample MSE:", out_of_sample_mse)


In-sample R2: 0.3852634668835623
In-sample MSE: 1.5966362346065786
Out-of-sample R2: 0.359702742755576
Out-of-sample MSE: 1.6462323132671204


### High Dimension (up to degree 3)

In [10]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Lasso
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import r2_score, mean_squared_error

# Seperate independent and dependent variables
x_brthord = new_df[["sibs", "age", "married", "female", "IQ", "black", "south", "urban"]]
y_brthord = new_df[['brthord']]

# Split the data into training and testing sets
x_train, x_test, y_train, y_test = train_test_split(x_brthord, y_brthord, test_size=0.2, random_state=42)

# Create polynomial features including all possible interactions up to degree 3
poly_features = PolynomialFeatures(degree=3, include_bias=False)
x_train_poly = poly_features.fit_transform(x_train)
x_test_poly = poly_features.transform(x_test)

# Create a Linear Regression model
model = Lasso(alpha=0.1)

# Fit the model using the training data with polynomial features
model.fit(x_train_poly, y_train)

# Predict the target variable for the training and testing data with polynomial features
y_train_pred = model.predict(x_train_poly)
y_test_pred = model.predict(x_test_poly)

# Calculate in-sample R2 and MSE with polynomial features
in_sample_r2 = r2_score(y_train, y_train_pred)
in_sample_mse = mean_squared_error(y_train, y_train_pred)

# Calculate out-of-sample R2 and MSE with polynomial features
out_of_sample_r2 = r2_score(y_test, y_test_pred)
out_of_sample_mse = mean_squared_error(y_test, y_test_pred)

print("In-sample R2 with polynomial features:", in_sample_r2)
print("In-sample MSE with polynomial features:", in_sample_mse)
print("Out-of-sample R2 with polynomial features:", out_of_sample_r2)
print("Out-of-sample MSE with polynomial features:", out_of_sample_mse)


In-sample R2 with polynomial features: 0.46970634684849555
In-sample MSE with polynomial features: 1.3773153472940147
Out-of-sample R2 with polynomial features: 0.34469556781484156
Out-of-sample MSE with polynomial features: 1.6848164178197635


## LASSO (Scaled)

### Moderate Dimension

In [11]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Lasso
from sklearn.metrics import r2_score, mean_squared_error

# Seperate independent and dependent variables
x_brthord = new_df[["sibs", "age", "married", "female", "IQ", "black", "south", "urban"]]
y_brthord = new_df[['brthord']]

# Split the data into training and testing sets
x_train, x_test, y_train, y_test = train_test_split(x_brthord, y_brthord, test_size=0.2, random_state=42)

# Create a StandardScaler object and fit it to the training data
scaler = StandardScaler()
x_train_scaled = scaler.fit_transform(x_train)
x_test_scaled = scaler.transform(x_test)

# Create a LASSO Regression model
model = Lasso(alpha=0.1)  # Adjust the alpha value for regularization strength

# Fit the model using the scaled training data
model.fit(x_train_scaled, y_train)

# Predict the target variable for the training and testing data
y_train_pred = model.predict(x_train_scaled)
y_test_pred = model.predict(x_test_scaled)

# Calculate in-sample R2 and MSE
in_sample_r2 = r2_score(y_train, y_train_pred)
in_sample_mse = mean_squared_error(y_train, y_train_pred)

# Calculate out-of-sample R2 and MSE
out_of_sample_r2 = r2_score(y_test, y_test_pred)
out_of_sample_mse = mean_squared_error(y_test, y_test_pred)

print("In-sample R2 (LASSO with scaled features):", in_sample_r2)
print("In-sample MSE (LASSO with scaled features):", in_sample_mse)
print("Out-of-sample R2 (LASSO with scaled features):", out_of_sample_r2)
print("Out-of-sample MSE (LASSO with scaled features):", out_of_sample_mse)

In-sample R2 (LASSO with scaled features): 0.3820456022372356
In-sample MSE (LASSO with scaled features): 1.6049938951905989
Out-of-sample R2 (LASSO with scaled features): 0.3557218576791876
Out-of-sample MSE (LASSO with scaled features): 1.6564673432848291


### High Dimension (up to degree 3)

In [12]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.linear_model import Lasso
from sklearn.metrics import r2_score, mean_squared_error

# Seperate independent and dependent variables
x_brthord = new_df[["sibs", "age", "married", "female", "IQ", "black", "south", "urban"]]
y_brthord = new_df[['brthord']]

# Split the data into training and testing sets
x_train, x_test, y_train, y_test = train_test_split(x_brthord, y_brthord, test_size=0.2, random_state=42)

# Create polynomial features including all possible interactions up to degree 3
poly_features = PolynomialFeatures(degree=3, include_bias=False)
x_train_poly = poly_features.fit_transform(x_train)
x_test_poly = poly_features.transform(x_test)

# Create a StandardScaler object and fit it to the training data
scaler = StandardScaler()
x_train_scaled = scaler.fit_transform(x_train_poly)
x_test_scaled = scaler.transform(x_test_poly)

# Create a LASSO Regression model
model = Lasso(alpha=0.1)  # Adjust the alpha value for regularization strength

# Fit the model using the scaled training data
model.fit(x_train_scaled, y_train)

# Predict the target variable for the training and testing data
y_train_pred = model.predict(x_train_scaled)
y_test_pred = model.predict(x_test_scaled)

# Calculate in-sample R2 and MSE
in_sample_r2 = r2_score(y_train, y_train_pred)
in_sample_mse = mean_squared_error(y_train, y_train_pred)

# Calculate out-of-sample R2 and MSE
out_of_sample_r2 = r2_score(y_test, y_test_pred)
out_of_sample_mse = mean_squared_error(y_test, y_test_pred)

print("In-sample R2 (LASSO with scaled features and interactions):", in_sample_r2)
print("In-sample MSE (LASSO with scaled features and interactions):", in_sample_mse)
print("Out-of-sample R2 (LASSO with scaled features and interactions):", out_of_sample_r2)
print("Out-of-sample MSE (LASSO with scaled features and interactions):", out_of_sample_mse)


In-sample R2 (LASSO with scaled features and interactions): 0.41047160894642964
In-sample MSE (LASSO with scaled features and interactions): 1.531163904825487
Out-of-sample R2 (LASSO with scaled features and interactions): 0.3789368062989372
Out-of-sample MSE (LASSO with scaled features and interactions): 1.5967806928482196


## LASSO (Cross-Validated)

### Moderate Dimension

In [13]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LassoCV
from sklearn.metrics import r2_score, mean_squared_error

# Seperate independent and dependent variables
x_brthord = new_df[["sibs", "age", "married", "female", "IQ", "black", "south", "urban"]]
y_brthord = new_df[['brthord']]

# Split the data into training and testing sets
x_train, x_test, y_train, y_test = train_test_split(x_brthord, y_brthord, test_size=0.2, random_state=42)

# Create a LASSO Regression model with cross-validated alpha tuning
model = LassoCV(cv=5)  # Adjust cv parameter for the number of cross-validation folds

# Fit the model using the training data
model.fit(x_train, y_train)

# Predict the target variable for the training and testing data
y_train_pred = model.predict(x_train)
y_test_pred = model.predict(x_test)

# Calculate in-sample R2 and MSE
in_sample_r2 = r2_score(y_train, y_train_pred)
in_sample_mse = mean_squared_error(y_train, y_train_pred)

# Calculate out-of-sample R2 and MSE
out_of_sample_r2 = r2_score(y_test, y_test_pred)
out_of_sample_mse = mean_squared_error(y_test, y_test_pred)

print("In-sample R2 (LASSO with alpha tuning):", in_sample_r2)
print("In-sample MSE (LASSO with alpha tuning):", in_sample_mse)
print("Out-of-sample R2 (LASSO with alpha tuning):", out_of_sample_r2)
print("Out-of-sample MSE (LASSO with alpha tuning):", out_of_sample_mse)

In-sample R2 (LASSO with alpha tuning): 0.4044155048568142
In-sample MSE (LASSO with alpha tuning): 1.5468932371640252
Out-of-sample R2 (LASSO with alpha tuning): 0.36492012163751597
Out-of-sample MSE (LASSO with alpha tuning): 1.6328182034785343


### High Dimension (up to degree 3)

In [14]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.linear_model import LassoCV
from sklearn.metrics import r2_score, mean_squared_error

# Seperate independent and dependent variables
x_brthord = new_df[["sibs", "age", "married", "female", "IQ", "black", "south", "urban"]]
y_brthord = new_df[['brthord']]

# Split the data into training and testing sets
x_train, x_test, y_train, y_test = train_test_split(x_brthord, y_brthord, test_size=0.2, random_state=42)

# Create polynomial features including all possible interactions up to degree 3
poly_features = PolynomialFeatures(degree=3, include_bias=False)
x_train_poly = poly_features.fit_transform(x_train)
x_test_poly = poly_features.transform(x_test)

# Create a StandardScaler object and fit it to the training data
scaler = StandardScaler()
x_train_scaled = scaler.fit_transform(x_train_poly)
x_test_scaled = scaler.transform(x_test_poly)

# Create a LASSO Regression model with cross-validated alpha tuning
model = LassoCV(cv=5, max_iter=10000)  # Adjust cv parameter for the number of cross-validation folds

# Fit the model using the scaled training data
model.fit(x_train_scaled, y_train)

# Predict the target variable for the training and testing data
y_train_pred = model.predict(x_train_scaled)
y_test_pred = model.predict(x_test_scaled)

# Calculate in-sample R2 and MSE
in_sample_r2 = r2_score(y_train, y_train_pred)
in_sample_mse = mean_squared_error(y_train, y_train_pred)

# Calculate out-of-sample R2 and MSE
out_of_sample_r2 = r2_score(y_test, y_test_pred)
out_of_sample_mse = mean_squared_error(y_test, y_test_pred)

print("In-sample R2 (LASSO with scaled features and interactions):", in_sample_r2)
print("In-sample MSE (LASSO with scaled features and interactions):", in_sample_mse)
print("Out-of-sample R2 (LASSO with scaled features and interactions):", out_of_sample_r2)
print("Out-of-sample MSE (LASSO with scaled features and interactions):", out_of_sample_mse)


In-sample R2 (LASSO with scaled features and interactions): 0.4874002455878892
In-sample MSE (LASSO with scaled features and interactions): 1.3313595299041534
Out-of-sample R2 (LASSO with scaled features and interactions): 0.3740535237823608
Out-of-sample MSE (LASSO with scaled features and interactions): 1.6093358262376662


## Post-LASSO

### Moderate Dimension

In [15]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LassoCV
import statsmodels.api as sm
from sklearn.metrics import r2_score, mean_squared_error

# Seperate independent and dependent variables
x_brthord = new_df[["sibs", "age", "married", "female", "IQ", "black", "south", "urban"]]
y_brthord = new_df[['brthord']]

# Split the data into training and testing sets
x_train, x_test, y_train, y_test = train_test_split(x_brthord, y_brthord, test_size=0.2, random_state=42)

# Create a StandardScaler object and fit it to the training data
scaler = StandardScaler()
x_train_scaled = scaler.fit_transform(x_train)
x_test_scaled = scaler.transform(x_test)

# Create a LASSO Regression model with cross-validated alpha tuning
model = LassoCV(cv=5)  # Adjust cv parameter for the number of cross-validation folds

# Fit the model using the scaled training data
model.fit(x_train_scaled, y_train)

# Perform post-LASSO inference
X_train_scaled_with_intercept = sm.add_constant(x_train_scaled)
model_sm = sm.OLS(y_train, X_train_scaled_with_intercept)
results = model_sm.fit_regularized(alpha=model.alpha_, L1_wt=1)

# Predict the target variable for the training and testing data
X_test_scaled_with_intercept = sm.add_constant(x_test_scaled)
y_train_pred = results.predict(X_train_scaled_with_intercept)
y_test_pred = results.predict(X_test_scaled_with_intercept)

# Calculate in-sample R2 and MSE
in_sample_r2 = r2_score(y_train, y_train_pred)
in_sample_mse = mean_squared_error(y_train, y_train_pred)

# Calculate out-of-sample R2 and MSE
out_of_sample_r2 = r2_score(y_test, y_test_pred)
out_of_sample_mse = mean_squared_error(y_test, y_test_pred)

print("In-sample R2 (Post-LASSO):", in_sample_r2)
print("In-sample MSE (Post-LASSO):", in_sample_mse)
print("Out-of-sample R2 (Post-LASSO):", out_of_sample_r2)
print("Out-of-sample MSE (Post-LASSO):", out_of_sample_mse)


In-sample R2 (Post-LASSO): 0.40479298489512383
In-sample MSE (Post-LASSO): 1.5459128199046985
Out-of-sample R2 (Post-LASSO): 0.3620820157699305
Out-of-sample MSE (Post-LASSO): 1.6401150980612154


### High Dimension (up to degree 3)

In [16]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.linear_model import LassoCV
import statsmodels.api as sm
from sklearn.metrics import r2_score, mean_squared_error

# Seperate independent and dependent variables
x_brthord = new_df[["sibs", "age", "married", "female", "IQ", "black", "south", "urban"]]
y_brthord = new_df[['brthord']]

# Split the data into training and testing sets
x_train, x_test, y_train, y_test = train_test_split(x_brthord, y_brthord, test_size=0.2, random_state=42)

# Create polynomial features including all possible interactions up to degree 3
poly_features = PolynomialFeatures(degree=3, include_bias=False)
x_train_poly = poly_features.fit_transform(x_train)
x_test_poly = poly_features.transform(x_test)

# Create a StandardScaler object and fit it to the training data with polynomial features
scaler = StandardScaler()
x_train_scaled = scaler.fit_transform(x_train_poly)
x_test_scaled = scaler.transform(x_test_poly)

# Create a LASSO Regression model with cross-validated alpha tuning
model = LassoCV(cv=5, max_iter=10000)  # Adjust cv parameter for the number of cross-validation folds

# Fit the model using the scaled training data
model.fit(x_train_scaled, y_train)

# Perform post-LASSO inference
X_train_scaled_with_intercept = sm.add_constant(x_train_scaled)
model_sm = sm.OLS(y_train, X_train_scaled_with_intercept)
results = model_sm.fit_regularized(alpha=model.alpha_, L1_wt=1)

# Predict the target variable for the training and testing data
X_test_scaled_with_intercept = sm.add_constant(x_test_scaled)
y_train_pred = results.predict(X_train_scaled_with_intercept)
y_test_pred = results.predict(X_test_scaled_with_intercept)

# Calculate in-sample R2 and MSE
in_sample_r2 = r2_score(y_train, y_train_pred)
in_sample_mse = mean_squared_error(y_train, y_train_pred)

# Calculate out-of-sample R2 and MSE
out_of_sample_r2 = r2_score(y_test, y_test_pred)
out_of_sample_mse = mean_squared_error(y_test, y_test_pred)

print("In-sample R2 (Post-LASSO with interactions):", in_sample_r2)
print("In-sample MSE (Post-LASSO with interactions):", in_sample_mse)
print("Out-of-sample R2 (Post-LASSO with interactions):", out_of_sample_r2)
print("Out-of-sample MSE (Post-LASSO with interactions):", out_of_sample_mse)


In-sample R2 (Post-LASSO with interactions): 0.48148956297653944
In-sample MSE (Post-LASSO with interactions): 1.3467111635230267
Out-of-sample R2 (Post-LASSO with interactions): 0.3608213065766117
Out-of-sample MSE (Post-LASSO with interactions): 1.6433564366554272


## Prediction

#### Best Performing Model is Cross-Validated LASSO with High Dimension up to degree 3

In [17]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.linear_model import LassoCV
from sklearn.metrics import r2_score, mean_squared_error

# Seperate independent and dependent variables
x_brthord = new_df[["sibs", "age", "married", "female", "IQ", "black", "south", "urban"]]
y_brthord = new_df[['brthord']]

# Split the data into training and testing sets
x_train, x_test, y_train, y_test = train_test_split(x_brthord, y_brthord, test_size=0.2, random_state=42)

# Create polynomial features including all possible interactions up to degree 3
poly_features = PolynomialFeatures(degree=3, include_bias=False)
x_train_poly = poly_features.fit_transform(x_train)
x_test_poly = poly_features.transform(x_test)

# Create a StandardScaler object and fit it to the training data
scaler = StandardScaler()
x_train_scaled = scaler.fit_transform(x_train_poly)
x_test_scaled = scaler.transform(x_test_poly)

# Create a LASSO Regression model with cross-validated alpha tuning
model = LassoCV(cv=5, max_iter=10000)  # Adjust cv parameter for the number of cross-validation folds

# Fit the model using the scaled training data
model.fit(x_train_scaled, y_train)

In [18]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LassoCV
from sklearn.preprocessing import StandardScaler

# Filter the rows with missing birthorder values
missing_rows = df[df['brthord'].isnull()]

# Extract the features for the missing rows
x_missing = missing_rows[["sibs", "age", "married", "female", "IQ", "black", "south", "urban"]]

# Create polynomial features with interactions up to degree 3
poly_features = PolynomialFeatures(degree=3, include_bias=False)
x_missing_interactions = poly_features.fit_transform(x_missing)

# Create a new DataFrame with the original and interaction features
missing_rows_interactions_df = pd.DataFrame(x_missing_interactions,
                                            columns=poly_features.get_feature_names_out(x_missing.columns))

# Scale the features
scaler = StandardScaler()
x_missing_scaled = scaler.fit_transform(missing_rows_interactions_df)

# Predict the birthorder for the missing rows using the trained LASSO model
birthorder_pred = model.predict(x_missing_scaled)

# Fill the missing values with the predicted birthorder values
missing_rows['brthord'] = birthorder_pred

missing_rows['brthord'] = round(missing_rows['brthord'])

In [19]:
# Set the index of both DataFrames to match on "id" and "year" columns
df = df.set_index(["id", "year"])
missing_rows = missing_rows.set_index(["id", "year"])

# Replace the missing values in df with values from missing_rows
df["brthord"] = df["brthord"].combine_first(missing_rows["brthord"])

# Reset the index
df = df.reset_index()

In [20]:
# Save the DataFrame as a CSV file
df.to_csv('filled_df.csv', index=False)

## Assumptions

### Sparsity

In [21]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Lasso
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import r2_score, mean_squared_error

# Seperate independent and dependent variables
x_brthord = new_df[["sibs", "age", "married", "female", "IQ", "black", "south", "urban"]]
y_brthord = new_df[['brthord']]

# Split the data into training and testing sets
x_train, x_test, y_train, y_test = train_test_split(x_brthord, y_brthord, test_size=0.2, random_state=42)

# Create polynomial features including all possible interactions up to degree 3
poly_features = PolynomialFeatures(degree=3, include_bias=False)
x_train_poly = poly_features.fit_transform(x_train)
x_test_poly = poly_features.transform(x_test)

# Create a LASSO model
model = LassoCV(cv=5, max_iter=100000)

# Fit the model using the training data with polynomial features
model.fit(x_train_poly, y_train)

# Retrieve the coefficients
coef = model.coef_

# Count number of coefficients that are exactly zero
n_zero_coef = np.sum(coef == 0)
total_coef = len(coef)

print("Total number of features: ", total_coef)
print("Number of features LASSO used (non-zero coefficient): ", total_coef - n_zero_coef)
print("Number of features LASSO did not use (zero coefficient): ", n_zero_coef)

Total number of features:  164
Number of features LASSO used (non-zero coefficient):  10
Number of features LASSO did not use (zero coefficient):  154


### Independence

In [22]:
import numpy as np
from statsmodels.stats.stattools import durbin_watson

# Predict the outcomes for the training data
y_train_pred = model.predict(x_train_poly)

# Compute the residuals
residuals = y_train.values.flatten() - y_train_pred

# Perform the Durbin-Watson test
dw_result = durbin_watson(residuals)

print('Durbin-Watson statistic:', dw_result)

Durbin-Watson statistic: 2.0312742054642707


## Sensitivity Analysis

### Sibs

In [23]:
import numpy as np

# Choose a random data point from the test set
data_point = x_test.iloc[1, :]

# Save the original prediction
original_prediction = model.predict(poly_features.transform(data_point.values.reshape(1, -1)))

# Save the original value of "sibs"
original_sibs = data_point["sibs"]

# Create an array to hold the changes in "sibs" and the corresponding changes in the prediction
changes = []

# Change "sibs" by -10% to +10%
for change in np.linspace(-0.10 * original_sibs, 0.10 * original_sibs, num=50):
    data_point["sibs"] += change
    new_prediction = model.predict(poly_features.transform(data_point.values.reshape(1, -1)))
    prediction_change = new_prediction - original_prediction
    changes.append((change, prediction_change))

# Convert the results to a DataFrame for easier viewing
sensitivity_df = pd.DataFrame(changes, columns=["Change in Sibs", "Change in Prediction"])

print(sensitivity_df)

    Change in Sibs    Change in Prediction
0        -0.200000  [-0.06736983671553709]
1        -0.191837  [-0.13198988417737878]
2        -0.183673  [-0.19386014238552507]
3        -0.175510  [-0.25298061133997596]
4        -0.167347  [-0.30935129104073167]
5        -0.159184  [-0.36297218148779153]
6        -0.151020  [-0.41384328268115644]
7        -0.142857   [-0.4619645946208255]
8        -0.134694   [-0.5073361173067994]
9        -0.126531   [-0.5499578507390781]
10       -0.118367   [-0.5898297949176614]
11       -0.110204   [-0.6269519498425491]
12       -0.102041   [-0.6613243155137414]
13       -0.093878   [-0.6929468919312383]
14       -0.085714     [-0.72181967909504]
15       -0.077551    [-0.747942677005146]
16       -0.069388    [-0.771315885661557]
17       -0.061224   [-0.7919393050642722]
18       -0.053061   [-0.8098129352132923]
19       -0.044898    [-0.824936776108617]
20       -0.036735   [-0.8373108277502463]
21       -0.028571   [-0.8469350901381802]
22       -0

### IQ

In [24]:
import numpy as np

# Choose a random data point from the test set
data_point = x_test.iloc[0, :]

# Save the original prediction
original_prediction = model.predict(poly_features.transform(data_point.values.reshape(1, -1)))

# Save the original value of "exper"
original_IQ = data_point["IQ"]

# Create an array to hold the changes in "IQ" and the corresponding changes in the prediction
changes = []

# Change "IQ" by -10% to +10%
for change in np.linspace(-0.10 * original_IQ, 0.10 * original_IQ, num=50):
    data_point["IQ"] += change
    new_prediction = model.predict(poly_features.transform(data_point.values.reshape(1, -1)))
    prediction_change = new_prediction - original_prediction
    changes.append((change, prediction_change))

# Convert the results to a DataFrame for easier viewing
sensitivity_df = pd.DataFrame(changes, columns=["Change in IQ", "Change in Prediction"])

print(sensitivity_df)

    Change in IQ     Change in Prediction
0     -10.800000    [0.24009092022282996]
1     -10.359184    [0.45404556817818476]
2      -9.918367     [0.6423748976079933]
3      -9.477551     [0.8058938911623783]
4      -9.036735     [0.9456790684491414]
5      -8.595918      [1.063027882614383]
6      -8.155102     [1.1594200054542543]
7      -7.714286     [1.2364805010578428]
8      -7.273469     [1.2959448879811954]
9      -6.832653     [1.3396260899524723]
10     -6.391837     [1.3693832751082347]
11     -5.951020     [1.3870925837608687]
12     -5.510204      [1.394619744697143]
13     -5.069388     [1.3937945800078977]
14     -4.628571     [1.3863873984488702]
15     -4.187755     [1.3740872773326551]
16     -3.746939     [1.3584822329517945]
17     -3.306122     [1.3410412795330078]
18     -2.865306     [1.3230983767225506]
19     -2.424490     [1.3058382656027099]
20     -1.983673     [1.2902841932394342]
21     -1.542857     [1.2772875257610954]
22     -1.102041     [1.2675192499

### Age

In [25]:
import numpy as np

# Choose a random data point from the test set
data_point = x_test.iloc[0, :]

# Save the original prediction
original_prediction = model.predict(poly_features.transform(data_point.values.reshape(1, -1)))

# Save the original value of "age"
original_age = data_point["age"]

# Create an array to hold the changes in "age" and the corresponding changes in the prediction
changes = []

# Change "age" by -10% to +10%
for change in np.linspace(-0.10 * original_age, 0.10 * original_age, num=50):
    data_point["age"] += change
    new_prediction = model.predict(poly_features.transform(data_point.values.reshape(1, -1)))
    prediction_change = new_prediction - original_prediction
    changes.append((change, prediction_change))

# Convert the results to a DataFrame for easier viewing
sensitivity_df = pd.DataFrame(changes, columns=["Change in Age", "Change in Prediction"])

print(sensitivity_df)

    Change in Age   Change in Prediction
0       -4.300000  [0.12163875125805013]
1       -4.124490   [0.2463641083596717]
2       -3.948980   [0.3723834836928348]
3       -3.773469   [0.4981478372051309]
4       -3.597959   [0.6223298589382475]
5       -3.422449   [0.7438031212275742]
6       -3.246939   [0.8616222005669465]
7       -3.071429   [0.9750037691385218]
8       -2.895918   [1.0833086560077902]
9       -2.720408   [1.1860248779837215]
10      -2.544898   [1.2827516401440437]
11      -2.369388   [1.3731843060256594]
12      -2.193878   [1.4571003374801947]
13      -2.018367   [1.5343462041946825]
14      -1.842857   [1.6048252628773814]
15      -1.667347    [1.668486606108729]
16      -1.491837   [1.7253148808574281]
17      -1.316327    [1.775321076661671]
18      -1.140816   [1.8185342834754923]
19      -0.965306    [1.854994419180264]
20      -0.789796   [1.8847459267613191]
21      -0.614286   [1.9078324411497103]
22      -0.438776    [1.924292425729108]
23      -0.26326