In [2]:
# Libraries
import pandas as pd
import numpy as np

# Data

In [3]:
# Read the data
df = pd.read_csv('genderinequality.csv')

In [4]:
# Number of missing values by column
print(df.isnull().sum())

year         0
id           0
wage         0
hours        0
emp          0
treat        0
female       0
IQ           0
KWW          0
educ         0
exper        0
tenure       0
age          0
married      0
black        0
south        0
urban        0
sibs         0
brthord    188
meduc      174
feduc      448
dtype: int64


# Models

In [5]:
# Drop the rows that are missing 'feduc'
new_df = df.dropna(subset=['feduc'])

## OLS

### Moderate Dimension

In [6]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error

# Seperate independent and dependent variables
x_feduc = new_df[["educ", "IQ", "age", "married", "female", "black", "south", "urban", "sibs"]]
y_feduc = new_df[['feduc']]

# Split the data into training and testing sets
x_train, x_test, y_train, y_test = train_test_split(x_feduc, y_feduc, test_size=0.2, random_state=42)

# Create a Linear Regression model
model = LinearRegression()

# Fit the model using the training data
model.fit(x_train, y_train)

# Predict the target variable for the training and testing data
y_train_pred = model.predict(x_train)
y_test_pred = model.predict(x_test)

# Calculate in-sample R2 and MSE
in_sample_r2 = r2_score(y_train, y_train_pred)
in_sample_mse = mean_squared_error(y_train, y_train_pred)

# Calculate out-of-sample R2 and MSE
out_of_sample_r2 = r2_score(y_test, y_test_pred)
out_of_sample_mse = mean_squared_error(y_test, y_test_pred)

print("In-sample R2:", in_sample_r2)
print("In-sample MSE:", in_sample_mse)
print("Out-of-sample R2:", out_of_sample_r2)
print("Out-of-sample MSE:", out_of_sample_mse)

In-sample R2: 0.24832026286300302
In-sample MSE: 8.328183758297584
Out-of-sample R2: 0.20480753557417075
Out-of-sample MSE: 7.92228565240242


### High Dimension (up to degree 3)

In [7]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import r2_score, mean_squared_error

# Seperating independent and dependent variables
x_feduc = new_df[["educ", "IQ", "age", "married", "female", "black", "south", "urban", "sibs"]]
y_feduc = new_df[['feduc']]

# Split the data into training and testing sets
x_train, x_test, y_train, y_test = train_test_split(x_feduc, y_feduc, test_size=0.2, random_state=42)

# Create polynomial features including all possible interactions up to degree 3
poly_features = PolynomialFeatures(degree=3, include_bias=False)
x_train_poly = poly_features.fit_transform(x_train)
x_test_poly = poly_features.transform(x_test)

# Create a Linear Regression model
model = LinearRegression()

# Fit the model using the training data with polynomial features
model.fit(x_train_poly, y_train)

# Predict the target variable for the training and testing data with polynomial features
y_train_pred = model.predict(x_train_poly)
y_test_pred = model.predict(x_test_poly)

# Calculate in-sample R2 and MSE with polynomial features
in_sample_r2 = r2_score(y_train, y_train_pred)
in_sample_mse = mean_squared_error(y_train, y_train_pred)

# Calculate out-of-sample R2 and MSE with polynomial features
out_of_sample_r2 = r2_score(y_test, y_test_pred)
out_of_sample_mse = mean_squared_error(y_test, y_test_pred)

print("In-sample R2 with polynomial features:", in_sample_r2)
print("In-sample MSE with polynomial features:", in_sample_mse)
print("Out-of-sample R2 with polynomial features:", out_of_sample_r2)
print("Out-of-sample MSE with polynomial features:", out_of_sample_mse)

In-sample R2 with polynomial features: 0.4054592026222108
In-sample MSE with polynomial features: 6.587173722716136
Out-of-sample R2 with polynomial features: 0.1853932869874103
Out-of-sample MSE with polynomial features: 8.115704516277246


## LASSO (alpha=1)

### Moderate Dimension

In [8]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Lasso
from sklearn.metrics import r2_score, mean_squared_error

# Seperate independent and dependent variables
x_feduc = new_df[["educ", "IQ", "age", "married", "female", "black", "south", "urban", "sibs"]]
y_feduc = new_df[['feduc']]

# Split the data into training and testing sets
x_train, x_test, y_train, y_test = train_test_split(x_feduc, y_feduc, test_size=0.2, random_state=42)

# Create a LASSO model
model = Lasso(alpha = 0.1)

# Fit the model using the training data
model.fit(x_train, y_train)

# Predict the target variable for the training and testing data
y_train_pred = model.predict(x_train)
y_test_pred = model.predict(x_test)

# Calculate in-sample R2 and MSE
in_sample_r2 = r2_score(y_train, y_train_pred)
in_sample_mse = mean_squared_error(y_train, y_train_pred)

# Calculate out-of-sample R2 and MSE
out_of_sample_r2 = r2_score(y_test, y_test_pred)
out_of_sample_mse = mean_squared_error(y_test, y_test_pred)

print("In-sample R2:", in_sample_r2)
print("In-sample MSE:", in_sample_mse)
print("Out-of-sample R2:", out_of_sample_r2)
print("Out-of-sample MSE:", out_of_sample_mse)

In-sample R2: 0.2322334469973738
In-sample MSE: 8.506416524189529
Out-of-sample R2: 0.1894121198171702
Out-of-sample MSE: 8.075665980839664


### High Dimension (up to degree 3)

In [9]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Lasso
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import r2_score, mean_squared_error

# Seperate independent and dependent variables
x_feduc = new_df[["educ", "IQ", "age", "married", "female", "black", "south", "urban", "sibs"]]
y_feduc = new_df[['feduc']]

# Split the data into training and testing sets
x_train, x_test, y_train, y_test = train_test_split(x_feduc, y_feduc, test_size=0.2, random_state=42)

# Create polynomial features including all possible interactions up to degree 3
poly_features = PolynomialFeatures(degree=3, include_bias=False)
x_train_poly = poly_features.fit_transform(x_train)
x_test_poly = poly_features.transform(x_test)

# Create a LASSO model
model = Lasso(alpha=0.1, max_iter=100000)

# Fit the model using the training data with polynomial features
model.fit(x_train_poly, y_train)

# Predict the target variable for the training and testing data with polynomial features
y_train_pred = model.predict(x_train_poly)
y_test_pred = model.predict(x_test_poly)

# Calculate in-sample R2 and MSE with polynomial features
in_sample_r2 = r2_score(y_train, y_train_pred)
in_sample_mse = mean_squared_error(y_train, y_train_pred)

# Calculate out-of-sample R2 and MSE with polynomial features
out_of_sample_r2 = r2_score(y_test, y_test_pred)
out_of_sample_mse = mean_squared_error(y_test, y_test_pred)

print("In-sample R2 with polynomial features:", in_sample_r2)
print("In-sample MSE with polynomial features:", in_sample_mse)
print("Out-of-sample R2 with polynomial features:", out_of_sample_r2)
print("Out-of-sample MSE with polynomial features:", out_of_sample_mse)

In-sample R2 with polynomial features: 0.3343004818522648
In-sample MSE with polynomial features: 7.37557185731889
Out-of-sample R2 with polynomial features: 0.1852761636576884
Out-of-sample MSE with polynomial features: 8.116871384068538


  model = cd_fast.enet_coordinate_descent(


## LASSO (Scaled)

### Moderate Dimension

In [10]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Lasso
from sklearn.metrics import r2_score, mean_squared_error

# Seperate independent and dependent variables
x_feduc = new_df[["educ", "IQ", "age", "married", "female", "black", "south", "urban", "sibs"]]
y_feduc = new_df[['feduc']]

# Split the data into training and testing sets
x_train, x_test, y_train, y_test = train_test_split(x_feduc, y_feduc, test_size=0.2, random_state=42)

# Create a StandardScaler object and fit it to the training data
scaler = StandardScaler()
x_train_scaled = scaler.fit_transform(x_train)
x_test_scaled = scaler.transform(x_test)

# Create a LASSO Regression model
model = Lasso(alpha=0.1)  # Adjust the alpha value for regularization strength

# Fit the model using the scaled training data
model.fit(x_train_scaled, y_train)

# Predict the target variable for the training and testing data
y_train_pred = model.predict(x_train_scaled)
y_test_pred = model.predict(x_test_scaled)

# Calculate in-sample R2 and MSE
in_sample_r2 = r2_score(y_train, y_train_pred)
in_sample_mse = mean_squared_error(y_train, y_train_pred)

# Calculate out-of-sample R2 and MSE
out_of_sample_r2 = r2_score(y_test, y_test_pred)
out_of_sample_mse = mean_squared_error(y_test, y_test_pred)

print("In-sample R2 (LASSO with scaled features):", in_sample_r2)
print("In-sample MSE (LASSO with scaled features):", in_sample_mse)
print("Out-of-sample R2 (LASSO with scaled features):", out_of_sample_r2)
print("Out-of-sample MSE (LASSO with scaled features):", out_of_sample_mse)

In-sample R2 (LASSO with scaled features): 0.2424204128734081
In-sample MSE (LASSO with scaled features): 8.39355073898391
Out-of-sample R2 (LASSO with scaled features): 0.2093172139602396
Out-of-sample MSE (LASSO with scaled features): 7.877356956554304


### High Dimension (up to degree 3)

In [11]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.linear_model import Lasso
from sklearn.metrics import r2_score, mean_squared_error

# Seperate independent and dependent variables
x_feduc = new_df[["educ", "IQ", "age", "married", "female", "black", "south", "urban", "sibs"]]
y_feduc = new_df[['feduc']]

# Split the data into training and testing sets
x_train, x_test, y_train, y_test = train_test_split(x_feduc, y_feduc, test_size=0.2, random_state=42)

# Create polynomial features including all possible interactions up to degree 3
poly_features = PolynomialFeatures(degree=3, include_bias=False)
x_train_poly = poly_features.fit_transform(x_train)
x_test_poly = poly_features.transform(x_test)

# Create a StandardScaler object and fit it to the training data
scaler = StandardScaler()
x_train_scaled = scaler.fit_transform(x_train_poly)
x_test_scaled = scaler.transform(x_test_poly)

# Create a LASSO Regression model
model = Lasso(alpha=0.1)  # Adjust the alpha value for regularization strength

# Fit the model using the scaled training data
model.fit(x_train_scaled, y_train)

# Predict the target variable for the training and testing data
y_train_pred = model.predict(x_train_scaled)
y_test_pred = model.predict(x_test_scaled)

# Calculate in-sample R2 and MSE
in_sample_r2 = r2_score(y_train, y_train_pred)
in_sample_mse = mean_squared_error(y_train, y_train_pred)

# Calculate out-of-sample R2 and MSE
out_of_sample_r2 = r2_score(y_test, y_test_pred)
out_of_sample_mse = mean_squared_error(y_test, y_test_pred)

print("In-sample R2 (LASSO with scaled features and interactions):", in_sample_r2)
print("In-sample MSE (LASSO with scaled features and interactions):", in_sample_mse)
print("Out-of-sample R2 (LASSO with scaled features and interactions):", out_of_sample_r2)
print("Out-of-sample MSE (LASSO with scaled features and interactions):", out_of_sample_mse)

In-sample R2 (LASSO with scaled features and interactions): 0.2611804696911686
In-sample MSE (LASSO with scaled features and interactions): 8.185699984499706
Out-of-sample R2 (LASSO with scaled features and interactions): 0.21599049790821467
Out-of-sample MSE (LASSO with scaled features and interactions): 7.8108728485671595


## LASSO (Cross-Validated)

### Moderate Dimension

In [12]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LassoCV
from sklearn.metrics import r2_score, mean_squared_error

# Seperate independent and dependent variables
x_feduc = new_df[["educ", "IQ", "age", "married", "female", "black", "south", "urban", "sibs"]]
y_feduc = new_df[['feduc']]

# Split the data into training and testing sets
x_train, x_test, y_train, y_test = train_test_split(x_feduc, y_feduc, test_size=0.2, random_state=42)

# Create a LASSO Regression model with cross-validated alpha tuning
model = LassoCV(cv=5)  # Adjust cv parameter for the number of cross-validation folds

# Fit the model using the training data
model.fit(x_train, y_train)

# Predict the target variable for the training and testing data
y_train_pred = model.predict(x_train)
y_test_pred = model.predict(x_test)

# Calculate in-sample R2 and MSE
in_sample_r2 = r2_score(y_train, y_train_pred)
in_sample_mse = mean_squared_error(y_train, y_train_pred)

# Calculate out-of-sample R2 and MSE
out_of_sample_r2 = r2_score(y_test, y_test_pred)
out_of_sample_mse = mean_squared_error(y_test, y_test_pred)

print("In-sample R2 (LASSO with alpha tuning):", in_sample_r2)
print("In-sample MSE (LASSO with alpha tuning):", in_sample_mse)
print("Out-of-sample R2 (LASSO with alpha tuning):", out_of_sample_r2)
print("Out-of-sample MSE (LASSO with alpha tuning):", out_of_sample_mse)

In-sample R2 (LASSO with alpha tuning): 0.24723701798172604
In-sample MSE (LASSO with alpha tuning): 8.340185495182059
Out-of-sample R2 (LASSO with alpha tuning): 0.20562754795818472
Out-of-sample MSE (LASSO with alpha tuning): 7.91411609266023


  y = column_or_1d(y, warn=True)


### High Dimension (up to degree 3)

In [13]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.linear_model import Lasso
from sklearn.metrics import r2_score, mean_squared_error

# Seperate independent and dependent variables
x_feduc = new_df[["educ", "IQ", "age", "married", "female", "black", "south", "urban", "sibs"]]
y_feduc = new_df[['feduc']]

# Split the data into training and testing sets
x_train, x_test, y_train, y_test = train_test_split(x_feduc, y_feduc, test_size=0.2, random_state=42)

# Create polynomial features including all possible interactions up to degree 3
poly_features = PolynomialFeatures(degree=3, include_bias=False)
x_train_poly = poly_features.fit_transform(x_train)
x_test_poly = poly_features.transform(x_test)

# Create a StandardScaler object and fit it to the training data
scaler = StandardScaler()
x_train_scaled = scaler.fit_transform(x_train_poly)
x_test_scaled = scaler.transform(x_test_poly)

# Create a LASSO Regression model
model = LassoCV(cv=5, max_iter=100000)  # Adjust the alpha value for regularization strength

# Fit the model using the scaled training data
model.fit(x_train_scaled, y_train)

# Predict the target variable for the training and testing data
y_train_pred = model.predict(x_train_scaled)
y_test_pred = model.predict(x_test_scaled)

# Calculate in-sample R2 and MSE
in_sample_r2 = r2_score(y_train, y_train_pred)
in_sample_mse = mean_squared_error(y_train, y_train_pred)

# Calculate out-of-sample R2 and MSE
out_of_sample_r2 = r2_score(y_test, y_test_pred)
out_of_sample_mse = mean_squared_error(y_test, y_test_pred)

print("In-sample R2 (LASSO with scaled features and interactions):", in_sample_r2)
print("In-sample MSE (LASSO with scaled features and interactions):", in_sample_mse)
print("Out-of-sample R2 (LASSO with scaled features and interactions):", out_of_sample_r2)
print("Out-of-sample MSE (LASSO with scaled features and interactions):", out_of_sample_mse)

  y = column_or_1d(y, warn=True)


In-sample R2 (LASSO with scaled features and interactions): 0.3371250121374145
In-sample MSE (LASSO with scaled features and interactions): 7.3442776690051295
Out-of-sample R2 (LASSO with scaled features and interactions): 0.2512177292644211
Out-of-sample MSE (LASSO with scaled features and interactions): 7.459913549992008


## Post LASSO

### Moderate Dimension

In [14]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LassoCV
import statsmodels.api as sm
from sklearn.metrics import r2_score, mean_squared_error

# Seperate independent and dependent variables
x_feduc = new_df[["educ", "IQ", "age", "married", "female", "black", "south", "urban", "sibs"]]
y_feduc = new_df[['feduc']]

# Split the data into training and testing sets
x_train, x_test, y_train, y_test = train_test_split(x_feduc, y_feduc, test_size=0.2, random_state=42)

# Create a StandardScaler object and fit it to the training data
scaler = StandardScaler()
x_train_scaled = scaler.fit_transform(x_train)
x_test_scaled = scaler.transform(x_test)

# Create a LASSO Regression model with cross-validated alpha tuning
model = LassoCV(cv=5)  # Adjust cv parameter for the number of cross-validation folds

# Fit the model using the scaled training data
model.fit(x_train_scaled, y_train)

# Perform post-LASSO inference
X_train_scaled_with_intercept = sm.add_constant(x_train_scaled)
model_sm = sm.OLS(y_train, X_train_scaled_with_intercept)
results = model_sm.fit_regularized(alpha=model.alpha_, L1_wt=1)

# Predict the target variable for the training and testing data
X_test_scaled_with_intercept = sm.add_constant(x_test_scaled)
y_train_pred = results.predict(X_train_scaled_with_intercept)
y_test_pred = results.predict(X_test_scaled_with_intercept)

# Calculate in-sample R2 and MSE
in_sample_r2 = r2_score(y_train, y_train_pred)
in_sample_mse = mean_squared_error(y_train, y_train_pred)

# Calculate out-of-sample R2 and MSE
out_of_sample_r2 = r2_score(y_test, y_test_pred)
out_of_sample_mse = mean_squared_error(y_test, y_test_pred)

print("In-sample R2 (Post-LASSO):", in_sample_r2)
print("In-sample MSE (Post-LASSO):", in_sample_mse)
print("Out-of-sample R2 (Post-LASSO):", out_of_sample_r2)
print("Out-of-sample MSE (Post-LASSO):", out_of_sample_mse)

In-sample R2 (Post-LASSO): 0.24831902244514792
In-sample MSE (Post-LASSO): 8.328197501421581
Out-of-sample R2 (Post-LASSO): 0.2049870158266296
Out-of-sample MSE (Post-LASSO): 7.920497539596332


  y = column_or_1d(y, warn=True)


### High Dimension (up to degree 3)

In [15]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.linear_model import LassoCV
import statsmodels.api as sm
from sklearn.metrics import r2_score, mean_squared_error

# Seperate independent and dependent variables
x_feduc = new_df[["educ", "IQ", "age", "married", "female", "black", "south", "urban", "sibs"]]
y_feduc = new_df[['feduc']]

# Split the data into training and testing sets
x_train, x_test, y_train, y_test = train_test_split(x_feduc, y_feduc, test_size=0.2, random_state=42)

# Create polynomial features including all possible interactions up to degree 3
poly_features = PolynomialFeatures(degree=3, include_bias=False)
x_train_poly = poly_features.fit_transform(x_train)
x_test_poly = poly_features.transform(x_test)

# Create a StandardScaler object and fit it to the training data with polynomial features
scaler = StandardScaler()
x_train_scaled = scaler.fit_transform(x_train_poly)
x_test_scaled = scaler.transform(x_test_poly)

# Create a LASSO Regression model with cross-validated alpha tuning
model = LassoCV(cv=5, max_iter=100000)  # Adjust cv parameter for the number of cross-validation folds

# Fit the model using the scaled training data
model.fit(x_train_scaled, y_train)

# Perform post-LASSO inference
X_train_scaled_with_intercept = sm.add_constant(x_train_scaled)
model_sm = sm.OLS(y_train, X_train_scaled_with_intercept)
results = model_sm.fit_regularized(alpha=model.alpha_, L1_wt=1)

# Predict the target variable for the training and testing data
X_test_scaled_with_intercept = sm.add_constant(x_test_scaled)
y_train_pred = results.predict(X_train_scaled_with_intercept)
y_test_pred = results.predict(X_test_scaled_with_intercept)

# Calculate in-sample R2 and MSE
in_sample_r2 = r2_score(y_train, y_train_pred)
in_sample_mse = mean_squared_error(y_train, y_train_pred)

# Calculate out-of-sample R2 and MSE
out_of_sample_r2 = r2_score(y_test, y_test_pred)
out_of_sample_mse = mean_squared_error(y_test, y_test_pred)

print("In-sample R2 (Post-LASSO with interactions):", in_sample_r2)
print("In-sample MSE (Post-LASSO with interactions):", in_sample_mse)
print("Out-of-sample R2 (Post-LASSO with interactions):", out_of_sample_r2)
print("Out-of-sample MSE (Post-LASSO with interactions):", out_of_sample_mse)

  y = column_or_1d(y, warn=True)


In-sample R2 (Post-LASSO with interactions): 0.3176703888008058
In-sample MSE (Post-LASSO with interactions): 7.559823825288191
Out-of-sample R2 (Post-LASSO with interactions): 0.24019796885702716
Out-of-sample MSE (Post-LASSO with interactions): 7.5697004175416716


## Prediction

Result: Best Performing Model is LASSO(alpha=0.1) with High Dimension up to degree 3

In [16]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LassoCV
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import r2_score, mean_squared_error

# Seperate independent and dependent variables
x_feduc = new_df[["educ", "IQ", "age", "married", "female", "black", "south", "urban", "sibs"]]
y_feduc = new_df[['feduc']]

# Split the data into training and testing sets
x_train, x_test, y_train, y_test = train_test_split(x_feduc, y_feduc, test_size=0.2, random_state=42)

# Create polynomial features including all possible interactions up to degree 3
poly_features = PolynomialFeatures(degree=3, include_bias=False)
x_train_poly = poly_features.fit_transform(x_train)
x_test_poly = poly_features.transform(x_test)

# Create a LASSO model
model = LassoCV(cv=5, max_iter=100000)

# Fit the model using the training data with polynomial features
model.fit(x_train_poly, y_train)

  y = column_or_1d(y, warn=True)


In [17]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LassoCV
from sklearn.preprocessing import StandardScaler

# Filter the rows with missing feduc values
missing_rows = df[df['feduc'].isnull()]

# Extract the features for the missing rows
x_missing = missing_rows[["educ", "IQ", "age", "married", "female", "black", "south", "urban", "sibs"]]

# Create polynomial features with interactions up to degree 3
poly_features = PolynomialFeatures(degree=3, include_bias=False)
x_missing_interactions = poly_features.fit_transform(x_missing)

# Create a new DataFrame with the original and interaction features
missing_rows_interactions_df = pd.DataFrame(x_missing_interactions,
                                            columns=poly_features.get_feature_names_out(x_missing.columns))
# Scale the features
scaler = StandardScaler()
x_missing_scaled = scaler.fit_transform(missing_rows_interactions_df)

# Predict the birthorder for the missing rows using the trained LASSO model
feduc_pred = model.predict(x_missing_scaled)

# Fill the missing values with the predicted feduc values
missing_rows['feduc'] = feduc_pred

# Round the value (education is integer)
missing_rows['feduc'] = round(missing_rows['feduc'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  missing_rows['feduc'] = feduc_pred
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  missing_rows['feduc'] = round(missing_rows['feduc'])


In [18]:
# Set the index of both DataFrames to match on "id" and "year" columns
df = df.set_index(["id", "year"])
missing_rows = missing_rows.set_index(["id", "year"])

# Replace the missing values in df with values from missing_rows
df["feduc"] = df["feduc"].combine_first(missing_rows["feduc"])

# Reset the index
df = df.reset_index()

In [19]:
# Check for missing values after prediction
print(df.isnull().sum())

id           0
year         0
wage         0
hours        0
emp          0
treat        0
female       0
IQ           0
KWW          0
educ         0
exper        0
tenure       0
age          0
married      0
black        0
south        0
urban        0
sibs         0
brthord    188
meduc      174
feduc        0
dtype: int64


In [20]:
df.to_csv("feduc.csv", index=False)

## Assumptions

### Sparsity

In [21]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Lasso
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import r2_score, mean_squared_error

# Seperate independent and dependent variables
x_feduc = new_df[["educ", "IQ", "age", "married", "female", "black", "south", "urban", "sibs"]]
y_feduc = new_df[['feduc']]

# Split the data into training and testing sets
x_train, x_test, y_train, y_test = train_test_split(x_feduc, y_feduc, test_size=0.2, random_state=42)

# Create polynomial features including all possible interactions up to degree 3
poly_features = PolynomialFeatures(degree=3, include_bias=False)
x_train_poly = poly_features.fit_transform(x_train)
x_test_poly = poly_features.transform(x_test)

# Create a LASSO model
model = LassoCV(cv=5, max_iter=100000)

# Fit the model using the training data with polynomial features
model.fit(x_train_poly, y_train)

# Retrieve the coefficients
coef = model.coef_

# Count number of coefficients that are exactly zero
n_zero_coef = np.sum(coef == 0)
total_coef = len(coef)

print("Total number of features: ", total_coef)
print("Number of features LASSO used (non-zero coefficient): ", total_coef - n_zero_coef)
print("Number of features LASSO did not use (zero coefficient): ", n_zero_coef)

  y = column_or_1d(y, warn=True)


Total number of features:  219
Number of features LASSO used (non-zero coefficient):  9
Number of features LASSO did not use (zero coefficient):  210


### Independence

In [22]:
import numpy as np
from statsmodels.stats.stattools import durbin_watson

# Predict the outcomes for the training data
y_train_pred = model.predict(x_train_poly)

# Compute the residuals
residuals = y_train.values.flatten() - y_train_pred

# Perform the Durbin-Watson test
dw_result = durbin_watson(residuals)

print('Durbin-Watson statistic:', dw_result)

Durbin-Watson statistic: 2.0228604978327174


## Sensitivity Analysis

### Education

In [23]:
import numpy as np

# Choose a random data point from the test set
data_point = x_test.iloc[0, :]

# Save the original prediction
original_prediction = model.predict(poly_features.transform(data_point.values.reshape(1, -1)))

# Save the original value of "educ"
original_educ = data_point["educ"]

# Create an array to hold the changes in "educ" and the corresponding changes in the prediction
changes = []

# Change "educ" by -10% to +10%
for change in np.linspace(-0.10 * original_educ, 0.10 * original_educ, num=50):
    data_point["educ"] += change
    new_prediction = model.predict(poly_features.transform(data_point.values.reshape(1, -1)))
    prediction_change = new_prediction - original_prediction
    changes.append((change, prediction_change))

# Convert the results to a DataFrame for easier viewing
sensitivity_df = pd.DataFrame(changes, columns=["Change in educ", "Change in Prediction"])

print(sensitivity_df)

    Change in educ   Change in Prediction
0        -1.600000  [-0.5937329159236597]
1        -1.534694  [-1.1632318352790065]
2        -1.469388  [-1.7084967580660404]
3        -1.404082  [-2.2295276842847613]
4        -1.338776   [-2.726324613935171]
5        -1.273469   [-3.198887547017266]
6        -1.208163    [-3.64721648353105]
7        -1.142857   [-4.071311423476521]
8        -1.077551   [-4.471172366853679]
9        -1.012245   [-4.846799313662524]
10       -0.946939   [-5.198192263903057]
11       -0.881633  [-5.5253512175752775]
12       -0.816327   [-5.828276174679186]
13       -0.751020   [-6.106967135214781]
14       -0.685714   [-6.361424099182063]
15       -0.620408   [-6.591647066581032]
16       -0.555102  [-6.7976360374116895]
17       -0.489796   [-6.979391011674034]
18       -0.424490   [-7.136911989368066]
19       -0.359184  [-7.2701989704937855]
20       -0.293878   [-7.379251955051192]
21       -0.228571   [-7.464070943040287]
22       -0.163265   [-7.524655934



### IQ

In [24]:
import numpy as np

# Choose a random data point from the test set
data_point = x_test.iloc[0, :]

# Save the original prediction
original_prediction = model.predict(poly_features.transform(data_point.values.reshape(1, -1)))

# Save the original value of "exper"
original_IQ = data_point["IQ"]

# Create an array to hold the changes in "IQ" and the corresponding changes in the prediction
changes = []

# Change "IQ" by -10% to +10%
for change in np.linspace(-0.10 * original_IQ, 0.10 * original_IQ, num=50):
    data_point["IQ"] += change
    new_prediction = model.predict(poly_features.transform(data_point.values.reshape(1, -1)))
    prediction_change = new_prediction - original_prediction
    changes.append((change, prediction_change))

# Convert the results to a DataFrame for easier viewing
sensitivity_df = pd.DataFrame(changes, columns=["Change in IQ", "Change in Prediction"])

print(sensitivity_df)

    Change in IQ       Change in Prediction
0     -10.700000      [-0.5476065791171365]
1     -10.263265      [-1.0523590416030544]
2      -9.826531      [-1.5095973904249398]
3      -9.389796       [-1.916396702996856]
4      -8.953061      [-2.2713666820448584]
5      -8.516327        [-2.57446011532255]
6      -8.079592       [-2.826790244177065]
7      -7.642857       [-3.030457040965514]
8      -7.206122      [-3.1883823953218373]
9      -6.769388      [-3.3041542092741274]
10     -6.332653       [-3.381879401212373]
11     -5.895918      [-3.4260458187066485]
12     -5.459184       [-3.441393060175743]
13     -5.022449      [-3.4327922054062263]
14     -4.585714      [-3.4051344549219573]
15     -4.148980      [-3.3632286782040293]
16     -3.712245       [-3.311707870761156]
17     -3.275510      [-3.2549445200504996]
18     -2.838776      [-3.1969748802489315]
19     -2.402041        [-3.14143215587474]
20     -1.965306      [-3.0914885942597685]
21     -1.528571       [-3.04980



### Age

In [25]:
import numpy as np

# Choose a random data point from the test set
data_point = x_test.iloc[0, :]

# Save the original prediction
original_prediction = model.predict(poly_features.transform(data_point.values.reshape(1, -1)))

# Save the original value of "age"
original_age = data_point["age"]

# Create an array to hold the changes in "age" and the corresponding changes in the prediction
changes = []

# Change "age" by -10% to +10%
for change in np.linspace(-0.10 * original_age, 0.10 * original_age, num=50):
    data_point["age"] += change
    new_prediction = model.predict(poly_features.transform(data_point.values.reshape(1, -1)))
    prediction_change = new_prediction - original_prediction
    changes.append((change, prediction_change))

# Convert the results to a DataFrame for easier viewing
sensitivity_df = pd.DataFrame(changes, columns=["Change in Age", "Change in Prediction"])

print(sensitivity_df)

    Change in Age   Change in Prediction
0       -3.300000  [0.15345263792054098]
1       -3.165306  [0.27407351778987277]
2       -3.030612  [0.36753745696668183]
3       -2.895918    [0.438776989257299]
4       -2.761224   [0.4920476869108281]
5       -2.626531   [0.5309905794144818]
6       -2.491837   [0.5586916690891446]
7       -2.357143   [0.5777385434851414]
8       -2.222449   [0.5902740845782422]
9       -2.087755   [0.5980472747658521]
10      -1.953061    [0.602461099663449]
11      -1.818367   [0.6046175477012206]
12      -1.683673   [0.6053597065209271]
13      -1.548980   [0.6053109561729659]
14      -1.414286    [0.604911259113674]
15      -1.279592   [0.6044505470028305]
16      -1.144898   [0.6040992043013826]
17      -1.010204   [0.6039356486693865]
18      -0.875510   [0.6039710081641676]
19      -0.740816   [0.6041708952386973]
20      -0.606122   [0.6044742775401719]
21      -0.471429   [0.6048094455088417]
22      -0.336735    [0.605107076777017]
23      -0.20204



### Sibs

In [26]:
import numpy as np

# Choose a random data point from the test set
data_point = x_test.iloc[0, :]

# Save the original prediction
original_prediction = model.predict(poly_features.transform(data_point.values.reshape(1, -1)))

# Save the original value of "sibs"
original_sibs = data_point["sibs"]

# Create an array to hold the changes in "sibs" and the corresponding changes in the prediction
changes = []

# Change "sibs" by -10% to +10%
for change in np.linspace(-0.10 * original_sibs, 0.10 * original_sibs, num=50):
    data_point["sibs"] += change
    new_prediction = model.predict(poly_features.transform(data_point.values.reshape(1, -1)))
    prediction_change = new_prediction - original_prediction
    changes.append((change, prediction_change))

# Convert the results to a DataFrame for easier viewing
sensitivity_df = pd.DataFrame(changes, columns=["Change in Sibs", "Change in Prediction"])

print(sensitivity_df)

    Change in Sibs   Change in Prediction
0        -0.600000  [0.12381211697151251]
1        -0.575510  [0.24257067814826705]
2        -0.551020  [0.35627568353026895]
3        -0.526531  [0.46492713311751466]
4        -0.502041   [0.5685250269100042]
5        -0.477551   [0.6670693649077375]
6        -0.453061   [0.7605601471107164]
7        -0.428571   [0.8489973735189391]
8        -0.404082   [0.9323810441324074]
9        -0.379592   [1.0107111589511195]
10       -0.355102   [1.0839877179750754]
11       -0.330612   [1.1522107212042751]
12       -0.306122   [1.2153801686387204]
13       -0.281633   [1.2734960602784096]
14       -0.257143   [1.3265583961233443]
15       -0.232653   [1.3745671761735228]
16       -0.208163    [1.417522400428945]
17       -0.183673   [1.4554240688896112]
18       -0.159184   [1.4882721815555229]
19       -0.134694   [1.5160667384266784]
20       -0.110204   [1.5388077395030795]
21       -0.085714   [1.5564951847847244]
22       -0.061224    [1.569129074



### Married

In [27]:
import numpy as np

# Choose a random data point from the test set
data_point = x_test.iloc[0, :]

# Save the original prediction
original_prediction = model.predict(poly_features.transform(data_point.values.reshape(1, -1)))

# Save the original value of "married"
original_married = data_point["married"]

# Create an array to hold the changes in "married" and the corresponding changes in the prediction
changes = []

# Change "married" from 0 to 1
for change in [0, 1]:
    data_point["married"] = change
    new_prediction = model.predict(poly_features.transform(data_point.values.reshape(1, -1)))
    prediction_change = new_prediction - original_prediction
    changes.append((change, prediction_change))

# Convert the results to a DataFrame for easier viewing
sensitivity_df = pd.DataFrame(changes, columns=["Change in Married", "Change in Prediction"])

print(sensitivity_df)


   Change in Married Change in Prediction
0                  0                [0.0]
1                  1                [0.0]




### Female

In [28]:
import numpy as np

# Choose a random data point from the test set
data_point = x_test.iloc[0, :]

# Save the original prediction
original_prediction = model.predict(poly_features.transform(data_point.values.reshape(1, -1)))

# Save the original value of "female"
original_female = data_point["female"]

# Create an array to hold the changes in "female" and the corresponding changes in the prediction
changes = []

# Change "female" from 0 to 1
for change in [0, 1]:
    data_point["female"] = change
    new_prediction = model.predict(poly_features.transform(data_point.values.reshape(1, -1)))
    prediction_change = new_prediction - original_prediction
    changes.append((change, prediction_change))

# Convert the results to a DataFrame for easier viewing
sensitivity_df = pd.DataFrame(changes, columns=["Change in Female", "Change in Prediction"])

print(sensitivity_df)


   Change in Female   Change in Prediction
0                 0                  [0.0]
1                 1  [0.06674138104101068]




### Black

In [29]:
import numpy as np

# Choose a random data point from the test set
data_point = x_test.iloc[0, :]

# Save the original prediction
original_prediction = model.predict(poly_features.transform(data_point.values.reshape(1, -1)))

# Save the original value of "black"
original_black = data_point["black"]

# Create an array to hold the changes in "black" and the corresponding changes in the prediction
changes = []

# Change "black" from 0 to 1
for change in [0, 1]:
    data_point["black"] = change
    new_prediction = model.predict(poly_features.transform(data_point.values.reshape(1, -1)))
    prediction_change = new_prediction - original_prediction
    changes.append((change, prediction_change))

# Convert the results to a DataFrame for easier viewing
sensitivity_df = pd.DataFrame(changes, columns=["Change in Black", "Change in Prediction"])

print(sensitivity_df)


   Change in Black     Change in Prediction
0                0                    [0.0]
1                1  [-0.016575236384170466]




### South

In [30]:
import numpy as np

# Choose a random data point from the test set
data_point = x_test.iloc[0, :]

# Save the original prediction
original_prediction = model.predict(poly_features.transform(data_point.values.reshape(1, -1)))

# Save the original value of "south"
original_south = data_point["south"]

# Create an array to hold the changes in "south" and the corresponding changes in the prediction
changes = []

# Change "south" from 0 to 1
for change in [0, 1]:
    data_point["south"] = change
    new_prediction = model.predict(poly_features.transform(data_point.values.reshape(1, -1)))
    prediction_change = new_prediction - original_prediction
    changes.append((change, prediction_change))

# Convert the results to a DataFrame for easier viewing
sensitivity_df = pd.DataFrame(changes, columns=["Change in South", "Change in Prediction"])

print(sensitivity_df)


   Change in South   Change in Prediction
0                0                  [0.0]
1                1  [-0.5759755579174755]




### Urban

In [31]:
import numpy as np

# Choose a random data point from the test set
data_point = x_test.iloc[0, :]

# Save the original prediction
original_prediction = model.predict(poly_features.transform(data_point.values.reshape(1, -1)))

# Save the original value of "urban"
original_urban = data_point["urban"]

# Create an array to hold the changes in "urban" and the corresponding changes in the prediction
changes = []

# Change "urban" from 0 to 1
for change in [0, 1]:
    data_point["urban"] = change
    new_prediction = model.predict(poly_features.transform(data_point.values.reshape(1, -1)))
    prediction_change = new_prediction - original_prediction
    changes.append((change, prediction_change))

# Convert the results to a DataFrame for easier viewing
sensitivity_df = pd.DataFrame(changes, columns=["Change in Urban", "Change in Prediction"])

print(sensitivity_df)


   Change in Urban   Change in Prediction
0                0  [-0.3282041113210763]
1                1                  [0.0]


