In [1]:
import pandas as pd
import numpy as np
import pickle
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, cross_val_score
from sklearn.metrics import mean_squared_error, r2_score

In [2]:
data = pd.read_csv('./Data/scaled_grad_demo_fin.csv')

In [3]:
data

Unnamed: 0,ECONOMICALLY_DISADVANTAGED,H_Female,AA_FEMALE,W_FEMALE,AA_MALE,W_MALE,H_MALE,AA,H,W,Expend_per_pupil,grad_rate
0,-0.340444,-0.810652,-0.749432,0.881421,-0.731183,0.939311,-1.003977,-0.743636,-0.922150,0.926043,-0.247486,0.96
1,0.295084,-0.742734,-0.610572,0.488740,-0.632984,0.972940,-0.629554,-0.624308,-0.691083,0.747043,-0.077559,0.95
2,-0.669709,0.225286,-0.220271,0.073510,-0.104159,-0.119980,0.183801,-0.163735,0.205828,-0.025222,1.379947,0.92
3,-0.714760,0.031734,-0.641120,0.294973,-0.573553,0.814632,-0.067018,-0.610439,-0.020230,0.568360,-0.278540,0.98
4,0.520390,3.856570,-0.095779,-0.799500,-0.123050,-0.743684,3.180804,-0.109695,3.541708,-0.784013,-1.092946,0.89
...,...,...,...,...,...,...,...,...,...,...,...,...
262,-0.340087,0.645130,-0.160112,-0.038207,-0.149754,-0.012516,0.629353,-0.155676,0.643992,-0.025573,-0.888277,0.94
263,-1.560176,-0.530941,-0.422304,0.470175,-0.385032,0.388020,-0.400242,-0.405676,-0.467638,0.435583,-1.044175,0.97
264,-0.768097,-0.901072,-0.606816,0.650322,-0.509172,0.773584,-0.499283,-0.561095,-0.698286,0.724858,0.325878,0.96
265,-1.298110,-0.259168,-0.374950,0.246499,-0.412574,0.458281,-0.167937,-0.395199,-0.213734,0.360024,-0.814397,0.97


## Splitting into train/test

In [4]:
X = data.drop(columns=['grad_rate'])
y = data['grad_rate']

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.25, random_state = 40)

## Dictionary to store results

In [6]:
evals = dict()

##  Multiple Linear Regression (OLS)

In [7]:
MLR_model = LinearRegression()
scores= cross_val_score(MLR_model, X_train, y_train, cv=5, scoring='neg_mean_absolute_error', n_jobs=-1)
mean_mae = np.mean(scores)
print("mean MAE: %.2E" %(mean_mae))

evals['MLR'] =  mean_mae

mean MAE: -3.36E-02


## Ridge without hyperparameter tuning

In [8]:
Ridge_model = Ridge(alpha=1)
scores= cross_val_score(Ridge_model, X_train, y_train, cv=5, scoring='neg_mean_absolute_error', n_jobs=-1)
mean_mae = np.mean(scores)
print("mean MAE: %.2E" %(mean_mae))

mean MAE: -3.34E-02


## Ridge with hyperparameter tuning

In [9]:
Ridge_model = Ridge()
grid = dict()
grid['alpha'] = np.arange(0,50,.1)
search = GridSearchCV(Ridge_model, grid, scoring='neg_mean_absolute_error', cv=5, n_jobs=-1)
results = search.fit(X_train, y_train)
print('MAE: %.2E' % results.best_score_)
print('Config: %s' % results.best_params_)

evals['Ridge'] = [results.best_score_,results.best_params_]

MAE: -3.21E-02
Config: {'alpha': 49.900000000000006}


## Lasso without hyperparameter tuning

In [10]:
lasso_model = Lasso(alpha=1)
scores= cross_val_score(lasso_model, X, y, cv=5, scoring='neg_mean_absolute_error', n_jobs=-1)
mean_mae = np.mean(scores)
print("mean MAE: %.2E" %(mean_mae))

mean MAE: -4.76E-02


 ## Lasso with hyperparameter tuning

In [11]:
lasso_model = Lasso()
grid = dict()
grid['alpha'] = np.arange(.0001,50,.1)
search = GridSearchCV(lasso_model, grid, scoring='neg_mean_absolute_error', cv=5, n_jobs=-1)
results = search.fit(X_train, y_train)
print('MAE: %.2E' % results.best_score_)
print('Config: %s' % results.best_params_)

evals['Lasso'] = [results.best_score_,results.best_params_]

MAE: -3.34E-02
Config: {'alpha': 0.0001}


## Random forest without hyperparameter tuning

In [12]:
RF_model = RandomForestRegressor()
scores= cross_val_score(RF_model, X_train, y_train, cv=5, scoring='neg_mean_absolute_error', n_jobs=-1)
mean_mae = np.mean(scores)
print("mean MAE: %.2E" %(mean_mae))

mean MAE: -3.38E-02


## Random forest with hyperparameter tuning

In [13]:
RF_model = RandomForestRegressor()
grid = {
    'bootstrap': [True,False],
    'max_depth': [int(x) for x in np.linspace(50, 110, num = 11)] + [None],
    'max_features': ['auto', 'sqrt'],
    'min_samples_leaf': [1, 2, 4],
    'min_samples_split': [2, 5, 10],
    'n_estimators': [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
}
search = RandomizedSearchCV(RF_model, grid, n_iter=10,scoring='neg_mean_absolute_error', cv=5, n_jobs=-1)
results = search.fit(X_train, y_train)
print('MAE: %.2E' % results.best_score_)
print('Config: %s' % results.best_params_)

evals['RF'] = [results.best_score_,results.best_params_]

MAE: -3.37E-02
Config: {'n_estimators': 400, 'min_samples_split': 5, 'min_samples_leaf': 4, 'max_features': 'auto', 'max_depth': 98, 'bootstrap': True}


In [14]:
evals

{'MLR': -0.033626269121816973,
 'Ridge': [-0.03210272912965349, {'alpha': 49.900000000000006}],
 'Lasso': [-0.03338795966651996, {'alpha': 0.0001}],
 'RF': [-0.03366485673843754,
  {'n_estimators': 400,
   'min_samples_split': 5,
   'min_samples_leaf': 4,
   'max_features': 'auto',
   'max_depth': 98,
   'bootstrap': True}]}

Ridge Regressor seems to be the best performing model.

In [15]:
Ridge_model = Ridge(alpha=evals['Ridge'][1]['alpha'])
Ridge_model.fit(X_train, y_train)
with open('./Model/Ridge_Regressor.pkl', 'wb') as f:
    pickle.dump(Ridge_model, f)