#### This notebook runs recursive feature elimination (RFE) on the Logit and OLS models that exclude demographic predictors. The resulting feature rankings will be compared with the feature rankings of the models that include demographic predictors.

In [1]:
import pandas as pd
import numpy as np
import sklearn
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score, precision_recall_curve, confusion_matrix, precision_score, recall_score
import seaborn as sns
import matplotlib.pyplot as plt
import datetime as dt
import pickle

# Directory that holds the intermediate data files read by this notebook.
fpath = (
    "/Users/ys8mz/Box Sync/"
    "Predictive Models of College Completion (VCCS)/"
    "intermediate_files"
)

In [2]:
# Load the truncated full dataset (Stata format) from the intermediate-files directory.
df = pd.read_stata("%s/full_data_truncated.dta" % fpath)

In [3]:
# Names of the demographic predictors that this notebook leaves out of the models:
# fixed demographic/Pell indicators, term-level Pell flags (fa/sp/su, terms 1-6),
# and the seven phe_* columns.
demo_predictors = {"age_entry", "male", "white", "afam", "hisp", "other", "pell_0_ind", "pell_1_ind"}
demo_predictors |= {
    prefix + term + str(term_no)
    for prefix in ("pell_0_", "pell_1_")
    for term in ("fa", "sp", "su")
    for term_no in range(1, 7)
}
demo_predictors |= {"phe_" + str(k) for k in range(1, 8)}

# Candidate predictors are the columns from position 10 onward, minus the demographic ones.
# NOTE(review): the first 10 columns are presumably identifiers/outcomes -- confirm against the data file.
predictors = [col for col in df.columns[10:] if col not in demo_predictors]
print(len(predictors), len(demo_predictors))

280 51


In [4]:
# Column groups consumed by impute(); each group is filled with a mean taken
# over a different subset of the training rows.
impute_list_1 = {"prop_comp_pre", "cum_gpa_pre"}
impute_list_2 = {
    measure + "_" + term + str(term_no)
    for measure in ("term_gpa", "prop_comp", "lvl2_prop_comp", "dev_prop_comp")
    for term in ("fa", "sp", "su")
    for term_no in range(1, 7)
}
impute_list_3 = {
    "cum_gpa",
    "lvl2_prop_comp",
    "dev_prop_comp",
    "prop_comp",
    "prop_comp_sd",
    "withdrawn_prop_comp_sd",
}
impute_list_4 = {
    "admrate",
    "gradrate",
    "satvr25",
    "satvr75",
    "satmt25",
    "satmt75",
    "satwr25",
    "satwr75",
}

In [5]:
# Split on the `valid` flag: rows with valid == 0 form the training set,
# rows with valid == 1 form the held-out test set.
train_df = df.loc[df["valid"] == 0]
test_df = df.loc[df["valid"] == 1]
print(train_df.shape, test_df.shape)

(298139, 341) (33115, 341)


In [6]:
def impute(train_original, test_original):
    """Fill missing predictor values in both splits with training-set means.

    Works on copies, so the input frames are left untouched. Every fill value
    is computed from the training rows only and then applied to both the
    training and the test frame. The rows that contribute to the mean depend
    on the column group:

    * ``impute_list_1``: training rows with ``enrolled_pre == 1``
    * ``impute_list_3``: all training rows
    * ``impute_list_2``: training rows with ``enrolled_<term> == 1``, where
      ``<term>`` is the last three characters of the column name (e.g. ``fa1``)
    * ``impute_list_4``: training rows with ``enrolled_nsc == 1``

    Parameters
    ----------
    train_original : pandas.DataFrame
        Training split; must contain the columns named in the impute lists
        and the ``enrolled_*`` indicator columns referenced above.
    test_original : pandas.DataFrame
        Test split with the same columns.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        Imputed copies of the training and test frames, in that order.
    """
    train = train_original.copy()
    test = test_original.copy()

    def fill_with_train_mean(column, row_mask=None):
        # NaN-aware mean over the selected training rows (all rows when no mask is given).
        reference = train[column] if row_mask is None else train.loc[row_mask, column]
        fill_value = np.nanmean(reference)
        # Vectorized replacement of missing entries; non-missing values are unchanged.
        train[column] = train[column].fillna(fill_value)
        test[column] = test[column].fillna(fill_value)

    for p in impute_list_1:
        fill_with_train_mean(p, train["enrolled_pre"] == 1)
    for p in impute_list_3:
        fill_with_train_mean(p)
    for p in impute_list_2:
        # The trailing three characters of the column name (term code + number)
        # select the matching enrollment indicator column.
        suffix = p[-3:]
        fill_with_train_mean(p, train["enrolled_" + suffix] == 1)
    for p in impute_list_4:
        fill_with_train_mean(p, train["enrolled_nsc"] == 1)
    return train, test

In [7]:
# Impute missing values (means come from the training split), then separate
# the predictor matrix from the grad_6years outcome for each split.
train_df_new, test_df_new = impute(train_df, test_df)
X_train, y_train = train_df_new[predictors], train_df_new["grad_6years"]
X_test, y_test = test_df_new[predictors], test_df_new["grad_6years"]

In [8]:
# Min-max scale the predictors. The scaler is fitted on the training data only
# and the same fitted transformation is then applied to both splits.
scaler = MinMaxScaler()
scaler.fit(X_train)
X_train_new = scaler.transform(X_train)
X_test_new = scaler.transform(X_test)

  return self.partial_fit(X, y)


In [9]:
# Output directory for the feature-ranking CSV files; the trailing separator is
# kept because file names are appended by plain string concatenation.
results_dir = (
    "C:\\Users\\ys8mz\\Box Sync\\"
    "Predictive Models of College Completion (VCCS)\\"
    "evaluation_results\\truncated_without_demo\\"
)

In [10]:
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import RFE
import datetime as dt

In [11]:
# Rank the predictors for the logistic regression model with recursive feature
# elimination. With n_features_to_select=1 and step=1 one feature is removed per
# round until a single feature remains, so ranking_ orders every predictor.
# The saga solver is stochastic; random_state is fixed so that re-running this
# cell reproduces the same ranking.
# NOTE: this fit is long-running (the recorded run below took about 6.5 hours).
lr_estimator = LogisticRegression(solver="saga", max_iter=10000, random_state=0)
selector = RFE(lr_estimator, n_features_to_select=1, step=1)
print(dt.datetime.now())
selector.fit(X_train_new, y_train)
print(dt.datetime.now())
# One row per predictor, best-ranked first, written next to the other evaluation results.
df1 = pd.DataFrame({'predictor_name':predictors, 'ranking':selector.ranking_}).sort_values(['ranking'])
df1.to_csv(results_dir + "lr_feature_ranking.csv", index=False)

2020-03-17 22:22:16.893250
2020-03-18 04:57:22.298070


In [12]:
# Same recursive feature elimination procedure for the OLS model: remove one
# feature per round until one remains, then save the full ranking.
ols_estimator = LinearRegression()
selector_3 = RFE(estimator=ols_estimator, n_features_to_select=1, step=1)
print(dt.datetime.now())
selector_3.fit(X_train_new, y_train)
print(dt.datetime.now())
df3 = pd.DataFrame({'predictor_name': predictors, 'ranking': selector_3.ranking_})
df3 = df3.sort_values(by='ranking')
df3.to_csv(results_dir + "ols_feature_ranking.csv", index=False)

2020-03-18 04:57:22.317060
2020-03-18 05:06:43.580729
