The purpose of this script is to create the dataset for running the RNN models without including demographic predictors.

In [17]:
import pickle
import pandas as pd
import numpy as np
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
# Directory holding the intermediate files this notebook reads
# (full_data_truncated.dta) and writes (the lstm_data_2 sub-folder).
# NOTE(review): machine-specific absolute path -- adjust before running elsewhere.
fpath = "/Users/ys8mz/Box Sync/Predictive Models of College Completion (VCCS)/intermediate_files"

In [18]:
# Load the truncated student-level data set.
df = pd.read_stata(fpath + "/full_data_truncated.dta")

# Start the running total at zero, then add every "available*" column found
# from column position 10 onward (the total itself is excluded).
df.loc[:, 'available_sum'] = 0
availability_cols = [
    col for col in list(df.columns)[10:]
    if col.startswith("available") and col != "available_sum"
]
for col in availability_cols:
    df.loc[:, 'available_sum'] += df[col]

In [19]:
# Columns kept for modelling: positions 0, 1 and 7 plus everything from
# position 10 onward (which includes the derived available_sum column).
selected_cols = [0, 1, 7] + list(range(10, df.shape[1]))

# Rows with valid == 0 form the training pool; rows with valid == 1 are held
# out as the test set.  The explicit .copy() detaches both frames from df so
# that the in-place imputation performed later writes to independent objects
# instead of a chained slice (which raises SettingWithCopyWarning).
train = df[df.valid == 0].iloc[:, selected_cols].copy()
test_part = df[df.valid == 1].iloc[:, selected_cols].copy()

In [20]:
# Columns imputed with the mean over training rows where enrolled_pre == 1.
impute_list_1 = {"prop_comp_pre", "cum_gpa_pre"}

# Columns imputed with the mean over all training rows.
impute_list_2 = {
    "cum_gpa",
    "lvl2_prop_comp",
    "dev_prop_comp",
    "prop_comp",
    "prop_comp_sd",
    "withdrawn_prop_comp_sd",
}

# Columns imputed with the mean over training rows where enrolled_nsc == 1.
impute_list_3 = {
    "admrate",
    "gradrate",
    "satvr25",
    "satvr75",
    "satmt25",
    "satmt75",
    "satwr25",
    "satwr75",
}

In [21]:
def _fill_with_reference_mean(train, test, column, reference):
    """Fill missing values of ``column`` in both frames with the NaN-ignoring mean of ``reference``."""
    fill_value = np.nanmean(reference)
    # Vectorised fillna replaces the original per-element Python lambda.
    train.loc[:, column] = train.loc[:, column].fillna(fill_value)
    test.loc[:, column] = test.loc[:, column].fillna(fill_value)


def impute(train, test):
    """Mean-impute missing predictor values using training-set statistics only.

    Every fill value is computed from ``train`` and then applied to both
    ``train`` and ``test``, so no test-set information is used.

    * columns in ``impute_list_1``: mean over training rows with ``enrolled_pre == 1``
    * columns in ``impute_list_2``: mean over all training rows
    * columns in ``impute_list_3``: mean over training rows with ``enrolled_nsc == 1``

    Parameters
    ----------
    train, test : pandas.DataFrame
        Frames containing the listed columns; ``train`` must also contain
        ``enrolled_pre`` and ``enrolled_nsc``.

    Returns
    -------
    tuple
        ``(train, test)`` -- the same objects that were passed in, modified
        in place.
    """
    for p in impute_list_1:
        _fill_with_reference_mean(train, test, p, train[train.enrolled_pre == 1][p])
    for p in impute_list_2:
        _fill_with_reference_mean(train, test, p, train[p])
    for p in impute_list_3:
        _fill_with_reference_mean(train, test, p, train[train["enrolled_nsc"] == 1][p])
    return train, test

In [22]:
# Fill missing predictor values; fill statistics come from the training pool.
train_new, test_part_new = impute(train, test_part)

# Stratification label: the outcome and the availability total joined as
# "<grad_6years>_<available_sum>", so both are balanced across the split.
strata = train['grad_6years'].astype(str) + "_" + train['available_sum'].astype(str)

# Hold out 10% of the training pool as a validation set (fixed seed).
train_part, valid_part = train_test_split(
    train_new,
    test_size=0.1,
    stratify=strata,
    random_state=54321,
)

In [24]:
# Four-character suffixes that mark term-specific columns: _fa1.._fa6,
# _sp1.._sp6, _su1.._su6 and _yr1.._yr6.
term_suffixes = {
    "_" + season + str(term_no)
    for season in ['fa', 'sp', 'su', 'yr']
    for term_no in range(1, 7, 1)
}

# Columns dropped from the non-term-specific predictors: the derived
# availability total plus the demographic variables.
excluded_predictors = set(
    ["available_sum"]
    + ["age_entry", "male", "white", "afam", "hisp", "other", "pell_0_ind", "pell_1_ind"]
    + ["phe_" + str(i) for i in range(1, 8)]
)

# Drop term-specific columns first, then skip the first ten survivors,
# then remove the excluded names (column order is preserved).
non_term_columns = [col for col in df.columns.values if col[-4:] not in term_suffixes]
predictors_2 = [col for col in non_term_columns[10:] if col not in excluded_predictors]
len(predictors_2)

40

#### Part2: Non-term-specific predictors

In [25]:
# Student id, the non-term-specific predictors, and the outcome -- ordered by
# student id so rows can be aligned with the term-specific data built later.
part2_columns = ['vccsid'] + predictors_2 + ['grad_6years']
part2_train = train_part.loc[:, part2_columns].sort_values(['vccsid'])
part2_valid = valid_part.loc[:, part2_columns].sort_values(['vccsid'])
part2_test = test_part_new.loc[:, part2_columns].sort_values(['vccsid'])

In [26]:
# Outcome vectors for the three splits.
train_y = np.array(part2_train['grad_6years'])
valid_y = np.array(part2_valid['grad_6years'])
test_y = np.array(part2_test['grad_6years'])

# Raw predictor matrices (rows follow the vccsid ordering of the frames).
part2_train_X = part2_train[predictors_2].values
part2_valid_X = part2_valid[predictors_2].values
part2_test_X = part2_test[predictors_2].values

# Rescale every predictor to [-1, 1]; the scaler is fitted on the training
# split only and then applied unchanged to all three splits.
scaler_2 = MinMaxScaler(feature_range=(-1, 1)).fit(part2_train_X)
part2_train_X = scaler_2.transform(part2_train_X)
part2_valid_X = scaler_2.transform(part2_valid_X)
part2_test_X = scaler_2.transform(part2_test_X)

In [28]:
# Write the scaled non-term-specific predictors and the outcome vectors to the
# lstm_data_2 folder (np.save appends the ".npy" extension).
part2_outputs = [
    ("part2_train_X", part2_train_X),
    ("part2_valid_X", part2_valid_X),
    ("part2_test_X", part2_test_X),
    ("train_y", train_y),
    ("valid_y", valid_y),
    ("test_y", test_y),
]
for array_name, array in part2_outputs:
    np.save(fpath + "/lstm_data_2/" + array_name, array)

#### Part1: Term-specific predictors

In [30]:
# Term-level records.  Read from the shared intermediate-files directory
# (fpath) like every other input/output of this notebook, instead of a
# separately hard-coded Windows path.
part1 = pd.read_stata(fpath + "/term_specific_part.dta")


def _term_rows_for(part2_frame):
    """Return the term-level rows of ``part1`` for the students in ``part2_frame``.

    Rows are ordered by student id and then term (``strm``), the two Pell
    columns are dropped, and an assertion checks that every student in
    ``part2_frame`` has at least one term-level row.
    """
    term_rows = part2_frame.loc[:, ['vccsid']].merge(part1, how='inner', on=['vccsid'])
    term_rows = term_rows.sort_values(['vccsid', 'strm'])
    term_rows = term_rows.drop(['pell_0_', 'pell_1_'], axis=1)
    assert len(np.unique(term_rows.vccsid)) == part2_frame.shape[0]
    return term_rows


part1_train = _term_rows_for(part2_train)
part1_valid = _term_rows_for(part2_valid)
part1_test = _term_rows_for(part2_test)

In [35]:
def create_id_rng(dat):
    """Return the row ranges occupied by each run of identical ``vccsid`` values.

    Parameters
    ----------
    dat : pandas.DataFrame
        Frame with a ``vccsid`` column.  Rows belonging to the same id are
        expected to be contiguous (e.g. the frame is sorted by ``vccsid``);
        an id that reappears after a different id starts a new range.

    Returns
    -------
    list of (int, int)
        Half-open positional ranges ``(start, stop)``, one per run, in row
        order.  An empty frame yields an empty list (previously this raised
        because the start index was never assigned).
    """
    n_rows = dat.shape[0]
    if n_rows == 0:
        return []
    ids = np.asarray(dat.vccsid)
    # A new run starts wherever an id differs from the one on the previous row.
    run_starts = [0] + (np.flatnonzero(ids[1:] != ids[:-1]) + 1).tolist()
    run_stops = run_starts[1:] + [n_rows]
    return list(zip(run_starts, run_stops))

In [36]:
# Row ranges of each student's term records within the three sorted frames.
train_id_rng, valid_id_rng, test_id_rng = (
    create_id_rng(frame) for frame in (part1_train, part1_valid, part1_test)
)

In [37]:
def _term_feature_values(frame):
    """Return the values of every column of ``frame`` from position 4 onward."""
    # NOTE(review): the first four columns are presumably id/term keys -- confirm.
    return frame.iloc[:, 4:].values


# Rescale the term-specific predictors to [-1, 1]; the scaler is fitted on the
# training rows only and then applied unchanged to all three splits.
scaler_1 = MinMaxScaler(feature_range=(-1, 1)).fit(_term_feature_values(part1_train))
part1_train_X = scaler_1.transform(_term_feature_values(part1_train))
part1_valid_X = scaler_1.transform(_term_feature_values(part1_valid))
part1_test_X = scaler_1.transform(_term_feature_values(part1_test))

In [38]:
# Cut each scaled matrix into one 2-D array per student, using the
# (start, stop) row ranges computed for the corresponding frame.
part1_train_X = [
    part1_train_X[start:stop, :] for start, stop in train_id_rng
]
part1_valid_X = [
    part1_valid_X[start:stop, :] for start, stop in valid_id_rng
]
part1_test_X = [
    part1_test_X[start:stop, :] for start, stop in test_id_rng
]

In [39]:
# Sanity check: each split must have exactly one term sequence per row of
# its non-term-specific predictor matrix.
for sequences, static_matrix in (
    (part1_train_X, part2_train_X),
    (part1_valid_X, part2_valid_X),
    (part1_test_X, part2_test_X),
):
    assert len(sequences) == static_matrix.shape[0]

In [40]:
# Pickle the per-student term sequences of each split to the lstm_data_2
# folder.  The sequences are looked up explicitly rather than through eval(),
# and each file is opened in a with-block so the handle is flushed and closed.
part1_sequences = (
    ('train', part1_train_X),
    ('valid', part1_valid_X),
    ('test', part1_test_X),
)
for v, sequences in part1_sequences:
    v_name = "part1_{}_X".format(v)
    with open(fpath + "/lstm_data_2/" + v_name + ".p", "wb") as out_file:
        pickle.dump(sequences, out_file)