In [1]:
# Python ≥3.5 is required
import sys
assert sys.version_info >= (3, 5)

# Scikit-Learn ≥0.20 is required
# NOTE(review): __version__ is compared as a string, i.e. lexicographically
# (e.g. "0.9" >= "0.20" is True and "10.0" >= "2.0" is False). This is not a
# general-purpose version check -- confirm it is adequate for the releases in use.
import sklearn
assert sklearn.__version__ >= "0.20"

# TensorFlow ≥2.0 is required (also a lexicographic string comparison)
import tensorflow as tf
from tensorflow import keras
assert tf.__version__ >= "2.0"

# Only prints a warning when TensorFlow lists no GPU device; execution continues.
if not tf.config.list_physical_devices('GPU'):
    print("No GPU was detected. LSTMs and CNNs can be very slow without a GPU.")

# Other common imports
import os
import pickle
import numpy as np
import pandas as pd
from sklearn.metrics import roc_auc_score, confusion_matrix, precision_recall_curve

# to make this notebook's output stable across runs
np.random.seed(42)
tf.random.set_seed(42)

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

No GPU was detected. LSTMs and CNNs can be very slow without a GPU.


In [1]:
# Base directory for the project's intermediate files (absolute, machine-specific path).
# NOTE: the string has no trailing separator, so a separator must be supplied
# whenever a file or sub-directory name is appended to it.
fpath = "C:/Users/ys8mz/Box Sync/Predictive Models of College Completion (VCCS)/intermediate_files"

In [2]:
def _load_pickle(path):
    """Return the object stored in the pickle file at `path`, closing the file afterwards."""
    with open(path, "rb") as fh:
        return pickle.load(fh)


# NOTE: unpickling can execute arbitrary code; only load files produced by this project.
# "part1" inputs are pickled objects, "part2" inputs and the y arrays are .npy files.
X_train_part1 = _load_pickle("lstm_data/part1_train_X.p")
X_valid_part1 = _load_pickle("lstm_data/part1_valid_X.p")
X_test_part1 = _load_pickle("lstm_data/part1_test_X.p")
X_train_part2 = np.load("lstm_data/part2_train_X.npy")
X_valid_part2 = np.load("lstm_data/part2_valid_X.npy")
X_test_part2 = np.load("lstm_data/part2_test_X.npy")
y_train = np.load("lstm_data/train_y.npy")
y_valid = np.load("lstm_data/valid_y.npy")
y_test = np.load("lstm_data/test_y.npy")

In [3]:
def generate_test_batches(X, X2):
    """Endlessly yield single-sample batches built from `X` and `X2`.

    For each index ``i`` in ``range(len(X))`` the generator yields a two-element
    list: ``X[i]`` with a new leading axis, and row ``i`` of ``X2`` with a new
    leading axis. After the last index it starts over from the first one, so the
    consumer decides how many batches to draw.
    """
    while True:
        for sample_idx in range(len(X)):
            first_input = X[sample_idx][np.newaxis, :, :]
            second_input = X2[np.newaxis, sample_idx, :]
            yield [first_input, second_input]

In [4]:
# load the best RNN model fitted: it's the lstm_model_10
# fpath has no trailing separator, so the path is built with os.path.join; plain string
# concatenation would point at ".../intermediate_filesoutput/lstm_model_10.h5".
model10 = keras.models.load_model(os.path.join(fpath, "output", "lstm_model_10.h5"))

In [5]:
# Score the validation set. The generator yields one sample per step, so the number of
# steps equals the number of samples. Model.predict accepts generators directly;
# predict_generator is deprecated (see the warning this cell used to emit).
# verbose=0 keeps the call silent, as predict_generator was.
y_valid_pred = np.asarray(
    model10.predict(generate_test_batches(X_valid_part1, X_valid_part2),
                    steps=len(X_valid_part1),
                    verbose=0))
valid_auc_10 = roc_auc_score(y_valid, y_valid_pred)
print("\nLSTM Model 10:\nC-statistic = {}\n\n".format(valid_auc_10))

Instructions for updating:
Please use Model.predict, which supports generators.

LSTM Model 10:
C-statistic = 0.892074098659827




In [6]:
# Score the test set. The generator yields one sample per step, so the number of
# steps equals the number of samples. Model.predict accepts generators directly;
# predict_generator is deprecated. verbose=0 keeps the call silent, as
# predict_generator was.
y_test_pred = np.asarray(
    model10.predict(generate_test_batches(X_test_part1, X_test_part2),
                    steps=len(X_test_part1),
                    verbose=0))
test_auc_10 = roc_auc_score(y_test, y_test_pred)
print("\nLSTM Model 10:\nC-statistic = {}\n\n".format(test_auc_10))


LSTM Model 10:
C-statistic = 0.89588538364115




In [7]:
# Score the training set. The generator yields one sample per step, so the number of
# steps equals the number of samples. Model.predict accepts generators directly;
# predict_generator is deprecated. verbose=0 keeps the call silent, as
# predict_generator was.
y_train_pred = np.asarray(
    model10.predict(generate_test_batches(X_train_part1, X_train_part2),
                    steps=len(X_train_part1),
                    verbose=0))
train_auc_10 = roc_auc_score(y_train, y_train_pred)
print("\nLSTM Model 10:\nC-statistic = {}\n\n".format(train_auc_10))


LSTM Model 10:
C-statistic = 0.9003391038365796




In [11]:
# Persist the first output column of the test-set predictions.
# os.path.join supplies the separator that fpath lacks (plain concatenation would write
# to ".../intermediate_filesy_test_pred_rnn.p"), and the context manager guarantees
# the file is flushed and closed.
with open(os.path.join(fpath, "y_test_pred_rnn.p"), "wb") as pred_file:
    pickle.dump(y_test_pred[:,0], pred_file)

In [15]:
def find_optimal_threshold(p,r,t):
    """Return the threshold in `t` that maximizes the F1 score.

    Parameters
    ----------
    p, r : array-like
        Precision and recall values. Only the first ``len(p) - 1`` entries are
        used, matching the layout returned by ``precision_recall_curve``, whose
        final point has no associated threshold.
    t : array-like
        Thresholds, aligned element-wise with ``p[:-1]`` and ``r[:-1]``.

    Returns
    -------
    The element of `t` with the highest F1 score; ties resolve to the first one.

    Raises
    ------
    ValueError
        If no candidate remains after discarding null entries.
    """
    p = np.asarray(p)[:-1]
    r = np.asarray(r)[:-1]
    t = np.asarray(t)
    # Discard positions where precision, recall or the threshold is null.
    valid = ~(pd.isnull(p) | pd.isnull(r) | pd.isnull(t))
    if not valid.any():
        raise ValueError("no valid (precision, recall, threshold) triple to choose from")
    p, r, t = p[valid].astype(float), r[valid].astype(float), t[valid]
    # F1 is 0/0 when precision and recall are both zero. Score those points as 0 so
    # that np.argmax does not latch onto a NaN (it treats NaN as the maximum).
    denom = p + r
    f1 = np.zeros_like(denom)
    np.divide(2 * p * r, denom, out=f1, where=denom != 0)
    return t[np.argmax(f1)]

In [18]:
# Precision/recall curve on the validation set; find_optimal_threshold picks the
# threshold with the highest F1 on that curve. The last line displays the value.
p,r,t = precision_recall_curve(y_valid, y_valid_pred)
best_threshold = find_optimal_threshold(p,r,t)
best_threshold

0.3519462

In [19]:
# Binarize the test-set scores at the threshold chosen on the validation set.
# precision_recall_curve counts a sample as positive when score >= threshold, so the
# same inclusive comparison is used here (a strict > would flip any sample scoring
# exactly at the cut-off).
cm_arr = confusion_matrix(y_test, np.where(y_test_pred >= best_threshold, 1, 0))
cm_df = pd.DataFrame(cm_arr, columns=['Pred_0','Pred_1'], index=['Real_0', 'Real_1'])
# Append margins: the extra column holds row totals (actual class counts), the
# extra row holds column totals (predicted class counts).
cm_df.loc[:,''] = cm_df.sum(axis=1)
cm_df.loc['',:] = cm_df.sum(axis=0)
# Precision = diagonal cell / column total; recall = diagonal cell / row total.
p1 = cm_df.iloc[1,1]/cm_df.iloc[2,1]
r1 = cm_df.iloc[1,1]/cm_df.iloc[1,2]
p0 = cm_df.iloc[0,0]/cm_df.iloc[2,0]
r0 = cm_df.iloc[0,0]/cm_df.iloc[0,2]
print("F1 score 1 = {}".format(round(2*p1*r1/(p1+r1),4)))
print("F1 score 0 = {}".format(round(2*p0*r0/(p0+r0),4)))

F1 score 1 = 0.7582
F1 score 0 = 0.866


In [20]:
# Precision/Recall for both graduates and non-graduates
# Displayed as a pair of arrays: (precision, recall) for class 1, then for class 0,
# each rounded to four decimals.
np.array([p1, r1]).round(4), np.array([p0, r0]).round(4)

(array([0.7274, 0.7916]), array([0.8868, 0.8462]))