In [None]:
import xgboost as xgb
import pandas as pd
import sklearn as sk
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.model_selection import LeavePGroupsOut, cross_val_score, GroupShuffleSplit
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import GroupKFold

from sklearn.dummy import DummyClassifier
from imblearn.over_sampling import SMOTE
from sklearn import metrics
from math import sqrt

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [None]:
def standard_error(roc_auc, num_pos, num_neg):
    """Return the standard error of a ROC AUC estimate.

    The expression implemented here has the form of the Hanley & McNeil
    approximation: two auxiliary quantities, ``roc_auc / (2 - roc_auc)`` and
    ``2 * roc_auc**2 / (1 + roc_auc)``, are each compared against
    ``roc_auc**2`` and weighted by the respective class size minus one.

    Args:
        roc_auc: Area under the ROC curve.
        num_pos: Number of positive samples.
        num_neg: Number of negative samples.

    Returns:
        The square root of the estimated variance of ``roc_auc``.

    Raises:
        ZeroDivisionError: If ``num_pos * num_neg`` is an integer zero.
    """
    auc_sq = roc_auc ** 2
    q1 = roc_auc / (2 - roc_auc)
    q2 = (2 * auc_sq) / (1 + roc_auc)
    pos_term = (num_pos - 1) * (q1 - auc_sq)
    neg_term = (num_neg - 1) * (q2 - auc_sq)
    variance = (roc_auc * (1 - roc_auc) + pos_term + neg_term) / (num_pos * num_neg)
    return sqrt(variance)

def z_score(roc_auc1, roc_auc2, se1, se2):
    """Return the z statistic for the difference between two ROC AUC values.

    The difference ``roc_auc1 - roc_auc2`` is divided by the combined
    standard error ``sqrt(se1**2 + se2**2)``.

    Args:
        roc_auc1: First ROC AUC value.
        roc_auc2: Second ROC AUC value (subtracted from the first).
        se1: Standard error of ``roc_auc1``.
        se2: Standard error of ``roc_auc2``.

    Returns:
        The signed z statistic; positive when ``roc_auc1`` is the larger one.
    """
    difference = roc_auc1 - roc_auc2
    combined_se = sqrt(se1 ** 2 + se2 ** 2)
    return difference / combined_se

def TrainModels(df, groupID, nSplits, text, clf, name, sm=True):
    """Cross-validate ``clf`` on ``df`` with grouped folds and report the pooled ROC AUC.

    Rows that share a value in ``df[groupID]`` always land in the same fold
    (``GroupKFold``). On every fold the classifier is refit on the training
    rows (optionally oversampled with SMOTE) and the predicted probability of
    the second class is stored for the held-out rows. The out-of-fold
    probabilities of all folds are pooled into one frame, from which a single
    ROC AUC and its standard error (``standard_error``) are computed, printed
    and returned.

    This function reads the module-level names ``features`` (feature column
    names) and ``label`` (target column name); both must be defined before
    it is called.

    Args:
        df: DataFrame containing the feature columns, the label column and
            the group column.
        groupID: Name of the column whose values define the groups.
        nSplits: Number of GroupKFold folds.
        text: Tag printed at the start of the result line.
        clf: Estimator exposing ``fit`` and ``predict_proba``; it is refit in
            place on every fold.
        name: Display name of the classifier used in the printed result.
        sm: When truthy, each training fold is oversampled with
            ``SMOTE(random_state=42)`` before fitting.

    Returns:
        list: ``[roc_auc, standard_error]`` for the pooled out-of-fold
        predictions.
    """
    groups = df[groupID]
    splitter = GroupKFold(n_splits=nSplits)

    # Collect per-fold predictions and concatenate once at the end
    # (DataFrame.append no longer exists in pandas >= 2.0).
    fold_predictions = []
    for train_indices, test_indices in splitter.split(X=df, groups=groups):
        train_inst = df.iloc[train_indices]
        test_inst = df.iloc[test_indices]

        train_X = train_inst[features].copy()
        train_y = train_inst[label].copy()
        test_x = test_inst[features].copy()
        test_y = test_inst[label].copy()

        if sm:
            # Oversample the training fold only; the held-out fold stays untouched.
            sampler = SMOTE(random_state=42)
            train_X, train_y = sampler.fit_resample(train_X, train_y)

        clf.fit(train_X, train_y)
        probs = clf.predict_proba(test_x)

        fold_frame = test_y.to_frame()
        # Column 1 of predict_proba is the probability of the second class.
        fold_frame["Prob"] = probs[:, 1]
        fold_predictions.append(fold_frame)

    finalPreds = pd.concat(fold_predictions)
    roc = roc_auc_score(finalPreds[label], finalPreds["Prob"])

    # Class sizes are looked up by label value (1 and 0) on the full data set.
    class_counts = df[label].value_counts()
    new_se = standard_error(roc, class_counts[1], class_counts[0])

    print("{0}: {1} roc_auc: {2}".format(text,name,roc))
    print ("StandardError = {0}".format(new_se))
    return [roc, new_se]
def remove_outliers(df, id_column):
    """Drop rows lying outside the 1st-99th percentile band of any value column.

    Every column except ``id_column`` and ``'Target'`` is processed in column
    order. For each one, rows whose value is not between that column's 0.01
    and 0.99 quantiles (bounds included) are dropped. Rows are removed from
    ``df`` in place, so the quantiles of later columns are computed on the
    rows that survived the earlier columns.

    Args:
        df: DataFrame to trim; it is modified in place.
        id_column: Name of the identifier column to leave unchecked.

    Returns:
        The same ``df`` object, after trimming.
    """
    for column in df.columns:
        if column == id_column or column == 'Target':
            continue
        values = df[column]
        lower_bound = values.quantile(0.01)
        upper_bound = values.quantile(0.99)
        within_band = values.between(lower_bound, upper_bound)
        rows_to_drop = df.loc[~within_band].index
        df.drop(rows_to_drop, inplace=True)
    return df

In [None]:
# Load the new and the old training sets. Missing values are filled with the
# column medians of the data set they belong to (numeric columns only).
df = pd.read_csv("training_data.csv")
df = df.fillna(df.median(numeric_only=True))
df2 = pd.read_csv("old_training_data.csv")
df2 = df2.fillna(df2.median(numeric_only=True))

# Just in case, remove any features not shared by the two data sets. The
# columns are kept in df's order: going through a set would make the column
# (and therefore feature) order vary between runs.
common_cols = [col for col in df.columns if col in df2.columns]
df = df[common_cols]
df2 = df2[common_cols]

# Class balance of each data set. The y / y_old series are only created in a
# later cell, so read the Target column directly here.
print(df["Target"].value_counts())
print(df2["Target"].value_counts())

In [None]:
#df = remove_outliers(df, 'StudentID')
#df2 = remove_outliers(df2, 'StudentID')

In [None]:
#Spot Check Means
# Only the last expression of a cell is rendered automatically, so the
# summary tables are printed explicitly instead of being silently discarded.
print(df.describe())
print(df2.describe())

# Side-by-side column means of the new and old data (the ID column is skipped).
for c in df.columns.to_list():
    if c != 'StudentID':
        print("{2} -- MeanNEW: {0} , MeanOLD {1}".format(df[c].mean(), df2[c].mean(), c))

In [None]:
# Separate each data set into its Target series and a frame of all the
# remaining columns (new data first, then old data).
y, X = df["Target"], df.drop(columns=["Target"])

y_old, X_old = df2["Target"], df2.drop(columns=["Target"])

In [None]:
# Print the number of rows per Target class: new data first, then old data.
for target_series in (y, y_old):
    print(target_series.value_counts())

In [None]:
# Each entry pairs a display name with the estimator it labels, so the two
# can never drift out of step. Disabled candidates are kept as comments.
model_specs = [
    ("Nearest Neighbors", KNeighborsClassifier(3)),
    # ("Gaussian Process", GaussianProcessClassifier(1.0 * RBF(1.0))),
    ("Decision Tree", DecisionTreeClassifier(max_depth=5, random_state=42)),
    (
        "Random Forest",
        RandomForestClassifier(max_depth=5, n_estimators=30, max_features=30, random_state=42),
    ),
    ("Neural Net", MLPClassifier(alpha=1, max_iter=1000, random_state=42)),
    ("AdaBoost", AdaBoostClassifier(random_state=42)),
    ("XGBoost", xgb.XGBClassifier(random_state=42)),
    # ("Naive Bayes", GaussianNB()),
]

# Parallel lists consumed by the evaluation loop.
names = [model_name for model_name, _ in model_specs]
classifiers = [estimator for _, estimator in model_specs]


In [None]:
# Feature frames without the grouping key.
X_train = X.drop(["StudentID"], axis=1)
X_old_train = X_old.drop(["StudentID"], axis=1)

# Column names used by TrainModels and by the old-to-new evaluation below.
label = "Target"
features = [col for col in df.columns if col not in (label, "StudentID")]

# SMOTE-balanced copy of the old data used to fit the old-to-new models. The
# seed is fixed, so the result is the same for every classifier and is
# computed once here. (These names used to be left undefined, which only
# worked with leftover kernel state.)
old_sampler = SMOTE(random_state=42)
oldX_train, oldY_train = old_sampler.fit_resample(df2[features], df2[label])

# Class sizes of the new data, looked up by label value (1 and 0).
new_class_counts = y.value_counts()
num_pos_new = new_class_counts[1]
num_neg_new = new_class_counts[0]

res_NEW = {}
res_OLD = {}
for name, clf in zip(names, classifiers):
    # Grouped cross-validation within each data set: [roc_auc, standard_error].
    res_NEW[name] = TrainModels(df, "StudentID", 4, "NEW", clf, name)
    res_OLD[name] = TrainModels(df2, "StudentID", 4, "OLD", clf, name)

    # Old-to-new transfer: fit on all of the (balanced) old data ...
    clf.fit(oldX_train, oldY_train)
    # ... and score on the full new data.
    new_data_probs = clf.predict_proba(df[features])[:, 1]
    roc_auc_old_to_new = metrics.roc_auc_score(df[label], new_data_probs)
    print("OldToNew: " + name + " roc_auc: "+ str(roc_auc_old_to_new))
    old_to_new_se = standard_error(roc_auc_old_to_new, num_pos_new, num_neg_new)
    print ("StandardError = ", old_to_new_se)

    # Compare the transferred model against both cross-validated baselines.
    print("Z-Score OTN versus N: ", z_score(res_NEW[name][0], roc_auc_old_to_new, res_NEW[name][1], old_to_new_se))
    print("Z-Score OTN versus O: ", z_score(roc_auc_old_to_new, res_OLD[name][0], old_to_new_se, res_OLD[name][1]), "\n")


In [None]:
#Old to New - this gave slightly different results from the loop above.
#Suspected cause: the stochastic nature of some of the models.
#Answer: yes - fixing random_state locked the results in.
#for name, clf in zip(names, classifiers):
#        clf.fit(X_old_train, y_old)
#        roc_auc = metrics.roc_auc_score(y,clf.predict_proba(X_train)[:, 1])
#        ConfusionMatrixDisplay.from_estimator(clf,X_train,y)
#        plt.show()
#        print(name + " roc_auc:" + str(roc_auc))

In [None]:
#New Feature Importance
# Fit XGBoost on the SMOTE-balanced new data and plot its highest-ranked
# feature importances. The model is seeded like the XGBoost entry of the
# classifier list.
clf = xgb.XGBClassifier(random_state=42)
sm = SMOTE(random_state=42)
X_train, Y_train = sm.fit_resample(df[features], df[label])

# Train the model
clf.fit(X_train, Y_train)
importances = clf.feature_importances_
# Never request more bars than there are features; barh() would otherwise
# receive positions and widths of different lengths.
num_features = min(15, len(importances))
# argsort is ascending, so the most important features sit at the end.
indices = np.argsort(importances)


fig, ax = plt.subplots()
ax.set_title('New Data Top {0} Features'.format(num_features))
ax.set_xlabel('Feature Importance')
ax.barh(range(num_features), importances[indices][-num_features:])
ax.set_yticks(range(num_features))
_ = ax.set_yticklabels(np.array(X_train.columns)[indices][-num_features:])
plt.tight_layout()
fig.savefig('new_top_features.png')

In [None]:
#Old Feature Importance
# Fit XGBoost on the SMOTE-balanced old data and plot its highest-ranked
# feature importances. The model is seeded like the XGBoost entry of the
# classifier list.
clf = xgb.XGBClassifier(random_state=42)

sm = SMOTE(random_state=42)
X_train, Y_train = sm.fit_resample(df2[features], df2[label])

# Train the model
clf.fit(X_train,Y_train)
importances = clf.feature_importances_
# Never request more bars than there are features; barh() would otherwise
# receive positions and widths of different lengths.
num_features = min(15, len(importances))
# argsort is ascending, so the most important features sit at the end.
indices = np.argsort(importances)


fig, ax = plt.subplots()
ax.set_title("Old Data Top {0} Features".format(num_features))
ax.set_xlabel('Feature Importance')
ax.barh(range(num_features), importances[indices][-num_features:])
ax.set_yticks(range(num_features))
_ = ax.set_yticklabels(np.array(X_train.columns)[indices][-num_features:])
plt.tight_layout()
fig.savefig('old_top_features.png')

In [None]:
# Z statistic between two fixed ROC AUC values, one for the old and one for
# the new data.
# NOTE(review): the two AUC values are hard-coded - presumably copied from an
# earlier run; confirm which model/run they belong to.
roc_auc_old, roc_auc_new = 0.833, 0.468

# Class sizes are looked up by label value (1 and 0).
new_target_counts = y.value_counts()
new_se = standard_error(roc_auc_new, new_target_counts[1], new_target_counts[0])
old_target_counts = y_old.value_counts()
old_se = standard_error(roc_auc_old, old_target_counts[1], old_target_counts[0])

old_vs_new_z = z_score(roc_auc_new, roc_auc_old, new_se, old_se)
print("Z-Score O versus N: ", old_vs_new_z, "\n")