In [4]:
import pickle
import pandas as pd
import numpy as np
from scipy.stats.mstats import gmean
import sklearn
import statsmodels.formula.api as sm
from statsmodels.tools import add_constant
from statsmodels.genmod.generalized_linear_model import GLM
from statsmodels.genmod import families
from sklearn.linear_model import LogisticRegression, LinearRegression
from numpy.linalg import LinAlgError
from sklearn.metrics import precision_recall_curve, roc_auc_score, confusion_matrix, precision_score, recall_score
from sklearn.model_selection import KFold, StratifiedKFold, train_test_split
import seaborn as sns
import matplotlib.pyplot as plt
fpath = "/Users/ys8mz/Box Sync/Predictive Models of College Completion (VCCS)/intermediate_files"

In [5]:
df = pd.read_stata(fpath + "/full_data_truncated.dta")
df.loc[:,'available_sum'] = 0
for p in [p for p in list(df.columns)[10:] if p.startswith("available") and p != "available_sum"]:
    df.loc[:,'available_sum'] += df[p]

In [3]:
train_df_old = df[df.valid == 0]
test_df = df[df.valid == 1]
print(train_df_old.shape,test_df.shape)

(298139, 342) (33115, 342)


##### Randomly select 10% observations from the original training sample

In [4]:
_, train_df = train_test_split(train_df_old, test_size=0.1, stratify=train_df_old['grad_6years'].astype(str)+"_"+train_df_old['available_sum'].astype(str), random_state=54321)

In [5]:
predictors = list(df.columns)[10:-1]
print(len(predictors))
impute_list_1 = set(["prop_comp_pre","cum_gpa_pre"])
impute_list_2 = set([t1+"_"+t2+str(t3) for t1 in ["term_gpa", "prop_comp", "lvl2_prop_comp", "dev_prop_comp"] for t2 in ["fa", "sp", "su"] for t3 in range(1,7,1)])
impute_list_3 = set(["cum_gpa", "lvl2_prop_comp", "dev_prop_comp", "prop_comp", "prop_comp_sd", "withdrawn_prop_comp_sd"])
impute_list_4 = set(["admrate", "gradrate", "satvr25", "satvr75", "satmt25", "satmt75", "satwr25", "satwr75"])

331


In [6]:
# degree completion rate of validation sample
sum(train_df.grad_6years)/train_df.shape[0]

0.34202052726906823

In [7]:
# degree completion rate of validation sample
sum(test_df.grad_6years)/test_df.shape[0]

0.34144647440736825

In [8]:
def impute(train, test):
    for p in impute_list_1:
        avg_p = np.nanmean(train[train.enrolled_pre == 1][p])
        train.loc[:,p] = train.loc[:,p].apply(lambda x: avg_p if pd.isnull(x) else x)
        test.loc[:,p] = test.loc[:,p].apply(lambda x: avg_p if pd.isnull(x) else x)
    for p in impute_list_3:
        avg_p = np.nanmean(train[p])
        train.loc[:,p] = train.loc[:,p].apply(lambda x: avg_p if pd.isnull(x) else x)
        test.loc[:,p] = test.loc[:,p].apply(lambda x: avg_p if pd.isnull(x) else x)
    for p in impute_list_2:
        suffix = p[-3:]
        avg_p = np.nanmean(train[train["enrolled_" + suffix] == 1][p])
        train.loc[:,p] = train.loc[:,p].apply(lambda x: avg_p if pd.isnull(x) else x)
        test.loc[:,p] = test.loc[:,p].apply(lambda x: avg_p if pd.isnull(x) else x)
    for p in impute_list_4:
        avg_p = np.nanmean(train[train["enrolled_nsc"] == 1][p])
        train.loc[:,p] = train.loc[:,p].apply(lambda x: avg_p if pd.isnull(x) else x)
        test.loc[:,p] = test.loc[:,p].apply(lambda x: avg_p if pd.isnull(x) else x)
    return train, test                    

In [9]:
train_df_new, test_df_new = impute(train_df, test_df)
X_train = train_df_new.loc[:,predictors]
y_train = train_df_new.grad_6years
X_test = test_df_new.loc[:,predictors]
y_test = test_df_new.grad_6years

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


In [10]:
results_dir = "C:\\Users\\ys8mz\\Box Sync\\Predictive Models of College Completion (VCCS)\\evaluation_results\\smaller_training_sample\\test1\\"

#### (1) Run the basic version of logistic regression

In [11]:
try:
    lr = sm.Logit(y_train, add_constant(X_train,prepend=True)).fit()
except LinAlgError:
    lr = LogisticRegression(random_state=1234)
    lr.fit(X_train, y_train)

  return np.sum(np.log(self.cdf(q*np.dot(X,params))))


         Current function value: inf
         Iterations: 35




In [12]:
print("Logistic Regression:")
if type(lr) == LogisticRegression:
    y_test_pred_lr = list(lr.predict_proba(X_test)[:,1])
else:
    y_test_pred_lr = list(lr.predict(add_constant(X_test, prepend=True)))
print("AUC = {}".format(round(roc_auc_score(y_test, y_test_pred_lr),4)))

Logistic Regression:
AUC = 0.8805


In [25]:
# save the model object and predicted scores on the validation sample to local disk
# pickle.dump(lr, open(results_dir + "/lr.p", "wb"))
pickle.dump(list(y_test_pred_lr), open(results_dir + "/y_test_pred_lr.p", "wb"))

In [48]:
def find_optimal_threshold(p,r,t):
    to_drop = np.union1d(np.where(pd.isnull(p[:-1]) == True)[0], np.where(pd.isnull(r[:-1]) == True)[0])
    to_drop = np.union1d(to_drop, np.where(pd.isnull(t) == True)[0])
    to_keep = np.setdiff1d(np.array(list(range(len(p)-1))), to_drop)
    p,r,t = p[to_keep],r[to_keep],t[to_keep]
    f1 = 2*p*r/(p+r)
    best_t = t[np.argmax(f1)]
    best_t
    return best_t

In [49]:
def cross_validation_lr(train):
    threshold_list = []
    auc_list = []
    k_fold =  StratifiedKFold(n_splits = 10, random_state = 12345, shuffle=True)
    for train_indices, test_indices in k_fold.split(train, train.grad_6years):
        train_part = train.iloc[train_indices,:]
        test_part = train.iloc[test_indices,:]
        train_part_new, test_part_new = impute(train_part, test_part)
        X_1 = train_part_new.loc[:,predictors]
        y_1 = train_part_new.grad_6years
        X_2 = test_part_new.loc[:,predictors]
        y_2 = test_part_new.grad_6years
        model = LogisticRegression(random_state=1234)
        model.fit(X_1,y_1)
        p,r,t = precision_recall_curve(y_2, model.predict_proba(X_2)[:,1])
        auc = roc_auc_score(y_2, model.predict_proba(X_2)[:,1])
        threshold_list.append(find_optimal_threshold(p,r,t))
        auc_list.append(auc)
    print(np.mean(auc_list), np.std(auc_list, ddof=1))
    return gmean(threshold_list)                                   

In [50]:
best_threshold = cross_validation_lr(train_df)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/

0.8755915092902458 0.005418749221475392


In [51]:
best_threshold

0.38643128923361736

In [52]:
def create_confusion_matrix(y_test_pred, threshold, fname):
    cm_arr = confusion_matrix(y_test, np.where(np.array(y_test_pred) > threshold, 1, 0))
    cm_df = pd.DataFrame(cm_arr, columns=['Pred_0','Pred_1'], index=['Real_0', 'Real_1'])
    cm_df.loc[:,''] = cm_df.sum(axis=1)
    cm_df.loc['',:] = cm_df.sum(axis=0)
    print(cm_df)
    print("")
    p1 = cm_df.iloc[1,1]/cm_df.iloc[2,1]
    r1 = cm_df.iloc[1,1]/cm_df.iloc[1,2]
    p0 = cm_df.iloc[0,0]/cm_df.iloc[2,0]
    r0 = cm_df.iloc[0,0]/cm_df.iloc[0,2]    
    print("F1 score = {}".format(round(2*p1*r1/(p1+r1),4)))    
    cm_df.to_csv(results_dir + fname + ".csv")
    return p1,r1,p0,r0,round(2*p1*r1/(p1+r1),4)

In [53]:
print("F1 threshold = {}:\n".format(str(round(best_threshold,3))))
pr_lr = create_confusion_matrix(y_test_pred_lr, best_threshold, "LR1_cm1")

F1 threshold = 0.386:

         Pred_0   Pred_1         
Real_0  18189.0   3619.0  21808.0
Real_1   2512.0   8795.0  11307.0
        20701.0  12414.0  33115.0

F1 score = 0.7415


In [54]:
num_of_0 = int(round((1-np.mean(train_df.grad_6years))*len(y_test)))
y_test_pred_binary = np.ones(len(y_test))
y_test_pred_binary[np.argsort(y_test_pred_lr)[:num_of_0]] = 0
alternative_threshold = y_test_pred_lr[np.argsort(y_test_pred_lr)[num_of_0]]
print("Alternative threshold = {}:\n".format(str(round(alternative_threshold,3))))
pr2_lr = create_confusion_matrix(y_test_pred_binary, best_threshold, "LR1_cm2")

Alternative threshold = 0.438:

         Pred_0   Pred_1         
Real_0  18817.0   2991.0  21808.0
Real_1   2972.0   8335.0  11307.0
        21789.0  11326.0  33115.0

F1 score = 0.7365


In [55]:
precision_recall_df = pd.DataFrame([(best_threshold,)+pr_lr,(alternative_threshold,)+pr2_lr]).round(4)
precision_recall_df.index = ['F1','Same_Graduation_Rate']
precision_recall_df.columns = ['threshold','precision_1','recall_1','precision_0','recall_0','f1_score']
precision_recall_df.to_csv(results_dir + "LR1_precision_recall.csv", index=True)

#### (2) Run the OLS model

In [13]:
try:
    ols = sm.OLS(y_train, add_constant(X_train,prepend=True)).fit()
except LinAlgError:
    ols = LinearRegression()
    ols.fit(X_train, y_train)

In [14]:
print("OLS:")
if type(ols) == LinearRegression:
    y_test_pred_ols = list(ols.predict(X_test))
else:
    y_test_pred_ols = list(ols.predict(add_constant(X_test, prepend=True)))
print("AUC = {}".format(round(roc_auc_score(y_test, y_test_pred_ols),4)))

OLS:
AUC = 0.8762


In [65]:
# pickle.dump(ols, open(results_dir + "/ols.p", "wb"))
pickle.dump(list(y_test_pred_ols), open(results_dir + "y_test_pred_ols.p", "wb"))

In [69]:
def cross_validation_ols(train):
    threshold_list = []
    auc_list = []
    k_fold =  StratifiedKFold(n_splits = 10, random_state = 12345, shuffle=True)
    for train_indices, test_indices in k_fold.split(train, train.grad_6years):
        train_part = train.iloc[train_indices,:]
        test_part = train.iloc[test_indices,:]
        train_part_new, test_part_new = impute(train_part, test_part)
        X_1 = train_part_new.loc[:,predictors]
        y_1 = train_part_new.grad_6years
        X_2 = test_part_new.loc[:,predictors]
        y_2 = test_part_new.grad_6years
        model = LinearRegression()
        model.fit(X_2,y_2)
        p,r,t = precision_recall_curve(y_2, model.predict(X_2))
        auc = roc_auc_score(y_2, model.predict(X_2))
        threshold_list.append(find_optimal_threshold(p,r,t))
        auc_list.append(auc)
    print(np.mean(auc_list), np.std(auc_list, ddof=1))
    return gmean(threshold_list)   

In [70]:
best_threshold_3 = cross_validation_ols(train_df)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/

0.9034157202934319 0.005268666942512579


In [71]:
best_threshold_3

0.4250315407143919

In [72]:
print("F1 threshold = {}:\n".format(str(round(best_threshold_3,3))))
pr_ols = create_confusion_matrix(y_test_pred_ols, best_threshold_3, "OLS_cm1")

F1 threshold = 0.425:

         Pred_0   Pred_1         
Real_0  18232.0   3576.0  21808.0
Real_1   2673.0   8634.0  11307.0
        20905.0  12210.0  33115.0

F1 score = 0.7343


In [73]:
num_of_0 = int(round((1-np.mean(train_df.grad_6years))*len(y_test)))
y_test_pred_binary = np.ones(len(y_test))
y_test_pred_binary[np.argsort(y_test_pred_ols)[:num_of_0]] = 0
alternative_threshold_3 = y_test_pred_ols[np.argsort(y_test_pred_ols)[num_of_0]]
print("Alternative threshold = {}:\n".format(str(round(alternative_threshold_3,3))))
pr2_ols = create_confusion_matrix(y_test_pred_binary, best_threshold_3, "OLS_cm2")

Alternative threshold = 0.454:

         Pred_0   Pred_1         
Real_0  18726.0   3082.0  21808.0
Real_1   3063.0   8244.0  11307.0
        21789.0  11326.0  33115.0

F1 score = 0.7285


In [74]:
precision_recall_df_3 = pd.DataFrame([(best_threshold_3,)+pr_ols,(alternative_threshold_3,)+pr2_ols]).round(4)
precision_recall_df_3.index = ['F1','Same_Graduation_Rate']
precision_recall_df_3.columns = ['threshold','precision_1','recall_1','precision_0','recall_0','f1_score']
precision_recall_df_3.to_csv(results_dir + "OLS_precision_recall.csv", index=True)

#### (3) Comparison with reduced validation sample

##### Randomly select 10% observations from the original validation sample

In [15]:
test_df_old = df[df.valid == 1]
_, test_df_reduced = train_test_split(test_df_old, test_size=0.1, stratify=test_df_old['grad_6years'].astype(str)+"_"+test_df_old['available_sum'].astype(str), random_state=54321)

In [16]:
_, test_df_reduced_new = impute(train_df, test_df_reduced)
X_test_reduced = test_df_reduced_new.loc[:,predictors]
y_test_reduced = test_df_reduced_new.grad_6years

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


In [17]:
results_dir_new1 = "C:\\Users\\ys8mz\\Box Sync\\Predictive Models of College Completion (VCCS)\\evaluation_results\\smaller_training_sample\\test1\\comparison_1\\"

In [18]:
print("Logistic Regression:")
if type(lr) == LogisticRegression:
    y_test_reduced_pred_lr = list(lr.predict_proba(X_test_reduced)[:,1])
else:
    y_test_reduced_pred_lr = list(lr.predict(add_constant(X_test_reduced, prepend=True)))
print("AUC = {}".format(round(roc_auc_score(y_test_reduced, y_test_reduced_pred_lr),4)))

Logistic Regression:
AUC = 0.8815


In [19]:
pickle.dump(list(y_test_reduced_pred_lr), open(results_dir_new1 + "y_test_pred_lr.p", "wb"))

In [48]:
def create_confusion_matrix_new(y_test, y_test_pred, threshold, fpath, fname):
    cm_arr = confusion_matrix(y_test, np.where(np.array(y_test_pred) > threshold, 1, 0))
    cm_df = pd.DataFrame(cm_arr, columns=['Pred_0','Pred_1'], index=['Real_0', 'Real_1'])
    cm_df.loc[:,''] = cm_df.sum(axis=1)
    cm_df.loc['',:] = cm_df.sum(axis=0)
    print(cm_df)
    print("")
    p1 = cm_df.iloc[1,1]/cm_df.iloc[2,1]
    r1 = cm_df.iloc[1,1]/cm_df.iloc[1,2]
    p0 = cm_df.iloc[0,0]/cm_df.iloc[2,0]
    r0 = cm_df.iloc[0,0]/cm_df.iloc[0,2]    
    print("F1 score = {}".format(round(2*p1*r1/(p1+r1),4)))    
    cm_df.to_csv(fpath + fname + ".csv")
    return p1,r1,p0,r0,round(2*p1*r1/(p1+r1),4)

In [61]:
def create_pr(train_df, y_test, y_test_pred, best_threshold, fpath, mn):
    print("F1 threshold = {}:\n".format(str(round(best_threshold,3))))
    pr_lr = create_confusion_matrix_new(y_test, y_test_pred, best_threshold, fpath, "{}_cm1".format(mn))

    num_of_0 = int(round((1-np.mean(train_df.grad_6years))*len(y_test)))
    y_test_pred_binary = np.ones(len(y_test))
    y_test_pred_binary[np.argsort(y_test_pred)[:num_of_0]] = 0
    alternative_threshold = y_test_pred[np.argsort(y_test_pred)[num_of_0]]
    print("\n\n")
    print("Alternative threshold = {}:\n".format(str(round(alternative_threshold,3))))
    pr2_lr = create_confusion_matrix_new(y_test, y_test_pred_binary, best_threshold, fpath, "{}_cm2".format(mn))

    precision_recall_df = pd.DataFrame([(best_threshold,)+pr_lr,(alternative_threshold,)+pr2_lr]).round(4)
    precision_recall_df.index = ['F1','Same_Graduation_Rate']
    precision_recall_df.columns = ['threshold','precision_1','recall_1','precision_0','recall_0','f1_score']
    precision_recall_df.to_csv(fpath + "{}_precision_recall.csv".format(mn), index=True)

In [62]:
create_pr(train_df, y_test_reduced, y_test_reduced_pred_lr, best_threshold, results_dir_new1, "LR1")

F1 threshold = 0.386:

        Pred_0  Pred_1        
Real_0  1839.0   342.0  2181.0
Real_1   242.0   889.0  1131.0
        2081.0  1231.0  3312.0

F1 score = 0.7528



Alternative threshold = 0.437:

        Pred_0  Pred_1        
Real_0  1895.0   286.0  2181.0
Real_1   284.0   847.0  1131.0
        2179.0  1133.0  3312.0

F1 score = 0.7482


In [20]:
print("OLS:")
if type(ols) == LinearRegression:
    y_test_reduced_pred_ols = list(ols.predict(X_test_reduced))
else:
    y_test_reduced_pred_ols = list(ols.predict(add_constant(X_test_reduced, prepend=True)))
print("AUC = {}".format(round(roc_auc_score(y_test_reduced, y_test_reduced_pred_ols),4)))

OLS:
AUC = 0.8773


In [21]:
pickle.dump(list(y_test_reduced_pred_ols), open(results_dir_new1 + "y_test_pred_ols.p", "wb"))

In [64]:
create_pr(train_df, y_test_reduced, y_test_reduced_pred_ols, best_threshold_3, results_dir_new1, "OLS")

F1 threshold = 0.425:

        Pred_0  Pred_1        
Real_0  1838.0   343.0  2181.0
Real_1   257.0   874.0  1131.0
        2095.0  1217.0  3312.0

F1 score = 0.7445



Alternative threshold = 0.455:

        Pred_0  Pred_1        
Real_0  1877.0   304.0  2181.0
Real_1   302.0   829.0  1131.0
        2179.0  1133.0  3312.0

F1 score = 0.7323


#### (6) Comparison with the base model (trained on the full training data), using the reduced validation sample

In [22]:
model_dir = "C:\\Users\\ys8mz\\Box Sync\\Predictive Models of College Completion (VCCS)\\evaluation_results\\truncated_predictors\\"
base_lr = pickle.load(open(model_dir + "lr.p", "rb"))
base_ols = pickle.load(open(model_dir + "ols.p", "rb"))

In [23]:
train_df_old = df[df.valid == 0]
test_df_old = df[df.valid == 1]
_, test_df_reduced = train_test_split(test_df_old, test_size=0.1, stratify=test_df_old['grad_6years'].astype(str)+"_"+test_df_old['available_sum'].astype(str), random_state=54321)

In [24]:
_, test_df_reduced_new = impute(train_df_old, test_df_reduced)
X_test_reduced = test_df_reduced_new.loc[:,predictors]
y_test_reduced = test_df_reduced_new.grad_6years

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


In [25]:
results_dir_new2 = "C:\\Users\\ys8mz\\Box Sync\\Predictive Models of College Completion (VCCS)\\evaluation_results\\smaller_training_sample\\test1\\comparison_2\\"

In [26]:
print("Logistic Regression:")
if type(base_lr) == LogisticRegression:
    y_test_reduced_pred_lr = list(base_lr.predict_proba(X_test_reduced)[:,1])
else:
    y_test_reduced_pred_lr = list(base_lr.predict(add_constant(X_test_reduced, prepend=True)))
print("AUC = {}".format(round(roc_auc_score(y_test_reduced, y_test_reduced_pred_lr),4)))

Logistic Regression:
AUC = 0.8861


In [27]:
pickle.dump(list(y_test_reduced_pred_lr), open(results_dir_new2 + "y_test_pred_lr.p", "wb"))

In [76]:
create_pr(train_df_old, y_test_reduced, y_test_reduced_pred_lr, 0.43344163040148626, results_dir_new2, "LR1")

F1 threshold = 0.433:

        Pred_0  Pred_1        
Real_0  1896.0   285.0  2181.0
Real_1   274.0   857.0  1131.0
        2170.0  1142.0  3312.0

F1 score = 0.7541



Alternative threshold = 0.438:

        Pred_0  Pred_1        
Real_0  1901.0   280.0  2181.0
Real_1   278.0   853.0  1131.0
        2179.0  1133.0  3312.0

F1 score = 0.7535


In [28]:
print("OLS:")
if type(ols) == LinearRegression:
    y_test_reduced_pred_ols = list(base_ols.predict(X_test_reduced))
else:
    y_test_reduced_pred_ols = list(base_ols.predict(add_constant(X_test_reduced, prepend=True)))
print("AUC = {}".format(round(roc_auc_score(y_test_reduced, y_test_reduced_pred_ols),4)))

OLS:
AUC = 0.8807


In [29]:
pickle.dump(list(y_test_reduced_pred_ols), open(results_dir_new2 + "y_test_pred_ols.p", "wb"))

In [78]:
create_pr(train_df_old, y_test_reduced, y_test_reduced_pred_ols, 0.46918787407941415, results_dir_new2, "OLS")

F1 threshold = 0.469:

        Pred_0  Pred_1        
Real_0  1911.0   270.0  2181.0
Real_1   318.0   813.0  1131.0
        2229.0  1083.0  3312.0

F1 score = 0.7344



Alternative threshold = 0.451:

        Pred_0  Pred_1        
Real_0  1890.0   291.0  2181.0
Real_1   289.0   842.0  1131.0
        2179.0  1133.0  3312.0

F1 score = 0.7438
