In [None]:
import pandas as pd
import pickle
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import KFold , cross_val_score
from sklearn import metrics 
from matplotlib import pyplot as plt
from numpy import mean, absolute
import openpyxl
from sklearn.linear_model import LogisticRegression
from matplotlib import pyplot
from tensorflow.keras.models import load_model
from sklearn.metrics import log_loss, confusion_matrix, roc_auc_score, recall_score, precision_score, average_precision_score, f1_score, classification_report, accuracy_score, plot_roc_curve, plot_precision_recall_curve, plot_confusion_matrix

In [None]:
# Import feature list
# The column names of the pickled sample set define which columns the model
# cells keep from each subgroup frame.
# NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
with open('../03_prediction_solution_probability/01_data/FINALsmallSampleSet_3months_without_duplicates.pkl','rb') as infile:
    # The context manager closes the file even if unpickling raises.
    import_file = pickle.load(infile)
df = import_file
feature_cols = list(df.columns)

In [None]:
def get_metrics(clf,X,y):
    """Predict y and calculate all scores necessary for the fairness evaluation.

    Parameters
    ----------
    clf : fitted classifier exposing ``predict`` (and optionally
        ``predict_proba`` or ``decision_function``).
    X : feature matrix passed unchanged to the classifier.
    y : true binary labels.

    Returns
    -------
    tuple
        ``(accuracy, precision, recall, roc_auc, fpr)`` in that order.
    """
    pred = clf.predict(X)

    # ROC AUC is a ranking metric: feed it continuous scores when the
    # classifier provides them, as get_dn_metrics does for the neural network.
    # Hard 0/1 predictions collapse the ROC curve to a single point.
    # NOTE(review): this changes AUC values relative to runs that scored the
    # hard predictions.
    if hasattr(clf, 'predict_proba'):
        # Column 1 holds the probability of the greater class label, which
        # roc_auc_score treats as the positive class in the binary case.
        scores = clf.predict_proba(X)[:, 1]
    elif hasattr(clf, 'decision_function'):
        scores = clf.decision_function(X)
    else:
        scores = pred

    a = accuracy_score(y,pred)
    p = precision_score(y,pred)
    r = recall_score(y,pred)
    roc_auc = roc_auc_score(y,scores)

    # Only the false-positive and true-negative counts are needed for the FPR.
    tn, fp, _fn, _tp = confusion_matrix(y, pred).ravel()
    negatives = fp + tn
    # Without any actual negatives the false-positive rate is undefined.
    fpr = fp/negatives if negatives else float('nan')

    return a,p,r,roc_auc,fpr

In [None]:
## list subgroups
# Empty results table; the model cells below add one row per subgroup.
fair_metrics = pd.DataFrame(
    columns=['model', 'group', 'subgroup', 'Accuracy', 'Precision', 'Recall', 'AUC', 'FPR']
)
# (group, subgroup) pairs in evaluation order, unzipped into two parallel lists.
group, subgroup = map(list, zip(
    ('abiEltern', 'abi'),
    ('abiEltern', 'keinAbi'),
    ('gender', 'boys'),
    ('gender', 'girls'),
    ('erstsprache', 'deutsch'),
    ('erstsprache', 'migration'),
    ('buecher', 'buch0'),
    ('buecher', 'buch1'),
))
# Pickle file stem for each subgroup: 'double_df_<subgroup>'.
matrice = ['double_df_' + name for name in subgroup]

Decision Tree Classifier

In [None]:
# load model
# NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
with open('../03_prediction_solution_probability/02_decisionTree/DecisionTreemodel_3months.pkl', 'rb') as model_file:
    DTE_model = pickle.load(model_file)

# (group, subgroup, pickle stem) for every evaluated slice. The loop below uses
# its own variable names so that the module-level `group`, `subgroup` and
# `matrice` lists are not rebound to strings, which would make every later
# model cell iterate over single characters.
subgroup_slices = [
    ('abiEltern', 'abi', 'double_df_abi'),
    ('abiEltern', 'keinAbi', 'double_df_keinAbi'),
    ('gender', 'boys', 'double_df_boys'),
    ('gender', 'girls', 'double_df_girls'),
    ('erstsprache', 'deutsch', 'double_df_deutsch'),
    ('erstsprache', 'migration', 'double_df_migration'),
    ('buecher', 'buch0', 'double_df_buch0'),
    ('buecher', 'buch1', 'double_df_buch1'),
]

model_rows = []
for (slice_group, slice_subgroup, slice_stem) in subgroup_slices:
    print(slice_stem)
    path = slice_stem+'.pkl'
    with open(path,'rb') as infile:
        df = pickle.load(infile)

    # Keep only the columns that are part of the feature list.
    dataset = df[df.columns[df.columns.isin(feature_cols)]]
    y = dataset['Erfolg']
    X = dataset.drop(columns=['Erfolg'])
    a,p,r,roc_auc,fpr = get_metrics(DTE_model,X,y)
    model_rows.append({'model':'DTE','group':slice_group,'subgroup':slice_subgroup,'Accuracy':a,'Precision': p, 'Recall':r, 'AUC':roc_auc, 'FPR':fpr})

# Drop rows from a previous run of this cell so re-running does not duplicate them.
fair_metrics = fair_metrics[fair_metrics['model'] != 'DTE']
# DataFrame.append was removed in pandas 2.0; build the frame once and concatenate.
model_frame = pd.DataFrame(model_rows, columns=fair_metrics.columns)
if fair_metrics.empty:
    fair_metrics = model_frame
else:
    fair_metrics = pd.concat([fair_metrics, model_frame], ignore_index=True)

# fair_metrics is shared across the model cells, so the exported files contain
# every row collected so far.
fair_metrics.to_excel('dte_metrics.xlsx')
fair_metrics.to_pickle('dte_metrics.pkl')

Logistic Regression

In [None]:
# load model
# NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
with open('../03_prediction_solution_probability/03_logisticRegression/Logregmodel_3months.pkl', 'rb') as model_file:
    logreg_model = pickle.load(model_file)

# (group, subgroup, pickle stem) for every evaluated slice. The loop below uses
# its own variable names so that the module-level `group`, `subgroup` and
# `matrice` lists are not rebound to strings, which would make every later
# model cell iterate over single characters.
subgroup_slices = [
    ('abiEltern', 'abi', 'double_df_abi'),
    ('abiEltern', 'keinAbi', 'double_df_keinAbi'),
    ('gender', 'boys', 'double_df_boys'),
    ('gender', 'girls', 'double_df_girls'),
    ('erstsprache', 'deutsch', 'double_df_deutsch'),
    ('erstsprache', 'migration', 'double_df_migration'),
    ('buecher', 'buch0', 'double_df_buch0'),
    ('buecher', 'buch1', 'double_df_buch1'),
]

model_rows = []
for (slice_group, slice_subgroup, slice_stem) in subgroup_slices:
    path = slice_stem+'.pkl'
    with open(path,'rb') as infile:
        df = pickle.load(infile)

    # Keep only the columns that are part of the feature list.
    dataset = df[df.columns[df.columns.isin(feature_cols)]]
    y = dataset['Erfolg']
    X = dataset.drop(columns=['Erfolg'])

    a,p,r,roc_auc,fpr = get_metrics(logreg_model,X,y)
    model_rows.append({'model':'LogReg','group':slice_group,'subgroup':slice_subgroup,'Accuracy':a,'Precision': p, 'Recall':r, 'AUC':roc_auc, 'FPR':fpr})

# Drop rows from a previous run of this cell so re-running does not duplicate them.
fair_metrics = fair_metrics[fair_metrics['model'] != 'LogReg']
# DataFrame.append was removed in pandas 2.0; build the frame once and concatenate.
model_frame = pd.DataFrame(model_rows, columns=fair_metrics.columns)
if fair_metrics.empty:
    fair_metrics = model_frame
else:
    fair_metrics = pd.concat([fair_metrics, model_frame], ignore_index=True)

# fair_metrics is shared across the model cells, so the exported files contain
# every row collected so far.
fair_metrics.to_excel('log_metrics.xlsx')
fair_metrics.to_pickle('log_metrics.pkl')

SVM

In [None]:
# load model
# NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
with open('../03_prediction_solution_probability/04_svm/SVMmodel_3months.pkl', 'rb') as model_file:
    svm_model = pickle.load(model_file)

# (group, subgroup, pickle stem) for every evaluated slice. The loop below uses
# its own variable names so that the module-level `group`, `subgroup` and
# `matrice` lists are not rebound to strings, which would make every later
# model cell iterate over single characters.
subgroup_slices = [
    ('abiEltern', 'abi', 'double_df_abi'),
    ('abiEltern', 'keinAbi', 'double_df_keinAbi'),
    ('gender', 'boys', 'double_df_boys'),
    ('gender', 'girls', 'double_df_girls'),
    ('erstsprache', 'deutsch', 'double_df_deutsch'),
    ('erstsprache', 'migration', 'double_df_migration'),
    ('buecher', 'buch0', 'double_df_buch0'),
    ('buecher', 'buch1', 'double_df_buch1'),
]

model_rows = []
for (slice_group, slice_subgroup, slice_stem) in subgroup_slices:
    path = slice_stem+'.pkl'
    with open(path,'rb') as infile:
        df = pickle.load(infile)

    # Keep only the columns that are part of the feature list.
    dataset = df[df.columns[df.columns.isin(feature_cols)]]
    y = dataset['Erfolg']
    X = dataset.drop(columns=['Erfolg'])

    a,p,r,roc_auc,fpr = get_metrics(svm_model,X,y)
    model_rows.append({'model':'SVM','group':slice_group,'subgroup':slice_subgroup,'Accuracy':a,'Precision': p, 'Recall':r, 'AUC':roc_auc, 'FPR':fpr})

# Drop rows from a previous run of this cell so re-running does not duplicate them.
fair_metrics = fair_metrics[fair_metrics['model'] != 'SVM']
# DataFrame.append was removed in pandas 2.0; build the frame once and concatenate.
model_frame = pd.DataFrame(model_rows, columns=fair_metrics.columns)
if fair_metrics.empty:
    fair_metrics = model_frame
else:
    fair_metrics = pd.concat([fair_metrics, model_frame], ignore_index=True)

# fair_metrics is shared across the model cells, so the exported files contain
# every row collected so far.
fair_metrics.to_excel('svm_metrics.xlsx')
fair_metrics.to_pickle('svm_metrics.pkl')

Neural Network

In [None]:
def get_dn_metrics(model, X,y):
    """Predict y with the neural network and return the fairness metrics.

    Parameters
    ----------
    model : model whose ``predict(X, verbose=0)`` returns a 2-D array; the
        first column is used as the score of the positive class.
    X : feature matrix; converted to a float32 array before prediction.
    y : true binary labels.

    Returns
    -------
    tuple
        ``(accuracy, precision, recall, roc_auc, fpr)`` in that order. AUC is
        computed from the scores, the other metrics from labels thresholded
        at 0.5.
    """
    X = np.asarray(X).astype('float32')
    # Run the model once and derive the class labels from the same scores,
    # instead of calling predict a second time; reduce to a 1d array.
    yhat_probs = model.predict(X, verbose=0)[:, 0]
    yhat_classes = (yhat_probs > 0.5).astype("int32")

    a = accuracy_score(y, yhat_classes)
    p = precision_score(y, yhat_classes)
    r = recall_score(y, yhat_classes)
    roc_auc = roc_auc_score(y, yhat_probs)

    # Only the false-positive and true-negative counts are needed for the FPR.
    tn, fp, _fn, _tp = confusion_matrix(y, yhat_classes).ravel()
    negatives = fp + tn
    # Without any actual negatives the false-positive rate is undefined.
    fpr = fp/negatives if negatives else float('nan')

    return a,p,r,roc_auc,fpr


# load model
nn_model = load_model('../03_prediction_solution_probability/05_nn/nn_3months/')

# (group, subgroup, pickle stem) for every evaluated slice. The loop below uses
# its own variable names so that the module-level `group`, `subgroup` and
# `matrice` lists are not rebound to strings, which would make the loop
# iterate over single characters when run after another model cell.
subgroup_slices = [
    ('abiEltern', 'abi', 'double_df_abi'),
    ('abiEltern', 'keinAbi', 'double_df_keinAbi'),
    ('gender', 'boys', 'double_df_boys'),
    ('gender', 'girls', 'double_df_girls'),
    ('erstsprache', 'deutsch', 'double_df_deutsch'),
    ('erstsprache', 'migration', 'double_df_migration'),
    ('buecher', 'buch0', 'double_df_buch0'),
    ('buecher', 'buch1', 'double_df_buch1'),
]

model_rows = []
for (slice_group, slice_subgroup, slice_stem) in subgroup_slices:
    path = slice_stem+'.pkl'
    # NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
    with open(path,'rb') as infile:
        df = pickle.load(infile)

    # Keep only the columns that are part of the feature list.
    dataset = df[df.columns[df.columns.isin(feature_cols)]]
    y = dataset['Erfolg']
    X = dataset.drop(columns=['Erfolg'])

    a,p,r,roc_auc,fpr = get_dn_metrics(nn_model,X,y)
    model_rows.append({'model':'NN','group':slice_group,'subgroup':slice_subgroup,'Accuracy':a,'Precision': p, 'Recall':r, 'AUC':roc_auc, 'FPR':fpr})

# Drop rows from a previous run of this cell so re-running does not duplicate them.
fair_metrics = fair_metrics[fair_metrics['model'] != 'NN']
# DataFrame.append was removed in pandas 2.0; build the frame once and concatenate.
model_frame = pd.DataFrame(model_rows, columns=fair_metrics.columns)
if fair_metrics.empty:
    fair_metrics = model_frame
else:
    fair_metrics = pd.concat([fair_metrics, model_frame], ignore_index=True)

# fair_metrics is shared across the model cells, so the exported files contain
# every row collected so far.
fair_metrics.to_excel('nn_metrics.xlsx')
fair_metrics.to_pickle('nn_metrics.pkl')