In [None]:
import pandas as pd
import pickle
import seaborn as sns
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn import tree
from matplotlib import pyplot as plt
from sklearn.model_selection import cross_val_score, KFold, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    log_loss,
    roc_auc_score,
    recall_score,
    precision_score,
    accuracy_score,
    plot_roc_curve,
    plot_confusion_matrix,
    roc_curve,
    confusion_matrix,
)
import itertools
from tensorflow.keras.initializers import Constant, TruncatedNormal
from tensorflow.keras.layers import Activation, Dense
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier
from sklearn.preprocessing import LabelEncoder
from numpy import mean, absolute

# Oversampling and under sampling
from imblearn.over_sampling import RandomOverSampler, SMOTE
from imblearn.under_sampling import RandomUnderSampler, NearMiss
from collections import Counter

Learning Bias Mitigation: Gender <> DTE

In [None]:
## sentence counts n used by the temporal models: 2, 3, ..., 60
n = [*range(2, 61)]

In [None]:
## load data, balance it by gender and save one matrix per sentence count
# For every n the full matrix is read, the two Sex__m classes are balanced
# with SMOTE, and the result is written to gender_allsessions/.
for i in n:
    path = (
        "../../02_dropout_prediction/01_keep_it_up/matrices_allsessions/matrix"
        + str(i)
        + ".pkl"
    )
    # NOTE: unpickling can execute arbitrary code - only load trusted files.
    # The context manager closes the file even when unpickling raises.
    with open(path, "rb") as infile:
        df = pickle.load(infile)
    df = df.reset_index(level=0)

    # rows with Sex__w == 1 first, followed by rows with Sex__m == 1
    df_1 = df[df.Sex__m == 1]
    df_0 = df[df.Sex__w == 1]
    df = pd.concat([df_0, df_1])

    # Oversample the minority Sex__m class with SMOTE (synthetic samples are
    # interpolated between neighbours, not random duplicates).
    # NOTE(review): every column except Sex__m is treated as a feature here,
    # so the later target column `y` is interpolated too and may become
    # fractional in synthetic rows - confirm this is intended.
    X_df = df.drop(columns=["Sex__m"])
    y_df = df.Sex__m
    smote = SMOTE(random_state=42)
    X_train_smote, y_train_smote = smote.fit_resample(X_df, y_df)
    df = X_train_smote.join(
        pd.DataFrame(list(y_train_smote.values), columns=["Sex__m"])
    )

    # save
    path = "gender_allsessions/matrix" + str(i) + ".pkl"
    df.to_pickle(path)

Models

In [None]:
## empty results frame; the model cell adds one row per evaluation run
metrics = pd.DataFrame(
    columns=(
        # which model / evaluation set / sentence count the row belongs to
        ["model", "group", "subgroup", "Length", "Sentence", "Accuracy"]
        # decision-tree hyperparameters of the run
        + ["max_depth", "min_samples_leaf", "min_samples_split"]
        # remaining scores returned by get_metrics
        + ["Precision", "Recall", "AUC", "FPR"]
    )
)

In [None]:
## columns of the matrices that are passed to the classifier as inputs
feature_cols = (
    [
        "Erstloesung",
        "Schussel",
        "Erfolg",
        "Schwierigkeit",
        "ist_Schulzeit",
        "MehrfachFalsch",
        "vorher_abgebrochen",
        "Fehler",
        "Klassenstufe",
        "Jahredabei",
    ]
    # Testposition__* columns
    + ["Testposition__pruefung", "Testposition__training", "Testposition__version"]
    # Art__* columns
    + ["Art__GK", "Art__GR", "Art__GZ", "Art__K", "Art__LB"]
    + ["UserAttribut", "OrderNumber", "steps"]
)

In [None]:
"""
Calculate accuracy, precision, recall, ROC AUC and false-positive rate from y and pred.
Return them as a tuple in that order.
"""


def get_metrics(clf, X, y, cv, pred):
    """Compute binary classification scores from true and predicted labels.

    Parameters
    ----------
    clf, X, cv
        Not used by the computation; kept so existing calls keep working.
    y
        True labels (expected to be 0/1).
    pred
        Predicted hard labels for the same samples.

    Returns
    -------
    tuple
        ``(accuracy, precision, recall, roc_auc, fpr)``. ``roc_auc`` is NaN
        when ``y`` contains a single class and ``fpr`` is NaN when ``y``
        contains no negatives, so one degenerate evaluation set does not
        abort a whole sweep.
    """
    a = accuracy_score(y, pred)
    p = precision_score(y, pred)
    r = recall_score(y, pred)

    # AUC is computed from hard predictions (not scores); it is undefined
    # when only one class is present in y.
    if np.unique(y).size > 1:
        roc_auc = roc_auc_score(y, pred)
    else:
        roc_auc = float("nan")

    # Pin the label order so the matrix is always 2x2 and the four-way unpack
    # cannot fail when a class is missing from both y and pred.
    tn, fp, fn, tp = confusion_matrix(y, pred, labels=[0, 1]).ravel()
    negatives = fp + tn
    fpr = fp / negatives if negatives else float("nan")

    return a, p, r, roc_auc, fpr

In [None]:
# Hyperparameter settings, read column-wise: entry k of each list forms one
# configuration. One parameter is varied at a time while the other two stay
# at max_depth=5, min_samples_leaf=1, min_samples_split=2.
max_depth = [1, 5, 10, 15, 20, 25, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5]
min_samples_leaf = [1, 1, 1, 1, 1, 1, 1, 5, 10, 15, 20, 25, 1, 1, 1, 1, 1, 1]
min_samples_split = [2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 5, 10, 15, 20, 25]

# (group, subgroup, matrix directory) of the per-gender evaluation sets
subgroup_sets = [
    ("gender", "boys", "matrices_forte_boys"),
    ("gender", "girls", "matrices_forte_girls"),
]


def _load_matrix(path):
    """Unpickle the matrix stored at ``path`` and return it with a reset index."""
    # NOTE: unpickling can execute arbitrary code - only load trusted files.
    with open(path, "rb") as infile:
        frame = pickle.load(infile)
    return frame.reset_index()


# get_metrics takes a cv argument but does not use it; built once for all runs
k = 5
cv = KFold(n_splits=k, random_state=None)

## build model for different parameters
# The loop variables get their own names so the three lists above are not
# overwritten and the cell can be re-run. Rows are collected in a list and
# turned into a frame once, instead of growing `metrics` row by row with
# DataFrame.append (quadratic, and removed in pandas 2.0).
records = []
for depth, leaf, split in zip(max_depth, min_samples_leaf, min_samples_split):
    for i in n:
        # gender-balanced matrix written by the data preparation cell
        # NOTE(review): y was resampled by SMOTE together with the features,
        # so astype("int") truncates any fractional synthetic targets - confirm.
        df = _load_matrix("gender_allsessions/matrix" + str(i) + ".pkl")
        X = df[feature_cols]
        y = df.y.astype("int")
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.3, random_state=1
        )

        # fit; random_state fixes the tie-breaking between equally good
        # splits so repeated runs give identical trees
        clf = DecisionTreeClassifier(
            criterion="entropy",
            max_depth=depth,
            min_samples_leaf=leaf,
            min_samples_split=split,
            random_state=1,
        )
        clf = clf.fit(X_train, y_train)
        pred = clf.predict(X_test)

        a, p, r, roc_auc, fpr = get_metrics(clf, X_test, y_test, cv, pred)
        records.append(
            {
                "model": "DTE",
                "group": "all",
                "subgroup": "all",
                "Length": len(df),
                "Sentence": i,
                "Accuracy": a,
                "max_depth": depth,
                "min_samples_leaf": leaf,
                "min_samples_split": split,
                "Precision": p,
                "Recall": r,
                "AUC": roc_auc,
                "FPR": fpr,
            }
        )

        # score the same fitted tree on each complete per-gender matrix
        for group, subgroup, matrix in subgroup_sets:
            df = _load_matrix(
                "../../02_dropout_prediction/01_keep_it_up/"
                + matrix
                + "/matrix"
                + str(i)
                + ".pkl"
            )
            X = df[feature_cols]
            y = df.y.astype("int")
            pred = clf.predict(X)

            a, p, r, roc_auc, fpr = get_metrics(clf, X, y, cv, pred)
            records.append(
                {
                    "model": "DTE",
                    "group": group,
                    "subgroup": subgroup,
                    "Length": len(df),
                    "Sentence": i,
                    "Accuracy": a,
                    "max_depth": depth,
                    "min_samples_leaf": leaf,
                    "min_samples_split": split,
                    "Precision": p,
                    "Recall": r,
                    "AUC": roc_auc,
                    "FPR": fpr,
                }
            )

# Keep any rows that were already in `metrics`; on a fresh run it is empty
# and the new frame simply takes its column layout.
new_rows = pd.DataFrame(records, columns=metrics.columns)
metrics = (
    new_rows if metrics.empty else pd.concat([metrics, new_rows], ignore_index=True)
)

Evaluate

In [None]:
## construct dfs from metric df
# Select the per-gender evaluation rows. The grouping key has to be the
# `group` column of `metrics` itself; `df` is only the last feature matrix
# loaded by the model cell and carries no such column.
grouped = metrics.groupby(metrics.group)
df_gender = grouped.get_group("gender")

# one row per (hyperparameter setting, Sentence), one column per
# (score, subgroup)
df_gender = df_gender.drop(columns=["group", "Accuracy"])
df_gender = pd.pivot_table(
    df_gender,
    values=["Precision", "Recall", "AUC", "FPR"],
    index=["max_depth", "min_samples_leaf", "min_samples_split", "Sentence"],
    columns=["subgroup"],
)

# Gap between the two subgroups for each score. The sign convention differs
# per measure (girls - boys for PP and SA, boys - girls for EO and PE).
# NOTE(review): PP/EO/SA/PE presumably abbreviate fairness measures - confirm
# the intended names and sign conventions.
df_gender["PP"] = df_gender.Precision.girls - df_gender.Precision.boys
df_gender["EO"] = df_gender.Recall.boys - df_gender.Recall.girls
df_gender["SA"] = df_gender.AUC.girls - df_gender.AUC.boys
df_gender["PE"] = df_gender.FPR.boys - df_gender.FPR.girls

# keep only the gap columns and flatten the (score, subgroup) header
df_gender = df_gender.drop(columns=["AUC", "Precision", "Recall", "FPR"])
df_gender.columns = df_gender.columns.droplevel(1)

# reshape: rows = Sentence, columns = (measure, max_depth, min_samples_leaf,
# min_samples_split)
df_gender = pd.pivot_table(
    df_gender,
    values=["PP", "EO", "SA", "PE"],
    index=["Sentence"],
    columns=[
        "max_depth",
        "min_samples_leaf",
        "min_samples_split",
    ],
)

In [None]:
from itertools import product

## create mean of results per sentence range and map to data frame
met = ["EO", "PE", "PP", "SA"]
# (first Sentence, last Sentence + 1, label); the last range ends at 61 so
# that Sentence 60 is part of "50-60"
index_ranges = [
    (2, 10, "02-9"),
    (10, 20, "10-19"),
    (20, 30, "20-29"),
    (30, 40, "30-39"),
    (40, 50, "40-49"),
    (50, 61, "50-60"),
]
param_ranges = {
    "max_depth": [1, 5, 10, 15, 20, 25, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5],
    "min_samples_leaf": [1, 1, 1, 1, 1, 1, 1, 5, 10, 15, 20, 25, 1, 1, 1, 1, 1, 1],
    "min_samples_split": [2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 5, 10, 15, 20, 25],
}
# The lists are read column-wise (zip), not as a Cartesian product: only
# these configurations were trained. dict.fromkeys removes the repeated
# baseline configuration while keeping the original order.
param_configs = list(dict.fromkeys(zip(*param_ranges.values())))

data_list = []
for x, (i_start, i_end, i_range), (p1, p2, p3) in product(
    met, index_ranges, param_configs
):
    indices = np.arange(i_start, i_end)
    # mean of this measure for THIS configuration over the sentence range
    column = df_gender[(x, p1, p2, p3)]
    val = np.mean(column.loc[indices].to_numpy())
    data = {
        # scalar, not a one-element list: the column is used as a pivot key
        # later and must be hashable
        "Metrik": x,
        "Model": "DTE",
        "Range": i_range,
        "Val": val,
        "max_depth": p1,
        "min_samples_leaf": p2,
        "min_samples_split": p3,
    }
    data_list.append(data)

fertig = pd.DataFrame(data_list)

In [None]:
"""
Functions to format the results table.
Two thresholds are highlighted: values beyond |0.02| in orange and values beyond |0.05| in red.
All negative values are formatted in bold.
"""


def threshold001(v, props=""):
    """Return ``props`` if ``v`` lies outside [-0.02, 0.02], otherwise ``None``.

    Despite the name, the cut-off used is 0.02.
    """
    if v < -0.02 or v > 0.02:
        return props
    return None


def threshold005(v, props=""):
    """Return ``props`` if ``v`` lies outside [-0.05, 0.05], otherwise ``None``."""
    if v < -0.05 or v > 0.05:
        return props
    return None


def negativeValue(v, props=""):
    """Return ``props`` for strictly negative ``v``, otherwise ``None``."""
    if v < 0:
        return props
    return None


def showTable(df):
    """Return a pandas Styler for ``df`` with the result highlighting applied.

    Base style: black, right-aligned text on a white background. On top of
    that, cells beyond the 0.02 cut-off are coloured orange, cells beyond the
    0.05 cut-off red, and negative cells are set in bold. The rules are
    applied in this order, so red overrides orange.
    """
    styled = (
        # "text-align" is the CSS property for horizontal alignment; a plain
        # "align" declaration is not valid CSS and is ignored by browsers.
        df.style.set_properties(**{"color": "black", "text-align": "right"})
        .set_properties(**{"background-color": "white"})
        .applymap(threshold001, props="color:orange;")
        .applymap(threshold005, props="color:red;")
        .applymap(negativeValue, props="font-weight:bold;")
    )
    return styled

In [None]:
## show results table
# rows: hyperparameter setting and sentence range; columns: measure and model
mean_table = fertig.pivot_table(
    values=["Val"],
    index=["max_depth", "min_samples_leaf", "min_samples_split", "Range"],
    columns=["Metrik", "Model"],
)
showTable(mean_table)

In [None]:
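## optional: plot Accuracy over Sentence for the "all" rows of a single hyperparameter setting
## (kept commented out; note that uncommenting it rebinds `n`, which the cells above use as the sentence-count list)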
# grouped = metrics.groupby(metrics.group)
# df_all = grouped.get_group("all")
# modell = df_all.groupby(df_all.max_depth)
# five = modell.get_group(1)
# n = five.groupby(five.min_samples_leaf)
# n = n.get_group(1)
# f = n.groupby(n.min_samples_split)
# f = f.get_group(2)
# ax = sns.lineplot(data=f, x='Sentence', y='Accuracy', hue='model')