In [9]:
import pandas as pd
import pickle
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score, KFold
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    roc_auc_score,
    recall_score,
    precision_score,
    accuracy_score,
    confusion_matrix,
)
from tensorflow.keras.layers import Dense
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from numpy import mean, absolute
import warnings

# Ignore the warning message
warnings.filterwarnings("ignore", category=FutureWarning)

In [3]:
# Empty results table: one row per (model, group, subgroup, t) is added by the
# evaluation cells further down.
metrics = pd.DataFrame(
    columns=[
        # identifying columns
        "model", "group", "subgroup", "Sentence",
        # score columns
        "Accuracy", "Precision", "Recall", "AUC", "FPR",
    ]
)

# Values of t covered by the temporal models (2..60 inclusive); each value
# selects a `matrix<t>.pkl` file and is stored in the "Sentence" column.
n = [*range(2, 61)]

# Columns of the pickled matrices that are used as model inputs (24 in total).
feature_cols = [
    "Erstloesung", "Schussel", "Erfolg", "Schwierigkeit",
    "ist_Schulzeit", "MehrfachFalsch", "vorher_abgebrochen", "Fehler",
    "Klassenstufe", "Jahredabei", "AnzahlAufgaben",
    # columns sharing the "Sex__" prefix
    "Sex__m", "Sex__w",
    # columns sharing the "Testposition__" prefix
    "Testposition__pruefung", "Testposition__training", "Testposition__version",
    # columns sharing the "Art__" prefix
    "Art__GK", "Art__GR", "Art__GZ", "Art__K", "Art__LB",
    "UserAttribut", "OrderNumber", "steps",
]

In [7]:
def get_metrics(clf, X, y, cv):
    """
    Calculate performance metrics for a scikit-learn classifier.

    Accuracy, precision, recall and ROC AUC are each obtained with
    ``cross_val_score`` on ``(X, y)`` using the splitter ``cv`` and averaged
    over the folds. The false positive rate is computed from the predictions
    of ``clf`` itself on ``X``.

    NOTE(review): ``cross_val_score`` fits clones of ``clf`` on the folds of
    ``X``, so the four cross-validated scores describe models trained on the
    data passed in here, not the already fitted ``clf``. Only the FPR uses the
    fitted estimator. Confirm that this mix is intended.

    Parameters
    ----------
    clf : fitted classifier exposing ``predict`` (and usable by
        ``cross_val_score``).
    X : feature matrix.
    y : binary target; assumed to be coded as 0 / 1 (callers cast it to int).
    cv : cross-validation splitter or fold count passed to ``cross_val_score``.

    Returns
    -------
    tuple
        ``(accuracy, precision, recall, roc_auc, fpr)``. ``fpr`` is ``nan``
        when ``y`` contains no negative samples.
    """

    def _cv_mean(scoring):
        # Mean over folds of one cross-validated score.
        scores = cross_val_score(clf, X, y, scoring=scoring, cv=cv, n_jobs=-1)
        return mean(absolute(scores))

    a = _cv_mean("accuracy")
    p = _cv_mean("precision")
    r = _cv_mean("recall")
    roc_auc = _cv_mean("roc_auc")

    pred = clf.predict(X)
    # Pin the label order so the matrix is always 2x2, even when y or pred
    # happens to contain a single class.
    tn, fp, fn, tp = confusion_matrix(y, pred, labels=[0, 1]).ravel()
    negatives = fp + tn
    # The computed rate was previously overwritten with a constant 0.
    fpr = fp / negatives if negatives else float("nan")
    return a, p, r, roc_auc, fpr

Decision Tree Classifier: for each t, builds a model, evaluates it on every subgroup, and stores the resulting metrics in the `metrics` DataFrame.

In [10]:
# Demographic evaluation sets as (group, subgroup, directory) triples. Each
# directory is expected to hold one pickled matrix per t (`matrix<t>.pkl`).
# Defined once, outside the loop, because they do not depend on t.
dte_subgroup_sets = [
    ("abiEltern", "abi", "matrices_forte_abi"),
    ("abiEltern", "keinAbi", "matrices_forte_keinAbi"),
    ("gender", "boys", "matrices_forte_boys"),
    ("gender", "girls", "matrices_forte_girls"),
    ("erstsprache", "deutsch", "matrices_forte_deutsch"),
    ("erstsprache", "migration", "matrices_forte_migration"),
    ("buecher", "buch0", "matrices_forte_buecher_0"),
    ("buecher", "buch1", "matrices_forte_buecher_1"),
]

k = 5
cv = KFold(n_splits=k, random_state=None)

# Rows are collected in a plain list and turned into a DataFrame once at the
# end: DataFrame.append was removed in pandas 2.0 and copies the whole frame on
# every call.
dte_rows = []

for i in n:
    # NOTE: pickle.load can execute arbitrary code; only load trusted files.
    with open("matrices_allsessions/matrix" + str(i) + ".pkl", "rb") as infile:
        df = pickle.load(infile)
    df = df.reset_index()

    X = df[feature_cols]
    y = df.y.astype("int")
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=1
    )

    # DTE: entropy-based decision tree trained on the 70 % split
    clf = DecisionTreeClassifier(criterion="entropy", max_depth=5)
    clf = clf.fit(X_train, y_train)

    # NOTE(review): get_metrics cross-validates on the data it receives, so
    # most scores come from refitted clones rather than from `clf` itself.
    a, p, r, roc_auc, fpr = get_metrics(clf, X_test, y_test, cv)
    dte_rows.append(
        {
            "model": "DTE",
            "group": "all",
            "subgroup": "all",
            "Sentence": i,
            "Accuracy": a,
            "Precision": p,
            "Recall": r,
            "AUC": roc_auc,
            "FPR": fpr,
        }
    )

    # Evaluate on the matrix of every demographic subgroup for the same t.
    for group_name, subgroup_name, matrix_dir in dte_subgroup_sets:
        with open(matrix_dir + "/matrix" + str(i) + ".pkl", "rb") as infile:
            df = pickle.load(infile)
        df = df.reset_index()
        X = df[feature_cols]
        y = df.y.astype("int")

        a, p, r, roc_auc, fpr = get_metrics(clf, X, y, cv)
        dte_rows.append(
            {
                "model": "DTE",
                "group": group_name,
                "subgroup": subgroup_name,
                "Sentence": i,
                "Accuracy": a,
                "Precision": p,
                "Recall": r,
                "AUC": roc_auc,
                "FPR": fpr,
            }
        )

metrics = pd.concat(
    [metrics, pd.DataFrame(dte_rows, columns=metrics.columns)],
    ignore_index=True,
)

KNN

In [None]:
# Demographic evaluation sets as (group, subgroup, directory) triples. Each
# directory is expected to hold one pickled matrix per t (`matrix<t>.pkl`).
# NOTE(review): the two "buecher" directories lack the "forte_" part that the
# decision-tree cell uses ("matrices_forte_buecher_0/1") - confirm which
# directory names are correct.
knn_subgroup_sets = [
    ("abiEltern", "abi", "matrices_forte_abi"),
    ("abiEltern", "keinAbi", "matrices_forte_keinAbi"),
    ("gender", "boys", "matrices_forte_boys"),
    ("gender", "girls", "matrices_forte_girls"),
    ("erstsprache", "deutsch", "matrices_forte_deutsch"),
    ("erstsprache", "migration", "matrices_forte_migration"),
    ("buecher", "buch0", "matrices_buecher_0"),
    ("buecher", "buch1", "matrices_buecher_1"),
]

k = 5
cv = KFold(n_splits=k, random_state=None)

# Rows are collected in a plain list and turned into a DataFrame once at the
# end: DataFrame.append was removed in pandas 2.0 and copies the whole frame on
# every call.
knn_rows = []

for i in n:
    # NOTE: pickle.load can execute arbitrary code; only load trusted files.
    with open("matrices_allsessions/matrix" + str(i) + ".pkl", "rb") as infile:
        df = pickle.load(infile)
    df = df.reset_index()

    X = df[feature_cols]
    y = df.y.astype("int")
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=1
    )

    # knn: 2-nearest-neighbour classifier trained on the 70 % split
    knn = KNeighborsClassifier(n_neighbors=2)
    knn = knn.fit(X_train, y_train)

    # NOTE(review): get_metrics cross-validates on the data it receives, so
    # most scores come from refitted clones rather than from `knn` itself.
    a, p, r, roc_auc, fpr = get_metrics(knn, X_test, y_test, cv)
    knn_rows.append(
        {
            "model": "KNN",
            "group": "all",
            "subgroup": "all",
            "Sentence": i,
            "Accuracy": a,
            "Precision": p,
            "Recall": r,
            "AUC": roc_auc,
            "FPR": fpr,
        }
    )

    # Evaluate on the matrix of every demographic subgroup for the same t.
    for group_name, subgroup_name, matrix_dir in knn_subgroup_sets:
        print(subgroup_name)  # progress indicator
        with open(matrix_dir + "/matrix" + str(i) + ".pkl", "rb") as infile:
            df = pickle.load(infile)
        df = df.reset_index()
        X = df[feature_cols]
        y = df.y.astype("int")

        a, p, r, roc_auc, fpr = get_metrics(knn, X, y, cv)
        knn_rows.append(
            {
                "model": "KNN",
                "group": group_name,
                "subgroup": subgroup_name,
                "Sentence": i,
                "Accuracy": a,
                "Precision": p,
                "Recall": r,
                "AUC": roc_auc,
                "FPR": fpr,
            }
        )

metrics = pd.concat(
    [metrics, pd.DataFrame(knn_rows, columns=metrics.columns)],
    ignore_index=True,
)

Deep NN

In [None]:
def build_model(input_dim=24):
    """
    Build the (uncompiled) feed-forward network used as the deep model.

    Architecture: Dense 24 -> 48 -> 24 -> 12 with ReLU activations, followed by
    a single sigmoid unit for binary classification.

    Parameters
    ----------
    input_dim : int, default 24
        Number of input features. The default equals the number of entries in
        the notebook's ``feature_cols`` list; pass ``len(feature_cols)`` if
        that list changes.

    Returns
    -------
    Sequential
        The model, not yet compiled.
    """
    model = Sequential()
    model.add(Dense(24, input_dim=input_dim, activation="relu"))
    # Hidden layers
    for units in (48, 24, 12):
        model.add(Dense(units, activation="relu"))
    # Single sigmoid output -> probability of the positive class
    model.add(Dense(1, activation="sigmoid"))

    return model


def get_dn_metrics(model, X, y, threshold=0.5):
    """
    Calculate performance metrics for the neural network.

    The model is asked for predictions exactly once; class labels are derived
    from the predicted probabilities by thresholding.

    Parameters
    ----------
    model : fitted model whose ``predict(X, verbose=0)`` returns a 2-D array
        with the positive-class probability in column 0.
    X : feature matrix.
    y : binary target; assumed to be coded as 0 / 1.
    threshold : float, default 0.5
        Probabilities strictly greater than this value are labelled 1.

    Returns
    -------
    tuple
        ``(accuracy, precision, recall, roc_auc, fpr)``. ``fpr`` is ``nan``
        when ``y`` contains no negative samples.
    """
    # Predict once and reduce to a 1-D array; a second predict call would
    # repeat the forward pass and print a progress bar.
    yhat_probs = model.predict(X, verbose=0)[:, 0]
    yhat_classes = (yhat_probs > threshold).astype("int32")

    a = accuracy_score(y, yhat_classes)
    p = precision_score(y, yhat_classes)
    r = recall_score(y, yhat_classes)
    roc_auc = roc_auc_score(y, yhat_probs)

    # Pin the label order so the matrix is always 2x2, even when y or the
    # predictions happen to contain a single class.
    tn, fp, fn, tp = confusion_matrix(y, yhat_classes, labels=[0, 1]).ravel()
    negatives = fp + tn
    fpr = fp / negatives if negatives else float("nan")

    return a, p, r, roc_auc, fpr


# Demographic evaluation sets as (group, subgroup, directory) triples. Each
# directory is expected to hold one pickled matrix per t (`matrix<t>.pkl`).
# NOTE(review): the two "buecher" directories lack the "forte_" part that the
# decision-tree cell uses ("matrices_forte_buecher_0/1") - confirm which
# directory names are correct.
dl_subgroup_sets = [
    ("abiEltern", "abi", "matrices_forte_abi"),
    ("abiEltern", "keinAbi", "matrices_forte_keinAbi"),
    ("gender", "boys", "matrices_forte_boys"),
    ("gender", "girls", "matrices_forte_girls"),
    ("erstsprache", "deutsch", "matrices_forte_deutsch"),
    ("erstsprache", "migration", "matrices_forte_migration"),
    ("buecher", "buch0", "matrices_buecher_0"),
    ("buecher", "buch1", "matrices_buecher_1"),
]

# Rows are collected in a plain list and turned into a DataFrame once at the
# end: DataFrame.append was removed in pandas 2.0 and copies the whole frame on
# every call.
dl_rows = []

for i in n:
    # NOTE: pickle.load can execute arbitrary code; only load trusted files.
    with open("matrices_allsessions/matrix" + str(i) + ".pkl", "rb") as infile:
        df = pickle.load(infile)
    df = df.reset_index()

    X = df[feature_cols].astype(float)
    y = df.y.astype("int")
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=1
    )

    # A fresh network is built and trained for every t.
    model = build_model()
    model.compile(loss="binary_crossentropy", optimizer="Adam", metrics=["accuracy"])
    model.fit(
        x=X_train,
        y=y_train,
        epochs=10,
        batch_size=128,
        verbose=0,
        # The held-out split is only monitored here; no callback uses it.
        validation_data=(X_test, y_test),
    )

    a, p, r, roc_auc, fpr = get_dn_metrics(model, X_test, y_test)
    dl_rows.append(
        {
            "model": "DL",
            "group": "all",
            "subgroup": "all",
            "Sentence": i,
            "Accuracy": a,
            "Precision": p,
            "Recall": r,
            "AUC": roc_auc,
            "FPR": fpr,
        }
    )

    # Use the trained model to predict y on every demographic subgroup matrix.
    for group_name, subgroup_name, matrix_dir in dl_subgroup_sets:
        print(subgroup_name)  # progress indicator
        with open(matrix_dir + "/matrix" + str(i) + ".pkl", "rb") as infile:
            df = pickle.load(infile)
        df = df.reset_index()
        X = df[feature_cols].astype(float)
        y = df.y.astype("int")

        a, p, r, roc_auc, fpr = get_dn_metrics(model, X, y)
        dl_rows.append(
            {
                "model": "DL",
                "group": group_name,
                "subgroup": subgroup_name,
                "Sentence": i,
                "Accuracy": a,
                "Precision": p,
                "Recall": r,
                "AUC": roc_auc,
                "FPR": fpr,
            }
        )

metrics = pd.concat(
    [metrics, pd.DataFrame(dl_rows, columns=metrics.columns)],
    ignore_index=True,
)

In [None]:
# Persist the collected metrics twice: as a spreadsheet for manual inspection
# and as a pickle for loading back into pandas.
results_basename = "all_metrics"
metrics.to_excel(results_basename + ".xlsx")
metrics.to_pickle(results_basename + ".pkl")