In [1]:
import pandas as pd
import pickle
import seaborn as sns
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from numpy import mean
from sklearn.model_selection import train_test_split
from sklearn.inspection import permutation_importance
import warnings
import matplotlib.pyplot as plt
from sklearn import metrics
from sklearn.metrics import recall_score, precision_score, f1_score, accuracy_score

# Ignore the warning message
warnings.filterwarnings("ignore", category=FutureWarning)

In [2]:
def KNN(
    y_test,
    x_test,
    X_train,
    y_train,
    cv,
    X,
    y,
    *,
    return_cv=False,
    random_state=0,
):
    """
    Fit a 2-nearest-neighbour classifier and score it on a held-out test set.

    The classifier is trained on ``X_train``/``y_train``; accuracy, precision,
    recall, F1 and ROC-AUC are computed on ``x_test``/``y_test``; permutation
    importance is computed on ``X``/``y``.

    Parameters
    ----------
    y_test, x_test : labels and features of the held-out test set.
    X_train, y_train : features and labels used to fit the classifier.
    cv : cross-validation splitter (or fold count) handed to scikit-learn.
        Only used when ``return_cv`` is true.
    X, y : data on which permutation importance is computed.
    return_cv : keyword-only. When true, cross-validate on the training data
        and append a dict of mean fold scores to the returned tuple.
    random_state : keyword-only. Seed for the permutation shuffles so that the
        importances are repeatable between runs.

    Returns
    -------
    tuple
        ``(accuracy, precision, recall, f1, auc, probs, knn, results)`` where
        ``probs`` is the ``predict_proba`` output for ``x_test``, ``knn`` is the
        fitted classifier and ``results`` is the ``permutation_importance``
        result. With ``return_cv=True`` a ninth element is appended: a dict
        with keys ``"accuracy"``, ``"precision"``, ``"recall"`` and ``"f1"``
        holding the mean cross-validated score of each metric.

    Notes
    -----
    The precision/recall/F1 calls use scikit-learn's default settings and the
    AUC uses column 1 of the probabilities, so a two-class target is assumed
    (TODO confirm against the input matrices).
    """
    knn = KNeighborsClassifier(n_neighbors=2)
    knn.fit(X_train, y_train)

    # Predict once and reuse both outputs for every test-set metric.
    pred = knn.predict(x_test)
    probs = knn.predict_proba(x_test)

    accuracy = accuracy_score(y_test, pred)
    precision = precision_score(y_test, pred)
    recall = recall_score(y_test, pred)
    f1 = f1_score(y_test, pred)
    auc = metrics.roc_auc_score(y_test, probs[:, 1])

    # NOTE(review): importance is measured on X/y as passed in; the visible
    # caller passes the full data set, training rows included. Pass the test
    # split instead if importance should reflect generalisation only.
    # NOTE(review): the scorer is kept as in the original. For 0/1 labels the
    # mean squared error of hard predictions equals the error rate, so this
    # should rank features like accuracy would -- confirm the label coding.
    results = permutation_importance(
        knn,
        X,
        y,
        scoring="neg_mean_squared_error",
        random_state=random_state,
    )

    outputs = (accuracy, precision, recall, f1, auc, probs, knn, results)
    if not return_cv:
        # Cross-validation results are not part of the default return value,
        # so the (expensive) refits are skipped unless explicitly requested.
        return outputs

    # Local import: cross_validate scores all metrics from a single set of
    # fold fits instead of refitting once per metric.
    from sklearn.model_selection import cross_validate

    cv_metric_names = ("accuracy", "precision", "recall", "f1")
    cv_result = cross_validate(
        knn, X_train, y_train, scoring=cv_metric_names, cv=cv, n_jobs=-1
    )
    cv_scores = {
        name: float(mean(cv_result["test_" + name])) for name in cv_metric_names
    }
    return outputs + (cv_scores,)

In [None]:
# Run configuration: model input columns, matrix numbers and empty result tables.

# Columns selected from every loaded matrix, in the order the model sees them.
feature_cols = [
    "Erstloesung", "Schussel", "Erfolg", "Schwierigkeit",
    "ist_Schulzeit", "MehrfachFalsch", "vorher_abgebrochen", "Fehler",
    "Klassenstufe", "Jahredabei", "AnzahlAufgaben",
    # Sex__* columns
    "Sex__m", "Sex__w",
    # Testposition__* columns
    "Testposition__pruefung", "Testposition__training", "Testposition__version",
    # Art__* columns
    "Art__GK", "Art__GR", "Art__GZ", "Art__K", "Art__LB",
    "UserAttribut", "OrderNumber", "steps",
]

# Matrix numbers to process: 2 through 60 inclusive.
n = [*range(2, 61)]

# Empty result tables; the column lists fix the layout of the collected rows.
_metric_columns = ["Sentence", "Accuracy", "Precision", "Recall", "F1", "AUC"]
_importance_columns = ["Sentence", "Feature", "Score"]

metrics_KNN = pd.DataFrame(columns=_metric_columns)
featureImportance = pd.DataFrame(columns=_importance_columns)

In [3]:
# Loop over every matrix, fit and evaluate the KNN model, and collect the
# test-set metrics and permutation importances per matrix.

# The splitter does not depend on the matrix, so it is built once.
cv = KFold(n_splits=5, random_state=None)

# Rows are gathered in plain lists and turned into DataFrames once at the end:
# DataFrame.append no longer exists in pandas >= 2.0 and copying the frame for
# every added row is quadratic.
metric_rows = []
importance_rows = []

for i in n:
    path = "matrices_allsessions/matrix" + str(i) + ".pkl"
    # NOTE: pickle.load can execute arbitrary code; only load trusted files.
    # The with-block closes the file even if unpickling fails.
    with open(path, "rb") as infile:
        df = pickle.load(infile)
    df = df.reset_index()

    X = df[feature_cols]
    y = df.y.astype("int")
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=1
    )

    accuracy, precision, recall, f1, auc, probs, knn, results = KNN(
        y_test, X_test, X_train, y_train, cv, X, y
    )
    metric_rows.append(
        {
            "Sentence": i,
            "Accuracy": accuracy,
            "Precision": precision,
            "Recall": recall,
            "F1": f1,
            "AUC": auc,
        }
    )

    # "Feature" stores the column position within feature_cols (X is built
    # from feature_cols, so importances come back in that order).
    for feature_idx, score in enumerate(results.importances_mean):
        importance_rows.append(
            {"Sentence": i, "Feature": feature_idx, "Score": score}
        )

# Explicit column lists keep the layout stable even when no matrix was processed.
metrics_KNN = pd.DataFrame(
    metric_rows,
    columns=["Sentence", "Accuracy", "Precision", "Recall", "F1", "AUC"],
)
featureImportance = pd.DataFrame(
    importance_rows, columns=["Sentence", "Feature", "Score"]
)

metrics_KNN.to_pickle("model_metrics/metrics_KNN.pkl")

In [None]:
# Plot the permutation importance of each feature, averaged over all matrices.
featureImportance_grouped = featureImportance.groupby("Feature").agg(
    {"Score": ["mean"]}
)
mean_importance = featureImportance_grouped["Score"]["mean"]

# "Feature" holds the column position within feature_cols; translate it to the
# column name so the x axis is readable. int() also copes with positions that
# were stored as floats.
feature_names = [feature_cols[int(position)] for position in mean_importance.index]

fig, ax = plt.subplots(figsize=(20, 5))
# x and y are passed as plain vectors, so no data= frame is needed.
sns.barplot(
    x=feature_names,
    y=mean_importance.to_numpy(),
    color="#00338d",
    ax=ax,
)
ax.set(
    xlabel="Feature",
    ylabel="Mean permutation importance",
    title="KNN permutation importance (mean over all matrices)",
)
# Feature names are long; rotate them so they do not overlap.
ax.tick_params(axis="x", rotation=90)
plt.show()

In [25]:
# Draw one line chart per collected metric, each shown as its own figure.
sns.set_theme()

for metric_name in ("Accuracy", "Precision", "Recall", "F1", "AUC"):
    sns.lineplot(data=metrics_KNN, x="Sentence", y=metric_name)
    plt.show()

In [None]:
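# Optional (disabled): save the first probability column of `probs` as a sample file.
# When the notebook is run top to bottom, `probs` holds the output for the last
# matrix processed by the loop above.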
# probability_sample = probs[:, :1].tolist()
# probability_sample = pd.DataFrame(probability_sample)
# probability_sample.to_pickle('KNN55.pkl')