# Metric comparison: kNN and LinearSVC on the speed-dating data 😎

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, accuracy_score, balanced_accuracy_score

In [2]:
def get_knn_scores(X_train, y_train, X_test, y_test, k_range, metric):
    """Score distance-weighted kNN classifiers for every odd ``k`` in ``k_range``.

    Even values of ``k`` are skipped. For each remaining ``k`` a
    ``KNeighborsClassifier(n_neighbors=k, weights='distance', metric=metric)``
    is fitted on the training split and evaluated on the test split. All four
    scorers are called with their default arguments.

    Parameters
    ----------
    X_train, y_train
        Training features and labels passed to ``fit``.
    X_test, y_test
        Held-out features and labels used for scoring.
    k_range
        Iterable of candidate neighbour counts.
    metric
        Distance metric forwarded to ``KNeighborsClassifier``.

    Returns
    -------
    tuple of five lists
        ``(precision, recall, accuracy, balanced_accuracy, ks)``. The four
        score lists are index-aligned with ``ks``, which holds the values of
        ``k`` that were actually evaluated.
    """
    precisions, recalls, accuracies, balanced_accuracies, evaluated_ks = [], [], [], [], []
    scoreboard = (
        (precisions, precision_score),
        (recalls, recall_score),
        (accuracies, accuracy_score),
        (balanced_accuracies, balanced_accuracy_score),
    )

    for k in k_range:
        # Only odd neighbour counts are evaluated.
        if k % 2 == 0:
            continue

        model = KNeighborsClassifier(n_neighbors=k, weights='distance', metric=metric).fit(X_train, y_train)
        predicted = model.predict(X_test)

        for bucket, scorer in scoreboard:
            bucket.append(scorer(y_test, predicted))
        evaluated_ks.append(k)

    return precisions, recalls, accuracies, balanced_accuracies, evaluated_ks


def get_svc_scores(X_train, y_train, X_test, y_test, C_range):
    """Score a ``LinearSVC`` on the test split for every ``C`` in ``C_range``.

    Each model is built as ``LinearSVC(random_state=42, C=C, dual=False)``,
    fitted on the training split and used to predict the test split. All four
    scorers are called with their default arguments.

    Parameters
    ----------
    X_train, y_train
        Training features and labels passed to ``fit``.
    X_test, y_test
        Held-out features and labels used for scoring.
    C_range
        Iterable of regularisation values to try.

    Returns
    -------
    tuple of five lists
        ``(precision, recall, accuracy, balanced_accuracy, Cs)``. The four
        score lists are index-aligned with ``Cs``, the values of ``C`` tried.
    """
    scorers = (precision_score, recall_score, accuracy_score, balanced_accuracy_score)
    # One result column per scorer, in the same order as `scorers`.
    columns = [[] for _ in scorers]
    tried_Cs = []

    for C in C_range:
        predicted = LinearSVC(random_state=42, C=C, dual=False).fit(X_train, y_train).predict(X_test)
        for column, scorer in zip(columns, scorers):
            column.append(scorer(y_test, predicted))
        tried_Cs.append(C)

    return (*columns, tried_Cs)

In [3]:
# decide which dataset and scaling
# DATASET selects which speed-dating CSV is loaded ('num' or 'cat').
DATASET: str = 'num'

# SCALING toggles min-max scaling of the features before the models are fitted.
SCALING: bool = True

In [4]:
# load selected dataset
# Each supported DATASET value maps to its CSV explicitly, so a typo fails
# loudly instead of silently falling back to the numeric file.
DATASET_PATHS = {
    'cat': "../data/speeddating/speeddating_cat.csv",
    'num': "../data/speeddating/speeddating_num.csv",
}
if DATASET not in DATASET_PATHS:
    raise ValueError(
        f"unknown DATASET {DATASET!r}; expected one of {sorted(DATASET_PATHS)}"
    )
df = pd.read_csv(DATASET_PATHS[DATASET])

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
train, test = train_test_split(df, test_size=0.2, random_state=44)

# 'match' is the target column; every other column is used as a feature.
X_train = train.drop(columns=['match'])
y_train = train['match']
X_test = test.drop(columns=['match'])
y_test = test['match']

In [5]:
# Min-max scale the features when SCALING is on. The scaler is fitted on the
# training split only and then applied to both splits.
if SCALING:
    scaler = MinMaxScaler().fit(X_train)
    X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

In [6]:
# Build the text fragments for plot titles and output filenames.
# s* names are human-readable (plot titles); d* names are filename suffixes.
sscaled, dscaled = ('min-max-scaled', '_sc') if SCALING else ('unscaled', '')
sdata, ddata = ('numeric', '_num') if DATASET == 'num' else ('categorical', '_cat')

## kNN

In [None]:
 # manhattan | euclidean | chebyshev
metric = 'chebyshev'
# Candidate neighbour counts 1..28; get_knn_scores only evaluates the odd ones.
k_range = np.arange(1, 29)

# y receives the k values that were evaluated (the x-axis of the kNN plot).
P, R, A, BA, y = get_knn_scores(
    X_train, y_train, X_test, y_test, k_range=k_range, metric=metric
)

In [None]:
# Draw the four test-set scores against k on a single axes.
fig, ax = plt.subplots()
for scores, label in zip((P, R, A, BA), ('prc', 'rec', 'acc', 'bac')):
    ax.plot(y, scores, label=label)
ax.legend(loc='lower right')
ax.set_title(f'{sscaled}, {sdata} data, {metric} distance')
ax.set_xlabel('k')
# plt.savefig(f'knn{ddata}{dscaled}_{metric}.png')
plt.show()

## SVC

In [None]:
# 50 log-spaced regularisation strengths between 1e-4 and 1e-2.
C_range = np.logspace(-4, -2, num=50)

# y receives the C values that were tried (the x-axis of the SVC plot).
P, R, A, BA, y = get_svc_scores(
    X_train, y_train, X_test, y_test, C_range=C_range
)

In [None]:
# Draw the four test-set scores against C on a single, log-scaled axes.
fig, ax = plt.subplots()
for scores, label in zip((P, R, A, BA), ('prc', 'rec', 'acc', 'bac')):
    ax.plot(y, scores, label=label)
ax.legend(loc='lower right')
ax.set_title(f'{sscaled}, {sdata} data')
ax.set_xlabel('C')
ax.set_xscale('log')
# plt.savefig(f'svc{ddata}{dscaled}.png')
plt.show()