# K-Neighbors Classifier

In [1]:
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier

In [2]:
# Column configuration: the label column and the numeric features fed to the classifier.
TARGET_COLUMN = "Personality"
FEATURE_COLUMNS = [
    "Time_spent_Alone",
    "Social_event_attendance",
    "Going_outside",
    "Friends_circle_size",
    "Post_frequency",
]

# Load the prototype splits plus the statistics table saved alongside the training split.
# `stats` is later indexed as stats[col]["mean"] to impute missing values.
train, stats, validation = (
    pd.read_parquet(path)
    for path in (
        "../data/prototype/train.parquet",
        "../data/prototype/train_statistics.parquet",
        "../data/prototype/validation.parquet",
    )
)

In [3]:
# Default strategy for NaNs: replace each missing validation feature with the
# "mean" entry stored for that column in the statistics table.
column_means = {col: stats[col]["mean"] for col in FEATURE_COLUMNS}
for col, mean_value in column_means.items():
    validation[col] = validation[col].fillna(mean_value)

In [4]:
# Split each frame into a feature matrix and a label vector.
# The validation split plays the role of the test set in the sweep below.
# NOTE(review): no scaling is applied in the visible cells; KNN distances are
# scale-sensitive, so confirm the prototype data is already normalised upstream.
X_train, y_train = train[FEATURE_COLUMNS].values, train[TARGET_COLUMN].values
X_test, y_test = validation[FEATURE_COLUMNS].values, validation[TARGET_COLUMN].values

In [5]:
# Exhaustive sweep over KNN hyperparameters, scored by accuracy on the validation split.
# The grid is enumerated in the same order as nested loops over
# n_neighbors -> weights -> p, so row indices match the enumeration order.
search_space = [
    (k, weighting, power)
    for k in [5, 20, 50, 100, 500]
    for weighting in ["uniform", "distance"]
    for power in [1.0, 1.25, 1.5, 2.0]
]

records = []
for n_neighbors, weights, p in search_space:
    # p is the power parameter of the Minkowski metric (1 = Manhattan, 2 = Euclidean).
    model = KNeighborsClassifier(n_neighbors=n_neighbors, weights=weights, p=p)
    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    accuracy = accuracy_score(y_test, preds)
    records.append({"N": n_neighbors, "Weights": weights, "p": p, "Accuracy": accuracy})

# One row per configuration, best validation accuracy first.
results = pd.DataFrame(records).sort_values(by="Accuracy", ascending=False)
results

Unnamed: 0,N,Weights,p,Accuracy
0,5,uniform,1.0,0.97139
16,50,uniform,1.0,0.97112
8,20,uniform,1.0,0.97085
17,50,uniform,1.25,0.97085
35,500,uniform,2.0,0.97085
32,500,uniform,1.0,0.97085
24,100,uniform,1.0,0.97058
25,100,uniform,1.25,0.97058
27,100,uniform,2.0,0.97058
26,100,uniform,1.5,0.97058
