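# SVM

Fits scikit-learn `SVC` models over a small grid of kernels, decision-function shapes and `C` values on the prototype training split, then compares their accuracy on the validation split.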

In [1]:
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC

In [2]:
# Notebook-wide configuration: random seed, label column, and model input columns.
SEED = 42
TARGET_COLUMN = "Personality"
FEATURE_COLUMNS = [
    "Time_spent_Alone",
    "Social_event_attendance",
    "Going_outside",
    "Friends_circle_size",
    "Post_frequency",
]

# Prototype artifacts, read in order: the training split, the
# `train_statistics` table, and the validation split.
train, stats, validation = map(
    pd.read_parquet,
    (
        "../data/prototype/train.parquet",
        "../data/prototype/train_statistics.parquet",
        "../data/prototype/validation.parquet",
    ),
)

In [3]:
# Default strategy for NaNs: every missing validation feature value is replaced
# with that feature's "mean" entry looked up in the `stats` table.
fill_values = {col: stats[col]["mean"] for col in FEATURE_COLUMNS}
validation[FEATURE_COLUMNS] = validation[FEATURE_COLUMNS].fillna(value=fill_values)

In [4]:
# Split each frame into a feature matrix and a label vector.
# `to_numpy()` replaces `.values` so that both are always plain NumPy arrays;
# `.values` can hand back a pandas extension array for extension-dtype columns.
X_train = train[FEATURE_COLUMNS].to_numpy()
y_train = train[TARGET_COLUMN].to_numpy()
# The "test" arrays come from the validation split loaded above.
X_test = validation[FEATURE_COLUMNS].to_numpy()
y_test = validation[TARGET_COLUMN].to_numpy()

In [5]:
# Sweep a small SVC grid: fit each configuration on the training arrays and
# record its accuracy on the validation arrays.
#
# NOTE(review): per the scikit-learn SVC docs, `decision_function_shape` is
# ignored for binary classification and, with the default `break_ties=False`,
# does not change `predict`; the "ovr"/"ovo" rows are therefore expected to
# report identical accuracy. Consider dropping this axis — confirm first.
# NOTE(review): the same docs describe `random_state` as ignored unless
# `probability=True`; it is kept here as-is.
records = []  # one dict per fitted configuration, turned into a frame at the end
for kernel in ["linear", "poly", "rbf", "sigmoid"]:
    for decision_function_shape in ["ovr", "ovo"]:
        for C in [1.0, 2.0, 3.0, 4.0]:
            model = SVC(
                kernel=kernel,
                decision_function_shape=decision_function_shape,
                C=C,
                random_state=SEED,
            )
            model.fit(X_train, y_train)
            preds = model.predict(X_test)
            records.append(
                {
                    "Kernel": kernel,
                    "Decision": decision_function_shape,
                    "C": C,
                    "Accuracy": accuracy_score(y_test, preds),
                }
            )
# Best accuracy first. A stable sort keeps tied configurations in grid order
# rather than the arbitrary order the default quicksort leaves them in.
results = pd.DataFrame(records).sort_values(
    "Accuracy", ascending=False, kind="stable"
)
results

Unnamed: 0,Kernel,Decision,C,Accuracy
0,linear,ovr,1.0,0.97031
1,linear,ovr,2.0,0.97031
2,linear,ovr,3.0,0.97031
3,linear,ovr,4.0,0.97031
4,linear,ovo,1.0,0.97031
5,linear,ovo,2.0,0.97031
6,linear,ovo,3.0,0.97031
7,linear,ovo,4.0,0.97031
20,rbf,ovo,1.0,0.97031
16,rbf,ovr,1.0,0.97031
