# Support Vector Machine Model

Rough Plan:
- load the pre made splits
- build the preprocessing part
- Train different models: a basic linear SVM, a linear SVM with balanced class weights, a version with tuned C, and possibly a nonlinear SVM with an RBF kernel

The data is very imbalanced (about 1% positives), so accuracy on its own gives little insight; precision, recall and F1 on the positive class are more informative.

In [39]:
#Imports
import pandas as pd
import numpy as np

from sklearn.base import clone
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer

from sklearn.svm import SVC
from sklearn.svm import LinearSVC #computes faster (maybe change back to original to keep uniform across iterations, not sure yet)

from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    confusion_matrix, classification_report
)

from sklearn.model_selection import GridSearchCV

In [40]:
#Loading the pre-made train / val / test splits (features and targets live in separate CSV files)
X_train = pd.read_csv("../data/X_train.csv")
X_val   = pd.read_csv("../data/X_val.csv")
X_test  = pd.read_csv("../data/X_test.csv")

# squeeze("columns") turns a single-column target frame into a Series but, unlike a bare
# squeeze(), never collapses a one-row file all the way down to a scalar.
y_train = pd.read_csv("../data/y_train.csv").squeeze("columns")
y_val   = pd.read_csv("../data/y_val.csv").squeeze("columns")
y_test  = pd.read_csv("../data/y_test.csv").squeeze("columns")

# Fail fast if a feature file and its target file are out of step.
assert len(X_train) == len(y_train), "X_train / y_train row counts differ"
assert len(X_val) == len(y_val), "X_val / y_val row counts differ"
assert len(X_test) == len(y_test), "X_test / y_test row counts differ"

print("X_train:", X_train.shape, "y_train:", y_train.shape)
print("Train class balance:", y_train.value_counts(normalize=True).to_dict())

X_train: (25519, 10) y_train: (25519,)
Train class balance: {0: 0.9894980210823308, 1: 0.010501978917669188}


In [41]:
# Pipeline + parameter grid for a cross-validated RBF search.
# The feature frame contains string-valued categorical columns, so a bare StandardScaler
# in front of the SVM would fail as soon as the pipeline is fitted. The preprocessing step
# therefore imputes/scales the numeric columns and imputes/one-hot encodes the rest.
# It is built inline (from X_train's dtypes) so this cell does not depend on cells below it.
pipeline = Pipeline([
    ('preproc', ColumnTransformer(
        transformers=[
            (
                'num',
                Pipeline([
                    ('imputer', SimpleImputer(strategy='median')),
                    ('scaler', StandardScaler()),
                ]),
                X_train.select_dtypes(include=['number']).columns.tolist(),
            ),
            (
                'cat',
                Pipeline([
                    ('imputer', SimpleImputer(strategy='constant', fill_value='Missing')),
                    ('onehot', OneHotEncoder(handle_unknown='ignore')),
                ]),
                X_train.select_dtypes(exclude=['number']).columns.tolist(),
            ),
        ],
        remainder='drop',
    )),
    # Step name 'svm' must stay in sync with the 'svm__' prefixes in param_grid below.
    ('svm', SVC(class_weight='balanced', random_state=42))
])

# 4 values of C x 4 values of gamma, RBF kernel only.
param_grid = {
    'svm__C': [0.1, 1, 10, 100],
    'svm__gamma': [1, 0.1, 0.01, 0.001],
    'svm__kernel': ['rbf']
}

In [42]:
# Grid search wrapper around `pipeline`; candidates are ranked by F1 and the
# best one is refitted. The search only runs once grid.fit(...) is called.
grid = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring='f1',  # optimising for f1 score
    refit=True,
    n_jobs=-1,     # use every available core
    verbose=1,
)

In [43]:
# Split the feature names by dtype: numeric columns vs. everything else.
numeric_cols = list(X_train.select_dtypes(include="number").columns)
categorical_cols = list(X_train.select_dtypes(exclude="number").columns)

print("Numeric columns:", numeric_cols)
print("Categorical columns:", categorical_cols)

Numeric columns: ['AMT_INCOME_TOTAL', 'DAYS_EMPLOYED', 'DAYS_BIRTH', 'CNT_FAM_MEMBERS', 'FLAG_MOBIL']
Categorical columns: ['OCCUPATION_TYPE', 'CODE_GENDER', 'NAME_EDUCATION_TYPE', 'NAME_FAMILY_STATUS', 'NAME_HOUSING_TYPE']


In [29]:
# Numeric branch: fill gaps with the column median, then standardise.
numeric_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])

# Categorical branch: fill gaps with the literal label "Missing", then one-hot encode.
# handle_unknown="ignore" keeps categories unseen during fit from raising at transform time.
categorical_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="constant", fill_value="Missing")),
    ("onehot", OneHotEncoder(handle_unknown="ignore")),
])

# Route each column list through its branch; columns in neither list are dropped.
preprocessor = ColumnTransformer(
    [
        ("num", numeric_transformer, numeric_cols),
        ("cat", categorical_transformer, categorical_cols),
    ],
    remainder="drop",
)

In [30]:
#Score a fitted model on one split and collect everything in a dict
def evaluate(model, X, y, name="SET"):
    """Predict on ``X`` and compare against ``y``.

    Parameters
    ----------
    model : fitted estimator exposing ``predict``
    X : features passed straight to ``model.predict``
    y : true labels for ``X``
    name : label stored under the ``"set"`` key (used as the heading when printed)

    Returns
    -------
    dict with keys ``set``, ``accuracy``, ``precision``, ``recall``, ``f1``,
    ``confusion_matrix`` and ``report``. Precision/recall/F1 and the report use
    ``zero_division=0``, so a model that predicts no positives scores 0 rather
    than triggering a warning.
    """
    y_hat = model.predict(X)

    result = {"set": name, "accuracy": accuracy_score(y, y_hat)}
    for key, metric in (
        ("precision", precision_score),
        ("recall", recall_score),
        ("f1", f1_score),
    ):
        result[key] = metric(y, y_hat, zero_division=0)

    result["confusion_matrix"] = confusion_matrix(y, y_hat)
    result["report"] = classification_report(y, y_hat, zero_division=0)
    return result

#Pretty-print a result dict produced by evaluate()
def print_eval(res):
    """Print the split name, the four scalar metrics, the confusion matrix and the report.

    ``res`` must provide the keys ``set``, ``accuracy``, ``precision``, ``recall``,
    ``f1``, ``confusion_matrix`` and ``report``. Nothing is returned.
    """
    print(f"\n {res['set']} ")

    scalar_lines = (
        ("Accuracy :", "accuracy"),
        ("Precision:", "precision"),
        ("Recall   :", "recall"),
        ("F1       :", "f1"),
    )
    for label, key in scalar_lines:
        print(label, res[key])

    print("Confusion matrix:\n", res["confusion_matrix"])
    print("\nReport:\n", res["report"])

In [31]:
#First SVM, simplest one. It's linear and has no class weight. Shows why imbalance is a problem
svm_v1 = Pipeline(steps=[
    # clone() gives this pipeline its own unfitted copy of the preprocessing.
    # A Pipeline fits its steps in place, so sharing the single `preprocessor`
    # object would let any later fit silently overwrite this model's transformer state.
    ("preproc", clone(preprocessor)),
    ("clf", SVC(kernel="linear", C=1.0))  # no class_weight yet
])

svm_v1.fit(X_train, y_train) #Training

#Eval on each split and print
res_train_v1 = evaluate(svm_v1, X_train, y_train, "TRAIN (v1)")
res_val_v1   = evaluate(svm_v1, X_val, y_val, "VAL (v1)")
res_test_v1  = evaluate(svm_v1, X_test, y_test, "TEST (v1)")

print_eval(res_train_v1)
print_eval(res_val_v1)
print_eval(res_test_v1)


 TRAIN (v1) 
Accuracy : 0.9894980210823308
Precision: 0.0
Recall   : 0.0
F1       : 0.0
Confusion matrix:
 [[25251     0]
 [  268     0]]

Report:
               precision    recall  f1-score   support

           0       0.99      1.00      0.99     25251
           1       0.00      0.00      0.00       268

    accuracy                           0.99     25519
   macro avg       0.49      0.50      0.50     25519
weighted avg       0.98      0.99      0.98     25519


 VAL (v1) 
Accuracy : 0.989394770524776
Precision: 0.0
Recall   : 0.0
F1       : 0.0
Confusion matrix:
 [[5411    0]
 [  58    0]]

Report:
               precision    recall  f1-score   support

           0       0.99      1.00      0.99      5411
           1       0.00      0.00      0.00        58

    accuracy                           0.99      5469
   macro avg       0.49      0.50      0.50      5469
weighted avg       0.98      0.99      0.98      5469


 TEST (v1) 
Accuracy : 0.9895776193088316
Precision: 0

In [32]:
#Second SVM, includes class_weight = "balanced" to help the case where high_risk = 1
svm_v2 = Pipeline(steps=[
    # Own unfitted copy of the preprocessing: a Pipeline fits its steps in place, so
    # sharing the single `preprocessor` object would couple this model to every later fit.
    ("preproc", clone(preprocessor)),
    # max_iter caps the solver to keep run time down (remove the cap if you have time).
    # NOTE(review): if the cap is reached the solver stops before converging, so the
    # v2 metrics may understate what a fully trained balanced linear SVM can do.
    ("clf", SVC(kernel="linear", C=1.0, class_weight="balanced", max_iter=15000, tol=1e-3))
])

svm_v2.fit(X_train, y_train)

res_train_v2 = evaluate(svm_v2, X_train, y_train, "TRAIN (v2)")
res_val_v2   = evaluate(svm_v2, X_val, y_val, "VAL (v2)")
res_test_v2  = evaluate(svm_v2, X_test, y_test, "TEST (v2)")

print_eval(res_train_v2)
print_eval(res_val_v2)
print_eval(res_test_v2)




 TRAIN (v2) 
Accuracy : 0.2363337121360555
Precision: 0.010792099368764
Recall   : 0.7910447761194029
F1       : 0.02129369224588188
Confusion matrix:
 [[ 5819 19432]
 [   56   212]]

Report:
               precision    recall  f1-score   support

           0       0.99      0.23      0.37     25251
           1       0.01      0.79      0.02       268

    accuracy                           0.24     25519
   macro avg       0.50      0.51      0.20     25519
weighted avg       0.98      0.24      0.37     25519


 VAL (v2) 
Accuracy : 0.2316694093984275
Precision: 0.010396975425330813
Recall   : 0.7586206896551724
F1       : 0.020512820512820513
Confusion matrix:
 [[1223 4188]
 [  14   44]]

Report:
               precision    recall  f1-score   support

           0       0.99      0.23      0.37      5411
           1       0.01      0.76      0.02        58

    accuracy                           0.23      5469
   macro avg       0.50      0.49      0.19      5469
weighted avg   

In [33]:
#Third SVM model, finding the best C. Smaller C = more regularisation, Larger C = less regularisation
C_values = [0.001, 0.01, 0.1, 1.0,2.0,3.0,7.0, 10.0,55.0, 100.0]
rows = []

for C in C_values:
    model = Pipeline(steps=[
        # Fresh unfitted copy per candidate: a Pipeline fits its steps in place, so reusing
        # the single `preprocessor` object would refit the transformer that the earlier
        # pipelines (svm_v1, svm_v2) also hold.
        ("preproc", clone(preprocessor)),
        #Changed to use LinearSVC since it computes much faster, change this back if need be
        ("clf", LinearSVC(C=C, class_weight="balanced", random_state=42, max_iter=20000))
    ])
    model.fit(X_train, y_train)
    pred_val = model.predict(X_val)
    
    #Store results into here then below we turn it into a table to view
    rows.append({
        "C": C,
        "val_precision": precision_score(y_val, pred_val, zero_division=0),
        "val_recall": recall_score(y_val, pred_val, zero_division=0),
        "val_f1": f1_score(y_val, pred_val, zero_division=0),
    })

# One row per C, best validation F1 first (the next cell reads the top row)
results_C = pd.DataFrame(rows).sort_values("val_f1", ascending=False)
results_C


Unnamed: 0,C,val_precision,val_recall,val_f1
2,0.1,0.010347,0.344828,0.02009
3,1.0,0.01032,0.344828,0.02004
4,2.0,0.01032,0.344828,0.02004
5,3.0,0.01032,0.344828,0.02004
1,0.01,0.010309,0.344828,0.02002
6,7.0,0.010309,0.344828,0.02002
7,10.0,0.010309,0.344828,0.02002
8,55.0,0.010309,0.344828,0.02002
9,100.0,0.010309,0.344828,0.02002
0,0.001,0.010111,0.344828,0.019646


In [34]:
#Take the best C (based on Val F1 score) and check SVMs results
best_C = results_C.iloc[0]["C"]
print("Best C by validation F1:", best_C)

# Validation data is no longer needed for selection, so fold it into the training set
X_train_full = pd.concat([X_train, X_val], axis=0)
y_train_full = pd.concat([y_train, y_val], axis=0)

svm_v3 = Pipeline(steps=[
    # clone() matters most here: this pipeline is fitted on train+val, and a Pipeline fits
    # its steps in place. Reusing the single `preprocessor` object would overwrite the
    # train-only transformer state that svm_v1 / svm_v2 still rely on.
    ("preproc", clone(preprocessor)),
    ("clf", LinearSVC(C=float(best_C), class_weight="balanced", random_state=42, max_iter=20000))
])

svm_v3.fit(X_train_full, y_train_full)

res_test_v3 = evaluate(svm_v3, X_test, y_test, "TEST (v3 LinearSVC tuned C)")

print_eval(res_test_v3)

Best C by validation F1: 0.1

 TEST (v3 LinearSVC tuned C) 
Accuracy : 0.631194002559883
Precision: 0.015810276679841896
Recall   : 0.5614035087719298
F1       : 0.030754444978375782
Confusion matrix:
 [[3420 1992]
 [  25   32]]

Report:
               precision    recall  f1-score   support

           0       0.99      0.63      0.77      5412
           1       0.02      0.56      0.03        57

    accuracy                           0.63      5469
   macro avg       0.50      0.60      0.40      5469
weighted avg       0.98      0.63      0.76      5469



In [35]:
#Fourth model is trying non-linear SVM using RBF as well as smaller list of tuning C. Maybe nonlinear performs better?
# NOTE: this fits 9 kernel SVMs (3 values of C x 3 values of gamma) with no iteration cap,
# so expect it to be the slowest cell in the notebook.
C_values = [0.1, 1, 10]
gamma_values = ["scale", 0.1, 0.01]

rows = []
for C in C_values:
    for gamma in gamma_values:
        model = Pipeline(steps=[
            # Fresh unfitted copy per candidate: a Pipeline fits its steps in place, so reusing
            # the single `preprocessor` object would refit the transformer held by earlier pipelines.
            ("preproc", clone(preprocessor)),
            ("clf", SVC(kernel="rbf", C=C, gamma=gamma, class_weight="balanced"))
        ])
        model.fit(X_train, y_train)
        pred_val = model.predict(X_val)

        rows.append({
            "C": C,
            "gamma": gamma,
            "val_precision": precision_score(y_val, pred_val, zero_division=0),
            "val_recall": recall_score(y_val, pred_val, zero_division=0),
            "val_f1": f1_score(y_val, pred_val, zero_division=0),
            # how many validation rows were flagged positive by this candidate
            "val_pred_pos": int((pred_val == 1).sum())
        })

# One row per (C, gamma) pair, best validation F1 first (the next cell reads the top row)
rbf_results = pd.DataFrame(rows).sort_values("val_f1", ascending=False)
rbf_results

KeyboardInterrupt: 

In [None]:
#Taking the best RBF parameters based on val F1 and returning its results
best = rbf_results.iloc[0]
best_C = float(best["C"])
best_gamma = best["gamma"]  # either the string "scale" or a number, passed through unchanged

print("Best RBF params by validation F1:", best_C, best_gamma)

# Validation data is no longer needed for selection, so fold it into the training set
X_train_full = pd.concat([X_train, X_val], axis=0)
y_train_full = pd.concat([y_train, y_val], axis=0)

svm_v4 = Pipeline(steps=[
    # Own unfitted copy of the preprocessing: this pipeline is fitted on train+val, and a
    # Pipeline fits its steps in place, so reusing the single `preprocessor` object would
    # overwrite the transformer state held by the pipelines built earlier.
    ("preproc", clone(preprocessor)),
    ("clf", SVC(kernel="rbf", C=best_C, gamma=best_gamma, class_weight="balanced"))
])

svm_v4.fit(X_train_full, y_train_full)

res_test_v4 = evaluate(svm_v4, X_test, y_test, f"TEST (v4 RBF, C={best_C}, gamma={best_gamma})")
print_eval(res_test_v4)


Best RBF params by validation F1: 10.0 scale

 TEST (v4 RBF, C=10.0, gamma=scale) 
Accuracy : 0.7957579082099104
Precision: 0.027629233511586453
Recall   : 0.543859649122807
F1       : 0.05258693808312129
Confusion matrix:
 [[4321 1091]
 [  26   31]]

Report:
               precision    recall  f1-score   support

           0       0.99      0.80      0.89      5412
           1       0.03      0.54      0.05        57

    accuracy                           0.80      5469
   macro avg       0.51      0.67      0.47      5469
weighted avg       0.98      0.80      0.88      5469



In [None]:
#Side-by-side test metrics for the four iterations, best test F1 first
summary = pd.DataFrame([
    {
        "model": label,
        "test_f1": res["f1"],
        "test_recall": res["recall"],
        "test_precision": res["precision"],
    }
    for label, res in (
        ("SVM v1 (linear, no weights)", res_test_v1),
        ("SVM v2 (linear, balanced)", res_test_v2),
        ("SVM v3 (LinearSVC tuned C)", res_test_v3),
        ("SVM v4 (RBF tuned)", res_test_v4),
    )
]).sort_values("test_f1", ascending=False)

summary

Unnamed: 0,model,test_f1,test_recall,test_precision
3,SVM v4 (RBF tuned),0.052587,0.54386,0.027629
2,SVM v3 (LinearSVC tuned C),0.030754,0.561404,0.01581
1,"SVM v2 (linear, balanced)",0.018212,0.684211,0.009229
0,"SVM v1 (linear, no weights)",0.0,0.0,0.0


# Conclusion
Because the classes are extremely imbalanced, accuracy is not a useful metric here. Instead we look at precision, recall and F1 for the positive class (high_risk = 1).

Results by iteration:
- v1 (linear, no class weights): the model failed to identify any positive cases.
- v2 (linear, balanced class weights): recall rises to 0.68, but precision is extremely low, so F1 is also very low. In other words, it catches more true positives but creates a lot of false positives.
- v3 (tuned C): F1 improves slightly, but there is not much change from v2.
- v4 (tuned RBF): the best performance of the four; the nonlinear model seems to capture things that the linear models couldn't, but the results are still very poor.

Overall, the decision tree and random forest models performed far better than the SVM.