# SVM

In [5]:
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.preprocessing import StandardScaler
from data_preprocess import load_training_data,load_test_data, normalize_features
from sklearn.metrics import accuracy_score, f1_score
import pandas as pd
import numpy as np
import time
from joblib import dump
from sklearn import svm
import joblib

# Training

In [6]:
# Load data
X_train, y_train = load_training_data('data/train.csv')

# Normalize the features
X_train = normalize_features(X_train)

# Define the parameter grid
param_grid = {
    'C': [0.1, 1, 10, 100, 200],
    'gamma': [1, 0.1, 0.01, 0.001],
    'kernel': ['linear', 'poly', 'rbf', 'sigmoid']
}

# Initialize a DataFrame to store the results
results_df = pd.DataFrame(columns=['kernel', 'C', 'gamma', 'accuracy', 'f1_score'])

# Define 5-fold cross validation test harness
kfold = KFold(n_splits=5, shuffle=True)

# Perform 5-fold cross validation
for kernel in param_grid['kernel']:
    for C in param_grid['C']:
        for gamma in param_grid['gamma']:
            start_time = time.time()

            # Create a SVM classifier using the kernel trick
            clf = svm.SVC(kernel=kernel, C=C, gamma=gamma)
            
            # Perform cross-validation manually to get the number of iterations
            cv_results = []
            f1_results = []
            for train_index, test_index in kfold.split(X_train):
                X_train_fold, X_test_fold = X_train.iloc[train_index], X_train.iloc[test_index]
                y_train_fold, y_test_fold = y_train.iloc[train_index], y_train.iloc[test_index]
                clf.fit(X_train_fold, y_train_fold)
                score = clf.score(X_test_fold, y_test_fold)
                cv_results.append(score)
                y_pred = clf.predict(X_test_fold)
                f1 = f1_score(y_test_fold, y_pred, average='weighted')
                f1_results.append(f1)

            # Save the model
            model_filename = "models/svm.joblib"
            dump(clf, model_filename)

            end_time = time.time()
            elapsed_time = end_time - start_time
            print(f"Kernel: {kernel}, C: {C}, Gamma: {gamma}\nCross-validation mean accuracy: {np.mean(cv_results)*100}%\nElapsed time: {elapsed_time} seconds\n")

            new_row = pd.DataFrame({
                'kernel': [kernel],
                'C': [C],
                'gamma': [gamma],
                'accuracy': [np.mean(cv_results)],
                'f1_score': [np.mean(f1_results)]
            })

            results_df = pd.concat([results_df, new_row], ignore_index=True)

Kernel: linear, C: 0.1, Gamma: 1
Cross-validation mean accuracy: 58.53419282511211%
Elapsed time: 0.8874905109405518 seconds

Kernel: linear, C: 0.1, Gamma: 0.1
Cross-validation mean accuracy: 58.53539397821908%
Elapsed time: 0.4089205265045166 seconds

Kernel: linear, C: 0.1, Gamma: 0.01
Cross-validation mean accuracy: 57.9103939782191%
Elapsed time: 0.44971346855163574 seconds

Kernel: linear, C: 0.1, Gamma: 0.001
Cross-validation mean accuracy: 58.271540679051895%
Elapsed time: 0.43506765365600586 seconds

Kernel: linear, C: 1, Gamma: 1
Cross-validation mean accuracy: 57.641335682254955%
Elapsed time: 0.8371241092681885 seconds

Kernel: linear, C: 1, Gamma: 0.1
Cross-validation mean accuracy: 57.37948430493274%
Elapsed time: 0.6830554008483887 seconds

Kernel: linear, C: 1, Gamma: 0.01
Cross-validation mean accuracy: 57.72661755285073%
Elapsed time: 0.8625984191894531 seconds

Kernel: linear, C: 1, Gamma: 0.001
Cross-validation mean accuracy: 58.71356502242152%
Elapsed time: 0.80401

## The Best Parameters

In [7]:
# Sort the results by accuracy in descending order
sorted_results = results_df.sort_values(by='accuracy', ascending=False)

# Print the best parameters and corresponding accuracy
best_params = sorted_results.iloc[0]
print(f"Best parameters: Kernel={best_params['kernel']}, C={best_params['C']}, Gamma={best_params['gamma']}")
print(f"Best accuracy: {best_params['accuracy']*100}%")

Best parameters: Kernel=rbf, C=1.0, Gamma=1
Best accuracy: 62.46316463805254%


## Train the Model

In [8]:
# Find the parameters with the highest accuracy
best_params = results_df.loc[results_df['accuracy'].idxmax()]

# Print the best parameters
print(f"Best parameters:\nKernel: {best_params['kernel']}, C: {best_params['C']}, Gamma: {best_params['gamma']}\nAccuracy: {best_params['accuracy']}, F1 Score: {best_params['f1_score']}")

# Train the model with the best parameters on the full training set
best_clf = svm.SVC(kernel=best_params['kernel'], C=best_params['C'], gamma=best_params['gamma'])
best_clf.fit(X_train, y_train)

# Save the best model
best_model_filename = "models/svm.joblib"
dump(best_clf, best_model_filename)

Best parameters:
Kernel: rbf, C: 1.0, Gamma: 1
Accuracy: 0.6246316463805254, F1 Score: 0.5931782391006815


['models/svm.joblib']

## Predict

In [10]:
# Load the model
best_model_filename = f"models/svm.joblib"
best_clf = joblib.load(best_model_filename)

# Load the test data
X_test, y_test = load_test_data('data/test.csv')

# Normalize the test data
X_test_normalized = normalize_features(X_test)

# Predict the labels of the test set
y_pred = best_clf.predict(X_test_normalized)

# Calculate the accuracy of the model on the test set
accuracy = accuracy_score(y_test, y_pred)

# Calculate the F1 score of the model on the test set
f1 = f1_score(y_test, y_pred, average='weighted')

# Print the accuracy
print(f"Accuracy: {accuracy}")
print(f"F1 Score: {f1}")

Accuracy: 0.6416666666666667
F1 Score: 0.6288297020605569
