# Support Vector Machine (SVM) Model

## Data Preprocessing

In [1]:
# Importing the libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import os
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score, precision_score, recall_score
from sklearn.svm import SVC
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, cross_val_predict

# Set working directory
month_file = '6_October'

# Set working directory
os.chdir("/Users/mau/Library/CloudStorage/Dropbox/Mac/Documents/Dissertation/Chapter 2/Entire_Data/By month/"+month_file+"/Ending Balances/Per_Player")

## Load Dataframes

### 1 MIN

In [2]:
# Filter Columns
filter = ['session_time', 'gender', 'age_gen', 'day', 'timeofday', 'first_outcome',
        'first_wager','first_p/b', 'last_outcome', 'last_wager', 'last_p/b',
        'beginning_amt', 'ending_amt', 'ending_balance', 'ave_slotdenom', 
        'std_slotdenom', 'min_slotdenom', 'max_slotdenom', 'ave_theo_payback',
        'min_theo_payback', 'max_theo_payback', 'ave_wageramt', 'std_wageramt',
        'min_wager', 'max_wager', 'ave_p/b', 'std_p/b', 'max_p/b', 'max_profit', 'depletion_slope', 
        '#inc_slotdenom', '#dec_slotdenom', '#inc_maxbet', '#dec_maxbet', '#W', '#L', '#NH', '#D',
        'w/min', 'l/min', '#2ws', '2ws_profit', '2ws_wgramt','2ws/min', 
        '#3ws', '3ws_profit', '3ws_wgramt', '3ws/min', '#4ws', '4ws_profit', '4ws_wgramt', '4ws/min', 
        'w/g', 'l/g', 'nh/g', 'd/g', 'ave_time_per_gamble', 
        'min_time_per_gamble', 'max_time_per_gamble', 'total_gambles',
        'machines_changes', 'unique_machines', 'ave_time_per_machine', 'classification']

# Columns NOT INCLUDED
# 'playerkey', 'rank', 'age_range', '#W', '#L', '#NH', '#D','total_duration', 'total_gambles'

# Load dataset
dataset = pd.read_parquet('df_1min_top_vs_ntop_players.parquet', columns=filter)

# Keep only session_time 1
dataset = dataset[dataset['session_time'] == 1]
# Drop age_range and playerkey
dataset = dataset.drop(['session_time'], axis=1)

# Convert ave_time_per_machine to seconds
dataset['ave_time_per_machine'] = dataset['ave_time_per_machine'].dt.total_seconds()

# # Seperate dependent and independent variables
X = dataset.iloc[:, :-1].values
y = dataset.iloc[:, -1].values

# Econde gender column (Binary)
le = LabelEncoder()

# Binary Encode gender
X[:, 0] = le.fit_transform(X[:, 0])

# # # Encode age_generartion, first_outoce, last_outcome, time of day columns
ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [1, 2, 3, 4, 7])], remainder='passthrough')
X = np.array(ct.fit_transform(X))

y = le.fit_transform(y)

## Handling Class Imbalance 
# Apply SMOTE - SMOTE generates synthetic samples for the minority class to balance the dataset:
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled , test_size = 0.2, random_state = 1)

sc = StandardScaler()

# Scale all columns except the encoded ones
X_train[:, 25:] = sc.fit_transform(X_train[:, 25:])
X_test[:, 25:] = sc.transform(X_test[:, 25:])

classifier = SVC(kernel = 'linear', random_state = 0)
classifier.fit(X_train, y_train)

y_pred = classifier.predict(X_test)

cm = confusion_matrix(y_test, y_pred)
print(cm)
print('Accuracy: ', accuracy_score(y_test, y_pred))
print('Precision: ', precision_score(y_test, y_pred, average='macro', zero_division=1))
print('Recall: ', recall_score(y_test, y_pred, average='macro'))
print('F1 Score: ', f1_score(y_test, y_pred, average='macro'))

[[612 362]
 [307 629]]
Accuracy:  0.6497382198952879
Precision:  0.6503268260920647
Recall:  0.6501726513276822
F1 Score:  0.649710470197058


In [3]:
# Define the SVM hyperparameters and their respective ranges
param_grid = {
    'C': [0.1, 1, 10],  # Regularization parameter
    'kernel': ['linear', 'poly', 'rbf', 'sigmoid']  # Kernel type
}

# Create an SVM classifier
svm_classifier = SVC(random_state=0)

# Create the GridSearchCV object
grid_search = GridSearchCV(estimator=svm_classifier, param_grid=param_grid, cv=5, n_jobs=-1)

# Fit the data to perform the search
grid_search.fit(X_train, y_train)

# Get the best hyperparameters
best_params = grid_search.best_params_

# Get the best model
best_model = grid_search.best_estimator_

# Make predictions with the best model
y_pred = best_model.predict(X_test)

# Evaluate the best model
cm = confusion_matrix(y_test, y_pred)
print('Confusion Matrix:')
print(cm)
print('Accuracy:', accuracy_score(y_test, y_pred))
print('Precision:', precision_score(y_test, y_pred, average='macro', zero_division=1))
print('Recall:', recall_score(y_test, y_pred, average='macro'))
print('F1 Score:', f1_score(y_test, y_pred, average='macro'))

# Print the best hyperparameters
print('Best Hyperparameters:', best_params)

Confusion Matrix:
[[822 152]
 [113 823]]
Accuracy: 0.8612565445026178
Precision: 0.861623474564651
Recall: 0.8616080047034873
F1 Score: 0.8612565064709046
Best Hyperparameters: {'C': 10, 'kernel': 'rbf'}


In [4]:

## Summary Table of Results
# Table creation 
# Define time intervals
time_intervals = [1, 5, 10, 15]

# Initialize lists to store results
results = []
for time_interval in time_intervals:
    # Load dataset for the specific time interval
    file_name = f'df_{time_interval}min_top_vs_ntop_players.parquet'
    dataset = pd.read_parquet(file_name, columns=filter)

    # Keep only session_time 1
    dataset = dataset[dataset['session_time'] == 1]
    # Drop age_range and playerkey
    dataset = dataset.drop(['session_time'], axis=1)

    # Convert ave_time_per_machine to seconds
    dataset['ave_time_per_machine'] = dataset['ave_time_per_machine'].dt.total_seconds()

    # # Seperate dependent and independent variables
    X = dataset.iloc[:, :-1].values
    y = dataset.iloc[:, -1].values

    # Econde gender column (Binary)
    le = LabelEncoder()

    # Binary Encode gender
    X[:, 0] = le.fit_transform(X[:, 0])

    # # # Encode age_generartion, first_outoce, last_outcome, time of day columns
    ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [1, 2, 3, 4, 7])], remainder='passthrough')
    X = np.array(ct.fit_transform(X))
    y = le.fit_transform(y)

    ## Handling Class Imbalance 
    # Apply SMOTE - SMOTE generates synthetic samples for the minority class to balance the dataset:
    smote = SMOTE(random_state=42)
    X_resampled, y_resampled = smote.fit_resample(X, y)

    X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled , test_size = 0.2, random_state = 1)

    sc = StandardScaler()

    # Scale all columns except the encoded ones
    X_train[:, 25:] = sc.fit_transform(X_train[:, 25:])
    X_test[:, 25:] = sc.transform(X_test[:, 25:])

    # Best hyperparameters from tuning
    best_hyperparameters = best_params

    # Initialize logistic regression model with best hyperparameters
    classifier = SVC(random_state=0, **best_hyperparameters)
    classifier.fit(X_train, y_train)

    y_pred = classifier.predict(X_test)

    # Calculate evaluation metrics
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='macro', zero_division=1)
    recall = recall_score(y_test, y_pred, average='macro')
    f1 = f1_score(y_test, y_pred, average='macro')

    # Append results for this time interval
    results.append([f'{time_interval} min', round(accuracy, 3), round(precision, 3), round(recall, 3), round(f1, 3)])

# Create a DataFrame for the results
columns = ['Time', 'Accuracy', 'Precision', 'Recall', 'F1 Score']
results_df = pd.DataFrame(results, columns=columns)

# Print the results as a table
# print(results_df)

# Print the results as a LaTeX table
latex_table = results_df.to_latex(index=False, escape=False)
print(latex_table)

\begin{tabular}{lrrrr}
\toprule
  Time &  Accuracy &  Precision &  Recall &  F1 Score \\
\midrule
 1 min &     0.861 &      0.862 &   0.862 &     0.861 \\
 5 min &     0.888 &      0.888 &   0.888 &     0.888 \\
10 min &     0.894 &      0.894 &   0.894 &     0.894 \\
15 min &     0.906 &      0.906 &   0.906 &     0.906 \\
\bottomrule
\end{tabular}



  latex_table = results_df.to_latex(index=False, escape=False)


## Cross Validation

In [5]:
## Summary Table of Results
# Table creation 
# Define time intervals
time_intervals = [1, 5, 10, 15]

# Initialize lists to store results
cvscores = []
results = []
for time_interval in time_intervals:
    # Load dataset for the specific time interval
    file_name = f'df_{time_interval}min_top_vs_ntop_players.parquet'
    dataset = pd.read_parquet(file_name, columns=filter)

    # Keep only session_time 1
    dataset = dataset[dataset['session_time'] == 1]
    # Drop age_range and playerkey
    dataset = dataset.drop(['session_time'], axis=1)

    # Convert ave_time_per_machine to seconds
    dataset['ave_time_per_machine'] = dataset['ave_time_per_machine'].dt.total_seconds()

    # # Seperate dependent and independent variables
    X = dataset.iloc[:, :-1].values
    y = dataset.iloc[:, -1].values

    # Econde gender column (Binary)
    le = LabelEncoder()

    # Binary Encode gender
    X[:, 0] = le.fit_transform(X[:, 0])

    # # # Encode age_generartion, first_outoce, last_outcome, time of day columns
    ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [1, 2, 3, 4, 7])], remainder='passthrough')
    X = np.array(ct.fit_transform(X))
    y = le.fit_transform(y)

    ## Handling Class Imbalance 
    # Apply SMOTE - SMOTE generates synthetic samples for the minority class to balance the dataset:
    smote = SMOTE(random_state=42)
    X_resampled, y_resampled = smote.fit_resample(X, y)

    sc = StandardScaler()

    # Scale all columns except the encoded ones
    X_resampled[:, 25:] = sc.fit_transform(X_resampled[:, 25:])

    # Best hyperparameters from tuning
    best_hyperparameters = best_params

    classifier = SVC(random_state = 0, **best_hyperparameters)
    
    # Perform cross-validation
    y_pred_cv = cross_val_predict(classifier, X_resampled, y_resampled, cv=10, n_jobs=-1)

    # Perform 10-fold cross-validation
    cv_scores = cross_val_score(classifier, X_resampled, y_resampled, cv=10, scoring='accuracy', n_jobs=-1)
    mean_accuracy = cv_scores.mean()
    std_accuracy = cv_scores.std()
    cv_scores = cv_scores.tolist()
    cv_scores = [ '%.3f' % elem for elem in cv_scores]

    # Calculate precision, recall, and F1-score
    precision = precision_score(y_resampled, y_pred_cv, average='macro')
    recall = recall_score(y_resampled, y_pred_cv, average='macro')
    f1 = f1_score(y_resampled, y_pred_cv, average='macro')

    # Append results for this time interval
    results.append([f'{time_interval} min', round(mean_accuracy, 3), round(std_accuracy, 3), round(precision, 3), round(recall, 3), round(f1, 3)])
    cvscores.append([f'{time_interval} min', *cv_scores])

# Create a DataFrame for the results
columns = ['Time', 'Mean_Accuracy', 'Std_Accuracy', 'Precision', 'Recall', 'F1 Score']
results_df = pd.DataFrame(results, columns=columns)

# Print the results as a table
cv_columns = ['Time', 'CV 1', 'CV 2', 'CV 3', 'CV 4', 'CV 5', 'CV 6', 'CV 7', 'CV 8', 'CV 9', 'CV 10']
cv_results_df = pd.DataFrame(cvscores, columns=cv_columns)

# Print the results as a LaTeX table
latex_table = results_df.to_latex(index=False, escape=False)
latex_table2 = cv_results_df.to_latex(index=False, escape=False)
print(latex_table2)
print(latex_table)


\begin{tabular}{lllllllllll}
\toprule
  Time &  CV 1 &  CV 2 &  CV 3 &  CV 4 &  CV 5 &  CV 6 &  CV 7 &  CV 8 &  CV 9 & CV 10 \\
\midrule
 1 min & 0.749 & 0.768 & 0.839 & 0.872 & 0.858 & 0.885 & 0.798 & 0.857 & 0.865 & 0.875 \\
 5 min & 0.812 & 0.826 & 0.835 & 0.881 & 0.889 & 0.894 & 0.861 & 0.854 & 0.893 & 0.890 \\
10 min & 0.835 & 0.846 & 0.867 & 0.890 & 0.897 & 0.908 & 0.874 & 0.883 & 0.874 & 0.926 \\
15 min & 0.851 & 0.871 & 0.869 & 0.899 & 0.903 & 0.906 & 0.870 & 0.890 & 0.874 & 0.915 \\
\bottomrule
\end{tabular}

\begin{tabular}{lrrrrr}
\toprule
  Time &  Mean_Accuracy &  Std_Accuracy &  Precision &  Recall &  F1 Score \\
\midrule
 1 min &          0.836 &         0.046 &      0.841 &   0.836 &     0.836 \\
 5 min &          0.863 &         0.029 &      0.869 &   0.863 &     0.863 \\
10 min &          0.880 &         0.026 &      0.884 &   0.880 &     0.880 \\
15 min &          0.885 &         0.019 &      0.889 &   0.885 &     0.885 \\
\bottomrule
\end{tabular}



  latex_table = results_df.to_latex(index=False, escape=False)
  latex_table2 = cv_results_df.to_latex(index=False, escape=False)
