# K-Nearest Neighbors (K-NN) Classification

## Data Preprocessing

In [1]:
# Importing the libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import os
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
import random
import counting_fns as cf


# List of all the months
months = ['2_June', '3_July', '4_August', '5_September', '6_October']

# Define the path to the data directory and columns to keep
data_path = "/Users/mau/Library/CloudStorage/Dropbox/Mac/Documents/Dissertation/Chapter 2/Entire_Data/By month"

# Filter Columns
filter = ['session_time', 'gender', 'age_gen', 'day', 'timeofday', 'first_outcome',
        'first_wager','first_p/b', 'last_outcome', 'last_wager', 'last_p/b',
        'beginning_amt', 'ending_amt', 'ending_balance', 'ave_slotdenom', 
        'std_slotdenom', 'min_slotdenom', 'max_slotdenom', 'ave_theo_payback',
        'min_theo_payback', 'max_theo_payback', 'ave_wageramt', 'std_wageramt',
        'min_wager', 'max_wager', 'ave_p/b', 'std_p/b', 'max_p/b', 'max_profit', 'depletion_slope', 
        '#inc_slotdenom', '#dec_slotdenom', '#inc_maxbet', '#dec_maxbet', '#W', '#L', '#NH', '#D',
        'w/min', 'l/min', '#2ws', '2ws_profit', '2ws_wgramt','2ws/min', 
        '#3ws', '3ws_profit', '3ws_wgramt', '3ws/min', '#4ws', '4ws_profit', '4ws_wgramt', '4ws/min', 
        'w/g', 'l/g', 'nh/g', 'd/g', 'ave_time_per_gamble', 
        'min_time_per_gamble', 'max_time_per_gamble', 'total_gambles',
        'machines_changes', 'unique_machines', 'ave_time_per_machine', 'classification']



## Randomized Months

In [3]:
# Randomly select 3 months for training set
# random.seed(350)
months_train = random.sample(months, 4)

# Print the months in the training set
print("Months in training set:", months_train)

# Create a list of remaining months for the test set
months_test = [month for month in months if month not in months_train]
# Print the months in the test set
print("Months in test set:", months_test)

Months in training set: ['6_October', '3_July', '2_June', '5_September']
Months in test set: ['4_August']


## 1 MINUTE

In [7]:
# Load the datasets
datasets = cf.load_and_preprocess_datasets_min_ntop(months, data_path, '1min', filter)

# Create training and test sets
dt_train = pd.concat([datasets['dtf'+month[1:]] for month in months_train])
dt_test = pd.concat([datasets['dtf'+month[1:]] for month in months_test])

# # Seperate dependent and independent variables
X_train = dt_train.iloc[:, :-1].values
y_train = dt_train.iloc[:, -1].values
X_test = dt_test.iloc[:, :-1].values
y_test = dt_test.iloc[:, -1].values

# Econde gender column (Binary)
le = LabelEncoder()

# Binary Encode gender
X_train[:, 0] = le.fit_transform(X_train[:, 0])
X_test[:, 0] = le.fit_transform(X_test[:, 0])


# # Encode age_generartion, first_outoce, last_outcome columns
ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [1, 2, 5])], remainder='passthrough')
X_train = np.array(ct.fit_transform(X_train))
X_test = np.array(ct.transform(X_test))

y_train = le.fit_transform(y_train)
y_test = le.fit_transform(y_test)

# # Feature Scaling
sc = StandardScaler()

# Scale all columns except the encoded ones
X_train[:, 14:] = sc.fit_transform(X_train[:, 14:])
X_test[:, 14:] = sc.transform(X_test[:, 14:])


classifier = KNeighborsClassifier(n_neighbors = 5, metric = 'minkowski', p = 2)
classifier.fit(X_train, y_train)

y_pred = classifier.predict(X_test)

cm = confusion_matrix(y_test, y_pred)
print(cm)
print('Accuracy: ', accuracy_score(y_test, y_pred))
print('Precision: ', precision_score(y_test, y_pred, average='macro'))
print('Recall: ', recall_score(y_test, y_pred, average='macro'))
print('F1 Score: ', f1_score(y_test, y_pred, average='macro'))

[[10297   122]
 [ 1139    37]]
Accuracy:  0.891246226821906
Precision:  0.5665533205303346
Recall:  0.509876603967242
F1 Score:  0.49886612222025145


## 2 Min

In [8]:
# Load the datasets for 2min, named it datasets_2
datasets_2 = cf.load_and_preprocess_datasets_min_ntop(months, data_path, '2min', filter)

# Create training and test sets
dt_train = pd.concat([datasets_2['dtf'+month[1:]] for month in months_train])
dt_test = pd.concat([datasets_2['dtf'+month[1:]] for month in months_test])

# # Seperate dependent and independent variables
X_train = dt_train.iloc[:, :-1].values
y_train = dt_train.iloc[:, -1].values
X_test = dt_test.iloc[:, :-1].values
y_test = dt_test.iloc[:, -1].values

# Econde gender column (Binary)
le = LabelEncoder()

# Binary Encode gender
X_train[:, 0] = le.fit_transform(X_train[:, 0])
X_test[:, 0] = le.fit_transform(X_test[:, 0])


# # Encode age_generartion, first_outoce, last_outcome columns
ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [1, 2, 5])], remainder='passthrough')
X_train = np.array(ct.fit_transform(X_train))
X_test = np.array(ct.transform(X_test))

y_train = le.fit_transform(y_train)
y_test = le.fit_transform(y_test)

# # Feature Scaling
sc = StandardScaler()

# Scale all columns except the encoded ones
X_train[:, 14:] = sc.fit_transform(X_train[:, 14:])
X_test[:, 14:] = sc.transform(X_test[:, 14:])

classifier = KNeighborsClassifier(n_neighbors = 5, metric = 'minkowski', p = 2)
classifier.fit(X_train, y_train)

y_pred = classifier.predict(X_test)

cm = confusion_matrix(y_test, y_pred)
print(cm)
print('Accuracy: ', accuracy_score(y_test, y_pred))
print('Precision: ', precision_score(y_test, y_pred, average='macro'))
print('Recall: ', recall_score(y_test, y_pred, average='macro'))
print('F1 Score: ', f1_score(y_test, y_pred, average='macro'))

[[10291   128]
 [ 1132    44]]
Accuracy:  0.8913324708926261
Precision:  0.5783578215310197
Recall:  0.5125648589409849
F1 Score:  0.5037974370560407


## 3 Min

In [9]:
# Load the datasets for 3min, named it datasets_3
datasets_3 = cf.load_and_preprocess_datasets_min_ntop(months, data_path, '3min', filter)

# Create training and test sets
dt_train = pd.concat([datasets_3['dtf'+month[1:]] for month in months_train])
dt_test = pd.concat([datasets_3['dtf'+month[1:]] for month in months_test])

# # Seperate dependent and independent variables
X_train = dt_train.iloc[:, :-1].values
y_train = dt_train.iloc[:, -1].values
X_test = dt_test.iloc[:, :-1].values
y_test = dt_test.iloc[:, -1].values

# Econde gender column (Binary)
le = LabelEncoder()

# Binary Encode gender
X_train[:, 0] = le.fit_transform(X_train[:, 0])
X_test[:, 0] = le.fit_transform(X_test[:, 0])


# # Encode age_generartion, first_outoce, last_outcome columns
ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [1, 2, 5])], remainder='passthrough')
X_train = np.array(ct.fit_transform(X_train))
X_test = np.array(ct.transform(X_test))

y_train = le.fit_transform(y_train)
y_test = le.fit_transform(y_test)

# # Feature Scaling
sc = StandardScaler()

# Scale all columns except the encoded ones
X_train[:, 14:] = sc.fit_transform(X_train[:, 14:])
X_test[:, 14:] = sc.transform(X_test[:, 14:])

classifier = KNeighborsClassifier(n_neighbors = 5, metric = 'minkowski', p = 2)
classifier.fit(X_train, y_train)

y_pred = classifier.predict(X_test)

cm = confusion_matrix(y_test, y_pred)
print(cm)
print('Accuracy: ', accuracy_score(y_test, y_pred))
print('Precision: ', precision_score(y_test, y_pred, average='macro'))
print('Recall: ', recall_score(y_test, y_pred, average='macro'))
print('F1 Score: ', f1_score(y_test, y_pred, average='macro'))

[[10276   143]
 [ 1135    41]]
Accuracy:  0.8897800776196636
Precision:  0.5616803294304122
Recall:  0.5105695099807848
F1 Score:  0.5008754142976476


## 4 min

In [10]:
# Load the datasets for 4min, named it datasets_4
datasets_4 = cf.load_and_preprocess_datasets_min_ntop(months, data_path, '4min', filter)

# Create training and test sets
dt_train = pd.concat([datasets_4['dtf'+month[1:]] for month in months_train])
dt_test = pd.concat([datasets_4['dtf'+month[1:]] for month in months_test])

# # Seperate dependent and independent variables
X_train = dt_train.iloc[:, :-1].values
y_train = dt_train.iloc[:, -1].values
X_test = dt_test.iloc[:, :-1].values
y_test = dt_test.iloc[:, -1].values

# Econde gender column (Binary)
le = LabelEncoder()

# Binary Encode gender
X_train[:, 0] = le.fit_transform(X_train[:, 0])
X_test[:, 0] = le.fit_transform(X_test[:, 0])


# # Encode age_generartion, first_outoce, last_outcome columns
ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [1, 2, 5])], remainder='passthrough')
X_train = np.array(ct.fit_transform(X_train))
X_test = np.array(ct.transform(X_test))

y_train = le.fit_transform(y_train)
y_test = le.fit_transform(y_test)

# # Feature Scaling
sc = StandardScaler()

# Scale all columns except the encoded ones
X_train[:, 14:] = sc.fit_transform(X_train[:, 14:])
X_test[:, 14:] = sc.transform(X_test[:, 14:])

classifier = KNeighborsClassifier(n_neighbors = 5, metric = 'minkowski', p = 2)
classifier.fit(X_train, y_train)

y_pred = classifier.predict(X_test)

cm = confusion_matrix(y_test, y_pred)
print(cm)
print('Accuracy: ', accuracy_score(y_test, y_pred))
print('Precision: ', precision_score(y_test, y_pred, average='macro'))
print('Recall: ', recall_score(y_test, y_pred, average='macro'))
print('F1 Score: ', f1_score(y_test, y_pred, average='macro'))

[[10277   142]
 [ 1135    41]]
Accuracy:  0.8898663216903838
Precision:  0.5622935018071286
Recall:  0.5106174992311926
F1 Score:  0.5009218416042238


## 5 MINUTES

In [11]:
# Load the datasets for 5min, named it datasets_5
datasets_5 = cf.load_and_preprocess_datasets_min_ntop(months, data_path, '5min', filter)

# Create training and test sets
dt_train = pd.concat([datasets_5['dtf'+month[1:]] for month in months_train])
dt_test = pd.concat([datasets_5['dtf'+month[1:]] for month in months_test])

# # Seperate dependent and independent variables
X_train = dt_train.iloc[:, :-1].values
y_train = dt_train.iloc[:, -1].values
X_test = dt_test.iloc[:, :-1].values
y_test = dt_test.iloc[:, -1].values

# Econde gender column (Binary)
le = LabelEncoder()

# Binary Encode gender
X_train[:, 0] = le.fit_transform(X_train[:, 0])
X_test[:, 0] = le.fit_transform(X_test[:, 0])


# # Encode age_generartion, first_outoce, last_outcome columns
ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [1, 2, 5])], remainder='passthrough')
X_train = np.array(ct.fit_transform(X_train))
X_test = np.array(ct.transform(X_test))

y_train = le.fit_transform(y_train)
y_test = le.fit_transform(y_test)

# # Feature Scaling
sc = StandardScaler()

# Scale all columns except the encoded ones
X_train[:, 14:] = sc.fit_transform(X_train[:, 14:])
X_test[:, 14:] = sc.transform(X_test[:, 14:])

classifier = KNeighborsClassifier(n_neighbors = 5, metric = 'minkowski', p = 2)
classifier.fit(X_train, y_train)

y_pred = classifier.predict(X_test)

cm = confusion_matrix(y_test, y_pred)
print(cm)
print('Accuracy: ', accuracy_score(y_test, y_pred))
print('Precision: ', precision_score(y_test, y_pred, average='macro'))
print('Recall: ', recall_score(y_test, y_pred, average='macro'))
print('F1 Score: ', f1_score(y_test, y_pred, average='macro'))


[[10256   163]
 [ 1120    56]]
Accuracy:  0.8893488572660629
Precision:  0.5786274396470339
Recall:  0.5159872759930347
F1 Score:  0.5107100129012524


### 10 Min

In [12]:
# Load the datasets for 5min, named it datasets_5
datasets_10 = cf.load_and_preprocess_datasets_min_ntop(months, data_path, '10min', filter)

# Create training and test sets
dt_train = pd.concat([datasets_5['dtf'+month[1:]] for month in months_train])
dt_test = pd.concat([datasets_5['dtf'+month[1:]] for month in months_test])

# # Seperate dependent and independent variables
X_train = dt_train.iloc[:, :-1].values
y_train = dt_train.iloc[:, -1].values
X_test = dt_test.iloc[:, :-1].values
y_test = dt_test.iloc[:, -1].values

# Econde gender column (Binary)
le = LabelEncoder()

# Binary Encode gender
X_train[:, 0] = le.fit_transform(X_train[:, 0])
X_test[:, 0] = le.fit_transform(X_test[:, 0])


# # Encode age_generartion, first_outoce, last_outcome columns
ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [1, 2, 5])], remainder='passthrough')
X_train = np.array(ct.fit_transform(X_train))
X_test = np.array(ct.transform(X_test))

y_train = le.fit_transform(y_train)
y_test = le.fit_transform(y_test)

# # Feature Scaling
sc = StandardScaler()

# Scale all columns except the encoded ones
X_train[:, 14:] = sc.fit_transform(X_train[:, 14:])
X_test[:, 14:] = sc.transform(X_test[:, 14:])

classifier = KNeighborsClassifier(n_neighbors = 5, metric = 'minkowski', p = 2)
classifier.fit(X_train, y_train)

y_pred = classifier.predict(X_test)

cm = confusion_matrix(y_test, y_pred)
print(cm)
print('Accuracy: ', accuracy_score(y_test, y_pred))
print('Precision: ', precision_score(y_test, y_pred, average='macro'))
print('Recall: ', recall_score(y_test, y_pred, average='macro'))
print('F1 Score: ', f1_score(y_test, y_pred, average='macro'))

[[10256   163]
 [ 1120    56]]
Accuracy:  0.8893488572660629
Precision:  0.5786274396470339
Recall:  0.5159872759930347
F1 Score:  0.5107100129012524


### 15 Min

In [13]:
# Load the datasets for 5min, named it datasets_5
datasets_15 = cf.load_and_preprocess_datasets_min_ntop(months, data_path, '15min', filter)

# Create training and test sets
dt_train = pd.concat([datasets_5['dtf'+month[1:]] for month in months_train])
dt_test = pd.concat([datasets_5['dtf'+month[1:]] for month in months_test])

# # Seperate dependent and independent variables
X_train = dt_train.iloc[:, :-1].values
y_train = dt_train.iloc[:, -1].values
X_test = dt_test.iloc[:, :-1].values
y_test = dt_test.iloc[:, -1].values

# Econde gender column (Binary)
le = LabelEncoder()

# Binary Encode gender
X_train[:, 0] = le.fit_transform(X_train[:, 0])
X_test[:, 0] = le.fit_transform(X_test[:, 0])


# # Encode age_generartion, first_outoce, last_outcome columns
ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [1, 2, 5])], remainder='passthrough')
X_train = np.array(ct.fit_transform(X_train))
X_test = np.array(ct.transform(X_test))

y_train = le.fit_transform(y_train)
y_test = le.fit_transform(y_test)

# # Feature Scaling
sc = StandardScaler()

# Scale all columns except the encoded ones
X_train[:, 14:] = sc.fit_transform(X_train[:, 14:])
X_test[:, 14:] = sc.transform(X_test[:, 14:])

classifier = KNeighborsClassifier(n_neighbors = 5, metric = 'minkowski', p = 2)
classifier.fit(X_train, y_train)

y_pred = classifier.predict(X_test)

cm = confusion_matrix(y_test, y_pred)
print(cm)
print('Accuracy: ', accuracy_score(y_test, y_pred))
print('Precision: ', precision_score(y_test, y_pred, average='macro'))
print('Recall: ', recall_score(y_test, y_pred, average='macro'))
print('F1 Score: ', f1_score(y_test, y_pred, average='macro'))

[[10256   163]
 [ 1120    56]]
Accuracy:  0.8893488572660629
Precision:  0.5786274396470339
Recall:  0.5159872759930347
F1 Score:  0.5107100129012524


In [4]:
# Table creation 
# Define time intervals
time_intervals = [1, 5, 10, 15]

# Initialize lists to store results
results = []
for time_interval in time_intervals:
    # Load dataset for the specific time interval
    file_name = f'{time_interval}min'
    # Load the datasets
    datasets = cf.load_and_preprocess_datasets_min_ntop(months, data_path, file_name, filter)

    # Create training and test sets
    dt_train = pd.concat([datasets['dtf'+month[1:]] for month in months_train])
    dt_test = pd.concat([datasets['dtf'+month[1:]] for month in months_test])

    # # Seperate dependent and independent variables
    X_train = dt_train.iloc[:, :-1].values
    y_train = dt_train.iloc[:, -1].values
    X_test = dt_test.iloc[:, :-1].values
    y_test = dt_test.iloc[:, -1].values

    # Econde gender column (Binary)
    le = LabelEncoder()

    # Binary Encode gender
    X_train[:, 0] = le.fit_transform(X_train[:, 0])
    X_test[:, 0] = le.fit_transform(X_test[:, 0])

    # # # Encode age_generartion, first_outoce, last_outcome, time of day columns
    ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [1, 2, 3, 4, 7])], remainder='passthrough')
    X_train = np.array(ct.fit_transform(X_train))
    X_test = np.array(ct.transform(X_test))

    y_train = le.fit_transform(y_train)
    y_test = le.fit_transform(y_test)

    sc = StandardScaler()

    # Scale all columns except the encoded ones
    X_train[:, 25:] = sc.fit_transform(X_train[:, 25:])
    X_test[:, 25:] = sc.transform(X_test[:, 25:])

    classifier = KNeighborsClassifier(n_neighbors = 5, metric = 'minkowski', p = 2)
    classifier.fit(X_train, y_train)

    y_pred = classifier.predict(X_test)

    cm = confusion_matrix(y_test, y_pred)
    # Calculate evaluation metrics
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='macro', zero_division=1)
    recall = recall_score(y_test, y_pred, average='macro')
    f1 = f1_score(y_test, y_pred, average='macro')

    # Append results for this time interval
    results.append([f'{time_interval} min', round(accuracy, 3), round(precision, 3), round(recall, 3), round(f1, 3)])

# Create a DataFrame for the results
columns = ['Time', 'Accuracy', 'Precision', 'Recall', 'F1 Score']
results_df = pd.DataFrame(results, columns=columns)

# Print the results as a table
# print(results_df)

# Print the results as a LaTeX table
latex_table = results_df.to_latex(index=False, escape=False)
print(latex_table)

\begin{tabular}{lrrrr}
\toprule
  Time &  Accuracy &  Precision &  Recall &  F1 Score \\
\midrule
 1 min &     0.892 &      0.564 &   0.509 &     0.497 \\
 5 min &     0.891 &      0.588 &   0.515 &     0.509 \\
10 min &     0.892 &      0.623 &   0.528 &     0.531 \\
15 min &     0.892 &      0.640 &   0.538 &     0.546 \\
\bottomrule
\end{tabular}



  latex_table = results_df.to_latex(index=False, escape=False)
