# Random Forest Classifier

## Data Preprocessing

In [1]:
# Importing the libraries
import numpy as np
import pandas as pd
import os
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import SMOTE
import counting_fns as cf
from sklearn.model_selection import StratifiedKFold


# Month identifiers passed to the dataset loader in the interval cells below.
months = ['2_June', '3_July', '4_August', '5_September', '6_October']

# Root directory of the per-month data. Defaults to the original author's local
# path; set the SLOT_DATA_PATH environment variable to run the notebook on
# another machine without editing this cell.
data_path = os.environ.get(
    "SLOT_DATA_PATH",
    "/Users/mau/Library/CloudStorage/Dropbox/Mac/Documents/Dissertation/Chapter 2/Entire_Data/By month",
)

# Columns handed to the loader; 'classification' is listed last and is used as
# the target by the interval cells (they take the last column as y).
# NOTE: this name shadows the built-in filter(); it is kept because the
# interval cells pass it to the loader under this name.
filter = [
    'session_time', 'gender', 'age_gen', 'day', 'timeofday', 'first_outcome',
    'first_wager', 'first_p/b', 'last_outcome', 'last_wager', 'last_p/b',
    'beginning_amt', 'ending_amt', 'ending_balance', 'ave_slotdenom',
    'std_slotdenom', 'min_slotdenom', 'max_slotdenom', 'ave_theo_payback',
    'min_theo_payback', 'max_theo_payback', 'ave_wageramt', 'std_wageramt',
    'min_wager', 'max_wager', 'ave_p/b', 'std_p/b', 'max_p/b', 'max_profit',
    'depletion_slope',
    '#inc_slotdenom', '#dec_slotdenom', '#inc_maxbet', '#dec_maxbet',
    '#W', '#L', '#NH', '#D',
    'w/min', 'l/min',
    '#2ws', '2ws_profit', '2ws_wgramt', '2ws/min',
    '#3ws', '3ws_profit', '3ws_wgramt', '3ws/min',
    '#4ws', '4ws_profit', '4ws_wgramt', '4ws/min',
    'w/g', 'l/g', 'nh/g', 'd/g',
    'ave_time_per_gamble', 'min_time_per_gamble', 'max_time_per_gamble',
    'total_gambles', 'machines_changes', 'unique_machines',
    'ave_time_per_machine', 'classification',
]

In [2]:
# Random-forest hyper-parameters applied to every time interval below
best_params = dict(
    max_depth=None,
    min_samples_leaf=1,
    min_samples_split=2,
    n_estimators=200,
)

## 1 Minute

In [3]:
# Load the preprocessed 1-minute datasets for every month and stack them
data_1min = cf.load_and_preprocess_datasets_min_ntop(months, data_path, '1min', filter)
dataset = pd.concat(data_1min)

print(dataset.columns)

# Separate independent variables (every column but the last) from the
# dependent variable (the last column)
X = dataset.iloc[:, :-1].values
y = dataset.iloc[:, -1].values

# Label-encode column 0 (gender, per the printed column index)
le = LabelEncoder()
X[:, 0] = le.fit_transform(X[:, 0])

# One-hot encode columns 1, 2, 3, 4 and 7 (age_gen, day, timeofday,
# first_outcome and last_outcome, per the printed column index). The encoded
# columns are placed first in the output, followed by the untouched remainder.
# sparse_threshold=0 forces a dense result: if the transformer returned a
# sparse matrix, np.array() would wrap it in a 0-d object array and the row
# indexing in the cross-validation loop would fail.
ct = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(), [1, 2, 3, 4, 7])],
    remainder='passthrough',
    sparse_threshold=0,
)
X = np.array(ct.fit_transform(X))
y = le.fit_transform(y)

# Stratified 10-fold cross-validation with a fixed shuffle seed
skf = StratifiedKFold(n_splits=10, random_state=1, shuffle=True)

# Per-fold evaluation metrics
accuracy_scores = []
precision_scores = []
recall_scores = []
f1_scores = []

# Summary rows shared with the later interval cells; reset here because this
# is the first interval evaluated.
results = []

for train_index, test_index in skf.split(X, y):
    # Integer-array indexing returns copies, so the in-place scaling below
    # never modifies X itself.
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    # Fit the scaler on the training fold only and reuse it on the test fold.
    # NOTE(review): 19 is assumed to be the offset at which the one-hot
    # columns end; it depends on the number of categories found in the data
    # -- confirm against ct's fitted categories.
    sc = StandardScaler()
    X_train[:, 19:] = sc.fit_transform(X_train[:, 19:])
    X_test[:, 19:] = sc.transform(X_test[:, 19:])

    # Handle class imbalance: oversample the training fold only with SMOTE
    smote = SMOTE(random_state=42)
    X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

    classifier = RandomForestClassifier(random_state=0, **best_params)
    classifier.fit(X_train_resampled, y_train_resampled)

    y_pred = classifier.predict(X_test)

    # Macro-averaged metrics for this fold.
    # NOTE(review): only precision sets zero_division=1; recall and F1 use the
    # library default -- confirm the asymmetry is intended.
    accuracy_scores.append(accuracy_score(y_test, y_pred))
    precision_scores.append(precision_score(y_test, y_pred, average='macro', zero_division=1))
    recall_scores.append(recall_score(y_test, y_pred, average='macro'))
    f1_scores.append(f1_score(y_test, y_pred, average='macro'))

# Average the metrics over all folds
average_accuracy = np.mean(accuracy_scores)
std_accuracy = np.std(accuracy_scores)
average_precision = np.mean(precision_scores)
average_recall = np.mean(recall_scores)
average_f1 = np.mean(f1_scores)

# Record the summary row for this time interval
results.append([ '1 min', round(average_accuracy, 3), round(std_accuracy, 3), round(average_precision, 3), round(average_recall, 3), round(average_f1, 3)])


Index(['gender', 'age_gen', 'day', 'timeofday', 'first_outcome', 'first_wager',
       'first_p/b', 'last_outcome', 'last_wager', 'last_p/b', 'beginning_amt',
       'ending_amt', 'ending_balance', 'ave_slotdenom', 'std_slotdenom',
       'min_slotdenom', 'max_slotdenom', 'ave_theo_payback',
       'min_theo_payback', 'max_theo_payback', 'ave_wageramt', 'std_wageramt',
       'min_wager', 'max_wager', 'ave_p/b', 'std_p/b', 'max_p/b', 'max_profit',
       'depletion_slope', '#inc_slotdenom', '#dec_slotdenom', '#inc_maxbet',
       '#dec_maxbet', '#W', '#L', '#NH', '#D', 'w/min', 'l/min', '#2ws',
       '2ws_profit', '2ws_wgramt', '2ws/min', '#3ws', '3ws_profit',
       '3ws_wgramt', '3ws/min', '#4ws', '4ws_profit', '4ws_wgramt', '4ws/min',
       'w/g', 'l/g', 'nh/g', 'd/g', 'ave_time_per_gamble',
       'min_time_per_gamble', 'max_time_per_gamble', 'total_gambles',
       'machines_changes', 'unique_machines', 'ave_time_per_machine',
       'classification'],
      dtype='object')


## 5 minutes

In [4]:
# Load the preprocessed 5-minute datasets for every month and stack them
data_5min = cf.load_and_preprocess_datasets_min_ntop(months, data_path, '5min', filter)
dataset = pd.concat(data_5min)

print(dataset.columns)

# Separate independent variables (every column but the last) from the
# dependent variable (the last column)
X = dataset.iloc[:, :-1].values
y = dataset.iloc[:, -1].values

# Label-encode column 0 (gender, per the printed column index)
le = LabelEncoder()
X[:, 0] = le.fit_transform(X[:, 0])

# One-hot encode columns 1, 2, 3, 4 and 7 (age_gen, day, timeofday,
# first_outcome and last_outcome, per the printed column index). The encoded
# columns are placed first in the output, followed by the untouched remainder.
# sparse_threshold=0 forces a dense result: if the transformer returned a
# sparse matrix, np.array() would wrap it in a 0-d object array and the row
# indexing in the cross-validation loop would fail.
ct = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(), [1, 2, 3, 4, 7])],
    remainder='passthrough',
    sparse_threshold=0,
)
X = np.array(ct.fit_transform(X))
y = le.fit_transform(y)

# Stratified 10-fold cross-validation with a fixed shuffle seed
skf = StratifiedKFold(n_splits=10, random_state=1, shuffle=True)

# Per-fold evaluation metrics
accuracy_scores = []
precision_scores = []
recall_scores = []
f1_scores = []

for train_index, test_index in skf.split(X, y):
    # Integer-array indexing returns copies, so the in-place scaling below
    # never modifies X itself.
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    # Fit the scaler on the training fold only and reuse it on the test fold.
    # NOTE(review): 19 is assumed to be the offset at which the one-hot
    # columns end; it depends on the number of categories found in the data
    # -- confirm against ct's fitted categories.
    sc = StandardScaler()
    X_train[:, 19:] = sc.fit_transform(X_train[:, 19:])
    X_test[:, 19:] = sc.transform(X_test[:, 19:])

    # Handle class imbalance: oversample the training fold only with SMOTE
    smote = SMOTE(random_state=42)
    X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

    classifier = RandomForestClassifier(random_state=0, **best_params)
    classifier.fit(X_train_resampled, y_train_resampled)

    y_pred = classifier.predict(X_test)

    # Macro-averaged metrics for this fold.
    # NOTE(review): only precision sets zero_division=1; recall and F1 use the
    # library default -- confirm the asymmetry is intended.
    accuracy_scores.append(accuracy_score(y_test, y_pred))
    precision_scores.append(precision_score(y_test, y_pred, average='macro', zero_division=1))
    recall_scores.append(recall_score(y_test, y_pred, average='macro'))
    f1_scores.append(f1_score(y_test, y_pred, average='macro'))

# Average the metrics over all folds
average_accuracy = np.mean(accuracy_scores)
std_accuracy = np.std(accuracy_scores)
average_precision = np.mean(precision_scores)
average_recall = np.mean(recall_scores)
average_f1 = np.mean(f1_scores)

# Record the summary row for this time interval. `results` is created by the
# 1-minute cell, which must have been run first. Any row left behind by an
# earlier run of this cell is dropped so re-running does not duplicate it.
results = [row for row in results if row[0] != '5 min']
results.append([ '5 min', round(average_accuracy, 3), round(std_accuracy, 3), round(average_precision, 3), round(average_recall, 3), round(average_f1, 3)])


Index(['gender', 'age_gen', 'day', 'timeofday', 'first_outcome', 'first_wager',
       'first_p/b', 'last_outcome', 'last_wager', 'last_p/b', 'beginning_amt',
       'ending_amt', 'ending_balance', 'ave_slotdenom', 'std_slotdenom',
       'min_slotdenom', 'max_slotdenom', 'ave_theo_payback',
       'min_theo_payback', 'max_theo_payback', 'ave_wageramt', 'std_wageramt',
       'min_wager', 'max_wager', 'ave_p/b', 'std_p/b', 'max_p/b', 'max_profit',
       'depletion_slope', '#inc_slotdenom', '#dec_slotdenom', '#inc_maxbet',
       '#dec_maxbet', '#W', '#L', '#NH', '#D', 'w/min', 'l/min', '#2ws',
       '2ws_profit', '2ws_wgramt', '2ws/min', '#3ws', '3ws_profit',
       '3ws_wgramt', '3ws/min', '#4ws', '4ws_profit', '4ws_wgramt', '4ws/min',
       'w/g', 'l/g', 'nh/g', 'd/g', 'ave_time_per_gamble',
       'min_time_per_gamble', 'max_time_per_gamble', 'total_gambles',
       'machines_changes', 'unique_machines', 'ave_time_per_machine',
       'classification'],
      dtype='object')


KeyboardInterrupt: 

## 10 minutes

In [None]:
# Load the preprocessed 10-minute datasets for every month and stack them
data_10min = cf.load_and_preprocess_datasets_min_ntop(months, data_path, '10min', filter)
dataset = pd.concat(data_10min)

print(dataset.columns)

# Separate independent variables (every column but the last) from the
# dependent variable (the last column)
X = dataset.iloc[:, :-1].values
y = dataset.iloc[:, -1].values

# Label-encode column 0 (gender, per the printed column index)
le = LabelEncoder()
X[:, 0] = le.fit_transform(X[:, 0])

# One-hot encode columns 1, 2, 3, 4 and 7 (age_gen, day, timeofday,
# first_outcome and last_outcome, per the printed column index). The encoded
# columns are placed first in the output, followed by the untouched remainder.
# sparse_threshold=0 forces a dense result: if the transformer returned a
# sparse matrix, np.array() would wrap it in a 0-d object array and the row
# indexing in the cross-validation loop would fail.
ct = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(), [1, 2, 3, 4, 7])],
    remainder='passthrough',
    sparse_threshold=0,
)
X = np.array(ct.fit_transform(X))
y = le.fit_transform(y)

# Stratified 10-fold cross-validation with a fixed shuffle seed
skf = StratifiedKFold(n_splits=10, random_state=1, shuffle=True)

# Per-fold evaluation metrics
accuracy_scores = []
precision_scores = []
recall_scores = []
f1_scores = []

for train_index, test_index in skf.split(X, y):
    # Integer-array indexing returns copies, so the in-place scaling below
    # never modifies X itself.
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    # Fit the scaler on the training fold only and reuse it on the test fold.
    # NOTE(review): 19 is assumed to be the offset at which the one-hot
    # columns end; it depends on the number of categories found in the data
    # -- confirm against ct's fitted categories.
    sc = StandardScaler()
    X_train[:, 19:] = sc.fit_transform(X_train[:, 19:])
    X_test[:, 19:] = sc.transform(X_test[:, 19:])

    # Handle class imbalance: oversample the training fold only with SMOTE
    smote = SMOTE(random_state=42)
    X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

    classifier = RandomForestClassifier(random_state=0, **best_params)
    classifier.fit(X_train_resampled, y_train_resampled)

    y_pred = classifier.predict(X_test)

    # Macro-averaged metrics for this fold.
    # NOTE(review): only precision sets zero_division=1; recall and F1 use the
    # library default -- confirm the asymmetry is intended.
    accuracy_scores.append(accuracy_score(y_test, y_pred))
    precision_scores.append(precision_score(y_test, y_pred, average='macro', zero_division=1))
    recall_scores.append(recall_score(y_test, y_pred, average='macro'))
    f1_scores.append(f1_score(y_test, y_pred, average='macro'))

# Average the metrics over all folds
average_accuracy = np.mean(accuracy_scores)
std_accuracy = np.std(accuracy_scores)
average_precision = np.mean(precision_scores)
average_recall = np.mean(recall_scores)
average_f1 = np.mean(f1_scores)

# Record the summary row for this time interval. `results` is created by the
# 1-minute cell, which must have been run first. Any row left behind by an
# earlier run of this cell is dropped so re-running does not duplicate it.
results = [row for row in results if row[0] != '10 min']
results.append([ '10 min', round(average_accuracy, 3), round(std_accuracy, 3), round(average_precision, 3), round(average_recall, 3), round(average_f1, 3)])

Index(['gender', 'age_gen', 'day', 'timeofday', 'first_outcome', 'first_wager',
       'first_p/b', 'last_outcome', 'last_wager', 'last_p/b', 'beginning_amt',
       'ending_amt', 'ending_balance', 'ave_slotdenom', 'std_slotdenom',
       'min_slotdenom', 'max_slotdenom', 'ave_theo_payback',
       'min_theo_payback', 'max_theo_payback', 'ave_wageramt', 'std_wageramt',
       'min_wager', 'max_wager', 'ave_p/b', 'std_p/b', 'max_p/b', 'max_profit',
       'depletion_slope', '#inc_slotdenom', '#dec_slotdenom', '#inc_maxbet',
       '#dec_maxbet', '#W', '#L', '#NH', '#D', 'w/min', 'l/min', '#2ws',
       '2ws_profit', '2ws_wgramt', '2ws/min', '#3ws', '3ws_profit',
       '3ws_wgramt', '3ws/min', '#4ws', '4ws_profit', '4ws_wgramt', '4ws/min',
       'w/g', 'l/g', 'nh/g', 'd/g', 'ave_time_per_gamble',
       'min_time_per_gamble', 'max_time_per_gamble', 'total_gambles',
       'machines_changes', 'unique_machines', 'ave_time_per_machine',
       'classification'],
      dtype='object')


## 15 minutes

In [None]:
# Load the preprocessed 15-minute datasets for every month and stack them
data_15min = cf.load_and_preprocess_datasets_min_ntop(months, data_path, '15min', filter)
dataset = pd.concat(data_15min)

print(dataset.columns)

# Separate independent variables (every column but the last) from the
# dependent variable (the last column)
X = dataset.iloc[:, :-1].values
y = dataset.iloc[:, -1].values

# Label-encode column 0 (gender, per the printed column index)
le = LabelEncoder()
X[:, 0] = le.fit_transform(X[:, 0])

# One-hot encode columns 1, 2, 3, 4 and 7 (age_gen, day, timeofday,
# first_outcome and last_outcome, per the printed column index). The encoded
# columns are placed first in the output, followed by the untouched remainder.
# sparse_threshold=0 forces a dense result: if the transformer returned a
# sparse matrix, np.array() would wrap it in a 0-d object array and the row
# indexing in the cross-validation loop would fail.
ct = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(), [1, 2, 3, 4, 7])],
    remainder='passthrough',
    sparse_threshold=0,
)
X = np.array(ct.fit_transform(X))
y = le.fit_transform(y)

# Stratified 10-fold cross-validation with a fixed shuffle seed
skf = StratifiedKFold(n_splits=10, random_state=1, shuffle=True)

# Per-fold evaluation metrics
accuracy_scores = []
precision_scores = []
recall_scores = []
f1_scores = []

for train_index, test_index in skf.split(X, y):
    # Integer-array indexing returns copies, so the in-place scaling below
    # never modifies X itself.
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    # Fit the scaler on the training fold only and reuse it on the test fold.
    # NOTE(review): 19 is assumed to be the offset at which the one-hot
    # columns end; it depends on the number of categories found in the data
    # -- confirm against ct's fitted categories.
    sc = StandardScaler()
    X_train[:, 19:] = sc.fit_transform(X_train[:, 19:])
    X_test[:, 19:] = sc.transform(X_test[:, 19:])

    # Handle class imbalance: oversample the training fold only with SMOTE
    smote = SMOTE(random_state=42)
    X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

    classifier = RandomForestClassifier(random_state=0, **best_params)
    classifier.fit(X_train_resampled, y_train_resampled)

    y_pred = classifier.predict(X_test)

    # Macro-averaged metrics for this fold.
    # NOTE(review): only precision sets zero_division=1; recall and F1 use the
    # library default -- confirm the asymmetry is intended.
    accuracy_scores.append(accuracy_score(y_test, y_pred))
    precision_scores.append(precision_score(y_test, y_pred, average='macro', zero_division=1))
    recall_scores.append(recall_score(y_test, y_pred, average='macro'))
    f1_scores.append(f1_score(y_test, y_pred, average='macro'))

# Average the metrics over all folds
average_accuracy = np.mean(accuracy_scores)
std_accuracy = np.std(accuracy_scores)
average_precision = np.mean(precision_scores)
average_recall = np.mean(recall_scores)
average_f1 = np.mean(f1_scores)

# Record the summary row for this time interval. `results` is created by the
# 1-minute cell, which must have been run first. Any row left behind by an
# earlier run of this cell is dropped so re-running does not duplicate it.
results = [row for row in results if row[0] != '15 min']
results.append([ '15 min', round(average_accuracy, 3), round(std_accuracy, 3), round(average_precision, 3), round(average_recall, 3), round(average_f1, 3)])

Index(['gender', 'age_gen', 'day', 'timeofday', 'first_outcome', 'first_wager',
       'first_p/b', 'last_outcome', 'last_wager', 'last_p/b', 'beginning_amt',
       'ending_amt', 'ending_balance', 'ave_slotdenom', 'std_slotdenom',
       'min_slotdenom', 'max_slotdenom', 'ave_theo_payback',
       'min_theo_payback', 'max_theo_payback', 'ave_wageramt', 'std_wageramt',
       'min_wager', 'max_wager', 'ave_p/b', 'std_p/b', 'max_p/b', 'max_profit',
       'depletion_slope', '#inc_slotdenom', '#dec_slotdenom', '#inc_maxbet',
       '#dec_maxbet', '#W', '#L', '#NH', '#D', 'w/min', 'l/min', '#2ws',
       '2ws_profit', '2ws_wgramt', '2ws/min', '#3ws', '3ws_profit',
       '3ws_wgramt', '3ws/min', '#4ws', '4ws_profit', '4ws_wgramt', '4ws/min',
       'w/g', 'l/g', 'nh/g', 'd/g', 'ave_time_per_gamble',
       'min_time_per_gamble', 'max_time_per_gamble', 'total_gambles',
       'machines_changes', 'unique_machines', 'ave_time_per_machine',
       'classification'],
      dtype='object')


## Overall Results

In [None]:
# Build the summary table from the rows appended by the interval cells above:
# one row per time interval, in the order the cells were run.
# The 'Mean Precision' label previously carried a stray leading space, which
# leaked into the DataFrame column name and the LaTeX header.
columns = ['Time', 'Mean Accuracy', 'Acc. Std', 'Mean Precision', 'Mean Recall', 'Mean F1 Score']
results_df = pd.DataFrame(results, columns=columns)

# Render the summary as a LaTeX tabular without the index column;
# escape=False leaves the cell and header text unescaped.
latex_table = results_df.to_latex(index=False, escape=False)
print(latex_table)
    

\begin{tabular}{lrrrrr}
\toprule
  Time &  Mean Accuracy &  Acc. Std &   Mean Precision &  Mean Recall &  Mean F1 Score \\
\midrule
 1 min &          0.620 &     0.006 &            0.548 &        0.626 &          0.499 \\
 5 min &          0.657 &     0.003 &            0.563 &        0.661 &          0.529 \\
10 min &          0.684 &     0.005 &            0.575 &        0.687 &          0.552 \\
15 min &          0.708 &     0.006 &            0.583 &        0.701 &          0.570 \\
\bottomrule
\end{tabular}



  latex_table = results_df.to_latex(index=False, escape=False)
