# Logistic Regression

## Data Preprocessing

In [1]:
# Importing the libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import os
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, cross_val_predict
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
from sklearn.linear_model import LogisticRegression
from imblearn.over_sampling import SMOTE

# Set working directory
month_file = '2_June'
# Set working directory
os.chdir("/Users/mau/Library/CloudStorage/Dropbox/Mac/Documents/Dissertation/Chapter 2/Entire_Data/By month/"+month_file+"/Ending Balances/Per_Player")

## Load Dataframes

### 1 MIN

In [2]:
# Filter Columns
filter = ['session_time', 'gender', 'age_gen', 'day', 'timeofday', 'first_outcome',
        'first_wager','first_p/b', 'last_outcome', 'last_wager', 'last_p/b',
        'beginning_amt', 'ending_amt', 'ending_balance', 'ave_slotdenom', 
        'std_slotdenom', 'min_slotdenom', 'max_slotdenom', 'ave_theo_payback',
        'min_theo_payback', 'max_theo_payback', 'ave_wageramt', 'std_wageramt',
        'min_wager', 'max_wager', 'ave_p/b', 'std_p/b', 'max_p/b', 'max_profit', 'depletion_slope', 
        '#inc_slotdenom', '#dec_slotdenom', '#inc_maxbet', '#dec_maxbet', '#W', '#L', '#NH', '#D',
        'w/min', 'l/min', '#2ws', '2ws_profit', '2ws_wgramt','2ws/min', 
        '#3ws', '3ws_profit', '3ws_wgramt', '3ws/min', '#4ws', '4ws_profit', '4ws_wgramt', '4ws/min', 
        'w/g', 'l/g', 'nh/g', 'd/g', 'ave_time_per_gamble', 
        'min_time_per_gamble', 'max_time_per_gamble', 'total_gambles',
        'machines_changes', 'unique_machines', 'ave_time_per_machine', 'classification']

# Columns NOT INCLUDED
# 'playerkey', 'rank', 'age_range', '#W', '#L', '#NH', '#D','total_duration', 'total_gambles'

# Load dataset
dataset = pd.read_parquet('df_1min_top_vs_ntop_players.parquet', columns=filter)

# Keep only session_time 1
dataset = dataset[dataset['session_time'] == 1]
# Drop age_range and playerkey
dataset = dataset.drop(['session_time'], axis=1)

# Convert ave_time_per_machine to seconds
dataset['ave_time_per_machine'] = dataset['ave_time_per_machine'].dt.total_seconds()

# # Seperate dependent and independent variables
X = dataset.iloc[:, :-1].values
y = dataset.iloc[:, -1].values

# Econde gender column (Binary)
le = LabelEncoder()

# Binary Encode gender
X[:, 0] = le.fit_transform(X[:, 0])

# # # Encode age_generartion, first_outoce, last_outcome, time of day columns
ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [1, 2, 3, 4, 7])], remainder='passthrough')
X = np.array(ct.fit_transform(X))

y = le.fit_transform(y)

## Handling Class Imbalance 
# Apply SMOTE - SMOTE generates synthetic samples for the minority class to balance the dataset:
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled , test_size = 0.2, random_state = 1)

sc = StandardScaler()

# Scale all columns except the encoded ones
X_train[:, 19:] = sc.fit_transform(X_train[:, 19:])
X_test[:, 19:] = sc.transform(X_test[:, 19:])

classifier = LogisticRegression(random_state = 0, max_iter=1000)
classifier.fit(X_train, y_train)

y_pred = classifier.predict(X_test)

cm = confusion_matrix(y_test, y_pred)
print(cm)
print('Accuracy: ', accuracy_score(y_test, y_pred))
print('Precision: ', precision_score(y_test, y_pred, average='macro', zero_division=1))
print('Recall: ', recall_score(y_test, y_pred, average='macro'))
print('F1 Score: ', f1_score(y_test, y_pred, average='macro'))

[[32 13]
 [ 7 41]]
Accuracy:  0.7849462365591398
Precision:  0.7898860398860399
Recall:  0.7826388888888889
F1 Score:  0.7829131652661063


In [3]:
# Define hyperparameters for tuning
param_grid = {
    'C': [0.1, 1, 10],  # Regularization parameter
    'penalty': ['l1', 'l2'],  # Penalty (L1 or L2)
    'solver': ['liblinear', 'saga'],  # Solver algorithm
    'max_iter': [300, 500, 1000]  # Maximum number of iterations
}

# Initialize logistic regression model
classifier = LogisticRegression(random_state=0)

# Create GridSearchCV instance
grid_search = GridSearchCV(classifier, param_grid, scoring='accuracy', cv=5)
grid_search.fit(X_train, y_train)

# Get the best model
best_classifier = grid_search.best_estimator_

# Predict using the best model
y_pred = best_classifier.predict(X_test)

# Evaluate the best model
cm = confusion_matrix(y_test, y_pred)
print('Confusion Matrix:')
print(cm)
print('Accuracy:', accuracy_score(y_test, y_pred))
print('Precision:', precision_score(y_test, y_pred, average='macro', zero_division=1))
print('Recall:', recall_score(y_test, y_pred, average='macro'))
print('F1 Score:', f1_score(y_test, y_pred, average='macro'))

# Print the best hyperparameters
print('Best Hyperparameters:', grid_search.best_params_)



Confusion Matrix:
[[35 10]
 [ 2 46]]
Accuracy: 0.8709677419354839
Precision: 0.8836872586872586
Recall: 0.8680555555555556
F1 Score: 0.8691369606003753
Best Hyperparameters: {'C': 10, 'max_iter': 300, 'penalty': 'l1', 'solver': 'liblinear'}


In [4]:
## Summary Table of Results
# Table creation 
# Define time intervals
time_intervals = [1, 5, 10, 15]

# Initialize lists to store results
results = []
for time_interval in time_intervals:
    # Load dataset for the specific time interval
    file_name = f'df_{time_interval}min_top_vs_ntop_players.parquet'
    dataset = pd.read_parquet(file_name, columns=filter)

    # Keep only session_time 1
    dataset = dataset[dataset['session_time'] == 1]
    # Drop age_range and playerkey
    dataset = dataset.drop(['session_time'], axis=1)

    # Convert ave_time_per_machine to seconds
    dataset['ave_time_per_machine'] = dataset['ave_time_per_machine'].dt.total_seconds()

    # # Seperate dependent and independent variables
    X = dataset.iloc[:, :-1].values
    y = dataset.iloc[:, -1].values

    # Econde gender column (Binary)
    le = LabelEncoder()

    # Binary Encode gender
    X[:, 0] = le.fit_transform(X[:, 0])

    # # # Encode age_generartion, first_outoce, last_outcome, time of day columns
    ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [1, 2, 3, 4, 7])], remainder='passthrough')
    X = np.array(ct.fit_transform(X))
    y = le.fit_transform(y)

    ## Handling Class Imbalance 
    # Apply SMOTE - SMOTE generates synthetic samples for the minority class to balance the dataset:
    smote = SMOTE(random_state=42)
    X_resampled, y_resampled = smote.fit_resample(X, y)

    X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled , test_size = 0.2, random_state = 1)

    sc = StandardScaler()

    # Scale all columns except the encoded ones
    X_train[:, 19:] = sc.fit_transform(X_train[:, 19:])
    X_test[:, 19:] = sc.transform(X_test[:, 19:])

    # Best hyperparameters from tuning
    best_hyperparameters = grid_search.best_params_

    classifier = LogisticRegression(random_state = 0, **best_hyperparameters)
    classifier.fit(X_train, y_train)

    y_pred = classifier.predict(X_test)

    # Calculate evaluation metrics
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='macro', zero_division=1)
    recall = recall_score(y_test, y_pred, average='macro')
    f1 = f1_score(y_test, y_pred, average='macro')

    # Append results for this time interval
    results.append([f'{time_interval} min', round(accuracy, 3), round(precision, 3), round(recall, 3), round(f1, 3)])

# Create a DataFrame for the results
columns = ['Time', 'Accuracy', 'Precision', 'Recall', 'F1 Score']
results_df = pd.DataFrame(results, columns=columns)

# Print the results as a table
# print(results_df)

# Print the results as a LaTeX table
latex_table = results_df.to_latex(index=False, escape=False)
print(latex_table)

\begin{tabular}{lrrrr}
\toprule
  Time &  Accuracy &  Precision &  Recall &  F1 Score \\
\midrule
 1 min &     0.871 &      0.884 &   0.868 &     0.869 \\
 5 min &     0.914 &      0.922 &   0.912 &     0.913 \\
10 min &     0.903 &      0.905 &   0.902 &     0.903 \\
15 min &     0.925 &      0.931 &   0.923 &     0.924 \\
\bottomrule
\end{tabular}



  latex_table = results_df.to_latex(index=False, escape=False)


## Cross Validation

In [5]:
## Summary Table of Results
# Table creation 
# Define time intervals
time_intervals = [1, 5, 10, 15]

# Initialize lists to store results
cvscores = []
results = []
for time_interval in time_intervals:
    # Load dataset for the specific time interval
    file_name = f'df_{time_interval}min_top_vs_ntop_players.parquet'
    dataset = pd.read_parquet(file_name, columns=filter)

    # Keep only session_time 1
    dataset = dataset[dataset['session_time'] == 1]
    # Drop age_range and playerkey
    dataset = dataset.drop(['session_time'], axis=1)

    # Convert ave_time_per_machine to seconds
    dataset['ave_time_per_machine'] = dataset['ave_time_per_machine'].dt.total_seconds()

    # # Seperate dependent and independent variables
    X = dataset.iloc[:, :-1].values
    y = dataset.iloc[:, -1].values

    # Econde gender column (Binary)
    le = LabelEncoder()

    # Binary Encode gender
    X[:, 0] = le.fit_transform(X[:, 0])

    # # # Encode age_generartion, first_outoce, last_outcome, time of day columns
    ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [1, 2, 3, 4, 7])], remainder='passthrough')
    X = np.array(ct.fit_transform(X))
    y = le.fit_transform(y)

    ## Handling Class Imbalance 
    # Apply SMOTE - SMOTE generates synthetic samples for the minority class to balance the dataset:
    smote = SMOTE(random_state=42)
    X_resampled, y_resampled = smote.fit_resample(X, y)

    sc = StandardScaler()

    # Scale all columns except the encoded ones
    X_resampled[:, 19:] = sc.fit_transform(X_resampled[:, 19:])

    # Best hyperparameters from tuning
    best_hyperparameters = grid_search.best_params_

    classifier = LogisticRegression(random_state = 0, **best_hyperparameters)
    
    # Perform cross-validation
    y_pred_cv = cross_val_predict(classifier, X_resampled, y_resampled, cv=10)

    # Perform 10-fold cross-validation
    cv_scores = cross_val_score(classifier, X_resampled, y_resampled, cv=10, scoring='accuracy')
    mean_accuracy = cv_scores.mean()
    std_accuracy = cv_scores.std()
    cv_scores = cv_scores.tolist()
    cv_scores = [ '%.3f' % elem for elem in cv_scores]

    # Calculate precision, recall, and F1-score
    precision = precision_score(y_resampled, y_pred_cv, average='macro')
    recall = recall_score(y_resampled, y_pred_cv, average='macro')
    f1 = f1_score(y_resampled, y_pred_cv, average='macro')

    # Append results for this time interval
    results.append([f'{time_interval} min', round(mean_accuracy, 3), round(std_accuracy, 3), round(precision, 3), round(recall, 3), round(f1, 3)])
    cvscores.append([f'{time_interval} min', *cv_scores])

# Create a DataFrame for the results
columns = ['Time', 'Mean_Accuracy', 'Std_Accuracy', 'Precision', 'Recall', 'F1 Score']
results_df = pd.DataFrame(results, columns=columns)

# Print the results as a table
cv_columns = ['Time', 'CV 1', 'CV 2', 'CV 3', 'CV 4', 'CV 5', 'CV 6', 'CV 7', 'CV 8', 'CV 9', 'CV 10']
cv_results_df = pd.DataFrame(cvscores, columns=cv_columns)

# Print the results as a LaTeX table
latex_table = results_df.to_latex(index=False, escape=False)
latex_table2 = cv_results_df.to_latex(index=False, escape=False)
print(latex_table2)
print(latex_table)

\begin{tabular}{lllllllllll}
\toprule
  Time &  CV 1 &  CV 2 &  CV 3 &  CV 4 &  CV 5 &  CV 6 &  CV 7 &  CV 8 &  CV 9 & CV 10 \\
\midrule
 1 min & 0.617 & 0.830 & 0.809 & 0.851 & 0.848 & 0.783 & 0.717 & 0.913 & 0.935 & 0.935 \\
 5 min & 0.766 & 0.851 & 0.851 & 0.894 & 0.957 & 0.913 & 0.957 & 0.978 & 0.935 & 1.000 \\
10 min & 0.660 & 0.830 & 0.894 & 0.872 & 0.891 & 0.891 & 0.870 & 0.935 & 0.957 & 0.935 \\
15 min & 0.638 & 0.830 & 0.894 & 0.851 & 0.891 & 0.957 & 0.870 & 0.913 & 0.957 & 0.978 \\
\bottomrule
\end{tabular}

\begin{tabular}{lrrrrr}
\toprule
  Time &  Mean_Accuracy &  Std_Accuracy &  Precision &  Recall &  F1 Score \\
\midrule
 1 min &          0.824 &         0.095 &      0.856 &   0.823 &     0.819 \\
 5 min &          0.910 &         0.068 &      0.921 &   0.909 &     0.909 \\
10 min &          0.873 &         0.079 &      0.893 &   0.873 &     0.871 \\
15 min &          0.878 &         0.092 &      0.900 &   0.877 &     0.875 \\
\bottomrule
\end{tabular}



  latex_table = results_df.to_latex(index=False, escape=False)
  latex_table2 = cv_results_df.to_latex(index=False, escape=False)
