# Logistic Regression

## Data Preprocessing

In [2]:
# Importing the libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import os
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
from sklearn.linear_model import LogisticRegression
from imblearn.over_sampling import SMOTE
import random
import counting_fns as cf
from sklearn.utils import shuffle


# Month folders to load. The loading loop strips the leading character of each
# entry to build dataset keys such as 'dtf_June'.
months = ['2_June', '3_July', '4_August', '5_September', '6_October']

# Root directory holding the per-month data files.
# The original value was an absolute path on one machine; it is kept as the
# default, but can now be overridden without editing the notebook by setting
# the DISSERTATION_DATA_PATH environment variable before starting the kernel.
data_path = os.environ.get(
    "DISSERTATION_DATA_PATH",
    "/Users/mau/Library/CloudStorage/Dropbox/Mac/Documents/Dissertation/Chapter 2/Entire_Data/By month",
)

# Columns handed to the dataset loader. 'classification' is listed last; the
# modelling cell treats the last column of each loaded frame as the target.
# NOTE(review): this name shadows the builtin `filter`; it is kept because the
# modelling cell refers to it by this name.
filter = ['session_time', 'gender', 'age_gen', 'day', 'timeofday', 'first_outcome',
        'first_wager','first_p/b', 'last_outcome', 'last_wager', 'last_p/b',
        'beginning_amt', 'ending_amt', 'ending_balance', 'ave_slotdenom', 
        'std_slotdenom', 'min_slotdenom', 'max_slotdenom', 'ave_theo_payback',
        'min_theo_payback', 'max_theo_payback', 'ave_wageramt', 'std_wageramt',
        'min_wager', 'max_wager', 'ave_p/b', 'std_p/b', 'max_p/b', 'max_profit', 'depletion_slope', 
        '#inc_slotdenom', '#dec_slotdenom', '#inc_maxbet', '#dec_maxbet', '#W', '#L', '#NH', '#D',
        'w/min', 'l/min', '#2ws', '2ws_profit', '2ws_wgramt','2ws/min', 
        '#3ws', '3ws_profit', '3ws_wgramt', '3ws/min', '#4ws', '4ws_profit', '4ws_wgramt', '4ws/min', 
        'w/g', 'l/g', 'nh/g', 'd/g', 'ave_time_per_gamble', 
        'min_time_per_gamble', 'max_time_per_gamble', 'total_gambles',
        'machines_changes', 'unique_machines', 'ave_time_per_machine', 'classification']

In [4]:
# Logistic-regression settings kept per month (newest month first)
Best_params_oct = dict(C=10, max_iter=500, penalty='l2', solver='saga')
Best_params_sep = dict(C=10, max_iter=500, penalty='l1', solver='liblinear')
Best_params_aug = dict(C=1, max_iter=500, penalty='l2', solver='saga')
Best_params_jul = dict(C=10, max_iter=500, penalty='l2', solver='liblinear')
Best_params_june = dict(C=10, max_iter=300, penalty='l1', solver='liblinear')

# Single configuration applied to every model fitted below
best_params = dict(C=10, max_iter=1500, penalty='l2', solver='lbfgs')

## Load Dataframes

In [6]:
# Table creation
# For each time interval: load the data, encode and scale the features,
# oversample the training set with SMOTE, fit a logistic regression, score it
# on the test months, and keep the coefficients ranked by magnitude.
#
# NOTE(review): `months_train` and `months_test` are not defined in any cell
# visible in this notebook; they must already exist in the kernel. Define them
# next to `months` so that Restart & Run All works.

# Define time intervals
time_intervals = [1, 5, 10, 15]

# Initialize containers: one metrics row and one coefficient table per interval
results = []
dataframes_features = {}
for time_interval in time_intervals:
    # Load the datasets for this time interval
    file_name = f'{time_interval}min'
    datasets = cf.load_and_preprocess_datasets_min_ntop(months, data_path, file_name, filter)

    # Create training and test sets; keys are 'dtf' + month name without its leading character
    dt_train = pd.concat([datasets['dtf'+month[1:]] for month in months_train])
    dt_test = pd.concat([datasets['dtf'+month[1:]] for month in months_test])

    # Separate independent variables (all but the last column) from the target (last column)
    X_train = dt_train.iloc[:, :-1].values
    y_train = dt_train.iloc[:, -1].values
    X_test = dt_test.iloc[:, :-1].values
    y_test = dt_test.iloc[:, -1].values

    # Encode the gender column (binary); fit on train, reuse the mapping on test
    le = LabelEncoder()
    X_train[:, 0] = le.fit_transform(X_train[:, 0])
    X_test[:, 0] = le.transform(X_test[:, 0])

    # One-hot encode the age generation, day, time of day, first outcome and last outcome columns
    # NOTE(review): column meanings are taken from the original comments and
    # the printed feature names — confirm against the loader's column order.
    ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [1, 2, 3, 4, 7])], remainder='passthrough')
    X_train = ct.fit_transform(X_train)
    X_test = ct.transform(X_test)

    # Encode the target. The mapping is learned from the training labels only
    # and then applied to the test labels; re-fitting on the test labels could
    # assign different integer ids whenever the two label sets differ.
    y_train = le.fit_transform(y_train)
    y_test = le.transform(y_test)

    sc = StandardScaler()

    # Scale all columns except the encoded ones. The transformer emits the
    # one-hot columns first, followed by the passthrough columns in their
    # original order, so the label-encoded gender column sits right after the
    # one-hot block. Deriving the offset from the fitted encoder keeps it
    # correct if the number of categories changes between datasets.
    n_onehot = sum(len(categories) for categories in ct.named_transformers_['encoder'].categories_)
    first_scaled_col = n_onehot + 1
    X_train[:, first_scaled_col:] = sc.fit_transform(X_train[:, first_scaled_col:])
    X_test[:, first_scaled_col:] = sc.transform(X_test[:, first_scaled_col:])

    # Apply SMOTE to balance the training set only (the test set is left untouched)
    sm = SMOTE(random_state=42)
    X_train_resample, y_train_resample = sm.fit_resample(X_train, y_train)

    # Shuffle the resampled data
    X_train_resample, y_train_resample = shuffle(X_train_resample, y_train_resample, random_state=42)

    classifier = LogisticRegression(random_state = 0, **best_params)
    classifier.fit(X_train_resample, y_train_resample)

    y_pred = classifier.predict(X_test)

    cm = confusion_matrix(y_test, y_pred)
    # Calculate evaluation metrics (macro-averaged over classes)
    # NOTE(review): only precision sets zero_division=1; recall and F1 use the default.
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='macro', zero_division=1)
    recall = recall_score(y_test, y_pred, average='macro')
    f1 = f1_score(y_test, y_pred, average='macro')

    # Append results for this time interval
    results.append([f'{time_interval} min', round(accuracy, 3), round(precision, 3), round(recall, 3), round(f1, 3)])

    # Drop the target column so only feature columns remain
    dataset= dt_train.drop(['classification'], axis=1)

    # Get the feature names without 'remainder__' prefix
    feature_names = cf.get_feature_names_without_prefix(ct, input_features=dataset.columns)

    # Coefficients of the first row of coef_ (the only row for a binary target)
    coefficients = classifier.coef_[0]

    # Create a DataFrame to display coefficients and feature names
    coefficients_df = pd.DataFrame({'Feature': feature_names, 'Coefficient': coefficients})

    # Sort the coefficients by magnitude, largest first
    coefficients_df['Abs_Coefficient'] = np.abs(coefficients_df['Coefficient'])
    coefficients_df = coefficients_df.sort_values(by='Abs_Coefficient', ascending=False)

    dataframes_features[f'coefficients_df_{time_interval}min'] = coefficients_df


# Create a DataFrame for the results
columns = ['Time', 'Accuracy', 'Precision', 'Recall', 'F1 Score']
results_df = pd.DataFrame(results, columns=columns)

# Print the results as a LaTeX table
latex_table = results_df.to_latex(index=False, escape=False)
print(latex_table)

\begin{tabular}{lrrrr}
\toprule
  Time &  Accuracy &  Precision &  Recall &  F1 Score \\
\midrule
 1 min &     0.600 &      0.516 &   0.540 &     0.467 \\
 5 min &     0.569 &      0.539 &   0.602 &     0.471 \\
10 min &     0.608 &      0.554 &   0.639 &     0.502 \\
15 min &     0.662 &      0.575 &   0.685 &     0.544 \\
\bottomrule
\end{tabular}



  latex_table = results_df.to_latex(index=False, escape=False)


In [7]:
# Show the five largest-magnitude coefficients for every time interval
for time_interval in time_intervals:
    heading = f'Top 5 features for {time_interval} min'
    top_rows = dataframes_features[f'coefficients_df_{time_interval}min'].head(5)
    # One print call; the '\n' separator reproduces the line breaks of three separate prints
    print(heading, top_rows, '\n', sep='\n')

Top 5 features for 1 min
                    Feature  Coefficient  Abs_Coefficient
2    encoder__age_gen_Gen Z    -1.426960         1.426960
35            max_slotdenom    -1.173588         1.173588
34            min_slotdenom     1.025136         1.025136
37         min_theo_payback     0.973820         0.973820
4   encoder__age_gen_Silent     0.763462         0.763462


Top 5 features for 5 min
                          Feature  Coefficient  Abs_Coefficient
2          encoder__age_gen_Gen Z    -1.283728         1.283728
4         encoder__age_gen_Silent     0.777079         0.777079
3     encoder__age_gen_Millenials    -0.567228         0.567228
0   encoder__age_gen_Baby Boomers     0.557004         0.557004
37               min_theo_payback     0.448205         0.448205


Top 5 features for 10 min
                        Feature  Coefficient  Abs_Coefficient
2        encoder__age_gen_Gen Z    -1.162365         1.162365
4       encoder__age_gen_Silent     0.728987         0.728987
36

In [None]:
# Save dataframes to csv files
for 