# Logistic Regression

## Data Preprocessing

In [33]:
# Importing the libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import os

# Set working directory
month_file = '2_June'
# Set working directory
os.chdir("/Users/mau/Library/CloudStorage/Dropbox/Mac/Documents/Dissertation/Chapter 2/Entire_Data/By month/"+month_file+"/Ending Balances/Per_Player")

## Load Dataframes

In [48]:
# Filter Columns
filter = ['session_time', 'gender',  'sim_play', 'age_gen', 'first_outcome', 'last_outcome', 
        'beginning_amt', 'ending_amt', 'ending_balance', 'ave_slotdenom', 
        'min_slotdenom', 'max_slotdenom', 'ave_theo_payback', 'min_theo_payback', 
        'max_theo_payback', 'ave_wageramt', 'min_wager', 'max_wager', 
        'ave_p/b', 'max_p/b', 'first_p/b', 'last_p/b', 'w/min', 'l/min', 
        'nh/min', 'd/min', 'w/g', 'l/g', 'nh/g', 'd/g', '#2ws', '2ws_profit', '2ws_wgramt', 
        '#3ws', '3ws_profit', '3ws_wgramt', '#4ws', '4ws_profit', '4ws_wgramt', 
        'ave_time_per_gamble','machines_changes', 'unique_machines', 
        'ave_time_per_machine', 'percentile']

# Columns NOT INCLUDED
# 'playerkey', 'rank', 'age_range', '#W', '#L', '#NH', '#D','total_duration', 'total_gambles'

# Load dataset
dataset = pd.read_parquet('per_player_ending_balance_by_session.parquet', columns=filter)

# Keep only session_time 1
dataset = dataset[dataset['session_time'] == 1]
# Drop age_range and playerkey
dataset = dataset.drop(['session_time'], axis=1)

# Convert time deltas to numeric values
dataset['ave_time_per_gamble'] = dataset['ave_time_per_gamble'].dt.total_seconds()
dataset['ave_time_per_machine'] = dataset['ave_time_per_machine'].dt.total_seconds()

# # Seperate dependent and independent variables
X = dataset.iloc[:, :-1].values
y = dataset.iloc[:, -1].values

## Taking Care of Missing Data

X[:, 3:31] it covers from beggining account balance to 4ws_wgramt column

We do not need this step since we do not have missing data!

In [84]:
# from sklearn.impute import SimpleImputer
# imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
# imputer.fit(X[:, 3:31])
# X[:, 3:31] = imputer.transform(X[:, 3:31])

## Encoding Categorical Data

### Encoding the Independent Variable

In [37]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder

# Econde gender column (Binary)
le = LabelEncoder()

# Binary Encode gender and simplay
X[:, 0] = le.fit_transform(X[:, 0])
X[:, 1] = le.fit_transform(X[:, 1])

# # Encode rank and age_generartion columns
ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [2, 3, 4])], remainder='passthrough')
X = np.array(ct.fit_transform(X))

### Encoding the Dependent Variable

In [38]:
y = le.fit_transform(y)

## Splitting the dataset into the Training set and Test set

In [39]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 1)

## Feature Scaling

In [43]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()

# Scale all columns except the encoded ones
X_train[:, 14:] = sc.fit_transform(X_train[:, 14:])
X_test[:, 14:] = sc.transform(X_test[:, 14:])

## Training the Logistic Regression model on 

1.   X_train
2.   y_train

the Training set

In [44]:
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(random_state = 0)
classifier.fit(X_train, y_train)

### Which IDVs have the most weight in the model

In [50]:
# Get the feature names after one-hot encoding
feature_names = ct.get_feature_names_out()

# Get the coefficients and feature importance
coefficients = classifier.coef_[0]
feature_importance = list(zip(feature_names, coefficients))

# Sort the feature importance by absolute coefficient values
feature_importance.sort(key=lambda x: abs(x[1]), reverse=True)

# Print the feature importance
for feature, importance in feature_importance:
    print(f"{feature}: {importance}")

remainder__x41: 1.420711755235393
remainder__x18: 1.0525626765243217
remainder__x40: 1.0442705569264576
remainder__x29: 1.0109339038118972
remainder__x16: 0.6735446064175719
remainder__x32: 0.5941275969650018
remainder__x10: 0.5537565584653025
remainder__x15: -0.5316484769076729
remainder__x35: 0.5243428879840037
remainder__x12: -0.480587808830943
remainder__x21: 0.4426500967278362
remainder__x24: 0.4393659061072288
encoder__x3_near-hit: 0.4354229305711972
remainder__x14: -0.3974811833515881
remainder__x38: -0.35554647571447245
remainder__x23: 0.339091916249507
remainder__x30: 0.33313874915915
remainder__x9: -0.3274131807611045
remainder__x20: -0.32704939436760916
remainder__x27: -0.3013187659621852
remainder__x36: 0.27540762046476647
remainder__x26: 0.26420537571006547
remainder__x34: 0.23376360140726093
remainder__x11: -0.2320116858087338
encoder__x3_loss: -0.22856859166907986
remainder__x37: 0.2172656480871856
remainder__x31: 0.21354143798105646
encoder__x4_loss: -0.1907399492750969

In [51]:
# Get the feature names after one-hot encoding
feature_names = ct.get_feature_names_out()

# Get the coefficients and feature importance
coefficients = classifier.coef_[0]
feature_importance = list(zip(feature_names, coefficients))

# Sort the feature importance by absolute coefficient values
feature_importance.sort(key=lambda x: abs(x[1]), reverse=True)

# Create a list with feature and importance
feature_importance_list = [(feature, importance) for feature, importance in feature_importance]

# Sort the list from highest to lowest importance
feature_importance_list.sort(key=lambda x: abs(x[1]), reverse=True)

# Print the feature importance list
for feature, importance in feature_importance_list:
    print(f"{feature}: {importance}")


remainder__x41: 1.420711755235393
remainder__x18: 1.0525626765243217
remainder__x40: 1.0442705569264576
remainder__x29: 1.0109339038118972
remainder__x16: 0.6735446064175719
remainder__x32: 0.5941275969650018
remainder__x10: 0.5537565584653025
remainder__x15: -0.5316484769076729
remainder__x35: 0.5243428879840037
remainder__x12: -0.480587808830943
remainder__x21: 0.4426500967278362
remainder__x24: 0.4393659061072288
encoder__x3_near-hit: 0.4354229305711972
remainder__x14: -0.3974811833515881
remainder__x38: -0.35554647571447245
remainder__x23: 0.339091916249507
remainder__x30: 0.33313874915915
remainder__x9: -0.3274131807611045
remainder__x20: -0.32704939436760916
remainder__x27: -0.3013187659621852
remainder__x36: 0.27540762046476647
remainder__x26: 0.26420537571006547
remainder__x34: 0.23376360140726093
remainder__x11: -0.2320116858087338
encoder__x3_loss: -0.22856859166907986
remainder__x37: 0.2172656480871856
remainder__x31: 0.21354143798105646
encoder__x4_loss: -0.1907399492750969

### Predicting the Test set results

In [45]:
y_pred = classifier.predict(X_test)

print(np.concatenate((y_pred.reshape(len(y_pred),1), y_test.reshape(len(y_test),1)),1))

[[1 1]
 [0 0]
 [1 1]
 [1 1]
 [1 1]
 [1 1]
 [1 1]
 [0 1]
 [0 0]
 [0 0]
 [0 1]
 [1 1]
 [0 0]
 [1 1]
 [1 1]
 [0 0]
 [1 1]
 [1 1]
 [1 1]
 [0 0]
 [0 1]
 [0 0]
 [1 1]]


### Making the Confusion Matrix and Accuracy Score

In [46]:
from sklearn.metrics import confusion_matrix, accuracy_score
cm = confusion_matrix(y_test, y_pred)
print(cm)
print('Accuracy: ', accuracy_score(y_test, y_pred))

# Interpretation of confusion matrix
print('True Positive (B20): ', cm[0][0])
print('True Negative (T20): ', cm[1][1])
print('False Positive: ', cm[0][1])
print('False Negative: ', cm[1][0])

[[ 7  0]
 [ 3 13]]
Accuracy:  0.8695652173913043
True Positive (B20):  7
True Negative (T20):  13
False Positive:  0
False Negative:  3
