# Decision-Tree

## Data Preprocessing

In [1]:
# Importing the libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import os
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
from sklearn.tree import DecisionTreeClassifier
import random
import counting_fns as cf


# Months available in the data directory (one dataset per month)
months = ['2_June', '3_July', '4_August', '5_September', '6_October']

# Path to the data directory.
# The original machine-specific location stays the default so existing runs are
# unaffected; set the DT_DATA_PATH environment variable to point the notebook
# at a copy of the data on another machine without editing this cell.
data_path = os.environ.get(
    "DT_DATA_PATH",
    "/Users/mau/Library/CloudStorage/Dropbox/Mac/Documents/Dissertation/Chapter 2/Entire_Data/By month",
)

# Columns to keep when loading each dataset; 'classification' is listed last.
# NOTE: this name shadows Python's built-in filter(); it is kept because the
# loading cells further down pass `filter` to the dataset loader.
filter = [
    # session / player descriptors
    'session_time', 'gender', 'age_gen',
    # first and last gamble of the window
    'first_outcome', 'first_wager', 'first_p/b',
    'last_outcome', 'last_wager', 'last_p/b',
    # balances
    'beginning_amt', 'ending_amt', 'ending_balance',
    # slot denomination statistics
    'ave_slotdenom', 'std_slotdenom', 'min_slotdenom', 'max_slotdenom',
    # theoretical payback statistics
    'ave_theo_payback', 'min_theo_payback', 'max_theo_payback',
    # wager statistics
    'ave_wageramt', 'std_wageramt', 'min_wager', 'max_wager',
    # p/b statistics, profit and depletion
    'ave_p/b', 'std_p/b', 'max_p/b', 'max_profit', 'depletion_slope',
    # counts of denomination / max-bet changes
    '#inc_slotdenom', '#dec_slotdenom', '#inc_maxbet', '#dec_maxbet',
    # outcome counts and per-gamble ratios
    '#W', '#L', '#NH', '#D', 'w/g', 'l/g', 'nh/g', 'd/g',
    # streak features
    '#2ws', '2ws_profit', '2ws_wgramt',
    '#3ws', '3ws_profit', '3ws_wgramt',
    '#4ws', '4ws_profit', '4ws_wgramt',
    # timing features
    'ave_time_per_gamble', 'min_time_per_gamble', 'max_time_per_gamble',
    # machine usage
    'machines_changes', 'unique_machines', 'ave_time_per_machine',
    # target
    'classification',
]


## Randomized Months

In [2]:
# Randomly select 4 of the available months for the training set; whatever is
# left over becomes the test set.
# A dedicated, seeded generator keeps the split identical across kernel
# restarts without touching the global `random` state.
split_rng = random.Random(350)
months_train = split_rng.sample(months, 4)

# Print the months in the training set
print("Months in training set:", months_train)

# The test set is every month that was not drawn for training
months_test = [month for month in months if month not in months_train]
# Print the months in the test set
print("Months in test set:", months_test)

Months in training set: ['3_July', '6_October', '5_September', '2_June']
Months in test set: ['4_August']


## Load Dataframes

### 1 Min

In [8]:
# Load the per-month datasets for the 1-minute window
datasets = cf.load_and_preprocess_datasets_min_all(months, data_path, '1min', filter)

# Create training and test sets from the month split chosen earlier
dt_train = pd.concat([datasets['dtf'+month[1:]] for month in months_train])
dt_test = pd.concat([datasets['dtf'+month[1:]] for month in months_test])

# Separate independent variables (every column but the last) from the
# dependent variable (last column)
X_train = dt_train.iloc[:, :-1].values
y_train = dt_train.iloc[:, -1].values
X_test = dt_test.iloc[:, :-1].values
y_test = dt_test.iloc[:, -1].values

# Label-encode column 0, which this notebook treats as the binary gender column.
# NOTE(review): `filter` lists 'session_time' before 'gender', so the positional
# indices used here are only right if the loader drops or reorders columns —
# confirm against counting_fns.
# The encoder is fitted on the training data only and then applied to the test
# data, so both sets share one category -> integer mapping.
gender_le = LabelEncoder()
X_train[:, 0] = gender_le.fit_transform(X_train[:, 0])
X_test[:, 0] = gender_le.transform(X_test[:, 0])

# One-hot encode columns 1, 2 and 5 (intended: age_gen, first_outcome,
# last_outcome — TODO confirm positions); all other columns pass through.
ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [1, 2, 5])], remainder='passthrough')
X_train = np.array(ct.fit_transform(X_train))
X_test = np.array(ct.transform(X_test))

# Encode the target with its own encoder, fitted on the training labels only.
# transform() raises on a label unseen in training instead of silently
# assigning the test set a different class numbering.
le = LabelEncoder()
y_train = le.fit_transform(y_train)
y_test = le.transform(y_test)

# Feature scaling: statistics come from the training set only
sc = StandardScaler()

# Scale all columns except the encoded ones
# (assumes the encoded columns occupy the first 14 positions — TODO confirm)
X_train[:, 14:] = sc.fit_transform(X_train[:, 14:])
X_test[:, 14:] = sc.transform(X_test[:, 14:])

# Fit the decision tree on the training months
classifier = DecisionTreeClassifier(criterion = 'entropy', random_state = 0)
classifier.fit(X_train, y_train)

# Evaluate on the held-out month(s)
y_pred = classifier.predict(X_test)

cm = confusion_matrix(y_test, y_pred)
print(cm)
print('Accuracy: ', accuracy_score(y_test, y_pred))
print('Precision: ', precision_score(y_test, y_pred, average='macro'))
print('Recall: ', recall_score(y_test, y_pred, average='macro'))
print('F1 Score: ', f1_score(y_test, y_pred, average='macro'))

[[906 777 683 567]
 [727 765 776 662]
 [630 745 834 744]
 [470 685 704 952]]
Accuracy:  0.29732519136492647
Precision:  0.29816365707283576
Recall:  0.29777126354699435
F1 Score:  0.2978278138115156


### 2 Min

In [4]:
# Load the per-month datasets for the 2-minute window
datasets_2 = cf.load_and_preprocess_datasets_min_all(months, data_path, '2min', filter)

# Create training and test sets from the month split chosen earlier
dt_train = pd.concat([datasets_2['dtf'+month[1:]] for month in months_train])
dt_test = pd.concat([datasets_2['dtf'+month[1:]] for month in months_test])

# Separate independent variables (every column but the last) from the
# dependent variable (last column)
X_train = dt_train.iloc[:, :-1].values
y_train = dt_train.iloc[:, -1].values
X_test = dt_test.iloc[:, :-1].values
y_test = dt_test.iloc[:, -1].values

# Label-encode column 0, which this notebook treats as the binary gender column.
# NOTE(review): `filter` lists 'session_time' before 'gender', so the positional
# indices used here are only right if the loader drops or reorders columns —
# confirm against counting_fns.
# The encoder is fitted on the training data only and then applied to the test
# data, so both sets share one category -> integer mapping.
gender_le = LabelEncoder()
X_train[:, 0] = gender_le.fit_transform(X_train[:, 0])
X_test[:, 0] = gender_le.transform(X_test[:, 0])

# One-hot encode columns 1, 2 and 5 (intended: age_gen, first_outcome,
# last_outcome — TODO confirm positions); all other columns pass through.
ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [1, 2, 5])], remainder='passthrough')
X_train = np.array(ct.fit_transform(X_train))
X_test = np.array(ct.transform(X_test))

# Encode the target with its own encoder, fitted on the training labels only.
# transform() raises on a label unseen in training instead of silently
# assigning the test set a different class numbering.
le = LabelEncoder()
y_train = le.fit_transform(y_train)
y_test = le.transform(y_test)

# Feature scaling: statistics come from the training set only
sc = StandardScaler()

# Scale all columns except the encoded ones
# (assumes the encoded columns occupy the first 14 positions — TODO confirm)
X_train[:, 14:] = sc.fit_transform(X_train[:, 14:])
X_test[:, 14:] = sc.transform(X_test[:, 14:])

# Fit the decision tree on the training months
classifier = DecisionTreeClassifier(criterion = 'entropy', random_state = 0)
classifier.fit(X_train, y_train)

# Evaluate on the held-out month(s)
y_pred = classifier.predict(X_test)

cm = confusion_matrix(y_test, y_pred)
print(cm)
print('Accuracy: ', accuracy_score(y_test, y_pred))
print('Precision: ', precision_score(y_test, y_pred, average='macro'))
print('Recall: ', recall_score(y_test, y_pred, average='macro'))
print('F1 Score: ', f1_score(y_test, y_pred, average='macro'))

[[929 760 727 517]
 [731 808 776 615]
 [693 787 773 700]
 [512 611 714 974]]
Accuracy:  0.29964737249505463
Precision:  0.3005802037726606
Recall:  0.3001930148965408
F1 Score:  0.30037044391566153


### 3 Min

In [5]:
# Load the per-month datasets for the 3-minute window
datasets_3 = cf.load_and_preprocess_datasets_min_all(months, data_path, '3min', filter)

# Create training and test sets from the month split chosen earlier
dt_train = pd.concat([datasets_3['dtf'+month[1:]] for month in months_train])
dt_test = pd.concat([datasets_3['dtf'+month[1:]] for month in months_test])

# Separate independent variables (every column but the last) from the
# dependent variable (last column)
X_train = dt_train.iloc[:, :-1].values
y_train = dt_train.iloc[:, -1].values
X_test = dt_test.iloc[:, :-1].values
y_test = dt_test.iloc[:, -1].values

# Label-encode column 0, which this notebook treats as the binary gender column.
# NOTE(review): `filter` lists 'session_time' before 'gender', so the positional
# indices used here are only right if the loader drops or reorders columns —
# confirm against counting_fns.
# The encoder is fitted on the training data only and then applied to the test
# data, so both sets share one category -> integer mapping.
gender_le = LabelEncoder()
X_train[:, 0] = gender_le.fit_transform(X_train[:, 0])
X_test[:, 0] = gender_le.transform(X_test[:, 0])

# One-hot encode columns 1, 2 and 5 (intended: age_gen, first_outcome,
# last_outcome — TODO confirm positions); all other columns pass through.
ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [1, 2, 5])], remainder='passthrough')
X_train = np.array(ct.fit_transform(X_train))
X_test = np.array(ct.transform(X_test))

# Encode the target with its own encoder, fitted on the training labels only.
# transform() raises on a label unseen in training instead of silently
# assigning the test set a different class numbering.
le = LabelEncoder()
y_train = le.fit_transform(y_train)
y_test = le.transform(y_test)

# Feature scaling: statistics come from the training set only
sc = StandardScaler()

# Scale all columns except the encoded ones
# (assumes the encoded columns occupy the first 14 positions — TODO confirm)
X_train[:, 14:] = sc.fit_transform(X_train[:, 14:])
X_test[:, 14:] = sc.transform(X_test[:, 14:])

# Fit the decision tree on the training months
classifier = DecisionTreeClassifier(criterion = 'entropy', random_state = 0)
classifier.fit(X_train, y_train)

# Evaluate on the held-out month(s)
y_pred = classifier.predict(X_test)

cm = confusion_matrix(y_test, y_pred)
print(cm)
print('Accuracy: ', accuracy_score(y_test, y_pred))
print('Precision: ', precision_score(y_test, y_pred, average='macro'))
print('Recall: ', recall_score(y_test, y_pred, average='macro'))
print('F1 Score: ', f1_score(y_test, y_pred, average='macro'))

[[ 948  793  735  457]
 [ 724  855  757  594]
 [ 730  769  799  655]
 [ 462  624  656 1069]]
Accuracy:  0.31573062698890514
Precision:  0.3171278754760924
Recall:  0.3164728579434433
F1 Score:  0.31675998556066676


### 4 Min

In [6]:
# Load the per-month datasets for the 4-minute window
datasets_4 = cf.load_and_preprocess_datasets_min_all(months, data_path, '4min', filter)

# Create training and test sets from the month split chosen earlier
dt_train = pd.concat([datasets_4['dtf'+month[1:]] for month in months_train])
dt_test = pd.concat([datasets_4['dtf'+month[1:]] for month in months_test])

# Separate independent variables (every column but the last) from the
# dependent variable (last column)
X_train = dt_train.iloc[:, :-1].values
y_train = dt_train.iloc[:, -1].values
X_test = dt_test.iloc[:, :-1].values
y_test = dt_test.iloc[:, -1].values

# Label-encode column 0, which this notebook treats as the binary gender column.
# NOTE(review): `filter` lists 'session_time' before 'gender', so the positional
# indices used here are only right if the loader drops or reorders columns —
# confirm against counting_fns.
# The encoder is fitted on the training data only and then applied to the test
# data, so both sets share one category -> integer mapping.
gender_le = LabelEncoder()
X_train[:, 0] = gender_le.fit_transform(X_train[:, 0])
X_test[:, 0] = gender_le.transform(X_test[:, 0])

# One-hot encode columns 1, 2 and 5 (intended: age_gen, first_outcome,
# last_outcome — TODO confirm positions); all other columns pass through.
ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [1, 2, 5])], remainder='passthrough')
X_train = np.array(ct.fit_transform(X_train))
X_test = np.array(ct.transform(X_test))

# Encode the target with its own encoder, fitted on the training labels only.
# transform() raises on a label unseen in training instead of silently
# assigning the test set a different class numbering.
le = LabelEncoder()
y_train = le.fit_transform(y_train)
y_test = le.transform(y_test)

# Feature scaling: statistics come from the training set only
sc = StandardScaler()

# Scale all columns except the encoded ones
# (assumes the encoded columns occupy the first 14 positions — TODO confirm)
X_train[:, 14:] = sc.fit_transform(X_train[:, 14:])
X_test[:, 14:] = sc.transform(X_test[:, 14:])

# Fit the decision tree on the training months
classifier = DecisionTreeClassifier(criterion = 'entropy', random_state = 0)
classifier.fit(X_train, y_train)

# Evaluate on the held-out month(s)
y_pred = classifier.predict(X_test)

cm = confusion_matrix(y_test, y_pred)
print(cm)
print('Accuracy: ', accuracy_score(y_test, y_pred))
print('Precision: ', precision_score(y_test, y_pred, average='macro'))
print('Recall: ', recall_score(y_test, y_pred, average='macro'))
print('F1 Score: ', f1_score(y_test, y_pred, average='macro'))

[[ 962  803  712  456]
 [ 781  816  718  615]
 [ 719  734  827  673]
 [ 453  539  627 1192]]
Accuracy:  0.3265674722628365
Precision:  0.32623097176105575
Recall:  0.3276481685784679
F1 Score:  0.3268768197252367


### 5 Min

In [7]:
# Load the per-month datasets for the 5-minute window
datasets_5 = cf.load_and_preprocess_datasets_min_all(months, data_path, '5min', filter)

# Create training and test sets from the month split chosen earlier
dt_train = pd.concat([datasets_5['dtf'+month[1:]] for month in months_train])
dt_test = pd.concat([datasets_5['dtf'+month[1:]] for month in months_test])

# Separate independent variables (every column but the last) from the
# dependent variable (last column)
X_train = dt_train.iloc[:, :-1].values
y_train = dt_train.iloc[:, -1].values
X_test = dt_test.iloc[:, :-1].values
y_test = dt_test.iloc[:, -1].values

# Label-encode column 0, which this notebook treats as the binary gender column.
# NOTE(review): `filter` lists 'session_time' before 'gender', so the positional
# indices used here are only right if the loader drops or reorders columns —
# confirm against counting_fns.
# The encoder is fitted on the training data only and then applied to the test
# data, so both sets share one category -> integer mapping.
gender_le = LabelEncoder()
X_train[:, 0] = gender_le.fit_transform(X_train[:, 0])
X_test[:, 0] = gender_le.transform(X_test[:, 0])

# One-hot encode columns 1, 2 and 5 (intended: age_gen, first_outcome,
# last_outcome — TODO confirm positions); all other columns pass through.
ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [1, 2, 5])], remainder='passthrough')
X_train = np.array(ct.fit_transform(X_train))
X_test = np.array(ct.transform(X_test))

# Encode the target with its own encoder, fitted on the training labels only.
# transform() raises on a label unseen in training instead of silently
# assigning the test set a different class numbering.
le = LabelEncoder()
y_train = le.fit_transform(y_train)
y_test = le.transform(y_test)

# Feature scaling: statistics come from the training set only
sc = StandardScaler()

# Scale all columns except the encoded ones
# (assumes the encoded columns occupy the first 14 positions — TODO confirm)
X_train[:, 14:] = sc.fit_transform(X_train[:, 14:])
X_test[:, 14:] = sc.transform(X_test[:, 14:])

# Fit the decision tree on the training months
classifier = DecisionTreeClassifier(criterion = 'entropy', random_state = 0)
classifier.fit(X_train, y_train)

# Evaluate on the held-out month(s)
y_pred = classifier.predict(X_test)

cm = confusion_matrix(y_test, y_pred)
print(cm)
print('Accuracy: ', accuracy_score(y_test, y_pred))
print('Precision: ', precision_score(y_test, y_pred, average='macro'))
print('Recall: ', recall_score(y_test, y_pred, average='macro'))
print('F1 Score: ', f1_score(y_test, y_pred, average='macro'))

[[1005  798  713  417]
 [ 868  796  758  508]
 [ 719  747  846  641]
 [ 382  584  627 1218]]
Accuracy:  0.3324159284424185
Precision:  0.3337323995265421
Recall:  0.33352775121886324
F1 Score:  0.3336232253635535


In [None]:
from sklearn.inspection import permutation_importance

# Permutation importance of each feature, measured on the training data.
# NOTE: `classifier`, `X_train`, `y_train` and `ct` are whatever the most
# recently executed training cell left in the kernel.
result = permutation_importance(
    classifier,
    X_train,
    y_train,
    n_repeats=10,
    random_state=42,
)

# Mean importance over the repeats, paired with the transformer's output names
importances = result.importances_mean
feature_names = ct.get_feature_names_out()

# (name, importance) pairs ordered from most to least important
feature_importance = sorted(
    zip(feature_names, importances),
    key=lambda pair: pair[1],
    reverse=True,
)

# Report one "name: importance" line per feature
for feature, importance in feature_importance:
    print(f"{feature}: {importance}")