# Random Forest

## Data Preprocessing

In [4]:
# Importing the libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import os
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import SMOTE
import random
import counting_fns as cf
from sklearn.utils import shuffle

# Month labels used throughout the notebook ('<number>_<name>')
months = [
    '2_June',
    '3_July',
    '4_August',
    '5_September',
    '6_October',
]

# Root directory of the per-month data, handed to the dataset loader
data_path = "/Users/mau/Library/CloudStorage/Dropbox/Mac/Documents/Dissertation/Chapter 2/Entire_Data/By month"

# Columns to keep when loading a dataset; 'classification' comes last.
# NOTE(review): this name shadows the built-in filter(); it is left unchanged
# because the other cells refer to it by this name.
filter = [
    'session_time', 'gender', 'age_gen', 'day', 'timeofday',
    'first_outcome', 'first_wager', 'first_p/b',
    'last_outcome', 'last_wager', 'last_p/b',
    'beginning_amt', 'ending_amt', 'ending_balance',
    'ave_slotdenom', 'std_slotdenom', 'min_slotdenom', 'max_slotdenom',
    'ave_theo_payback', 'min_theo_payback', 'max_theo_payback',
    'ave_wageramt', 'std_wageramt', 'min_wager', 'max_wager',
    'ave_p/b', 'std_p/b', 'max_p/b',
    'max_profit', 'depletion_slope',
    '#inc_slotdenom', '#dec_slotdenom', '#inc_maxbet', '#dec_maxbet',
    '#W', '#L', '#NH', '#D',
    'w/min', 'l/min',
    '#2ws', '2ws_profit', '2ws_wgramt', '2ws/min',
    '#3ws', '3ws_profit', '3ws_wgramt', '3ws/min',
    '#4ws', '4ws_profit', '4ws_wgramt', '4ws/min',
    'w/g', 'l/g', 'nh/g', 'd/g',
    'ave_time_per_gamble', 'min_time_per_gamble', 'max_time_per_gamble',
    'total_gambles', 'machines_changes', 'unique_machines', 'ave_time_per_machine',
    'classification',
]

In [5]:
# Per-month parameter sets, kept for reference only (presumably from an
# earlier tuning run that is not part of this notebook):
#   month      max_depth  min_samples_leaf  min_samples_split  n_estimators
#   October    20         1                 2                  200
#   September  20         1                 2                  200
#   August     None       1                 2                  200
#   July       None       1                 2                  200
#   June       10         1                 2                  100

# Single parameter set used for every month's RandomForestClassifier
best_params = dict(
    max_depth=None,
    min_samples_leaf=1,
    min_samples_split=2,
    n_estimators=200,
)


## June

In [6]:
# Hold-out evaluation for June: train on the other four months, test on June,
# once per time interval, and collect the metrics into a LaTeX table.
months_train = ['4_August', '3_July', '5_September', '6_October']
months_test = ['2_June']
# Time intervals in minutes; each one is passed to the loader as '<n>min'
time_intervals = [1, 5, 10, 15]

# One metrics row per interval, and one feature-importance frame per interval
results = []
dataframes_features = {}
for time_interval in time_intervals:
    # Load the datasets of every month for this time interval
    file_name = f'{time_interval}min'
    datasets = cf.load_and_preprocess_datasets_min_ntop(months, data_path, file_name, filter)

    # Create training and test sets; '2_June' maps to the key 'dtf_June'
    dt_train = pd.concat([datasets['dtf'+month[1:]] for month in months_train])
    dt_test = pd.concat([datasets['dtf'+month[1:]] for month in months_test])

    # Separate independent variables (all but the last column) from the
    # dependent variable (last column)
    X_train = dt_train.iloc[:, :-1].values
    y_train = dt_train.iloc[:, -1].values
    X_test = dt_test.iloc[:, :-1].values
    y_test = dt_test.iloc[:, -1].values

    # Binary encode the gender column (column 0): fit on train, reuse on test
    le = LabelEncoder()
    X_train[:, 0] = le.fit_transform(X_train[:, 0])
    X_test[:, 0] = le.transform(X_test[:, 0])

    # One-hot encode the categorical columns at positions 1, 2, 3, 4 and 7;
    # the encoded columns come first in the output, the rest are passed through
    ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [1, 2, 3, 4, 7])], remainder='passthrough')
    X_train = ct.fit_transform(X_train)
    X_test = ct.transform(X_test)

    # Encode the target with its own encoder, fitted on the training labels
    # only. The test labels must go through transform() (not fit_transform())
    # so that both sets share one class -> integer mapping; an unseen test
    # label now raises instead of being silently mapped to a wrong class.
    target_le = LabelEncoder()
    y_train = target_le.fit_transform(y_train)
    y_test = target_le.transform(y_test)

    sc = StandardScaler()

    # Scale all columns except the encoded ones
    # NOTE(review): assumes the one-hot block ends before column 25 - confirm
    # if the set of categories in the data changes
    X_train[:, 25:] = sc.fit_transform(X_train[:, 25:])
    X_test[:, 25:] = sc.transform(X_test[:, 25:])

    # Apply SMOTE to balance the training set (the test set is left untouched)
    sm = SMOTE(random_state=42)
    X_train_resample, y_train_resample = sm.fit_resample(X_train, y_train)

    # Shuffle the resampled data
    X_train_resample, y_train_resample = shuffle(X_train_resample, y_train_resample, random_state=42)

    classifier = RandomForestClassifier(random_state=0, **best_params)
    classifier.fit(X_train_resample, y_train_resample)

    y_pred = classifier.predict(X_test)

    # Not used further in this cell; after the loop it holds the matrix of
    # the last interval
    cm = confusion_matrix(y_test, y_pred)
    # Calculate evaluation metrics (macro-averaged over the classes)
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='macro', zero_division=1)
    recall = recall_score(y_test, y_pred, average='macro')
    f1 = f1_score(y_test, y_pred, average='macro')

    # Append results for this time interval
    results.append([f'{time_interval} min', round(accuracy, 3), round(precision, 3), round(recall, 3), round(f1, 3)])

    # Feature-importance table for this interval, computed on the test set
    coefficients_df = cf.permutation_importance_rf(dt_train, X_test, y_test, classifier, ct)

    dataframes_features[f'coefficients_df_{time_interval}min'] = coefficients_df

# Create a DataFrame for the results
columns = ['Time', 'Accuracy', 'Precision', 'Recall', 'F1 Score']
results_df = pd.DataFrame(results, columns=columns)

# Print the results as a LaTeX table
latex_table = results_df.to_latex(index=False, escape=False)
print(latex_table)

\begin{tabular}{lrrrr}
\toprule
  Time &  Accuracy &  Precision &  Recall &  F1 Score \\
\midrule
 1 min &     0.888 &      0.446 &   0.498 &     0.470 \\
 5 min &     0.892 &      0.699 &   0.531 &     0.534 \\
10 min &     0.881 &      0.446 &   0.494 &     0.468 \\
15 min &     0.888 &      0.649 &   0.529 &     0.531 \\
\bottomrule
\end{tabular}



  latex_table = results_df.to_latex(index=False, escape=False)


In [7]:
# Show the first five rows of each interval's feature-importance table
for time_interval in time_intervals:
    print(
        f'Top 5 features for {time_interval} min',
        dataframes_features[f'coefficients_df_{time_interval}min'].head(5),
        '\n',
        sep='\n',
    )

Top 5 features for 1 min
                  feature  coefficient
0  encoder__age_gen_Gen X     0.005385
1              last_wager    -0.003846
2           beginning_amt    -0.003846
3              max_profit    -0.003846
4            ave_wageramt    -0.003846


Top 5 features for 5 min
                         feature  coefficient
0  encoder__age_gen_Baby Boomers     0.007308
1    encoder__age_gen_Millenials     0.004231
2            min_time_per_gamble     0.004231
3    encoder__first_outcome_loss     0.003846
4                             #L     0.003846


Top 5 features for 10 min
         feature  coefficient
0     2ws_profit    -0.006538
1     ending_amt    -0.005769
2             #W    -0.005385
3  total_gambles    -0.005000
4      first_p/b     0.005000


Top 5 features for 15 min
                          feature  coefficient
0                            #2ws     0.023462
1                         2ws/min     0.009231
2   encoder__age_gen_Baby Boomers     0.008077
3  encoder__la

In [8]:
# Move into the June output folder; the relative CSV names below are resolved
# against it. Note that os.chdir changes the working directory for every cell
# executed afterwards.
os.chdir('/Users/mau/Library/CloudStorage/Dropbox/Mac/Documents/Dissertation/Chapter 2/Entire_Data/Feauture Importance/Minutes/RF/June')
# Write one CSV file per time interval
for time_interval in time_intervals:
    print(f'Saving features for {time_interval} min interval')
    frame_key = f'coefficients_df_{time_interval}min'
    dataframes_features[frame_key].to_csv(f'{frame_key}.csv')
    

Saving features for 1 min interval
Saving features for 5 min interval
Saving features for 10 min interval
Saving features for 15 min interval


## July

In [9]:
# Hold-out evaluation for July: train on the other four months, test on July,
# once per time interval, and collect the metrics into a LaTeX table.
months_train = ['4_August','2_June' , '5_September', '6_October']
months_test = ['3_July']
# Time intervals in minutes; each one is passed to the loader as '<n>min'
time_intervals = [1, 5, 10, 15]

# One metrics row per interval, and one feature-importance frame per interval
results = []
dataframes_features = {}
for time_interval in time_intervals:
    # Load the datasets of every month for this time interval
    file_name = f'{time_interval}min'
    datasets = cf.load_and_preprocess_datasets_min_ntop(months, data_path, file_name, filter)

    # Create training and test sets; '3_July' maps to the key 'dtf_July'
    dt_train = pd.concat([datasets['dtf'+month[1:]] for month in months_train])
    dt_test = pd.concat([datasets['dtf'+month[1:]] for month in months_test])

    # Separate independent variables (all but the last column) from the
    # dependent variable (last column)
    X_train = dt_train.iloc[:, :-1].values
    y_train = dt_train.iloc[:, -1].values
    X_test = dt_test.iloc[:, :-1].values
    y_test = dt_test.iloc[:, -1].values

    # Binary encode the gender column (column 0): fit on train, reuse on test
    le = LabelEncoder()
    X_train[:, 0] = le.fit_transform(X_train[:, 0])
    X_test[:, 0] = le.transform(X_test[:, 0])

    # One-hot encode the categorical columns at positions 1, 2, 3, 4 and 7;
    # the encoded columns come first in the output, the rest are passed through
    ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [1, 2, 3, 4, 7])], remainder='passthrough')
    X_train = ct.fit_transform(X_train)
    X_test = ct.transform(X_test)

    # Encode the target with its own encoder, fitted on the training labels
    # only. The test labels must go through transform() (not fit_transform())
    # so that both sets share one class -> integer mapping; an unseen test
    # label now raises instead of being silently mapped to a wrong class.
    target_le = LabelEncoder()
    y_train = target_le.fit_transform(y_train)
    y_test = target_le.transform(y_test)

    sc = StandardScaler()

    # Scale all columns except the encoded ones
    # NOTE(review): assumes the one-hot block ends before column 25 - confirm
    # if the set of categories in the data changes
    X_train[:, 25:] = sc.fit_transform(X_train[:, 25:])
    X_test[:, 25:] = sc.transform(X_test[:, 25:])

    # Apply SMOTE to balance the training set (the test set is left untouched)
    sm = SMOTE(random_state=42)
    X_train_resample, y_train_resample = sm.fit_resample(X_train, y_train)

    # Shuffle the resampled data
    X_train_resample, y_train_resample = shuffle(X_train_resample, y_train_resample, random_state=42)

    classifier = RandomForestClassifier(random_state=0, **best_params)
    classifier.fit(X_train_resample, y_train_resample)

    y_pred = classifier.predict(X_test)

    # Not used further in this cell; after the loop it holds the matrix of
    # the last interval
    cm = confusion_matrix(y_test, y_pred)
    # Calculate evaluation metrics (macro-averaged over the classes)
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='macro', zero_division=1)
    recall = recall_score(y_test, y_pred, average='macro')
    f1 = f1_score(y_test, y_pred, average='macro')

    # Append results for this time interval
    results.append([f'{time_interval} min', round(accuracy, 3), round(precision, 3), round(recall, 3), round(f1, 3)])

    # Feature-importance table for this interval, computed on the test set
    coefficients_df = cf.permutation_importance_rf(dt_train, X_test, y_test, classifier, ct)

    dataframes_features[f'coefficients_df_{time_interval}min'] = coefficients_df

# Create a DataFrame for the results
columns = ['Time', 'Accuracy', 'Precision', 'Recall', 'F1 Score']
results_df = pd.DataFrame(results, columns=columns)

# Print the results as a LaTeX table
latex_table = results_df.to_latex(index=False, escape=False)
print(latex_table)

\begin{tabular}{lrrrr}
\toprule
  Time &  Accuracy &  Precision &  Recall &  F1 Score \\
\midrule
 1 min &     0.894 &      0.619 &   0.515 &     0.506 \\
 5 min &     0.896 &      0.664 &   0.525 &     0.525 \\
10 min &     0.895 &      0.658 &   0.535 &     0.541 \\
15 min &     0.897 &      0.690 &   0.555 &     0.572 \\
\bottomrule
\end{tabular}



  latex_table = results_df.to_latex(index=False, escape=False)


In [10]:
# Show the first five rows of each interval's feature-importance table
for time_interval in time_intervals:
    print(
        f'Top 5 features for {time_interval} min',
        dataframes_features[f'coefficients_df_{time_interval}min'].head(5),
        '\n',
        sep='\n',
    )

Top 5 features for 1 min
                         feature  coefficient
0         encoder__age_gen_Gen X     0.002294
1  encoder__age_gen_Baby Boomers     0.001956
2               encoder__day_1.0     0.001001
3   encoder__timeofday_afternoon     0.000932
4               encoder__day_6.0     0.000816


Top 5 features for 5 min
                         feature  coefficient
0  encoder__age_gen_Baby Boomers     0.002502
1         encoder__age_gen_Gen X     0.002371
2                             #D     0.001147
3                  beginning_amt     0.001093
4     encoder__last_outcome_loss     0.001032


Top 5 features for 10 min
                         feature  coefficient
0  encoder__age_gen_Baby Boomers     0.003257
1         encoder__age_gen_Gen X     0.002841
2                  total_gambles    -0.001925
3                        2ws/min     0.001771
4                             #D     0.001378


Top 5 features for 15 min
                         feature  coefficient
0                 

In [11]:
# Move into the July output folder; the relative CSV names below are resolved
# against it. Note that os.chdir changes the working directory for every cell
# executed afterwards.
os.chdir('/Users/mau/Library/CloudStorage/Dropbox/Mac/Documents/Dissertation/Chapter 2/Entire_Data/Feauture Importance/Minutes/RF/July')
# Write one CSV file per time interval
for time_interval in time_intervals:
    print(f'Saving features for {time_interval} min interval')
    frame_key = f'coefficients_df_{time_interval}min'
    dataframes_features[frame_key].to_csv(f'{frame_key}.csv')
    

Saving features for 1 min interval
Saving features for 5 min interval
Saving features for 10 min interval
Saving features for 15 min interval


## August

In [12]:
# Hold-out evaluation for August: train on the other four months, test on
# August, once per time interval, and collect the metrics into a LaTeX table.
months_train = ['2_June', '3_July', '5_September', '6_October']
months_test = ['4_August']
# Time intervals in minutes; each one is passed to the loader as '<n>min'
time_intervals = [1, 5, 10, 15]

# One metrics row per interval, and one feature-importance frame per interval
results = []
dataframes_features = {}
for time_interval in time_intervals:
    # Load the datasets of every month for this time interval
    file_name = f'{time_interval}min'
    datasets = cf.load_and_preprocess_datasets_min_ntop(months, data_path, file_name, filter)

    # Create training and test sets; '4_August' maps to the key 'dtf_August'
    dt_train = pd.concat([datasets['dtf'+month[1:]] for month in months_train])
    dt_test = pd.concat([datasets['dtf'+month[1:]] for month in months_test])

    # Separate independent variables (all but the last column) from the
    # dependent variable (last column)
    X_train = dt_train.iloc[:, :-1].values
    y_train = dt_train.iloc[:, -1].values
    X_test = dt_test.iloc[:, :-1].values
    y_test = dt_test.iloc[:, -1].values

    # Binary encode the gender column (column 0): fit on train, reuse on test
    le = LabelEncoder()
    X_train[:, 0] = le.fit_transform(X_train[:, 0])
    X_test[:, 0] = le.transform(X_test[:, 0])

    # One-hot encode the categorical columns at positions 1, 2, 3, 4 and 7;
    # the encoded columns come first in the output, the rest are passed through
    ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [1, 2, 3, 4, 7])], remainder='passthrough')
    X_train = ct.fit_transform(X_train)
    X_test = ct.transform(X_test)

    # Encode the target with its own encoder, fitted on the training labels
    # only. The test labels must go through transform() (not fit_transform())
    # so that both sets share one class -> integer mapping; an unseen test
    # label now raises instead of being silently mapped to a wrong class.
    target_le = LabelEncoder()
    y_train = target_le.fit_transform(y_train)
    y_test = target_le.transform(y_test)

    sc = StandardScaler()

    # Scale all columns except the encoded ones
    # NOTE(review): assumes the one-hot block ends before column 25 - confirm
    # if the set of categories in the data changes
    X_train[:, 25:] = sc.fit_transform(X_train[:, 25:])
    X_test[:, 25:] = sc.transform(X_test[:, 25:])

    # Apply SMOTE to balance the training set (the test set is left untouched)
    sm = SMOTE(random_state=42)
    X_train_resample, y_train_resample = sm.fit_resample(X_train, y_train)

    # Shuffle the resampled data
    X_train_resample, y_train_resample = shuffle(X_train_resample, y_train_resample, random_state=42)

    classifier = RandomForestClassifier(random_state=0, **best_params)
    classifier.fit(X_train_resample, y_train_resample)

    y_pred = classifier.predict(X_test)

    # Not used further in this cell; after the loop it holds the matrix of
    # the last interval
    cm = confusion_matrix(y_test, y_pred)
    # Calculate evaluation metrics (macro-averaged over the classes)
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='macro', zero_division=1)
    recall = recall_score(y_test, y_pred, average='macro')
    f1 = f1_score(y_test, y_pred, average='macro')

    # Append results for this time interval
    results.append([f'{time_interval} min', round(accuracy, 3), round(precision, 3), round(recall, 3), round(f1, 3)])

    # Feature-importance table for this interval, computed on the test set
    coefficients_df = cf.permutation_importance_rf(dt_train, X_test, y_test, classifier, ct)

    dataframes_features[f'coefficients_df_{time_interval}min'] = coefficients_df

# Create a DataFrame for the results
columns = ['Time', 'Accuracy', 'Precision', 'Recall', 'F1 Score']
results_df = pd.DataFrame(results, columns=columns)

# Print the results as a LaTeX table
latex_table = results_df.to_latex(index=False, escape=False)
print(latex_table)

\begin{tabular}{lrrrr}
\toprule
  Time &  Accuracy &  Precision &  Recall &  F1 Score \\
\midrule
 1 min &     0.894 &      0.609 &   0.514 &     0.506 \\
 5 min &     0.897 &      0.665 &   0.523 &     0.522 \\
10 min &     0.897 &      0.680 &   0.535 &     0.541 \\
15 min &     0.896 &      0.679 &   0.545 &     0.557 \\
\bottomrule
\end{tabular}



  latex_table = results_df.to_latex(index=False, escape=False)


In [13]:
# Show the first five rows of each interval's feature-importance table
for time_interval in time_intervals:
    print(
        f'Top 5 features for {time_interval} min',
        dataframes_features[f'coefficients_df_{time_interval}min'].head(5),
        '\n',
        sep='\n',
    )

Top 5 features for 1 min
                         feature  coefficient
0                     ending_amt    -0.001414
1               ave_theo_payback    -0.001380
2  encoder__age_gen_Baby Boomers     0.001216
3               encoder__day_1.0     0.000914
4                  total_gambles    -0.000888


Top 5 features for 5 min
                         feature  coefficient
0  encoder__age_gen_Baby Boomers     0.001992
1         encoder__age_gen_Gen X     0.001949
2                         gender     0.000992
3                             #W    -0.000992
4                  beginning_amt     0.000880


Top 5 features for 10 min
                         feature  coefficient
0                           #2ws     0.002812
1                        2ws/min     0.002751
2         encoder__age_gen_Gen X     0.001440
3   encoder__timeofday_afternoon     0.001397
4  encoder__age_gen_Baby Boomers     0.001130


Top 5 features for 15 min
            feature  coefficient
0              #2ws     0.00721

In [14]:
# Move into the August output folder; the relative CSV names below are
# resolved against it. Note that os.chdir changes the working directory for
# every cell executed afterwards.
os.chdir('/Users/mau/Library/CloudStorage/Dropbox/Mac/Documents/Dissertation/Chapter 2/Entire_Data/Feauture Importance/Minutes/RF/August')
# Write one CSV file per time interval
for time_interval in time_intervals:
    print(f'Saving features for {time_interval} min interval')
    frame_key = f'coefficients_df_{time_interval}min'
    dataframes_features[frame_key].to_csv(f'{frame_key}.csv')
    

Saving features for 1 min interval
Saving features for 5 min interval
Saving features for 10 min interval
Saving features for 15 min interval


## September

In [15]:
# Hold-out evaluation for September: train on the other four months, test on
# September, once per time interval, and collect the metrics into a LaTeX table.
months_train = ['4_August', '3_July', '2_June', '6_October']
months_test = ['5_September']
# Time intervals in minutes; each one is passed to the loader as '<n>min'
time_intervals = [1, 5, 10, 15]

# One metrics row per interval, and one feature-importance frame per interval
results = []
dataframes_features = {}
for time_interval in time_intervals:
    # Load the datasets of every month for this time interval
    file_name = f'{time_interval}min'
    datasets = cf.load_and_preprocess_datasets_min_ntop(months, data_path, file_name, filter)

    # Create training and test sets; '5_September' maps to the key 'dtf_September'
    dt_train = pd.concat([datasets['dtf'+month[1:]] for month in months_train])
    dt_test = pd.concat([datasets['dtf'+month[1:]] for month in months_test])

    # Separate independent variables (all but the last column) from the
    # dependent variable (last column)
    X_train = dt_train.iloc[:, :-1].values
    y_train = dt_train.iloc[:, -1].values
    X_test = dt_test.iloc[:, :-1].values
    y_test = dt_test.iloc[:, -1].values

    # Binary encode the gender column (column 0): fit on train, reuse on test
    le = LabelEncoder()
    X_train[:, 0] = le.fit_transform(X_train[:, 0])
    X_test[:, 0] = le.transform(X_test[:, 0])

    # One-hot encode the categorical columns at positions 1, 2, 3, 4 and 7;
    # the encoded columns come first in the output, the rest are passed through
    ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [1, 2, 3, 4, 7])], remainder='passthrough')
    X_train = ct.fit_transform(X_train)
    X_test = ct.transform(X_test)

    # Encode the target with its own encoder, fitted on the training labels
    # only. The test labels must go through transform() (not fit_transform())
    # so that both sets share one class -> integer mapping; an unseen test
    # label now raises instead of being silently mapped to a wrong class.
    target_le = LabelEncoder()
    y_train = target_le.fit_transform(y_train)
    y_test = target_le.transform(y_test)

    sc = StandardScaler()

    # Scale all columns except the encoded ones
    # NOTE(review): assumes the one-hot block ends before column 25 - confirm
    # if the set of categories in the data changes
    X_train[:, 25:] = sc.fit_transform(X_train[:, 25:])
    X_test[:, 25:] = sc.transform(X_test[:, 25:])

    # Apply SMOTE to balance the training set (the test set is left untouched)
    sm = SMOTE(random_state=42)
    X_train_resample, y_train_resample = sm.fit_resample(X_train, y_train)

    # Shuffle the resampled data
    X_train_resample, y_train_resample = shuffle(X_train_resample, y_train_resample, random_state=42)

    classifier = RandomForestClassifier(random_state=0, **best_params)
    classifier.fit(X_train_resample, y_train_resample)

    y_pred = classifier.predict(X_test)

    # Not used further in this cell; after the loop it holds the matrix of
    # the last interval
    cm = confusion_matrix(y_test, y_pred)
    # Calculate evaluation metrics (macro-averaged over the classes)
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='macro', zero_division=1)
    recall = recall_score(y_test, y_pred, average='macro')
    f1 = f1_score(y_test, y_pred, average='macro')

    # Append results for this time interval
    results.append([f'{time_interval} min', round(accuracy, 3), round(precision, 3), round(recall, 3), round(f1, 3)])

    # Feature-importance table for this interval, computed on the test set
    coefficients_df = cf.permutation_importance_rf(dt_train, X_test, y_test, classifier, ct)

    dataframes_features[f'coefficients_df_{time_interval}min'] = coefficients_df

# Create a DataFrame for the results
columns = ['Time', 'Accuracy', 'Precision', 'Recall', 'F1 Score']
results_df = pd.DataFrame(results, columns=columns)

# Print the results as a LaTeX table
latex_table = results_df.to_latex(index=False, escape=False)
print(latex_table)

\begin{tabular}{lrrrr}
\toprule
  Time &  Accuracy &  Precision &  Recall &  F1 Score \\
\midrule
 1 min &     0.892 &      0.613 &   0.520 &     0.516 \\
 5 min &     0.893 &      0.642 &   0.529 &     0.531 \\
10 min &     0.891 &      0.642 &   0.540 &     0.550 \\
15 min &     0.887 &      0.629 &   0.546 &     0.557 \\
\bottomrule
\end{tabular}



  latex_table = results_df.to_latex(index=False, escape=False)


In [16]:
# Show the first five rows of each interval's feature-importance table
for time_interval in time_intervals:
    print(
        f'Top 5 features for {time_interval} min',
        dataframes_features[f'coefficients_df_{time_interval}min'].head(5),
        '\n',
        sep='\n',
    )

Top 5 features for 1 min
                         feature  coefficient
0  encoder__age_gen_Baby Boomers     0.002965
1         encoder__age_gen_Gen X     0.002254
2               encoder__day_1.0     0.002142
3               max_theo_payback    -0.001366
4                  min_slotdenom     0.001272


Top 5 features for 5 min
                         feature  coefficient
0  encoder__age_gen_Baby Boomers     0.004210
1         encoder__age_gen_Gen X     0.003040
2                  total_gambles    -0.002376
3                  max_slotdenom     0.001469
4               encoder__day_1.0     0.001459


Top 5 features for 10 min
                         feature  coefficient
0                           #2ws     0.009523
1                        2ws/min     0.005145
2         encoder__age_gen_Gen X     0.003545
3                  total_gambles    -0.003143
4  encoder__age_gen_Baby Boomers     0.002881


Top 5 features for 15 min
         feature  coefficient
0           #2ws     0.010935
1  t

In [17]:
# Move into the September output folder; the relative CSV names below are
# resolved against it. Note that os.chdir changes the working directory for
# every cell executed afterwards.
os.chdir('/Users/mau/Library/CloudStorage/Dropbox/Mac/Documents/Dissertation/Chapter 2/Entire_Data/Feauture Importance/Minutes/RF/September')
# Write one CSV file per time interval
for time_interval in time_intervals:
    print(f'Saving features for {time_interval} min interval')
    frame_key = f'coefficients_df_{time_interval}min'
    dataframes_features[frame_key].to_csv(f'{frame_key}.csv')
    

Saving features for 1 min interval
Saving features for 5 min interval
Saving features for 10 min interval
Saving features for 15 min interval


## October

In [18]:
# Hold-out evaluation for October: train on the other four months, test on
# October, once per time interval, and collect the metrics into a LaTeX table.
months_train = ['4_August', '3_July', '5_September', '2_June' ]
months_test = ['6_October']
# Time intervals in minutes; each one is passed to the loader as '<n>min'
time_intervals = [1, 5, 10, 15]

# One metrics row per interval, and one feature-importance frame per interval
results = []
dataframes_features = {}
for time_interval in time_intervals:
    # Load the datasets of every month for this time interval
    file_name = f'{time_interval}min'
    datasets = cf.load_and_preprocess_datasets_min_ntop(months, data_path, file_name, filter)

    # Create training and test sets; '6_October' maps to the key 'dtf_October'
    dt_train = pd.concat([datasets['dtf'+month[1:]] for month in months_train])
    dt_test = pd.concat([datasets['dtf'+month[1:]] for month in months_test])

    # Separate independent variables (all but the last column) from the
    # dependent variable (last column)
    X_train = dt_train.iloc[:, :-1].values
    y_train = dt_train.iloc[:, -1].values
    X_test = dt_test.iloc[:, :-1].values
    y_test = dt_test.iloc[:, -1].values

    # Binary encode the gender column (column 0): fit on train, reuse on test
    le = LabelEncoder()
    X_train[:, 0] = le.fit_transform(X_train[:, 0])
    X_test[:, 0] = le.transform(X_test[:, 0])

    # One-hot encode the categorical columns at positions 1, 2, 3, 4 and 7;
    # the encoded columns come first in the output, the rest are passed through
    ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [1, 2, 3, 4, 7])], remainder='passthrough')
    X_train = ct.fit_transform(X_train)
    X_test = ct.transform(X_test)

    # Encode the target with its own encoder, fitted on the training labels
    # only. The test labels must go through transform() (not fit_transform())
    # so that both sets share one class -> integer mapping; an unseen test
    # label now raises instead of being silently mapped to a wrong class.
    target_le = LabelEncoder()
    y_train = target_le.fit_transform(y_train)
    y_test = target_le.transform(y_test)

    sc = StandardScaler()

    # Scale all columns except the encoded ones
    # NOTE(review): assumes the one-hot block ends before column 25 - confirm
    # if the set of categories in the data changes
    X_train[:, 25:] = sc.fit_transform(X_train[:, 25:])
    X_test[:, 25:] = sc.transform(X_test[:, 25:])

    # Apply SMOTE to balance the training set (the test set is left untouched)
    sm = SMOTE(random_state=42)
    X_train_resample, y_train_resample = sm.fit_resample(X_train, y_train)

    # Shuffle the resampled data
    X_train_resample, y_train_resample = shuffle(X_train_resample, y_train_resample, random_state=42)

    classifier = RandomForestClassifier(random_state=0, **best_params)
    classifier.fit(X_train_resample, y_train_resample)

    y_pred = classifier.predict(X_test)

    # Not used further in this cell; after the loop it holds the matrix of
    # the last interval
    cm = confusion_matrix(y_test, y_pred)
    # Calculate evaluation metrics (macro-averaged over the classes)
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='macro', zero_division=1)
    recall = recall_score(y_test, y_pred, average='macro')
    f1 = f1_score(y_test, y_pred, average='macro')

    # Append results for this time interval
    results.append([f'{time_interval} min', round(accuracy, 3), round(precision, 3), round(recall, 3), round(f1, 3)])

    # Feature-importance table for this interval, computed on the test set
    coefficients_df = cf.permutation_importance_rf(dt_train, X_test, y_test, classifier, ct)

    dataframes_features[f'coefficients_df_{time_interval}min'] = coefficients_df

# Create a DataFrame for the results
columns = ['Time', 'Accuracy', 'Precision', 'Recall', 'F1 Score']
results_df = pd.DataFrame(results, columns=columns)

# Print the results as a LaTeX table
latex_table = results_df.to_latex(index=False, escape=False)
print(latex_table)

\begin{tabular}{lrrrr}
\toprule
  Time &  Accuracy &  Precision &  Recall &  F1 Score \\
\midrule
 1 min &     0.894 &      0.632 &   0.527 &     0.528 \\
 5 min &     0.894 &      0.647 &   0.536 &     0.543 \\
10 min &     0.892 &      0.642 &   0.543 &     0.554 \\
15 min &     0.892 &      0.651 &   0.556 &     0.572 \\
\bottomrule
\end{tabular}



  latex_table = results_df.to_latex(index=False, escape=False)


In [19]:
# Show the first five rows of each interval's feature-importance table
for time_interval in time_intervals:
    print(
        f'Top 5 features for {time_interval} min',
        dataframes_features[f'coefficients_df_{time_interval}min'].head(5),
        '\n',
        sep='\n',
    )

Top 5 features for 1 min
                         feature  coefficient
0  encoder__age_gen_Baby Boomers     0.003766
1         encoder__age_gen_Gen X     0.003089
2                      max_wager     0.002486
3                   ave_wageramt     0.002090
4                  max_slotdenom     0.001921


Top 5 features for 5 min
                         feature  coefficient
0  encoder__age_gen_Baby Boomers     0.004331
1                             #W    -0.002335
2                     max_profit    -0.002298
3                  total_gambles    -0.001751
4            min_time_per_gamble    -0.001582


Top 5 features for 10 min
                         feature  coefficient
0                           #2ws     0.008230
1                        2ws/min     0.007137
2                  total_gambles    -0.002957
3                             #W    -0.002618
4  encoder__age_gen_Baby Boomers     0.002260


Top 5 features for 15 min
                         feature  coefficient
0                 

In [20]:
# Move into the October output folder; the relative CSV names below are
# resolved against it. Note that os.chdir changes the working directory for
# every cell executed afterwards.
os.chdir('/Users/mau/Library/CloudStorage/Dropbox/Mac/Documents/Dissertation/Chapter 2/Entire_Data/Feauture Importance/Minutes/RF/October')
# Write one CSV file per time interval
for time_interval in time_intervals:
    print(f'Saving features for {time_interval} min interval')
    frame_key = f'coefficients_df_{time_interval}min'
    dataframes_features[frame_key].to_csv(f'{frame_key}.csv')
    

Saving features for 1 min interval
Saving features for 5 min interval
Saving features for 10 min interval
Saving features for 15 min interval
