# Linear SVM (LinearSVC)

## Data Preprocessing

In [1]:
# Importing the libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import os
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
from sklearn.svm import LinearSVC
from imblearn.over_sampling import SMOTE
import random
import counting_fns as cf
from sklearn.utils import shuffle


# Month identifiers handed to the dataset loader (digit prefix + month name)
months = [
    '2_June',
    '3_July',
    '4_August',
    '5_September',
    '6_October',
]

# Root directory of the per-month data files
data_path = "/Users/mau/Library/CloudStorage/Dropbox/Mac/Documents/Dissertation/Chapter 2/Entire_Data/By month"

# Columns to keep, passed to the loader by the training cells.
# NOTE: this name shadows Python's builtin `filter`; it is kept because later
# cells reference it by this name.
# NOTE(review): later cells treat the last column of the loaded frames as the
# target, which presumably corresponds to 'classification' being last here - confirm
# against the loader.
filter = [
    'session_time', 'gender', 'age_gen', 'day', 'timeofday',
    'first_outcome', 'first_wager', 'first_p/b',
    'last_outcome', 'last_wager', 'last_p/b',
    'beginning_amt', 'ending_amt', 'ending_balance',
    'ave_slotdenom', 'std_slotdenom', 'min_slotdenom', 'max_slotdenom',
    'ave_theo_payback', 'min_theo_payback', 'max_theo_payback',
    'ave_wageramt', 'std_wageramt', 'min_wager', 'max_wager',
    'ave_p/b', 'std_p/b', 'max_p/b', 'max_profit', 'depletion_slope',
    '#inc_slotdenom', '#dec_slotdenom', '#inc_maxbet', '#dec_maxbet',
    '#W', '#L', '#NH', '#D',
    'w/min', 'l/min',
    '#2ws', '2ws_profit', '2ws_wgramt', '2ws/min',
    '#3ws', '3ws_profit', '3ws_wgramt', '3ws/min',
    '#4ws', '4ws_profit', '4ws_wgramt', '4ws/min',
    'w/g', 'l/g', 'nh/g', 'd/g',
    'ave_time_per_gamble', 'min_time_per_gamble', 'max_time_per_gamble',
    'total_gambles', 'machines_changes', 'unique_machines',
    'ave_time_per_machine', 'classification',
]

In [2]:
# Linear SVM hyper-parameters; the training cells unpack this dict into
# LinearSVC(random_state=0, **best_params).
best_params = dict(C=1.0, dual=False, penalty='l2', tol=1e-5)

## June

In [3]:
# Leave-one-month-out fold: train on the other four months, test on June.
months_train = ['4_August', '3_July', '5_September', '6_October']
months_test = ['2_June']
# Time intervals in minutes; each one selects the '<n>min' dataset file
time_intervals = [1, 5, 10, 15]

# One metrics row and one coefficient table per time interval
results = []
dataframes_features = {}
for time_interval in time_intervals:
    # Load the datasets for this time interval
    file_name = f'{time_interval}min'
    datasets = cf.load_and_preprocess_datasets_min_ntop(months, data_path, file_name, filter)

    # Create training and test sets; keys are 'dtf' + the month name without
    # its leading digit (e.g. '2_June' -> 'dtf_June')
    dt_train = pd.concat([datasets['dtf'+month[1:]] for month in months_train])
    dt_test = pd.concat([datasets['dtf'+month[1:]] for month in months_test])

    # Separate independent variables (all but last column) and target (last column)
    X_train = dt_train.iloc[:, :-1].values
    y_train = dt_train.iloc[:, -1].values
    X_test = dt_test.iloc[:, :-1].values
    y_test = dt_test.iloc[:, -1].values

    # Binary-encode the gender column (column 0): fit on train, reuse on test
    le = LabelEncoder()
    X_train[:, 0] = le.fit_transform(X_train[:, 0])
    X_test[:, 0] = le.transform(X_test[:, 0])

    # One-hot encode the categorical columns at indices 1, 2, 3, 4 and 7
    # (age generation, first/last outcome, time of day, ... - TODO confirm the
    # exact index-to-column mapping against the loader's column order)
    ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [1, 2, 3, 4, 7])], remainder='passthrough')
    X_train = ct.fit_transform(X_train)
    X_test = ct.transform(X_test)

    # Encode the target. `le` is refit here on the training labels, and the test
    # labels MUST go through transform() (not fit_transform()) so both splits
    # share one label -> integer mapping. Refitting on y_test could silently
    # assign different integers to the same class and corrupt every metric.
    y_train = le.fit_transform(y_train)
    y_test = le.transform(y_test)

    sc = StandardScaler()

    # Standardise columns from index 25 onward, using training statistics only.
    # NOTE(review): 25 is assumed to be the first non-encoded column after the
    # one-hot block and gender - confirm it matches the encoder's output width.
    X_train[:, 25:] = sc.fit_transform(X_train[:, 25:])
    X_test[:, 25:] = sc.transform(X_test[:, 25:])

    # Apply SMOTE to balance the training set only (the test set is left as-is)
    sm = SMOTE(random_state=42)
    X_train_resample, y_train_resample = sm.fit_resample(X_train, y_train)

    # Shuffle the resampled data
    X_train_resample, y_train_resample = shuffle(X_train_resample, y_train_resample, random_state=42)

    classifier = LinearSVC(random_state = 0, **best_params)
    classifier.fit(X_train_resample, y_train_resample)

    y_pred = classifier.predict(X_test)

    # Confusion matrix is not used below; kept for interactive inspection
    cm = confusion_matrix(y_test, y_pred)
    # Calculate evaluation metrics (macro-averaged over classes)
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='macro', zero_division=1)
    recall = recall_score(y_test, y_pred, average='macro')
    f1 = f1_score(y_test, y_pred, average='macro')

    # Append results for this time interval
    results.append([f'{time_interval} min', round(accuracy, 3), round(precision, 3), round(recall, 3), round(f1, 3)])

    # Feature frame without the target, used only for its column names
    dataset= dt_train.drop(['classification'], axis=1)

    # Get the feature names without 'remainder__' prefix
    feature_names = cf.get_feature_names_without_prefix(ct, input_features=dataset.columns)

    # NOTE(review): only the first row of coef_ is used; with more than two
    # classes the remaining rows would be ignored - confirm the target is binary.
    coefficients = classifier.coef_[0]

    # Pair each coefficient with its feature name
    coefficients_df = pd.DataFrame({'Feature': feature_names, 'Coefficient': coefficients})

    # Sort the coefficients by magnitude, largest first
    coefficients_df['Abs_Coefficient'] = np.abs(coefficients_df['Coefficient'])
    coefficients_df = coefficients_df.sort_values(by='Abs_Coefficient', ascending=False)

    dataframes_features[f'coefficients_df_{time_interval}min'] = coefficients_df


# Create a DataFrame for the results
columns = ['Time', 'Accuracy', 'Precision', 'Recall', 'F1 Score']
results_df = pd.DataFrame(results, columns=columns)

# Print the results as a LaTeX table
latex_table = results_df.to_latex(index=False, escape=False)
print(latex_table)

\begin{tabular}{lrrrr}
\toprule
  Time &  Accuracy &  Precision &  Recall &  F1 Score \\
\midrule
 1 min &     0.588 &      0.514 &   0.534 &     0.460 \\
 5 min &     0.585 &      0.543 &   0.610 &     0.481 \\
10 min &     0.612 &      0.549 &   0.625 &     0.500 \\
15 min &     0.646 &      0.571 &   0.676 &     0.533 \\
\bottomrule
\end{tabular}



  latex_table = results_df.to_latex(index=False, escape=False)


In [4]:
# Show the five largest-magnitude coefficients for every time interval.
# The stored tables are already sorted by Abs_Coefficient (descending).
for time_interval in time_intervals:
    print(f'Top 5 features for {time_interval} min')
    top_features = dataframes_features[f'coefficients_df_{time_interval}min'].head(5)
    print(top_features)
    print('\n')

Top 5 features for 1 min
                    Feature  Coefficient  Abs_Coefficient
2    encoder__age_gen_Gen Z    -0.574967         0.574967
35            max_slotdenom    -0.574500         0.574500
34            min_slotdenom     0.433347         0.433347
37         min_theo_payback     0.412006         0.412006
4   encoder__age_gen_Silent     0.326525         0.326525


Top 5 features for 5 min
                          Feature  Coefficient  Abs_Coefficient
2          encoder__age_gen_Gen Z    -0.516098         0.516098
4         encoder__age_gen_Silent     0.326626         0.326626
3     encoder__age_gen_Millenials    -0.247121         0.247121
0   encoder__age_gen_Baby Boomers     0.229988         0.229988
37               min_theo_payback     0.199939         0.199939


Top 5 features for 10 min
                        Feature  Coefficient  Abs_Coefficient
2        encoder__age_gen_Gen Z    -0.454725         0.454725
4       encoder__age_gen_Silent     0.296644         0.296644
36

In [5]:
# Directory that receives the June fold's coefficient tables
output_dir = '/Users/mau/Library/CloudStorage/Dropbox/Mac/Documents/Dissertation/Chapter 2/Entire_Data/Feauture Importance/Minutes/SVM/June'
# Save dataframes to csv files. The target path is built explicitly instead of
# calling os.chdir(), so the notebook's working directory is not changed as a
# side effect of saving.
for time_interval in time_intervals:
    print(f'Saving features for {time_interval} min interval')
    csv_path = os.path.join(output_dir, f'coefficients_df_{time_interval}min.csv')
    dataframes_features[f'coefficients_df_{time_interval}min'].to_csv(csv_path)
    

Saving features for 1 min interval
Saving features for 5 min interval
Saving features for 10 min interval
Saving features for 15 min interval


## July

In [6]:
# Leave-one-month-out fold: train on the other four months, test on July.
months_train = ['4_August', '2_June', '5_September', '6_October']
months_test = ['3_July']
# Time intervals in minutes; each one selects the '<n>min' dataset file
time_intervals = [1, 5, 10, 15]

# One metrics row and one coefficient table per time interval
results = []
dataframes_features = {}
for time_interval in time_intervals:
    # Load the datasets for this time interval
    file_name = f'{time_interval}min'
    datasets = cf.load_and_preprocess_datasets_min_ntop(months, data_path, file_name, filter)

    # Create training and test sets; keys are 'dtf' + the month name without
    # its leading digit (e.g. '3_July' -> 'dtf_July')
    dt_train = pd.concat([datasets['dtf'+month[1:]] for month in months_train])
    dt_test = pd.concat([datasets['dtf'+month[1:]] for month in months_test])

    # Separate independent variables (all but last column) and target (last column)
    X_train = dt_train.iloc[:, :-1].values
    y_train = dt_train.iloc[:, -1].values
    X_test = dt_test.iloc[:, :-1].values
    y_test = dt_test.iloc[:, -1].values

    # Binary-encode the gender column (column 0): fit on train, reuse on test
    le = LabelEncoder()
    X_train[:, 0] = le.fit_transform(X_train[:, 0])
    X_test[:, 0] = le.transform(X_test[:, 0])

    # One-hot encode the categorical columns at indices 1, 2, 3, 4 and 7
    # (age generation, first/last outcome, time of day, ... - TODO confirm the
    # exact index-to-column mapping against the loader's column order)
    ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [1, 2, 3, 4, 7])], remainder='passthrough')
    X_train = ct.fit_transform(X_train)
    X_test = ct.transform(X_test)

    # Encode the target. `le` is refit here on the training labels, and the test
    # labels MUST go through transform() (not fit_transform()) so both splits
    # share one label -> integer mapping. Refitting on y_test could silently
    # assign different integers to the same class and corrupt every metric.
    y_train = le.fit_transform(y_train)
    y_test = le.transform(y_test)

    sc = StandardScaler()

    # Standardise columns from index 25 onward, using training statistics only.
    # NOTE(review): 25 is assumed to be the first non-encoded column after the
    # one-hot block and gender - confirm it matches the encoder's output width.
    X_train[:, 25:] = sc.fit_transform(X_train[:, 25:])
    X_test[:, 25:] = sc.transform(X_test[:, 25:])

    # Apply SMOTE to balance the training set only (the test set is left as-is)
    sm = SMOTE(random_state=42)
    X_train_resample, y_train_resample = sm.fit_resample(X_train, y_train)

    # Shuffle the resampled data
    X_train_resample, y_train_resample = shuffle(X_train_resample, y_train_resample, random_state=42)

    classifier = LinearSVC(random_state = 0, **best_params)
    classifier.fit(X_train_resample, y_train_resample)

    y_pred = classifier.predict(X_test)

    # Confusion matrix is not used below; kept for interactive inspection
    cm = confusion_matrix(y_test, y_pred)
    # Calculate evaluation metrics (macro-averaged over classes)
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='macro', zero_division=1)
    recall = recall_score(y_test, y_pred, average='macro')
    f1 = f1_score(y_test, y_pred, average='macro')

    # Append results for this time interval
    results.append([f'{time_interval} min', round(accuracy, 3), round(precision, 3), round(recall, 3), round(f1, 3)])

    # Feature frame without the target, used only for its column names
    dataset= dt_train.drop(['classification'], axis=1)

    # Get the feature names without 'remainder__' prefix
    feature_names = cf.get_feature_names_without_prefix(ct, input_features=dataset.columns)

    # NOTE(review): only the first row of coef_ is used; with more than two
    # classes the remaining rows would be ignored - confirm the target is binary.
    coefficients = classifier.coef_[0]

    # Pair each coefficient with its feature name
    coefficients_df = pd.DataFrame({'Feature': feature_names, 'Coefficient': coefficients})

    # Sort the coefficients by magnitude, largest first
    coefficients_df['Abs_Coefficient'] = np.abs(coefficients_df['Coefficient'])
    coefficients_df = coefficients_df.sort_values(by='Abs_Coefficient', ascending=False)

    dataframes_features[f'coefficients_df_{time_interval}min'] = coefficients_df


# Create a DataFrame for the results
columns = ['Time', 'Accuracy', 'Precision', 'Recall', 'F1 Score']
results_df = pd.DataFrame(results, columns=columns)

# Print the results as a LaTeX table
latex_table = results_df.to_latex(index=False, escape=False)
print(latex_table)

\begin{tabular}{lrrrr}
\toprule
  Time &  Accuracy &  Precision &  Recall &  F1 Score \\
\midrule
 1 min &     0.632 &      0.548 &   0.626 &     0.505 \\
 5 min &     0.671 &      0.565 &   0.665 &     0.537 \\
10 min &     0.696 &      0.578 &   0.693 &     0.560 \\
15 min &     0.717 &      0.585 &   0.702 &     0.575 \\
\bottomrule
\end{tabular}



  latex_table = results_df.to_latex(index=False, escape=False)


In [7]:
# Show the five largest-magnitude coefficients for every time interval.
# The stored tables are already sorted by Abs_Coefficient (descending).
for time_interval in time_intervals:
    print(f'Top 5 features for {time_interval} min')
    top_features = dataframes_features[f'coefficients_df_{time_interval}min'].head(5)
    print(top_features)
    print('\n')

Top 5 features for 1 min
                          Feature  Coefficient  Abs_Coefficient
2          encoder__age_gen_Gen Z    -0.530661         0.530661
4         encoder__age_gen_Silent     0.331297         0.331297
3     encoder__age_gen_Millenials    -0.289336         0.289336
0   encoder__age_gen_Baby Boomers     0.278834         0.278834
35                  max_slotdenom    -0.234882         0.234882


Top 5 features for 5 min
                          Feature  Coefficient  Abs_Coefficient
2          encoder__age_gen_Gen Z    -0.453895         0.453895
4         encoder__age_gen_Silent     0.324802         0.324802
3     encoder__age_gen_Millenials    -0.278212         0.278212
37               min_theo_payback     0.226648         0.226648
0   encoder__age_gen_Baby Boomers     0.223903         0.223903


Top 5 features for 10 min
                          Feature  Coefficient  Abs_Coefficient
2          encoder__age_gen_Gen Z    -0.379948         0.379948
4         encoder__age_g

In [8]:
# Directory that receives the July fold's coefficient tables
output_dir = '/Users/mau/Library/CloudStorage/Dropbox/Mac/Documents/Dissertation/Chapter 2/Entire_Data/Feauture Importance/Minutes/SVM/July'
# Save dataframes to csv files. The target path is built explicitly instead of
# calling os.chdir(), so the notebook's working directory is not changed as a
# side effect of saving.
for time_interval in time_intervals:
    print(f'Saving features for {time_interval} min interval')
    csv_path = os.path.join(output_dir, f'coefficients_df_{time_interval}min.csv')
    dataframes_features[f'coefficients_df_{time_interval}min'].to_csv(csv_path)
    

Saving features for 1 min interval
Saving features for 5 min interval
Saving features for 10 min interval
Saving features for 15 min interval


## August

In [9]:
# Leave-one-month-out fold: train on the other four months, test on August.
months_train = ['2_June', '3_July', '5_September', '6_October']
months_test = ['4_August']
# Time intervals in minutes; each one selects the '<n>min' dataset file
time_intervals = [1, 5, 10, 15]

# One metrics row and one coefficient table per time interval
results = []
dataframes_features = {}
for time_interval in time_intervals:
    # Load the datasets for this time interval
    file_name = f'{time_interval}min'
    datasets = cf.load_and_preprocess_datasets_min_ntop(months, data_path, file_name, filter)

    # Create training and test sets; keys are 'dtf' + the month name without
    # its leading digit (e.g. '4_August' -> 'dtf_August')
    dt_train = pd.concat([datasets['dtf'+month[1:]] for month in months_train])
    dt_test = pd.concat([datasets['dtf'+month[1:]] for month in months_test])

    # Separate independent variables (all but last column) and target (last column)
    X_train = dt_train.iloc[:, :-1].values
    y_train = dt_train.iloc[:, -1].values
    X_test = dt_test.iloc[:, :-1].values
    y_test = dt_test.iloc[:, -1].values

    # Binary-encode the gender column (column 0): fit on train, reuse on test
    le = LabelEncoder()
    X_train[:, 0] = le.fit_transform(X_train[:, 0])
    X_test[:, 0] = le.transform(X_test[:, 0])

    # One-hot encode the categorical columns at indices 1, 2, 3, 4 and 7
    # (age generation, first/last outcome, time of day, ... - TODO confirm the
    # exact index-to-column mapping against the loader's column order)
    ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [1, 2, 3, 4, 7])], remainder='passthrough')
    X_train = ct.fit_transform(X_train)
    X_test = ct.transform(X_test)

    # Encode the target. `le` is refit here on the training labels, and the test
    # labels MUST go through transform() (not fit_transform()) so both splits
    # share one label -> integer mapping. Refitting on y_test could silently
    # assign different integers to the same class and corrupt every metric.
    y_train = le.fit_transform(y_train)
    y_test = le.transform(y_test)

    sc = StandardScaler()

    # Standardise columns from index 25 onward, using training statistics only.
    # NOTE(review): 25 is assumed to be the first non-encoded column after the
    # one-hot block and gender - confirm it matches the encoder's output width.
    X_train[:, 25:] = sc.fit_transform(X_train[:, 25:])
    X_test[:, 25:] = sc.transform(X_test[:, 25:])

    # Apply SMOTE to balance the training set only (the test set is left as-is)
    sm = SMOTE(random_state=42)
    X_train_resample, y_train_resample = sm.fit_resample(X_train, y_train)

    # Shuffle the resampled data
    X_train_resample, y_train_resample = shuffle(X_train_resample, y_train_resample, random_state=42)

    classifier = LinearSVC(random_state = 0, **best_params)
    classifier.fit(X_train_resample, y_train_resample)

    y_pred = classifier.predict(X_test)

    # Confusion matrix is not used below; kept for interactive inspection
    cm = confusion_matrix(y_test, y_pred)
    # Calculate evaluation metrics (macro-averaged over classes)
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='macro', zero_division=1)
    recall = recall_score(y_test, y_pred, average='macro')
    f1 = f1_score(y_test, y_pred, average='macro')

    # Append results for this time interval
    results.append([f'{time_interval} min', round(accuracy, 3), round(precision, 3), round(recall, 3), round(f1, 3)])

    # Feature frame without the target, used only for its column names
    dataset= dt_train.drop(['classification'], axis=1)

    # Get the feature names without 'remainder__' prefix
    feature_names = cf.get_feature_names_without_prefix(ct, input_features=dataset.columns)

    # NOTE(review): only the first row of coef_ is used; with more than two
    # classes the remaining rows would be ignored - confirm the target is binary.
    coefficients = classifier.coef_[0]

    # Pair each coefficient with its feature name
    coefficients_df = pd.DataFrame({'Feature': feature_names, 'Coefficient': coefficients})

    # Sort the coefficients by magnitude, largest first
    coefficients_df['Abs_Coefficient'] = np.abs(coefficients_df['Coefficient'])
    coefficients_df = coefficients_df.sort_values(by='Abs_Coefficient', ascending=False)

    dataframes_features[f'coefficients_df_{time_interval}min'] = coefficients_df


# Create a DataFrame for the results
columns = ['Time', 'Accuracy', 'Precision', 'Recall', 'F1 Score']
results_df = pd.DataFrame(results, columns=columns)

# Print the results as a LaTeX table
latex_table = results_df.to_latex(index=False, escape=False)
print(latex_table)

\begin{tabular}{lrrrr}
\toprule
  Time &  Accuracy &  Precision &  Recall &  F1 Score \\
\midrule
 1 min &     0.639 &      0.548 &   0.623 &     0.508 \\
 5 min &     0.668 &      0.565 &   0.665 &     0.536 \\
10 min &     0.697 &      0.578 &   0.693 &     0.560 \\
15 min &     0.716 &      0.585 &   0.705 &     0.575 \\
\bottomrule
\end{tabular}



  latex_table = results_df.to_latex(index=False, escape=False)


In [10]:
# Show the five largest-magnitude coefficients for every time interval.
# The stored tables are already sorted by Abs_Coefficient (descending).
for time_interval in time_intervals:
    print(f'Top 5 features for {time_interval} min')
    top_features = dataframes_features[f'coefficients_df_{time_interval}min'].head(5)
    print(top_features)
    print('\n')

Top 5 features for 1 min
                   Feature  Coefficient  Abs_Coefficient
35           max_slotdenom    -1.504112         1.504112
34           min_slotdenom     0.944080         0.944080
2   encoder__age_gen_Gen Z    -0.701754         0.701754
42               max_wager    -0.576905         0.576905
37        min_theo_payback     0.501846         0.501846


Top 5 features for 5 min
                          Feature  Coefficient  Abs_Coefficient
2          encoder__age_gen_Gen Z    -0.655895         0.655895
4         encoder__age_gen_Silent     0.383845         0.383845
0   encoder__age_gen_Baby Boomers     0.245356         0.245356
3     encoder__age_gen_Millenials    -0.199684         0.199684
37               min_theo_payback     0.151562         0.151562


Top 5 features for 10 min
                          Feature  Coefficient  Abs_Coefficient
2          encoder__age_gen_Gen Z    -0.621159         0.621159
4         encoder__age_gen_Silent     0.363924         0.363924
0 

In [11]:
# Directory that receives the August fold's coefficient tables
output_dir = '/Users/mau/Library/CloudStorage/Dropbox/Mac/Documents/Dissertation/Chapter 2/Entire_Data/Feauture Importance/Minutes/SVM/August'
# Save dataframes to csv files. The target path is built explicitly instead of
# calling os.chdir(), so the notebook's working directory is not changed as a
# side effect of saving.
for time_interval in time_intervals:
    print(f'Saving features for {time_interval} min interval')
    csv_path = os.path.join(output_dir, f'coefficients_df_{time_interval}min.csv')
    dataframes_features[f'coefficients_df_{time_interval}min'].to_csv(csv_path)
    

Saving features for 1 min interval
Saving features for 5 min interval
Saving features for 10 min interval
Saving features for 15 min interval


## September

In [12]:
# Leave-one-month-out fold: train on the other four months, test on September.
months_train = ['2_June', '3_July', '4_August', '6_October']
months_test = ['5_September']
# Time intervals in minutes; each one selects the '<n>min' dataset file
time_intervals = [1, 5, 10, 15]

# One metrics row and one coefficient table per time interval
results = []
dataframes_features = {}
for time_interval in time_intervals:
    # Load the datasets for this time interval
    file_name = f'{time_interval}min'
    datasets = cf.load_and_preprocess_datasets_min_ntop(months, data_path, file_name, filter)

    # Create training and test sets; keys are 'dtf' + the month name without
    # its leading digit (e.g. '5_September' -> 'dtf_September')
    dt_train = pd.concat([datasets['dtf'+month[1:]] for month in months_train])
    dt_test = pd.concat([datasets['dtf'+month[1:]] for month in months_test])

    # Separate independent variables (all but last column) and target (last column)
    X_train = dt_train.iloc[:, :-1].values
    y_train = dt_train.iloc[:, -1].values
    X_test = dt_test.iloc[:, :-1].values
    y_test = dt_test.iloc[:, -1].values

    # Binary-encode the gender column (column 0): fit on train, reuse on test
    le = LabelEncoder()
    X_train[:, 0] = le.fit_transform(X_train[:, 0])
    X_test[:, 0] = le.transform(X_test[:, 0])

    # One-hot encode the categorical columns at indices 1, 2, 3, 4 and 7
    # (age generation, first/last outcome, time of day, ... - TODO confirm the
    # exact index-to-column mapping against the loader's column order)
    ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [1, 2, 3, 4, 7])], remainder='passthrough')
    X_train = ct.fit_transform(X_train)
    X_test = ct.transform(X_test)

    # Encode the target. `le` is refit here on the training labels, and the test
    # labels MUST go through transform() (not fit_transform()) so both splits
    # share one label -> integer mapping. Refitting on y_test could silently
    # assign different integers to the same class and corrupt every metric.
    y_train = le.fit_transform(y_train)
    y_test = le.transform(y_test)

    sc = StandardScaler()

    # Standardise columns from index 25 onward, using training statistics only.
    # NOTE(review): 25 is assumed to be the first non-encoded column after the
    # one-hot block and gender - confirm it matches the encoder's output width.
    X_train[:, 25:] = sc.fit_transform(X_train[:, 25:])
    X_test[:, 25:] = sc.transform(X_test[:, 25:])

    # Apply SMOTE to balance the training set only (the test set is left as-is)
    sm = SMOTE(random_state=42)
    X_train_resample, y_train_resample = sm.fit_resample(X_train, y_train)

    # Shuffle the resampled data
    X_train_resample, y_train_resample = shuffle(X_train_resample, y_train_resample, random_state=42)

    classifier = LinearSVC(random_state = 0, **best_params)
    classifier.fit(X_train_resample, y_train_resample)

    y_pred = classifier.predict(X_test)

    # Confusion matrix is not used below; kept for interactive inspection
    cm = confusion_matrix(y_test, y_pred)
    # Calculate evaluation metrics (macro-averaged over classes)
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='macro', zero_division=1)
    recall = recall_score(y_test, y_pred, average='macro')
    f1 = f1_score(y_test, y_pred, average='macro')

    # Append results for this time interval
    results.append([f'{time_interval} min', round(accuracy, 3), round(precision, 3), round(recall, 3), round(f1, 3)])

    # Feature frame without the target, used only for its column names
    dataset= dt_train.drop(['classification'], axis=1)

    # Get the feature names without 'remainder__' prefix
    feature_names = cf.get_feature_names_without_prefix(ct, input_features=dataset.columns)

    # NOTE(review): only the first row of coef_ is used; with more than two
    # classes the remaining rows would be ignored - confirm the target is binary.
    coefficients = classifier.coef_[0]

    # Pair each coefficient with its feature name
    coefficients_df = pd.DataFrame({'Feature': feature_names, 'Coefficient': coefficients})

    # Sort the coefficients by magnitude, largest first
    coefficients_df['Abs_Coefficient'] = np.abs(coefficients_df['Coefficient'])
    coefficients_df = coefficients_df.sort_values(by='Abs_Coefficient', ascending=False)

    dataframes_features[f'coefficients_df_{time_interval}min'] = coefficients_df


# Create a DataFrame for the results
columns = ['Time', 'Accuracy', 'Precision', 'Recall', 'F1 Score']
results_df = pd.DataFrame(results, columns=columns)

# Print the results as a LaTeX table
latex_table = results_df.to_latex(index=False, escape=False)
print(latex_table)

\begin{tabular}{lrrrr}
\toprule
  Time &  Accuracy &  Precision &  Recall &  F1 Score \\
\midrule
 1 min &     0.610 &      0.548 &   0.627 &     0.495 \\
 5 min &     0.642 &      0.561 &   0.659 &     0.521 \\
10 min &     0.669 &      0.571 &   0.679 &     0.542 \\
15 min &     0.692 &      0.580 &   0.697 &     0.560 \\
\bottomrule
\end{tabular}



  latex_table = results_df.to_latex(index=False, escape=False)


In [13]:
# Show the five largest-magnitude coefficients for every time interval.
# The stored tables are already sorted by Abs_Coefficient (descending).
for time_interval in time_intervals:
    print(f'Top 5 features for {time_interval} min')
    top_features = dataframes_features[f'coefficients_df_{time_interval}min'].head(5)
    print(top_features)
    print('\n')

Top 5 features for 1 min
                   Feature  Coefficient  Abs_Coefficient
36        ave_theo_payback    -1.039382         1.039382
37        min_theo_payback     0.777527         0.777527
2   encoder__age_gen_Gen Z    -0.555548         0.555548
35           max_slotdenom    -0.451831         0.451831
38        max_theo_payback     0.451432         0.451432


Top 5 features for 5 min
                          Feature  Coefficient  Abs_Coefficient
2          encoder__age_gen_Gen Z    -0.476444         0.476444
4         encoder__age_gen_Silent     0.283215         0.283215
3     encoder__age_gen_Millenials    -0.237062         0.237062
0   encoder__age_gen_Baby Boomers     0.224177         0.224177
36               ave_theo_payback     0.173490         0.173490


Top 5 features for 10 min
                        Feature  Coefficient  Abs_Coefficient
36             ave_theo_payback     0.442502         0.442502
2        encoder__age_gen_Gen Z    -0.424177         0.424177
4       

In [14]:
# Directory that receives the September fold's coefficient tables
output_dir = '/Users/mau/Library/CloudStorage/Dropbox/Mac/Documents/Dissertation/Chapter 2/Entire_Data/Feauture Importance/Minutes/SVM/September'
# Save dataframes to csv files. The target path is built explicitly instead of
# calling os.chdir(), so the notebook's working directory is not changed as a
# side effect of saving.
for time_interval in time_intervals:
    print(f'Saving features for {time_interval} min interval')
    csv_path = os.path.join(output_dir, f'coefficients_df_{time_interval}min.csv')
    dataframes_features[f'coefficients_df_{time_interval}min'].to_csv(csv_path)
    

Saving features for 1 min interval
Saving features for 5 min interval
Saving features for 10 min interval
Saving features for 15 min interval


## October

In [15]:
# Leave-one-month-out fold: train on the other four months, test on October.
months_train = ['2_June', '3_July', '4_August', '5_September']
months_test = ['6_October']
# Time intervals in minutes; each one selects the '<n>min' dataset file
time_intervals = [1, 5, 10, 15]

# One metrics row and one coefficient table per time interval
results = []
dataframes_features = {}
for time_interval in time_intervals:
    # Load the datasets for this time interval
    file_name = f'{time_interval}min'
    datasets = cf.load_and_preprocess_datasets_min_ntop(months, data_path, file_name, filter)

    # Create training and test sets; keys are 'dtf' + the month name without
    # its leading digit (e.g. '6_October' -> 'dtf_October')
    dt_train = pd.concat([datasets['dtf'+month[1:]] for month in months_train])
    dt_test = pd.concat([datasets['dtf'+month[1:]] for month in months_test])

    # Separate independent variables (all but last column) and target (last column)
    X_train = dt_train.iloc[:, :-1].values
    y_train = dt_train.iloc[:, -1].values
    X_test = dt_test.iloc[:, :-1].values
    y_test = dt_test.iloc[:, -1].values

    # Binary-encode the gender column (column 0): fit on train, reuse on test
    le = LabelEncoder()
    X_train[:, 0] = le.fit_transform(X_train[:, 0])
    X_test[:, 0] = le.transform(X_test[:, 0])

    # One-hot encode the categorical columns at indices 1, 2, 3, 4 and 7
    # (age generation, first/last outcome, time of day, ... - TODO confirm the
    # exact index-to-column mapping against the loader's column order)
    ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [1, 2, 3, 4, 7])], remainder='passthrough')
    X_train = ct.fit_transform(X_train)
    X_test = ct.transform(X_test)

    # Encode the target. `le` is refit here on the training labels, and the test
    # labels MUST go through transform() (not fit_transform()) so both splits
    # share one label -> integer mapping. Refitting on y_test could silently
    # assign different integers to the same class and corrupt every metric.
    y_train = le.fit_transform(y_train)
    y_test = le.transform(y_test)

    sc = StandardScaler()

    # Standardise columns from index 25 onward, using training statistics only.
    # NOTE(review): 25 is assumed to be the first non-encoded column after the
    # one-hot block and gender - confirm it matches the encoder's output width.
    X_train[:, 25:] = sc.fit_transform(X_train[:, 25:])
    X_test[:, 25:] = sc.transform(X_test[:, 25:])

    # Apply SMOTE to balance the training set only (the test set is left as-is)
    sm = SMOTE(random_state=42)
    X_train_resample, y_train_resample = sm.fit_resample(X_train, y_train)

    # Shuffle the resampled data
    X_train_resample, y_train_resample = shuffle(X_train_resample, y_train_resample, random_state=42)

    classifier = LinearSVC(random_state = 0, **best_params)
    classifier.fit(X_train_resample, y_train_resample)

    y_pred = classifier.predict(X_test)

    # Confusion matrix is not used below; kept for interactive inspection
    cm = confusion_matrix(y_test, y_pred)
    # Calculate evaluation metrics (macro-averaged over classes)
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='macro', zero_division=1)
    recall = recall_score(y_test, y_pred, average='macro')
    f1 = f1_score(y_test, y_pred, average='macro')

    # Append results for this time interval
    results.append([f'{time_interval} min', round(accuracy, 3), round(precision, 3), round(recall, 3), round(f1, 3)])

    # Feature frame without the target, used only for its column names
    dataset= dt_train.drop(['classification'], axis=1)

    # Get the feature names without 'remainder__' prefix
    feature_names = cf.get_feature_names_without_prefix(ct, input_features=dataset.columns)

    # NOTE(review): only the first row of coef_ is used; with more than two
    # classes the remaining rows would be ignored - confirm the target is binary.
    coefficients = classifier.coef_[0]

    # Pair each coefficient with its feature name
    coefficients_df = pd.DataFrame({'Feature': feature_names, 'Coefficient': coefficients})

    # Sort the coefficients by magnitude, largest first
    coefficients_df['Abs_Coefficient'] = np.abs(coefficients_df['Coefficient'])
    coefficients_df = coefficients_df.sort_values(by='Abs_Coefficient', ascending=False)

    dataframes_features[f'coefficients_df_{time_interval}min'] = coefficients_df


# Create a DataFrame for the results
columns = ['Time', 'Accuracy', 'Precision', 'Recall', 'F1 Score']
results_df = pd.DataFrame(results, columns=columns)

# Print the results as a LaTeX table
latex_table = results_df.to_latex(index=False, escape=False)
print(latex_table)

\begin{tabular}{lrrrr}
\toprule
  Time &  Accuracy &  Precision &  Recall &  F1 Score \\
\midrule
 1 min &     0.568 &      0.542 &   0.615 &     0.469 \\
 5 min &     0.607 &      0.552 &   0.641 &     0.496 \\
10 min &     0.643 &      0.566 &   0.675 &     0.524 \\
15 min &     0.673 &      0.575 &   0.694 &     0.547 \\
\bottomrule
\end{tabular}



  latex_table = results_df.to_latex(index=False, escape=False)


In [16]:
# Show the five largest-magnitude coefficients for every time interval.
# The stored tables are already sorted by Abs_Coefficient (descending).
for time_interval in time_intervals:
    print(f'Top 5 features for {time_interval} min')
    top_features = dataframes_features[f'coefficients_df_{time_interval}min'].head(5)
    print(top_features)
    print('\n')

Top 5 features for 1 min
                    Feature  Coefficient  Abs_Coefficient
38         max_theo_payback    -0.593478         0.593478
2    encoder__age_gen_Gen Z    -0.574659         0.574659
35            max_slotdenom    -0.461322         0.461322
37         min_theo_payback     0.450136         0.450136
4   encoder__age_gen_Silent     0.359091         0.359091


Top 5 features for 5 min
                          Feature  Coefficient  Abs_Coefficient
2          encoder__age_gen_Gen Z    -0.510054         0.510054
4         encoder__age_gen_Silent     0.308364         0.308364
37               min_theo_payback     0.290377         0.290377
3     encoder__age_gen_Millenials    -0.250996         0.250996
0   encoder__age_gen_Baby Boomers     0.233037         0.233037


Top 5 features for 10 min
                          Feature  Coefficient  Abs_Coefficient
2          encoder__age_gen_Gen Z    -0.436745         0.436745
4         encoder__age_gen_Silent     0.286735         0.286

In [17]:
# Directory that receives the October fold's coefficient tables
output_dir = '/Users/mau/Library/CloudStorage/Dropbox/Mac/Documents/Dissertation/Chapter 2/Entire_Data/Feauture Importance/Minutes/SVM/October'
# Save dataframes to csv files. The target path is built explicitly instead of
# calling os.chdir(), so the notebook's working directory is not changed as a
# side effect of saving.
for time_interval in time_intervals:
    print(f'Saving features for {time_interval} min interval')
    csv_path = os.path.join(output_dir, f'coefficients_df_{time_interval}min.csv')
    dataframes_features[f'coefficients_df_{time_interval}min'].to_csv(csv_path)
    

Saving features for 1 min interval
Saving features for 5 min interval
Saving features for 10 min interval
Saving features for 15 min interval
