# Decision-Tree

## Data Preprocessing

In [25]:
# Importing the libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import os
import counting_fns as cf
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import LabelEncoder


# List of all the months
months = ['2_June', '3_July', '4_August', '5_September', '6_October']

# Define the path to the data directory and columns to keep
data_path = "/Users/mau/Library/CloudStorage/Dropbox/Mac/Documents/Dissertation/Chapter 2/Entire_Data/By month"

# Filter Columns based on Machine Learning Feature Selection
filter = ['session_time', 'playerkey', 'std_p/b', 'max_p/b', '2ws_wgramt', '3ws_profit', '3ws_wgramt', 'max_profit', '2ws_profit', 'ave_p/b', 'min_time_per_gamble', 'w/min', 'classification']


## Load Dataframes

In [27]:
datasets = cf.load_and_preprocess_datasets_ntop(months, data_path, 'session', filter)

# Concatenate all the datasets
dataset = pd.concat(datasets, ignore_index= True)

# Apply SMOTE to balance the dataset
sm = SMOTE(random_state=42)

X = dataset.drop(columns=['classification'])
y = dataset['classification']

# Encode gender column (Binary)
le = LabelEncoder()
y = le.fit_transform(y)

# Apply SMOTE to resample the minority class
sm = SMOTE(random_state=42)
X_res, y_res = sm.fit_resample(X, y)

# Convert back to dataframe
X_res = pd.DataFrame(X_res, columns= X.columns)
y_res = pd.DataFrame(y_res, columns= ['classification'])

# Concatenate the two dataframes
resample_dataset = pd.concat([X_res, y_res], axis=1)

# Count the number of each class
count = resample_dataset['classification'].value_counts()
print(count)

0    37812
1    37812
Name: classification, dtype: int64


## Save dataframe to STATA file

In [28]:
# Set saving directory
os.chdir("/Users/mau/Library/CloudStorage/Dropbox/Mac/Documents/Dissertation/Chapter 2/Entire_Data/Stata_Data")
# Save the dataset to stata file
resample_dataset.to_stata('Resample_Overall_Dataset.dta')

/var/folders/vp/1skwx2kd29s4fxnxx7tt9r6w0000gn/T/ipykernel_78400/1508396723.py:4: InvalidColumnName: 
Not all pandas column names were valid Stata variable names.
The following replacements have been made:

    std_p/b   ->   std_p_b
    max_p/b   ->   max_p_b
    2ws_wgramt   ->   _2ws_wgramt
    3ws_profit   ->   _3ws_profit
    3ws_wgramt   ->   _3ws_wgramt
    2ws_profit   ->   _2ws_profit
    ave_p/b   ->   ave_p_b
    w/min   ->   w_min

If this is not what you expect, please make sure you have Stata-compliant
column names in your DataFrame (strings only, max 32 characters, only
alphanumerics and underscores, no Stata reserved words)

  resample_dataset.to_stata('Resample_Overall_Dataset.dta')
