# Similarities and Differences of Top vs. Bottom 20% (Pt.1 - Age, Gender, Won/Lost Amount)

In [20]:
# Define libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.dates as mdates
import os
import plotting_fn as pf
import counting_fns as cf

# Month sub-folder to process. cut_off is defined here but is not referenced in
# the cells visible in this part of the notebook.
month_file = '2_June'
cut_off = 10000

# Make the month folder the working directory so that relative file names
# (e.g. "classification.parquet") resolve inside it
by_month_root = "/Users/mau/Library/CloudStorage/Dropbox/Mac/Documents/Dissertation/Chapter 2/Entire_Data/By month/"
os.chdir(by_month_root + month_file)


In [21]:
# Load the gamble-level data from the current working directory
df_bottom_20 = pd.read_parquet("classification.parquet")


def _has_multiple_gambles(player_rows):
    """Return True when a player's rows hold more than one distinct 'gambles' value."""
    return player_rows['gambles'].nunique() > 1


# Drop players whose 'gambles' column never takes more than one distinct value
df_bottom_20 = df_bottom_20.groupby('playerkey').filter(_has_multiple_gambles)

# Renumber the rows 1..N
df_bottom_20.index = np.arange(1, len(df_bottom_20) + 1)

# Round the monetary / return columns to 1 decimal place
for rounded_col in ('wageredamt', 'profit', 'percent_return'):
    df_bottom_20[rounded_col] = df_bottom_20[rounded_col].round(1)


In [22]:
def _classify_return(pct_return):
    """Map a percent return to 'loss' (-100), 'near-hit' (<0), 'gain' (>0) or 'draw'."""
    if pct_return == -100:
        return 'loss'
    if pct_return < 0:
        return 'near-hit'
    if pct_return > 0:
        return 'gain'
    return 'draw'


# Categorical outcome of every gamble, derived from 'percent_return'
df_bottom_20['result_type'] = df_bottom_20['percent_return'].apply(_classify_return)

# One indicator column per outcome, named '#' + capitalised first letter of the outcome
outcome_dummies = pd.get_dummies(df_bottom_20['result_type'])
dummy_variables = outcome_dummies.rename(columns=lambda label: '#' + str(label[0].capitalize()))

# Attach the indicator columns and renumber the rows from 0
df_bottom_20 = pd.concat([df_bottom_20, dummy_variables], axis=1).reset_index(drop=True)

# Parse 'start_time' as datetime so time arithmetic is possible
df_bottom_20['start_time'] = pd.to_datetime(df_bottom_20['start_time'])

# Seconds elapsed since the previous gamble of the same player and session_time;
# the first gamble of each group has no predecessor and gets 0
start_times_by_session = df_bottom_20.groupby(['playerkey', 'session_time'])['start_time']
df_bottom_20['time_diff'] = start_times_by_session.diff().dt.total_seconds().fillna(0)

### Filter visit 1

In [23]:
# Keep only the rows whose 'visit' equals 1. The explicit .copy() makes the
# result an independent frame, so the helper calls and column assignments made
# on it further down cannot trigger SettingWithCopyWarning or write into a
# temporary slice of the unfiltered data.
df_bottom_20 = df_bottom_20[df_bottom_20['visit'] == 1].copy()

In [24]:
# Use the project helper pf.consecutive_wins (from plotting_fn, not shown here)
# to find 2, 3 and 4 wins in a row, per visit per player.
# The helper's return value replaces df_bottom_20.
df_bottom_20 = pf.consecutive_wins(df_bottom_20, 'visit')



In [25]:
# Use the same helper to find 2, 3 and 4 wins in a row, this time per
# session_time per player; the result is stored separately in df_bottom_20_s.
# NOTE(review): df_bottom_20_s is not referenced again in the cells visible
# here, while the later cells keep working on df_bottom_20 and call
# pf.transform_ml with grouping='session_time'. Confirm this is intended, or
# whether consecutive_wins is relied on to modify its input in place.
df_bottom_20_s = pf.consecutive_wins(df_bottom_20, 'session_time')


In [26]:
# Age brackets and the generation name attached to each bracket.
# The edges are the LAST age of each bracket (24, 40, 55, 75), so the intervals
# must be closed on the right: (0, 24], (24, 40], (40, 55], (55, 75], (75, 150].
bins = [0, 24, 40, 55, 75, 150]
labels = ['18-24', '25-40', '41-55', '56-75', '76+']
# NOTE(review): 'Millenials' is misspelled but is left untouched because the
# label is stored in the data and may be matched by downstream code.
generations = ['Gen Z', 'Millenials', 'Gen X', 'Baby Boomers', 'Silent']

# Assign every row its age bracket and generation.
# right=True keeps each bracket's upper bound inclusive, as the labels promise
# (age 24 -> '18-24', age 40 -> '25-40'). With right=False the boundary ages
# 24, 40, 55 and 75 were pushed into the next bracket.
df_bottom_20['age_range'] = pd.cut(df_bottom_20['age'], bins=bins, labels=labels, right=True)
df_bottom_20['age_gen'] = pd.cut(df_bottom_20['age'], bins=bins, labels=generations, right=True)


In [27]:
# Count, per player ("playerkey"), the increases and decreases in slot
# denomination and in max bet, using the project helpers in counting_fns
# (not shown here). Each call prints a summary, visible in the cell output.
# The second argument looks like the name of the indicator the helper
# creates/uses for that change — TODO confirm against counting_fns.
players_increase_slot_b20 = cf.count_increase(df_bottom_20, "increase_slotdeno", "playerkey", "slotdenomination")
players_decrease_slot_b20 = cf.count_decrease(df_bottom_20, "decrease_slotdeno", "playerkey", "slotdenomination")
players_increase_maxbet_b20 = cf.count_increase(df_bottom_20, "increase_maxbet", "playerkey", "maxbet")
players_decrease_maxbet_b20 = cf.count_decrease(df_bottom_20, "decrease_maxbet", "playerkey", "maxbet")

Count of players who increase_slotdeno : 91
Count of times each player increase_slotdeno : {17: 1, 19: 3, 20: 26, 29: 25, 33: 15, 43: 1, 69: 1, 89: 2, 91: 2, 92: 1, 97: 2, 103: 3, 108: 1, 129: 1, 135: 1, 136: 1, 159: 5, 180: 4, 188: 1, 194: 1, 220: 2, 222: 4, 224: 2, 234: 6, 244: 1, 263: 1, 264: 12, 308: 3, 313: 1, 319: 1, 322: 2, 331: 5, 351: 1, 361: 1, 381: 1, 410: 1, 434: 1, 436: 1, 461: 1, 462: 1, 507: 1, 3: 1, 27: 2, 30: 1, 37: 2, 68: 2, 100: 1, 114: 1, 157: 1, 197: 2, 203: 1, 216: 1, 239: 2, 247: 1, 262: 1, 287: 2, 324: 1, 344: 1, 360: 1, 368: 1, 373: 1, 382: 1, 402: 1, 405: 1, 430: 2, 466: 1, 494: 2, 14: 2, 35: 2, 41: 1, 109: 1, 213: 2, 269: 3, 306: 1, 343: 1, 346: 2, 369: 1, 383: 1, 396: 1, 431: 1, 438: 1, 459: 1, 463: 1, 486: 1, 67: 1, 140: 1, 184: 1, 193: 1, 219: 1, 223: 1, 248: 1}
Player who changes the most: 20
------------------------------------------------------------------------------------------------------------------
Count of players who decrease_slotdeno : 86
Count 

In [28]:
# Create 'depletion_rate': the change in 'playercashableamt' between the current
# gamble and the previous gamble of the same player and session_time.
# The first gamble of each group has no predecessor and gets 0.
cashable_by_session = df_bottom_20.groupby(['playerkey', 'session_time'])['playercashableamt']
df_bottom_20['depletion_rate'] = cashable_by_session.diff().fillna(0)


# Separate by time

In [29]:
# Durations (in minutes) for which a time-limited copy of the data is built
window_minutes = (1, 2, 3, 4, 5, 10, 15)

# One frame per duration, produced by the project helper
# pf.filter_dataframe_by_time (not shown here; presumably it keeps the first
# N minutes of play — TODO confirm). Calls run in the order listed above.
(df_bottom_1min, df_bottom_2min, df_bottom_3min, df_bottom_4min,
 df_bottom_5min, df_bottom_10min, df_bottom_15min) = (
    pf.filter_dataframe_by_time(df_bottom_20, pd.Timedelta(minutes=window_length))
    for window_length in window_minutes
)

In [30]:
# Report how many distinct players each of the 1-5 minute frames contains
for window_label, window_df in (
    ('1min', df_bottom_1min),
    ('2min', df_bottom_2min),
    ('3min', df_bottom_3min),
    ('4min', df_bottom_4min),
    ('5min', df_bottom_5min),
):
    print("Number of players in bottom " + window_label + ": ", window_df['playerkey'].nunique())

Number of players in bottom 1min:  272
Number of players in bottom 2min:  272
Number of players in bottom 3min:  272
Number of players in bottom 4min:  272
Number of players in bottom 5min:  272


#### First 1 minute

In [31]:
# Build the feature table for the 1-minute window with the project helper
# pf.transform_ml (not shown here), grouped by session_time
df_all_1min = pf.transform_ml(data_b=df_bottom_1min, grouping='session_time')

# Player count before cleaning
print("Number of unique players in df_all_1min: ", df_all_1min['playerkey'].nunique())

# Players that have a missing value in at least one column of any of their rows
rows_with_nan = df_all_1min.isna().any(axis=1)
list_discrepancy = df_all_1min.loc[rows_with_nan, 'playerkey'].unique()

# Show the affected player keys
print(list_discrepancy)

# Drop every row belonging to those players, not just the incomplete rows
keep_rows = ~df_all_1min['playerkey'].isin(list_discrepancy)
df_all_1min = df_all_1min[keep_rows]

# Player count after cleaning
print("Number of unique players w/o discrepancies df_all_1min: ", df_all_1min['playerkey'].nunique())

# Column layout shared by all the per-window feature tables.
# reindex keeps only these columns, in this order; a name that is absent from
# the frame would come back as an all-NaN column.
desired_order = [
    # player / session identification
    'playerkey', 'session_time', 'gender', 'age_range', 'age_gen',
    # balances
    'beginning_amt', 'ending_amt', 'ending_balance',
    # slot denomination and theoretical payback
    'ave_slotdenom', 'std_slotdenom', 'min_slotdenom', 'max_slotdenom',
    'ave_theo_payback', 'min_theo_payback', 'max_theo_payback',
    # wagers and profit
    'ave_wageramt', 'std_wageramt', 'min_wager', 'max_wager',
    'ave_p/b', 'std_p/b', 'max_p/b', 'max_profit', 'depletion_slope',
    '#inc_maxbet', '#dec_maxbet',
    # first and last gamble
    'first_wager', 'first_outcome', 'first_p/b',
    'last_wager', 'last_outcome', 'last_p/b',
    # outcome counts and rates
    '#W', '#L', '#NH', '#D',
    'w/min', 'l/min', 'nh/min', 'd/min',
    'w/g', 'l/g', 'nh/g', 'd/g',
    # win streaks
    '#2ws', '2ws_profit', '2ws_wgramt',
    '#3ws', '3ws_profit', '3ws_wgramt',
    '#4ws', '4ws_profit', '4ws_wgramt',
    '2ws/min', '3ws/min', '4ws/min',
    # timing and machines
    'ave_time_per_gamble', 'min_time_per_gamble', 'max_time_per_gamble',
    'total_duration', 'total_gambles', 'machines_changes',
    '#inc_slotdenom', '#dec_slotdenom',
    'unique_machines', 'ave_time_per_machine',
    # label
    'classification',
]

df_all_1min = df_all_1min.reindex(columns=desired_order)

Number of unique players in df_all_1min:  272
[ 67 118 140 147 178 181 193 226 239 344 405 506]
Number of unique players w/o discrepancies df_all_1min:  260


#### First 2 minutes

In [32]:
# Build the feature table for the 2-minute window with the project helper
# pf.transform_ml (not shown here), grouped by session_time
df_all_2min = pf.transform_ml(data_b=df_bottom_2min, grouping='session_time')

# Player count before cleaning
print("Number of unique players in df_all_2min: ", df_all_2min['playerkey'].nunique())

# Players that have a missing value in at least one column of any of their rows
rows_with_nan_2 = df_all_2min.isna().any(axis=1)
list_discrepancy_2 = df_all_2min.loc[rows_with_nan_2, 'playerkey'].unique()

# Show the affected player keys
print(list_discrepancy_2)

# Drop every row belonging to those players, not just the incomplete rows
keep_rows_2 = ~df_all_2min['playerkey'].isin(list_discrepancy_2)
df_all_2min = df_all_2min[keep_rows_2]

# Player count after cleaning
print("Number of unique players w/o discrepancies df_all_2min: ", df_all_2min['playerkey'].nunique())

# Apply the column layout defined in the 1-minute cell
df_all_2min = df_all_2min.reindex(columns=desired_order)

Number of unique players in df_all_2min:  272
[ 67 118 140 181 193 226 344]
Number of unique players w/o discrepancies df_all_2min:  265


#### First 3 minutes

In [33]:
# Build the feature table for the 3-minute window with the project helper
# pf.transform_ml (not shown here), grouped by session_time
df_all_3min = pf.transform_ml(data_b=df_bottom_3min, grouping='session_time')

# Player count before cleaning
print("Number of unique players in df_all_3min: ", df_all_3min['playerkey'].nunique())

# Players that have a missing value in at least one column of any of their rows
rows_with_nan_3 = df_all_3min.isna().any(axis=1)
list_discrepancy_3 = df_all_3min.loc[rows_with_nan_3, 'playerkey'].unique()

# Show the affected player keys
print(list_discrepancy_3)

# Drop every row belonging to those players, not just the incomplete rows
keep_rows_3 = ~df_all_3min['playerkey'].isin(list_discrepancy_3)
df_all_3min = df_all_3min[keep_rows_3]

# Player count after cleaning
print("Number of unique players w/o discrepancies df_all_3min: ", df_all_3min['playerkey'].nunique())

# Apply the column layout defined in the 1-minute cell
df_all_3min = df_all_3min.reindex(columns=desired_order)

Number of unique players in df_all_3min:  272
[ 67 118 140 181 193 226 344]
Number of unique players w/o discrepancies df_all_3min:  265


#### First 4 minutes

In [34]:
# Build the feature table for the 4-minute window with the project helper
# pf.transform_ml (not shown here), grouped by session_time
df_all_4min = pf.transform_ml(data_b=df_bottom_4min, grouping='session_time')

# Player count before cleaning
print("Number of unique players in df_all_4min: ", df_all_4min['playerkey'].nunique())

# Players that have a missing value in at least one column of any of their rows
rows_with_nan_4 = df_all_4min.isna().any(axis=1)
list_discrepancy_4 = df_all_4min.loc[rows_with_nan_4, 'playerkey'].unique()

# Show the affected player keys
print(list_discrepancy_4)

# Drop every row belonging to those players, not just the incomplete rows
keep_rows_4 = ~df_all_4min['playerkey'].isin(list_discrepancy_4)
df_all_4min = df_all_4min[keep_rows_4]

# Player count after cleaning
print("Number of unique players w/o discrepancies df_all_4min: ", df_all_4min['playerkey'].nunique())

# Apply the column layout defined in the 1-minute cell
df_all_4min = df_all_4min.reindex(columns=desired_order)

Number of unique players in df_all_4min:  272
[ 67 140 181 226 344]
Number of unique players w/o discrepancies df_all_4min:  267


#### First 5 minutes

In [35]:
# Build the feature table for the 5-minute window with the project helper
# pf.transform_ml (not shown here), grouped by session_time
df_all_5min = pf.transform_ml(data_b=df_bottom_5min, grouping='session_time')

# Player count before cleaning
print("Number of unique players in df_all_5min: ", df_all_5min['playerkey'].nunique())

# Players that have a missing value in at least one column of any of their rows
rows_with_nan_5 = df_all_5min.isna().any(axis=1)
list_discrepancy_5 = df_all_5min.loc[rows_with_nan_5, 'playerkey'].unique()

# Show the affected player keys
print(list_discrepancy_5)

# Drop every row belonging to those players, not just the incomplete rows
keep_rows_5 = ~df_all_5min['playerkey'].isin(list_discrepancy_5)
df_all_5min = df_all_5min[keep_rows_5]

# Player count after cleaning
print("Number of unique players w/o discrepancies df_all_5min: ", df_all_5min['playerkey'].nunique())

# Apply the column layout defined in the 1-minute cell
df_all_5min = df_all_5min.reindex(columns=desired_order)

Number of unique players in df_all_5min:  272
[ 67 140 181 226 344]
Number of unique players w/o discrepancies df_all_5min:  267


## Eliminate Discrepancies

In [36]:
# Pool the player keys flagged in each of the five time windows. A player
# flagged in several windows appears several times in this array.
total_discrepancies = np.concatenate(
    (list_discrepancy, list_discrepancy_2, list_discrepancy_3, list_discrepancy_4, list_discrepancy_5),
    axis=0,
)

# This length counts flags with repeats, NOT distinct players, so the message
# says so (it previously claimed "unique players")
print("Number of flagged entries in total_discrepancies (with repeats): ", len(total_discrepancies))

# De-duplicate the keys; sorting makes the resulting list order deterministic
# (iterating a bare set yields an arbitrary order)
total_unique_discrepancies = sorted(set(total_discrepancies))

# Number of distinct players flagged in at least one window
print("Number of unique players in total_unique_discrepancies: ", len(total_unique_discrepancies))

Number of unique players in total_discrepancies:  36
Number of unique players in total_unique_discrepancies:  12


In [37]:
# Remove the pooled set of flagged players from every window, so that all five
# feature tables describe the same players. The right-hand side reads the
# current frames before any of the names is rebound.
df_all_1min, df_all_2min, df_all_3min, df_all_4min, df_all_5min = (
    window_df[~window_df['playerkey'].isin(total_unique_discrepancies)]
    for window_df in (df_all_1min, df_all_2min, df_all_3min, df_all_4min, df_all_5min)
)

# Report the number of distinct players left in each table
for table_name, table_df in (
    ('df_all_1min', df_all_1min),
    ('df_all_2min', df_all_2min),
    ('df_all_3min', df_all_3min),
    ('df_all_4min', df_all_4min),
    ('df_all_5min', df_all_5min),
):
    print("Number of unique players in " + table_name + ": ", table_df['playerkey'].nunique())



Number of unique players in df_all_1min:  260
Number of unique players in df_all_2min:  260
Number of unique players in df_all_3min:  260
Number of unique players in df_all_4min:  260
Number of unique players in df_all_5min:  260


In [38]:
# Folder that receives the per-player feature tables of the selected month
output_dir = ('/Users/mau/Library/CloudStorage/Dropbox/Mac/Documents/Dissertation/Chapter 2/Entire_Data/By month/'
              + month_file + '/Ending Balances/Per_Player/')

# Create the folder if it does not exist yet (e.g. first run for a new month).
# Without this, to_parquet fails at the very last step of the notebook.
os.makedirs(output_dir, exist_ok=True)

# Write one parquet file per time window: df_<window>_ALL.parquet
for window_label, window_df in (
    ('1min', df_all_1min),
    ('2min', df_all_2min),
    ('3min', df_all_3min),
    ('4min', df_all_4min),
    ('5min', df_all_5min),
):
    window_df.to_parquet(output_dir + 'df_' + window_label + '_ALL.parquet')