# Similarities and Differences of Top vs. Bottom 20% (Pt.1 - Age, Gender, Won/Lost Amount)

In [44]:
# Define libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.dates as mdates
import os
import plotting_fn as pf
import counting_fns as cf

# Month folder to process; it is appended to the base data path below.
month_file = '2_June'
# NOTE(review): cut_off is not referenced in any cell visible here - confirm it is still needed.
cut_off = 10000

# Make the month's folder the working directory so that bare file names
# (e.g. the parquet inputs) are resolved relative to it.
# NOTE(review): absolute, machine-specific path - consider making it configurable.
data_root = "/Users/mau/Library/CloudStorage/Dropbox/Mac/Documents/Dissertation/Chapter 2/Entire_Data/By month/"
os.chdir(data_root + month_file)


In [45]:
# Load the gamble-level records of each group from the working directory
df_bottom_20 = pd.read_parquet("Bottom_20_gambles.parquet")
df_top_20 = pd.read_parquet("Top_20_gambles.parquet")


def _has_multiple_gambles(player_rows):
    """Return True when a player's 'gambles' column holds more than one distinct value."""
    return player_rows['gambles'].nunique() > 1


# Keep only players whose 'gambles' column takes more than one distinct value
df_bottom_20 = df_bottom_20.groupby('playerkey').filter(_has_multiple_gambles)
df_top_20 = df_top_20.groupby('playerkey').filter(_has_multiple_gambles)

# Re-label the surviving rows 1..N
df_bottom_20.index = np.arange(1, len(df_bottom_20) + 1)
df_top_20.index = np.arange(1, len(df_top_20) + 1)

# Round the monetary / return columns to 1 decimal place
for frame in (df_bottom_20, df_top_20):
    for column in ('wageredamt', 'profit', 'percent_return'):
        frame[column] = frame[column].round(1)

In [46]:
def _classify_return(pct):
    """Label one gamble from its percent return.

    -100 is a 'loss', any other negative value a 'near-hit', a positive value
    a 'gain', and everything else (e.g. 0) a 'draw'.
    """
    if pct == -100:
        return 'loss'
    if pct < 0:
        return 'near-hit'
    if pct > 0:
        return 'gain'
    return 'draw'


def _dummy_label(category):
    """Name a dummy column '#' + capitalised first letter of the category."""
    return '#' + str(category[0].capitalize())


# Bottom 20%: classify every gamble, then append one indicator column per category.
# The indicator columns are named '#L', '#N', '#G' and '#D' (first letter only).
# NOTE(review): the summary column list used later refers to '#W' and '#NH';
# presumably those are produced elsewhere - confirm against plotting_fn.
df_bottom_20['result_type'] = df_bottom_20['percent_return'].apply(_classify_return)
dummy_variables = pd.get_dummies(df_bottom_20['result_type']).rename(columns=_dummy_label)
df_bottom_20 = pd.concat([df_bottom_20, dummy_variables], axis=1).reset_index(drop=True)

# Top 20%: same treatment
df_top_20['result_type'] = df_top_20['percent_return'].apply(_classify_return)
dummy_variables = pd.get_dummies(df_top_20['result_type']).rename(columns=_dummy_label)
df_top_20 = pd.concat([df_top_20, dummy_variables], axis=1).reset_index(drop=True)

# Parse 'start_time' as datetimes so that time arithmetic is possible
for frame in (df_bottom_20, df_top_20):
    frame['start_time'] = pd.to_datetime(frame['start_time'])

# 'time_diff': seconds elapsed since the previous gamble of the same player and
# session_time group; rows without a previous gamble get 0
for frame in (df_bottom_20, df_top_20):
    gap_since_previous = frame.groupby(['playerkey', 'session_time'])['start_time'].diff()
    frame['time_diff'] = gap_since_previous.dt.total_seconds().fillna(0)

In [47]:
# Run the consecutive-wins helper on each dataset, grouped by 'visit'
# (the original note describes it as finding 2, 3 and 4 wins in a row per visit per player).
# NOTE(review): both frames are rebound to the helper's return value, so re-running
# this cell feeds already-processed data back in - confirm the helper tolerates that.
df_bottom_20, df_top_20 = (
    pf.consecutive_wins(frame, 'visit') for frame in (df_bottom_20, df_top_20)
)


In [48]:
# Same consecutive-wins helper, this time grouped by 'session_time'; the results
# are stored under separate '_s' names and the inputs are left bound as they were.
# NOTE(review): df_bottom_20_s / df_top_20_s are not used in the cells visible
# here - confirm whether they are still needed.
df_bottom_20_s, df_top_20_s = (
    pf.consecutive_wins(frame, 'session_time') for frame in (df_bottom_20, df_top_20)
)

In [49]:
# Age brackets. The bin edges are the inclusive upper bound of each bracket, so the
# intervals must be closed on the right: (0, 24], (24, 40], (40, 55], (55, 75], (75, 150].
# (With right=False an age of exactly 24, 40, 55 or 75 was pushed into the next
# bracket up, contradicting the labels below.)
bins = [0, 24, 40, 55, 75, 150]
labels = ['18-24', '25-40', '41-55', '56-75', '76+']
generations = ['Gen Z', 'Millenials', 'Gen X', 'Baby Boomers', 'Silent']

# Add the bracket ('age_range') and generation ('age_gen') of every row's 'age'
# to both the bottom 20% and the top 20% frames
for frame in (df_bottom_20, df_top_20):
    frame['age_range'] = pd.cut(frame['age'], bins=bins, labels=labels, right=True)
    frame['age_gen'] = pd.cut(frame['age'], bins=bins, labels=generations, right=True)

In [50]:
# Top 20%: count slot-denomination and max-bet increases / decreases per player.
# Each helper call also prints its own summary, so the four calls are kept in
# their original order (slot up, slot down, max-bet up, max-bet down).
players_increase_slot_t20, players_decrease_slot_t20 = (
    cf.count_increase(df_top_20, "increase_slotdeno", "playerkey", "slotdenomination"),
    cf.count_decrease(df_top_20, "decrease_slotdeno", "playerkey", "slotdenomination"),
)
players_increase_maxbet_t20, players_decrease_maxbet_t20 = (
    cf.count_increase(df_top_20, "increase_maxbet", "playerkey", "maxbet"),
    cf.count_decrease(df_top_20, "decrease_maxbet", "playerkey", "maxbet"),
)

Count of players who increase_slotdeno : 34
Count of times each player increase_slotdeno : {17: 1, 19: 3, 20: 26, 29: 25, 33: 15, 43: 1, 69: 1, 89: 2, 90: 1, 92: 1, 103: 3, 135: 1, 144: 15, 159: 5, 188: 1, 194: 1, 222: 4, 224: 2, 234: 6, 244: 1, 263: 1, 264: 12, 308: 3, 313: 1, 319: 1, 322: 2, 331: 5, 351: 1, 361: 1, 363: 50, 410: 1, 461: 1, 475: 171, 507: 1}
Player who changes the most: 475
------------------------------------------------------------------------------------------------------------------
Count of players who decrease_slotdeno : 31
Count of times each player decrease_slotdeno : {17: 1, 19: 2, 20: 24, 29: 25, 33: 14, 43: 1, 69: 2, 89: 2, 99: 1, 103: 4, 135: 1, 144: 15, 159: 7, 188: 1, 222: 4, 224: 3, 234: 5, 244: 1, 263: 1, 264: 13, 308: 2, 322: 2, 331: 4, 351: 1, 361: 1, 363: 48, 410: 1, 461: 1, 475: 172, 507: 1, 521: 1}
Player who changes the most: 475
------------------------------------------------------------------------------------------------------------------
Cou

In [51]:
# Bottom 20%: count slot-denomination and max-bet increases / decreases per player.
# Each helper call also prints its own summary, so the four calls are kept in
# their original order (slot up, slot down, max-bet up, max-bet down).
players_increase_slot_b20, players_decrease_slot_b20 = (
    cf.count_increase(df_bottom_20, "increase_slotdeno", "playerkey", "slotdenomination"),
    cf.count_decrease(df_bottom_20, "decrease_slotdeno", "playerkey", "slotdenomination"),
)
players_increase_maxbet_b20, players_decrease_maxbet_b20 = (
    cf.count_increase(df_bottom_20, "increase_maxbet", "playerkey", "maxbet"),
    cf.count_decrease(df_bottom_20, "decrease_maxbet", "playerkey", "maxbet"),
)

Count of players who increase_slotdeno : 4
Count of times each player increase_slotdeno : {67: 1, 219: 1, 223: 1, 248: 1}
Player who changes the most: 67
------------------------------------------------------------------------------------------------------------------
Count of players who decrease_slotdeno : 1
Count of times each player decrease_slotdeno : {406: 1}
Player who changes the most: 406
------------------------------------------------------------------------------------------------------------------
Count of players who increase_maxbet : 3
Count of times each player increase_maxbet : {164: 1, 236: 1, 406: 1}
Player who changes the most: 164
------------------------------------------------------------------------------------------------------------------
Count of players who decrease_maxbet : 6
Count of times each player decrease_maxbet : {67: 1, 219: 1, 223: 1, 236: 2, 389: 1, 457: 1}
Player who changes the most: 236
----------------------------------------------------------

In [52]:
# Add 'depletion_rate': the change in 'playercashableamt' relative to the previous
# gamble of the same player and session_time group. Missing differences (such as
# the first gamble of each group, which has no predecessor) are set to 0.
for frame in (df_bottom_20, df_top_20):
    balance_by_session = frame.groupby(['playerkey', 'session_time'])['playercashableamt']
    frame['depletion_rate'] = balance_by_session.diff().fillna(0)


# Separate by time

In [53]:
# Bottom 20%: one filtered frame per window length (1, 5, 10 and 15 minutes).
# NOTE(review): presumably each frame keeps only the first N minutes of play -
# confirm against plotting_fn.filter_dataframe_by_time.
df_bottom_1min, df_bottom_5min, df_bottom_10min, df_bottom_15min = (
    pf.filter_dataframe_by_time(df_bottom_20, pd.Timedelta(minutes=window))
    for window in (1, 5, 10, 15)
)

In [54]:
# Top 20%: one filtered frame per window length (1, 5, 10 and 15 minutes).
# NOTE(review): presumably each frame keeps only the first N minutes of play -
# confirm against plotting_fn.filter_dataframe_by_time.
df_top_1min, df_top_5min, df_top_10min, df_top_15min = (
    pf.filter_dataframe_by_time(df_top_20, pd.Timedelta(minutes=window))
    for window in (1, 5, 10, 15)
)

#### First 1 minute

In [55]:
# Combine the 1-minute top and bottom frames into a single summary table,
# grouped by session_time, without printing intermediate results
df_all_1min = pf.merge_dfs_per_player(
    data_t=df_top_1min,
    data_b=df_bottom_1min,
    grouping='session_time',
    print_results=False,
)


In [56]:
# Simultaneous play within the 1-minute window, for the bottom and then the top group.
# NOTE(review): the meaning of the 40000 argument is not visible here - confirm
# against plotting_fn.simultaneous_play.
df_b20_sim_v, df_t20_sim_v = (
    pf.simultaneous_play(40000, frame, 'session_time')
    for frame in (df_bottom_1min, df_top_1min)
)

# Stack both results; reset_index() keeps the old row labels as an 'index' column
stacked_parts = [df_b20_sim_v, df_t20_sim_v]
df_sim_v = pd.concat(stacked_parts).reset_index()
print(df_sim_v)

# Keep a single row per (session_time, playerkey) pair
pair_columns = ['session_time', 'playerkey']
df_sim_no_repeat_v = df_sim_v.drop_duplicates(subset=pair_columns)
print(df_sim_no_repeat_v)

   index session_time playerkey
0      0            1        90
1      1            1       475
   index session_time playerkey
0      0            1        90
1      1            1       475


In [57]:
# Flag every summary row whose (playerkey, session_time) pair appears in the
# simultaneous-play table. The lookup set is built once up front instead of being
# rebuilt for each row, and the membership test runs over the two key columns
# directly (which also works when the summary table is empty).
sim_pairs = set(map(tuple, df_sim_no_repeat_v[['playerkey', 'session_time']].values))
df_all_1min['sim_play'] = [
    pair in sim_pairs
    for pair in zip(df_all_1min['playerkey'], df_all_1min['session_time'])
]

print(df_all_1min.columns)

# Final column layout shared by every time-window summary
desired_order = ['playerkey', 'session_time', 'rank', 'gender', 'age_range', 'age_gen',
                'beginning_amt', 'ending_amt', 'ending_balance', 'ave_slotdenom',
                'std_slotdenom', 'min_slotdenom', 'max_slotdenom', 'ave_theo_payback',
                'min_theo_payback', 'max_theo_payback', 'ave_wageramt', 'std_wageramt',
                'min_wager', 'max_wager', 'ave_p/b', 'std_p/b', 'max_p/b', 'max_profit', 'depletion_slope',
                '#inc_maxbet', '#dec_maxbet', 'first_wager', 'first_outcome', 'first_p/b', 'last_wager',
                'last_outcome', 'last_p/b', '#W', '#L', '#NH', '#D', 'w/min', 'l/min', 'nh/min', 'd/min',
                'w/g', 'l/g', 'nh/g', 'd/g', '#2ws', '2ws_profit', '2ws_wgramt', '#3ws',
                '3ws_profit','3ws_wgramt', '#4ws', '4ws_profit', '4ws_wgramt', '2ws/min', '3ws/min', '4ws/min',
                'ave_time_per_gamble', 'std_time_per_gamble', 'min_time_per_gamble', 'max_time_per_gamble',
                'total_duration', 'total_gambles', 'machines_changes', '#inc_slotdenom', '#dec_slotdenom',
                'unique_machines', 'ave_time_per_machine', 'sim_play','percentile']

# reindex keeps only the listed columns, in this order: columns that are not listed
# (such as 'index') are dropped, and a listed column that does not exist is created
# filled with NaN rather than raising.
df_all_1min = df_all_1min.reindex(columns=desired_order)

Index(['index', 'playerkey', 'session_time', 'rank', 'gender', 'age_range',
       'age_gen', 'beginning_amt', 'ending_amt', 'ending_balance',
       'ave_slotdenom', 'std_slotdenom', 'min_slotdenom', 'max_slotdenom',
       'ave_theo_payback', 'min_theo_payback', 'max_theo_payback',
       'ave_wageramt', 'std_wageramt', 'min_wager', 'max_wager', 'ave_p/b',
       'std_p/b', 'max_p/b', 'max_profit', 'depletion_slope', '#inc_slotdenom',
       '#dec_slotdenom', '#inc_maxbet', '#dec_maxbet', 'first_wager',
       'first_outcome', 'first_p/b', 'last_wager', 'last_outcome', 'last_p/b',
       'machines_changes', '#W', '#L', '#NH', '#D', 'w/g', 'l/g', 'nh/g',
       'd/g', '#2ws', '2ws_profit', '2ws_wgramt', '#3ws', '3ws_profit',
       '3ws_wgramt', '#4ws', '4ws_profit', '4ws_wgramt', 'ave_time_per_gamble',
       'std_time_per_gamble', 'min_time_per_gamble', 'max_time_per_gamble',
       'total_duration', 'total_gambles', 'unique_machines',
       'ave_time_per_machine', 'w/min', 'l/min'

In [58]:
# Write the 1-minute summary to the month's 'Ending Balances/Per_Player' folder.
# NOTE(review): absolute, machine-specific path - consider making it configurable.
df_all_1min.to_parquet('/Users/mau/Library/CloudStorage/Dropbox/Mac/Documents/Dissertation/Chapter 2/Entire_Data/By month/'+month_file+'/Ending Balances/Per_Player/df_1min.parquet')

#### First 5 minutes

In [59]:
# Combine the 5-minute top and bottom frames into a single summary table
df_all_5min = pf.merge_dfs_per_player(data_t=df_top_5min, data_b=df_bottom_5min, grouping='session_time', print_results=False)

# Simultaneous play within the 5-minute window
# NOTE(review): the meaning of the 40000 argument is not visible here - confirm
# against plotting_fn.simultaneous_play.
df_b20_sim_v = pf.simultaneous_play(40000, df_bottom_5min, 'session_time')
df_t20_sim_v = pf.simultaneous_play(40000, df_top_5min, 'session_time')

# Stack both groups and keep a single row per (session_time, playerkey) pair
df_sim_v = pd.concat([df_b20_sim_v, df_t20_sim_v]).reset_index()
df_sim_no_repeat_v = df_sim_v.drop_duplicates(subset=['session_time', 'playerkey'])

# Flag every summary row whose (playerkey, session_time) pair appears in the
# simultaneous-play table. The lookup set is built once instead of once per row,
# and the membership test also works when the summary table is empty.
sim_pairs = set(map(tuple, df_sim_no_repeat_v[['playerkey', 'session_time']].values))
df_all_5min['sim_play'] = [
    pair in sim_pairs
    for pair in zip(df_all_5min['playerkey'], df_all_5min['session_time'])
]

# Apply the shared column layout; reindex drops unlisted columns and creates any
# listed column that is missing, filled with NaN
df_all_5min = df_all_5min.reindex(columns=desired_order)

# Write the 5-minute summary to the month's 'Ending Balances/Per_Player' folder
df_all_5min.to_parquet('/Users/mau/Library/CloudStorage/Dropbox/Mac/Documents/Dissertation/Chapter 2/Entire_Data/By month/'+month_file+'/Ending Balances/Per_Player/df_5min.parquet')

#### First 10 minutes

In [60]:

# Combine the 10-minute top and bottom frames into a single summary table
df_all_10min = pf.merge_dfs_per_player(data_t=df_top_10min, data_b=df_bottom_10min, grouping='session_time', print_results=False)

# Simultaneous play within the 10-minute window
# NOTE(review): the meaning of the 40000 argument is not visible here - confirm
# against plotting_fn.simultaneous_play.
df_b20_sim_v = pf.simultaneous_play(40000, df_bottom_10min, 'session_time')
df_t20_sim_v = pf.simultaneous_play(40000, df_top_10min, 'session_time')

# Stack both groups and keep a single row per (session_time, playerkey) pair
df_sim_v = pd.concat([df_b20_sim_v, df_t20_sim_v]).reset_index()
df_sim_no_repeat_v = df_sim_v.drop_duplicates(subset=['session_time', 'playerkey'])

# Flag every summary row whose (playerkey, session_time) pair appears in the
# simultaneous-play table. The lookup set is built once instead of once per row,
# and the membership test also works when the summary table is empty.
sim_pairs = set(map(tuple, df_sim_no_repeat_v[['playerkey', 'session_time']].values))
df_all_10min['sim_play'] = [
    pair in sim_pairs
    for pair in zip(df_all_10min['playerkey'], df_all_10min['session_time'])
]

# Apply the shared column layout; reindex drops unlisted columns and creates any
# listed column that is missing, filled with NaN
df_all_10min = df_all_10min.reindex(columns=desired_order)

# Write the 10-minute summary to the month's 'Ending Balances/Per_Player' folder
df_all_10min.to_parquet('/Users/mau/Library/CloudStorage/Dropbox/Mac/Documents/Dissertation/Chapter 2/Entire_Data/By month/'+month_file+'/Ending Balances/Per_Player/df_10min.parquet')

#### First 15 minutes

In [61]:

# Combine the 15-minute top and bottom frames into a single summary table
df_all_15min = pf.merge_dfs_per_player(data_t=df_top_15min, data_b=df_bottom_15min, grouping='session_time', print_results=False)

# Simultaneous play within the 15-minute window
# NOTE(review): the meaning of the 40000 argument is not visible here - confirm
# against plotting_fn.simultaneous_play.
df_b20_sim_v = pf.simultaneous_play(40000, df_bottom_15min, 'session_time')
df_t20_sim_v = pf.simultaneous_play(40000, df_top_15min, 'session_time')

# Stack both groups and keep a single row per (session_time, playerkey) pair
df_sim_v = pd.concat([df_b20_sim_v, df_t20_sim_v]).reset_index()
df_sim_no_repeat_v = df_sim_v.drop_duplicates(subset=['session_time', 'playerkey'])

# Flag every summary row whose (playerkey, session_time) pair appears in the
# simultaneous-play table. The lookup set is built once instead of once per row,
# and the membership test also works when the summary table is empty.
sim_pairs = set(map(tuple, df_sim_no_repeat_v[['playerkey', 'session_time']].values))
df_all_15min['sim_play'] = [
    pair in sim_pairs
    for pair in zip(df_all_15min['playerkey'], df_all_15min['session_time'])
]

# Apply the shared column layout; reindex drops unlisted columns and creates any
# listed column that is missing, filled with NaN
df_all_15min = df_all_15min.reindex(columns=desired_order)

# Write the 15-minute summary to the month's 'Ending Balances/Per_Player' folder
df_all_15min.to_parquet('/Users/mau/Library/CloudStorage/Dropbox/Mac/Documents/Dissertation/Chapter 2/Entire_Data/By month/'+month_file+'/Ending Balances/Per_Player/df_15min.parquet')