# Similarities and Differences of Top vs. Bottom 20% (Pt.1 - Age, Gender, Won/Lost Amount)

In [1]:
# Define libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.dates as mdates
import os
import plotting_fn as pf
import counting_fns as cf

# Month sub-folder to process; it is appended to the data root below and reused when saving results.
month_file = '5_September'
# NOTE(review): cut_off is not referenced again in the visible notebook — confirm it is still needed.
cut_off = 10000
# Switch into this month's data folder so that relative reads resolve against it
os.chdir(os.path.join("/Users/mau/Library/CloudStorage/Dropbox/Mac/Documents/Dissertation/Chapter 2/Entire_Data/By month/", month_file))


In [2]:
# Load the gamble records from the current working directory
df_bottom_20 = pd.read_parquet("classification.parquet")


def _has_multiple_gambles(player_rows):
    """Return True when a player's rows hold more than one distinct 'gambles' value."""
    return player_rows['gambles'].nunique() > 1


# Drop every player whose rows contain only a single distinct 'gambles' value
df_bottom_20 = df_bottom_20.groupby('playerkey').filter(_has_multiple_gambles)

# Renumber the surviving rows 1..N
df_bottom_20.index = np.arange(1, len(df_bottom_20) + 1)

# Round wageredamt, profit and percent_return to 1 decimal place
for _rounded_col in ('wageredamt', 'profit', 'percent_return'):
    df_bottom_20[_rounded_col] = df_bottom_20[_rounded_col].round(1)


In [3]:
def _label_outcome(pct):
    """Map a percent_return value to 'loss', 'near-hit', 'gain' or 'draw'."""
    if pct == -100:
        return 'loss'
    if pct < 0:
        return 'near-hit'
    if pct > 0:
        return 'gain'
    return 'draw'


# Categorical outcome of each gamble, derived from percent_return:
#   exactly -100 -> 'loss', any other negative value -> 'near-hit',
#   positive -> 'gain', everything else -> 'draw'
df_bottom_20['result_type'] = df_bottom_20['percent_return'].apply(_label_outcome)

# One indicator column per outcome, named '#' + capitalised first letter of the outcome
# (i.e. '#D', '#G', '#L', '#N' for draw, gain, loss, near-hit).
# NOTE(review): these differ from the '#W' / '#NH' names listed in desired_order further
# down — presumably pf.transform_ml derives those itself; verify.
dummy_variables = pd.get_dummies(df_bottom_20['result_type'])
dummy_variables = dummy_variables.rename(columns=lambda label: '#' + label[0].capitalize())

# Attach the indicator columns side by side and restart the row index at 0
df_bottom_20 = pd.concat(
    [df_bottom_20, dummy_variables],
    axis=1,
).reset_index(drop=True)

# Parse start_time as datetimes so time arithmetic works
df_bottom_20['start_time'] = pd.to_datetime(df_bottom_20['start_time'])

# Seconds elapsed since the previous gamble within the same (playerkey, session_time) group;
# the first gamble of each group has no predecessor and is set to 0
df_bottom_20['time_diff'] = (
    df_bottom_20.groupby(['playerkey', 'session_time'])['start_time']
    .diff()
    .dt.total_seconds()
    .fillna(0)
)

### Filter visit 1

In [4]:
# Keep only the rows where visit == 1.
# The explicit .copy() makes the result an independent frame, so the column assignments
# made further down (age_range, age_gen, depletion_rate) cannot trigger pandas'
# SettingWithCopyWarning or end up written to a temporary slice.
df_bottom_20 = df_bottom_20.loc[df_bottom_20['visit'] == 1].copy()

In [5]:
# Use pf.consecutive_wins, grouping on 'visit', to find 2, 3 and 4 wins in a row per player.
# The helper lives in plotting_fn and is not shown here; its result replaces df_bottom_20.
df_bottom_20 = pf.consecutive_wins(df_bottom_20, 'visit')



In [6]:
# Use pf.consecutive_wins, grouping on 'session_time', to find 2, 3 and 4 wins in a row per player.
# NOTE(review): df_bottom_20_s is not referenced again in the visible notebook — confirm whether
# this call is still needed (e.g. if the helper modifies df_bottom_20 in place) or can be dropped.
df_bottom_20_s = pf.consecutive_wins(df_bottom_20, 'session_time')


In [7]:
# Age brackets and the generation name attached to each bracket.
# pd.cut(..., right=False) builds left-closed intervals [lo, hi), so every upper edge must be
# one past the last age its label advertises: [0, 25) -> '18-24', [25, 41) -> '25-40',
# [41, 56) -> '41-55', [56, 76) -> '56-75', [76, 150) -> '76+'.
# (With the previous edges 24/40/55/75, ages 24, 40, 55 and 75 fell into the next bracket up.)
# Ages below 18 still land in the first bracket; ages of 150 or more become NaN.
bins = [0, 25, 41, 56, 76, 150]
labels = ['18-24', '25-40', '41-55', '56-75', '76+']
generations = ['Gen Z', 'Millenials', 'Gen X', 'Baby Boomers', 'Silent']

# Bucket each row's age twice: once with the numeric range label, once with the generation name
df_bottom_20['age_range'] = pd.cut(df_bottom_20['age'], bins=bins, labels=labels, right=False)
df_bottom_20['age_gen'] = pd.cut(df_bottom_20['age'], bins=bins, labels=generations, right=False)


In [8]:
# Count slot-denomination and max-bet increases/decreases with the counting_fns helpers.
# Each call passes the frame, a label for the change ("increase_slotdeno", ...), the player
# id column and the column being tracked.
# Per the recorded output, the helpers print how many players made the change and a
# {player: count} mapping — presumably that mapping (or similar) is also what they return; TODO confirm.
players_increase_slot_b20 = cf.count_increase(df_bottom_20, "increase_slotdeno", "playerkey", "slotdenomination")
players_decrease_slot_b20 = cf.count_decrease(df_bottom_20, "decrease_slotdeno", "playerkey", "slotdenomination")
players_increase_maxbet_b20 = cf.count_increase(df_bottom_20, "increase_maxbet", "playerkey", "maxbet")
players_decrease_maxbet_b20 = cf.count_decrease(df_bottom_20, "decrease_maxbet", "playerkey", "maxbet")

Count of players who increase_slotdeno : 3033
Count of times each player increase_slotdeno : {14: 6, 156: 2, 332: 2, 336: 2, 646: 2, 719: 2, 841: 4, 901: 15, 978: 7, 989: 2, 1032: 1, 1408: 1, 1438: 1043, 1515: 1, 1553: 2, 1660: 1, 1699: 1, 1715: 8, 1757: 139, 1950: 2, 1953: 1, 1959: 1, 2004: 1, 2005: 4, 2054: 4, 2067: 1, 2073: 1, 2214: 1, 2235: 1, 2332: 1, 2341: 3, 2442: 21, 2570: 2, 2595: 4, 2710: 2, 2806: 2, 2817: 1, 2981: 2, 2993: 37, 3010: 2, 3120: 6, 3127: 3, 3231: 1, 3265: 4, 3271: 1, 3360: 1, 3708: 2, 3756: 1, 3933: 2, 3955: 1, 4022: 1, 4051: 1, 4269: 1, 4270: 1, 4333: 1, 4483: 14, 4554: 1, 4652: 1, 4699: 1, 4874: 2, 5083: 3, 5152: 1, 5179: 1, 5242: 1, 5482: 65, 5525: 1, 5675: 1, 5789: 1, 5922: 1, 5932: 1, 6090: 67, 6142: 4, 6252: 1, 6284: 1, 6488: 2, 6502: 2, 6556: 5, 6579: 1, 6605: 2, 6695: 1, 6730: 1, 6894: 3, 6897: 1, 7119: 1, 7120: 2, 7121: 2, 7167: 2, 7257: 2, 7390: 1, 7400: 3, 7676: 2, 7746: 2, 8005: 2, 8164: 3, 8276: 2, 8370: 5, 8383: 10, 8412: 3, 8639: 5, 8719: 3, 8749:

In [9]:
# Add 'depletion_rate': the change in 'playercashableamt' from the previous gamble to the
# current one, computed within each (playerkey, session_time) group.
# The first gamble of a group has no predecessor, so its value is set to 0.
_cashable_by_session = df_bottom_20.groupby(['playerkey', 'session_time'])['playercashableamt']
df_bottom_20['depletion_rate'] = _cashable_by_session.diff().fillna(0)


# Separate by time

In [10]:
# Build one frame per time window (1, 2, 3, 4, 5, 10 and 15 minutes) by passing
# df_bottom_20 and the window length to pf.filter_dataframe_by_time, shortest window first.
# NOTE(review): the 10- and 15-minute frames are not used again in the visible notebook.
(
    df_bottom_1min,
    df_bottom_2min,
    df_bottom_3min,
    df_bottom_4min,
    df_bottom_5min,
    df_bottom_10min,
    df_bottom_15min,
) = (
    pf.filter_dataframe_by_time(df_bottom_20, pd.Timedelta(minutes=_window))
    for _window in (1, 2, 3, 4, 5, 10, 15)
)

In [11]:
# Report how many distinct players appear in each of the 1- to 5-minute frames
for _minutes, _window_frame in (
    (1, df_bottom_1min),
    (2, df_bottom_2min),
    (3, df_bottom_3min),
    (4, df_bottom_4min),
    (5, df_bottom_5min),
):
    print(f"Number of players in bottom {_minutes}min: ", _window_frame['playerkey'].nunique())

Number of players in bottom 1min:  11166
Number of players in bottom 2min:  11166
Number of players in bottom 3min:  11166
Number of players in bottom 4min:  11166
Number of players in bottom 5min:  11166


#### First 1 minute

In [12]:
# Build the feature table for the first minute, grouped by session_time
df_all_1min = pf.transform_ml(data_b=df_bottom_1min, grouping='session_time')

# How many distinct players the table starts with
print("Number of unique players in df_all_1min: ", df_all_1min['playerkey'].nunique())

# Players with at least one row containing a NaN in any column
_nan_rows_1min = df_all_1min.isna().any(axis=1)
list_discrepancy = df_all_1min.loc[_nan_rows_1min, 'playerkey'].unique()

# Show the affected player ids
print(list_discrepancy)

# Remove every row belonging to one of those players
_clean_rows_1min = ~df_all_1min['playerkey'].isin(list_discrepancy)
df_all_1min = df_all_1min[_clean_rows_1min]

# How many distinct players remain
print("Number of unique players w/o discrepancies df_all_1min: ", df_all_1min['playerkey'].nunique())

# Column layout shared by all per-window tables (also used by the later 2-5 minute cells)
desired_order = [
    # identifiers and demographics
    'playerkey', 'session_time', 'gender', 'age_range', 'age_gen',
    # balances
    'beginning_amt', 'ending_amt', 'ending_balance',
    # slot denomination and theoretical payback
    'ave_slotdenom', 'std_slotdenom', 'min_slotdenom', 'max_slotdenom',
    'ave_theo_payback', 'min_theo_payback', 'max_theo_payback',
    # wagers and profit
    'ave_wageramt', 'std_wageramt', 'min_wager', 'max_wager',
    'ave_p/b', 'std_p/b', 'max_p/b', 'max_profit', 'depletion_slope',
    '#inc_maxbet', '#dec_maxbet',
    # first and last gamble
    'first_wager', 'first_outcome', 'first_p/b',
    'last_wager', 'last_outcome', 'last_p/b',
    # outcome counts and rates
    '#W', '#L', '#NH', '#D',
    'w/min', 'l/min', 'nh/min', 'd/min',
    'w/g', 'l/g', 'nh/g', 'd/g',
    # winning streaks
    '#2ws', '2ws_profit', '2ws_wgramt',
    '#3ws', '3ws_profit', '3ws_wgramt',
    '#4ws', '4ws_profit', '4ws_wgramt',
    '2ws/min', '3ws/min', '4ws/min',
    # timing and machines
    'ave_time_per_gamble', 'min_time_per_gamble', 'max_time_per_gamble',
    'total_duration', 'total_gambles', 'machines_changes',
    '#inc_slotdenom', '#dec_slotdenom',
    'unique_machines', 'ave_time_per_machine',
    'classification',
]

# reindex keeps exactly these columns in this order; a listed name that is missing from
# the table would come back as an all-NaN column rather than raising
df_all_1min = df_all_1min.reindex(columns=desired_order)

Number of unique players in df_all_1min:  11166
[  236   917   989  1263  2431  2747  2867  2910  6016  6484  7400  7401
  7476  7615  7774  7979  8070  8883 10609 11112 12079 12846 13088 13185
 13220 14062 14692 15179 15335 17145 17811 19589 19636 20072 20213 21167
 21653 22643 22967 23808 23849 24056 25363 25819 25960 26097 27103 28073
 28463 28863 30143 30222 30243 30259 30289 30313 30400 30427 30528 30532
 30619 30688 30741 30771 30826 30838 30868 30870 30889 30916 30959 31022
 31037 31043 31080 31111 31172 31191 31194 31234 31238 31246 31263 31268
 31269 31294 31319 31339 31387 31410 31412 31534 31597 31607 31720 31743
 31782 31859 31877 31890 31919 31929 31951 31956 31974 31990 32008 32051
 32073 32100 32140 32154 32164 32174 32201 32205 32276 32304 32352 32379
 32380 32436 32476 32479 32534 32538 32616 32617 32657 32666 32668 32775
 32779 32793 32801 32806 32856 32862 32872 32885 32930 32935 32940 32963
 32974 33124 33133 33170 33219 33243 33244 33270 33292 33329 33376 33381
 33

#### First 2 minutes

In [13]:
# Build the feature table for the first 2 minutes, grouped by session_time
df_all_2min = pf.transform_ml(data_b=df_bottom_2min, grouping='session_time')

# How many distinct players the table starts with
print("Number of unique players in df_all_2min: ", df_all_2min['playerkey'].nunique())

# Players with at least one row containing a NaN in any column
_nan_rows_2min = df_all_2min.isna().any(axis=1)
list_discrepancy_2 = df_all_2min.loc[_nan_rows_2min, 'playerkey'].unique()

# Show the affected player ids
print(list_discrepancy_2)

# Remove every row belonging to one of those players
_clean_rows_2min = ~df_all_2min['playerkey'].isin(list_discrepancy_2)
df_all_2min = df_all_2min[_clean_rows_2min]

# How many distinct players remain
print("Number of unique players w/o discrepancies df_all_2min: ", df_all_2min['playerkey'].nunique())

# Apply the shared column layout (desired_order is defined in the 1-minute cell)
df_all_2min = df_all_2min.reindex(columns=desired_order)

Number of unique players in df_all_2min:  11166
[  236   989  1263  2747  2867  2910  7400  7401  7476  7615 10609 11112
 12079 12846 14062 14692 15179 15335 17811 19589 19636 20213 22967 23808
 23849 24056 25819 26097 27103 28073 28463 28863 30143 30222 30243 30259
 30289 30313 30427 30528 30532 30688 30771 30826 30838 30868 30870 30916
 30959 31022 31043 31080 31111 31172 31191 31234 31238 31246 31263 31294
 31339 31387 31410 31412 31597 31720 31743 31782 31859 31877 31890 31919
 31951 31956 31990 32051 32172 32174 32201 32205 32379 32380 32479 32617
 32657 32666 32862 32940 33133 33170 33243 33244 33292 33329 33390 33478
 33656 33827 33952 33983 33990 34110 34116 34228 34264 34358 34424 34573
 34576 34589 34654 34789 34806 34900 34994 34997 35005 35057 35087 35124
 35230 35326 35359 35368 35441 35527 35614 35638 35697 35720 35789 35835
 35855 35871 35961 36057 36112 36235 36247 36299 36342 36449 36707 36755
 36820 36831 36879 36933 36950 36963 37068 37093 37163 37200 37212 37217
 37

#### First 3 minutes

In [14]:
# Build the feature table for the first 3 minutes, grouped by session_time
df_all_3min = pf.transform_ml(data_b=df_bottom_3min, grouping='session_time')

# How many distinct players the table starts with
print("Number of unique players in df_all_3min: ", df_all_3min['playerkey'].nunique())

# Players with at least one row containing a NaN in any column
_nan_rows_3min = df_all_3min.isna().any(axis=1)
list_discrepancy_3 = df_all_3min.loc[_nan_rows_3min, 'playerkey'].unique()

# Show the affected player ids
print(list_discrepancy_3)

# Remove every row belonging to one of those players
_clean_rows_3min = ~df_all_3min['playerkey'].isin(list_discrepancy_3)
df_all_3min = df_all_3min[_clean_rows_3min]

# How many distinct players remain
print("Number of unique players w/o discrepancies df_all_3min: ", df_all_3min['playerkey'].nunique())

# Apply the shared column layout (desired_order is defined in the 1-minute cell)
df_all_3min = df_all_3min.reindex(columns=desired_order)

Number of unique players in df_all_3min:  11166
[  236   989  1263  2747  2867  2910  7401  7615 10609 11112 12079 12846
 14062 14692 15179 15335 17811 19589 19636 20213 22967 23808 23849 24056
 25819 26097 28073 28463 30222 30289 30427 30528 30532 30688 30771 30838
 30959 31043 31080 31111 31172 31238 31246 31263 31294 31339 31387 31410
 31720 31743 31782 31859 31877 31890 31919 31951 31990 32051 32172 32174
 32201 32205 32379 32380 32479 32617 32657 32862 32940 33133 33170 33243
 33244 33292 33329 33390 33656 33664 33990 34110 34116 34186 34264 34358
 34573 34576 34589 34789 34900 34997 35057 35087 35124 35209 35230 35326
 35359 35441 35614 35697 35720 35835 35855 35871 35961 36112 36235 36247
 36299 36342 36441 36707 36820 36879 36933 36950 36963 37068 37163 37200
 37212 37217 37335 37657 37734 37753 37871 37948 38005 38067 38130 38381
 38433 38448 38473 38480 38490 38514 38535 38691 38771 38785 38792 38866
 38867 38885 38906 38957 39056 39076 39104 39113 39207 39241 39297 39410
 39

#### First 4 minutes

In [15]:
# Build the feature table for the first 4 minutes, grouped by session_time
df_all_4min = pf.transform_ml(data_b=df_bottom_4min, grouping='session_time')

# How many distinct players the table starts with
print("Number of unique players in df_all_4min: ", df_all_4min['playerkey'].nunique())

# Players with at least one row containing a NaN in any column
_nan_rows_4min = df_all_4min.isna().any(axis=1)
list_discrepancy_4 = df_all_4min.loc[_nan_rows_4min, 'playerkey'].unique()

# Show the affected player ids
print(list_discrepancy_4)

# Remove every row belonging to one of those players
_clean_rows_4min = ~df_all_4min['playerkey'].isin(list_discrepancy_4)
df_all_4min = df_all_4min[_clean_rows_4min]

# How many distinct players remain
print("Number of unique players w/o discrepancies df_all_4min: ", df_all_4min['playerkey'].nunique())

# Apply the shared column layout (desired_order is defined in the 1-minute cell)
df_all_4min = df_all_4min.reindex(columns=desired_order)

Number of unique players in df_all_4min:  11166
[  236   989  1263  2747  2867  2910  7401  7615 11112 12079 12846 14062
 15179 15335 17811 19589 20213 22967 23808 23849 24056 25819 26097 28073
 28463 30289 30427 30528 30532 30688 30771 30838 30959 31043 31080 31111
 31238 31246 31263 31339 31387 31720 31743 31782 31859 31877 31890 31919
 31951 31990 32051 32172 32174 32201 32205 32379 32479 32617 32657 32862
 32940 33133 33170 33244 33292 33329 33656 33664 33990 34110 34116 34186
 34264 34358 34573 34576 34589 34789 34872 34900 34997 35057 35087 35124
 35209 35230 35326 35359 35441 35614 35697 35720 35835 35855 35871 36112
 36235 36247 36299 36441 36707 36820 36879 36897 36933 36950 36963 37068
 37163 37200 37212 37335 37657 37734 37753 37871 37948 38067 38130 38381
 38433 38448 38473 38480 38490 38514 38535 38691 38771 38785 38792 38866
 38867 38885 38906 39056 39076 39104 39113 39207 39241 39410 39473 39474
 39533 39577 39689 39835 39883 40017 40141 40349 40499 40726 40793 40794
 40

#### First 5 minutes

In [16]:
# Build the feature table for the first 5 minutes, grouped by session_time
df_all_5min = pf.transform_ml(data_b=df_bottom_5min, grouping='session_time')

# How many distinct players the table starts with
print("Number of unique players in df_all_5min: ", df_all_5min['playerkey'].nunique())

# Players with at least one row containing a NaN in any column
_nan_rows_5min = df_all_5min.isna().any(axis=1)
list_discrepancy_5 = df_all_5min.loc[_nan_rows_5min, 'playerkey'].unique()

# Show the affected player ids
print(list_discrepancy_5)

# Remove every row belonging to one of those players
_clean_rows_5min = ~df_all_5min['playerkey'].isin(list_discrepancy_5)
df_all_5min = df_all_5min[_clean_rows_5min]

# How many distinct players remain
print("Number of unique players w/o discrepancies df_all_5min: ", df_all_5min['playerkey'].nunique())

# Apply the shared column layout (desired_order is defined in the 1-minute cell)
df_all_5min = df_all_5min.reindex(columns=desired_order)

Number of unique players in df_all_5min:  11166
[  236   989  1263  2747  2867  2910  7401  7615 11112 12079 14062 15179
 15335 17811 19589 20213 22967 23808 23849 24056 28073 28463 30289 30291
 30427 30528 30532 30688 30769 30771 30838 30959 31043 31080 31111 31238
 31246 31263 31339 31387 31743 31782 31859 31877 31890 31919 31951 31990
 32051 32172 32174 32201 32205 32379 32479 32617 32657 32940 33133 33170
 33244 33292 33329 33570 33664 33990 34110 34116 34186 34264 34358 34576
 34589 34789 34872 34900 34997 35057 35124 35209 35230 35326 35359 35441
 35697 35720 35835 35855 35871 36112 36235 36247 36299 36441 36707 36879
 36897 36933 36950 36963 37163 37200 37212 37335 37657 37734 37753 37871
 37948 38067 38130 38381 38433 38448 38473 38480 38490 38514 38534 38535
 38691 38785 38792 38866 38867 38868 38885 38906 39056 39076 39104 39113
 39207 39241 39410 39473 39474 39533 39577 39689 39835 39883 40017 40141
 40349 40499 40693 40793 40794 40896 40957 41110 41473 41527 41572 41586
 41

## Eliminate Discrepancies

In [17]:
# Pool the NaN-affected player ids found in the 1- to 5-minute tables into one array
total_discrepancies = np.concatenate(
    (list_discrepancy, list_discrepancy_2, list_discrepancy_3, list_discrepancy_4, list_discrepancy_5),
    axis=0,
)

# This count still contains duplicates: a player flagged in several windows appears once per
# window, so it is NOT a number of unique players
print("Number of player entries in total_discrepancies (with duplicates): ", len(total_discrepancies))

# Deduplicate the pooled ids; np.unique returns them sorted, and .tolist() keeps
# total_unique_discrepancies a plain list as before
total_unique_discrepancies = np.unique(total_discrepancies).tolist()

# Number of distinct players flagged in at least one window
print("Number of unique players in total_unique_discrepancies: ", len(total_unique_discrepancies))

Number of unique players in total_discrepancies:  1215
Number of unique players in total_unique_discrepancies:  449


In [18]:
# Drop every player flagged in ANY window from ALL five tables, so the tables cover the same players
(
    df_all_1min,
    df_all_2min,
    df_all_3min,
    df_all_4min,
    df_all_5min,
) = (
    _table[~_table['playerkey'].isin(total_unique_discrepancies)]
    for _table in (df_all_1min, df_all_2min, df_all_3min, df_all_4min, df_all_5min)
)

# Report the number of distinct players left in each table
for _minutes, _table in (
    (1, df_all_1min),
    (2, df_all_2min),
    (3, df_all_3min),
    (4, df_all_4min),
    (5, df_all_5min),
):
    print(f"Number of unique players in df_all_{_minutes}min: ", _table['playerkey'].nunique())


Number of unique players in df_all_1min:  10717
Number of unique players in df_all_2min:  10717
Number of unique players in df_all_3min:  10717
Number of unique players in df_all_4min:  10717
Number of unique players in df_all_5min:  10717


In [19]:
# Write each per-window table to this month's Per_Player output folder as parquet
_per_player_dir = '/Users/mau/Library/CloudStorage/Dropbox/Mac/Documents/Dissertation/Chapter 2/Entire_Data/By month/' + month_file + '/Ending Balances/Per_Player/'
for _table, _file_name in (
    (df_all_1min, 'df_1min_ALL.parquet'),
    (df_all_2min, 'df_2min_ALL.parquet'),
    (df_all_3min, 'df_3min_ALL.parquet'),
    (df_all_4min, 'df_4min_ALL.parquet'),
    (df_all_5min, 'df_5min_ALL.parquet'),
):
    _table.to_parquet(_per_player_dir + _file_name)