# Similarities and Differences of Top vs. Bottom 20% (Pt.1 - Age, Gender, Won/Lost Amount)

In [1]:
# Define libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.dates as mdates
import os
import plotting_fn as pf
import counting_fns as cf

# Month sub-folder to process; appended to the base data path below.
month_file = '5_September'
# NOTE(review): cut_off is not referenced anywhere in the visible part of this notebook — confirm it is still needed.
cut_off = 10000
# Set working directory so that relative reads resolve inside the month folder.
# NOTE(review): this is an absolute, machine-specific path; the notebook will not run on another machine without editing it.
os.chdir("/Users/mau/Library/CloudStorage/Dropbox/Mac/Documents/Dissertation/Chapter 2/Entire_Data/By month/"+month_file)


In [2]:
# Load the per-gamble records.
# NOTE(review): the frame is called df_bottom_20 but is read from "top_players.parquet" — confirm the naming.
df_bottom_20 = pd.read_parquet("top_players.parquet")


def _has_multiple_gamble_values(player_rows):
    """Return True when a player's rows contain more than one distinct 'gambles' value."""
    return player_rows['gambles'].nunique() > 1


# Keep only players whose 'gambles' column takes more than one distinct value
df_bottom_20 = df_bottom_20.groupby('playerkey').filter(_has_multiple_gamble_values)

# Re-number the rows starting from 1
df_bottom_20.index = np.arange(1, len(df_bottom_20) + 1)

# Round the monetary / return columns to 1 decimal place
for rounded_col in ('wageredamt', 'profit', 'percent_return'):
    df_bottom_20[rounded_col] = df_bottom_20[rounded_col].round(1)


In [3]:
def _classify_return(pct_return):
    """Map a percent return to 'loss' (exactly -100), 'near-hit' (other negatives), 'gain' (positive) or 'draw' (anything else)."""
    if pct_return == -100:
        return 'loss'
    if pct_return < 0:
        return 'near-hit'
    if pct_return > 0:
        return 'gain'
    return 'draw'


# Categorical outcome of every gamble, derived from 'percent_return'
df_bottom_20['result_type'] = df_bottom_20['percent_return'].apply(_classify_return)

# One indicator column per outcome, named '#' + the capitalised first letter of the category
# (so 'draw' -> '#D', 'gain' -> '#G', 'loss' -> '#L', 'near-hit' -> '#N')
dummy_variables = pd.get_dummies(df_bottom_20['result_type'])
dummy_variables = dummy_variables.rename(columns=lambda label: '#' + str(label[0].capitalize()))

# Attach the indicator columns and restart the index at 0
df_bottom_20 = pd.concat([df_bottom_20, dummy_variables], axis=1).reset_index(drop=True)

# Convert start_time to datetime so it can be differenced
df_bottom_20['start_time'] = pd.to_datetime(df_bottom_20['start_time'])

# Seconds elapsed since the previous gamble of the same player and session_time;
# the first gamble of each group has no predecessor and gets 0
gap_to_previous = df_bottom_20.groupby(['playerkey', 'session_time'])['start_time'].diff()
df_bottom_20['time_diff'] = gap_to_previous.dt.total_seconds().fillna(0)

### Filter visit 1

In [4]:
# Keep only the rows where visit == 1.
# An explicit copy is taken because later cells add columns to this frame
# ('age_range', 'age_gen', 'depletion_rate'); assigning into a filtered slice can
# raise SettingWithCopyWarning and makes it unclear which object is modified.
df_bottom_20 = df_bottom_20[df_bottom_20['visit'] == 1].copy()

In [5]:
# Use the consecutive-wins helper to find 2, 3 and 4 wins in a row, per visit per player.
# NOTE(review): the helper is defined in plotting_fn; presumably it returns the frame with streak columns added — confirm.
df_bottom_20 = pf.consecutive_wins(df_bottom_20, 'visit')



In [6]:
# Use the consecutive-wins helper to find 2, 3 and 4 wins in a row, per session_time per player.
# NOTE(review): the result is stored in df_bottom_20_s, which is not used again in the visible part of
# this notebook — confirm whether later cells should use it, or whether this call is only needed for a side effect.
df_bottom_20_s = pf.consecutive_wins(df_bottom_20, 'session_time')


In [7]:
# Age brackets: each value in `bins` after the first is the inclusive upper edge of a bracket
bins = [0, 24, 40, 55, 75, 150]
labels = ['18-24', '25-40', '41-55', '56-75', '76+']
# NOTE(review): 'Millenials' is misspelled (should be 'Millennials'); it is left unchanged here
# because downstream code may match on this exact label.
generations = ['Gen Z', 'Millenials', 'Gen X', 'Baby Boomers', 'Silent']

# Assign each row an age bracket and a generation.
# right=True makes the intervals (lower, upper], so the boundary ages fall where the labels
# say they do: 24 -> '18-24', 40 -> '25-40', 55 -> '41-55', 75 -> '56-75'.
# (With right=False those ages were pushed into the next bracket, contradicting the labels.)
df_bottom_20['age_range'] = pd.cut(df_bottom_20['age'], bins=bins, labels=labels, right=True)
df_bottom_20['age_gen'] = pd.cut(df_bottom_20['age'], bins=bins, labels=generations, right=True)


In [8]:
# Count, per player ("playerkey"), increases and decreases of the slot denomination and of the max bet.
# The second argument is the name passed to the helper for each count; the helpers also print a summary.
# NOTE(review): presumably the helpers add a column with that name to the frame — verify in counting_fns.
# NOTE(review): the "_b20" suffix suggests "bottom 20%", yet this notebook writes "*_top_players" files — confirm naming.
players_increase_slot_b20 = cf.count_increase(df_bottom_20, "increase_slotdeno", "playerkey", "slotdenomination")
players_decrease_slot_b20 = cf.count_decrease(df_bottom_20, "decrease_slotdeno", "playerkey", "slotdenomination")
players_increase_maxbet_b20 = cf.count_increase(df_bottom_20, "increase_maxbet", "playerkey", "maxbet")
players_decrease_maxbet_b20 = cf.count_decrease(df_bottom_20, "decrease_maxbet", "playerkey", "maxbet")

Count of players who increase_slotdeno : 595
Count of times each player increase_slotdeno : {156: 2, 336: 2, 1438: 1043, 2073: 1, 4483: 14, 5179: 1, 6284: 1, 6695: 1, 8978: 2, 9191: 1, 9197: 3, 15956: 35, 21209: 2, 23329: 2, 30426: 3, 31137: 88, 31226: 1, 31456: 24, 31606: 163, 31698: 2, 31977: 1, 32192: 10, 32280: 1, 33286: 10, 33351: 2, 33458: 1, 33920: 150, 34837: 2, 35414: 7, 35559: 396, 35649: 1, 35670: 3, 35677: 1, 35865: 1, 35990: 1, 36119: 6, 36256: 166, 36358: 237, 36627: 2, 36691: 2, 36702: 3, 37008: 11, 37135: 8, 37798: 5, 38466: 2, 38956: 1, 39255: 11, 39286: 325, 39335: 4, 39419: 1, 39438: 2, 39463: 1, 39619: 2, 39754: 2, 39821: 8, 39848: 10, 39897: 3, 40104: 3, 40323: 2, 40344: 1, 40514: 6, 40693: 31, 40718: 1, 40977: 1, 41319: 6, 41839: 137, 332: 2, 646: 2, 719: 2, 841: 4, 1515: 1, 1715: 8, 2005: 4, 2332: 1, 2710: 2, 3010: 2, 3120: 6, 3127: 3, 3265: 4, 3708: 2, 4269: 1, 4699: 1, 6894: 3, 7257: 2, 7390: 1, 8370: 5, 8412: 3, 8749: 3, 9031: 1, 9202: 1, 9495: 1, 9976: 1, 121

In [9]:
# Create column 'depletion_rate': the change in 'playercashableamt' between the current gamble and the
# previous gamble of the same player and session_time (0 for the first gamble of each group)
cashable_by_session = df_bottom_20.groupby(['playerkey', 'session_time'])['playercashableamt']
df_bottom_20['depletion_rate'] = cashable_by_session.diff().fillna(0)


# Separate by time

In [10]:
def _window_of(minutes):
    """Apply pf.filter_dataframe_by_time to df_bottom_20 with a duration of `minutes` minutes."""
    return pf.filter_dataframe_by_time(df_bottom_20, pd.Timedelta(minutes=minutes))


# One frame per duration: 1, 2, 3, 4, 5, 10 and 15 minutes
df_bottom_1min = _window_of(1)
df_bottom_2min = _window_of(2)
df_bottom_3min = _window_of(3)
df_bottom_4min = _window_of(4)
df_bottom_5min = _window_of(5)
df_bottom_10min = _window_of(10)
df_bottom_15min = _window_of(15)

In [11]:
# Report how many distinct players appear in each of the 1- to 5-minute frames
windows_to_report = (
    ("1min", df_bottom_1min),
    ("2min", df_bottom_2min),
    ("3min", df_bottom_3min),
    ("4min", df_bottom_4min),
    ("5min", df_bottom_5min),
)
for window_label, window_frame in windows_to_report:
    print("Number of players in bottom " + window_label + ": ", window_frame['playerkey'].nunique())

Number of players in bottom 1min:  1122
Number of players in bottom 2min:  1122
Number of players in bottom 3min:  1122
Number of players in bottom 4min:  1122
Number of players in bottom 5min:  1122


#### First 1 minute

In [12]:
# Build the feature table for the 1-minute window, grouped by session_time
df_all_1min = pf.transform_ml(data_b=df_bottom_1min, grouping='session_time')
print("Number of unique players in df_all_1min: ", df_all_1min['playerkey'].nunique())

# Players with at least one row containing a NaN in any column
rows_with_nan_1 = df_all_1min.isna().any(axis=1)
list_discrepancy = df_all_1min.loc[rows_with_nan_1, 'playerkey'].unique()
print(list_discrepancy)

# Drop every row belonging to those players
rows_to_keep_1 = ~df_all_1min['playerkey'].isin(list_discrepancy)
df_all_1min = df_all_1min[rows_to_keep_1]
print("Number of unique players w/o discrepancies df_all_1min: ", df_all_1min['playerkey'].nunique())

# Column layout shared by all per-window feature tables (also used by the later cells).
# The grouping comments below only reflect the column names.
desired_order = [
    # identifiers and demographics
    'playerkey', 'session_time', 'gender', 'age_range', 'age_gen',
    # amounts / balances
    'beginning_amt', 'ending_amt', 'ending_balance',
    # slot denomination
    'ave_slotdenom', 'std_slotdenom', 'min_slotdenom', 'max_slotdenom',
    # theoretical payback
    'ave_theo_payback', 'min_theo_payback', 'max_theo_payback',
    # wagers
    'ave_wageramt', 'std_wageramt', 'min_wager', 'max_wager',
    # p/b, profit, depletion
    'ave_p/b', 'std_p/b', 'max_p/b', 'max_profit', 'depletion_slope',
    # max-bet changes
    '#inc_maxbet', '#dec_maxbet',
    # first and last gamble
    'first_wager', 'first_outcome', 'first_p/b',
    'last_wager', 'last_outcome', 'last_p/b',
    # outcome counts and rates
    '#W', '#L', '#NH', '#D',
    'w/min', 'l/min', 'nh/min', 'd/min',
    'w/g', 'l/g', 'nh/g', 'd/g',
    # win streaks
    '#2ws', '2ws_profit', '2ws_wgramt',
    '#3ws', '3ws_profit', '3ws_wgramt',
    '#4ws', '4ws_profit', '4ws_wgramt',
    '2ws/min', '3ws/min', '4ws/min',
    # timing and volume
    'ave_time_per_gamble', 'min_time_per_gamble', 'max_time_per_gamble',
    'total_duration', 'total_gambles',
    # machines
    'machines_changes', '#inc_slotdenom', '#dec_slotdenom',
    'unique_machines', 'ave_time_per_machine',
    'classification',
]

# Reorder columns (reindex creates an all-NaN column for any name that is missing)
df_all_1min = df_all_1min.reindex(columns=desired_order)

Number of unique players in df_all_1min:  1122
[ 2431  7400 20072 23808 26097 30259 30400 30688 30959 31956 32436 32479
 32793 34900 35265 36674 39821 40017 40053 40307 40693 40992 41489]
Number of unique players w/o discrepancies df_all_1min:  1099


#### First 2 minutes

In [13]:
# Build the feature table for the 2-minute window, grouped by session_time
df_all_2min = pf.transform_ml(data_b=df_bottom_2min, grouping='session_time')
print("Number of unique players in df_all_2min: ", df_all_2min['playerkey'].nunique())

# Players with at least one row containing a NaN in any column
rows_with_nan_2 = df_all_2min.isna().any(axis=1)
list_discrepancy_2 = df_all_2min.loc[rows_with_nan_2, 'playerkey'].unique()
print(list_discrepancy_2)

# Drop every row belonging to those players
rows_to_keep_2 = ~df_all_2min['playerkey'].isin(list_discrepancy_2)
df_all_2min = df_all_2min[rows_to_keep_2]
print("Number of unique players w/o discrepancies df_all_2min: ", df_all_2min['playerkey'].nunique())

# Reorder columns to the shared layout
df_all_2min = df_all_2min.reindex(columns=desired_order)

Number of unique players in df_all_2min:  1122
[ 7400 23808 26097 30259 30688 30959 31956 32479 34900 39821 40017 40307
 40992]
Number of unique players w/o discrepancies df_all_2min:  1109


#### First 3 minutes

In [14]:
# Build the feature table for the 3-minute window, grouped by session_time
df_all_3min = pf.transform_ml(data_b=df_bottom_3min, grouping='session_time')
print("Number of unique players in df_all_3min: ", df_all_3min['playerkey'].nunique())

# Players with at least one row containing a NaN in any column
rows_with_nan_3 = df_all_3min.isna().any(axis=1)
list_discrepancy_3 = df_all_3min.loc[rows_with_nan_3, 'playerkey'].unique()
print(list_discrepancy_3)

# Drop every row belonging to those players
rows_to_keep_3 = ~df_all_3min['playerkey'].isin(list_discrepancy_3)
df_all_3min = df_all_3min[rows_to_keep_3]
print("Number of unique players w/o discrepancies df_all_3min: ", df_all_3min['playerkey'].nunique())

# Reorder columns to the shared layout
df_all_3min = df_all_3min.reindex(columns=desired_order)

Number of unique players in df_all_3min:  1122
[23808 26097 30688 30959 32479 34900 40017]
Number of unique players w/o discrepancies df_all_3min:  1115


#### First 4 minutes

In [15]:
# Build the feature table for the 4-minute window, grouped by session_time
df_all_4min = pf.transform_ml(data_b=df_bottom_4min, grouping='session_time')
print("Number of unique players in df_all_4min: ", df_all_4min['playerkey'].nunique())

# Players with at least one row containing a NaN in any column
rows_with_nan_4 = df_all_4min.isna().any(axis=1)
list_discrepancy_4 = df_all_4min.loc[rows_with_nan_4, 'playerkey'].unique()
print(list_discrepancy_4)

# Drop every row belonging to those players
rows_to_keep_4 = ~df_all_4min['playerkey'].isin(list_discrepancy_4)
df_all_4min = df_all_4min[rows_to_keep_4]
print("Number of unique players w/o discrepancies df_all_4min: ", df_all_4min['playerkey'].nunique())

# Reorder columns to the shared layout
df_all_4min = df_all_4min.reindex(columns=desired_order)

Number of unique players in df_all_4min:  1122
[23808 26097 30688 30959 32479 34900 40017]
Number of unique players w/o discrepancies df_all_4min:  1115


#### First 5 minutes

In [16]:
# Build the feature table for the 5-minute window, grouped by session_time
df_all_5min = pf.transform_ml(data_b=df_bottom_5min, grouping='session_time')
print("Number of unique players in df_all_5min: ", df_all_5min['playerkey'].nunique())

# Players with at least one row containing a NaN in any column
rows_with_nan_5 = df_all_5min.isna().any(axis=1)
list_discrepancy_5 = df_all_5min.loc[rows_with_nan_5, 'playerkey'].unique()
print(list_discrepancy_5)

# Drop every row belonging to those players
rows_to_keep_5 = ~df_all_5min['playerkey'].isin(list_discrepancy_5)
df_all_5min = df_all_5min[rows_to_keep_5]
print("Number of unique players w/o discrepancies df_all_5min: ", df_all_5min['playerkey'].nunique())

# Reorder columns to the shared layout
df_all_5min = df_all_5min.reindex(columns=desired_order)

Number of unique players in df_all_5min:  1122
[23808 30688 30959 32479 34900 40017 40693]
Number of unique players w/o discrepancies df_all_5min:  1115


#### First 10 minutes

In [17]:
# Build the feature table for the 10-minute window, grouped by session_time
df_all_10min = pf.transform_ml(data_b=df_bottom_10min, grouping='session_time')
print("Number of unique players in df_all_10min: ", df_all_10min['playerkey'].nunique())

# Players with at least one row containing a NaN in any column
rows_with_nan_10 = df_all_10min.isna().any(axis=1)
list_discrepancy_10 = df_all_10min.loc[rows_with_nan_10, 'playerkey'].unique()
print(list_discrepancy_10)

# Drop every row belonging to those players
rows_to_keep_10 = ~df_all_10min['playerkey'].isin(list_discrepancy_10)
df_all_10min = df_all_10min[rows_to_keep_10]
print("Number of unique players w/o discrepancies df_all_10min: ", df_all_10min['playerkey'].nunique())

# Reorder columns to the shared layout
df_all_10min = df_all_10min.reindex(columns=desired_order)

Number of unique players in df_all_10min:  1122
[32479 38767 40017 40693]
Number of unique players w/o discrepancies df_all_10min:  1118


#### First 15 minutes

In [18]:
# Build the feature table for the 15-minute window, grouped by session_time
df_all_15min = pf.transform_ml(data_b=df_bottom_15min, grouping='session_time')
print("Number of unique players in df_all_15min: ", df_all_15min['playerkey'].nunique())

# Players with at least one row containing a NaN in any column
rows_with_nan_15 = df_all_15min.isna().any(axis=1)
list_discrepancy_15 = df_all_15min.loc[rows_with_nan_15, 'playerkey'].unique()
print(list_discrepancy_15)

# Drop every row belonging to those players from df_all_15min
rows_to_keep_15 = ~df_all_15min['playerkey'].isin(list_discrepancy_15)
df_all_15min = df_all_15min[rows_to_keep_15]
print("Number of unique players w/o discrepancies df_all_15min: ", df_all_15min['playerkey'].nunique())

# Reorder columns to the shared layout
df_all_15min = df_all_15min.reindex(columns=desired_order)

Number of unique players in df_all_15min:  1122
[38767 40017 40693]
Number of unique players w/o discrepancies df_all_15min:  1119


## Eliminate Discrepancies

In [19]:
# Stack the per-window arrays of flagged player keys into one array.
# A player flagged in several windows appears several times here.
per_window_discrepancies = [
    list_discrepancy,
    list_discrepancy_2,
    list_discrepancy_3,
    list_discrepancy_4,
    list_discrepancy_5,
    list_discrepancy_10,
    list_discrepancy_15,
]
total_discrepancies = np.concatenate(per_window_discrepancies, axis=0)

# Length of the stacked array (duplicates included)
print("Number of unique players in total_discrepancies: ", len(total_discrepancies))

# Collapse to distinct player keys
total_unique_discrepancies = [*set(total_discrepancies)]

# Number of distinct flagged players
print("Number of unique players in total_unique_discrepancies: ", len(total_unique_discrepancies))

Number of unique players in total_discrepancies:  64
Number of unique players in total_unique_discrepancies:  24


In [20]:
def _without_flagged(frame):
    """Return `frame` without the rows whose playerkey is in total_unique_discrepancies."""
    flagged = frame['playerkey'].isin(total_unique_discrepancies)
    return frame[~flagged]


# Remove every flagged player from all window tables so they share the same set of players
df_all_1min = _without_flagged(df_all_1min)
df_all_2min = _without_flagged(df_all_2min)
df_all_3min = _without_flagged(df_all_3min)
df_all_4min = _without_flagged(df_all_4min)
df_all_5min = _without_flagged(df_all_5min)
df_all_10min = _without_flagged(df_all_10min)
df_all_15min = _without_flagged(df_all_15min)

# Report the number of distinct players left in each table
remaining_tables = (
    ("df_all_1min", df_all_1min),
    ("df_all_2min", df_all_2min),
    ("df_all_3min", df_all_3min),
    ("df_all_4min", df_all_4min),
    ("df_all_5min", df_all_5min),
    ("df_all_10min", df_all_10min),
    ("df_all_15min", df_all_15min),
)
for table_name, table in remaining_tables:
    print("Number of unique players in " + table_name + ": ", table['playerkey'].nunique())


Number of unique players in df_all_1min:  1098
Number of unique players in df_all_2min:  1098
Number of unique players in df_all_3min:  1098
Number of unique players in df_all_4min:  1098
Number of unique players in df_all_5min:  1098
Number of unique players in df_all_10min:  1098
Number of unique players in df_all_15min:  1098


In [21]:
# Write each window table to parquet under <month folder>/Ending Balances/Per_Player/
# NOTE(review): absolute, machine-specific path — the notebook will not run elsewhere without editing it.
output_dir = (
    '/Users/mau/Library/CloudStorage/Dropbox/Mac/Documents/Dissertation/Chapter 2/Entire_Data/By month/'
    + month_file
    + '/Ending Balances/Per_Player/'
)
tables_to_save = (
    ('1min', df_all_1min),
    ('2min', df_all_2min),
    ('3min', df_all_3min),
    ('4min', df_all_4min),
    ('5min', df_all_5min),
    ('10min', df_all_10min),
    ('15min', df_all_15min),
)
for window_label, table in tables_to_save:
    table.to_parquet(output_dir + 'df_' + window_label + '_top_players.parquet')