# Similarities and Differences of Top vs. Bottom 20% (Pt.1 - Age, Gender, Won/Lost Amount)

In [1]:
# Define libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.dates as mdates
import os
import plotting_fn as pf
import counting_fns as cf

# Month folder to process; cut_off is kept as a notebook-level constant
# (it is not referenced again in the cells visible here).
month_file = '5_September'
cut_off = 10000
# Make this month's data folder the working directory so the relative
# parquet reads below resolve against it.
data_root = "/Users/mau/Library/CloudStorage/Dropbox/Mac/Documents/Dissertation/Chapter 2/Entire_Data/By month/"
os.chdir(data_root + month_file)


In [2]:
def _load_repeat_players(parquet_name):
    """Read one cohort's parquet file and keep only multi-gamble players.

    A player is kept when their 'gambles' column takes more than one
    distinct value. The returned frame is indexed 1..len(frame).
    """
    gambles = pd.read_parquet(parquet_name)
    kept = gambles.groupby('playerkey').filter(lambda grp: grp['gambles'].nunique() > 1)
    # Replace the index with a 1-based running row number
    kept.index = np.arange(1, len(kept) + 1)
    return kept


# Load both cohorts from the current working directory
df_bottom_20 = _load_repeat_players("Bottom_20_gambles.parquet")
df_top_20 = _load_repeat_players("Top_20_gambles.parquet")

# Round wageredamt, profit and percent_return to 1 decimal place in both cohorts
rounded_cols = ['wageredamt', 'profit', 'percent_return']
for cohort in (df_bottom_20, df_top_20):
    for col in rounded_cols:
        cohort[col] = cohort[col].round(1)

In [3]:
def _label_outcome(pct_return):
    """Classify one percent_return value.

    Returns 'loss' when the return is exactly -100, 'near-hit' for any other
    negative return, 'gain' for a positive return, and 'draw' otherwise.
    """
    if pct_return == -100:
        return 'loss'
    if pct_return < 0:
        return 'near-hit'
    if pct_return > 0:
        return 'gain'
    return 'draw'


def _short_code(category):
    """Turn a category name into '#' + its capitalised first letter (e.g. 'loss' -> '#L')."""
    return '#' + str(category[0].capitalize())


# Categorical outcome of every gamble, for both cohorts
df_bottom_20['result_type'] = df_bottom_20['percent_return'].apply(_label_outcome)
df_top_20['result_type'] = df_top_20['percent_return'].apply(_label_outcome)

# One indicator column per outcome present in the data, appended to the
# bottom 20% frame; the index is reset to 0..n-1 afterwards
dummy_variables = pd.get_dummies(df_bottom_20['result_type']).rename(columns=_short_code)
df_bottom_20 = pd.concat([df_bottom_20, dummy_variables], axis=1).reset_index(drop=True)

# Same indicator columns for the top 20% frame
dummy_variables = pd.get_dummies(df_top_20['result_type']).rename(columns=_short_code)
df_top_20 = pd.concat([df_top_20, dummy_variables], axis=1).reset_index(drop=True)

# For each cohort: parse 'start_time' as datetimes, then store in 'time_diff'
# the seconds elapsed since the previous row of the same player and
# session_time (0 for the first row of each group, where there is no previous row)
for cohort in (df_bottom_20, df_top_20):
    cohort['start_time'] = pd.to_datetime(cohort['start_time'])
    gaps = cohort.groupby(['playerkey', 'session_time'])['start_time'].diff()
    cohort['time_diff'] = gaps.dt.total_seconds().fillna(0)

### Filter visit 1

In [4]:
# Keep only the rows whose 'visit' value is 1, in both cohorts
bottom_first_visit = df_bottom_20['visit'] == 1
top_first_visit = df_top_20['visit'] == 1
df_bottom_20 = df_bottom_20.loc[bottom_first_visit]
df_top_20 = df_top_20.loc[top_first_visit]

In [5]:
# Use the consecutive-wins helper, grouped by 'visit', on both cohorts
# (presumably flags 2/3/4 wins in a row per player — see plotting_fn for details)
df_bottom_20, df_top_20 = (
    pf.consecutive_wins(cohort, 'visit') for cohort in (df_bottom_20, df_top_20)
)


In [6]:
# Use the consecutive-wins helper again, this time grouped by 'session_time';
# the results are stored under separate *_s names
df_bottom_20_s, df_top_20_s = (
    pf.consecutive_wins(cohort, 'session_time') for cohort in (df_bottom_20, df_top_20)
)

In [7]:
# Create age ranges
# Bin edges are the UPPER bound of each labelled range, so the bins must be
# closed on the right: (0, 24], (24, 40], (40, 55], (55, 75], (75, 150].
bins = [0, 24, 40, 55, 75, 150]
labels = ['18-24', '25-40', '41-55', '56-75', '76+']
generations = ['Gen Z', 'Millenials', 'Gen X', 'Baby Boomers', 'Silent']

# Use cut to assign an age range and a generation to every row of both cohorts.
# right=True keeps boundary ages inside the range named by the label
# (24 -> '18-24', 40 -> '25-40', 55 -> '41-55', 75 -> '56-75'); with
# right=False each of those ages was pushed into the next, older bucket.
# include_lowest=True keeps the lowest edge (0) inside the first bin.
for cohort in (df_bottom_20, df_top_20):
    cohort['age_range'] = pd.cut(cohort['age'], bins=bins, labels=labels, right=True, include_lowest=True)
    cohort['age_gen'] = pd.cut(cohort['age'], bins=bins, labels=generations, right=True, include_lowest=True)

In [8]:
# Count how often top 20% players raise or lower their slot denomination
# (the cf helpers print their own summaries, shown in the cell output)
players_increase_slot_t20, players_decrease_slot_t20 = (
    cf.count_increase(df_top_20, "increase_slotdeno", "playerkey", "slotdenomination"),
    cf.count_decrease(df_top_20, "decrease_slotdeno", "playerkey", "slotdenomination"),
)
# Same counts for changes in the max bet
players_increase_maxbet_t20, players_decrease_maxbet_t20 = (
    cf.count_increase(df_top_20, "increase_maxbet", "playerkey", "maxbet"),
    cf.count_decrease(df_top_20, "decrease_maxbet", "playerkey", "maxbet"),
)

Count of players who increase_slotdeno : 1246
Count of times each player increase_slotdeno : {14: 6, 73: 17, 156: 2, 332: 2, 336: 2, 450: 1, 646: 2, 719: 2, 841: 4, 901: 15, 978: 7, 1032: 1, 1113: 118, 1337: 236, 1371: 4, 1408: 1, 1438: 1043, 1515: 1, 1553: 2, 1562: 2, 1563: 1, 1599: 5, 1660: 1, 1699: 1, 1715: 8, 1757: 139, 1909: 2, 1950: 2, 1959: 1, 2004: 1, 2005: 4, 2054: 4, 2067: 1, 2073: 1, 2214: 1, 2332: 1, 2341: 3, 2413: 1, 2570: 2, 2595: 4, 2631: 114, 2710: 2, 2816: 1, 2817: 1, 2981: 2, 2993: 37, 3010: 2, 3120: 6, 3127: 3, 3231: 1, 3265: 4, 3271: 1, 3360: 1, 3708: 2, 3756: 1, 3933: 2, 3955: 1, 4022: 1, 4051: 1, 4269: 1, 4270: 1, 4304: 2, 4333: 1, 4483: 14, 4508: 1, 4699: 1, 4874: 2, 4885: 2, 5083: 3, 5179: 1, 5242: 1, 5482: 65, 5525: 1, 5675: 1, 5713: 129, 5789: 1, 5922: 1, 5932: 1, 6090: 67, 6142: 4, 6252: 1, 6284: 1, 6314: 2, 6488: 2, 6502: 2, 6525: 872, 6556: 5, 6596: 11, 6605: 2, 6695: 1, 6894: 3, 6897: 1, 7119: 1, 7167: 2, 7257: 2, 7341: 3, 7390: 1, 7400: 3, 7695: 3, 7746: 

In [9]:
# Count how often bottom 20% players raise or lower their slot denomination
# (the cf helpers print their own summaries, shown in the cell output)
players_increase_slot_b20, players_decrease_slot_b20 = (
    cf.count_increase(df_bottom_20, "increase_slotdeno", "playerkey", "slotdenomination"),
    cf.count_decrease(df_bottom_20, "decrease_slotdeno", "playerkey", "slotdenomination"),
)
# Same counts for changes in the max bet
players_increase_maxbet_b20, players_decrease_maxbet_b20 = (
    cf.count_increase(df_bottom_20, "increase_maxbet", "playerkey", "maxbet"),
    cf.count_decrease(df_bottom_20, "decrease_maxbet", "playerkey", "maxbet"),
)

Count of players who increase_slotdeno : 152
Count of times each player increase_slotdeno : {184: 1, 7401: 1, 8092: 1, 8499: 1, 9255: 1, 18748: 1, 20635: 1, 21167: 1, 23758: 1, 26876: 1, 28197: 1, 30170: 1, 30198: 1, 30234: 1, 30254: 1, 30336: 1, 30492: 1, 30512: 1, 30573: 1, 30629: 1, 30638: 1, 30927: 2, 30971: 1, 31040: 1, 31099: 1, 31171: 1, 31292: 1, 31485: 1, 31502: 1, 31531: 1, 31614: 1, 31669: 2, 31697: 1, 31777: 1, 31888: 1, 32134: 1, 32164: 4, 32450: 1, 32645: 1, 32930: 1, 32952: 2, 32986: 1, 33090: 1, 33149: 1, 33262: 1, 33308: 1, 33362: 1, 33380: 1, 33388: 10, 33678: 1, 33766: 1, 33825: 1, 33848: 1, 33856: 1, 33897: 1, 34075: 1, 34159: 1, 34198: 1, 34300: 1, 34439: 2, 34472: 1, 34477: 1, 34741: 1, 34789: 1, 34827: 1, 34919: 1, 34978: 1, 35081: 1, 35083: 1, 35101: 1, 35485: 1, 35509: 1, 35512: 1, 35563: 1, 35824: 1, 35853: 1, 35898: 1, 36012: 1, 36021: 1, 36153: 1, 36255: 2, 36326: 1, 36345: 1, 36464: 1, 36672: 1, 36862: 1, 36877: 2, 36930: 1, 36973: 1, 37200: 2, 37354: 1, 37

In [10]:
# Add 'depletion_rate': the change in 'playercashableamt' from the previous row
# of the same player and session_time (0 for the first row of each group)
for cohort in (df_bottom_20, df_top_20):
    balance_by_session = cohort.groupby(['playerkey', 'session_time'])['playercashableamt']
    cohort['depletion_rate'] = balance_by_session.diff().fillna(0)


# Separate by time

In [11]:
# Build one bottom 20% frame per time window (1, 2, 3, 4, 5, 10 and 15 minutes)
# by passing the matching Timedelta to pf.filter_dataframe_by_time
(
    df_bottom_1min,
    df_bottom_2min,
    df_bottom_3min,
    df_bottom_4min,
    df_bottom_5min,
    df_bottom_10min,
    df_bottom_15min,
) = (
    pf.filter_dataframe_by_time(df_bottom_20, pd.Timedelta(minutes=window))
    for window in (1, 2, 3, 4, 5, 10, 15)
)

In [12]:
# Report how many distinct players appear in each of the 1-5 minute bottom 20% frames
bottom_windows = (
    ('1min', df_bottom_1min),
    ('2min', df_bottom_2min),
    ('3min', df_bottom_3min),
    ('4min', df_bottom_4min),
    ('5min', df_bottom_5min),
)
for window, frame in bottom_windows:
    print(f"Number of players in bottom {window}: ", frame['playerkey'].nunique())

Number of players in bottom 1min:  2330
Number of players in bottom 2min:  2330
Number of players in bottom 3min:  2330
Number of players in bottom 4min:  2330
Number of players in bottom 5min:  2330


In [13]:
# Build one top 20% frame per time window (1, 2, 3, 4, 5, 10 and 15 minutes)
# by passing the matching Timedelta to pf.filter_dataframe_by_time
(
    df_top_1min,
    df_top_2min,
    df_top_3min,
    df_top_4min,
    df_top_5min,
    df_top_10min,
    df_top_15min,
) = (
    pf.filter_dataframe_by_time(df_top_20, pd.Timedelta(minutes=window))
    for window in (1, 2, 3, 4, 5, 10, 15)
)

In [14]:
# Report how many distinct players appear in each of the 1-5 minute top 20% frames
top_windows = (
    ('1min', df_top_1min),
    ('2min', df_top_2min),
    ('3min', df_top_3min),
    ('4min', df_top_4min),
    ('5min', df_top_5min),
)
for window, frame in top_windows:
    print(f"Number of players in top {window}: ", frame['playerkey'].nunique())

Number of players in top 1min:  2385
Number of players in top 2min:  2385
Number of players in top 3min:  2385
Number of players in top 4min:  2385
Number of players in top 5min:  2385


#### First 1 minute

In [15]:
# Build the per-player summary frame for the first 1 minute from both cohorts
df_all_1min = pf.merge_dfs_per_player(data_t=df_top_1min, data_b=df_bottom_1min, grouping='session_time', print_results=False)

# Print number of unique players
print("Number of unique players in df_all_1min: ", df_all_1min['playerkey'].nunique())

# Get the IDs of players who have NaN values in any of the columns
list_discrepancy = df_all_1min[df_all_1min.isna().any(axis=1)]['playerkey'].unique()

# print number of discrepancies
print("Number of discrepancies:", len(list_discrepancy))

# Simultaneous Play
# NOTE(review): 40000 is passed straight through to pf.simultaneous_play; its
# meaning is defined in plotting_fn — confirm and consider naming it as a constant.
df_b20_sim_v = pf.simultaneous_play(40000, df_bottom_1min, 'session_time')
df_t20_sim_v = pf.simultaneous_play(40000, df_top_1min, 'session_time')

# Concat the two dataframes
df_sim_v = pd.concat([df_b20_sim_v, df_t20_sim_v]).reset_index()

# Remove repeated instances
df_sim_no_repeat_v = df_sim_v.drop_duplicates(subset=['session_time', 'playerkey'])

# Merge Sim play: flag every (playerkey, session_time) pair that appears in the
# simultaneous-play frame. The lookup set is built once here instead of inside
# the lambda, where it used to be rebuilt for every single row.
sim_pairs = set(map(tuple, df_sim_no_repeat_v[['playerkey', 'session_time']].values))
df_all_1min['sim_play'] = df_all_1min[['playerkey', 'session_time']].apply(lambda row: tuple(row) in sim_pairs, axis=1)

# Reorder Columns (this list is reused by the other time-window cells)
desired_order = ['playerkey', 'session_time', 'gender', 'age_range', 'age_gen',
                'beginning_amt', 'ending_amt', 'ending_balance', 'ave_slotdenom',
                'std_slotdenom', 'min_slotdenom', 'max_slotdenom', 'ave_theo_payback',
                'min_theo_payback', 'max_theo_payback', 'ave_wageramt', 'std_wageramt',
                'min_wager', 'max_wager', 'ave_p/b', 'std_p/b', 'max_p/b', 'max_profit', 'depletion_slope',
                '#inc_maxbet', '#dec_maxbet', 'first_wager', 'first_outcome', 'first_p/b', 'last_wager',
                'last_outcome', 'last_p/b', '#W', '#L', '#NH', '#D', 'w/min', 'l/min', 'nh/min', 'd/min', 
                'w/g', 'l/g', 'nh/g', 'd/g', '#2ws', '2ws_profit', '2ws_wgramt', '#3ws', 
                '3ws_profit','3ws_wgramt', '#4ws', '4ws_profit', '4ws_wgramt', '2ws/min', '3ws/min', '4ws/min',
                'ave_time_per_gamble', 'min_time_per_gamble', 'max_time_per_gamble',
                'total_duration', 'total_gambles', 'machines_changes', '#inc_slotdenom', '#dec_slotdenom',
                'unique_machines', 'ave_time_per_machine', 'sim_play','percentile']

df_all_1min = df_all_1min.reindex(columns=desired_order)

Number of unique players in df_all_1min:  4715
Number of discrepancies: 192


#### First 2 minutes

In [16]:
# Build the per-player summary frame for the first 2 minutes from both cohorts
df_all_2min = pf.merge_dfs_per_player(data_t=df_top_2min, data_b=df_bottom_2min, grouping='session_time', print_results=False)

# Print number of unique players
print("Number of unique players in df_all_2min: ", df_all_2min['playerkey'].nunique())

# Get the IDs of players who have NaN values in any of the columns
list_discrepancy2 = df_all_2min[df_all_2min.isna().any(axis=1)]['playerkey'].unique()

# print number of discrepancies
print("Number of discrepancies:", len(list_discrepancy2))

# Simultaneous Play
# NOTE(review): 40000 is passed straight through to pf.simultaneous_play — confirm its meaning in plotting_fn.
df_b20_sim_v = pf.simultaneous_play(40000, df_bottom_2min, 'session_time')
df_t20_sim_v = pf.simultaneous_play(40000, df_top_2min, 'session_time')

# Concat the two dataframes
df_sim_v = pd.concat([df_b20_sim_v, df_t20_sim_v]).reset_index()

# Remove repeated instances
df_sim_no_repeat_v = df_sim_v.drop_duplicates(subset=['session_time', 'playerkey'])

# Merge Sim play: flag every (playerkey, session_time) pair that appears in the
# simultaneous-play frame. The lookup set is built once here instead of inside
# the lambda, where it used to be rebuilt for every single row.
sim_pairs = set(map(tuple, df_sim_no_repeat_v[['playerkey', 'session_time']].values))
df_all_2min['sim_play'] = df_all_2min[['playerkey', 'session_time']].apply(lambda row: tuple(row) in sim_pairs, axis=1)

# Reorder Columns (desired_order is defined in the "First 1 minute" cell)
df_all_2min = df_all_2min.reindex(columns=desired_order)

Number of unique players in df_all_2min:  4715
Number of discrepancies: 110


#### First 3 minutes

In [17]:
# Build the per-player summary frame for the first 3 minutes from both cohorts
df_all_3min = pf.merge_dfs_per_player(data_t=df_top_3min, data_b=df_bottom_3min, grouping='session_time', print_results=False)

# Print number of unique players
print("Number of unique players in df_all_3min: ", df_all_3min['playerkey'].nunique())

# Get the IDs of players who have NaN values in any of the columns
list_discrepancy3 = df_all_3min[df_all_3min.isna().any(axis=1)]['playerkey'].unique()

# print number of discrepancies
print("Number of discrepancies:", len(list_discrepancy3))

# Simultaneous Play
# NOTE(review): 40000 is passed straight through to pf.simultaneous_play — confirm its meaning in plotting_fn.
df_b20_sim_v = pf.simultaneous_play(40000, df_bottom_3min, 'session_time')
df_t20_sim_v = pf.simultaneous_play(40000, df_top_3min, 'session_time')

# Concat the two dataframes
df_sim_v = pd.concat([df_b20_sim_v, df_t20_sim_v]).reset_index()

# Remove repeated instances
df_sim_no_repeat_v = df_sim_v.drop_duplicates(subset=['session_time', 'playerkey'])

# Merge Sim play: flag every (playerkey, session_time) pair that appears in the
# simultaneous-play frame. The lookup set is built once here instead of inside
# the lambda, where it used to be rebuilt for every single row.
sim_pairs = set(map(tuple, df_sim_no_repeat_v[['playerkey', 'session_time']].values))
df_all_3min['sim_play'] = df_all_3min[['playerkey', 'session_time']].apply(lambda row: tuple(row) in sim_pairs, axis=1)

# Reorder Columns (desired_order is defined in the "First 1 minute" cell)
df_all_3min = df_all_3min.reindex(columns=desired_order)

Number of unique players in df_all_3min:  4715
Number of discrepancies: 91


#### First 4 minutes

In [18]:
# Build the per-player summary frame for the first 4 minutes from both cohorts
df_all_4min = pf.merge_dfs_per_player(data_t=df_top_4min, data_b=df_bottom_4min, grouping='session_time', print_results=False)

# Print number of unique players
print("Number of unique players in df_all_4min: ", df_all_4min['playerkey'].nunique())

# Get the IDs of players who have NaN values in any of the columns
list_discrepancy4 = df_all_4min[df_all_4min.isna().any(axis=1)]['playerkey'].unique()

# print number of discrepancies
print("Number of discrepancies:", len(list_discrepancy4))

# Simultaneous Play
# NOTE(review): 40000 is passed straight through to pf.simultaneous_play — confirm its meaning in plotting_fn.
df_b20_sim_v = pf.simultaneous_play(40000, df_bottom_4min, 'session_time')
df_t20_sim_v = pf.simultaneous_play(40000, df_top_4min, 'session_time')

# Concat the two dataframes
df_sim_v = pd.concat([df_b20_sim_v, df_t20_sim_v]).reset_index()

# Remove repeated instances
df_sim_no_repeat_v = df_sim_v.drop_duplicates(subset=['session_time', 'playerkey'])

# Merge Sim play: flag every (playerkey, session_time) pair that appears in the
# simultaneous-play frame. The lookup set is built once here instead of inside
# the lambda, where it used to be rebuilt for every single row.
sim_pairs = set(map(tuple, df_sim_no_repeat_v[['playerkey', 'session_time']].values))
df_all_4min['sim_play'] = df_all_4min[['playerkey', 'session_time']].apply(lambda row: tuple(row) in sim_pairs, axis=1)

# Reorder Columns (desired_order is defined in the "First 1 minute" cell)
df_all_4min = df_all_4min.reindex(columns=desired_order)

Number of unique players in df_all_4min:  4715
Number of discrepancies: 80


#### First 5 minutes

In [19]:
# Build the per-player summary frame for the first 5 minutes from both cohorts
df_all_5min = pf.merge_dfs_per_player(data_t=df_top_5min, data_b=df_bottom_5min, grouping='session_time', print_results=False)

# Print number of unique players
print("Number of unique players in df_all_5min: ", df_all_5min['playerkey'].nunique())

# Get the IDs of players who have NaN values in any of the columns
list_discrepancy5 = df_all_5min[df_all_5min.isna().any(axis=1)]['playerkey'].unique()

# print number of discrepancies
print("Number of discrepancies:", len(list_discrepancy5))

# Simultaneous Play
# NOTE(review): 40000 is passed straight through to pf.simultaneous_play — confirm its meaning in plotting_fn.
df_b20_sim_v = pf.simultaneous_play(40000, df_bottom_5min, 'session_time')
df_t20_sim_v = pf.simultaneous_play(40000, df_top_5min, 'session_time')

# Concat the two dataframes
df_sim_v = pd.concat([df_b20_sim_v, df_t20_sim_v]).reset_index()

# Remove repeated instances
df_sim_no_repeat_v = df_sim_v.drop_duplicates(subset=['session_time', 'playerkey'])

# Merge Sim play: flag every (playerkey, session_time) pair that appears in the
# simultaneous-play frame. The lookup set is built once here instead of inside
# the lambda, where it used to be rebuilt for every single row.
sim_pairs = set(map(tuple, df_sim_no_repeat_v[['playerkey', 'session_time']].values))
df_all_5min['sim_play'] = df_all_5min[['playerkey', 'session_time']].apply(lambda row: tuple(row) in sim_pairs, axis=1)

# Reorder Columns (desired_order is defined in the "First 1 minute" cell)
df_all_5min = df_all_5min.reindex(columns=desired_order)

Number of unique players in df_all_5min:  4715
Number of discrepancies: 76


#### First 10 minutes

In [20]:
# Build the per-player summary frame for the first 10 minutes from both cohorts
df_all_10min = pf.merge_dfs_per_player(data_t=df_top_10min, data_b=df_bottom_10min, grouping='session_time', print_results=False)

# Print number of unique players
print("Number of unique players in df_all_10min: ", df_all_10min['playerkey'].nunique())

# Get the IDs of players who have NaN values in any of the columns
list_discrepancy10 = df_all_10min[df_all_10min.isna().any(axis=1)]['playerkey'].unique()

# print number of discrepancies
print("Number of discrepancies:", len(list_discrepancy10))

# Simultaneous Play
# NOTE(review): 40000 is passed straight through to pf.simultaneous_play — confirm its meaning in plotting_fn.
df_b20_sim_v = pf.simultaneous_play(40000, df_bottom_10min, 'session_time')
df_t20_sim_v = pf.simultaneous_play(40000, df_top_10min, 'session_time')

# Concat the two dataframes
df_sim_v = pd.concat([df_b20_sim_v, df_t20_sim_v]).reset_index()

# Remove repeated instances
df_sim_no_repeat_v = df_sim_v.drop_duplicates(subset=['session_time', 'playerkey'])

# Merge Sim play: flag every (playerkey, session_time) pair that appears in the
# simultaneous-play frame. The lookup set is built once here instead of inside
# the lambda, where it used to be rebuilt for every single row.
sim_pairs = set(map(tuple, df_sim_no_repeat_v[['playerkey', 'session_time']].values))
df_all_10min['sim_play'] = df_all_10min[['playerkey', 'session_time']].apply(lambda row: tuple(row) in sim_pairs, axis=1)

# Reorder Columns (desired_order is defined in the "First 1 minute" cell)
df_all_10min = df_all_10min.reindex(columns=desired_order)

Number of unique players in df_all_10min:  4715
Number of discrepancies: 61


#### First 15 minutes

In [21]:
# Build the per-player summary frame for the first 15 minutes from both cohorts
df_all_15min = pf.merge_dfs_per_player(data_t=df_top_15min, data_b=df_bottom_15min, grouping='session_time', print_results=False)

# Print number of unique players
print("Number of unique players in df_all_15min: ", df_all_15min['playerkey'].nunique())

# Get the IDs of players who have NaN values in any of the columns
list_discrepancy15 = df_all_15min[df_all_15min.isna().any(axis=1)]['playerkey'].unique()

# print number of discrepancies
print("Number of discrepancies:", len(list_discrepancy15))

# Simultaneous Play
# NOTE(review): 40000 is passed straight through to pf.simultaneous_play — confirm its meaning in plotting_fn.
df_b20_sim_v = pf.simultaneous_play(40000, df_bottom_15min, 'session_time')
df_t20_sim_v = pf.simultaneous_play(40000, df_top_15min, 'session_time')

# Concat the two dataframes
df_sim_v = pd.concat([df_b20_sim_v, df_t20_sim_v]).reset_index()

# Remove repeated instances
df_sim_no_repeat_v = df_sim_v.drop_duplicates(subset=['session_time', 'playerkey'])

# Merge Sim play: flag every (playerkey, session_time) pair that appears in the
# simultaneous-play frame. The lookup set is built once here instead of inside
# the lambda, where it used to be rebuilt for every single row.
sim_pairs = set(map(tuple, df_sim_no_repeat_v[['playerkey', 'session_time']].values))
df_all_15min['sim_play'] = df_all_15min[['playerkey', 'session_time']].apply(lambda row: tuple(row) in sim_pairs, axis=1)

# Reorder Columns (desired_order is defined in the "First 1 minute" cell)
df_all_15min = df_all_15min.reindex(columns=desired_order)

Number of unique players in df_all_15min:  4715
Number of discrepancies: 57


## Eliminate All Discrepancies

In [22]:
# Pool the discrepancy arrays of all seven time windows into one array
discrepancy_arrays = (
    list_discrepancy,
    list_discrepancy2,
    list_discrepancy3,
    list_discrepancy4,
    list_discrepancy5,
    list_discrepancy10,
    list_discrepancy15,
)
total_discrepancies = np.concatenate(discrepancy_arrays, axis=0)

# Size of the pooled array; a player flagged in several windows is counted
# once per window here, so this figure includes repeats
print("Number of unique players in total_discrepancies: ", len(total_discrepancies))

# Collapse to distinct player keys via a set
total_unique_discrepancies = list(set(total_discrepancies))

# Number of distinct players flagged in at least one window
print("Number of unique players in total_unique_discrepancies: ", len(total_unique_discrepancies))


Number of unique players in total_discrepancies:  667
Number of unique players in total_unique_discrepancies:  196


In [23]:
# Drop every player listed in total_unique_discrepancies from all seven frames,
# so each time window ends up with the same set of players
(
    df_all_1min,
    df_all_2min,
    df_all_3min,
    df_all_4min,
    df_all_5min,
    df_all_10min,
    df_all_15min,
) = (
    frame[~frame['playerkey'].isin(total_unique_discrepancies)]
    for frame in (
        df_all_1min,
        df_all_2min,
        df_all_3min,
        df_all_4min,
        df_all_5min,
        df_all_10min,
        df_all_15min,
    )
)

# Report the number of distinct players left in each frame
remaining_frames = (
    ('1min', df_all_1min),
    ('2min', df_all_2min),
    ('3min', df_all_3min),
    ('4min', df_all_4min),
    ('5min', df_all_5min),
    ('10min', df_all_10min),
    ('15min', df_all_15min),
)
for window, frame in remaining_frames:
    print(f"Number of unique players in df_all_{window}: ", frame['playerkey'].nunique())

Number of unique players in df_all_1min:  4519
Number of unique players in df_all_2min:  4519
Number of unique players in df_all_3min:  4519
Number of unique players in df_all_4min:  4519
Number of unique players in df_all_5min:  4519
Number of unique players in df_all_10min:  4519
Number of unique players in df_all_15min:  4519


## Save Data to Parquet

In [24]:
# Save the dataframes to parquet files
# The output folder is spelled out once instead of seven times, and is created
# if missing so the writes cannot fail on a missing directory after the whole
# notebook has already run.
output_dir = '/Users/mau/Library/CloudStorage/Dropbox/Mac/Documents/Dissertation/Chapter 2/Entire_Data/By month/' + month_file + '/Ending Balances/Per_Player'
os.makedirs(output_dir, exist_ok=True)

# One file per time window: df_1min.parquet ... df_15min.parquet
frames_to_save = (
    ('1min', df_all_1min),
    ('2min', df_all_2min),
    ('3min', df_all_3min),
    ('4min', df_all_4min),
    ('5min', df_all_5min),
    ('10min', df_all_10min),
    ('15min', df_all_15min),
)
for window, frame in frames_to_save:
    frame.to_parquet(f"{output_dir}/df_{window}.parquet")