# Similarities and Differences of Top vs. Bottom 20% (Pt.1 - Age, Gender, Won/Lost Amount)

In [1]:
# Define libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.dates as mdates
import os
import plotting_fn as pf
import counting_fns as cf

# Notebook configuration: which monthly folder to analyse
month_file = '4_August'
cut_off = 10_000

# Every relative file name used below is resolved against this month's folder
month_dir = "/Users/mau/Library/CloudStorage/Dropbox/Mac/Documents/Dissertation/Chapter 2/Entire_Data/By month/" + month_file
os.chdir(month_dir)


In [2]:
# Load the gamble-level records of both cohorts (bottom file first, then top)
df_bottom_20, df_top_20 = (
    pd.read_parquet(parquet_name)
    for parquet_name in ("Bottom_20_gambles.parquet", "Top_20_gambles.parquet")
)

# Keep only the players whose 'gambles' column takes more than one distinct value
df_bottom_20, df_top_20 = (
    cohort.groupby('playerkey').filter(lambda player_rows: player_rows['gambles'].nunique() > 1)
    for cohort in (df_bottom_20, df_top_20)
)

# Monetary / return columns that get rounded (precision used here is ONE decimal place)
rounded_cols = ['wageredamt', 'profit', 'percent_return']

for cohort in (df_bottom_20, df_top_20):
    # Re-label the remaining rows 1..n now that the filter removed some of them
    cohort.index = np.arange(1, len(cohort) + 1)

    for col in rounded_cols:
        cohort[col] = cohort[col].round(1)

In [3]:
def _classify_return(pct):
    """Label a single gamble from its percent return.

    'loss' when the return is exactly -100, 'near-hit' for any other negative
    return, 'gain' for a positive return and 'draw' for everything else
    (a return of 0, and also NaN because every comparison with NaN is False).
    """
    if pct == -100:
        return 'loss'
    if pct < 0:
        return 'near-hit'
    if pct > 0:
        return 'gain'
    return 'draw'


# --- Bottom 20%: outcome label plus one indicator column per outcome ---
df_bottom_20['result_type'] = df_bottom_20['percent_return'].apply(_classify_return)

# Indicator columns are renamed to '#' + capitalised first letter of the label
# (e.g. 'gain' -> '#G', 'near-hit' -> '#N')
dummy_variables = pd.get_dummies(df_bottom_20['result_type'])
dummy_variables = dummy_variables.rename(columns=lambda label: '#' + str(label[0].capitalize()))

# Attach the indicators side by side and restart the row index at 0
df_bottom_20 = pd.concat([df_bottom_20, dummy_variables], axis=1).reset_index(drop=True)

# --- Top 20%: identical treatment ---
df_top_20['result_type'] = df_top_20['percent_return'].apply(_classify_return)

dummy_variables = pd.get_dummies(df_top_20['result_type'])
dummy_variables = dummy_variables.rename(columns=lambda label: '#' + str(label[0].capitalize()))

df_top_20 = pd.concat([df_top_20, dummy_variables], axis=1).reset_index(drop=True)

# Parse 'start_time' into datetimes so that differences can be taken
df_bottom_20['start_time'] = pd.to_datetime(df_bottom_20['start_time'])
df_top_20['start_time'] = pd.to_datetime(df_top_20['start_time'])

# 'time_diff': seconds elapsed since the previous row of the same player and session_time.
# The first row of each group has no predecessor and is set to 0.
# NOTE(review): diff() follows row order, so this presumably relies on rows already being
# in chronological order within each group - confirm against the input files.
df_bottom_20['time_diff'] = (
    df_bottom_20.groupby(['playerkey', 'session_time'])['start_time']
    .diff()
    .dt.total_seconds()
    .fillna(0)
)
df_top_20['time_diff'] = (
    df_top_20.groupby(['playerkey', 'session_time'])['start_time']
    .diff()
    .dt.total_seconds()
    .fillna(0)
)

### Filter visit 1

In [4]:
# Keep only the rows whose 'visit' value equals 1, in both cohorts
df_bottom_20 = df_bottom_20.loc[df_bottom_20['visit'].eq(1)]
df_top_20 = df_top_20.loc[df_top_20['visit'].eq(1)]

In [5]:
# Run the consecutive-wins helper on each cohort, grouped by 'visit'
# (per the original note: 2, 3 and 4 wins in a row, per visit per player)
df_bottom_20, df_top_20 = (
    pf.consecutive_wins(cohort, 'visit')
    for cohort in (df_bottom_20, df_top_20)
)


In [6]:
# Same consecutive-wins helper, this time grouped by 'session_time'; results go to
# separate *_s frames.
# NOTE(review): whether pf.consecutive_wins also modifies its input frame in place is
# not visible from this notebook - confirm, since df_bottom_20 / df_top_20 are reused below.
df_bottom_20_s, df_top_20_s = [
    pf.consecutive_wins(cohort, 'session_time')
    for cohort in (df_bottom_20, df_top_20)
]

In [7]:
# Age bin edges with one label per bin, expressed both as an age range and as a generation.
# The bins are closed on the RIGHT - (0, 24], (24, 40], (40, 55], (55, 75], (75, 150] -
# so that the upper age printed in each label belongs to that label (24 -> '18-24',
# 40 -> '25-40', 55 -> '41-55', 75 -> '56-75'). With right=False those boundary ages
# were pushed into the next bin, contradicting the labels.
# Ages outside (0, 150] and missing ages end up as NaN in both columns.
bins = [0, 24, 40, 55, 75, 150]
labels = ['18-24', '25-40', '41-55', '56-75', '76+']
generations = ['Gen Z', 'Millenials', 'Gen X', 'Baby Boomers', 'Silent']

# Bottom 20%: age range and generation for every gamble row
df_bottom_20['age_range'] = pd.cut(df_bottom_20['age'], bins=bins, labels=labels, right=True)
df_bottom_20['age_gen'] = pd.cut(df_bottom_20['age'], bins=bins, labels=generations, right=True)

# Top 20%: same binning
df_top_20['age_range'] = pd.cut(df_top_20['age'], bins=bins, labels=labels, right=True)
df_top_20['age_gen'] = pd.cut(df_top_20['age'], bins=bins, labels=generations, right=True)

In [8]:
# Top 20%: count, per player, the increases / decreases in slot denomination and in max bet.
# The calls run in the same order as before (the helpers print a summary as they go).
(
    players_increase_slot_t20,
    players_decrease_slot_t20,
    players_increase_maxbet_t20,
    players_decrease_maxbet_t20,
) = (
    counter(df_top_20, change_name, "playerkey", tracked_col)
    for counter, change_name, tracked_col in (
        (cf.count_increase, "increase_slotdeno", "slotdenomination"),
        (cf.count_decrease, "decrease_slotdeno", "slotdenomination"),
        (cf.count_increase, "increase_maxbet", "maxbet"),
        (cf.count_decrease, "decrease_maxbet", "maxbet"),
    )
)

Count of players who increase_slotdeno : 1385
Count of times each player increase_slotdeno : {234: 4, 433: 1, 460: 6, 575: 17, 603: 79, 646: 1, 719: 43, 996: 2, 1013: 1, 1233: 1, 1318: 5, 1358: 3, 1371: 1, 1376: 1, 1483: 2, 1553: 1, 1563: 1, 1700: 2, 1757: 2, 1764: 1, 1909: 3, 1953: 1, 1982: 1, 2070: 1, 2073: 1, 2097: 3, 2157: 11, 2214: 1, 2236: 3, 2325: 2, 2331: 6, 2341: 3, 2479: 1, 2561: 5, 2710: 1, 2736: 295, 2806: 3, 2858: 1, 2906: 2, 3010: 3, 3058: 1, 3127: 2, 3161: 35, 3360: 1, 3385: 1, 3677: 8, 3900: 1, 3955: 2, 4015: 2, 4050: 1, 4236: 1, 4245: 1, 4265: 2, 4304: 4, 4453: 1, 4483: 16, 4652: 48, 4680: 1, 4713: 1, 4748: 1, 5147: 3, 5256: 2, 5443: 1, 5518: 2, 5560: 1, 5571: 1, 5676: 1, 5745: 2, 5789: 1, 5829: 5, 5842: 7, 5947: 1, 6093: 1, 6255: 197, 6256: 1, 6271: 2, 6314: 1, 6468: 12, 6527: 1, 6608: 1, 6671: 1, 6687: 2, 6695: 1, 6894: 2, 6910: 2, 7051: 21, 7062: 1, 7110: 1, 7119: 1, 7141: 1, 7262: 1, 7320: 2, 7369: 2, 7377: 138, 7724: 1, 7739: 2, 7827: 8, 7927: 17, 7933: 2, 7934: 2

In [9]:
# Bottom 20%: the same four counts (slot denomination up/down, max bet up/down),
# evaluated in the same order as before because the helpers print a summary as they go.
(
    players_increase_slot_b20,
    players_decrease_slot_b20,
    players_increase_maxbet_b20,
    players_decrease_maxbet_b20,
) = (
    counter(df_bottom_20, change_name, "playerkey", tracked_col)
    for counter, change_name, tracked_col in (
        (cf.count_increase, "increase_slotdeno", "slotdenomination"),
        (cf.count_decrease, "decrease_slotdeno", "slotdenomination"),
        (cf.count_increase, "increase_maxbet", "maxbet"),
        (cf.count_decrease, "decrease_maxbet", "maxbet"),
    )
)

Count of players who increase_slotdeno : 232
Count of times each player increase_slotdeno : {1993: 1, 3754: 1, 5044: 1, 5112: 1, 5677: 1, 6615: 1, 6954: 1, 7746: 2, 8251: 1, 9818: 1, 12023: 1, 12456: 1, 13487: 1, 14308: 1, 15430: 1, 16001: 1, 16583: 1, 16586: 1, 16865: 1, 16871: 1, 16963: 1, 16971: 1, 16976: 1, 17047: 1, 17060: 1, 17124: 1, 17162: 1, 17168: 1, 17255: 1, 17395: 1, 17463: 5, 17492: 1, 17543: 1, 17643: 1, 17775: 2, 17837: 1, 17841: 1, 17897: 1, 18001: 2, 18136: 1, 18238: 1, 18358: 1, 18451: 1, 18456: 1, 18538: 1, 18548: 2, 18674: 1, 18839: 4, 19005: 1, 19121: 1, 19122: 1, 19287: 1, 19309: 1, 19339: 1, 19403: 1, 19411: 2, 19517: 1, 19538: 1, 19674: 1, 19716: 1, 19727: 1, 19811: 1, 19818: 1, 19830: 1, 19908: 1, 19923: 1, 19960: 1, 20023: 1, 20091: 1, 20108: 1, 20136: 2, 20151: 1, 20210: 2, 20211: 1, 20216: 1, 20318: 1, 20575: 1, 20589: 1, 20626: 1, 20645: 1, 20748: 1, 20798: 1, 20812: 1, 20866: 1, 21074: 1, 21108: 1, 21234: 1, 21366: 1, 21410: 1, 21428: 1, 21478: 5, 21480: 

In [10]:
# 'depletion_rate': change in 'playercashableamt' from the previous row to the current one,
# computed within each (playerkey, session_time) group. The first row of every group has
# no previous row and is set to 0.
for cohort in (df_bottom_20, df_top_20):
    cashable_by_session = cohort.groupby(['playerkey', 'session_time'])['playercashableamt']
    cohort['depletion_rate'] = cashable_by_session.diff().fillna(0)


# Separate by time

In [11]:
# Bottom 20%: one frame per time window, produced by pf.filter_dataframe_by_time
# for durations of 1, 2, 3, 4, 5, 10 and 15 minutes (in that order)
(
    df_bottom_1min,
    df_bottom_2min,
    df_bottom_3min,
    df_bottom_4min,
    df_bottom_5min,
    df_bottom_10min,
    df_bottom_15min,
) = (
    pf.filter_dataframe_by_time(df_bottom_20, pd.Timedelta(minutes=window))
    for window in (1, 2, 3, 4, 5, 10, 15)
)

In [12]:
# Report how many distinct players each bottom-20% window contains
# (only the 1- to 5-minute windows are reported here)
for label, frame in (
    ("Number of players in bottom 1min: ", df_bottom_1min),
    ("Number of players in bottom 2min: ", df_bottom_2min),
    ("Number of players in bottom 3min: ", df_bottom_3min),
    ("Number of players in bottom 4min: ", df_bottom_4min),
    ("Number of players in bottom 5min: ", df_bottom_5min),
):
    print(label, frame['playerkey'].nunique())

Number of players in bottom 1min:  2561
Number of players in bottom 2min:  2561
Number of players in bottom 3min:  2561
Number of players in bottom 4min:  2561
Number of players in bottom 5min:  2561


In [13]:
# Top 20%: one frame per time window, produced by pf.filter_dataframe_by_time
# for durations of 1, 2, 3, 4, 5, 10 and 15 minutes (in that order)
(
    df_top_1min,
    df_top_2min,
    df_top_3min,
    df_top_4min,
    df_top_5min,
    df_top_10min,
    df_top_15min,
) = (
    pf.filter_dataframe_by_time(df_top_20, pd.Timedelta(minutes=window))
    for window in (1, 2, 3, 4, 5, 10, 15)
)

In [14]:
# Report how many distinct players each top-20% window contains
# (only the 1- to 5-minute windows are reported here)
for label, frame in (
    ("Number of players in top 1min: ", df_top_1min),
    ("Number of players in top 2min: ", df_top_2min),
    ("Number of players in top 3min: ", df_top_3min),
    ("Number of players in top 4min: ", df_top_4min),
    ("Number of players in top 5min: ", df_top_5min),
):
    print(label, frame['playerkey'].nunique())

Number of players in top 1min:  2573
Number of players in top 2min:  2573
Number of players in top 3min:  2573
Number of players in top 4min:  2573
Number of players in top 5min:  2573


#### First 1 minute

In [15]:
# Build the merged per-player frame for the first 1 minute from the top and bottom cohorts
df_all_1min = pf.merge_dfs_per_player(data_t=df_top_1min, data_b=df_bottom_1min, grouping='session_time', print_results=False)

# Print number of unique players
print("Number of unique players in df_all_1min: ", df_all_1min['playerkey'].nunique())

# Players with a NaN in at least one column of the merged frame
# (collected here, removed from every window further down)
list_discrepancy = df_all_1min[df_all_1min.isna().any(axis=1)]['playerkey'].unique()

# print number of discrepancies
print("Number of discrepancies:", len(list_discrepancy))

# Simultaneous play, per cohort.
# NOTE(review): the meaning of the first argument (40000) is defined inside
# pf.simultaneous_play and is not visible from this notebook - confirm.
df_b20_sim_v = pf.simultaneous_play(40000, df_bottom_1min, 'session_time')
df_t20_sim_v = pf.simultaneous_play(40000, df_top_1min, 'session_time')

# Concat the two dataframes
df_sim_v = pd.concat([df_b20_sim_v, df_t20_sim_v]).reset_index()

# Keep a single row per (session_time, playerkey)
df_sim_no_repeat_v = df_sim_v.drop_duplicates(subset=['session_time', 'playerkey'])

# Flag every (playerkey, session_time) of the merged frame that appears in the
# simultaneous-play result. The lookup set is built ONCE here; it used to be rebuilt
# inside the lambda for every single row, which made this step needlessly slow.
sim_pairs = set(map(tuple, df_sim_no_repeat_v[['playerkey', 'session_time']].values))
df_all_1min['sim_play'] = df_all_1min[['playerkey', 'session_time']].apply(lambda x: tuple(x) in sim_pairs, axis=1)

# Column order shared by every time-window frame in this notebook.
# reindex() keeps only these columns; any name missing from the frame is added as all-NaN.
desired_order = ['playerkey', 'session_time', 'gender', 'age_range', 'age_gen',
                'beginning_amt', 'ending_amt', 'ending_balance', 'ave_slotdenom',
                'std_slotdenom', 'min_slotdenom', 'max_slotdenom', 'ave_theo_payback',
                'min_theo_payback', 'max_theo_payback', 'ave_wageramt', 'std_wageramt',
                'min_wager', 'max_wager', 'ave_p/b', 'std_p/b', 'max_p/b', 'max_profit', 'depletion_slope',
                '#inc_maxbet', '#dec_maxbet', 'first_wager', 'first_outcome', 'first_p/b', 'last_wager',
                'last_outcome', 'last_p/b', '#W', '#L', '#NH', '#D', 'w/min', 'l/min', 'nh/min', 'd/min', 
                'w/g', 'l/g', 'nh/g', 'd/g', '#2ws', '2ws_profit', '2ws_wgramt', '#3ws', 
                '3ws_profit','3ws_wgramt', '#4ws', '4ws_profit', '4ws_wgramt', '2ws/min', '3ws/min', '4ws/min',
                'ave_time_per_gamble', 'min_time_per_gamble', 'max_time_per_gamble',
                'total_duration', 'total_gambles', 'machines_changes', '#inc_slotdenom', '#dec_slotdenom',
                'unique_machines', 'ave_time_per_machine', 'sim_play','percentile']

df_all_1min = df_all_1min.reindex(columns=desired_order)

Number of unique players in df_all_1min:  5134
Number of discrepancies: 202


#### First 2 minutes

In [16]:
# Build the merged per-player frame for the first 2 minutes from the top and bottom cohorts
df_all_2min = pf.merge_dfs_per_player(data_t=df_top_2min, data_b=df_bottom_2min, grouping='session_time', print_results=False)

# Print number of unique players
print("Number of unique players in df_all_2min: ", df_all_2min['playerkey'].nunique())

# Players with a NaN in at least one column of the merged frame
list_discrepancy2 = df_all_2min[df_all_2min.isna().any(axis=1)]['playerkey'].unique()

# print number of discrepancies
print("Number of discrepancies:", len(list_discrepancy2))

# Simultaneous play, per cohort.
# NOTE(review): the meaning of the first argument (40000) is defined inside
# pf.simultaneous_play and is not visible from this notebook - confirm.
df_b20_sim_v = pf.simultaneous_play(40000, df_bottom_2min, 'session_time')
df_t20_sim_v = pf.simultaneous_play(40000, df_top_2min, 'session_time')

# Concat the two dataframes
df_sim_v = pd.concat([df_b20_sim_v, df_t20_sim_v]).reset_index()

# Keep a single row per (session_time, playerkey)
df_sim_no_repeat_v = df_sim_v.drop_duplicates(subset=['session_time', 'playerkey'])

# Flag every (playerkey, session_time) of the merged frame that appears in the
# simultaneous-play result. The lookup set is built ONCE here; it used to be rebuilt
# inside the lambda for every single row, which made this step needlessly slow.
sim_pairs = set(map(tuple, df_sim_no_repeat_v[['playerkey', 'session_time']].values))
df_all_2min['sim_play'] = df_all_2min[['playerkey', 'session_time']].apply(lambda x: tuple(x) in sim_pairs, axis=1)

# Reorder columns to the shared layout (names missing from the frame become all-NaN columns)
df_all_2min = df_all_2min.reindex(columns=desired_order)

Number of unique players in df_all_2min:  5134
Number of discrepancies: 121


#### First 3 minutes

In [17]:
# Build the merged per-player frame for the first 3 minutes from the top and bottom cohorts
df_all_3min = pf.merge_dfs_per_player(data_t=df_top_3min, data_b=df_bottom_3min, grouping='session_time', print_results=False)

# Print number of unique players
print("Number of unique players in df_all_3min: ", df_all_3min['playerkey'].nunique())

# Players with a NaN in at least one column of the merged frame
list_discrepancy3 = df_all_3min[df_all_3min.isna().any(axis=1)]['playerkey'].unique()

# print number of discrepancies
print("Number of discrepancies:", len(list_discrepancy3))

# Simultaneous play, per cohort.
# NOTE(review): the meaning of the first argument (40000) is defined inside
# pf.simultaneous_play and is not visible from this notebook - confirm.
df_b20_sim_v = pf.simultaneous_play(40000, df_bottom_3min, 'session_time')
df_t20_sim_v = pf.simultaneous_play(40000, df_top_3min, 'session_time')

# Concat the two dataframes
df_sim_v = pd.concat([df_b20_sim_v, df_t20_sim_v]).reset_index()

# Keep a single row per (session_time, playerkey)
df_sim_no_repeat_v = df_sim_v.drop_duplicates(subset=['session_time', 'playerkey'])

# Flag every (playerkey, session_time) of the merged frame that appears in the
# simultaneous-play result. The lookup set is built ONCE here; it used to be rebuilt
# inside the lambda for every single row, which made this step needlessly slow.
sim_pairs = set(map(tuple, df_sim_no_repeat_v[['playerkey', 'session_time']].values))
df_all_3min['sim_play'] = df_all_3min[['playerkey', 'session_time']].apply(lambda x: tuple(x) in sim_pairs, axis=1)

# Reorder columns to the shared layout (names missing from the frame become all-NaN columns)
df_all_3min = df_all_3min.reindex(columns=desired_order)

Number of unique players in df_all_3min:  5134
Number of discrepancies: 98


#### First 4 minutes

In [18]:
# Build the merged per-player frame for the first 4 minutes from the top and bottom cohorts
df_all_4min = pf.merge_dfs_per_player(data_t=df_top_4min, data_b=df_bottom_4min, grouping='session_time', print_results=False)

# Print number of unique players
print("Number of unique players in df_all_4min: ", df_all_4min['playerkey'].nunique())

# Players with a NaN in at least one column of the merged frame
list_discrepancy4 = df_all_4min[df_all_4min.isna().any(axis=1)]['playerkey'].unique()

# print number of discrepancies
print("Number of discrepancies:", len(list_discrepancy4))

# Simultaneous play, per cohort.
# NOTE(review): the meaning of the first argument (40000) is defined inside
# pf.simultaneous_play and is not visible from this notebook - confirm.
df_b20_sim_v = pf.simultaneous_play(40000, df_bottom_4min, 'session_time')
df_t20_sim_v = pf.simultaneous_play(40000, df_top_4min, 'session_time')

# Concat the two dataframes
df_sim_v = pd.concat([df_b20_sim_v, df_t20_sim_v]).reset_index()

# Keep a single row per (session_time, playerkey)
df_sim_no_repeat_v = df_sim_v.drop_duplicates(subset=['session_time', 'playerkey'])

# Flag every (playerkey, session_time) of the merged frame that appears in the
# simultaneous-play result. The lookup set is built ONCE here; it used to be rebuilt
# inside the lambda for every single row, which made this step needlessly slow.
sim_pairs = set(map(tuple, df_sim_no_repeat_v[['playerkey', 'session_time']].values))
df_all_4min['sim_play'] = df_all_4min[['playerkey', 'session_time']].apply(lambda x: tuple(x) in sim_pairs, axis=1)

# Reorder columns to the shared layout (names missing from the frame become all-NaN columns)
df_all_4min = df_all_4min.reindex(columns=desired_order)

Number of unique players in df_all_4min:  5134
Number of discrepancies: 94


#### First 5 minutes

In [19]:
# Build the merged per-player frame for the first 5 minutes from the top and bottom cohorts
df_all_5min = pf.merge_dfs_per_player(data_t=df_top_5min, data_b=df_bottom_5min, grouping='session_time', print_results=False)

# Print number of unique players
print("Number of unique players in df_all_5min: ", df_all_5min['playerkey'].nunique())

# Players with a NaN in at least one column of the merged frame
list_discrepancy5 = df_all_5min[df_all_5min.isna().any(axis=1)]['playerkey'].unique()

# print number of discrepancies
print("Number of discrepancies:", len(list_discrepancy5))

# Simultaneous play, per cohort.
# NOTE(review): the meaning of the first argument (40000) is defined inside
# pf.simultaneous_play and is not visible from this notebook - confirm.
df_b20_sim_v = pf.simultaneous_play(40000, df_bottom_5min, 'session_time')
df_t20_sim_v = pf.simultaneous_play(40000, df_top_5min, 'session_time')

# Concat the two dataframes
df_sim_v = pd.concat([df_b20_sim_v, df_t20_sim_v]).reset_index()

# Keep a single row per (session_time, playerkey)
df_sim_no_repeat_v = df_sim_v.drop_duplicates(subset=['session_time', 'playerkey'])

# Flag every (playerkey, session_time) of the merged frame that appears in the
# simultaneous-play result. The lookup set is built ONCE here; it used to be rebuilt
# inside the lambda for every single row, which made this step needlessly slow.
sim_pairs = set(map(tuple, df_sim_no_repeat_v[['playerkey', 'session_time']].values))
df_all_5min['sim_play'] = df_all_5min[['playerkey', 'session_time']].apply(lambda x: tuple(x) in sim_pairs, axis=1)

# Reorder columns to the shared layout (names missing from the frame become all-NaN columns)
df_all_5min = df_all_5min.reindex(columns=desired_order)

Number of unique players in df_all_5min:  5134
Number of discrepancies: 87


#### First 10 minutes

In [20]:
# Build the merged per-player frame for the first 10 minutes from the top and bottom cohorts
df_all_10min = pf.merge_dfs_per_player(data_t=df_top_10min, data_b=df_bottom_10min, grouping='session_time', print_results=False)

# Print number of unique players
print("Number of unique players in df_all_10min: ", df_all_10min['playerkey'].nunique())

# Players with a NaN in at least one column of the merged frame
list_discrepancy10 = df_all_10min[df_all_10min.isna().any(axis=1)]['playerkey'].unique()

# print number of discrepancies
print("Number of discrepancies:", len(list_discrepancy10))

# Simultaneous play, per cohort.
# NOTE(review): the meaning of the first argument (40000) is defined inside
# pf.simultaneous_play and is not visible from this notebook - confirm.
df_b20_sim_v = pf.simultaneous_play(40000, df_bottom_10min, 'session_time')
df_t20_sim_v = pf.simultaneous_play(40000, df_top_10min, 'session_time')

# Concat the two dataframes
df_sim_v = pd.concat([df_b20_sim_v, df_t20_sim_v]).reset_index()

# Keep a single row per (session_time, playerkey)
df_sim_no_repeat_v = df_sim_v.drop_duplicates(subset=['session_time', 'playerkey'])

# Flag every (playerkey, session_time) of the merged frame that appears in the
# simultaneous-play result. The lookup set is built ONCE here; it used to be rebuilt
# inside the lambda for every single row, which made this step needlessly slow.
sim_pairs = set(map(tuple, df_sim_no_repeat_v[['playerkey', 'session_time']].values))
df_all_10min['sim_play'] = df_all_10min[['playerkey', 'session_time']].apply(lambda x: tuple(x) in sim_pairs, axis=1)

# Reorder columns to the shared layout (names missing from the frame become all-NaN columns)
df_all_10min = df_all_10min.reindex(columns=desired_order)

Number of unique players in df_all_10min:  5134
Number of discrepancies: 77


#### First 15 minutes

In [21]:
# Build the merged per-player frame for the first 15 minutes from the top and bottom cohorts
df_all_15min = pf.merge_dfs_per_player(data_t=df_top_15min, data_b=df_bottom_15min, grouping='session_time', print_results=False)

# Print number of unique players
print("Number of unique players in df_all_15min: ", df_all_15min['playerkey'].nunique())

# Players with a NaN in at least one column of the merged frame
list_discrepancy15 = df_all_15min[df_all_15min.isna().any(axis=1)]['playerkey'].unique()

# print number of discrepancies
print("Number of discrepancies:", len(list_discrepancy15))

# Simultaneous play, per cohort.
# NOTE(review): the meaning of the first argument (40000) is defined inside
# pf.simultaneous_play and is not visible from this notebook - confirm.
df_b20_sim_v = pf.simultaneous_play(40000, df_bottom_15min, 'session_time')
df_t20_sim_v = pf.simultaneous_play(40000, df_top_15min, 'session_time')

# Concat the two dataframes
df_sim_v = pd.concat([df_b20_sim_v, df_t20_sim_v]).reset_index()

# Keep a single row per (session_time, playerkey)
df_sim_no_repeat_v = df_sim_v.drop_duplicates(subset=['session_time', 'playerkey'])

# Flag every (playerkey, session_time) of the merged frame that appears in the
# simultaneous-play result. The lookup set is built ONCE here; it used to be rebuilt
# inside the lambda for every single row, which made this step needlessly slow.
sim_pairs = set(map(tuple, df_sim_no_repeat_v[['playerkey', 'session_time']].values))
df_all_15min['sim_play'] = df_all_15min[['playerkey', 'session_time']].apply(lambda x: tuple(x) in sim_pairs, axis=1)

# Reorder columns to the shared layout (names missing from the frame become all-NaN columns)
df_all_15min = df_all_15min.reindex(columns=desired_order)

Number of unique players in df_all_15min:  5134
Number of discrepancies: 76


## Eliminate All Discrepancies

In [22]:
# Stack the discrepancy arrays of all seven time windows end to end.
# A player flagged in several windows appears several times at this stage.
total_discrepancies = np.concatenate([
    list_discrepancy,
    list_discrepancy2,
    list_discrepancy3,
    list_discrepancy4,
    list_discrepancy5,
    list_discrepancy10,
    list_discrepancy15,
])

# Length of the stacked array - this figure still includes the repeats
print("Number of unique players in total_discrepancies: ", len(total_discrepancies))

# Collapse to distinct player keys
total_unique_discrepancies = list({*total_discrepancies})

# Number of distinct flagged players
print("Number of unique players in total_unique_discrepancies: ", len(total_unique_discrepancies))


Number of unique players in total_discrepancies:  755
Number of unique players in total_unique_discrepancies:  221


In [23]:
# Drop every flagged player from every time-window frame, so that all windows
# end up covering the same set of players
(
    df_all_1min,
    df_all_2min,
    df_all_3min,
    df_all_4min,
    df_all_5min,
    df_all_10min,
    df_all_15min,
) = (
    frame[~frame['playerkey'].isin(total_unique_discrepancies)]
    for frame in (
        df_all_1min,
        df_all_2min,
        df_all_3min,
        df_all_4min,
        df_all_5min,
        df_all_10min,
        df_all_15min,
    )
)

# Distinct players left in each window after the removal
for label, frame in (
    ("Number of unique players in df_all_1min: ", df_all_1min),
    ("Number of unique players in df_all_2min: ", df_all_2min),
    ("Number of unique players in df_all_3min: ", df_all_3min),
    ("Number of unique players in df_all_4min: ", df_all_4min),
    ("Number of unique players in df_all_5min: ", df_all_5min),
    ("Number of unique players in df_all_10min: ", df_all_10min),
    ("Number of unique players in df_all_15min: ", df_all_15min),
):
    print(label, frame['playerkey'].nunique())

Number of unique players in df_all_1min:  4913
Number of unique players in df_all_2min:  4913
Number of unique players in df_all_3min:  4913
Number of unique players in df_all_4min:  4913
Number of unique players in df_all_5min:  4913
Number of unique players in df_all_10min:  4913
Number of unique players in df_all_15min:  4913


## Save Data to Parquet

In [24]:
# Folder that receives the per-player parquet files of this month.
# The path is written once here instead of being repeated for every file.
output_dir = '/Users/mau/Library/CloudStorage/Dropbox/Mac/Documents/Dissertation/Chapter 2/Entire_Data/By month/'+month_file+'/Ending Balances/Per_Player/'

# Create the folder if it is missing (e.g. first run for a new month); without this
# the first to_parquet call fails. exist_ok=True makes it a no-op when already present.
os.makedirs(output_dir, exist_ok=True)

# Write one file per time window: df_1min.parquet ... df_15min.parquet
for minutes, frame in (
    (1, df_all_1min),
    (2, df_all_2min),
    (3, df_all_3min),
    (4, df_all_4min),
    (5, df_all_5min),
    (10, df_all_10min),
    (15, df_all_15min),
):
    frame.to_parquet(output_dir + 'df_' + str(minutes) + 'min.parquet')