# Similarities and Differences of Top vs. Bottom 20% (Pt.1 - Age, Gender, Won/Lost Amount)

In [2]:
# Define libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.dates as mdates
import os
import plotting_fn as pf
import counting_fns as cf

# Notebook configuration: which monthly extract to process.
month_file = '3_July'
# Not referenced by any of the cells visible in this part of the notebook.
cut_off = 10000

# Make the month's folder the working directory so relative reads resolve against it.
month_dir = (
    "/Users/mau/Library/CloudStorage/Dropbox/Mac/Documents/Dissertation/Chapter 2/Entire_Data/By month/"
    + month_file
)
os.chdir(month_dir)


In [3]:
# Load the month's gamble-level records (path is relative to the current working directory).
df_bottom_20 = pd.read_parquet("top_vs_ntop_players.parquet")

# Keep only players whose 'gambles' column takes more than one distinct value.
# A vectorised transform replaces the per-group Python lambda of groupby.filter;
# it selects the same rows in the same order, without one Python call per player.
distinct_gambles = df_bottom_20.groupby('playerkey')['gambles'].transform('nunique')
# .copy() so the column assignments below write to an independent frame, not a slice.
df_bottom_20 = df_bottom_20.loc[distinct_gambles > 1].copy()

# Re-number the rows 1..N
df_bottom_20.index = np.arange(1, len(df_bottom_20) + 1)

# Round the monetary / return columns to 1 decimal place
# (the earlier comment said 2 decimals, but the code has always used 1).
rounded_cols = ['wageredamt', 'profit', 'percent_return']
df_bottom_20[rounded_cols] = df_bottom_20[rounded_cols].round(1)


In [4]:
def _classify_return(pct_return):
    """Label a single gamble from its percent return; the checks run in this order."""
    if pct_return == -100:
        return 'loss'        # the whole wager was lost
    if pct_return < 0:
        return 'near-hit'    # negative return other than -100
    if pct_return > 0:
        return 'gain'
    return 'draw'            # zero; NaN also ends up here because every comparison is False


# Categorical outcome per gamble: 'loss', 'near-hit', 'gain' or 'draw'
df_bottom_20['result_type'] = df_bottom_20['percent_return'].apply(_classify_return)

# One indicator column per outcome present, named '#' + capitalised first letter of the label.
# NOTE(review): this gives names such as '#G' and '#N', while the column list used further
# down the notebook expects '#W' and '#NH' - confirm pf.transform_ml builds those itself.
outcome_dummies = pd.get_dummies(df_bottom_20['result_type'])
dummy_variables = outcome_dummies.rename(columns=lambda x: '#' + str(x[0].capitalize()))

# Append the indicator columns and go back to a 0-based RangeIndex
df_bottom_20 = pd.concat([df_bottom_20, dummy_variables], axis=1).reset_index(drop=True)

# Parse start_time as datetime so it supports time arithmetic
df_bottom_20['start_time'] = pd.to_datetime(df_bottom_20['start_time'])

# 'time_diff': seconds elapsed since the previous gamble of the same player and session_time.
# Rows without a previous gamble in their group get 0.
gap_since_previous = df_bottom_20.groupby(['playerkey', 'session_time'])['start_time'].diff()
df_bottom_20['time_diff'] = gap_since_previous.dt.total_seconds().fillna(0)

### Filter visit 1

In [5]:
# Keep only the rows recorded with visit == 1
is_visit_one = df_bottom_20['visit'] == 1
df_bottom_20 = df_bottom_20.loc[is_visit_one]

In [6]:
# Use the consecutive-wins helper with 'visit' as the grouping column. Per the original
# note it finds 2, 3 and 4 wins in a row per visit per player; the implementation lives
# in plotting_fn and is not shown in this notebook.
df_bottom_20 = pf.consecutive_wins(df_bottom_20, 'visit')

In [7]:
# Same helper, this time with 'session_time' as the grouping column (2, 3 and 4 wins in a
# row per session_time per player, per the original note). The input is the frame already
# processed per visit, and the result is stored under a separate name.
# NOTE(review): df_bottom_20_s is not referenced again in the cells visible here - confirm it is needed.
df_bottom_20_s = pf.consecutive_wins(df_bottom_20, 'session_time')

In [8]:
# Age bands.
# pd.cut is called with right=False, so every band is half-open: [lower, upper).
# Each upper edge is therefore one year past the last age named in its label:
#   [0, 25) -> '18-24', [25, 41) -> '25-40', [41, 56) -> '41-55',
#   [56, 76) -> '56-75', [76, 150) -> '76+'
# The previous edges [0, 24, 40, 55, 75, 150] pushed ages 24, 40, 55 and 75 into the
# next-older band, contradicting the labels.
bins = [0, 25, 41, 56, 76, 150]
labels = ['18-24', '25-40', '41-55', '56-75', '76+']
generations = ['Gen Z', 'Millenials', 'Gen X', 'Baby Boomers', 'Silent']

# Two parallel categorical columns built from the same edges: numeric range and generation name.
# Ages outside [0, 150) become NaN.
df_bottom_20['age_range'] = pd.cut(df_bottom_20['age'], bins=bins, labels=labels, right=False)
df_bottom_20['age_gen'] = pd.cut(df_bottom_20['age'], bins=bins, labels=generations, right=False)


In [9]:
# Count, per player ('playerkey'), changes in slot denomination and in max bet.
# Each helper call prints a summary (number of players and a per-player count dictionary,
# as seen in the cell output) and its return value is kept for later use.
# The second argument is the label used for that summary; the helpers are defined in
# counting_fns, which is not shown here.
players_increase_slot_b20 = cf.count_increase(df_bottom_20, "increase_slotdeno", "playerkey", "slotdenomination")
players_decrease_slot_b20 = cf.count_decrease(df_bottom_20, "decrease_slotdeno", "playerkey", "slotdenomination")
players_increase_maxbet_b20 = cf.count_increase(df_bottom_20, "increase_maxbet", "playerkey", "maxbet")
players_decrease_maxbet_b20 = cf.count_decrease(df_bottom_20, "decrease_maxbet", "playerkey", "maxbet")

Count of players who increase_slotdeno : 3755
Count of times each player increase_slotdeno : {4: 2, 14: 2, 90: 1, 159: 2, 180: 2, 203: 2, 223: 2, 239: 2, 245: 1, 248: 1, 260: 1, 274: 1, 278: 1, 324: 1, 410: 2, 429: 1, 430: 2, 447: 2, 461: 1, 464: 1, 506: 1, 509: 1, 518: 1, 534: 1, 541: 2, 542: 1, 546: 1, 547: 7, 554: 1, 574: 2, 599: 3, 608: 1, 612: 1, 615: 2, 616: 1, 624: 1, 630: 13, 636: 1, 641: 1, 643: 1, 645: 2, 646: 2, 655: 1, 656: 1, 658: 1, 660: 1, 661: 5, 662: 2, 669: 1, 677: 1, 684: 2, 698: 1, 705: 1, 711: 1, 715: 3, 724: 2, 725: 2, 739: 1, 746: 9, 749: 1, 758: 1, 759: 1, 769: 1, 779: 2, 780: 1, 782: 3, 783: 1, 784: 1, 787: 1, 789: 1, 799: 2, 809: 1, 810: 1, 814: 1, 824: 2, 825: 1, 836: 1, 839: 1, 842: 1, 843: 1, 845: 2, 864: 1, 870: 3, 879: 1, 884: 3, 898: 3, 899: 2, 903: 1, 905: 1, 914: 2, 916: 1, 927: 5, 934: 1, 938: 1, 942: 1, 943: 1, 957: 1, 962: 2, 976: 1, 997: 2, 1005: 1, 1020: 1, 1043: 2, 1046: 2, 1054: 1, 1060: 1, 1063: 1, 1072: 5, 1075: 2, 1081: 1, 1085: 1, 1090: 2, 1

In [10]:
# 'depletion_rate': change in 'playercashableamt' relative to the previous gamble of the
# same player and session_time. Missing differences (such as the first gamble of a group)
# are replaced with 0. Note the column created here is 'depletion_rate', not 'depletion_slope'.
cashable_per_session = df_bottom_20.groupby(['playerkey', 'session_time'])['playercashableamt']
df_bottom_20['depletion_rate'] = cashable_per_session.diff().fillna(0)


# Separate by time

In [11]:
# Build one frame per time window (1, 2, 3, 4, 5, 10 and 15 minutes) from the same source
# frame. The helper is called once per duration, in this order.
# NOTE(review): presumably it keeps the gambles inside the first N minutes - confirm in plotting_fn.
window_minutes = (1, 2, 3, 4, 5, 10, 15)
(
    df_bottom_1min,
    df_bottom_2min,
    df_bottom_3min,
    df_bottom_4min,
    df_bottom_5min,
    df_bottom_10min,
    df_bottom_15min,
) = (
    pf.filter_dataframe_by_time(df_bottom_20, pd.Timedelta(minutes=minutes))
    for minutes in window_minutes
)

In [12]:
# Report how many distinct players each of the 1- to 5-minute frames contains
for window_label, window_frame in (
    ("1min", df_bottom_1min),
    ("2min", df_bottom_2min),
    ("3min", df_bottom_3min),
    ("4min", df_bottom_4min),
    ("5min", df_bottom_5min),
):
    print(f"Number of players in bottom {window_label}: ", window_frame['playerkey'].nunique())

Number of players in bottom 1min:  13551
Number of players in bottom 2min:  13551
Number of players in bottom 3min:  13551
Number of players in bottom 4min:  13551
Number of players in bottom 5min:  13551


#### First 1 minute

In [13]:
# Build the feature table for the 1-minute window, grouped by session_time
df_all_1min = pf.transform_ml(data_b=df_bottom_1min, grouping='session_time')
print("Number of unique players in df_all_1min: ", df_all_1min['playerkey'].nunique())

# Players with a NaN in at least one column. The check runs before the reindex below,
# so it looks at every column returned by transform_ml.
nan_rows_1min = df_all_1min.isna().any(axis=1)
list_discrepancy = df_all_1min.loc[nan_rows_1min, 'playerkey'].unique()
print(list_discrepancy)

# Drop every row belonging to a flagged player
keep_rows_1min = ~df_all_1min['playerkey'].isin(list_discrepancy)
df_all_1min = df_all_1min[keep_rows_1min]
print("Number of unique players w/o discrepancies df_all_1min: ", df_all_1min['playerkey'].nunique())

# Column layout shared by all time-window tables (the later cells reuse this list).
# The line breaks only group related names; the order is what matters.
desired_order = [
    'playerkey', 'session_time', 'gender', 'age_range', 'age_gen',
    'beginning_amt', 'ending_amt', 'ending_balance',
    'ave_slotdenom', 'std_slotdenom', 'min_slotdenom', 'max_slotdenom',
    'ave_theo_payback', 'min_theo_payback', 'max_theo_payback',
    'ave_wageramt', 'std_wageramt', 'min_wager', 'max_wager',
    'ave_p/b', 'std_p/b', 'max_p/b', 'max_profit', 'depletion_slope',
    '#inc_maxbet', '#dec_maxbet',
    'first_wager', 'first_outcome', 'first_p/b',
    'last_wager', 'last_outcome', 'last_p/b',
    '#W', '#L', '#NH', '#D',
    'w/min', 'l/min', 'nh/min', 'd/min',
    'w/g', 'l/g', 'nh/g', 'd/g',
    '#2ws', '2ws_profit', '2ws_wgramt',
    '#3ws', '3ws_profit', '3ws_wgramt',
    '#4ws', '4ws_profit', '4ws_wgramt',
    '2ws/min', '3ws/min', '4ws/min',
    'ave_time_per_gamble', 'min_time_per_gamble', 'max_time_per_gamble',
    'total_duration', 'total_gambles',
    'machines_changes', '#inc_slotdenom', '#dec_slotdenom',
    'unique_machines', 'ave_time_per_machine',
    'classification',
]

# reindex keeps exactly these columns in this order; a name missing from the table would
# come back as an all-NaN column rather than raising.
df_all_1min = df_all_1min.reindex(columns=desired_order)

Number of unique players in df_all_1min:  13551
[  384   533   598   641   702   715   811   846   859   904   917   961
   970   997  1008  1021  1029  1038  1040  1049  1074  1088  1094  1103
  1116  1164  1175  1199  1204  1243  1301  1309  1346  1365  1411  1444
  1509  1510  1530  1542  1601  1612  1622  1625  1633  1636  1681  1683
  1686  1733  1787  1838  1841  1848  1888  1899  1907  1924  1954  1966
  2062  2082  2109  2116  2125  2126  2178  2187  2212  2225  2309  2376
  2495  2588  2600  2652  2762  2822  2844  2867  2889  2915  2929  2987
  2991  2994  3004  3076  3140  3203  3339  3365  3370  3464  3465  3467
  3476  3500  3506  3509  3524  3562  3615  3616  3684  3688  3697  3795
  3815  3912  3927  3975  4022  4026  4030  4117  4175  4183  4208  4249
  4323  4369  4403  4404  4449  4465  4499  4517  4542  4624  4627  4628
  4634  4646  4695  4698  4756  4777  4793  4812  4845  4942  4944  4952
  4975  5006  5023  5109  5120  5167  5198  5203  5233  5347  5348  5361
  5

#### First 2 minutes

In [14]:
# Build the feature table for the 2-minute window, grouped by session_time
df_all_2min = pf.transform_ml(data_b=df_bottom_2min, grouping='session_time')
print("Number of unique players in df_all_2min: ", df_all_2min['playerkey'].nunique())

# Players with a NaN in at least one column of the table
nan_rows_2min = df_all_2min.isna().any(axis=1)
list_discrepancy_2 = df_all_2min.loc[nan_rows_2min, 'playerkey'].unique()
print(list_discrepancy_2)

# Drop every row belonging to a flagged player
keep_rows_2min = ~df_all_2min['playerkey'].isin(list_discrepancy_2)
df_all_2min = df_all_2min[keep_rows_2min]
print("Number of unique players w/o discrepancies df_all_2min: ", df_all_2min['playerkey'].nunique())

# Apply the shared column layout (desired_order is defined in the 1-minute cell)
df_all_2min = df_all_2min.reindex(columns=desired_order)

Number of unique players in df_all_2min:  13551
[  384   598   641   702   715   846   970  1008  1038  1040  1049  1074
  1088  1094  1116  1135  1164  1199  1243  1301  1309  1346  1365  1411
  1509  1510  1530  1542  1612  1633  1681  1683  1686  1838  1848  1899
  1907  1924  2062  2082  2116  2125  2126  2178  2187  2212  2225  2309
  2376  2588  2600  2867  2889  2929  2991  3076  3138  3339  3433  3464
  3465  3476  3500  3524  3562  3615  3684  3688  3815  4022  4026  4030
  4175  4183  4249  4367  4373  4403  4404  4449  4465  4499  4634  4646
  4695  4777  4793  4845  4952  5109  5233  5347  5361  5461  5590  5662
  5817  5831  5916  6070  6116  6119  6141  6151  6165  6181  6299  6351
  6376  6404  6417  6430  6466  6526  6626  6654  6705  6742  6787  6817
  6840  6925  6927  6933  6955  6971  6990  7050  7234  7276  7353  7370
  7373  7399  7468  7478  7490  7552  7621  7666  7775  7854  7947  8086
  8178  8205  8236  8257  8332  8342  8366  8382  8420  8455  8517  8573
  8

#### First 3 minutes

In [15]:
# Build the feature table for the 3-minute window, grouped by session_time
df_all_3min = pf.transform_ml(data_b=df_bottom_3min, grouping='session_time')
print("Number of unique players in df_all_3min: ", df_all_3min['playerkey'].nunique())

# Players with a NaN in at least one column of the table
nan_rows_3min = df_all_3min.isna().any(axis=1)
list_discrepancy_3 = df_all_3min.loc[nan_rows_3min, 'playerkey'].unique()
print(list_discrepancy_3)

# Drop every row belonging to a flagged player
keep_rows_3min = ~df_all_3min['playerkey'].isin(list_discrepancy_3)
df_all_3min = df_all_3min[keep_rows_3min]
print("Number of unique players w/o discrepancies df_all_3min: ", df_all_3min['playerkey'].nunique())

# Apply the shared column layout (desired_order is defined in the 1-minute cell)
df_all_3min = df_all_3min.reindex(columns=desired_order)

Number of unique players in df_all_3min:  13551
[  598   641   702   715   829   846   970  1008  1038  1040  1049  1088
  1094  1116  1135  1164  1199  1243  1346  1365  1529  1542  1612  1633
  1681  1686  1848  1907  2062  2082  2116  2125  2187  2376  2588  2600
  2867  2889  2991  3076  3084  3138  3339  3433  3464  3465  3476  3500
  3524  3562  3615  3684  3688  3815  4030  4175  4183  4249  4367  4373
  4403  4404  4449  4465  4499  4646  4695  4777  4845  4952  5109  5233
  5347  5361  5590  5662  5831  6070  6116  6119  6165  6181  6299  6351
  6376  6404  6417  6430  6466  6626  6654  6705  6742  6787  6840  6925
  6927  6933  6955  6990  7050  7234  7276  7370  7373  7399  7468  7478
  7490  7621  7854  7947  8086  8178  8205  8221  8257  8332  8342  8366
  8382  8420  8455  9103  9141  9185  9368  9540  9799  9808  9824  9828
  9847  9942 10042 10043 10213 10336 10414 10428 10441 10519 10541 10544
 10577 10728 10796 10843 10861 10884 11010 11071 11112 11322 11346 11386
 11

#### First 4 minutes

In [16]:
# Build the feature table for the 4-minute window, grouped by session_time
df_all_4min = pf.transform_ml(data_b=df_bottom_4min, grouping='session_time')
print("Number of unique players in df_all_4min: ", df_all_4min['playerkey'].nunique())

# Players with a NaN in at least one column of the table
nan_rows_4min = df_all_4min.isna().any(axis=1)
list_discrepancy_4 = df_all_4min.loc[nan_rows_4min, 'playerkey'].unique()
print(list_discrepancy_4)

# Drop every row belonging to a flagged player
keep_rows_4min = ~df_all_4min['playerkey'].isin(list_discrepancy_4)
df_all_4min = df_all_4min[keep_rows_4min]
print("Number of unique players w/o discrepancies df_all_4min: ", df_all_4min['playerkey'].nunique())

# Apply the shared column layout (desired_order is defined in the 1-minute cell)
df_all_4min = df_all_4min.reindex(columns=desired_order)

Number of unique players in df_all_4min:  13551
[  598   715   829   970  1008  1038  1040  1049  1088  1094  1116  1135
  1164  1199  1243  1529  1612  1633  1848  1907  2062  2082  2125  2187
  2376  2588  2600  2867  2889  3084  3138  3339  3433  3464  3465  3500
  3524  3562  3684  3688  3815  4030  4175  4183  4249  4367  4373  4403
  4404  4449  4465  4499  4646  4695  4777  4845  4952  5109  5233  5662
  5831  6070  6116  6119  6165  6299  6351  6376  6404  6417  6430  6626
  6654  6742  6787  6840  6925  6927  6933  6955  6990  7050  7234  7276
  7373  7399  7468  7478  7490  7621  7854  7947  8086  8178  8205  8221
  8257  8332  8342  8366  8382  8420  8455  8485  9103  9141  9185  9368
  9540  9799  9808  9824  9828  9847 10042 10043 10213 10336 10414 10428
 10441 10519 10541 10544 10577 10796 10861 10884 11010 11071 11112 11322
 11346 11386 11387 11456 11520 11561 11574 11671 11719 11772 11800 11830
 11873 11943 11992 12161 12204 12389 12440 12451 12590 12655 12805 12845
 12

#### First 5 minutes

In [17]:
# Build the feature table for the 5-minute window, grouped by session_time
df_all_5min = pf.transform_ml(data_b=df_bottom_5min, grouping='session_time')
print("Number of unique players in df_all_5min: ", df_all_5min['playerkey'].nunique())

# Players with a NaN in at least one column of the table
nan_rows_5min = df_all_5min.isna().any(axis=1)
list_discrepancy_5 = df_all_5min.loc[nan_rows_5min, 'playerkey'].unique()
print(list_discrepancy_5)

# Drop every row belonging to a flagged player
keep_rows_5min = ~df_all_5min['playerkey'].isin(list_discrepancy_5)
df_all_5min = df_all_5min[keep_rows_5min]
print("Number of unique players w/o discrepancies df_all_5min: ", df_all_5min['playerkey'].nunique())

# Apply the shared column layout (desired_order is defined in the 1-minute cell)
df_all_5min = df_all_5min.reindex(columns=desired_order)

Number of unique players in df_all_5min:  13551
[  715   829   970  1008  1038  1040  1049  1088  1094  1116  1135  1164
  1199  1243  1529  1612  1633  1848  1907  2062  2082  2125  2187  2376
  2588  2600  2867  2889  3084  3138  3339  3433  3464  3500  3524  3562
  3684  3688  3815  4175  4183  4249  4367  4373  4403  4404  4449  4465
  4499  4646  4695  4777  4845  4952  5109  5233  5662  5831  6070  6116
  6119  6165  6299  6351  6376  6404  6417  6430  6654  6742  6787  6840
  6925  6927  6933  6955  6990  7050  7234  7276  7373  7399  7468  7478
  7490  7621  7947  8086  8178  8221  8332  8342  8382  8420  8455  8485
  9103  9141  9185  9368  9799  9808  9824  9828  9847 10042 10043 10213
 10336 10414 10428 10441 10519 10541 10544 10577 10861 10884 11010 11071
 11112 11322 11346 11386 11387 11456 11520 11561 11574 11671 11719 11772
 11800 11830 11873 11943 11992 12114 12161 12204 12295 12389 12451 12590
 12655 12805 12845 12864 12906 12907 12908 12947 12949 13005 13035 13133
 13

#### First 10 minutes

In [18]:
# Build the feature table for the 10-minute window, grouped by session_time
df_all_10min = pf.transform_ml(data_b=df_bottom_10min, grouping='session_time')
print("Number of unique players in df_all_10min: ", df_all_10min['playerkey'].nunique())

# Players with a NaN in at least one column of the table
nan_rows_10min = df_all_10min.isna().any(axis=1)
list_discrepancy_10 = df_all_10min.loc[nan_rows_10min, 'playerkey'].unique()
print(list_discrepancy_10)

# Drop every row belonging to a flagged player
keep_rows_10min = ~df_all_10min['playerkey'].isin(list_discrepancy_10)
df_all_10min = df_all_10min[keep_rows_10min]
print("Number of unique players w/o discrepancies df_all_10min: ", df_all_10min['playerkey'].nunique())

# Apply the shared column layout (desired_order is defined in the 1-minute cell)
df_all_10min = df_all_10min.reindex(columns=desired_order)

Number of unique players in df_all_10min:  13551
[  598   715   829   970  1008  1038  1049  1088  1116  1135  1164  1199
  1529  1612  1633  1907  2062  2082  2173  2187  2376  2482  2600  2867
  3084  3138  3339  3433  3464  3500  3524  3562  3684  3688  3815  4175
  4183  4249  4367  4373  4403  4404  4449  4646  4777  4845  5109  5233
  5662  5831  5936  6070  6116  6299  6351  6376  6404  6430  6654  6686
  6742  6787  6840  6925  6927  6933  6955  6990  7050  7234  7373  7478
  7490  7621  8086  8178  8221  8332  8342  8382  8420  8455  8485  9103
  9141  9185  9368  9514  9799  9808  9824  9828 10042 10043 10213 10241
 10336 10414 10428 10441 10541 10577 10861 11010 11112 11322 11346 11386
 11387 11456 11478 11520 11561 11574 11671 11719 11772 11800 11830 11873
 11943 11979 11992 12114 12161 12204 12295 12389 12590 12626 12627 12655
 12845 12864 12906 12907 12908 12949 13005 13608 13609 14078 14133 14656
 14664 14712 14713 14801 14982 15090 15178 15208 15284 15305 15419 15494
 1

#### First 15 minutes

In [19]:
# Build the feature table for the 15-minute window, grouped by session_time
df_all_15min = pf.transform_ml(data_b=df_bottom_15min, grouping='session_time')
print("Number of unique players in df_all_15min: ", df_all_15min['playerkey'].nunique())

# Players with a NaN in at least one column of the table
nan_rows_15min = df_all_15min.isna().any(axis=1)
list_discrepancy_15 = df_all_15min.loc[nan_rows_15min, 'playerkey'].unique()
print(list_discrepancy_15)

# Drop every row belonging to a flagged player from df_all_15min
keep_rows_15min = ~df_all_15min['playerkey'].isin(list_discrepancy_15)
df_all_15min = df_all_15min[keep_rows_15min]
print("Number of unique players w/o discrepancies df_all_15min: ", df_all_15min['playerkey'].nunique())

# Apply the shared column layout (desired_order is defined in the 1-minute cell)
df_all_15min = df_all_15min.reindex(columns=desired_order)

Number of unique players in df_all_15min:  13551
[  598   829   970  1008  1038  1049  1088  1116  1135  1164  1199  1529
  1581  1612  1633  1907  1936  2062  2082  2173  2187  2376  2482  2600
  2867  3084  3138  3339  3433  3464  3500  3524  3562  3684  3688  3815
  4175  4183  4249  4367  4373  4403  4404  4449  4646  4777  4845  5109
  5233  5662  5831  5936  6070  6116  6299  6376  6404  6430  6654  6686
  6742  6787  6840  6925  6927  6933  6935  6955  6990  7050  7234  7373
  7478  7490  7621  8086  8221  8332  8342  8382  8455  8459  8485  9103
  9141  9185  9368  9514  9808  9828 10042 10043 10213 10241 10336 10414
 10541 10577 11010 11112 11224 11322 11346 11386 11387 11456 11478 11520
 11561 11574 11671 11719 11772 11800 11830 11943 11979 11992 12114 12161
 12204 12295 12342 12389 12590 12626 12627 12655 12845 12864 12906 12907
 12908 12949 13005 13497 13608 13609 14078 14133 14656 14664 14712 14713
 14743 14982 15090 15178 15208 15284 15305 15419 15494 15548 15599 15645
 1

## Eliminate Discrepancies

In [20]:
# Stack the flagged player ids from all seven time windows into one array
per_window_discrepancies = [
    list_discrepancy,
    list_discrepancy_2,
    list_discrepancy_3,
    list_discrepancy_4,
    list_discrepancy_5,
    list_discrepancy_10,
    list_discrepancy_15,
]
total_discrepancies = np.concatenate(per_window_discrepancies, axis=0)

# Length of the stacked array: a player flagged in several windows is counted once per
# window here, even though the printed label says "unique".
print("Number of unique players in total_discrepancies: ", len(total_discrepancies))

# Collapse repeats so each flagged player appears once
total_unique_discrepancies = list(set(total_discrepancies))

# Number of distinct flagged players
print("Number of unique players in total_unique_discrepancies: ", len(total_unique_discrepancies))

Number of unique players in total_discrepancies:  1784
Number of unique players in total_unique_discrepancies:  563


In [21]:
# Remove every player flagged in ANY time window from ALL seven tables.
# The tuple of input frames is evaluated before any name is rebound, so each table is
# filtered from its own current contents.
(
    df_all_1min,
    df_all_2min,
    df_all_3min,
    df_all_4min,
    df_all_5min,
    df_all_10min,
    df_all_15min,
) = (
    table[~table['playerkey'].isin(total_unique_discrepancies)]
    for table in (
        df_all_1min,
        df_all_2min,
        df_all_3min,
        df_all_4min,
        df_all_5min,
        df_all_10min,
        df_all_15min,
    )
)

# Report the number of distinct players left in each table
for table_name, table in (
    ("df_all_1min", df_all_1min),
    ("df_all_2min", df_all_2min),
    ("df_all_3min", df_all_3min),
    ("df_all_4min", df_all_4min),
    ("df_all_5min", df_all_5min),
    ("df_all_10min", df_all_10min),
    ("df_all_15min", df_all_15min),
):
    print(f"Number of unique players in {table_name}: ", table['playerkey'].nunique())


Number of unique players in df_all_1min:  12988
Number of unique players in df_all_2min:  12988
Number of unique players in df_all_3min:  12988
Number of unique players in df_all_4min:  12988
Number of unique players in df_all_5min:  12988
Number of unique players in df_all_10min:  12988
Number of unique players in df_all_15min:  12988


In [22]:
# Write each per-window table to its own parquet file inside the month's
# 'Ending Balances/Per_Player' folder (file name pattern: df_<window>_top_vs_ntop_players.parquet).
per_player_dir = (
    '/Users/mau/Library/CloudStorage/Dropbox/Mac/Documents/Dissertation/Chapter 2/Entire_Data/By month/'
    + month_file
    + '/Ending Balances/Per_Player/'
)
for window_label, table in (
    ('1min', df_all_1min),
    ('2min', df_all_2min),
    ('3min', df_all_3min),
    ('4min', df_all_4min),
    ('5min', df_all_5min),
    ('10min', df_all_10min),
    ('15min', df_all_15min),
):
    table.to_parquet(per_player_dir + 'df_' + window_label + '_top_vs_ntop_players.parquet')