# Exploratory Data Analysis (Time & Gambles)

We are going to be looking for patterns in players who only played in the month of June. This is with the purpose to distinguish players between short and long positions, and the reasons why they hold this positions. 

In [31]:
# import all necesary libraries for the project
import pandas as pd
import os
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import datetime as dt
import warnings
warnings.filterwarnings('ignore')


In [32]:
# Define working directory
path = "/Users/mau/Library/CloudStorage/Dropbox/Mac/Documents/Dissertation/Chapter 2/Entire_Data/By month/1_Generic"
os.chdir(path)

# Filter Columns
filter = ['playerkey', 'playercashableamt', 'wageredamt', 'maxbet', 'grosswin',
       'currencyinamt', 'assetnumber', 'theoreticalpaybackpercent', 
       'age', 'rank', 'gender', 'date', 'start_time', 'end_time', 'duration',
       'slotdenomination']
# Import data
dtf = pd.read_parquet('month_7_year_2015.parquet', columns=filter)

# Check data
dtf.columns

# Select only the rows where duration is greater than or equal to zero
dtf = dtf[dtf['duration'] >= pd.Timedelta(0)]

In [33]:
# Sort data by start_time and playerkey
dtf = dtf.sort_values(by=['playerkey', 'date', 'start_time'])

## Calculate Foundamental Variables

The following variables were calculated using existing data:
* _player_loss_: how much money each player has lost in each gamble.
* _player_wins_: equals the amount of money they bet plus how much they won.
* _percent_return_: the return in player's bets for each gamble played. 

$$\text{percent return} = (\frac{df[wins] - df[wageredamt]}{df[wageredamt]})*100$$

* _playercashableamt_pct_change_: calculates the rate of change of player's outstanding gambling amount. 

$$\text{playercashableamt \% change} = (\frac{df[playercashableamt_{t+1}] - df[playercashableamt_{t}]}{df[playercashableamt_{t}]})*100$$

In [34]:
# Crate a new colum that is the inverse of casino_grosswin, named "player_loss"
dtf['ops'] = dtf['grosswin']*-1

# Create a new column that is the inverse of casino_grosswin, named "player_loss" that includes all negative values of 'ops'
dtf['player_loss'] = np.where(dtf['ops'] < 0, dtf['ops'], 0)

# Create a new column that is the inverse of casino_grosswin, named "player_wins" that includes all positive values of 'ops'
dtf['player_wins'] = np.where(dtf['ops'] > 0, dtf['ops'], 0)

# Create total_amt_won column
dtf['total_amt_won'] = dtf["wageredamt"]+dtf["ops"]

# Create colum profit
dtf['profit'] = dtf['total_amt_won'] - dtf['wageredamt']

# # Calculate percentage return for each gamble and add it as a new column
dtf["percent_return"] = ((dtf["wageredamt"]+dtf["ops"]) - dtf["wageredamt"]) / dtf["wageredamt"] * 100

# # Create a time series variable for each player that starts at 1 and increases by 1 for each row
draw_condition = dtf['grosswin'] == 0

for i, row in dtf.iterrows():
    if draw_condition[i]:
        dtf.loc[i, 'profit'] = 0

# Drop 'ops' column
dtf = dtf.drop(columns=['ops'])

# Create a time series variable for each player that starts at 1 and increases by 1 for each row
dtf["gambles"] = dtf.groupby("playerkey").cumcount() + 1


### Calulates Number of Visits

In [35]:
# Group the DataFrame by playerkey
groups = dtf.groupby('playerkey')

# Initialize the visit column to 1 for the first visit of each player
dtf['visit'] = groups['date'].transform(lambda x: (x.diff().dt.days >= 1).cumsum() + 1)

# Reset the visit count for each new player
dtf['visit'] = dtf.groupby('playerkey')['visit'].apply(lambda x: x - x.iloc[0] + 1)


### Calculate Sessions based on Time

If there is a pause of 30 minutes or more between gambles it is considered the end of a session, and the start of a new one.

In [36]:
# Convert the start_time column to datetime
dtf["start_time"] = pd.to_datetime(dtf["start_time"])

# Sort data by start_time and playerkey
dtf = dtf.sort_values(by=['playerkey', 'date', 'start_time'])

# Compute the time difference between consecutive gambles for each player
dtf['time_diff'] = dtf.groupby('playerkey')['start_time'].diff()

# Initialize the session_time column to 1 for the first gamble of each player
dtf['session_time'] = dtf.groupby('playerkey').ngroup() + 1

# Update the session_time column based on the time difference
dtf['session_time'] += (dtf['time_diff'] > pd.Timedelta(minutes=30)).cumsum()

# Reset the session_time count for each new visit
dtf['session_time'] = dtf.groupby(['playerkey', 'visit'])['session_time'].apply(lambda x: x - x.iloc[0] + 1)

# Remove the temporary time_diff column
dtf = dtf.drop('time_diff', axis=1)

### Calculate Sessions based on Machine Change

Everytime a player switches machine a new sessions begins.

In [37]:
# Initialize the session_machine column to 1 for the first machine of each player
dtf['session_machine'] = (dtf.groupby("playerkey")["assetnumber"].diff() != 0).cumsum()

# Reset the session_machine count for each new visit
dtf['session_machine'] = dtf.groupby(['playerkey', 'visit'])['session_machine'].apply(lambda x: x - x.iloc[0] + 1)


### Calculate the number of gambles per Visit

In [38]:
# Create a column named "gambles_visit" that is the number of gambles per visit
dtf["gambles_visit"] = dtf.groupby(["playerkey", "visit"])["gambles"].cumcount() + 1

# Reset the gambles_visit count for each new visit
dtf['gambles_visit'] = dtf.groupby(['playerkey', 'visit'])['gambles_visit'].apply(lambda x: x - x.iloc[0] + 1)


### Calculate the number of gambles per Session Time

In [39]:
# Create a column named "gambles_session" that is the number of gambles per session
dtf["gambles_session"] = dtf.groupby(["playerkey", "session_time"])["gambles"].cumcount() + 1

# Reset the gambles_session count for each new session
dtf['gambles_session'] = dtf.groupby(['playerkey', 'session_time'])['gambles_session'].apply(lambda x: x - x.iloc[0] + 1)


### Calculate the number of gambles per Session Machine

In [40]:
# Create a column named "gambles_machine" that is the number of gambles per session_machine
dtf["gambles_machine"] = dtf.groupby(["playerkey", "session_machine"])["gambles"].cumcount() + 1

# Reset the gambles_machine count for each new session_machine
dtf['gambles_machine'] = dtf.groupby(['playerkey', 'session_machine'])['gambles_machine'].apply(lambda x: x - x.iloc[0] + 1)

## Frequencies of Gambles

### General

Let's see who plays the most, the least, and the min, max, average, and median number of gambles withouth making a distictions between visits, sessions per time or machine.

In [41]:
# Total number of players
print("Total number of players:", dtf["playerkey"].nunique())

# Total number of gambles
print("Total number of gambles:", dtf.groupby('playerkey')['gambles'].max().sum())

print("--------------------------------------------")
# What is the maximum number of periods played by each player?
print("Maximimum # of gambles of a single player:", dtf.groupby('playerkey')['gambles'].max().max())

#Who is the player with the maximum number of periods played?
print("Player who gambled the most:", dtf.groupby('playerkey')['gambles'].max().idxmax())

print("--------------------------------------------")

# What is the minimum number of periods played by each player?
print("Minimum # of gambles of a single player:", dtf.groupby('playerkey')['gambles'].max().min())

#Who is the player with the minimum number of periods played?
print("Player who gambled the least:", dtf.groupby('playerkey')['gambles'].max().idxmin())

print("--------------------------------------------")

# What is the median number of periods played by each player?
print("Median # of gambles of all players:", round(dtf.groupby('playerkey')['gambles'].max().median(), 2))

# How many players played at least the median # gambles?
print("Number of players who played exactly the median:", dtf.groupby('playerkey')['gambles'].max()[dtf.groupby('playerkey')['gambles'].max() == 150].count())
print("Number of players who gambled less than median:", dtf.groupby('playerkey')['gambles'].max()[dtf.groupby('playerkey')['gambles'].max() < 150].count())
print("Number of players who gambled more than median:", dtf.groupby('playerkey')['gambles'].max()[dtf.groupby('playerkey')['gambles'].max() > 150].count())
print("--------------------------------------------")

# Most common # of gambles per person
print("Most common # of gambles:", dtf.groupby('playerkey')['gambles'].max().mode().tolist())

# What is the average number of periods played by each player?
print("Average # of gambles of all players:", round(dtf.groupby('playerkey')['gambles'].max().mean(), 2))

# How many players played at least 278 gambles?
print("Number of players who gambled at least the mode:", dtf.groupby('playerkey')['gambles'].max()[dtf.groupby('playerkey')['gambles'].max() >= 278].count())

Total number of players: 14472
Total number of gambles: 6389913
--------------------------------------------
Maximimum # of gambles of a single player: 23431
Player who gambled the most: 1166
--------------------------------------------
Minimum # of gambles of a single player: 1
Player who gambled the least: 124
--------------------------------------------
Median # of gambles of all players: 160.0
Number of players who played exactly the median: 15
Number of players who gambled less than median: 7002
Number of players who gambled more than median: 7455
--------------------------------------------
Most common # of gambles: [21]
Average # of gambles of all players: 441.54
Number of players who gambled at least the mode: 5156


#### Visits and Sessions

In [42]:
# Calculate the average number of visitis per player
print("Average # of visits per player:", round(dtf.groupby('playerkey')['visit'].max().mean(), 2))

# Calculate median number of visits per player
print("Median # of visits per player:", round(dtf.groupby('playerkey')['visit'].max().median(), 2))

# Calculate the most common number of visits per player
print("Most common # of visits per player:", dtf.groupby('playerkey')['visit'].max().mode().tolist())

# Seperation line
print("--------------------------------------------")

# Calculate the average number of gambles per visit
print("Average # of gambles per visit:", round(dtf.groupby(['playerkey', 'visit'])['gambles_visit'].max().mean(), 2))

# Calculate the median number of gambles per visit
print("Median # of gambles per visit:", round(dtf.groupby(['playerkey', 'visit'])['gambles_visit'].max().median(), 2))

# Calculate the most common number of gambles per visit
print("Most common # of gambles per visit:", dtf.groupby(['playerkey', 'visit'])['gambles_visit'].max().mode().tolist())

# Seperation line
print("--------------------------------------------")

# Calculate the average number of gambles per session
print("Average # of gambles per session:", round(dtf.groupby(['playerkey', 'session_time'])['gambles_session'].max().mean(), 2))

# Calculate the median number of gambles per session
print("Median # of gambles per session:", round(dtf.groupby(['playerkey', 'session_time'])['gambles_session'].max().median(), 2))

# Calculate the most common number of gambles per session
print("Most common # of gambles per session:", dtf.groupby(['playerkey', 'session_time'])['gambles_session'].max().mode().tolist())

# Seperation line
print("--------------------------------------------")

# Calculate the average number of gambles per session_machine
print("Average # of gambles per session_machine:", round(dtf.groupby(['playerkey', 'session_machine'])['gambles_machine'].max().mean(), 2))

# Calculate the median number of gambles per session_machine
print("Median # of gambles per session_machine:", round(dtf.groupby(['playerkey', 'session_machine'])['gambles_machine'].max().median(), 2))

# Calculate the most common number of gambles per session_machine
print("Most common # of gambles per session_machine:", dtf.groupby(['playerkey', 'session_machine'])['gambles_machine'].max().mode().tolist())


Average # of visits per player: 1.61
Median # of visits per player: 1.0
Most common # of visits per player: [1]
--------------------------------------------
Average # of gambles per visit: 273.92
Median # of gambles per visit: 127.0
Most common # of gambles per visit: [26]
--------------------------------------------
Average # of gambles per session: 283.32
Median # of gambles per session: 127.0
Most common # of gambles per session: [21]
--------------------------------------------
Average # of gambles per session_machine: 22.56
Median # of gambles per session_machine: 2.0
Most common # of gambles per session_machine: [1]


### Breakdown

#### Per Visit Analysis

In [43]:
# What were the unique number of visits per player?
print("Total visits: ", dtf["visit"].unique().tolist())

# How many players played in each visit?
for i in dtf["visit"].unique().tolist():
    n_players = len(dtf[dtf["visit"] == i]["playerkey"].unique())
    gambles_data = dtf[dtf["visit"] == i]
    total_gambles = gambles_data.groupby('playerkey')['gambles_visit'].max().sum()
    print("Visit:", i)
    print(" a) # of Players:", n_players)
    print(" b) Total # of gambles:", total_gambles)
    print(" c) Average # of sessions:", round(gambles_data.groupby('playerkey')['session_time'].max().mean(), 2))
    print(" d) Max # of sessions:", gambles_data.groupby('playerkey')['session_time'].max().max())
    print("--------------------------------------------")

Total visits:  [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25]
Visit: 1
 a) # of Players: 14472
 b) Total # of gambles: 3220695
 c) Average # of sessions: 1.35
 d) Max # of sessions: 8
--------------------------------------------
Visit: 2
 a) # of Players: 5354
 b) Total # of gambles: 1795681
 c) Average # of sessions: 1.69
 d) Max # of sessions: 10
--------------------------------------------
Visit: 3
 a) # of Players: 2122
 b) Total # of gambles: 780905
 c) Average # of sessions: 1.71
 d) Max # of sessions: 8
--------------------------------------------
Visit: 4
 a) # of Players: 756
 b) Total # of gambles: 297972
 c) Average # of sessions: 1.72
 d) Max # of sessions: 8
--------------------------------------------
Visit: 5
 a) # of Players: 294
 b) Total # of gambles: 123127
 c) Average # of sessions: 1.76
 d) Max # of sessions: 5
--------------------------------------------
Visit: 6
 a) # of Players: 126
 b) Total # of gambles: 59991
 c) A

In [44]:
# How many gambles played in each visit?
for i in dtf["visit"].unique().tolist():
    n_players = len(dtf[dtf["visit"] == i]["playerkey"].unique())
    gambles_data = dtf[dtf["visit"] == i]
    total_gambles = gambles_data.groupby('playerkey')['gambles_visit'].max().sum()
    print("Visit:", i)
    print(" a) Average # of gambles per player:", round(total_gambles / n_players, 2))
    print(" b) Median # of gambles per player:", round(gambles_data.groupby('playerkey')['gambles_visit'].max().median(), 2))
    print(" c) Maximum # of gambles per player:", gambles_data.groupby('playerkey')['gambles_visit'].max().max())
    print("--------------------------------------------")


Visit: 1
 a) Average # of gambles per player: 222.55
 b) Median # of gambles per player: 105.5
 c) Maximum # of gambles per player: 5855
--------------------------------------------
Visit: 2
 a) Average # of gambles per player: 335.39
 b) Median # of gambles per player: 161.0
 c) Maximum # of gambles per player: 6741
--------------------------------------------
Visit: 3
 a) Average # of gambles per player: 368.0
 b) Median # of gambles per player: 181.5
 c) Maximum # of gambles per player: 5638
--------------------------------------------
Visit: 4
 a) Average # of gambles per player: 394.14
 b) Median # of gambles per player: 199.0
 c) Maximum # of gambles per player: 5213
--------------------------------------------
Visit: 5
 a) Average # of gambles per player: 418.8
 b) Median # of gambles per player: 228.0
 c) Maximum # of gambles per player: 3656
--------------------------------------------
Visit: 6
 a) Average # of gambles per player: 476.12
 b) Median # of gambles per player: 295

#### Per Session Analysis 

In [45]:
# What were the unique number of visits per player?
print("Unique sessions per player: ", dtf["session_time"].unique().tolist())

# How many players and gambles per session?
for i in dtf["session_time"].unique().tolist():
    n_players = len(dtf[dtf["session_time"] == i]["playerkey"].unique())
    gambles_data = dtf[dtf["session_time"] == i]
    total_gambles = gambles_data.groupby('playerkey')['gambles_session'].max().sum()
    print("Session:", i)
    print(" a) # of Players:", n_players)
    print(" b) Total # of gambles:", total_gambles)
    print("--------------------------------------------")

Unique sessions per player:  [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
Session: 1
 a) # of Players: 14472
 b) Total # of gambles: 4144218
--------------------------------------------
Session: 2
 a) # of Players: 4928
 b) Total # of gambles: 1384021
--------------------------------------------
Session: 3
 a) # of Players: 1904
 b) Total # of gambles: 528439
--------------------------------------------
Session: 4
 a) # of Players: 769
 b) Total # of gambles: 210390
--------------------------------------------
Session: 5
 a) # of Players: 308
 b) Total # of gambles: 82541
--------------------------------------------
Session: 6
 a) # of Players: 114
 b) Total # of gambles: 28367
--------------------------------------------
Session: 7
 a) # of Players: 41
 b) Total # of gambles: 9072
--------------------------------------------
Session: 8
 a) # of Players: 13
 b) Total # of gambles: 2171
--------------------------------------------
Session: 9
 a) # of Players: 4
 b) Total # of gambles: 465
----------

In [46]:
# What is the max, min, and average number of gambles per session?
for i in dtf["session_time"].unique().tolist():
    n_players = len(dtf[dtf["session_time"] == i]["playerkey"].unique())
    gambles_data = dtf[dtf["session_time"] == i]
    total_gambles = gambles_data.groupby('playerkey')['gambles_session'].max().sum()
    print("Session:", i)
    print(" a) Average # of gambles per player:", round(total_gambles / n_players, 0))
    print(" b) Median # of gambles per player:", round(gambles_data.groupby('playerkey')['gambles_session'].max().median(), 0))
    print(" c) Maximum # of gambles per player:", gambles_data.groupby('playerkey')['gambles_session'].max().max())
    print("--------------------------------------------")


Session: 1
 a) Average # of gambles per player: 286.0
 b) Median # of gambles per player: 125.0
 c) Maximum # of gambles per player: 20826
--------------------------------------------
Session: 2
 a) Average # of gambles per player: 281.0
 b) Median # of gambles per player: 127.0
 c) Maximum # of gambles per player: 6903
--------------------------------------------
Session: 3
 a) Average # of gambles per player: 278.0
 b) Median # of gambles per player: 134.0
 c) Maximum # of gambles per player: 6643
--------------------------------------------
Session: 4
 a) Average # of gambles per player: 274.0
 b) Median # of gambles per player: 143.0
 c) Maximum # of gambles per player: 4071
--------------------------------------------
Session: 5
 a) Average # of gambles per player: 268.0
 b) Median # of gambles per player: 132.0
 c) Maximum # of gambles per player: 3096
--------------------------------------------
Session: 6
 a) Average # of gambles per player: 249.0
 b) Median # of gambles per pl

## Durations

### General

Let's calculate avergae durtion of gambles.

In [47]:
# Calculate the total duration played
print('Total duration played:', dtf['duration'].sum())

# Calculate the average duration played per player
print('Average duration played per player:', dtf.groupby('playerkey')['duration'].sum().mean())

# How many players played for more than the average duration?
print('Number of players who played more than the average duration:', dtf.groupby('playerkey')['duration'].sum()[dtf.groupby('playerkey')['duration'].sum() > dtf.groupby('playerkey')['duration'].sum().mean()].count())

# How many players played for less than the average duration?
print('Number of players who played less than the average duration:', dtf.groupby('playerkey')['duration'].sum()[dtf.groupby('playerkey')['duration'].sum() < dtf.groupby('playerkey')['duration'].sum().mean()].count())

# Seperation line
print("--------------------------------------------")

# Calculate the median duration played per player
print('Median duration played per player:', dtf.groupby('playerkey')['duration'].sum().median())

# How many players played for more than the median duration?
print('Number of players who played more than the median duration:', dtf.groupby('playerkey')['duration'].sum()[dtf.groupby('playerkey')['duration'].sum() > dtf.groupby('playerkey')['duration'].sum().median()].count())

# How many players played for less than the median duration?
print('Number of players who played less than the median duration:', dtf.groupby('playerkey')['duration'].sum()[dtf.groupby('playerkey')['duration'].sum() < dtf.groupby('playerkey')['duration'].sum().median()].count())

# sEperation line
print("--------------------------------------------")

# Calcualte the minimum duration played per player
print('Minimum duration played per player:', dtf.groupby('playerkey')['duration'].sum().min())
# Who is the player with the minimum duration played?
print('Player with the minimum duration played:', dtf.groupby('playerkey')['duration'].sum().idxmin())
print("--------------------------------------------")
# Calcualte the maximum duration played per player
print('Maximum duration played per player:', dtf.groupby('playerkey')['duration'].sum().max())
# Who is the player with the maximum duration played?
print('Player with the maximum duration played:', dtf.groupby('playerkey')['duration'].sum().idxmax())
# Number of visits played by the player with the maximum duration played?
print('Number of visits played by the player with the maximum duration played:', len(dtf[dtf['playerkey'] == dtf.groupby('playerkey')['duration'].sum().idxmax()]['visit'].unique().tolist()))


Total duration played: 381 days 06:17:33.055000
Average duration played per player: 0 days 00:37:56.192167979
Number of players who played more than the average duration: 3762
Number of players who played less than the average duration: 10710
--------------------------------------------
Median duration played per player: 0 days 00:14:28.013500
Number of players who played more than the median duration: 7236
Number of players who played less than the median duration: 7236
--------------------------------------------
Minimum duration played per player: 0 days 00:00:02.050000
Player with the minimum duration played: 7239
--------------------------------------------
Maximum duration played per player: 1 days 20:10:05.843000
Player with the maximum duration played: 1166
Number of visits played by the player with the maximum duration played: 19


#### Vists and Sessions

In [48]:
# What is the average duration played per visit?
# average duration played per visit for each player
print('Average duration played per visit:', dtf.groupby(['playerkey', 'visit'])['duration'].sum().mean())

# What is the median duration played per visit?
print('Median duration played per visit:', dtf.groupby(['playerkey', 'visit'])['duration'].sum().median())

# Seperation line
print("--------------------------------------------")

# What is the average duration played per session?
print('Average duration played per session:', dtf.groupby(['playerkey', 'session_time'])['duration'].sum().mean())

# What is the median duration played per session?
print('Median duration played per session:', dtf.groupby(['playerkey', 'session_time'])['duration'].sum().median())

# Seperation line
print("--------------------------------------------")

# What is the average duration played per session_machine?
print('Average duration played per session_machine:', dtf.groupby(['playerkey', 'session_machine'])['duration'].sum().mean())

# What is the median duration played per session_machine?
print('Median duration played per session_machine:', dtf.groupby(['playerkey', 'session_machine'])['duration'].sum().median())

# Seperation line
print("--------------------------------------------")

Average duration played per visit: 0 days 00:23:32.082178283
Median duration played per visit: 0 days 00:11:32.904000
--------------------------------------------
Average duration played per session: 0 days 00:24:20.541502837
Median duration played per session: 0 days 00:11:27.127500
--------------------------------------------
Average duration played per session_machine: 0 days 00:01:56.287713063
Median duration played per session_machine: 0 days 00:00:08.033000
--------------------------------------------


### Breakdown

#### Per Visit Analysis

In [49]:
# Total duration per visit
for i in dtf["visit"].unique().tolist():
    total_duration = dtf[dtf["visit"] == i]["duration"].sum()
    print("Visit:", i)
    print(" a) Total duration played:", total_duration)
    print("--------------------------------------------------")

Visit: 1
 a) Total duration played: 193 days 23:48:45.840000
--------------------------------------------------
Visit: 2
 a) Total duration played: 106 days 19:45:08.025000
--------------------------------------------------
Visit: 3
 a) Total duration played: 45 days 07:33:00.099000
--------------------------------------------------
Visit: 4
 a) Total duration played: 17 days 10:03:06.139000
--------------------------------------------------
Visit: 5
 a) Total duration played: 7 days 11:58:16.664000
--------------------------------------------------
Visit: 6
 a) Total duration played: 3 days 11:57:37.378000
--------------------------------------------------
Visit: 7
 a) Total duration played: 1 days 23:18:01.951000
--------------------------------------------------
Visit: 8
 a) Total duration played: 0 days 23:07:31.958000
--------------------------------------------------
Visit: 9
 a) Total duration played: 0 days 20:10:13.852000
--------------------------------------------------
Visi

In [50]:

# What is the max, min, and average duration played per visit?
for i in dtf["visit"].unique().tolist():
    n_players = len(dtf[dtf["visit"] == i]["playerkey"].unique())
    total_duration = dtf[dtf["visit"] == i]["duration"].sum()
    visit_data = dtf[dtf["visit"] == i]
    print("Visit:", i)
    print(" a) Average duration played:", total_duration / n_players)
    print(" b) Max duration played:", visit_data.groupby("playerkey")["duration"].sum().max())
    print(" c) Min duration played:", visit_data.groupby("playerkey")["duration"].sum().min())
    print("------------------------------------------------------------")
    

#Overall average duration played per visit across all players

Visit: 1
 a) Average duration played: 0 days 00:19:18.162371475
 b) Max duration played: 0 days 06:41:57.363000
 c) Min duration played: 0 days 00:00:01.870000
------------------------------------------------------------
Visit: 2
 a) Average duration played: 0 days 00:28:43.852824990
 b) Max duration played: 0 days 10:49:02.134000
 c) Min duration played: 0 days 00:00:01.890000
------------------------------------------------------------
Visit: 3
 a) Average duration played: 0 days 00:30:45.042459472
 b) Max duration played: 0 days 06:41:32.688000
 c) Min duration played: 0 days 00:00:03.167000
------------------------------------------------------------
Visit: 4
 a) Average duration played: 0 days 00:33:10.722406084
 b) Max duration played: 0 days 05:32:17.524000
 c) Min duration played: 0 days 00:00:07.910000
------------------------------------------------------------
Visit: 5
 a) Average duration played: 0 days 00:36:43.730149659
 b) Max duration played: 0 days 04:29:33.351000
 c) 

#### Per Session Analysis

In [51]:
# Total duration per session
for i in dtf["session_time"].unique().tolist():
    total_duration = dtf[dtf["session_time"] == i]["duration"].sum()
    print("Session:", i)
    print(" a) Total duration played:", total_duration)
    print("--------------------------------------------------")

Session: 1
 a) Total duration played: 248 days 16:29:03.377000
--------------------------------------------------
Session: 2
 a) Total duration played: 82 days 01:42:05.526000
--------------------------------------------------
Session: 3
 a) Total duration played: 31 days 05:37:14.019000
--------------------------------------------------
Session: 4
 a) Total duration played: 12 days 06:26:52.419000
--------------------------------------------------
Session: 5
 a) Total duration played: 4 days 16:30:34.774000
--------------------------------------------------
Session: 6
 a) Total duration played: 1 days 15:18:36.707000
--------------------------------------------------
Session: 7
 a) Total duration played: 0 days 12:01:01.030000
--------------------------------------------------
Session: 8
 a) Total duration played: 0 days 03:16:43.730000
--------------------------------------------------
Session: 9
 a) Total duration played: 0 days 00:41:17.821000
--------------------------------------

In [52]:
# What is the max, min, ave duration of each session?
for i in dtf["session_time"].unique().tolist():
    n_players = len(dtf[dtf["session_time"] == i]["playerkey"].unique())
    total_duration = dtf[dtf["session_time"] == i]["duration"].sum()
    session_data = dtf[dtf["session_time"] == i]
    print("Session:", i)
    print(" a) Average duration played:", total_duration / n_players)
    print(" b) Max duration played:", session_data.groupby("playerkey")["duration"].sum().max())
    print(" c) Min duration played:", session_data.groupby("playerkey")["duration"].sum().min())
    # Separation line:
    print("--------------------------------------------------")

Session: 1
 a) Average duration played: 0 days 00:24:44.697579947
 b) Max duration played: 1 days 15:26:19.562000
 c) Min duration played: 0 days 00:00:02.050000
--------------------------------------------------
Session: 2
 a) Average duration played: 0 days 00:23:58.905342126
 b) Max duration played: 0 days 07:57:36.016000
 c) Min duration played: 0 days 00:00:02.370000
--------------------------------------------------
Session: 3
 a) Average duration played: 0 days 00:23:37.349799894
 b) Max duration played: 0 days 07:06:27.328000
 c) Min duration played: 0 days 00:00:02.494000
--------------------------------------------------
Session: 4
 a) Average duration played: 0 days 00:22:58.429673602
 b) Max duration played: 0 days 05:20:57.508000
 c) Min duration played: 0 days 00:00:03.290000
--------------------------------------------------
Session: 5
 a) Average duration played: 0 days 00:21:55.047967532
 b) Max duration played: 0 days 03:23:16.251000
 c) Min duration played: 0 days 00

# Percentiles

## Overall Gambles

In [53]:
# Calculate the number of gambles per each player, and store it in a list
gambles_per_player = []
for i in dtf["playerkey"].unique().tolist():
    gambles_per_player.append(dtf[dtf["playerkey"] == i]["gambles"].max())

# Calculate the 20th percentile of the number of gambles per player and store is as percentile_20
percentile_20 = np.percentile(gambles_per_player, 20)
print("20th percentile of the number of gambles per player:", percentile_20)

# Calculate the 80th percentile of the number of gambles per player and store is as percentile_80
percentile_80 = np.percentile(gambles_per_player, 80)
print("80th percentile of the number of gambles per player:", percentile_80)

# Seperation line
print("------------------------------------------------------------------")

# Calculate the 10th percentile of the number of gambles per player and store is as percentile_10
percentile_10 = np.percentile(gambles_per_player, 10)
print("10th percentile of the number of gambles per player:", percentile_10)
# Calculate the 90th percentile of the number of gambles per player and store is as percentile_90
percentile_90 = np.percentile(gambles_per_player, 90)
print("90th percentile of the number of gambles per player:", percentile_90)


20th percentile of the number of gambles per player: 42.0
80th percentile of the number of gambles per player: 558.0
------------------------------------------------------------------
10th percentile of the number of gambles per player: 20.0
90th percentile of the number of gambles per player: 1060.0


In [54]:
# Let's create two separarte dataframes for the top 20% and bottom 20% of players by creating a list of player for each percentile
top_20 = dtf.groupby("playerkey")["gambles"].max()[dtf.groupby("playerkey")["gambles"].max() >= percentile_80].index.tolist()
bottom_20 = dtf.groupby("playerkey")["gambles"].max()[dtf.groupby("playerkey")["gambles"].max() <= percentile_20].index.tolist()

# How many players are in the top 20%?
print("Number of players in the top 20%:", len(top_20))

# How many players are in the bottom 20%?
print("Number of players in the bottom 20%:", len(bottom_20))

# Create a dataframe for the top 20% of players
dtf_top_20 = dtf[dtf['playerkey'].isin(top_20)]
dtf_top_20['percentile'] = 'top_20'

# Create a dataframe for the bottom 20% of players
dtf_bottom_20 = dtf[dtf['playerkey'].isin(bottom_20)]
dtf_bottom_20['percentile'] = 'bottom_20'

# Concatenate the two dataframes
dtf_20_gambles = pd.concat([dtf_top_20, dtf_bottom_20])

Number of players in the top 20%: 2898
Number of players in the bottom 20%: 2936


In [55]:
# Let's create two separarte dataframes for the top 10% and bottom 10% of players by creating a list of player for each percentile
top_10 = dtf.groupby("playerkey")["gambles"].max()[dtf.groupby("playerkey")["gambles"].max() >= percentile_90].index.tolist()
bottom_10 = dtf.groupby("playerkey")["gambles"].max()[dtf.groupby("playerkey")["gambles"].max() <= percentile_10].index.tolist()

# How many players are in the top 10%?
print("Number of players in the top 10%:", len(top_10))

# How many players are in the bottom 10%?
print("Number of players in the bottom 10%:", len(bottom_10))

# Create a dataframe for the top 10% of players
dtf_top_10 = dtf[dtf['playerkey'].isin(top_10)]
dtf_top_10['percentile'] = 'top_10'

# Create a dataframe for the bottom 10% of players
dtf_bottom_10 = dtf[dtf['playerkey'].isin(bottom_10)]
dtf_bottom_10['percentile'] = 'bottom_10'

# Concatenate the two dataframes
dtf_10_gambles = pd.concat([dtf_top_10, dtf_bottom_10])

Number of players in the top 10%: 1450
Number of players in the bottom 10%: 1450


## Duration

In [56]:
# Calculate the duration per player and store it in a list
duration_per_player = []
for i in dtf["playerkey"].unique().tolist():
    duration_per_player.append(dtf[dtf["playerkey"] == i]["duration"].sum())

# Calculate the 20th percentile of the duration per player and store is as percentile_20_duration
percentile_20_duration = np.percentile(duration_per_player, 20)
print("20th percentile of the duration per player:", percentile_20_duration)

# Calculate the 80th percentile of the duration per player and store is as percentile_80_duration
percentile_80_duration = np.percentile(duration_per_player, 80)
print("80th percentile of the duration per player:", percentile_80_duration)

# Seperation line
print("------------------------------------------------------------------")

# Calculate the 10th percentile of the duration per player and store is as percentile_10_duration
percentile_10_duration = np.percentile(duration_per_player, 10)
print("10th percentile of the duration per player:", percentile_10_duration)

# Calculate the 90th percentile of the duration per player and store is as percentile_90_duration
percentile_90_duration = np.percentile(duration_per_player, 90)
print("90th percentile of the duration per player:", percentile_90_duration)

20th percentile of the duration per player: 0 days 00:03:35.661600
80th percentile of the duration per player: 0 days 00:49:52.843800001
------------------------------------------------------------------
10th percentile of the duration per player: 0 days 00:01:37.189000
90th percentile of the duration per player: 0 days 01:31:32.126400


In [57]:
# Let's create two separarte dataframes for the top 20% and bottom 20% of players by creating a list of player for each percentile
top_20_duration = dtf.groupby("playerkey")["duration"].sum()[dtf.groupby("playerkey")["duration"].sum() >= percentile_80_duration].index.tolist()
bottom_20_duration = dtf.groupby("playerkey")["duration"].sum()[dtf.groupby("playerkey")["duration"].sum() <= percentile_20_duration].index.tolist()

# How many players are in the top 20%?
print("Number of players in the top 20%:", len(top_20_duration))

# How many players are in the bottom 20%?
print("Number of players in the bottom 20%:", len(bottom_20_duration))

# Create a dataframe for the top 20% of players
dtf_top_20_duration = dtf[dtf['playerkey'].isin(top_20_duration)]
dtf_top_20_duration['percentile'] = 'top_20'

# Create a dataframe for the bottom 20% of players
dtf_bottom_20_duration = dtf[dtf['playerkey'].isin(bottom_20_duration)]
dtf_bottom_20_duration['percentile'] = 'bottom_20'

# Concatenate the two dataframes
dtf_20_duration = pd.concat([dtf_top_20_duration, dtf_bottom_20_duration])

Number of players in the top 20%: 2895
Number of players in the bottom 20%: 2895


In [58]:
# Let's create two separarte dataframes for the top 10% and bottom 10% of players by creating a list of player for each percentile
top_10_duration = dtf.groupby("playerkey")["duration"].sum()[dtf.groupby("playerkey")["duration"].sum() >= percentile_90_duration].index.tolist()
bottom_10_duration = dtf.groupby("playerkey")["duration"].sum()[dtf.groupby("playerkey")["duration"].sum() <= percentile_10_duration].index.tolist()

# How many players are in the top 10%?
print("Number of players in the top 10%:", len(top_10_duration))

# How many players are in the bottom 10%?
print("Number of players in the bottom 10%:", len(bottom_10_duration))

# Create a dataframe for the top 10% of players
dtf_top_10_duration = dtf[dtf['playerkey'].isin(top_10_duration)]
dtf_top_10_duration['percentile'] = 'top_10'

# Create a dataframe for the bottom 10% of players
dtf_bottom_10_duration = dtf[dtf['playerkey'].isin(bottom_10_duration)]
dtf_bottom_10_duration['percentile'] = 'bottom_10'

# Concatenate the two dataframes
dtf_10_duration = pd.concat([dtf_top_10_duration, dtf_bottom_10_duration])


Number of players in the top 10%: 1448
Number of players in the bottom 10%: 1448


## Safety Check

In [59]:
# Are the lists top_20_gambles and top_20_duration players the same?
print("Are the lists top_20_gambles and top_20_duration players the same?", top_20 == top_20_duration)
# Which players are in the top 20% of gambles but not in the top 20% of duration?
print("# of Players in the top 20% of gambles but not in the top 20% of duration:", len(set(top_20) - set(top_20_duration)))

# Seperation line
print("------------------------------------------------------------------")
# Are the lists bottom_20_gambles and bottom_20_duration players the same?
print("Are the lists bottom_20_gambles and bottom_20_duration players the same?", bottom_20 == bottom_20_duration)

# Which players are in the bottom 20% of gambles but not in the bottom 20% of duration?
print("# of Players in the bottom 20% of gambles but not in the bottom 20% of duration:", len(set(bottom_20) - set(bottom_20_duration)))

# Seperation line
print("------------------------------------------------------------------")
# Are the lists top_10_gambles and top_10_duration players the same?
print("Are the lists top_10_gambles and top_10_duration players the same?", top_10 == top_10_duration)

# Which players are in the top 10% of gambles but not in the top 10% of duration?
print("# of Players in the top 10% of gambles but not in the top 10% of duration:", len(set(top_10) - set(top_10_duration)))

# Seperation line
print("------------------------------------------------------------------")
# Are the lists bottom_10_gambles and bottom_10_duration players the same?
print("Are the lists bottom_10_gambles and bottom_10_duration players the same?", bottom_10 == bottom_10_duration)

# Which players are in the bottom 10% of gambles but not in the bottom 10% of duration?
print("# of Players in the bottom 10% of gambles but not in the bottom 10% of duration:", len(set(bottom_10) - set(bottom_10_duration)))


Are the lists top_20_gambles and top_20_duration players the same? False
# of Players in the top 20% of gambles but not in the top 20% of duration: 301
------------------------------------------------------------------
Are the lists bottom_20_gambles and bottom_20_duration players the same? False
# of Players in the bottom 20% of gambles but not in the bottom 20% of duration: 404
------------------------------------------------------------------
Are the lists top_10_gambles and top_10_duration players the same? False
# of Players in the top 10% of gambles but not in the top 10% of duration: 193
------------------------------------------------------------------
Are the lists bottom_10_gambles and bottom_10_duration players the same? False
# of Players in the bottom 10% of gambles but not in the bottom 10% of duration: 248


# Save Dataframes

In [60]:
# Define saving path
path = "/Users/mau/Library/CloudStorage/Dropbox/Mac/Documents/Dissertation/Chapter 2/Entire_Data/By month/3_July"
os.chdir(path)

# General Dataframe to parquet
dtf.to_parquet("General.parquet")

# Save the dataframes for the top 20% and bottom 20% of players
dtf_20_gambles.to_parquet("Combine_20_gambles.parquet")
dtf_20_duration.to_parquet("Combine_20_duration.parquet")

# Save the dataframes for the top 10% and bottom 10% of players
dtf_10_gambles.to_parquet("Combine_10_gambles.parquet")
dtf_10_duration.to_parquet("Combine_10_duration.parquet")

# Save bottom 20% of players
dtf_bottom_20_duration.to_parquet("Bottom_20_duration.parquet")

# Save top 20% of players
dtf_top_20_duration.to_parquet("Top_20_duration.parquet")

# Save bottom 10% of players
dtf_bottom_10_duration.to_parquet("Bottom_10_duration.parquet")

# Save top 10% of players
dtf_top_10_duration.to_parquet("Top_10_duration.parquet")

# Save bottom 20% of players
dtf_bottom_20.to_parquet("Bottom_20_gambles.parquet")

# Save top 20% of players
dtf_top_20.to_parquet("Top_20_gambles.parquet")

# Save bottom 10% of players
dtf_bottom_10.to_parquet("Bottom_10_gambles.parquet")

# Save top 10% of players
dtf_top_10.to_parquet("Top_10_gambles.parquet")
