# Exploratory Data Analysis

We are going to look for patterns among players who played only in the month of June. The purpose is to distinguish players who hold short positions from those who hold long positions, and to understand the reasons why they hold these positions.

In [55]:
# Import all necessary libraries for the project
import pandas as pd
import os
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import datetime as dt
import warnings
warnings.filterwarnings('ignore')


In [56]:
# Working directory: the parquet file below is read relative to this folder
path = "/Users/mau/Library/CloudStorage/Dropbox/Mac/Documents/Dissertation/Chapter 2/Entire_Data/By month"
os.chdir(path)

# Columns to read from the parquet file (the order here is the order of the loaded frame).
# NOTE(review): the name `filter` shadows Python's built-in `filter` for the rest of
# the notebook; it is kept unchanged because other cells may refer to it.
filter = [
    'playerkey',
    'playercashableamt',
    'wageredamt',
    'maxbet',
    'grosswin',
    'currencyinamt',
    'assetnumber',
    'theoreticalpaybackpercent',
    'age',
    'rank',
    'gender',
    'date',
    'start_time',
    'end_time',
    'duration',
    'slotdenomination',
]

# Load only the selected columns for June 2015
dtf = pd.read_parquet('month_6_year_2015.parquet', columns=filter)

# Show the loaded column names as the cell output
dtf.columns

Index(['playerkey', 'playercashableamt', 'wageredamt', 'maxbet', 'grosswin',
       'currencyinamt', 'assetnumber', 'theoreticalpaybackpercent', 'age',
       'rank', 'gender', 'date', 'start_time', 'end_time', 'duration',
       'slotdenomination'],
      dtype='object')

In [51]:
# Order the rows by player, then by date, then by gamble start time.
# The per-player diff/cumcount computations in later cells rely on this row order.
sort_keys = ['playerkey', 'date', 'start_time']
dtf = dtf.sort_values(by=sort_keys)

## Calculate Fundamental Variables

The following variables were calculated using existing data:
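* _player_loss_: the casino's gross win on each gamble with the sign flipped (`-grosswin`), i.e., the player's net result on that gamble (negative when the player loses money).
* _player_wins_: the amount the player wagered plus _player_loss_, i.e., the amount bet plus how much they won on that gamble.
* _percent_return_: the percentage return on the player's bet for each gamble played.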

$$\text{percent return} = \left(\frac{df[player\_wins] - df[wageredamt]}{df[wageredamt]}\right) \times 100$$

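* _playercashableamt_pct_change_: the rate of change of a player's outstanding gambling amount from one gamble to the next, computed separately for each player. It is stored as a fraction (multiply by 100 to read it as a percentage).

$$\text{playercashableamt change}_{t} = \frac{df[playercashableamt_{t}] - df[playercashableamt_{t-1}]}{df[playercashableamt_{t-1}]}$$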

In [57]:
# player_loss: the casino's grosswin with the sign flipped
dtf["player_loss"] = dtf["grosswin"].mul(-1)

# player_wins: amount wagered plus player_loss
dtf["player_wins"] = dtf["wageredamt"].add(dtf["player_loss"])

# percent_return: return on each gamble relative to the amount wagered, in percent
net_result = dtf["player_wins"].sub(dtf["wageredamt"])
dtf["percent_return"] = net_result.div(dtf["wageredamt"]).mul(100)

# Change in playercashableamt relative to the same player's previous row.
# pct_change() returns a fraction (0.10 means +10%); it is not multiplied by 100 here.
cashable_by_player = dtf.groupby("playerkey")["playercashableamt"]
dtf["playercashableamt_pct_change"] = cashable_by_player.pct_change()

# gambles: running counter per player that starts at 1 and grows by 1 with every row
dtf["gambles"] = dtf.groupby("playerkey").cumcount().add(1)


In [58]:
# Group the DataFrame by playerkey
groups = dtf.groupby('playerkey')

# A row opens a new visit when its date is at least one day later than the date
# of the same player's previous row. The first row of every player has no
# previous row (NaT difference), so it is never flagged.
new_visit = groups['date'].diff().dt.days.ge(1)

# Visit number = 1 + number of new-visit flags seen so far for that player, so
# every player's first visit is numbered 1.
# This replaces a transform-like groupby(...).apply(...) rebase: on pandas >= 2.0
# such an apply returns a result indexed by the group keys, which cannot be
# assigned back to the frame, and the rebase was redundant because the counter
# already starts at 1 for every player.
dtf['visit'] = new_visit.astype(int).groupby(dtf['playerkey']).cumsum() + 1


In [62]:
# Convert the start_time column to datetime
dtf["start_time"] = pd.to_datetime(dtf["start_time"])

# Flag rows that start more than 30 minutes after the same player's previous row.
# A player's first row has no previous row (NaT difference) and is not flagged.
long_gap = dtf.groupby('playerkey')['start_time'].diff() > pd.Timedelta(minutes=30)

# Running count of flagged rows over the whole frame, in row order
gap_count = long_gap.cumsum()

# Rebase the running count so that each (player, visit) starts at session 1.
# transform('first') returns a Series aligned to dtf's own index, so the result
# can be assigned back directly. The earlier groupby(...).apply(...) version
# returns a result indexed by the group keys on pandas >= 2.0 and fails on
# assignment.
session_visit_keys = [dtf['playerkey'], dtf['visit']]
gap_count_at_visit_start = gap_count.groupby(session_visit_keys).transform('first')
dtf['session_time'] = gap_count - gap_count_at_visit_start + 1

In [71]:
# Flag rows whose assetnumber differs from the same player's previous row.
# A player's first row has no previous row (NaN difference), and NaN != 0 is True,
# so it is flagged as well; the rebase below removes that offset.
machine_changed = dtf.groupby("playerkey")["assetnumber"].diff() != 0

# Running count of machine changes over the whole frame, in row order
change_count = machine_changed.cumsum()

# Rebase the running count so that each (player, visit) starts at machine session 1.
# transform('first') keeps dtf's own index, unlike a transform-like
# groupby(...).apply(...), whose result is indexed by the group keys on
# pandas >= 2.0 and cannot be assigned back to the frame.
machine_visit_keys = [dtf['playerkey'], dtf['visit']]
change_count_at_visit_start = change_count.groupby(machine_visit_keys).transform('first')
dtf['session_machine'] = change_count - change_count_at_visit_start + 1


## Frequency of Gambles

Let's see who plays the most, the least, and the min, max, average, and median number of gambles 

In [27]:
# Number of gambles per player: the highest value of each player's running
# `gambles` counter. Computed once and reused by every statistic below.
gambles_per_player = dtf.groupby('playerkey')['gambles'].max()

# Thresholds used for the player counts below. They are fixed values taken from
# this month's results: 150 is the reported median and 278 is the reported mean
# (277.47) rounded up. Update them if the data changes.
median_cutoff = 150
mean_cutoff = 278

separator = "--------------------------------------------"

# Total number of players
print("Total number of players:", dtf["playerkey"].nunique())

# Total number of gambles
print("Total number of gambles:", gambles_per_player.sum())

print(separator)
# Largest number of gambles played by a single player, and who that player is
print("Maximimum # of gambles of a single player:", gambles_per_player.max())
print("Player who gambled the most:", gambles_per_player.idxmax())

print(separator)

# Smallest number of gambles played by a single player, and who that player is
print("Minimum # of gambles of a single player:", gambles_per_player.min())
print("Player who gambled the least:", gambles_per_player.idxmin())

print(separator)

# Median number of gambles per player
print("Median # of gambles of all players:", round(gambles_per_player.median(), 2))

# How many players sit exactly at, below, and above the median cutoff
print(f"Number of players who played exactly {median_cutoff} times:", gambles_per_player[gambles_per_player == median_cutoff].count())
print(f"Number of players who gambled less than {median_cutoff} times:", gambles_per_player[gambles_per_player < median_cutoff].count())
print(f"Number of players who gambled more than {median_cutoff} times:", gambles_per_player[gambles_per_player > median_cutoff].count())
print(separator)

# Average number of gambles per player
print("Average # of gambles of all players:", round(gambles_per_player.mean(), 2))

# How many players gambled at least as often as the (rounded-up) average
print(f"Number of players who gambled at least {mean_cutoff} times:", gambles_per_player[gambles_per_player >= mean_cutoff].count())

Total number of players: 282
Total number of gambles: 78246
--------------------------------------------
Maximimum # of gambles of a single player: 3107
Player who gambled the most: 33
--------------------------------------------
Minimum # of gambles of a single player: 1
Player who gambled the least: 15
--------------------------------------------
Median # of gambles of all players: 150.0
Number of players who played exactly 150 times: 3
Number of players who gambled less than 150 times: 140
Number of players who gambled more than 150 times: 139
--------------------------------------------
Average # of gambles of all players: 277.47
Number of players who gambled at least 278 times: 87
