# Similarities and Differences between Categories (Pt.1 - Age, Gender, Won/Lost Amount)

In [1]:
# Define libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.dates as mdates
import os
import plotting_fn as pf
import counting_fns as cf

# Notebook configuration: the month folder to analyse and a cut-off value
# (cut_off is not referenced by any of the cells visible in this section)
month_file = '6_October'
cut_off = 10000

# Make this month's data folder the working directory so that the data file
# can be read below using just its file name
os.chdir(os.path.join(
    "/Users/mau/Library/CloudStorage/Dropbox/Mac/Documents/Dissertation/Chapter 2/Entire_Data/By month/",
    month_file,
))


In [2]:
# Load the player data from the current working directory
df = pd.read_parquet("top_vs_ntop_players.parquet")

# Round the wagered amount, profit and percent return to 1 decimal place
for column_name in ('wageredamt', 'profit', 'percent_return'):
    df[column_name] = df[column_name].round(1)


In [3]:
# Classify each gamble from its percent return:
#   'loss'     - percent_return is exactly -100
#   'near-hit' - any other negative percent_return
#   'gain'     - positive percent_return
#   'draw'     - everything else (a return of 0, and any missing value)
# np.select uses the first matching condition, mirroring an if/elif chain, and is
# vectorised so it avoids calling a Python lambda once per row.
df['result_type'] = np.select(
    [df['percent_return'] == -100, df['percent_return'] < 0, df['percent_return'] > 0],
    ['loss', 'near-hit', 'gain'],
    default='draw',
)

# Create one indicator column per result type: '#D', '#G', '#L', '#N'.
# Declaring the categories explicitly guarantees all four columns exist even when
# a result type never occurs in the data (later cells select them by name).
result_categories = ['draw', 'gain', 'loss', 'near-hit']
dummy_variables = pd.get_dummies(
    df['result_type'].astype(pd.CategoricalDtype(categories=result_categories))
)
dummy_variables.columns = ['#' + name[0].capitalize() for name in result_categories]

# Add the dummy variables to the original DataFrame. Indicator columns left over
# from a previous run of this cell are dropped first so that re-running the cell
# does not produce duplicate columns.
df = pd.concat(
    [df.drop(columns=dummy_variables.columns, errors='ignore'), dummy_variables],
    axis=1,
).reset_index(drop=True)

# Convert start_time to datetime format for time operations
df['start_time'] = pd.to_datetime(df['start_time'])

In [4]:
# Add 'time_diff': seconds elapsed between the start of each gamble and the start
# of the previous gamble within the same (playerkey, session_time) group.
# The first gamble of a group has no predecessor, so its gap is filled with 0.
df['time_diff'] = (
    df.groupby(['playerkey', 'session_time'])['start_time']
    .diff()
    .dt.total_seconds()
    .fillna(0)
)

## General Overview

### Players, Visits, Sessions of Each Group

In [5]:
# Report how many distinct values the player, visit and session columns contain.
# dropna=False keeps a missing value counted as one distinct value, matching
# what len(<column>.unique()) reports.
for label, column_name in (
    ("Number of gamblers in dataframe:", 'playerkey'),
    ("Number of unique visits in dtf:", 'visit'),
    ("Number of unique sessions in dtf:", 'session_time'),
):
    print(label, df[column_name].nunique(dropna=False))
    print('--------------------------------------------------')

Number of gamblers in dataframe: 5570
--------------------------------------------------
Number of unique visits in dtf: 10
--------------------------------------------------
Number of unique sessions in dtf: 10
--------------------------------------------------


In [6]:
# Add 'depletion_rate': the change in 'playercashableamt' between the current
# gamble and the previous gamble within the same (playerkey, session_time) group.
# The first gamble of a group has no previous value, so it is set to 0.
cashable_by_session = df.groupby(['playerkey', 'session_time'])['playercashableamt']
df['depletion_rate'] = cashable_by_session.diff().fillna(0)


In [7]:
# Print name of columns currently in the dataframe
print("Name of columns:", df.columns)

# Columns kept for the statistics dataset.
# The list is named `columns_to_keep` rather than `filter` so that Python's
# built-in filter() is not shadowed for the rest of the notebook session.
columns_to_keep = ['playerkey', 'playercashableamt', 'wageredamt', 'maxbet', 'grosswin',
                   'theoreticalpaybackpercent', 'age', 'gender', 'slotdenomination',
                   'total_amt_won', 'profit', 'percent_return',
                   'clasification', 'result_type', '#D', '#G', '#L', '#N', 'time_diff',
                   'depletion_rate']

# Keep only the selected columns
df = df[columns_to_keep]

# Set saving directory
os.chdir("/Users/mau/Library/CloudStorage/Dropbox/Mac/Documents/Dissertation/Chapter 2/Entire_Data/Statistics")

# Save dataframe as parquet file
# NOTE(review): the file name hard-codes "october" while the input folder comes
# from `month_file`; confirm the name is updated when running another month.
df.to_parquet("data_for_stats_october.parquet")

Name of columns: Index(['playerkey', 'playercashableamt', 'wageredamt', 'maxbet', 'grosswin',
       'currencyinamt', 'assetnumber', 'theoreticalpaybackpercent', 'age',
       'rank', 'gender', 'date', 'start_time', 'end_time', 'duration',
       'slotdenomination', 'day', 'time', 'hour', 'timeofday', 'player_loss',
       'player_wins', 'total_amt_won', 'profit', 'percent_return', 'gambles',
       'visit', 'session_time', 'session_machine', 'gambles_visit',
       'gambles_session', 'gambles_machine', 'machines_played',
       'clasification', 'result_type', '#D', '#G', '#L', '#N', 'time_diff',
       'depletion_rate'],
      dtype='object')
