# Exploratory Data Analysis
I am using this notebook to learn how gamblers' behaviors are similar to those of investors.

## Define Libraries

In [1]:
import pandas as pd
import os
import numpy as np
import matplotlib.pyplot as plt
import ipywidgets as widgets
from ipywidgets import interactive, fixed, IntSlider, HBox, Layout, VBox

# Silence pandas' SettingWithCopyWarning for the whole session.
# NOTE(review): this only hides the warning; it does not change what an
# assignment on a filtered frame actually does. Prefer taking an explicit
# .copy() where a filtered frame is modified — TODO confirm this switch is still needed.
pd.options.mode.chained_assignment = None

## Upload Data

In [2]:
# Set working directory so the relative parquet file names used in this
# notebook resolve inside the data folder.
# NOTE(review): this absolute path is specific to one machine; the notebook
# will not run elsewhere until it is replaced by a configurable location.
path = '/Users/mau/Library/CloudStorage/Dropbox/Mac/Documents/Dissertation/Chapter 2/Data'
os.chdir(path)

# Load data into a DataFrame
dtf = pd.read_parquet("df40.parquet")

# DataFrame.info() prints its summary itself and returns None, so it is called
# directly; wrapping it in print() only appended a stray "None" to the output.
dtf.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 90000 entries, 0 to 90273
Data columns (total 12 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   playercashableamt             90000 non-null  float64
 1   wageredamt                    90000 non-null  float64
 2   casino_grosswin               90000 non-null  float64
 3   playerkey                     90000 non-null  int64  
 4   slotdenominationname          90000 non-null  object 
 5   slotthemekey                  90000 non-null  int64  
 6   maxbet                        90000 non-null  int64  
 7   player_loss                   90000 non-null  float64
 8   player_wins                   90000 non-null  float64
 9   percent_return                90000 non-null  float64
 10  playercashableamt_pct_change  89823 non-null  float64
 11  time                          90000 non-null  int64  
dtypes: float64(7), int64(4), object(1)
memory usage: 8.9+ MB
Non

In [3]:
# Show the raw denomination labels before any cleaning
print(dtf['slotdenominationname'].unique())

# Delete rows where slotdenominationname is equal to 'Unknown Denomination'.
# The explicit copy makes dtf an independent frame, so the column assignment
# below is a plain write rather than a write on a filtered slice.
dtf = dtf[dtf['slotdenominationname'] != 'Unknown Denomination'].copy()

# Show the labels again to confirm the unknown denomination is gone
print(dtf['slotdenominationname'].unique())

# Create slotdenomination: the label with dollar signs and spaces removed.
# regex=False makes both patterns literal text; '$' is otherwise a regex
# anchor, and relying on the implicit default is what triggered the warning.
dtf['slotdenomination'] = (
    dtf['slotdenominationname']
    .str.replace('$', '', regex=False)
    .str.replace(' ', '', regex=False)
)

# Show the cleaned labels (still strings at this point)
print(dtf['slotdenomination'].unique())

# Convert slotdenomination to float
dtf['slotdenomination'] = dtf['slotdenomination'].astype(float)

# Drop the original text column now that the numeric one exists
dtf = dtf.drop(columns=['slotdenominationname'])

# info() prints its own summary and returns None, so no print() wrapper
dtf.info()

['$5.00 ' '$10.00 ' '$1.00 ' '$2.00 ' '$0.25 ' '$0.05 ' '$0.01 ' '$0.02 '
 '$0.10 ' '$0.50 ' '$25.00 ' '$100.00 ' 'Unknown Denomination' '$50.00 ']
['$5.00 ' '$10.00 ' '$1.00 ' '$2.00 ' '$0.25 ' '$0.05 ' '$0.01 ' '$0.02 '
 '$0.10 ' '$0.50 ' '$25.00 ' '$100.00 ' '$50.00 ']
['5.00' '10.00' '1.00' '2.00' '0.25' '0.05' '0.01' '0.02' '0.10' '0.50'
 '25.00' '100.00' '50.00']
<class 'pandas.core.frame.DataFrame'>
Int64Index: 89996 entries, 0 to 90273
Data columns (total 12 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   playercashableamt             89996 non-null  float64
 1   wageredamt                    89996 non-null  float64
 2   casino_grosswin               89996 non-null  float64
 3   playerkey                     89996 non-null  int64  
 4   slotthemekey                  89996 non-null  int64  
 5   maxbet                        89996 non-null  int64  
 6   player_loss                   89996 non-nu

  dtf['slotdenomination'] = dtf['slotdenominationname'].str.replace('$', '').str.replace(' ', '')


### Change Recognition
It is time to see which players changed machines to either increase or decrease their minimum bets.  

* Define a function to look for the desired change.
* We are going to use _slotdenomination_ to observe the change.
* Create a variable _change_ that is 1 every time _slotdenomination_ changes, and 0 otherwise, for each player.

In [4]:
# Number of distinct playerkey values present in dtf
n_slot_players = dtf['playerkey'].nunique()
print("Total number of Slot Players:", n_slot_players)

Total number of Slot Players: 83


In [5]:
# Row-to-row change in slotdenomination within each player's own rows.
# Computed once and reused for both flags. The first row of every player has
# no predecessor, so its diff is NaN; NaN compares False against 0 and is
# therefore flagged as "no change" in both columns.
slotdeno_diff = dtf.groupby("playerkey")["slotdenomination"].diff()

# increase_slotdeno: 1 on rows where the player's slotdenomination went up, 0 otherwise
dtf["increase_slotdeno"] = (slotdeno_diff > 0).astype(int)

# Players with at least one increase in slotdenomination
players_increase_slot = dtf.loc[dtf["increase_slotdeno"] == 1, "playerkey"].unique().tolist()
print("Players who change slot for higher min bet:", players_increase_slot)
print("Count of players that increase their slotdeno:", len(players_increase_slot))

# decrease_slot: 1 on rows where the player's slotdenomination went down, 0 otherwise
dtf["decrease_slot"] = (slotdeno_diff < 0).astype(int)

# Players with at least one decrease in slotdenomination
players_decrease_slot = dtf.loc[dtf["decrease_slot"] == 1, "playerkey"].unique().tolist()
print("Players who change slot for lower min bet:", players_decrease_slot)
print("Count of players that decrease their slotdeno:", len(players_decrease_slot))

Players who change slot for higher min bet: [2, 3, 4, 6, 7, 8, 9, 11, 12, 14, 17, 18, 19, 20, 27, 29, 33, 35, 36, 37, 38, 40, 41, 43, 44, 47, 48, 49, 51, 54, 56, 57, 61, 68, 69, 70, 73, 76, 79, 83, 84, 85, 87, 89, 91, 92, 93, 94, 95, 97]
Count of players that increase their slotdeno: 50
Players who change slot for lower min bet: [2, 3, 4, 6, 8, 9, 11, 12, 14, 18, 19, 20, 27, 29, 30, 33, 35, 37, 38, 43, 44, 47, 48, 49, 51, 53, 54, 61, 65, 68, 69, 70, 72, 73, 76, 79, 83, 85, 86, 87, 89, 90, 91, 93, 94, 95, 96, 97, 99, 100]
Count of players that decrease their slotdeno: 50


In [6]:
# Same flags as for slotdenomination, this time for maxbet.
# Row-to-row change in maxbet within each player's own rows, computed once and
# reused for both flags. A player's first row has a NaN diff, which compares
# False against 0 and is therefore flagged as "no change" in both columns.
maxbet_diff = dtf.groupby("playerkey")["maxbet"].diff()

# increase_maxbet: 1 on rows where the player's maxbet went up, 0 otherwise
dtf["increase_maxbet"] = (maxbet_diff > 0).astype(int)

# Players with at least one increase in maxbet
players_increase_maxbet = dtf.loc[dtf["increase_maxbet"] == 1, "playerkey"].unique().tolist()
print("Players that increase their bet:", players_increase_maxbet)
print("Count of players that increase their bet:", len(players_increase_maxbet))

# decrease_maxbet: 1 on rows where the player's maxbet went down, 0 otherwise
dtf["decrease_maxbet"] = (maxbet_diff < 0).astype(int)

# Players with at least one decrease in maxbet
players_decrease_maxbet = dtf.loc[dtf["decrease_maxbet"] == 1, "playerkey"].unique().tolist()
print("Players that decrease their bet:", players_decrease_maxbet)
print("Count of players that decrease their bet:", len(players_decrease_maxbet))

Players that increase their bet: [2, 3, 4, 6, 8, 11, 12, 13, 14, 16, 17, 18, 19, 20, 21, 22, 23, 27, 29, 30, 33, 35, 36, 37, 38, 43, 44, 47, 48, 49, 51, 52, 53, 54, 61, 62, 63, 65, 66, 68, 69, 70, 72, 73, 76, 77, 79, 82, 83, 84, 85, 87, 89, 90, 91, 93, 94, 95, 96, 97, 99, 100]
Count of players that increase their bet: 62
Players that decrease their bet: [3, 4, 6, 7, 8, 11, 12, 13, 14, 16, 17, 18, 19, 20, 21, 22, 23, 27, 33, 35, 36, 37, 38, 40, 41, 42, 43, 44, 46, 47, 48, 49, 51, 54, 56, 61, 62, 63, 65, 66, 68, 69, 70, 72, 73, 76, 77, 79, 83, 84, 85, 87, 89, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100]
Count of players that decrease their bet: 63


## Slicing DataFrames per Matched Players and Visualizing Outcomes

Stuff is working perfectly!


In [None]:
def filter_match(df, players_match, match_column, rolling_window, fill_value):
    """Keep the selected players and flag the rows lying around each match.

    Parameters
    ----------
    df : DataFrame
        Must contain a "playerkey" column and the column named by match_column.
    players_match : list-like
        playerkey values to keep.
    match_column : str
        Name of a 0/1 (or boolean) column that marks the matched rows.
    rolling_window : int
        Width, in rows, of the centered window placed around every row.
    fill_value : scalar
        Value substituted where a window holds no valid observation
        (for example when every flag inside it is missing).

    Returns
    -------
    (df_match_all, df_match_slice)
        df_match_all holds every row of the selected players plus an integer
        "match_rolling" column; df_match_slice holds only the rows of
        df_match_all where "match_rolling" equals 1.
    """
    # Independent copy: the new column is written to this frame only, never
    # to (a view of) the caller's DataFrame.
    df_match_all = df[df["playerkey"].isin(players_match)].copy()

    if df_match_all.empty:
        # Nothing to window over; still provide the column so callers always
        # receive the same schema.
        df_match_all["match_rolling"] = df_match_all[match_column].astype(int)
    else:
        def _any_in_window(flags):
            # For 0/1 flags the window maximum is 1 exactly when at least one
            # row in the window is a match. min_periods=1 lets the truncated
            # windows at the start and end of a player's rows be evaluated too.
            return (
                flags.astype(float)
                .rolling(window=rolling_window, center=True, min_periods=1)
                .max()
            )

        # Window each player separately so a match belonging to one player
        # never marks rows of the player stored next to it in the frame.
        near_match = df_match_all.groupby("playerkey", sort=False)[match_column].transform(_any_in_window)
        df_match_all["match_rolling"] = near_match.fillna(fill_value).astype(int)

    # Slice the DataFrame to only include the rows flagged by the window
    df_match_slice = df_match_all[df_match_all["match_rolling"] == 1]
    return df_match_all, df_match_slice

In [None]:
# Keep only the players listed in players_match8 and mark the rows around
# each match_0_8 hit using a 21-row window.
# NOTE(review): df40, players_match8 and the match_0_8 column are not created
# in the cells visible here — confirm where they come from before a fresh run.
df40_match8_10_all, dtf40_match8_10_slice = filter_match(
    df=df40,
    players_match=players_match8,
    match_column="match_0_8",
    rolling_window=21,
    fill_value=False,
)

# Write each result to its own parquet file and report its dimensions
for frame, parquet_name in (
    (df40_match8_10_all, "df40_match8_10_all.parquet"),
    (dtf40_match8_10_slice, "dtf40_match8_10_slice.parquet"),
):
    frame.to_parquet(parquet_name)
    print(frame.shape)

In [None]:

# Keep only the players listed in players_match4_11 and mark the rows around
# each match_4_11 hit using a 21-row window.
# NOTE(review): df40, players_match4_11 and the match_4_11 column are not
# created in the cells visible here — confirm where they come from before a fresh run.
df40_match4_10_all, dtf40_match4_10_slice = filter_match(
    df=df40,
    players_match=players_match4_11,
    match_column="match_4_11",
    rolling_window=21,
    fill_value=False,
)

# Write each result to its own parquet file and report its dimensions
for frame, parquet_name in (
    (df40_match4_10_all, "df40_match4_10_all.parquet"),
    (dtf40_match4_10_slice, "dtf40_match4_10_slice.parquet"),
):
    frame.to_parquet(parquet_name)
    print(frame.shape)

## Interactive Plots

The following section is used to explore the data in an interactive way. These plots allow for user interaction, such as zooming, panning, and selecting data points. Users can customize the plot by choosing different variables to plot, adjusting axes ranges, and selecting data subsets. The interactive plots provide a dynamic way to visually explore the data and can reveal patterns or relationships that might not be apparent from static plots alone. By using interactive plots, we can gain a deeper understanding of the data and make more informed decisions during the data analysis process.

In [None]:
import matplotlib.pyplot as plt
import ipywidgets as widgets

# Matched-and-sliced frames the interactive plot can switch between
dtf_lists = [dtf40_match8_10_slice, dtf40_match4_10_slice]

# Overall span of the 'time' column across every frame in dtf_lists
time_max = max(frame["time"].max() for frame in dtf_lists)
time_min = min(frame["time"].min() for frame in dtf_lists)

print(time_max, time_min)

# Scatter plot of one player's rows taken from one of the frames in dtf_lists
def plot_scatters(player_ID, df_index, x="time", y="percent_return", y_2=None, x_min=None, x_max=None, show_line=False, shade_area=False):
    """Plot column y (and optionally y_2) against column x for a single player.

    Parameters
    ----------
    player_ID : int or None
        Position of the player inside the list of unique playerkey values of
        the selected frame (not the playerkey itself).
    df_index : int
        Position of the frame to use inside dtf_lists.
    x, y : str
        Column names for the horizontal axis and the primary vertical axis.
    y_2 : str or None
        Optional column drawn on a secondary vertical axis.
    x_min, x_max : number or None
        When both are given, only rows with x_min <= x <= x_max are drawn.
    show_line : bool
        Also connect the points with a line.
    shade_area : bool
        Also shade the area between the values and zero.
    """
    df = dtf_lists[df_index]
    players = df["playerkey"].unique().tolist()

    # The player dropdown can momentarily hold no selection, or a position
    # left over from a frame with more players, right after the frame
    # selection changes; skip drawing instead of raising.
    if player_ID is None or player_ID >= len(players):
        return

    player_df = df[df["playerkey"] == players[player_ID]]

    # Restrict to the requested x-range once; both axes use the same rows
    if x_min is not None and x_max is not None:
        player_df = player_df[(player_df[x] >= x_min) & (player_df[x] <= x_max)]

    fig, ax1 = plt.subplots()
    ax1.set_xlabel(x)
    ax1.set_ylabel(y, color='royalblue')
    ax1.scatter(x=player_df[x], y=player_df[y], color='royalblue')

    if y_2 is not None:
        ax2 = ax1.twinx()
        # Label colour matches the markers and tick labels of this axis
        ax2.set_ylabel(y_2, color='orangered')
        ax2.scatter(x=player_df[x], y=player_df[y_2], color='orangered', marker='s')
        ax2.tick_params(axis='y', labelcolor='orangered')
        # Add a line to the plot if show_line is True
        if show_line:
            ax2.plot(player_df[x], player_df[y_2], color='black', linewidth=0.8, linestyle='--')
        if shade_area:
            ax2.fill_between(player_df[x], player_df[y_2], color='lightcoral', alpha=0.5)

    if show_line:
        ax1.plot(player_df[x], player_df[y], color='black', linewidth=0.8)

    if shade_area:
        ax1.fill_between(player_df[x], player_df[y], color='lightblue', alpha=0.5)

    ax1.tick_params(axis='y', labelcolor='black')
    ax1.grid()
    # Set the title on the primary axes explicitly rather than on whichever
    # axes happens to be current
    ax1.set_title(f"Player {players[player_ID]}")
    plt.show()

# Controls for the frame, the plotted columns, the x-range and the overlays.
# The column choices are taken from the first frame in dtf_lists.
frame_choices = [(f"DataFrame {position}", position) for position in range(len(dtf_lists))]
column_names = list(dtf_lists[0].columns)

df_index_widget = widgets.Dropdown(options=frame_choices)
x_widget = widgets.Dropdown(options=column_names, value="time")
y_widget = widgets.Dropdown(options=column_names, value="percent_return")
y_2_widget = widgets.Dropdown(options=[None] + column_names, value=None)
x_min_widget = widgets.FloatText(description="x_min", value=time_min)
x_max_widget = widgets.FloatText(description="x_max", value=time_max)
show_line_widget = widgets.Checkbox(description='Show line', value=False)
shade_area_widget = widgets.Checkbox(description='Shade area', value=False)

# Refresh the player dropdown so it lists the players of the selected frame
def update_players_widget(df_index):
    """Point player_key_widget at the players found in dtf_lists[df_index].

    Every option is a (playerkey, position) pair, where position is the index
    of that playerkey in the frame's list of unique players.
    """
    unique_players = dtf_lists[df_index]["playerkey"].unique().tolist()
    player_key_widget.options = [
        (key, position) for position, key in enumerate(unique_players)
    ]

# Frame selected when the controls are first built, and the players it holds
initial_df_index = df_index_widget.value
initial_df = dtf_lists[initial_df_index]
initial_players = initial_df["playerkey"].unique().tolist()

# The player dropdown starts empty and is filled right away from that frame
player_key_widget = widgets.Dropdown(options=[], value=None)
update_players_widget(initial_df_index)

# Bind every control to the matching plot_scatters argument and show the plot
plot_controls = dict(
    player_ID=player_key_widget,
    df_index=df_index_widget,
    x=x_widget,
    y=y_widget,
    y_2=y_2_widget,
    x_min=x_min_widget,
    x_max=x_max_widget,
    show_line=show_line_widget,
    shade_area=shade_area_widget,
)
widgets.interact(plot_scatters, **plot_controls)

# Keep the player dropdown in sync with the selected frame
def on_df_index_change(change):
    """Observer callback: reload the player options using the change's new value."""
    selected_index = change.new
    update_players_widget(selected_index)

# Call on_df_index_change whenever the 'value' trait of df_index_widget changes
df_index_widget.observe(on_df_index_change, names='value')
update_players_widget(initial_df_index)  # update the player_key_widget options initially
