This Notebook is aimed at trying to analyse the E-Sport games present in the dataset with the goal of estimating the causal effect of patch recency on underdog win probability 

In [115]:
#Import libraries
import pandas as pd
from dowhy import CausalModel
import numpy as np 

For our data we will use the game_metadata.csv and game_player_stats.csv for the explanation of each column see github readme. The data is located in the out folder in the directory of this file
We will aggregrate the two into a dataset containing necessary info for the analysis

In [116]:
#Load datasets from csv files using pandas
game_metadata = pd.read_csv('./out/game_metadata.csv')
game_player_stats = pd.read_csv('./out/game_player_stats.csv')

print(f"Succesfully loaded datasets containing {len(game_metadata['gameid'].unique())} unique games")

Succesfully loaded datasets containing 74093 unique games


In [117]:
#Convert the date column of game_metadata to datetime 
game_metadata['date'] = pd.to_datetime(game_metadata['date'])
game_metadata = game_metadata.sort_values(by='date').reset_index(drop=True)
print(game_metadata.dtypes) 

gameid                object
date          datetime64[ns]
league                object
playoffs               int64
patch                float64
gamelength             int64
dtype: object


In [118]:
#Now we create our aggregrated dataframe
matches_df = game_metadata.copy()


In [119]:
#We now want to create a dictonary where we save for each patch when the first time was it was played 

patch_start = matches_df.groupby('patch')['date'].min().to_dict()

#Now we add that to the dataframe in the form of a column that has the days since patch as a value
matches_df['patch_start_date'] = matches_df['patch'].map(patch_start)
matches_df['days_since_patch'] = (matches_df['date'] - matches_df['patch_start_date']).dt.days

#For now we want to drop the patch start date column but we can add it again anytime
matches_df.drop(columns='patch_start_date', inplace=True)


In [120]:
#Now we want to add team_A and team_b to each game for that we first group the gameplayerstats in such a way that 
#we have only the two teams for each gameid
teams_df = game_player_stats.groupby(['gameid', 'teamid']).first().reset_index()

# Aggregate the teamid into a list per gameid 
teams_per_game = teams_df.groupby('gameid')['teamid'].agg(list).reset_index()

# Assign team_a and team_b
teams_per_game['team_a'] = teams_per_game['teamid'].apply(lambda x: x[0])
teams_per_game['team_b'] = teams_per_game['teamid'].apply(lambda x: x[1])

# Drop the list column
teams_per_game = teams_per_game.drop(columns=['teamid'])

# Merge with matches
matches_df = matches_df.merge(teams_per_game, on='gameid', how='left')
print(matches_df.dtypes)

gameid                      object
date                datetime64[ns]
league                      object
playoffs                     int64
patch                      float64
gamelength                   int64
days_since_patch             int64
team_a                      object
team_b                      object
dtype: object


In [121]:
#Now we want to get the winner e.g. the result of the match into the df
#First we groupby gameids and take the first row for our result since it will always be team_A
game_results = game_player_stats.groupby('gameid').apply(
    lambda x: pd.Series({
        'winner': 'team_a' if x.iloc[0]['result'] == 1 else 'team_b'
    })
).reset_index()

#Now we merge that into our matches df
matches_df = matches_df.merge(game_results, on='gameid', how='left')
print(matches_df.dtypes)

gameid                      object
date                datetime64[ns]
league                      object
playoffs                     int64
patch                      float64
gamelength                   int64
days_since_patch             int64
team_a                      object
team_b                      object
winner                      object
dtype: object


In [122]:
#Add the treatment variable, e.g. whether or not there has been a patch in the last seven days 
matches_df['recent_patch'] = ((matches_df['days_since_patch']) <= 7).astype(int)
print(matches_df['recent_patch'].value_counts())

0    53610
1    20483
Name: recent_patch, dtype: int64


Now we define the method of calculating our underdog and favorite, for which we will use the default Elo Model
We intialize our elo with 1500 for each team and update it using the following formula 
$R_{\text{new}} = R_{\text{old}} + K \cdot (\frac{1}{1 + 10^{(R_{\text{opponent}} - R_{\text{old}})/400}})$

In [123]:
def update_elo(rating_A, rating_B, result, k=20):
    outcome_a = None
    if result == 'team_a':
        outcome_a = 1
    else:
        outcome_a = 0
    
    expected_outcome_a = 1 / (1+10 ** ((rating_B-rating_A) / 400))
    expected_outcome_b = 1 - expected_outcome_a

    new_rating_a = rating_A + k*(outcome_a - expected_outcome_a)
    new_rating_b = rating_B + k*((1 - outcome_a) - expected_outcome_b)

    return new_rating_a, new_rating_b

In [124]:
#Now we want to calculate the elo for every match 
initial_elo = 1500
K = 20

# Track current Elo for each team in a dict for easy access
team_elo = {}

# Pre-allocate arrays for pre-match Elo
pre_elo_a = np.zeros(len(matches_df))
pre_elo_b = np.zeros(len(matches_df))

# Iterate through matches efficiently
team_a_list = matches_df['team_a'].to_numpy()
team_b_list = matches_df['team_b'].to_numpy()
results = matches_df['winner'].to_numpy()

team_streak = {}

winstreak_a = np.zeros(len(matches_df))
winstreak_b = np.zeros(len(matches_df))

for i in range(len(matches_df)):
    team_a = team_a_list[i]
    team_b = team_b_list[i]
    
    # Current Elo or initialize
    elo_a = team_elo.get(team_a, initial_elo)
    elo_b = team_elo.get(team_b, initial_elo)
    streak_a = team_streak.get(team_a, 0)
    streak_b = team_streak.get(team_b, 0)

    # Save pre-match Elo and winstreak
    pre_elo_a[i] = elo_a
    pre_elo_b[i] = elo_b
    winstreak_a[i] = streak_a
    winstreak_b[i] = streak_b

    # Update Elo
    new_rating_a, new_rating_b = update_elo(elo_a, elo_b, results[i], K)
    
    # Update dictionary
    team_elo[team_a] = new_rating_a
    team_elo[team_b] = new_rating_b

    #update the winstreak
    if results[i] == 'team_a':
        team_streak[team_a] = streak_a + 1
        team_streak[team_b] = 0
    else:
        team_streak[team_b] = streak_b + 1
        team_streak[team_a] = 0

# Assign back to DataFrame
matches_df['elo_a'] = pre_elo_a
matches_df['elo_b'] = pre_elo_b
matches_df['winstreak_a'] = winstreak_a
matches_df['winstreak_b'] = winstreak_b

In [125]:
#For example lets print out max and min elo 
max_elo = max(team_elo.values())
print("Max Elo across all teams:", max_elo)

print(matches_df.dtypes)

Max Elo across all teams: 1612.314340095789
gameid                      object
date                datetime64[ns]
league                      object
playoffs                     int64
patch                      float64
gamelength                   int64
days_since_patch             int64
team_a                      object
team_b                      object
winner                      object
recent_patch                 int64
elo_a                      float64
elo_b                      float64
winstreak_a                float64
winstreak_b                float64
dtype: object


In [126]:
#Now add the remaining columns 
#Add elo diff 
matches_df['elo_diff'] = matches_df['elo_a'] - matches_df['elo_b']

#Add underdog column
matches_df['underdog'] = matches_df.apply(
    lambda row: 'team_a' if row['elo_a'] < row['elo_b'] else 'team_b',
    axis=1
)

#Add a column for site advantage (e.g. if the underdog is blue side)
matches_df['underdog_side_adv'] = matches_df.apply(
    lambda row: 1 if row['underdog'] == 'team_a' else 0,
    axis=1
)
#add the outcome column or the upset column 
matches_df['upset'] = (matches_df['winner'] == matches_df['underdog']).astype(int)

#save matches_df since its an interesting / useful dataframe 
matches_df.to_csv('./out/matches_df.csv')

Now we analyse the causal effect using dowhy 

In [127]:
import warnings

treatment = 'recent_patch'  
outcome = 'upset'         
confounders = ['elo_diff']

warnings.simplefilter(action='ignore', category=FutureWarning)


# Create the causal model
model = CausalModel(
    data=matches_df,
    treatment=treatment,
    outcome=outcome,
    common_causes=confounders
)

# Identify causal effect
identified_estimand = model.identify_effect()

# Estimate the effect using a linear regression
estimate = model.estimate_effect(
    identified_estimand,
    method_name="backdoor.propensity_score_matching"
)

# Print the effect
print("Causal effect of recent_patch on upset:")
print(estimate.value)

# Optional: Refute the estimate to check robustness
refutation = model.refute_estimate(
    identified_estimand,
    estimate,
    method_name="placebo_treatment_refuter"
)
print(refutation)

Causal effect of recent_patch on upset:
-0.0021324551577072055
Refute: Use a Placebo Treatment
Estimated effect:-0.0021324551577072055
New effect:-7.355620638926755e-05
p value:0.94



In [128]:
print(matches_df.dtypes)

gameid                       object
date                 datetime64[ns]
league                       object
playoffs                      int64
patch                       float64
gamelength                    int64
days_since_patch              int64
team_a                       object
team_b                       object
winner                       object
recent_patch                  int64
elo_a                       float64
elo_b                       float64
winstreak_a                 float64
winstreak_b                 float64
elo_diff                    float64
underdog                     object
underdog_side_adv             int64
upset                         int64
propensity_score            float64
dtype: object


In [129]:
# Define treatment, outcome, and confounders
treatment = "underdog_side_adv"
outcome = "upset"
confounders = ["elo_diff", "elo_a", "elo_b", "recent_patch", "playoffs"]

# Initialize the causal model
model = CausalModel(
    data=matches_df,
    treatment=treatment,
    outcome=outcome,
    common_causes=confounders
)

# Identify the causal effect
identified_estimand = model.identify_effect()

# Estimate the causal effect using linear regression
estimate = model.estimate_effect(
    identified_estimand,
    method_name="backdoor.linear_regression"
)
print(f"Causal effect of {treatment} on {outcome}: {estimate.value}")

Causal effect of underdog_side_adv on upset: 0.06733957711046917


In [130]:
'''matches_df['treatment'] = (matches_df['winstreak_a'] >= 3).astype(int)  # Example for team A
matches_df['outcome'] = (matches_df['winner'] == 'team_a').astype(int)

import dowhy
from dowhy import CausalModel

# Choose the variables for the model
treatment = 'treatment'
outcome = 'outcome'
confounders = [
    'elo_diff',
    'winstreak_b',      # opponent streak
    'playoffs'
]

# Create a causal model
model = CausalModel(
    data=matches_df,
    treatment=treatment,
    outcome=outcome,
    common_causes=confounders
)

# View the causal graph
model.view_model(layout="dot")

# Identify the causal effect
identified_estimand = model.identify_effect()

# Estimate the causal effect using linear regression as a start
estimate = model.estimate_effect(
    identified_estimand,
    method_name="backdoor.linear_regression"
)

print("Causal Estimate:", estimate.value)
#print("p-value:", estimate.test_stat_significance['p_value'])

# Refute the estimate (robustness checks)
refute1 = model.refute_estimate(identified_estimand, estimate, method_name="random_common_cause")
print(refute1)

refute2 = model.refute_estimate(identified_estimand, estimate, method_name="placebo_treatment_refuter")
print(refute2)

refute3 = model.refute_estimate(identified_estimand, estimate, method_name="data_subset_refuter")
print(refute3)'''

'matches_df[\'treatment\'] = (matches_df[\'winstreak_a\'] >= 3).astype(int)  # Example for team A\nmatches_df[\'outcome\'] = (matches_df[\'winner\'] == \'team_a\').astype(int)\n\nimport dowhy\nfrom dowhy import CausalModel\n\n# Choose the variables for the model\ntreatment = \'treatment\'\noutcome = \'outcome\'\nconfounders = [\n    \'elo_diff\',\n    \'winstreak_b\',      # opponent streak\n    \'playoffs\'\n]\n\n# Create a causal model\nmodel = CausalModel(\n    data=matches_df,\n    treatment=treatment,\n    outcome=outcome,\n    common_causes=confounders\n)\n\n# View the causal graph\nmodel.view_model(layout="dot")\n\n# Identify the causal effect\nidentified_estimand = model.identify_effect()\n\n# Estimate the causal effect using linear regression as a start\nestimate = model.estimate_effect(\n    identified_estimand,\n    method_name="backdoor.linear_regression"\n)\n\nprint("Causal Estimate:", estimate.value)\n#print("p-value:", estimate.test_stat_significance[\'p_value\'])\n\n# R