In [1]:
!pip install pymc
!pip install numpyro
!pip install aeppl
!pip install jax

Collecting numpyro
  Downloading numpyro-0.15.0-py3-none-any.whl (345 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m345.7/345.7 kB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: numpyro
Successfully installed numpyro-0.15.0
Collecting aeppl
  Downloading aeppl-0.1.5-py3-none-any.whl (54 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m54.4/54.4 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
Collecting aesara>=2.8.13 (from aeppl)
  Downloading aesara-2.9.3-py3-none-any.whl (1.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.4/1.4 MB[0m [31m11.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: aesara, aeppl
Successfully installed aeppl-0.1.5 aesara-2.9.3


In [2]:
from google.colab import drive
# Mount Google Drive at /content/drive so its files can be reached by path.
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
%cd drive
%cd MyDrive
%cd cs179

/content/drive
/content/drive/MyDrive
/content/drive/MyDrive/cs179


In [4]:
import pandas as pd

# Absolute location of the games export on the mounted Drive, so that loading
# does not depend on the kernel's current working directory.
csv_file_path = '/content/drive/MyDrive/cs179/games.csv'

# Load the games table.
df = pd.read_csv(csv_file_path)

# Columns used by the rest of the notebook: the two player ids of each game
# and the recorded winner (compared against "white" / "black" downstream).
white_ids = df['white_id']
black_ids = df["black_id"]
winners = df["winner"]

print(white_ids[:5])

0         bourgris
1             a-00
2           ischia
3    daniamurashov
4        nik221107
Name: white_id, dtype: object


In [5]:
import numpy as np

# Every id that appears on either side of a game (duplicates included).
uniqueppl = white_ids.tolist()
uniqueppl.extend(black_ids.tolist())

# De-duplicate in first-appearance order. Going through set() made the order
# of string ids depend on per-session hash randomisation, so the position of
# each player (and therefore every index built from it) changed between
# kernel restarts even though the sampler itself is seeded.
uppl = np.array(list(dict.fromkeys(uniqueppl)))

n_players = len(uppl)

# reformat gamesplayed -> winner first, loser second
# Games whose winner is neither "white" nor "black" are skipped.
# A single pass over the three columns avoids rebuilding a row Series with
# df.iloc several times per game.
gamesplayed = []
for white_id, black_id, winner in zip(df["white_id"], df["black_id"], df["winner"]):
    if winner == "white":
        gamesplayed.append([white_id, black_id])
    elif winner == "black":
        gamesplayed.append([black_id, white_id])

print(gamesplayed[0:5])

[['bourgris', 'a-00'], ['skinnerua', 'a-00'], ['ischia', 'a-00'], ['daniamurashov', 'adivanov2009'], ['nik221107', 'adivanov2009']]


In [8]:
import numpy as np
import pymc as pm

# Map every player name to its position in uppl, then encode each game as a
# [winner_index, loser_index] pair of integers.
player_to_id = {player: i for i, player in enumerate(uppl)}
player_ids = np.array([[player_to_id[winner], player_to_id[loser]] for winner, loser in gamesplayed])

# Size check: the first two numbers are player counts and should match; the
# third is the number of encoded games (rows of player_ids), not players.
# NOTE(review): the posterior summary built later has more rows than there are
# players; it is filtered downstream to drop the player_skills_raw rows and
# one leading row - presumably because all three model variables are recorded.
print(len(uppl))
print(len(player_to_id))
print(len(player_ids))

with pm.Model() as model:
    # Shared scale of the skills; the raw skills are standard normal and are
    # multiplied by this scale to obtain the actual skills.
    player_sd = pm.HalfNormal("player_sd", sigma=1.0)
    player_skills_raw = pm.Normal(
        "player_skills_raw", 0.0, sigma=1.0, shape=(n_players,)
    )
    player_skills = pm.Deterministic("player_skills", player_skills_raw * player_sd)

    # Column 0 holds the winner's index, column 1 the loser's.
    winner_ids = player_ids[:, 0]
    loser_ids = player_ids[:, 1]

    # Each game is a Bernoulli outcome whose logit is the winner's skill minus
    # the loser's skill. Rows are stored winner-first, so every observation is 1.
    logit_skills = player_skills[winner_ids] - player_skills[loser_ids]
    lik = pm.Bernoulli(
        "win_lik", logit_p=logit_skills, observed=np.ones(winner_ids.shape[0])
    )

    # Sample using the numpyro backend for JAX
    # (1000 draws, tune=1000, 4 chains, fixed seed).
    trace = pm.sample(1000, tune=1000, chains=4, cores=1, nuts_sampler='numpyro', random_seed=42)

15635
15635
19108


  pmap_numpyro = MCMC(
sample: 100%|██████████| 2000/2000 [00:46<00:00, 43.12it/s, 31 steps of size 1.16e-01. acc. prob=0.86]
sample: 100%|██████████| 2000/2000 [00:42<00:00, 47.50it/s, 31 steps of size 1.15e-01. acc. prob=0.86]
sample: 100%|██████████| 2000/2000 [00:43<00:00, 46.44it/s, 31 steps of size 1.11e-01. acc. prob=0.88]
sample: 100%|██████████| 2000/2000 [00:45<00:00, 44.22it/s, 31 steps of size 1.27e-01. acc. prob=0.84]


In [9]:
import arviz as az

# Tabulate summary statistics of the trace; the 'mean' and 'sd' columns and
# the row labels (variable names) are what the following cell reads.
summary = az.summary(trace, kind="stats")

In [10]:
# Keep only the scaled skills. Their row labels start with "player_skills[",
# which excludes both the "player_skills_raw..." rows and "player_sd" by name.
# The previous version dropped "the first remaining row" with [1:], which
# silently shifts every skill by one player if the variables are stored in a
# different order.
is_player_skill = summary.index.str.startswith('player_skills[')
player_skills_summary = summary[is_player_skill]

# Rows are matched to uppl purely by position, so the counts must agree.
assert len(player_skills_summary) == len(uppl), (
    f"expected {len(uppl)} player_skills rows, found {len(player_skills_summary)}"
)

player_means = player_skills_summary['mean']
player_sds = player_skills_summary['sd']

# Create df
player_skills_df = pd.DataFrame({
    'Player Name': uppl,
    'Skill Mean': player_means.values,
    'Skill SD': player_sds.values
})

# Strongest estimated players first.
sorted_player_skills_df = player_skills_df.sort_values(by='Skill Mean', ascending=False)
print(sorted_player_skills_df)


               Player Name  Skill Mean  Skill SD
2847             chesscarl       3.476     0.664
3922              siindbad       3.250     0.803
2719              mmichael       3.091     0.840
7857           amir2002zzz       2.962     0.855
14387           steelviper       2.871     0.853
...                    ...         ...       ...
15419             ghaffari      -2.763     0.720
10588           sveenemand      -2.776     0.680
322               mccheese      -2.911     0.722
6262   josephelbouhessaini      -2.920     0.890
9220            stellanova      -3.136     0.812

[15635 rows x 3 columns]


In [11]:
# Stack the white and black sides so that every row is one appearance of a
# player together with the rating recorded for that appearance.
appearance_ids = pd.concat([df['white_id'], df['black_id']], ignore_index=True)
appearance_ratings = pd.concat([df['white_rating'], df['black_rating']], ignore_index=True)

# All player ids, in order of first appearance (white column first).
all_player_ids = appearance_ids.unique()

# Average rating per player in one grouped pass instead of re-scanning the
# whole table with two boolean masks for every player. sort=False keeps the
# first-appearance order, so ties in the ranking below fall the same way as
# when the dict was filled by looping over all_player_ids.
player_avg_ratings = (
    appearance_ratings.groupby(appearance_ids, sort=False).mean().to_dict()
)

# Highest average rating first.
sorted_player_avg_ratings = sorted(player_avg_ratings.items(), key=lambda x: x[1], reverse=True)

avg_rating_df = pd.DataFrame(sorted_player_avg_ratings, columns=['Player Name', 'Elo Rating'])

print(avg_rating_df)


               Player Name   Elo Rating
0               justicebot  2711.500000
1              blitzbullet  2622.000000
2                lance5500  2617.727273
3                 avill050  2588.000000
4      shahoviy_komentator  2586.000000
...                    ...          ...
15630          epicchess66   795.500000
15631     christinebitonti   795.000000
15632              natalua   793.000000
15633             hortense   791.000000
15634  ragnarlothbrook_spb   784.000000

[15635 rows x 2 columns]


In [12]:
from scipy.stats import spearmanr

# Position of every player in the rating table (0 = highest average rating).
# Built once so each lookup below is a dict access instead of a scan over the
# whole table per player. setdefault keeps the first occurrence of a name,
# matching what `.index[0]` returned.
actual_rank_by_name = {}
for rank, name in zip(avg_rating_df.index, avg_rating_df['Player Name']):
    actual_rank_by_name.setdefault(name, rank)

# Get actual and predicted rankings for all players
actual_rankings_all = []
predicted_rankings_all = []

# The skills table is already sorted best-first, so the loop counter is the
# predicted rank.
for i, player_name in enumerate(sorted_player_skills_df['Player Name']):
    actual_rankings_all.append(actual_rank_by_name[player_name])
    predicted_rankings_all.append(i)

# Compute the Spearman rank correlation coefficient for all players
spearman_corr_all, _ = spearmanr(predicted_rankings_all, actual_rankings_all)
print(f"Spearman Rank Correlation Coefficient for All Players: {spearman_corr_all}")


Spearman Rank Correlation Coefficient for All Players: 0.20530417334104267
