In [1]:
!pip install pymc
!pip install numpyro
!pip install aeppl
!pip install jax

Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable


In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
%cd drive
%cd MyDrive
%cd cs179

/content/drive
/content/drive/MyDrive
/content/drive/MyDrive/cs179


In [2]:
import pandas as pd

csv_file_path = '/drive/MyDrive/cs179/games.csv'

#getting info
df = pd.read_csv('games.csv')
df = df[df['rated'] != False]

#print(df['white_id'])
white_ids = df['white_id']
black_ids =  df["black_id"]
winners = df["winner"]

print(white_ids[:5])

1             a-00
2           ischia
3    daniamurashov
4        nik221107
6          capa_jr
Name: white_id, dtype: object


In [3]:
import numpy as np

uniqueppl = white_ids.tolist()
uniqueppl.extend(black_ids.tolist())

uppl = np.array(list(set(uniqueppl)))#set of unique people

n_players = len(uppl)

# reformat gamesplayed -> winner first, loser second
gamesplayed = []
for game in range(len(df)):
    white_id = df.iloc[game].white_id
    black_id = df.iloc[game].black_id

    if df.iloc[game].winner == "white":
        gamesplayed.append([white_id, black_id])
    if df.iloc[game].winner == "black":
        gamesplayed.append([black_id, white_id])

print(gamesplayed[0:5])

[['skinnerua', 'a-00'], ['ischia', 'a-00'], ['daniamurashov', 'adivanov2009'], ['nik221107', 'adivanov2009'], ['capa_jr', 'daniel_likes_chess']]


In [4]:
import numpy as np
import pymc as pm

# convert str to int
player_to_id = {player: i for i, player in enumerate(uppl)}
player_ids = np.array([[player_to_id[winner], player_to_id[loser]] for winner, loser in gamesplayed])

# check sizes -> something is wrong here. tracce has more players than uppl
print(len(uppl))
print(len(player_to_id))
print(len(player_ids))

with pm.Model() as model:
    player_sd = pm.HalfNormal("player_sd", sigma=1.0)
    player_skills_raw = pm.Normal(
        "player_skills_raw", 0.0, sigma=1.0, shape=(n_players,)
    )
    player_skills = pm.Deterministic("player_skills", player_skills_raw * player_sd)

    winner_ids = player_ids[:, 0]
    loser_ids = player_ids[:, 1]

    logit_skills = player_skills[winner_ids] - player_skills[loser_ids]
    lik = pm.Bernoulli(
        "win_lik", logit_p=logit_skills, observed=np.ones(winner_ids.shape[0])
    )

    # Sample using the numpyro backend for JAX
    trace = pm.sample(1000, tune=1000, chains=4, cores=1, nuts_sampler='numpyro', random_seed=42)



13179
13179
15436


  from .autonotebook import tqdm as notebook_tqdm
Compiling.. :   0%|          | 0/2000 [00:00<?, ?it/s]
[A
[A

[A[A

[A[A
[A

Running chain 0:   0%|          | 0/2000 [00:06<?, ?it/s]

Running chain 0:   5%|▌         | 100/2000 [00:32<08:04,  3.92it/s]

Running chain 0:  10%|█         | 200/2000 [00:38<04:21,  6.88it/s]
[A

Running chain 0:  15%|█▌        | 300/2000 [00:44<03:00,  9.44it/s]
[A

Running chain 0:  20%|██        | 400/2000 [00:51<02:25, 11.03it/s]
[A

Running chain 0:  25%|██▌       | 500/2000 [00:58<02:02, 12.25it/s]
[A

Running chain 0:  30%|███       | 600/2000 [01:04<01:46, 13.12it/s]
[A

Running chain 0:  35%|███▌      | 700/2000 [01:11<01:35, 13.61it/s]
[A

Running chain 0:  40%|████      | 800/2000 [01:16<01:18, 15.35it/s]
[A

Running chain 0:  45%|████▌     | 900/2000 [01:21<01:06, 16.65it/s]

[A[A
Running chain 0:  50%|█████     | 1000/2000 [01:27<01:00, 16.47it/s]

[A[A
Running chain 0:  55%|█████▌    | 1100/2000 [01:32<00:51, 17.57it/s]

[A

In [5]:
import arviz as az

summary = az.summary(trace, kind="stats")

In [6]:
# filter out raw player skills (useless)
player_skills_summary = summary[~summary.index.str.startswith('player_skills_raw')]
player_skills_summary = player_skills_summary[1:]

player_means = player_skills_summary['mean']
player_sds = player_skills_summary['sd']

# Create df
player_skills_df = pd.DataFrame({
    'Player Name': uppl,
    'Skill Mean': player_means.values,
    'Skill SD': player_sds.values
})

sorted_player_skills_df = player_skills_df.sort_values(by='Skill Mean', ascending=False)
print(sorted_player_skills_df)


           Player Name  Skill Mean  Skill SD
8139         chesscarl       3.370     0.653
630           siindbad       3.058     0.778
4969         smilsydov       3.002     0.677
12204         mmichael       2.875     0.802
10262      amir2002zzz       2.808     0.816
...                ...         ...       ...
7791   thebestofthebad      -2.423     0.955
10523       andreschil      -2.530     0.632
12578         ghaffari      -2.561     0.702
5962          mccheese      -2.927     0.802
7752        sveenemand      -3.002     0.790

[13179 rows x 3 columns]


In [7]:
# Concatenate white_id and black_id columns to get all player ids
all_player_ids = pd.concat([df['white_id'], df['black_id']]).unique()

player_avg_ratings = {}

for player_id in all_player_ids:
    # Get ratings for the current player from both white and black ratings
    white_ratings = df.loc[df['white_id'] == player_id, 'white_rating']
    black_ratings = df.loc[df['black_id'] == player_id, 'black_rating']
    # Calculate the average rating for the player
    avg_rating = pd.concat([white_ratings, black_ratings]).mean()
    # Store the average rating for the player
    player_avg_ratings[player_id] = avg_rating

sorted_player_avg_ratings = sorted(player_avg_ratings.items(), key=lambda x: x[1], reverse=True)

avg_rating_df = pd.DataFrame(sorted_player_avg_ratings, columns=['Player Name', 'Elo Rating'])

print(avg_rating_df)


               Player Name  Elo Rating
0              blitzbullet     2622.00
1                 avill050     2588.00
2               teatime007     2575.25
3                   tree33     2540.00
4                 lexisvar     2513.00
...                    ...         ...
13174           canabidiol      802.50
13175          epicchess66      795.50
13176              natalua      793.00
13177             hortense      791.00
13178  ragnarlothbrook_spb      784.00

[13179 rows x 2 columns]


In [8]:
from scipy.stats import spearmanr

# Get actual and predicted rankings for all players
actual_rankings_all = []
predicted_rankings_all = []

for i, player_name in enumerate(sorted_player_skills_df['Player Name']):
    actual_ranking = avg_rating_df.loc[avg_rating_df['Player Name'] == player_name].index[0]
    actual_rankings_all.append(actual_ranking)
    predicted_rankings_all.append(i)

# Compute the Spearman rank correlation coefficient for all players
spearman_corr_all, _ = spearmanr(predicted_rankings_all, actual_rankings_all)
print(f"Spearman Rank Correlation Coefficient for All Players: {spearman_corr_all}")


Spearman Rank Correlation Coefficient for All Players: 0.19463985842664727
