# Loading data

In [1]:
import numpy as np 
import matplotlib.pyplot as plt
import networkx as nx
import pystan

# Load training data and reduce (subsample) if desired

# First pass over the file: hand out a dense integer id to every player name.
with open('train.csv') as f:
    lines = f.read().split('\n')

playerid = {}
for line in lines:
    fields = line.split(',')
    if len(fields) != 10:
        continue  # parse error or blank line
    # Columns 1 and 4 carry the two player names; ids follow order of first appearance.
    for name in (fields[1], fields[4]):
        playerid.setdefault(name, len(playerid))

p = len(playerid)         # next unused id, as the running counter would have left it
nplayers = len(playerid)
# id -> name lookup: ids are exactly 0..nplayers-1, so sorting names by id inverts the map
playername = sorted(playerid, key=playerid.get)


# Sparsifying parameters (discard some training examples):
pKeep = 1.0   # fraction of games to consider (each game is dropped with probability 1-pKeep)
nEdge = 3     # try to keep nEdge opponents per player (may be more; asymmetric)
nKeep = 5     # keep at most nKeep games per pair of opponents (pairs may meet multiple times)
# Dedicated, seeded generator so the random subsample is repeatable when pKeep < 1.
subsample_rng = np.random.RandomState(0)

# nplays[a,b]: kept games between a and b (updated symmetrically).
# nwins[a,b]:  kept games in which a was marked '[winner]' against b.
nplays = np.zeros((nplayers, nplayers))
nwins = np.zeros((nplayers, nplayers))
for line in lines:
    fields = line.split(',')
    if len(fields) != 10:
        continue   # parse error or blank line
    a, b = playerid[fields[1]], playerid[fields[4]]
    aw, bw = fields[2] == '[winner]', fields[5] == '[winner]'
    if subsample_rng.rand() >= pKeep:
        continue   # game removed by the random subsampling
    pair_has_room = nplays[a, b] < nKeep
    a_needs_opponents = (nplays[a, :] > 0).sum() < nEdge
    b_needs_opponents = (nplays[:, b] > 0).sum() < nEdge
    # Keep the game only while the pair is under nKeep games AND at least one
    # of the two players still has fewer than nEdge distinct opponents.
    if pair_has_room and (a_needs_opponents or b_needs_opponents):
        nplays[a, b] += 1
        nplays[b, a] += 1
        nwins[a, b] += aw
        nwins[b, a] += bw

In [2]:
nplayers # number of distinct player names found in columns 1 and 4 of train.csv

999

In [3]:
# Only the print below produces output; the two bare expressions are not the
# last statement of the cell and serve as reminders of what each lookup holds.
playerid # dict: player name -> integer player ID
playername[0] # playername is a list of player names indexed by their ID
print(playername[0], "vs", playername[1])

MC vs Stats


In [4]:
nplays[0,1] # number of kept games between player 0 and player 1 (nplays[1,0] holds the same count)

2.0

In [5]:
nwins[0,1] # games player 0 won against player 1 (not displayed: it is not the cell's last expression)
np.max(nwins) # most wins any player has over a single opponent; at most nKeep (= 5) games are kept per pair

5.0

In [18]:
np.sum(nplays)  # each kept game is counted twice (nplays[a,b] and nplays[b,a]), so this is 2 x the number of kept games

9354.0

In [6]:
import numpy as np
import pystan
import matplotlib.pyplot as plt
%matplotlib inline

# Flatten the win matrix into parallel lists, one entry per ordered pair
# (PA, PB) for which PA has at least one recorded win over PB.
# NOTE(review): pairs where PA never beat PB are skipped and the number of
# games actually played (nplays) is not carried along -- confirm this is what
# the downstream model expects.
pa_idx, pb_idx = np.nonzero(nwins)                 # row-major order, same as a nested scan
win = nwins[pa_idx, pb_idx].astype(int).tolist()   # wins of PA over PB
PA = (pa_idx + 1).tolist()                         # shift to 1-based ids (player 0 becomes player 1)
PB = (pb_idx + 1).tolist()

In [7]:
# Preview the first six entries of the three parallel lists.
print(win[:6], " # of wins PA had over PB") 
print(PA[:6], " PA's ID") 
print(PB[:6], " PB's ID")
len(win)            # number of ordered (PA, PB) pairs with at least one PA win -- not the number of games

[1, 1, 2, 4, 4, 2]  # of wins PA had over PB
[1, 1, 1, 1, 1, 1]  PA's ID
[2, 4, 6, 7, 8, 9]  PB's ID


3321

##### Stan Model

In [8]:
# Stan program for the skill model. Line comments use `//`; the `#` form is
# deprecated in Stan.
skill_model = """
data {
  int<lower=1> N;                  // total number of players (999 here)
  int<lower=1> E;                  // number of matchups (3321 here)
  real<lower=0> scale;             // multiplier applied to the skill difference
  int<lower=1,upper=5> win[E];     // number of wins of PA[i] over PB[i]
  int PA[E];                       // 1-based id of the first player in matchup i
  int PB[E];                       // 1-based id of the second player in matchup i
}
parameters {
  vector<lower=0>[N] skill;        // skill value of each player (constrained non-negative)
}

model {
  skill ~ normal(0, 3);            // same prior for every player, vectorised
  for (i in 1:E) {
    // The win count is binomial; the success probability is the inverse logit
    // of the scaled skill difference.
    // NOTE(review): the number of trials is fixed at 5 for every matchup and
    // win[i] must be >= 1 -- confirm this matches the games actually played.
    win[i] ~ binomial_logit(5, scale * (skill[PA[i]] - skill[PB[i]]));
  }
}
"""

##### compile the model

In [9]:
import hashlib
import pickle

# Cache the compiled model on disk. The file name includes a hash of the Stan
# source, so editing skill_model can never silently load a stale compiled model.
model_hash = hashlib.md5(skill_model.encode('utf-8')).hexdigest()
model_cache_path = 'skill_model_{}.pkl'.format(model_hash)
try:
    # NOTE: unpickling executes arbitrary code; only load cache files written by this notebook.
    with open(model_cache_path, 'rb') as f:
        sm = pickle.load(f)
except Exception:
    # Missing or unusable cache: compile from source and save for next time.
    # (Exception rather than a bare except, so Ctrl-C is not swallowed.)
    sm = pystan.StanModel(model_code=skill_model)
    with open(model_cache_path, 'wb') as f:
        pickle.dump(sm, f)

INFO:pystan:COMPILING THE C++ CODE FOR MODEL anon_model_697fdb9c66a99070b5e525208306356f NOW.


In [10]:
# Inputs for the Stan data block. The sizes are derived from the data rather
# than hard-coded, so they stay correct if the sparsifying parameters change.
assert len(win) == len(PA) == len(PB), "win/PA/PB must be parallel lists"
skill_data = {
    'N': nplayers,     # number of players
    'E': len(win),     # number of matchups
    'scale': 0.7,
    'win': win,
    'PA': PA,
    'PB': PB
}

Now, we can perform MCMC on the model, and extract the samples:

In [11]:
# Run MCMC: one chain, 10000 iterations. The seed is fixed so that re-running
# the notebook does not produce a different set of draws each time.
fit = sm.sampling(data=skill_data, iter=10000, chains=1, seed=12345)

In [12]:
# Pull the posterior draws out of the fit; they are looked up by parameter name (samples['skill']).
samples = fit.extract()

If we just want the mean estimate for each player's skill level, just take the empirical average over the samples:

In [13]:
# Point estimate per player: average the 'skill' draws along axis 0.
player_skills = np.mean(samples['skill'], axis=0)
print(player_skills)

[0.90609144 0.62436126 0.63230061 0.23528964 0.71095205 1.01382656
 0.4787382  0.55994435 0.20579434 1.10893401 1.3415139  0.81154325
 0.76594974 0.59331022 0.89188294 1.09066603 2.07571307 1.16190688
 0.6345189  1.12845268 1.11594884 1.00383142 1.05371308 0.87824355
 0.62479565 1.67357827 1.46019313 0.68137913 0.40865401 0.48191956
 1.14568414 2.69331408 0.60610885 0.76724954 1.70582223 1.77137615
 1.55484869 1.41801412 0.43946007 1.32199332 1.6387192  0.28335897
 0.59844302 0.37010834 1.08937856 1.24711152 0.72221306 0.5649091
 1.22308589 0.5474032  1.02100391 0.76097246 2.05005469 0.95024122
 1.08799806 0.9914249  0.4886044  0.50692379 2.19385535 0.41892485
 0.97056795 0.58468874 1.23876689 0.79377365 0.39020452 1.39655607
 1.42364348 0.72045303 0.67867555 0.47404143 0.84546617 1.53763862
 2.58999221 1.00115166 0.26432951 1.4700894  0.68559427 1.45343636
 1.07990802 1.5787172  0.31131955 0.49195734 1.54557319 1.38511893
 0.61000298 1.12447567 0.86723017 2.82068234 0.52560691 2.87140

Finding the name of the player with the highest skill according to our model

In [14]:
# Index tuples of the largest and smallest posterior-mean skill.
ind = np.unravel_index(np.argmax(player_skills), player_skills.shape)
ind2 = np.unravel_index(np.argmin(player_skills), player_skills.shape)

# Report both extremes with the same message layout.
extremes = (
    ("the highest skill level is: ", ind[0]),
    ("the lowest skill level is: ", ind2[0]),
)
for label, idx in extremes:
    print(label, player_skills[idx], " and his name is: ", playername[idx])

the highest skill level is:  3.7176528801405655  and his name is:  Aicy
the lowest skill level is:  0.20579433786607296  and his name is:  Zest


The above result is surprising because Zest is a good player. According to https://www.lineups.com/esports/top-10-starcraft-ii-players-of-all-time/, the players listed below are the top ten players of all time, so they should all have high skill levels.

In [15]:
# Player ids of the ten players to inspect (a KeyError means the name is not in playerid).
top10_names = [
    "Mvp",
    "Life",
    "TaeJa",
    "MC",
    "Polt",
    "INnoVation",
    "Zest",
    "NesTea",
    "MMA",
    "Rain",
]
top10 = np.array([playerid[name] for name in top10_names])

# Loop variable is `pid`, not `id`: a top-level `id` would shadow the builtin
# for every later cell in the kernel.
for pid in top10:
    print(playername[pid], "'s skill level is: ", player_skills[pid])


Mvp 's skill level is:  0.6745208719647124
Life 's skill level is:  0.9705679476466387
TaeJa 's skill level is:  0.31654552078883597
MC 's skill level is:  0.9060914391783299
Polt 's skill level is:  0.8753359426189606
INnoVation 's skill level is:  0.23528964382238188
Zest 's skill level is:  0.20579433786607296
NesTea 's skill level is:  0.7393129362439659
MMA 's skill level is:  0.6786755451258175
Rain 's skill level is:  0.7222130631421745


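So we are getting the opposite of what we expected. I think this is related to changing the sampling distribution from bernoulli_logit to binomial_logit: the model assumes every matchup consists of exactly 5 games (`binomial_logit(5, ...)`), but a pair may have played fewer (at most `nKeep = 5` games are kept), and matchups in which PA never beat PB are skipped when `win` is built. A player who won 2 of 2 games is therefore treated as having won only 2 of 5, which pushes the skill of strong players down. We can read the scale as "lower is better" for now, but that interpretation should be treated with caution until the number of trials matches the games actually played.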

If we want to predict which player will win, we might use a direct estimator of that quantity based on the sample values:

In [21]:
# Best-vs-worst player prediction (players ind[0] and ind2[0]):
def logit(z):
    """Return the logistic function 1 / (1 + exp(-z)) of z.

    Despite its name this is the inverse of the logit: it maps a real number
    (or array of them) to a probability in [0, 1]. It is evaluated as
    exp(-log(1 + exp(-z))) via np.logaddexp, so large negative z returns 0.0
    instead of overflowing in exp.
    """
    return np.exp(-np.logaddexp(0., -z))

# Apply the model's win-probability function (logistic of the scaled skill
# difference) to every posterior sample, then average over the samples.
best, worst = ind[0], ind2[0]
skill_draws = samples['skill']
skill_gap = skill_draws[:, best] - skill_draws[:, worst]
prob = logit(skill_data['scale'] * skill_gap).mean()

print(playername[best], "has a ", prob * 100, "% chance of winning against ", playername[worst])

Aicy has a  90.73737815413845 % chance of winning against  Zest


According to https://liquipedia.net/starcraft2/ESL_Pro_Tour/2020/21/Korea/Standings, Zest is number 3 in the current standings in Korea, so the results are definitely reversed.

##### Reversing the win data

In [24]:
# Same inputs as before, but with the two player columns swapped, so each win
# count is attributed to the opposite side of the matchup. The sizes are
# derived from the data rather than hard-coded.
assert len(win) == len(PA) == len(PB), "win/PA/PB must be parallel lists"
skill_data = {
    'N': nplayers,     # number of players
    'E': len(win),     # number of matchups
    'scale': 0.7,
    'win': win,
    'PA': PB,          # swapped on purpose
    'PB': PA
}

Now, we can perform MCMC on the model, and extract the samples:

In [None]:
# Re-run MCMC on the current skill_data: one chain, 10000 iterations. The seed
# is fixed so that re-running the notebook does not change the draws.
fit = sm.sampling(data=skill_data, iter=10000, chains=1, seed=12345)

In [12]:
# Replace `samples` with the draws of the most recent fit (looked up by parameter name below).
samples = fit.extract()

If we just want the mean estimate for each player's skill level, just take the empirical average over the samples:

In [13]:
# Point estimate per player from the current samples: mean of the 'skill' draws along axis 0.
player_skills = np.mean(samples['skill'], axis=0)
print(player_skills)

[0.90609144 0.62436126 0.63230061 0.23528964 0.71095205 1.01382656
 0.4787382  0.55994435 0.20579434 1.10893401 1.3415139  0.81154325
 0.76594974 0.59331022 0.89188294 1.09066603 2.07571307 1.16190688
 0.6345189  1.12845268 1.11594884 1.00383142 1.05371308 0.87824355
 0.62479565 1.67357827 1.46019313 0.68137913 0.40865401 0.48191956
 1.14568414 2.69331408 0.60610885 0.76724954 1.70582223 1.77137615
 1.55484869 1.41801412 0.43946007 1.32199332 1.6387192  0.28335897
 0.59844302 0.37010834 1.08937856 1.24711152 0.72221306 0.5649091
 1.22308589 0.5474032  1.02100391 0.76097246 2.05005469 0.95024122
 1.08799806 0.9914249  0.4886044  0.50692379 2.19385535 0.41892485
 0.97056795 0.58468874 1.23876689 0.79377365 0.39020452 1.39655607
 1.42364348 0.72045303 0.67867555 0.47404143 0.84546617 1.53763862
 2.58999221 1.00115166 0.26432951 1.4700894  0.68559427 1.45343636
 1.07990802 1.5787172  0.31131955 0.49195734 1.54557319 1.38511893
 0.61000298 1.12447567 0.86723017 2.82068234 0.52560691 2.87140