# Data Processing

In [1]:
#First load the match data
import pandas as pd
import numpy as np
import scipy

matches = pd.read_csv('international_matches.csv')
matches.head()

Unnamed: 0,date,home_team,away_team,home_team_continent,away_team_continent,home_team_fifa_rank,away_team_fifa_rank,home_team_total_fifa_points,away_team_total_fifa_points,home_team_score,...,shoot_out,home_team_result,home_team_goalkeeper_score,away_team_goalkeeper_score,home_team_mean_defense_score,home_team_mean_offense_score,home_team_mean_midfield_score,away_team_mean_defense_score,away_team_mean_offense_score,away_team_mean_midfield_score
0,9/3/2004,Spain,Scotland,Europe,Europe,3,67,0,0,1,...,No,Draw,94,84,86.5,89.3,89.5,80.2,79.7,81.8
1,9/4/2004,Austria,England,Europe,Europe,90,7,0,0,2,...,No,Draw,83,88,76.2,73.0,74.0,90.5,88.7,91.2
2,9/4/2004,Croatia,Hungary,Europe,Europe,25,76,0,0,3,...,No,Win,77,74,80.5,78.7,79.0,71.8,75.7,70.2
3,9/4/2004,Iceland,Bulgaria,Europe,Europe,80,41,0,0,1,...,No,Lose,78,78,68.8,77.0,69.2,70.5,79.7,78.5
4,9/4/2004,Italy,Norway,Europe,Europe,9,38,0,0,2,...,No,Win,97,79,91.8,92.3,87.5,79.2,81.3,79.0


In [2]:
#Get all team names and newest ea fifa scores
home_teams = matches.iloc[:,[1,17,19,20,21]]
home_teams = home_teams.rename(columns={"home_team": "team", "home_team_goalkeeper_score": "goalkeeper_score", 
                   "home_team_mean_defense_score": "mean_defense_score",  "home_team_mean_midfield_score": "mean_midfield_score",
                   "home_team_mean_offense_score": "mean_offense_score"})
away_teams = matches.iloc[:,[2,18,22,23,24]]
away_teams = away_teams.rename(columns={"away_team": "team", "away_team_goalkeeper_score": "goalkeeper_score", 
                   "away_team_mean_defense_score": "mean_defense_score",  "away_team_mean_midfield_score": "mean_midfield_score",
                   "away_team_mean_offense_score": "mean_offense_score"})
teams = pd.concat([home_teams, away_teams], ignore_index=True)
teams.drop_duplicates(subset=['team'],keep='last',inplace=True)
#teams = teams.to_frame()
#teams.columns = ['teams']

#add Qatar since the dataset doesnt have it for some reason
teams.loc[len(teams.index)] = ['Qatar', 70,70,70,70] 

teams.sort_values(by='team',inplace=True)
teams.reset_index(inplace=True, drop=True)
display(teams)

Unnamed: 0,team,goalkeeper_score,mean_defense_score,mean_offense_score,mean_midfield_score
0,Albania,80,76.2,70.0,73.0
1,Algeria,78,78.0,81.0,78.0
2,Angola,53,71.5,70.7,68.5
3,Argentina,84,82.2,89.0,84.0
4,Australia,77,72.0,72.3,73.5
...,...,...,...,...,...
85,Ukraine,75,74.8,78.7,80.0
86,United Arab Emirates,70,63.0,64.3,67.0
87,Uruguay,80,81.2,84.3,80.0
88,Venezuela,73,72.8,76.3,77.8


In [3]:
#get team score, and fifa rankings
hometeams_fifa = matches.iloc[:,[17,19,20,21]]
hometeams_score = matches.iloc[:,[9]]
awayteams_fifa = matches.iloc[:,[18,22,23,24]]
awayteams_score = matches.iloc[:,[10]]

In [4]:
#Setup ML model data

#create home matrix
XW = hometeams_fifa.to_numpy()
YW = hometeams_score.to_numpy()
#create away matrix
XL = awayteams_fifa.to_numpy()
YL = awayteams_score.to_numpy()

#combine the matrices and also reverse them so Home team isnt always on one side of the data
Xt = np.concatenate((XW,XL),axis=1)
Xb = np.concatenate((XL,XW),axis=1)
X = np.concatenate((Xt,Xb),axis=0)

Yt = np.concatenate((YW,YL),axis=0) #GPy wants different dimensions
Yb = np.concatenate((YL,YW),axis=0)
#Y = np.concatenate((Yt,Yb),axis=0)

#each row is a datapoint

dims = np.shape(X)[1] #size of columns

X = np.concatenate((Xt[3000:],Xb[3000:]),axis=0)
Yt = np.concatenate((YW[3000:],YL[3000:]),axis=0) #GPy wants different dimensions
Yb = np.concatenate((YL[3000:],YW[3000:]),axis=0)

In [5]:
#load the ML model
import GPy

Kmatern = GPy.kern.Matern32(input_dim = dims, ARD = True)
Klin = GPy.kern.Linear(input_dim = dims)
Kbias = GPy.kern.Bias(input_dim = dims)
icm = GPy.util.multioutput.LCM(input_dim=dims,num_outputs=2,kernels_list=[Kmatern,Klin,Kbias])

loaded_model = GPy.models.GPCoregionalizedRegression([X,X],[Yt,Yb],kernel=icm, initialize=False)
loaded_model.update_model(False)
loaded_model.initialize_parameter()
loaded_model[:] = np.load('model_save.npy')
loaded_model.update_model(True)



# Simulate The Tourney

In [6]:
import itertools
#Group Stage Simulation

defaultdata = {'Points': [0,0,0,0],'GD': [0,0,0,0], 'GF': [0,0,0,0]}
groups = {'A': pd.DataFrame(data = defaultdata, index = ['Qatar','Ecuador','Senegal','Netherlands']),
          'B': pd.DataFrame(data = defaultdata, index = ['England','IR Iran','USA','Wales']),
          'C': pd.DataFrame(data = defaultdata, index = ['Argentina', 'Saudi Arabia', 'Mexico', 'Poland']),
          'D': pd.DataFrame(data = defaultdata, index = ['France','Australia','Denmark','Tunisia']),
          'E': pd.DataFrame(data = defaultdata, index = ['Spain','Costa Rica', 'Germany', 'Japan']),
          'F': pd.DataFrame(data = defaultdata, index = ['Belgium', 'Canada', 'Morocco','Croatia']),
          'G': pd.DataFrame(data = defaultdata, index = ['Brazil', 'Serbia','Switzerland','Cameroon']),
          'H': pd.DataFrame(data = defaultdata, index = ['Portugal','Ghana','Uruguay','Korea Republic'])}
tiemargin = 0.1 #if probability of winning is between 60% and 40% just assume a tie for group stages
winpoints = 3 #points for group stages
losepoints = 0
tiepoints = 1

knockout_bracket_order = [['A',0],['B',1],    
                          ['C',0],['D',1],
                          ['E',0],['F',1],
                          ['G',0],['H',1],
                          ['B',0],['A',1],
                          ['D',0],['C',1],
                          ['F',0],['E',1],
                          ['H',0],['G',1],]   #best A group plays 2nd best B group

def playoffScoring(team1, team2, tiemargin):
    
    #get data for the team
    team1data = teams.loc[teams['team'] == team1, ["goalkeeper_score", "mean_defense_score", "mean_midfield_score", "mean_offense_score"]].to_numpy()
    team2data = teams.loc[teams['team'] == team2, ["goalkeeper_score", "mean_defense_score", "mean_midfield_score", "mean_offense_score"]].to_numpy()
    #create input matrix for GPy
    Xpred = np.concatenate((team1data,team2data),axis=1)
    #GP needs an extra entry to tell if predicting for output 0 or 1
    Xpred0 = np.concatenate((Xpred,[[0]]),axis=1)
    Xpred1 = np.concatenate((Xpred,[[1]]),axis=1)
    #noise_dict0 = {'output_index':Xpred0[:,-1].astype(int)}
    #noise_dict1 = {'output_index':Xpred1[:,-1].astype(int)}
    noise_dict = {'output_index':np.concatenate((Xpred0[:,-1],Xpred1[:,-1])).astype(int)}
    
    #score1 = loaded_model.predict(Xpred0,Y_metadata=noise_dict0)[0].item()  #dont need uncertainty
    #score2 = loaded_model.predict(Xpred1,Y_metadata=noise_dict1)[0].item()
    
    #predict
    [score,var] = loaded_model.predict(np.concatenate((Xpred0,Xpred1)), Y_metadata=noise_dict, full_cov = True)
    score1 = score[0]
    score2 = score[1]
    varscore1 = var[0][0]
    varscore2 = var[1][1]
    covar = var[0][1]
    
    #get probability of team1 beating team2 (probability of one gaussian being larger than another)
    #ie probability of Z = X-Y > 0
    muZ = score1-score2
    varZ = varscore1 + varscore2 - 2*covar
    prob = 0.5*scipy.special.erfc(-muZ/np.sqrt(2*varZ))
    
    
    if ((prob <= 0.5 + tiemargin) & (prob >= 0.5 - tiemargin)):
        winnerindex = 'Tie'
        loserindex = 'Tie'
        scorewin = score1
        scorelose = score2
        probout = prob;
    else:
        if prob > 0.5:
            winnerindex = team1
            scorewin = score1
            loserindex = team2
            scorelose = score2
            probout = prob;
        else:
            winnerindex = team2
            scorewin = score2
            loserindex = team1
            scorelose = score1
            probout = 1-prob;
    
    return [winnerindex, loserindex, scorewin, scorelose, probout] 




#Group Stage Simulation
print("\033[1m" + "Group Stages" + "\033[0m")
for group in groups.keys():
    teamsingroup = groups[group].index
    for subset in  itertools.combinations(teamsingroup, 2):
        team1 = subset[0]
        team2 = subset[1]
        [winning_team, losing_team, scorewin, scorelose, prob] = playoffScoring(team1, team2, tiemargin)
        if winning_team == 'Tie':
            groups[group].at[team1,'Points'] = groups[group].at[team1,'Points'] + tiepoints
            groups[group].at[team2,'Points'] = groups[group].at[team2,'Points'] + tiepoints
            groups[group].at[team1,'GD'] = groups[group].at[team1,'GD'] + (scorewin-scorelose)
            groups[group].at[team2,'GD'] = groups[group].at[team2,'GD'] + (scorelose-scorewin)
            groups[group].at[team1,'GF'] = groups[group].at[team1,'GF'] + (scorewin)
            groups[group].at[team2,'GF'] = groups[group].at[team2,'GF'] + (scorelose)
            print("{} {} - {} {} with p = {}".format(team1, scorewin, team2, scorelose, prob))
        else:
            groups[group].at[winning_team,'Points'] = groups[group].at[winning_team,'Points'] + winpoints
            groups[group].at[losing_team,'Points'] = groups[group].at[losing_team,'Points'] + losepoints
            groups[group].at[winning_team,'GD'] = groups[group].at[winning_team,'GD'] + (scorewin-scorelose)
            groups[group].at[losing_team,'GD'] = groups[group].at[losing_team,'GD'] + (scorelose-scorewin)
            groups[group].at[winning_team,'GF'] = groups[group].at[winning_team,'GF'] + (scorewin)
            groups[group].at[losing_team,'GF'] = groups[group].at[losing_team,'GF'] + (scorelose)
            print("{} {} - {} {} with p = {}".format(winning_team, scorewin, losing_team, scorelose, prob))
            
        #sort each group
        groups[group].sort_values(by = ['Points','GD','GF'], ascending = [False, False, False], inplace = True)   
        
#print groups
for group in groups.keys():
    print(groups[group])

    
    
#form the bracket
knockout_bracket = []
for entry in knockout_bracket_order:
    teamorder = groups[entry[0]].index.values
    knockout_bracket.append(teamorder[entry[1]])

def plan_games(teams):
    return zip(teams[::2], teams[1::2])

def games_round(games):
    winners = []
    for team1, team2 in games:
        [winning_team, losing_team, scorewin, scorelose, prob] = playoffScoring(team1, team2, 0) 
        winners.append(winning_team)
        
        print("{} {} - {} {} with p = {}".format(winning_team, scorewin, losing_team, scorelose, prob))
        
    print('\n')    
    return winners

#run knockout stages
print("\033[1m" + "Knockout Stages" + "\033[0m")
round = 0
while len(knockout_bracket) > 1:
    round += 1
    print("Round {}: teams: {}".format(round, knockout_bracket))
    print('\n')
    games = plan_games(knockout_bracket)
    knockout_bracket = games_round(games)

#champion = teams[0]  # only one left
champion = knockout_bracket[0]
print("Champion is {}".format(champion))

[1mGroup Stages[0m
Ecuador [1.45789726] - Qatar [0.85592065] with p = [0.6492104]
Senegal [2.08455064] - Qatar [0.66492297] with p = [0.81682894]
Netherlands [2.55802542] - Qatar [0.60325137] with p = [0.89302557]
Senegal [1.56895014] - Ecuador [0.75112325] with p = [0.69864768]
Netherlands [1.99643614] - Ecuador [0.64342369] with p = [0.80526092]
Netherlands [1.37300288] - Senegal [0.83753962] with p = [0.63334872]
England [2.4703687] - IR Iran [0.45084661] with p = [0.90027837]
England [2.03889108] - USA [0.76168678] with p = [0.79177139]
England [2.29791483] - Wales [0.82857709] with p = [0.82483631]
USA [1.44251077] - IR Iran [0.70022965] with p = [0.68160858]
Wales [1.35042551] - IR Iran [0.80014239] with p = [0.63679886]
USA [1.21008221] - Wales [1.01816436] with p = [0.54859912]
Argentina [2.46187426] - Saudi Arabia [0.48135937] with p = [0.89565645]
Argentina [1.8773708] - Mexico [1.07521399] with p = [0.69495414]
Argentina [1.88459149] - Poland [1.11123756] with p = [0.68819

# Single Sim
Run a single sim between 2 teams

In [7]:
team1 = 'Spain'
team2 = 'USA'



#get result
team1data = teams.loc[teams['team'] == team1, ["goalkeeper_score", "mean_defense_score", "mean_midfield_score", "mean_offense_score"]]
team2data = teams.loc[teams['team'] == team2, ["goalkeeper_score", "mean_defense_score", "mean_midfield_score", "mean_offense_score"]]
Xpred = np.concatenate((team1data,team2data),axis=1)
#GP needs an extra entry to tell if predicting for output 0 or 1
Xpred0 = np.concatenate((Xpred,[[0]]),axis=1)
Xpred1 = np.concatenate((Xpred,[[1]]),axis=1)
noise_dict0 = {'output_index':Xpred0[:,-1].astype(int)}
noise_dict1 = {'output_index':Xpred1[:,-1].astype(int)}

noise_dict = {'output_index':np.concatenate((Xpred0[:,-1],Xpred1[:,-1])).astype(int)}
[score,var] = loaded_model.predict(np.concatenate((Xpred0,Xpred1)), Y_metadata=noise_dict, full_cov = True)


print("{} {} - {} {}".format(team1,score[0],team2,score[1]))
print("with variance")
print(var)

#probability of one distribution being larger than another
muZ = score[0]-score[1]
varZ = var[0][0] + var[1][1] - 2*var[0][1]
prob = 0.5*scipy.special.erfc(-muZ/np.sqrt(2*varZ))
print("with probability of {} winning being {}".format(team1,prob))

Spain [1.98383469] - USA [0.67320756]
with variance
[[1.23979806 0.004794  ]
 [0.004794   1.23979021]]
with probability of Spain winning being [0.79784036]
