In [2]:
import pandas as pd

match_df = pd.read_csv("/kaggle/input/fifa-world-cup-2022/international_matches.csv")
rank_df = pd.read_csv("/kaggle/input/fifaworldranking/fifa_ranking-2022-10-06.csv")
worldcup_match_df = pd.read_csv("/kaggle/input/qatar2022worldcupschudule/matchs-schudule.csv", sep=";")
teams_df = pd.read_csv("/kaggle/input/qatar2022worldcupschudule/Qatar2022-teams.csv", sep=";")

In [3]:
match_df.head()

Unnamed: 0,date,home_team,away_team,home_team_continent,away_team_continent,home_team_fifa_rank,away_team_fifa_rank,home_team_total_fifa_points,away_team_total_fifa_points,home_team_score,...,shoot_out,home_team_result,home_team_goalkeeper_score,away_team_goalkeeper_score,home_team_mean_defense_score,home_team_mean_offense_score,home_team_mean_midfield_score,away_team_mean_defense_score,away_team_mean_offense_score,away_team_mean_midfield_score
0,1993-08-08,Bolivia,Uruguay,South America,South America,59,22,0,0,3,...,No,Win,,,,,,,,
1,1993-08-08,Brazil,Mexico,South America,North America,8,14,0,0,1,...,No,Draw,,,,,,,,
2,1993-08-08,Ecuador,Venezuela,South America,South America,35,94,0,0,5,...,No,Win,,,,,,,,
3,1993-08-08,Guinea,Sierra Leone,Africa,Africa,65,86,0,0,1,...,No,Win,,,,,,,,
4,1993-08-08,Paraguay,Argentina,South America,South America,67,5,0,0,1,...,No,Lose,,,,,,,,


In [4]:
rank_df.head()

Unnamed: 0,rank,country_full,country_abrv,total_points,previous_points,rank_change,confederation,rank_date
0,1,Germany,GER,57.0,0.0,0,UEFA,1992-12-31
1,96,Syria,SYR,11.0,0.0,0,AFC,1992-12-31
2,97,Burkina Faso,BFA,11.0,0.0,0,CAF,1992-12-31
3,99,Latvia,LVA,10.0,0.0,0,UEFA,1992-12-31
4,100,Burundi,BDI,10.0,0.0,0,CAF,1992-12-31


In [5]:
worldcup_match_df.head()

Unnamed: 0,match,date,country1,coutry2,phase
0,1,21/11/2022,Qatar,Ecuador,group matches
1,2,21/11/2022,Senegal,Netherlands,group matches
2,3,21/11/2022,England,Iran,group matches
3,4,21/11/2022,USA,Wales,group matches
4,5,22/11/2022,France,Australia,group matches


In [6]:
teams_df.head()

Unnamed: 0,Team,Group
0,Senegal,A
1,Qatar,A
2,Netherlands,A
3,Ecuador,A
4,Iran,B


Some countries have different names, so we will standardize that.

In [7]:
match_df = match_df.replace({"IR Iran": "Iran", "Korea Republic" : "South Korea"})
rank_df = rank_df.replace({"IR Iran": "Iran", "Korea Republic" : "South Korea"})

# Pre-processing

First I added a few varibles that will help better classify the rankings of the teams. 

For your information, `is_stake` indicates whether the match is Friendly or not. Some teams tend to not do their best on friendly matches, so `is_stake` handles these cases. Friendly matches are sort of like a scrimmage to some, so we do not account for those games in our models.

Similarly, I added `is_worldcup` to specially handle world cup matches.

In [8]:
# Here we are creating different varibles to better catergorize each team. 
match_df['rank_difference'] = match_df['home_team_fifa_rank'] - match_df['away_team_fifa_rank']
match_df['average_rank'] = (match_df['home_team_fifa_rank'] + match_df['away_team_fifa_rank'])/2
match_df['point_difference'] = match_df['home_team_total_fifa_points'] - match_df['away_team_total_fifa_points']
match_df['is_stake'] = match_df['tournament'] != 'Friendly' #This is to separate friendly matches from competitive matches, since friendlies do not contribute to the FIFA rankings
match_df['is_worldcup'] = 'FIFA World Cup' in match_df['tournament']

match_df['score_difference'] = match_df['home_team_score'] - match_df['away_team_score'] # Note that this feature is not used in training
match_df['is_won'] = match_df['score_difference'] > 0 # Take draw as lost


In [9]:
from sklearn import linear_model
from sklearn import ensemble
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, roc_curve, roc_auc_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

In [10]:
X, y = match_df.loc[:,['average_rank', 'rank_difference', 'point_difference', 'is_stake', 'is_worldcup']], match_df['is_won']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)

In this notebook we will be creating Logistic Regression and Randon Forest models.

In [13]:
# Logistic Regression

logreg = LogisticRegression()
logreg.fit(X_train, y_train)
lg_pred = logreg.predict(X_test)
acc_log = round(logreg.score(X_test, y_test) * 100, 2)
acc_log

68.38

In [11]:
# Random Forest

random_forest = RandomForestClassifier(n_estimators=100)
random_forest.fit(X_train, y_train)
Y_pred = random_forest.predict(X_test)
random_forest.score(X_train, y_train)
acc_random_forest = round(random_forest.score(X_test, y_test) * 100, 2)
acc_random_forest

64.76

In [14]:
models = pd.DataFrame({
    'Model': ['Logistic Regression', 
              'Random Forest'],
    'Score': [acc_log, 
              acc_random_forest]})
models.sort_values(by='Score', ascending=False)

Unnamed: 0,Model,Score
0,Logistic Regression,68.38
1,Random Forest,64.76


The Logistic Regression model appears to have higher accuracy.

In [26]:
model = logreg

# Prediction

Now, we have to manually create rows for Qatar 2022 matches. 



In [16]:
#  define a small margin for draws
margin = 0.05

# define the rankings at the time of the World Cup
rank_df = rank_df.loc[(rank_df['rank_date'] == rank_df['rank_date'].max()) & (rank_df['country_full'].isin(teams_df['Team'].unique()))]

worldcup_rankings = rank_df.set_index(['country_full'])

In [17]:
worldcup_rankings.head()

Unnamed: 0_level_0,rank,country_abrv,total_points,previous_points,rank_change,confederation,rank_date
country_full,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Brazil,1,BRA,1841.3,1837.56,0,CONMEBOL,2022-10-06
South Korea,28,KOR,1530.3,1526.02,0,AFC,2022-10-06
Tunisia,30,TUN,1507.54,1507.86,0,CAF,2022-10-06
Costa Rica,31,CRC,1503.59,1500.06,-3,CONCACAF,2022-10-06
Australia,38,AUS,1488.72,1483.73,-1,AFC,2022-10-06


In [18]:
worldcup_rankings.index.unique()

Index(['Brazil', 'South Korea', 'Tunisia', 'Costa Rica', 'Australia', 'Canada',
       'Cameroon', 'Ecuador', 'Poland', 'Japan', 'Belgium', 'Argentina',
       'France', 'England', 'Spain', 'Netherlands', 'Portugal', 'Denmark',
       'Germany', 'Croatia', 'Mexico', 'Uruguay', 'Switzerland', 'USA',
       'Senegal', 'Wales', 'Iran', 'Serbia', 'Morocco', 'Qatar',
       'Saudi Arabia', 'Ghana'],
      dtype='object', name='country_full')

In [21]:
opponents = ['First match \n against', 'Second match\n against', 'Third match\n against']

world_cup = teams_df.set_index(['Team'])
world_cup['points'] = 0
world_cup['total_prob'] = 0
world_cup['expected points'] = 0
world_cup['rank'] = None

country_win_prob = {}

for country in teams_df['Team'].unique():
    country_win_prob[country] = list()


In [None]:
from itertools import combinations
import numpy as np

for group in set(teams_df['Group']):
    print('___Starting group {}:___'.format(group))

    for home, away in combinations(teams_df.query('Group == "{}"'.format(group)).values, 2):
        print("{} vs. {}: ".format(home[0], away[0]), end='')
    
        home = home[0]
        away = away[0]
        
        # Create a row for each match
        row = pd.DataFrame(np.array([[np.nan, np.nan, np.nan, True, True]]), columns=X_train.columns)

        home_rank = worldcup_rankings.loc[home, 'rank']
        home_points = worldcup_rankings.loc[home, 'total_points']
        opp_rank = worldcup_rankings.loc[away, 'rank']
        opp_points = worldcup_rankings.loc[away, 'total_points']
        world_cup.loc[home, 'rank'] = home_rank
        world_cup.loc[away, 'rank'] = opp_rank

        row['average_rank'] = (home_rank + opp_rank) / 2
        row['rank_difference'] = home_rank - opp_rank
        row['point_difference'] = home_points - opp_points
        
        # Model the output
        home_win_prob = model.predict_proba(row)[:,1][0]
        
        
        
        # Saving model output
        world_cup.loc[home, 'total_prob'] += home_win_prob
        world_cup.loc[away, 'total_prob'] += 1-home_win_prob

        country_win_prob[home].append(home_win_prob)
        country_win_prob[away].append(1-home_win_prob)
        
        # Determining Win / Draw / Lose based on home_win_prob - Since this is the world cup, there are technically no home matches unless you are Qatar
        points = 0
        if home_win_prob <= 0.5 - margin:
            print("{} wins with {:.2f}".format(away, 1-home_win_prob))
            world_cup.loc[away, 'points'] += 3
            world_cup.loc[away, 'expected points'] += (1-home_win_prob) * 3
        if home_win_prob > 0.5 - margin:
            points = 1
        if home_win_prob >= 0.5 + margin:
            points = 3
            world_cup.loc[home, 'points'] += 3
            world_cup.loc[home, 'expected points'] += home_win_prob * 3
            print("{} wins with {:.2f}".format(home, home_win_prob))
        if points == 1:
            print("Draw")
            world_cup.loc[home, 'points'] += 1
            world_cup.loc[away, 'points'] += 1
            world_cup.loc[home, 'expected points'] += home_win_prob * 1
            world_cup.loc[away, 'expected points'] += (1-home_win_prob) * 1

# Expected Points




In [29]:
#Here we create a function to display the expected points for each team. The predicted points are based on the match_df values we previously went over. 
for group in set(teams_df['Group']):
    print('___Starting group {}:___'.format(group))
    teams = teams_df.query('Group == "{}"'.format(group)).values
    for team_info in teams:
        team_name = team_info[0]
        print(team_name, ": ", world_cup.loc[team_name, "expected points"])
        
#One more thing to note here is that the accuracy of the models has severely decreased since the group stage has already concluded now and the data has been updated.

___Starting group C:___
Argentina :  4.286353942719144
Saudi Arabia :  0.0
Mexico :  3.196319202031786
Poland :  2.4812713690338812
___Starting group H:___
Uruguay :  2.5899996526460924
South Korea :  2.3520097476957256
Portugal :  5.8162960019154815
Ghana :  0.0
___Starting group A:___
Senegal :  3.6079965458122643
Qatar :  0.0
Netherlands :  5.959668575161054
Ecuador :  1.7127215169764838
___Starting group D:___
Denmark :  3.4485978500610264
Tunisia :  0.494587799222057
France :  5.696886533016041
Australia :  0.505412200777943
___Starting group F:___
Morocco :  1.6526449812661026
Croatia :  3.6146314412505074
Belgium :  5.76783915561168
Canada :  0.0
___Starting group G:___
Switzerland :  2.2706012878512096
Cameroon :  0.0
Brazil :  5.810936160477053
Serbia :  2.482411222170502
___Starting group B:___
Iran :  0.0
England :  2.922749501335205
USA :  2.6840236954486825
Wales :  2.6832559153963644
___Starting group E:___
Germany :  2.178141394621764
Japan :  0.972807670163196
Spain :  