In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import matplotlib.ticker as plticker

from functools import reduce
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor


In [2]:
# IMPORT CSV FILES 

# -- PER COUNTRY FEATURES --
rank = pd.read_csv("Features/fifa_rank.csv")
player_avg = pd.read_csv("Features/player_averages_for_teams.csv")
win_ratio = pd.read_csv("Features/ratio_played_vs_won.csv")
delta_pts = pd.read_csv("Features/total_score_margins.csv")
games = pd.read_csv("Features/adv_df_pivot.csv")
player_top_features = pd.read_csv("Features/player_top_features_country.csv")

# -- OTHER FEATURES --
df_matches = pd.read_csv("Features/wc_matches.csv")
schedule = pd.read_csv('Data/schedule.csv')

# IDENTIFY 2022 WORLD CUP TEAMS
teams_2022 = ['Qatar', 'Netherlands', 'Senegal', 'Ecuador', 
            'England', 'United States', 'Wales', 'Iran', 
            'Argentina', 'Poland', 'Mexico', 'Saudi Arabia', 
            'France', 'Denmark', 'Tunisia', 'Australia', 
            'Germany', 'Spain', 'Japan', 'Costa Rica', 
            'Belgium', 'Croatia', 'Canada', 'Morocco', 
            'Brazil', 'Switzerland', 'Serbia', 'Cameroon', 
            'Portugal', 'Uruguay', 'Ghana', 'Korea Republic']

In [3]:
# matches_by_year = df_matches[["date", 'year']].groupby(["year"]).count()
# matches_by_year.head(50)

In [4]:
# CLEAN UP

rank = rank.drop(['Unnamed: 0', 'Points'], axis=1)
player_avg = player_avg.rename(columns={"Nationality": "Country"})
win_ratio = win_ratio.drop(['Unnamed: 0'], axis=1)
delta_pts = delta_pts.drop(['Unnamed: 0'], axis=1)
games = games.rename(columns={"country": "Country"})
player_top_features = player_top_features.drop(['Unnamed: 0'], axis=1)

In [5]:
# MERGE DATAFRAMES ON "COUNTRY"

 # list of dataframes
# dfs = [rank, player_avg, win_ratio, delta_pts, games]
dfs = [rank, player_top_features, win_ratio, delta_pts, games]

# MERGING
compiled_df = reduce(lambda  left,right: pd.merge(left,right,on=["Country"],
                                            how='outer'), dfs)
# FILLING NULLS WITH 0
compiled_df = compiled_df.fillna(0)
compiled_df["Country"].unique()

array(['Brazil', 'Belgium', 'Argentina', 'France', 'England', 'Spain',
       'Netherlands', 'Portugal', 'Denmark', 'Germany', 'Mexico',
       'Uruguay', 'United States', 'Croatia', 'Switzerland', 'Senegal',
       'Wales', 'Morocco', 'Iran', 'Japan', 'Serbia', 'Poland',
       'Korea Republic', 'Tunisia', 'Costa Rica', 'Cameroon', 'Australia',
       'Canada', 'Ecuador', 'Qatar', 'Saudi Arabia', 'Ghana'],
      dtype=object)

In [6]:
# limit the dataframe rows to the relevant teams playing in the 2022 world cup
compiled_df = compiled_df[(compiled_df['Country'].isin(teams_2022))]
compiled_df.head()

Unnamed: 0,Rank,Country,avg_Composure,avg_Overall,avg_Potential,avg_Volleys,avg_Reactions,total_wins,total_games,ratio_won,win_margins,lose_margins,final,group stage,quarter-finals,round of 16,second group stage,semi-finals,third-place match
0,1,Brazil,64.8362,70.9672,73.1028,49.5706,68.139,56.0,77.0,0.727273,3.790698,-1.8,4.0,39.0,9.0,9.0,8.0,5.0,3.0
1,2,Belgium,60.6567,67.6767,73.9033,44.65,62.58,21.0,42.0,0.5,2.5,-1.903846,0.0,27.0,3.0,6.0,2.0,2.0,2.0
2,3,Argentina,60.8926,67.8063,72.7284,45.2558,63.9053,39.0,65.0,0.6,2.823529,-2.059524,4.0,36.0,6.0,8.0,8.0,3.0,0.0
3,4,France,60.0427,67.2386,73.5875,43.5765,62.4125,29.0,50.0,0.58,3.57619,-1.4,3.0,27.0,5.0,5.0,2.0,5.0,2.0
4,5,England,56.4206,63.2728,70.3153,40.3194,59.032,22.0,30.0,0.733333,3.232143,-1.064706,0.0,30.0,6.0,7.0,2.0,2.0,2.0


In [7]:
df_matches.head(15)
df_matches = df_matches.replace("South Korea", "Korea Republic")

# df_matches["home_team"].unique()


In [8]:
df_matches_home = df_matches[df_matches['home_team'].isin(teams_2022)]
df_matches_away = df_matches[df_matches['away_team'].isin(teams_2022)]
df_matches2 = pd.concat((df_matches_home, df_matches_away))
df_matches2.drop_duplicates()
df_matches2.count()

Unnamed: 0         5150
date               5150
home_team          5150
away_team          5150
home_score         5150
away_score         5150
tournament         5150
city               5150
country            5150
neutral            5150
winning_team       5150
goal_difference    5150
year               5150
dtype: int64

In [9]:
df_matches2 = df_matches2.drop(columns = "Unnamed: 0", axis = 1)

df_matches2.head()

Unnamed: 0,date,home_team,away_team,home_score,away_score,tournament,city,country,neutral,winning_team,goal_difference,year
0,1930-07-13,Belgium,United States,0,3,FIFA World Cup,Montevideo,Uruguay,True,United States,3,1930
1,1930-07-13,France,Mexico,4,1,FIFA World Cup,Montevideo,Uruguay,True,France,3,1930
2,1930-07-14,Brazil,Yugoslavia,1,2,FIFA World Cup,Montevideo,Uruguay,True,Yugoslavia,1,1930
3,1930-07-15,Argentina,France,1,0,FIFA World Cup,Montevideo,Uruguay,True,Argentina,1,1930
6,1930-07-18,Uruguay,Peru,1,0,FIFA World Cup,Montevideo,Uruguay,False,Uruguay,1,1930


In [10]:
df_matches2 = df_matches2.drop(["date", "home_score", "away_score", "tournament", "city", "country", "goal_difference", "neutral", "year"], axis = 1)
df_matches2.head()

Unnamed: 0,home_team,away_team,winning_team
0,Belgium,United States,United States
1,France,Mexico,France
2,Brazil,Yugoslavia,Yugoslavia
3,Argentina,France,Argentina
6,Uruguay,Peru,Uruguay


In [11]:
#Building the model
#the prediction label: 
# The winning_team column will show "2" if the home team has won, "1" if it was a tie, and "0" if the away team has won.

# df_matches2 = df_matches2.reset_index(drop=True)
df_matches2.loc[df_matches2.winning_team == df_matches2.home_team,'winning_team']=2
df_matches2.loc[df_matches2.winning_team == 'draw', 'winning_team']=1
df_matches2.loc[df_matches2.winning_team == df_matches2.away_team, 'winning_team']=0

df_matches2.head()

Unnamed: 0,home_team,away_team,winning_team
0,Belgium,United States,0
1,France,Mexico,2
2,Brazil,Yugoslavia,0
3,Argentina,France,2
6,Uruguay,Peru,2


## ADD IN COUNTRY FEATURES HERE (PROBABLY)

In [12]:
all_feautres_home = pd.merge(df_matches2, compiled_df, left_on  = "home_team", right_on = "Country", how ="left" )
all_feautres_away = pd.merge(df_matches2, compiled_df, left_on  = "away_team", right_on = "Country", how ="left" )

# all_feautres_away
# all_feautres_home

all_features = all_feautres_home.append(all_feautres_away, ignore_index = True)
all_features = all_features.dropna()
all_features["Country"].nunique()

32

In [13]:
final = pd.get_dummies(all_features, prefix=['home_team', 'away_team'], columns=['home_team', 'away_team'])
final.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6842 entries, 0 to 10299
Columns: 403 entries, winning_team to away_team_Zimbabwe
dtypes: float64(18), object(2), uint8(383)
memory usage: 3.6+ MB


In [14]:
#convert home team and away team from categorical variables to continous inputs 
# Get dummy variables
# final = pd.get_dummies(df_matches2, prefix=['home_team', 'away_team'], columns=['home_team', 'away_team'])

# Separate X and y sets
X = final.drop(['winning_team', "Country"], axis=1)
y = final["winning_team"]
y = y.astype('int')

# Separate train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42)

In [15]:
final.info()
y.value_counts()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6842 entries, 0 to 10299
Columns: 403 entries, winning_team to away_team_Zimbabwe
dtypes: float64(18), object(2), uint8(383)
memory usage: 3.6+ MB


2    3372
0    1834
1    1636
Name: winning_team, dtype: int64

In [16]:
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler

scaler = preprocessing.StandardScaler().fit(X_train)
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [17]:
logreg = LogisticRegression()
logreg.fit(X_train, y_train)
score = logreg.score(X_train_scaled, y_train)
score2 = logreg.score(X_test_scaled, y_test)

# from sklearn.ensemble import RandomForestClassifier


# clf = RandomForestClassifier(random_state=37, n_estimators=1000).fit(X_train_scaled, y_train)

# score = clf.score(X_train, y_train)
# score2 = clf.score(X_test, y_test)

print("Training set accuracy: ", '%.3f'%(score))
print("Test set accuracy: ", '%.3f'%(score2))

Training set accuracy:  0.549
Test set accuracy:  0.512


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [18]:
#adding Fifa rankings
#the team which is positioned higher on the FIFA Ranking will be considered "favourite" for the match
#and therefore, will be positioned under the "home_teams" column
#since there are no "home" or "away" teams in World Cup games. 

# Loading new datasets
# ranking = pd.read_csv('datasets/fifa_rankings.csv') 
fixtures = pd.read_csv('Data/schedule.csv')

# List for storing the group stage games
pred_set = []

## Add country  features here ?

In [19]:
# from github repo--------- itsmuriuki

# #adding Fifa rankings
# #the team which is positioned higher on the FIFA Ranking will be considered "favourite" for the match
# #and therefore, will be positioned under the "home_teams" column
# #since there are no "home" or "away" teams in World Cup games. 

# # Loading new datasets
# ranking = pd.read_csv('datasets/fifa_rankings.csv') 
# fixtures = pd.read_csv('datasets/fixtures.csv')

# # List for storing the group stage games
# pred_set = []

# loading datasets
country_features = compiled_df
fixtures = schedule

# list for storing group stage games
pred_set = []



In [20]:
# may not need the .inster and .map functions as we're trying to use more than one feature
# cannot not specify a single columns as currently written

# ------
# # Create new columns with ranking position of each team
# fixtures.insert(1, 'first_position', fixtures['Home Team'].map(country_features .set_index('Country')['avg_Composure']))
# fixtures.insert(2, 'second_position', fixtures['Away Team'].map(country_features .set_index('Country')['avg_Composure']))

# fixtures.insert(1, 'first_position', fixtures['Home Team'].map(country_features.set_index('Country')))
# fixtures.insert(2, 'second_position', fixtures['Away Team'].map(country_features.set_index('Country')))


# We only need the group stage games, so we have to slice the dataset
fixtures = fixtures.iloc[:48, :]
fixtures.tail()

Unnamed: 0,Match Number,Round Number,Date,Location,Home Team,Away Team,Group,Result
43,44,3,1/12/2022 19:00,Al Bayt Stadium,Costa Rica,Germany,Group E,
44,45,3,2/12/2022 15:00,Al Janoub Stadium,Ghana,Uruguay,Group H,
45,46,3,2/12/2022 15:00,Education City Stadium,Korea Republic,Portugal,Group H,
46,47,3,2/12/2022 19:00,Stadium 974,Serbia,Switzerland,Group G,
47,48,3,2/12/2022 19:00,Lusail Stadium,Cameroon,Brazil,Group G,
