In [1]:
# run dependencies
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from functools import reduce
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

In [2]:
pd.set_option('display.max_columns', None)

In [3]:
# IMPORT CSV FILES 

# -- PER COUNTRY FEATURES --
rank = pd.read_csv("Features/fifa_rank.csv")
player_avg = pd.read_csv("Features/player_averages_for_teams.csv")
win_ratio = pd.read_csv("Features/ratio_played_vs_won.csv")
delta_pts = pd.read_csv("Features/total_score_margins.csv")
games = pd.read_csv("Features/adv_df_pivot.csv")
player_top_features = pd.read_csv("Features/player_top_features_country.csv")

# -- OTHER FEATURES --
df_matches = pd.read_csv("Features/wc_matches.csv")
schedule = pd.read_csv('Data/schedule.csv')

# IDENTIFY 2022 WORLD CUP TEAMS
teams_2022 = ['Qatar', 'Netherlands', 'Senegal', 'Ecuador', 
            'England', 'United States', 'Wales', 'Iran', 
            'Argentina', 'Poland', 'Mexico', 'Saudi Arabia', 
            'France', 'Denmark', 'Tunisia', 'Australia', 
            'Germany', 'Spain', 'Japan', 'Costa Rica', 
            'Belgium', 'Croatia', 'Canada', 'Morocco', 
            'Brazil', 'Switzerland', 'Serbia', 'Cameroon', 
            'Portugal', 'Uruguay', 'Ghana', 'Korea Republic']

#import dataset that you will be predicting upon (Target = predicting a win, draw, or loss)
#But also, the match data needs to be organized differently.
# df_matches = pd.read_csv("Features/wc_matches.csv")
# df_matches

In [4]:
# CLEAN UP

rank = rank.drop(['Unnamed: 0', 'Points'], axis=1)
player_avg = player_avg.rename(columns={"Nationality": "Country"})
win_ratio = win_ratio.drop(['Unnamed: 0'], axis=1)
delta_pts = delta_pts.drop(['Unnamed: 0'], axis=1)
games = games.rename(columns={"country": "Country"})
player_top_features = player_top_features.drop(['Unnamed: 0'], axis=1)

In [5]:
# VIEW DATAFRAMES

# rank.head()
# player_avg.head()
# win_ratio.head()
# delta_pts.head()
# games.head()
# player_top_features.head()

In [6]:
# MERGE DATAFRAMES ON "COUNTRY"

 # list of dataframes
# dfs = [rank, player_avg, win_ratio, delta_pts, games]
dfs = [rank, player_top_features, win_ratio, delta_pts, games]

# MERGING
compiled_df = reduce(lambda  left,right: pd.merge(left,right,on=["Country"],
                                            how='outer'), dfs)
# FILLING NULLS WITH 0
compiled_df = compiled_df.fillna(0)

In [7]:
# limit the dataframe rows to the relevant teams playing in the 2022 world cup
compiled_df = compiled_df[(compiled_df['Country'].isin(teams_2022))]
compiled_df

Unnamed: 0,Rank,Country,avg_Composure,avg_Overall,avg_Potential,avg_Volleys,avg_Reactions,avg_FKAcc,avg_Jumping,avg_Penalties,avg_Aggression,avg_SprintSpeed,med_Composure,med_Overall,med_Potential,med_Volleys,med_Reactions,med_FKAcc,med_Jumping,med_Penalties,med_Aggression,med_SprintSpeed,max_Composure,max_Overall,max_Potential,max_Volleys,max_Reactions,max_FKAcc,max_Jumping,max_Penalties,max_Aggression,max_SprintSpeed,min_Composure,min_Overall,min_Potential,min_Volleys,min_Reactions,min_FKAcc,min_Jumping,min_Penalties,min_Aggression,min_SprintSpeed,std_Composure,std_Overall,std_Potential,std_Volleys,std_Reactions,std_FKAcc,std_Jumping,std_Penalties,std_Aggression,std_SprintSpeed,total_wins,total_games,ratio_won,win_margins,lose_margins,final,group stage,quarter-finals,round of 16,second group stage,semi-finals,third-place match
0,1,Brazil,64.8362,70.9672,73.1028,49.5706,68.139,49.5435,65.8554,54.7412,60.4011,66.4497,66.0,71.0,72.0,53.0,68.0,50.0,67.0,57.0,63.0,69.0,93.0,91.0,93.0,87.0,91.0,89.0,92.0,92.0,94.0,95.0,15.0,51.0,56.0,4.0,34.0,7.0,30.0,10.0,13.0,16.0,10.729,5.4933,5.5457,18.1326,7.6497,17.9845,12.5662,15.7556,17.2814,14.3189,56.0,77.0,0.727273,3.790698,-1.8,4.0,39.0,9.0,9.0,8.0,5.0,3.0
1,2,Belgium,60.6567,67.6767,73.9033,44.65,62.58,45.2967,64.9667,48.0433,55.3533,63.55,62.0,67.0,74.0,47.0,63.0,46.0,67.0,49.0,60.5,67.0,91.0,91.0,91.0,83.0,91.0,83.0,92.0,87.0,88.0,92.0,21.0,51.0,54.0,4.0,39.0,8.0,31.0,10.0,14.0,15.0,14.1556,6.8573,5.3868,19.9948,9.7427,19.465,11.1613,16.7004,18.2724,15.0786,21.0,42.0,0.5,2.5,-1.903846,0.0,27.0,3.0,6.0,2.0,2.0,2.0
2,3,Argentina,60.8926,67.8063,72.7284,45.2558,63.9053,44.3305,65.6379,51.4263,57.4863,64.3337,62.0,68.0,73.0,47.0,64.0,43.0,66.0,53.0,60.0,67.0,96.0,93.0,93.0,88.0,94.0,94.0,94.0,88.0,95.0,94.0,14.0,52.0,57.0,5.0,35.0,8.0,17.0,10.0,13.0,15.0,12.5168,6.3838,5.5847,17.7896,8.469,17.0677,11.8576,15.5284,18.4255,14.4028,39.0,65.0,0.6,2.823529,-2.059524,4.0,36.0,6.0,8.0,8.0,3.0,0.0
3,4,France,60.0427,67.2386,73.5875,43.5765,62.4125,43.6123,64.7535,49.8111,56.7038,64.4016,61.0,67.0,73.0,44.0,62.0,42.0,65.0,51.0,59.5,67.0,90.0,90.0,95.0,87.0,92.0,87.0,92.0,86.0,90.0,96.0,20.0,50.0,58.0,3.0,30.0,6.0,26.0,7.0,11.0,15.0,11.7619,6.9645,6.0742,17.8418,9.532,17.1862,12.328,15.5527,16.9628,14.4239,29.0,50.0,0.58,3.57619,-1.4,3.0,27.0,5.0,5.0,2.0,5.0,2.0
4,5,England,56.4206,63.2728,70.3153,40.3194,59.032,40.427,64.7312,46.6166,54.6713,64.9692,57.0,63.0,70.0,41.0,59.0,39.0,65.0,48.0,57.0,68.0,91.0,89.0,93.0,85.0,91.0,91.0,93.0,92.0,95.0,94.0,22.0,47.0,48.0,5.0,31.0,7.0,30.0,7.0,12.0,16.0,10.8998,7.0593,6.1797,15.9093,8.767,16.0784,11.1274,14.6914,16.4375,13.9696,22.0,30.0,0.733333,3.232143,-1.064706,0.0,30.0,6.0,7.0,2.0,2.0,2.0
5,6,Spain,61.253,69.2852,74.6753,44.7525,65.5943,46.3671,64.023,50.9227,55.851,64.1435,62.0,69.0,74.0,46.0,65.0,46.0,65.0,51.0,59.0,67.0,92.0,89.0,90.0,85.0,92.0,93.0,93.0,92.0,93.0,96.0,17.0,52.0,58.0,4.0,35.0,8.0,29.0,11.0,13.0,15.0,11.892,6.413,5.3323,18.1174,8.5025,18.2263,11.1967,16.2423,17.3279,14.0882,25.0,30.0,0.833333,3.733766,-1.241071,1.0,33.0,4.0,7.0,2.0,1.0,0.0
6,8,Netherlands,60.0804,67.0739,73.1652,43.963,63.3283,43.6783,64.3435,48.637,56.7717,63.3348,62.0,67.0,73.0,47.0,64.0,44.0,65.0,50.0,61.0,66.0,90.0,90.0,92.0,84.0,89.0,86.0,93.0,90.0,87.0,93.0,21.0,54.0,61.0,4.0,39.0,7.0,32.0,10.0,16.0,15.0,12.9509,6.4679,5.8863,18.7445,8.6,18.1774,12.3669,15.7317,18.3969,16.7405,28.0,31.0,0.903226,3.75,-1.083333,3.0,24.0,4.0,6.0,6.0,3.0,2.0
7,9,Portugal,64.4681,69.7839,74.9391,48.0055,67.1496,47.1191,64.241,52.651,60.4571,66.1136,66.0,69.0,74.0,51.0,67.0,47.0,65.0,54.0,65.0,69.0,95.0,92.0,93.0,87.0,95.0,87.0,95.0,91.0,92.0,94.0,21.0,50.0,59.0,5.0,32.0,9.0,32.0,10.0,12.0,17.0,12.2633,6.3607,6.0256,18.6607,7.9896,18.2778,12.2402,16.1622,17.1985,14.3446,10.0,25.0,0.4,2.75,-1.911111,0.0,18.0,1.0,3.0,0.0,1.0,1.0
8,10,Denmark,58.682,64.5705,70.8754,42.3541,60.177,41.3836,64.5967,44.3639,56.5213,64.9607,60.0,65.0,70.0,44.0,61.0,39.0,65.0,44.0,59.0,68.0,86.0,84.0,86.0,84.0,81.0,89.0,91.0,78.0,87.0,93.0,22.0,49.0,58.0,5.0,30.0,7.0,32.0,10.0,17.0,18.0,10.8276,7.1411,5.3223,16.4108,9.6342,16.9843,11.0214,14.3078,16.3566,13.938,9.0,20.0,0.45,4.333333,-2.3,0.0,15.0,1.0,4.0,0.0,0.0,0.0
9,11,Germany,56.1481,65.7146,71.1707,41.2477,61.8427,39.9331,64.2795,45.2812,54.7724,63.3933,58.0,65.0,71.0,43.0,62.0,39.0,65.0,46.0,58.0,66.0,89.0,90.0,93.0,89.0,94.0,86.0,91.0,90.0,93.0,95.0,12.0,49.0,56.0,4.0,32.0,7.0,22.0,7.0,12.0,15.0,12.9152,6.2991,5.5943,18.4078,9.1358,17.2515,11.463,15.624,17.0003,15.0329,29.0,55.0,0.527273,4.279762,-2.5,2.0,21.0,6.0,6.0,0.0,4.0,2.0


In [8]:
df_matches = df_matches.reset_index(drop=True)
df_matches.loc[df_matches.winning_team == df_matches.home_team,'winning_team']=2
df_matches.loc[df_matches.winning_team == 'draw', 'winning_team']=1
df_matches.loc[df_matches.winning_team == df_matches.away_team, 'winning_team']=0

# # CREATING A COLUMN THAT NAMES THE WINNING TEAM
# df_matches.loc[df_matches.away_score > df_matches.home_score, "winning_team"] = df_matches.away_team
# df_matches.loc[df_matches.home_score > df_matches.away_score, "winning_team"] = df_matches.home_team
# df_matches.loc[df_matches.home_score == df_matches.away_score, "winning_team"] = "None - Draw"

df_matches

Unnamed: 0.1,Unnamed: 0,date,home_team,away_team,home_score,away_score,tournament,city,country,neutral,winning_team,goal_difference,year
0,1314,1930-07-13,Belgium,United States,0,3,FIFA World Cup,Montevideo,Uruguay,True,0,3,1930
1,1315,1930-07-13,France,Mexico,4,1,FIFA World Cup,Montevideo,Uruguay,True,2,3,1930
2,1316,1930-07-14,Brazil,Yugoslavia,1,2,FIFA World Cup,Montevideo,Uruguay,True,0,1,1930
3,1318,1930-07-15,Argentina,France,1,0,FIFA World Cup,Montevideo,Uruguay,True,2,1,1930
4,1319,1930-07-16,Chile,Mexico,3,0,FIFA World Cup,Montevideo,Uruguay,True,2,3,1930
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4177,43441,2022-03-30,Costa Rica,United States,2,0,FIFA World Cup qualification,San José,Costa Rica,False,2,2,2022
4178,43442,2022-03-30,Panama,Canada,1,0,FIFA World Cup qualification,Panama City,Panama,False,2,1,2022
4179,43538,2022-06-05,Wales,Ukraine,1,0,FIFA World Cup qualification,Cardiff,Wales,False,2,1,2022
4180,43704,2022-06-13,Australia,Peru,0,0,FIFA World Cup qualification,Al Rayyan,Qatar,True,1,0,2022


In [9]:
# MERGE COMPLIED DATA WITH DF_MATCHES
# compiled_matches_df = pd.merge(df_matches,compiled_df, left_on = "winning_team", right_on = "Country", how = "inner")
# compiled_matches_df.tail()

In [11]:
# final = pd.get_dummies(compiled_matches_df)

# # Separate X and y sets
# X = final.drop(['winning_team'], axis=1)
# y = final["winning_team"]

# y=y.astype('int')

# # Separate train and test sets
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42)

# from sklearn.ensemble import RandomForestRegressor
# classifier = RandomForestRegressor(n_estimators = 1000, random_state = 42)

# classifier.fit(X_train, y_train)

# score = classifier.score(X_train, y_train)
# score2 = classifier.score(X_test, y_test)
# print("Training set accuracy: ", '%.3f'%(score))
# print("Test set accuracy: ", '%.3f'%(score2))


In [12]:
#remove data that we consider irrelevant to predicting the feature
df_matches = df_matches.drop(['date', 'Unnamed: 0', 'home_score', 'away_score', 'tournament', 'city', 'country', 'neutral','year'], axis=1)

In [13]:
#if we are going to split the data per country, we will need to define the match data below differently:
#each row is one of the 32 countries
#previous match data would be a column
#ie. Belgium on row, and United States, Mexico, Yugoslavia, etc on y axis
#not exactly sure if this will work

In [14]:
# define home team, away team, and winning team
# The winning team column will show "2" if the home team has won, "1" if it was a tie, and "0" if the away team has won
# The model will be builT to predict the "winning_team"

df_matches = df_matches.reset_index(drop=True)
df_matches.loc[df_matches.winning_team == df_matches.home_team,'winning_team']=2
df_matches.loc[df_matches.winning_team == 'draw', 'winning_team']=1
df_matches.loc[df_matches.winning_team == df_matches.away_team, 'winning_team']=0

df_matches

Unnamed: 0,home_team,away_team,winning_team,goal_difference
0,Belgium,United States,0,3
1,France,Mexico,2,3
2,Brazil,Yugoslavia,0,1
3,Argentina,France,2,1
4,Chile,Mexico,2,3
...,...,...,...,...
4177,Costa Rica,United States,2,2
4178,Panama,Canada,2,1
4179,Wales,Ukraine,2,1
4180,Australia,Peru,1,0


In [17]:
# final = pd.get_dummies(df_matches, prefix=['home_team', 'away_team'], columns=['home_team', 'away_team'])

# # Separate X and y sets
# X = final.drop(['winning_team'], axis=1)
# y = final["winning_team"]

# y=y.astype('int')

# # Separate train and test sets
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42)

# from sklearn.ensemble import RandomForestRegressor
# classifier = RandomForestRegressor(n_estimators = 1000, random_state = 42)

# classifier.fit(X_train, y_train)

# score = classifier.score(X_train, y_train)
# score2 = classifier.score(X_test, y_test)
# print("Training set accuracy: ", '%.3f'%(score))
# print("Test set accuracy: ", '%.3f'%(score2))


Training set accuracy:  0.807
Test set accuracy:  0.148


# Running (up that hill) the model

In [18]:
final = pd.get_dummies(df_matches, prefix=['home_team', 'away_team'], columns=['home_team', 'away_team'])

# Separate X and y sets
X = final.drop(['winning_team'], axis=1)
y = final["winning_team"]

y=y.astype('int')

# Separate train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42)

In [19]:
# final.head()

In [22]:
#classify data as 0, 1, or 2 (home team lost, drew, or won)
classifier = LogisticRegression()

# RAN THIS MODEL AS A RANDOM FOREST REGRESSOR. TRAINING SCORE SIMILAR TO LOGISTIC REGRESSION, BUT ACCURACY SUFFERED
# from sklearn.ensemble import RandomForestRegressor
# classifier = RandomForestRegressor(n_estimators = 1000, random_state = 42)

classifier.fit(X_train, y_train)

score = classifier.score(X_train, y_train)
score2 = classifier.score(X_test, y_test)
print("Training set accuracy: ", '%.3f'%(score))
print("Test set accuracy: ", '%.3f'%(score2))

# from sklearn.metrics import classification_report
# print(classification_report(y_test, predictions,
#                             target_names=["blue", "red"]))

Training set accuracy:  0.855
Test set accuracy:  0.812


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [23]:
# Create new columns with ranking position of each team
schedule.insert(1, 'first_position', schedule['Home Team'].map(rank.set_index('Country')['Rank']))
schedule.insert(2, 'second_position', schedule['Away Team'].map(rank.set_index('Country')['Rank']))

# We only need the group stage games, so we have to slice the dataset
schedule = schedule.iloc[:48, :]
schedule

ValueError: cannot insert first_position, already exists

In [37]:
# schedule[["Home Team", "Away Team", "Group"]].groupby("Group")

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x000001D046FF60D0>

In [24]:
pred_set= []
for index, row in schedule.iterrows():
    if row['first_position'] < row['second_position']:
        pred_set.append({'home_team': row['Home Team'], 'away_team': row['Away Team'], 'winning_team': None})
    else:
        pred_set.append({'home_team': row['Away Team'], 'away_team': row['Home Team'], 'winning_team': None})
        
pred_set = pd.DataFrame(pred_set)
backup_pred_set = pred_set

pred_set.head(25)

Unnamed: 0,home_team,away_team,winning_team
0,Netherlands,Senegal,
1,England,Iran,
2,Ecuador,Qatar,
3,Wales,USA,
4,Argentina,Saudi Arabia,
5,Denmark,Tunisia,
6,Mexico,Poland,
7,France,Australia,
8,Croatia,Morocco,
9,Germany,Japan,


In [25]:
# # Get dummy variables and drop winning_team column
pred_set = pd.get_dummies(pred_set, prefix=['home_team', 'away_team'], columns=['home_team', 'away_team'])

# Add missing columns compared to the model's training dataset
missing_cols = set(final.columns) - set(pred_set.columns)
for c in missing_cols:
    pred_set[c] = 0
pred_set = pred_set[final.columns]

# Remove winning team column
pred_set = pred_set.drop(['winning_team'], axis=1)

pred_set.head()

  pred_set[c] = 0


Unnamed: 0,goal_difference,home_team_Afghanistan,home_team_Albania,home_team_Algeria,home_team_Andorra,home_team_Angola,home_team_Antigua and Barbuda,home_team_Argentina,home_team_Armenia,home_team_Aruba,home_team_Australia,home_team_Austria,home_team_Azerbaijan,home_team_Bahrain,home_team_Bangladesh,home_team_Barbados,home_team_Belarus,home_team_Belgium,home_team_Belize,home_team_Benin,home_team_Bermuda,home_team_Bhutan,home_team_Bolivia,home_team_Bosnia and Herzegovina,home_team_Botswana,home_team_Brazil,home_team_Bulgaria,home_team_Burkina Faso,home_team_Burundi,home_team_Cambodia,home_team_Cameroon,home_team_Canada,home_team_Cape Verde,home_team_Cayman Islands,home_team_Chile,home_team_China PR,home_team_Colombia,home_team_Comoros,home_team_Congo,home_team_Costa Rica,home_team_Croatia,home_team_Cuba,home_team_Curaçao,home_team_Cyprus,home_team_Czech Republic,home_team_Czechoslovakia,home_team_DR Congo,home_team_Denmark,home_team_Dominica,home_team_Ecuador,home_team_Egypt,home_team_El Salvador,home_team_England,home_team_Equatorial Guinea,home_team_Estonia,home_team_Eswatini,home_team_Ethiopia,home_team_Faroe Islands,home_team_Fiji,home_team_Finland,home_team_France,home_team_Gabon,home_team_Gambia,home_team_Georgia,home_team_German DR,home_team_Germany,home_team_Ghana,home_team_Gibraltar,home_team_Greece,home_team_Grenada,home_team_Guam,home_team_Guatemala,home_team_Guinea,home_team_Guyana,home_team_Haiti,home_team_Honduras,home_team_Hong Kong,home_team_Hungary,home_team_Iceland,home_team_India,home_team_Indonesia,home_team_Iran,home_team_Iraq,home_team_Israel,home_team_Italy,home_team_Ivory Coast,home_team_Jamaica,home_team_Japan,home_team_Jordan,home_team_Kazakhstan,home_team_Kenya,home_team_Kosovo,home_team_Kuwait,home_team_Kyrgyzstan,home_team_Laos,home_team_Latvia,home_team_Lebanon,home_team_Lesotho,home_team_Liberia,home_team_Libya,home_team_Liechtenstein,home_team_Lithuania,home_team_Luxembourg,home_team_Macau,home_team_Madagascar,home_team_Malawi,home_team_Malaysia,home_team_Maldives,home_team_Mali,home_team_Malta,home_team_Mauritania,home_team_Mauritius,home_team_Mexico,home_team_Moldova,home_team_Montenegro,home_team_Morocco,home_team_Mozambique,home_team_Myanmar,home_team_Namibia,home_team_Nepal,home_team_Netherlands,home_team_New Zealand,home_team_Niger,home_team_Nigeria,home_team_North Korea,home_team_North Macedonia,home_team_Northern Ireland,home_team_Norway,home_team_Oman,home_team_Palestine,home_team_Panama,home_team_Paraguay,home_team_Peru,home_team_Poland,home_team_Portugal,home_team_Puerto Rico,home_team_Qatar,home_team_Republic of Ireland,home_team_Romania,home_team_Russia,home_team_Rwanda,home_team_Saarland,home_team_Saint Kitts and Nevis,home_team_Saint Lucia,home_team_Saint Vincent and the Grenadines,home_team_San Marino,home_team_Saudi Arabia,home_team_Scotland,home_team_Senegal,home_team_Serbia,home_team_Seychelles,home_team_Sierra Leone,home_team_Singapore,home_team_Slovakia,home_team_Slovenia,home_team_Solomon Islands,home_team_South Africa,home_team_South Korea,home_team_Spain,home_team_Sri Lanka,home_team_Sudan,home_team_Suriname,home_team_Sweden,home_team_Switzerland,home_team_Syria,home_team_Tahiti,home_team_Taiwan,home_team_Tajikistan,home_team_Tanzania,home_team_Thailand,home_team_Timor-Leste,home_team_Togo,home_team_Trinidad and Tobago,home_team_Tunisia,home_team_Turkey,home_team_Turkmenistan,home_team_Uganda,home_team_Ukraine,home_team_United Arab Emirates,home_team_United States,home_team_Uruguay,home_team_Uzbekistan,home_team_Venezuela,home_team_Vietnam,home_team_Vietnam Republic,home_team_Wales,home_team_Yemen,home_team_Yugoslavia,home_team_Zambia,home_team_Zimbabwe,away_team_Afghanistan,away_team_Albania,away_team_Algeria,away_team_American Samoa,away_team_Andorra,away_team_Angola,away_team_Antigua and Barbuda,away_team_Argentina,away_team_Armenia,away_team_Australia,away_team_Austria,away_team_Azerbaijan,away_team_Bahrain,away_team_Bangladesh,away_team_Barbados,away_team_Belarus,away_team_Belgium,away_team_Belize,away_team_Benin,away_team_Bermuda,away_team_Bhutan,away_team_Bolivia,away_team_Bosnia and Herzegovina,away_team_Botswana,away_team_Brazil,away_team_Bulgaria,away_team_Burkina Faso,away_team_Burundi,away_team_Cambodia,away_team_Cameroon,away_team_Canada,away_team_Cape Verde,away_team_Chile,away_team_China PR,away_team_Colombia,away_team_Comoros,away_team_Congo,away_team_Costa Rica,away_team_Croatia,away_team_Cuba,away_team_Curaçao,away_team_Cyprus,away_team_Czech Republic,away_team_Czechoslovakia,away_team_DR Congo,away_team_Denmark,away_team_Dominica,away_team_Ecuador,away_team_Egypt,away_team_El Salvador,away_team_England,away_team_Equatorial Guinea,away_team_Estonia,away_team_Eswatini,away_team_Ethiopia,away_team_Faroe Islands,away_team_Fiji,away_team_Finland,away_team_France,away_team_Gabon,away_team_Gambia,away_team_Georgia,away_team_German DR,away_team_Germany,away_team_Ghana,away_team_Gibraltar,away_team_Greece,away_team_Grenada,away_team_Guam,away_team_Guatemala,away_team_Guinea,away_team_Guinea-Bissau,away_team_Guyana,away_team_Haiti,away_team_Honduras,away_team_Hong Kong,away_team_Hungary,away_team_Iceland,away_team_India,away_team_Indonesia,away_team_Iran,away_team_Iraq,away_team_Israel,away_team_Italy,away_team_Ivory Coast,away_team_Jamaica,away_team_Japan,away_team_Jordan,away_team_Kazakhstan,away_team_Kenya,away_team_Kosovo,away_team_Kuwait,away_team_Kyrgyzstan,away_team_Laos,away_team_Latvia,away_team_Lebanon,away_team_Lesotho,away_team_Liberia,away_team_Libya,away_team_Liechtenstein,away_team_Lithuania,away_team_Luxembourg,away_team_Macau,away_team_Madagascar,away_team_Malawi,away_team_Malaysia,away_team_Maldives,away_team_Mali,away_team_Malta,away_team_Mauritania,away_team_Mauritius,away_team_Mexico,away_team_Moldova,away_team_Mongolia,away_team_Montenegro,away_team_Morocco,away_team_Mozambique,away_team_Namibia,away_team_Nepal,away_team_Netherlands,away_team_New Zealand,away_team_Niger,away_team_Nigeria,away_team_North Korea,away_team_North Macedonia,away_team_Northern Ireland,away_team_Norway,away_team_Oman,away_team_Palestine,away_team_Panama,away_team_Paraguay,away_team_Peru,away_team_Philippines,away_team_Poland,away_team_Portugal,away_team_Puerto Rico,away_team_Qatar,away_team_Republic of Ireland,away_team_Romania,away_team_Russia,away_team_Rwanda,away_team_Saarland,away_team_Saint Kitts and Nevis,away_team_Saint Lucia,away_team_Saint Vincent and the Grenadines,away_team_Samoa,away_team_San Marino,away_team_Saudi Arabia,away_team_Scotland,away_team_Senegal,away_team_Serbia,away_team_Seychelles,away_team_Sierra Leone,away_team_Singapore,away_team_Slovakia,away_team_Slovenia,away_team_Solomon Islands,away_team_Somalia,away_team_South Africa,away_team_South Korea,away_team_Spain,away_team_Sri Lanka,away_team_Sudan,away_team_Suriname,away_team_Sweden,away_team_Switzerland,away_team_Syria,away_team_Tahiti,away_team_Taiwan,away_team_Tajikistan,away_team_Tanzania,away_team_Thailand,away_team_Timor-Leste,away_team_Togo,away_team_Tonga,away_team_Trinidad and Tobago,away_team_Tunisia,away_team_Turkey,away_team_Turkmenistan,away_team_Uganda,away_team_Ukraine,away_team_United Arab Emirates,away_team_United States,away_team_Uruguay,away_team_Uzbekistan,away_team_Vanuatu,away_team_Venezuela,away_team_Vietnam,away_team_Wales,away_team_Yemen,away_team_Yugoslavia,away_team_Zambia,away_team_Zimbabwe
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [27]:
# group matches
predictions = classifier.predict(pred_set)
for i in range(schedule.shape[0]):
    print(backup_pred_set.iloc[i, 1] + " and " + backup_pred_set.iloc[i, 0])
    if predictions[i] == 2:
        print("Winner: " + backup_pred_set.iloc[i, 1])
    elif predictions[i] == 1:
        print("Draw")
    elif predictions[i] == 0:
        print("Winner: " + backup_pred_set.iloc[i, 0])
    print('Probability of ' + backup_pred_set.iloc[i, 1] + ' winning: ', '%.3f'%(classifier.predict_proba(pred_set)[i][2]))
    print('Probability of Draw: ', '%.3f'%(classifier.predict_proba(pred_set)[i][1]))
    print('Probability of ' + backup_pred_set.iloc[i, 0] + ' winning: ', '%.3f'%(classifier.predict_proba(pred_set)[i][0]))
    print("")

Senegal and Netherlands
Draw
Probability of Senegal winning:  0.010
Probability of Draw:  0.989
Probability of Netherlands winning:  0.001

Iran and England
Draw
Probability of Iran winning:  0.006
Probability of Draw:  0.991
Probability of England winning:  0.003

Qatar and Ecuador
Draw
Probability of Qatar winning:  0.008
Probability of Draw:  0.990
Probability of Ecuador winning:  0.002

USA and Wales
Draw
Probability of USA winning:  0.006
Probability of Draw:  0.991
Probability of Wales winning:  0.003

Saudi Arabia and Argentina
Draw
Probability of Saudi Arabia winning:  0.009
Probability of Draw:  0.989
Probability of Argentina winning:  0.002

Tunisia and Denmark
Draw
Probability of Tunisia winning:  0.005
Probability of Draw:  0.992
Probability of Denmark winning:  0.003

Poland and Mexico
Draw
Probability of Poland winning:  0.007
Probability of Draw:  0.990
Probability of Mexico winning:  0.003

Australia and France
Draw
Probability of Australia winning:  0.010
Probability o

In [None]:
# I THINK WE NEED TO SAVE THIS OUTPUT AS A VARIABLE. 
# FOR GAMES WHERE THE PROBABILITY OF WINNING IS THE SAME FOR EITHER TEAM, CALL THAT A TRUE DRAW
# OTHERWISE, CAN WE ASSUME THAT THE TEAM WITH THE HIGHER PROBABILITY OF WINNING WOULD ADVANCE TO KNOCKOUT?

# GROUPSTAGE_PTS = 0

# FOR EACH IN OUTCOMES:
#     IF DF[FIRST-COUNTRY] WIN PROBABILITY > DF[SECOND-COUNTRY] WIN PROBABILITY:
#         GROUPSTAGE_PTS =+ 3
#     IF DF[SECOND-COUNTRY] WIN PROBABILITY > DF[FIRST-COUNTRY] WIN PROBABILITY:
#         GROUPSTAGE_PTS =+3
#     ELSEIF DF[SECOND-COUNTRY] WIN PROBABILITY == DF[FIRST-COUNTRY] WIN PROBABILITY:
#         GROUPSTAGE_PTS += 1
#        
# then sum the groupstage points per group to get the top 2 teams from each, which advance to the the round of 16, and so on

In [53]:
# WINNERS OF GROUP A MATCHES
schedule_grpA = schedule[schedule["Group"]=="Group A"]
schedule_grpA = schedule_grpA.drop(columns = ["Match Number", "Round Number", "Date", "Location", "Result"], axis = 1)
schedule_grpA

grpA_hometeams = schedule_grpA["Home Team"]
# grpA_hometeams

# PREDICT IF HOME TEAM WILL BEAT AWAY TEAM
predictions = classifier.predict(grpA_hometeams)
for i in range(schedule.shape[0]):
    print(backup_pred_set.iloc[i, 1] + " and " + backup_pred_set.iloc[i, 0])
    if predictions[i] == 2:
        print("Winner: " + backup_pred_set.iloc[i, 1])
    elif predictions[i] == 1:
        print("Draw")
    elif predictions[i] == 0:
        print("Winner: " + backup_pred_set.iloc[i, 0])
    print('Probability of ' + backup_pred_set.iloc[i, 1] + ' winning: ', '%.3f'%(classifier.predict_proba(pred_set)[i][2]))
    print('Probability of Draw: ', '%.3f'%(classifier.predict_proba(pred_set)[i][1]))
    print('Probability of ' + backup_pred_set.iloc[i, 0] + ' winning: ', '%.3f'%(classifier.predict_proba(pred_set)[i][0]))
    print("")

# pred_setA = []

# for index, row in schedule_grpA.iterrows():
    
#     if row['first_position'] < row['second_position']:
#         pred_set.append({'home_team': row['Home Team'], 'away_team': row['Away Team'], 'winning_team': None})
#     else:
#         pred_set.append({'home_team': row['Away Team'], 'away_team': row['Home Team'], 'winning_team': None})
        
# pred_setA = pd.DataFrame(pred_setA)
# backup_pred_setA = pred_setA

# pred_setA.head(25)




# # # Get dummy variables and drop winning_team column
# pred_set = pd.get_dummies(pred_set, prefix=['home_team', 'away_team'], columns=['home_team', 'away_team'])

# # Add missing columns compared to the model's training dataset
# missing_cols = set(final.columns) - set(pred_set.columns)
# for c in missing_cols:
#     pred_set[c] = 0
# pred_set = pred_set[final.columns]

# # Remove winning team column
# pred_set = pred_set.drop(['winning_team'], axis=1)

# pred_set.head()






# predictions_grpA = classifier.predict(pred_set)
# for i in range(schedule.shape[0]):
#     print(backup_pred_set.iloc[i, 1] + " and " + backup_pred_set.iloc[i, 0])
#     if predictions[i] == 2:
#         print("Winner: " + backup_pred_set.iloc[i, 1])
#     elif predictions[i] == 1:
#         print("Draw")
#     elif predictions[i] == 0:
#         print("Winner: " + backup_pred_set.iloc[i, 0])
#     print('Probability of ' + backup_pred_set.iloc[i, 1] + ' winning: ', '%.3f'%(classifier.predict_proba(pred_set)[i][2]))
#     print('Probability of Draw: ', '%.3f'%(classifier.predict_proba(pred_set)[i][1]))
#     print('Probability of ' + backup_pred_set.iloc[i, 0] + ' winning: ', '%.3f'%(classifier.predict_proba(pred_set)[i][0]))
#     print("")

ValueError: could not convert string to float: 'Senegal'