In [1]:
# run dependencies
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from functools import reduce
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor

In [2]:
pd.set_option('display.max_columns', None)

In [3]:
# IMPORT CSV FILES 

# -- PER COUNTRY FEATURES --
rank = pd.read_csv("Features/fifa_rank.csv")
player_avg = pd.read_csv("Features/player_averages_for_teams.csv")
win_ratio = pd.read_csv("Features/ratio_played_vs_won.csv")
delta_pts = pd.read_csv("Features/total_score_margins.csv")
games = pd.read_csv("Features/adv_df_pivot.csv")
player_top_features = pd.read_csv("Features/player_top_features_country.csv")

# -- OTHER FEATURES --
df_matches = pd.read_csv("Features/wc_matches.csv")
schedule = pd.read_csv('Data/schedule.csv')

# IDENTIFY 2022 WORLD CUP TEAMS
teams_2022 = ['Qatar', 'Netherlands', 'Senegal', 'Ecuador', 
            'England', 'United States', 'Wales', 'Iran', 
            'Argentina', 'Poland', 'Mexico', 'Saudi Arabia', 
            'France', 'Denmark', 'Tunisia', 'Australia', 
            'Germany', 'Spain', 'Japan', 'Costa Rica', 
            'Belgium', 'Croatia', 'Canada', 'Morocco', 
            'Brazil', 'Switzerland', 'Serbia', 'Cameroon', 
            'Portugal', 'Uruguay', 'Ghana', 'Korea Republic']

#import dataset that you will be predicting upon (Target = predicting a win, draw, or loss)
#But also, the match data needs to be organized differently.
# df_matches = pd.read_csv("Features/wc_matches.csv")
# df_matches

In [4]:
# CLEAN UP

rank = rank.drop(['Unnamed: 0', 'Points'], axis=1)
player_avg = player_avg.rename(columns={"Nationality": "Country"})
win_ratio = win_ratio.drop(['Unnamed: 0'], axis=1)
delta_pts = delta_pts.drop(['Unnamed: 0'], axis=1)
games = games.rename(columns={"country": "Country"})
player_top_features = player_top_features.drop(['Unnamed: 0'], axis=1)

In [5]:
# VIEW DATAFRAMES

# rank.head()
# player_avg.head()
# win_ratio.head()
# delta_pts.head()
# games.head()
# player_top_features.head()

In [6]:
# MERGE DATAFRAMES ON "COUNTRY"

 # list of dataframes
# dfs = [rank, player_avg, win_ratio, delta_pts, games]
dfs = [rank, player_top_features, win_ratio, delta_pts, games]

# MERGING
compiled_df = reduce(lambda  left,right: pd.merge(left,right,on=["Country"],
                                            how='outer'), dfs)
# FILLING NULLS WITH 0
compiled_df = compiled_df.fillna(0)

In [7]:
# limit the dataframe rows to the relevant teams playing in the 2022 world cup
compiled_df = compiled_df[(compiled_df['Country'].isin(teams_2022))]
compiled_df.head()

Unnamed: 0,Rank,Country,avg_Composure,avg_Overall,avg_Potential,avg_Volleys,avg_Reactions,total_wins,total_games,ratio_won,win_margins,lose_margins,final,group stage,quarter-finals,round of 16,second group stage,semi-finals,third-place match
0,1,Brazil,64.8362,70.9672,73.1028,49.5706,68.139,56.0,77.0,0.727273,3.790698,-1.8,4.0,39.0,9.0,9.0,8.0,5.0,3.0
1,2,Belgium,60.6567,67.6767,73.9033,44.65,62.58,21.0,42.0,0.5,2.5,-1.903846,0.0,27.0,3.0,6.0,2.0,2.0,2.0
2,3,Argentina,60.8926,67.8063,72.7284,45.2558,63.9053,39.0,65.0,0.6,2.823529,-2.059524,4.0,36.0,6.0,8.0,8.0,3.0,0.0
3,4,France,60.0427,67.2386,73.5875,43.5765,62.4125,29.0,50.0,0.58,3.57619,-1.4,3.0,27.0,5.0,5.0,2.0,5.0,2.0
4,5,England,56.4206,63.2728,70.3153,40.3194,59.032,22.0,30.0,0.733333,3.232143,-1.064706,0.0,30.0,6.0,7.0,2.0,2.0,2.0


In [8]:
df_matches = df_matches.reset_index(drop=True)
df_matches.loc[df_matches.winning_team == df_matches.home_team,'winning_team']=2
df_matches.loc[df_matches.winning_team == 'draw', 'winning_team']=1
df_matches.loc[df_matches.winning_team == df_matches.away_team, 'winning_team']=0

df_matches.columns

Index(['Unnamed: 0', 'date', 'home_team', 'away_team', 'home_score',
       'away_score', 'tournament', 'city', 'country', 'neutral',
       'winning_team', 'goal_difference', 'year'],
      dtype='object')

In [9]:
df_matches_hometeams = df_matches.drop(columns = ["Unnamed: 0", "date", "away_team", "home_score", "away_score", "tournament", "city", "country", "neutral"], axis = 1)
df_matches_awayteams = df_matches.drop(columns = ["Unnamed: 0", "date", "home_team", "home_score", "away_score", "tournament", "city", "country", "neutral"], axis = 1)

In [10]:
df_matches_awayteams = df_matches_awayteams.rename(columns = {"away_team":"Country"})
df_matches_awayteams

Unnamed: 0,Country,winning_team,goal_difference,year
0,United States,0,3,1930
1,Mexico,2,3,1930
2,Yugoslavia,0,1,1930
3,France,2,1,1930
4,Mexico,2,3,1930
...,...,...,...,...
4299,United States,2,2,2022
4300,Canada,2,1,2022
4301,Ukraine,2,1,2022
4302,Peru,1,0,2022


In [11]:
df_matches_hometeams = df_matches_hometeams.rename(columns = {"home_team":"Country"})
df_matches_hometeams

Unnamed: 0,Country,winning_team,goal_difference,year
0,Belgium,0,3,1930
1,France,2,3,1930
2,Brazil,0,1,1930
3,Argentina,2,1,1930
4,Chile,2,3,1930
...,...,...,...,...
4299,Costa Rica,2,2,2022
4300,Panama,2,1,2022
4301,Wales,2,1,2022
4302,Australia,1,0,2022


In [12]:
hist_country_matches = df_matches_hometeams.append(df_matches_awayteams, ignore_index=True)
hist_country_matches

Unnamed: 0,Country,winning_team,goal_difference,year
0,Belgium,0,3,1930
1,France,2,3,1930
2,Brazil,0,1,1930
3,Argentina,2,1,1930
4,Chile,2,3,1930
...,...,...,...,...
8603,United States,2,2,2022
8604,Canada,2,1,2022
8605,Ukraine,2,1,2022
8606,Peru,1,0,2022


In [13]:
# Create new columns with ranking position of each team
schedule.insert(1, 'first_position', schedule['Home Team'].map(rank.set_index('Country')['Rank']))
schedule.insert(2, 'second_position', schedule['Away Team'].map(rank.set_index('Country')['Rank']))

# We only need the group stage games, so we have to slice the dataset
schedule_grp = schedule.iloc[:48, :]
# schedule_grp

# MERGING COMPILED_DF WITH SCHEDULE_GRP TO GET THE COUNTRY FEATURES PER MATCHUP
compiled_histmatches_beta = pd.merge(hist_country_matches, compiled_df, how="left", left_on = "Country", right_on = "Country")
# compiled_histmatches_gamma = pd.merge(compiled_histmatches_beta, compiled_df, how="left", left_on = "Country", right_on = "Country")

compiled_histmatches = compiled_histmatches_beta
compiled_histmatches.info()
# unpivot to have row by row of matches, qtar v ecaudor and equador v qatar

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8608 entries, 0 to 8607
Data columns (total 22 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Country             8608 non-null   object 
 1   winning_team        8608 non-null   object 
 2   goal_difference     8608 non-null   int64  
 3   year                8608 non-null   int64  
 4   Rank                4965 non-null   float64
 5   avg_Composure       4965 non-null   float64
 6   avg_Overall         4965 non-null   float64
 7   avg_Potential       4965 non-null   float64
 8   avg_Volleys         4965 non-null   float64
 9   avg_Reactions       4965 non-null   float64
 10  total_wins          4965 non-null   float64
 11  total_games         4965 non-null   float64
 12  ratio_won           4965 non-null   float64
 13  win_margins         4965 non-null   float64
 14  lose_margins        4965 non-null   float64
 15  final               4965 non-null   float64
 16  group 

In [14]:
compiled_histmatches_beta

Unnamed: 0,Country,winning_team,goal_difference,year,Rank,avg_Composure,avg_Overall,avg_Potential,avg_Volleys,avg_Reactions,total_wins,total_games,ratio_won,win_margins,lose_margins,final,group stage,quarter-finals,round of 16,second group stage,semi-finals,third-place match
0,Belgium,0,3,1930,2.0,60.6567,67.6767,73.9033,44.6500,62.5800,21.0,42.0,0.500000,2.500000,-1.903846,0.0,27.0,3.0,6.0,2.0,2.0,2.0
1,France,2,3,1930,4.0,60.0427,67.2386,73.5875,43.5765,62.4125,29.0,50.0,0.580000,3.576190,-1.400000,3.0,27.0,5.0,5.0,2.0,5.0,2.0
2,Brazil,0,1,1930,1.0,64.8362,70.9672,73.1028,49.5706,68.1390,56.0,77.0,0.727273,3.790698,-1.800000,4.0,39.0,9.0,9.0,8.0,5.0,3.0
3,Argentina,2,1,1930,3.0,60.8926,67.8063,72.7284,45.2558,63.9053,39.0,65.0,0.600000,2.823529,-2.059524,4.0,36.0,6.0,8.0,8.0,3.0,0.0
4,Chile,2,3,1930,,,,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8603,United States,2,2,2022,14.0,55.0921,63.5703,70.2583,38.8338,59.1867,5.0,18.0,0.277778,2.500000,-2.166667,0.0,21.0,1.0,4.0,0.0,0.0,0.0
8604,Canada,2,1,2022,43.0,55.2597,62.8312,69.7273,38.7273,58.1039,0.0,3.0,0.000000,0.000000,0.000000,0.0,3.0,0.0,0.0,0.0,0.0,0.0
8605,Ukraine,2,1,2022,,,,,,,,,,,,,,,,,,
8606,Peru,1,0,2022,,,,,,,,,,,,,,,,,,


In [15]:
# MERGE COMPLIED DATA WITH DF_MATCHES
# compiled_matches_df = pd.merge(df_matches,compiled_df, left_on = "winning_team", right_on = "Country", how = "inner")
# compiled_matches_df.tail()

In [16]:
# final = pd.get_dummies(compiled_matches_df)

# # Separate X and y sets
# X = final.drop(['winning_team'], axis=1)
# y = final["winning_team"]

# y=y.astype('int')

# # Separate train and test sets
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42)

# from sklearn.ensemble import RandomForestRegressor
# classifier = RandomForestRegressor(n_estimators = 1000, random_state = 42)

# classifier.fit(X_train, y_train)

# score = classifier.score(X_train, y_train)
# score2 = classifier.score(X_test, y_test)
# print("Training set accuracy: ", '%.3f'%(score))
# print("Test set accuracy: ", '%.3f'%(score2))


In [17]:
#remove data that we consider irrelevant to predicting the feature
# df_matches = df_matches.drop(['date', 'Unnamed: 0', 'home_score', 'away_score', 'tournament', 'city', 'country', 'neutral','year'], axis=1)

In [18]:
#if we are going to split the data per country, we will need to define the match data below differently:
#each row is one of the 32 countries
#previous match data would be a column
#ie. Belgium on row, and United States, Mexico, Yugoslavia, etc on y axis
#not exactly sure if this will work

In [19]:
# define home team, away team, and winning team
# The winning team column will show "2" if the home team has won, "1" if it was a tie, and "0" if the away team has won
# The model will be builT to predict the "winning_team"

df_matches = df_matches.reset_index(drop=True)
df_matches.loc[df_matches.winning_team == df_matches.home_team,'winning_team']=2
df_matches.loc[df_matches.winning_team == 'draw', 'winning_team']=1
df_matches.loc[df_matches.winning_team == df_matches.away_team, 'winning_team']=0

# df_matches

In [20]:
# final = pd.get_dummies(df_matches, prefix=['home_team', 'away_team'], columns=['home_team', 'away_team'])

# # Separate X and y sets
# X = final.drop(['winning_team'], axis=1)
# y = final["winning_team"]

# y=y.astype('int')

# # Separate train and test sets
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42)

# from sklearn.ensemble import RandomForestRegressor
# classifier = RandomForestRegressor(n_estimators = 1000, random_state = 42)

# classifier.fit(X_train, y_train)

# score = classifier.score(X_train, y_train)
# score2 = classifier.score(X_test, y_test)
# print("Training set accuracy: ", '%.3f'%(score))
# print("Test set accuracy: ", '%.3f'%(score2))


# Running (up that hill) the model

In [21]:
compiled_histmatches

Unnamed: 0,Country,winning_team,goal_difference,year,Rank,avg_Composure,avg_Overall,avg_Potential,avg_Volleys,avg_Reactions,total_wins,total_games,ratio_won,win_margins,lose_margins,final,group stage,quarter-finals,round of 16,second group stage,semi-finals,third-place match
0,Belgium,0,3,1930,2.0,60.6567,67.6767,73.9033,44.6500,62.5800,21.0,42.0,0.500000,2.500000,-1.903846,0.0,27.0,3.0,6.0,2.0,2.0,2.0
1,France,2,3,1930,4.0,60.0427,67.2386,73.5875,43.5765,62.4125,29.0,50.0,0.580000,3.576190,-1.400000,3.0,27.0,5.0,5.0,2.0,5.0,2.0
2,Brazil,0,1,1930,1.0,64.8362,70.9672,73.1028,49.5706,68.1390,56.0,77.0,0.727273,3.790698,-1.800000,4.0,39.0,9.0,9.0,8.0,5.0,3.0
3,Argentina,2,1,1930,3.0,60.8926,67.8063,72.7284,45.2558,63.9053,39.0,65.0,0.600000,2.823529,-2.059524,4.0,36.0,6.0,8.0,8.0,3.0,0.0
4,Chile,2,3,1930,,,,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8603,United States,2,2,2022,14.0,55.0921,63.5703,70.2583,38.8338,59.1867,5.0,18.0,0.277778,2.500000,-2.166667,0.0,21.0,1.0,4.0,0.0,0.0,0.0
8604,Canada,2,1,2022,43.0,55.2597,62.8312,69.7273,38.7273,58.1039,0.0,3.0,0.000000,0.000000,0.000000,0.0,3.0,0.0,0.0,0.0,0.0,0.0
8605,Ukraine,2,1,2022,,,,,,,,,,,,,,,,,,
8606,Peru,1,0,2022,,,,,,,,,,,,,,,,,,


In [22]:
final = compiled_histmatches.dropna().reset_index(drop=True)
final

Unnamed: 0,Country,winning_team,goal_difference,year,Rank,avg_Composure,avg_Overall,avg_Potential,avg_Volleys,avg_Reactions,total_wins,total_games,ratio_won,win_margins,lose_margins,final,group stage,quarter-finals,round of 16,second group stage,semi-finals,third-place match
0,Belgium,0,3,1930,2.0,60.6567,67.6767,73.9033,44.6500,62.5800,21.0,42.0,0.500000,2.500000,-1.903846,0.0,27.0,3.0,6.0,2.0,2.0,2.0
1,France,2,3,1930,4.0,60.0427,67.2386,73.5875,43.5765,62.4125,29.0,50.0,0.580000,3.576190,-1.400000,3.0,27.0,5.0,5.0,2.0,5.0,2.0
2,Brazil,0,1,1930,1.0,64.8362,70.9672,73.1028,49.5706,68.1390,56.0,77.0,0.727273,3.790698,-1.800000,4.0,39.0,9.0,9.0,8.0,5.0,3.0
3,Argentina,2,1,1930,3.0,60.8926,67.8063,72.7284,45.2558,63.9053,39.0,65.0,0.600000,2.823529,-2.059524,4.0,36.0,6.0,8.0,8.0,3.0,0.0
4,Uruguay,2,1,1930,13.0,58.3902,67.3844,71.9769,44.1040,64.0751,13.0,21.0,0.619048,2.619048,-2.361538,0.0,24.0,3.0,5.0,0.0,2.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4960,Argentina,1,0,2022,3.0,60.8926,67.8063,72.7284,45.2558,63.9053,39.0,65.0,0.600000,2.823529,-2.059524,4.0,36.0,6.0,8.0,8.0,3.0,0.0
4961,Brazil,0,4,2022,1.0,64.8362,70.9672,73.1028,49.5706,68.1390,56.0,77.0,0.727273,3.790698,-1.800000,4.0,39.0,9.0,9.0,8.0,5.0,3.0
4962,Uruguay,0,2,2022,13.0,58.3902,67.3844,71.9769,44.1040,64.0751,13.0,21.0,0.619048,2.619048,-2.361538,0.0,24.0,3.0,5.0,0.0,2.0,2.0
4963,United States,2,2,2022,14.0,55.0921,63.5703,70.2583,38.8338,59.1867,5.0,18.0,0.277778,2.500000,-2.166667,0.0,21.0,1.0,4.0,0.0,0.0,0.0


In [23]:
# final = pd.get_dummies(compiled_histmatches, prefix=['Home Team', 'Away Team'], columns=['home_team', 'away_team'])
# final = compiled_histmatches

# Separate X and y sets
X = final.drop(['winning_team', "Country"], axis=1)
y = final["winning_team"]

y=y.astype('int')

# Separate train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42)

In [24]:
#classify data as 0, 1, or 2 (home team lost, drew, or won)
# classifier = LogisticRegression()

# RAN THIS MODEL AS A RANDOM FOREST REGRESSOR. TRAINING SCORE SIMILAR TO LOGISTIC REGRESSION, BUT ACCURACY SUFFERED
# from sklearn.ensemble import RandomForestRegressor
classifier = RandomForestClassifier(n_estimators = 1000, random_state = 42)

classifier.fit(X_train, y_train)

score = classifier.score(X_train, y_train)
score2 = classifier.score(X_test, y_test)

print("Training set accuracy: ", '%.3f'%(score))
print("Test set accuracy: ", '%.3f'%(score2))

Training set accuracy:  0.906
Test set accuracy:  0.656


In [40]:
# CLASSIFER AS LOGISTIC REGRESSION

classifier_log = LogisticRegression()
classifier_log.fit(X_train, y_train)

score_log = classifier.score(X_train, y_train)
score2_log = classifier.score(X_test, y_test)

print("Training set accuracy: ", '%.3f'%(score_log))
print("Test set accuracy: ", '%.3f'%(score2_log))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Training set accuracy:  0.906
Test set accuracy:  0.656


In [25]:
# schedule_grp[["Home Team", "Away Team", "Group"]].groupby("Group").
# print(X)
# final.head()

In [26]:
# CREATE THE PREDICTIVE SET OFF THE SCHDULE

pred_set= []

for index, row in schedule_grp.iterrows():
    if row['first_position'] < row['second_position']:
        pred_set.append({'home_team': row['Home Team'], 'away_team': row['Away Team'], 'winning_team': None})
    else:
        pred_set.append({'home_team': row['Away Team'], 'away_team': row['Home Team'], 'winning_team': None})
        
pred_set = pd.DataFrame(pred_set)
backup_pred_set = pred_set

# pred_set.head(25)

In [27]:
X_train.columns

Index(['goal_difference', 'year', 'Rank', 'avg_Composure', 'avg_Overall',
       'avg_Potential', 'avg_Volleys', 'avg_Reactions', 'total_wins',
       'total_games', 'ratio_won', 'win_margins', 'lose_margins', 'final',
       'group stage', 'quarter-finals', 'round of 16', 'second group stage',
       'semi-finals', 'third-place match'],
      dtype='object')

In [28]:
# pred_set = pred_set.drop(columns = ["Country", "winning_team"])
# pred_set

In [29]:
pred_set.columns

Index(['home_team', 'away_team', 'winning_team'], dtype='object')

In [30]:
# # Get dummy variables and drop winning_team column
# pred_set = pred_set.drop(columns = ["Match Number", "Round Number", "Group", "Result"], axis = 1)

# from sklearn.preprocessing import OneHotEncoder
# oneh = OneHotEncoder(handle_unknown="ignore")
# oneh.fit(final)
# X_test = oneh.transform(schedule_grp)

pred_set = pd.get_dummies(schedule_grp, prefix=['Home Team', 'Away Team'], columns=['Home Team', 'Away Team'])

# Add missing columns compared to the model's training dataset
missing_cols = set(final.columns) - set(pred_set.columns)
for c in missing_cols:
    pred_set[c] = 0
pred_set = pred_set[final.columns]

pred_set = pred_set.drop(columns = ["Country", "winning_team"])

backup_pred_set = schedule_grp.drop(columns = ["Match Number", "Round Number", "Group"])

# Remove winning team column
# pred_set = pred_set.drop(['winning_team'], axis=1)

# pred_set = pred_set.dropna().reset_index(drop=True)
# pred_set.info(50)
# backup_pred_set

In [31]:
# # MERGING PRED_SET ONTO SCHEDULE_GRP
# final_w_predset_home = backup_pred_set.merge(final, left_on = "Home Team", right_on = "Country", how = "left")
# final_w_predset_home = final_w_predset_home.drop(columns = ["Date", "Location", "Home Team", "Away Team", "Country", "Result"], axis = 1)
# final_w_predset_home = final_w_predset_home.dropna()
# final_w_predset_home

In [32]:
# compiled_schedule_grp
# schedule_grp = schedule_grp.drop(["Date", "Location"], axis = 1)
# schedule_grp.head()
# final.columns
# pred_set.head()
# backup_pred_set.head()

In [33]:
# group matches

# predictions = classifier.predict(final)

# # Add missing columns compared to the model's training dataset
# missing_cols = set(final_w_predset_home.columns) - set(pred_set.columns)

# for c in missing_cols:
#     pred_set[c] = 0
    
# pred_set = pred_set[final_w_predset_home.columns]
# pred_set = pred_set.drop(['winning_team'], axis=1)
# pred_set.head()
backup_pred_set.head()
backup_pred_set.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48 entries, 0 to 47
Data columns (total 7 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   first_position   47 non-null     float64
 1   second_position  46 non-null     float64
 2   Date             48 non-null     object 
 3   Location         48 non-null     object 
 4   Home Team        48 non-null     object 
 5   Away Team        48 non-null     object 
 6   Result           0 non-null      float64
dtypes: float64(3), object(4)
memory usage: 2.8+ KB


In [41]:


predictions = classifier_log.predict(pred_set)

for i in range(schedule_grp.shape[0]):
    print(backup_pred_set.iloc[i, 4] + " and " + backup_pred_set.iloc[i, 5])
    if predictions[i] == 2:
        print("Winner: " + backup_pred_set.iloc[i, 4])
    elif predictions[i] == 1:
        print("Draw")
    elif predictions[i] == 0:
        print("Winner: " + backup_pred_set.iloc[i, 5])
    print('Probability of ' + backup_pred_set.iloc[i, 4] + ' winning: ', '%.3f'%(classifier.predict_proba(pred_set)[i][2]))
    print('Probability of Draw: ', '%.3f'%(classifier.predict_proba(pred_set)[i][1]))
    print('Probability of ' + backup_pred_set.iloc[i, 5] + ' winning: ', '%.3f'%(classifier.predict_proba(pred_set)[i][0]))
    print("")

Senegal and Netherlands
Winner: Senegal
Probability of Senegal winning:  0.064
Probability of Draw:  0.851
Probability of Netherlands winning:  0.085

England and Iran
Winner: England
Probability of England winning:  0.064
Probability of Draw:  0.851
Probability of Iran winning:  0.085

Qatar and Ecuador
Winner: Qatar
Probability of Qatar winning:  0.064
Probability of Draw:  0.851
Probability of Ecuador winning:  0.085

USA and Wales
Winner: USA
Probability of USA winning:  0.064
Probability of Draw:  0.851
Probability of Wales winning:  0.085

Argentina and Saudi Arabia
Winner: Argentina
Probability of Argentina winning:  0.064
Probability of Draw:  0.851
Probability of Saudi Arabia winning:  0.085

Denmark and Tunisia
Winner: Denmark
Probability of Denmark winning:  0.064
Probability of Draw:  0.851
Probability of Tunisia winning:  0.085

Mexico and Poland
Winner: Mexico
Probability of Mexico winning:  0.064
Probability of Draw:  0.851
Probability of Poland winning:  0.085

France a

In [None]:
# # group matches
# predictions = classifier.predict(pred_set)
# # predictions = classifier.predict(final)

# for i in range(schedule_grp.shape[0]):
#     print(backup_pred_set.iloc[i, 1] + " and " + backup_pred_set.iloc[i, 0])
#     if predictions[i] == 2:
#         print("Winner: " + backup_pred_set.iloc[i, 1])
#     elif predictions[i] == 1:
#         print("Draw")
#     elif predictions[i] == 0:
#         print("Winner: " + backup_pred_set.iloc[i, 0])
#     print('Probability of ' + backup_pred_set.iloc[i, 1] + ' winning: ', '%.3f'%(classifier.predict_proba(pred_set)[i][2]))
#     print('Probability of Draw: ', '%.3f'%(classifier.predict_proba(pred_set)[i][1]))
#     print('Probability of ' + backup_pred_set.iloc[i, 0] + ' winning: ', '%.3f'%(classifier.predict_proba(pred_set)[i][0]))
#     print("")

In [None]:
# I THINK WE NEED TO SAVE THIS OUTPUT AS A VARIABLE. 
# FOR GAMES WHERE THE PROBABILITY OF WINNING IS THE SAME FOR EITHER TEAM, CALL THAT A TRUE DRAW
# OTHERWISE, CAN WE ASSUME THAT THE TEAM WITH THE HIGHER PROBABILITY OF WINNING WOULD ADVANCE TO KNOCKOUT?

# GROUPSTAGE_PTS = 0

# FOR EACH IN OUTCOMES:
#     IF DF[FIRST-COUNTRY] WIN PROBABILITY > DF[SECOND-COUNTRY] WIN PROBABILITY:
#         GROUPSTAGE_PTS =+ 3
#     IF DF[SECOND-COUNTRY] WIN PROBABILITY > DF[FIRST-COUNTRY] WIN PROBABILITY:
#         GROUPSTAGE_PTS =+3
#     ELSEIF DF[SECOND-COUNTRY] WIN PROBABILITY == DF[FIRST-COUNTRY] WIN PROBABILITY:
#         GROUPSTAGE_PTS += 1
#        
# then sum the groupstage points per group to get the top 2 teams from each, which advance to the the round of 16, and so on

In [None]:
# WINNERS OF GROUP A MATCHES
schedule_grpA = schedule[schedule["Group"]=="Group A"]
schedule_grpA = schedule_grpA.drop(columns = ["Match Number", "Round Number", "Date", "Location", "Result"], axis = 1)
schedule_grpA

grpA_hometeams = schedule_grpA["Home Team"]
# grpA_hometeams

# PREDICT IF HOME TEAM WILL BEAT AWAY TEAM
predictions = classifier.predict(grpA_hometeams)
for i in range(schedule.shape[0]):
    print(backup_pred_set.iloc[i, 1] + " and " + backup_pred_set.iloc[i, 0])
    if predictions[i] == 2:
        print("Winner: " + backup_pred_set.iloc[i, 1])
    elif predictions[i] == 1:
        print("Draw")
    elif predictions[i] == 0:
        print("Winner: " + backup_pred_set.iloc[i, 0])
    print('Probability of ' + backup_pred_set.iloc[i, 1] + ' winning: ', '%.3f'%(classifier.predict_proba(pred_set)[i][2]))
    print('Probability of Draw: ', '%.3f'%(classifier.predict_proba(pred_set)[i][1]))
    print('Probability of ' + backup_pred_set.iloc[i, 0] + ' winning: ', '%.3f'%(classifier.predict_proba(pred_set)[i][0]))
    print("")

# pred_setA = []

# for index, row in schedule_grpA.iterrows():
    
#     if row['first_position'] < row['second_position']:
#         pred_set.append({'home_team': row['Home Team'], 'away_team': row['Away Team'], 'winning_team': None})
#     else:
#         pred_set.append({'home_team': row['Away Team'], 'away_team': row['Home Team'], 'winning_team': None})
        
# pred_setA = pd.DataFrame(pred_setA)
# backup_pred_setA = pred_setA

# pred_setA.head(25)




# # # Get dummy variables and drop winning_team column
# pred_set = pd.get_dummies(pred_set, prefix=['home_team', 'away_team'], columns=['home_team', 'away_team'])

# # Add missing columns compared to the model's training dataset
# missing_cols = set(final.columns) - set(pred_set.columns)
# for c in missing_cols:
#     pred_set[c] = 0
# pred_set = pred_set[final.columns]

# # Remove winning team column
# pred_set = pred_set.drop(['winning_team'], axis=1)

# pred_set.head()






# predictions_grpA = classifier.predict(pred_set)
# for i in range(schedule.shape[0]):
#     print(backup_pred_set.iloc[i, 1] + " and " + backup_pred_set.iloc[i, 0])
#     if predictions[i] == 2:
#         print("Winner: " + backup_pred_set.iloc[i, 1])
#     elif predictions[i] == 1:
#         print("Draw")
#     elif predictions[i] == 0:
#         print("Winner: " + backup_pred_set.iloc[i, 0])
#     print('Probability of ' + backup_pred_set.iloc[i, 1] + ' winning: ', '%.3f'%(classifier.predict_proba(pred_set)[i][2]))
#     print('Probability of Draw: ', '%.3f'%(classifier.predict_proba(pred_set)[i][1]))
#     print('Probability of ' + backup_pred_set.iloc[i, 0] + ' winning: ', '%.3f'%(classifier.predict_proba(pred_set)[i][0]))
#     print("")