In [1]:
# run dependencies
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from functools import reduce
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor

In [2]:
pd.set_option('display.max_columns', None)

In [3]:
# IMPORT CSV FILES 

# -- PER COUNTRY FEATURES --
rank = pd.read_csv("Features/fifa_rank.csv")
player_avg = pd.read_csv("Features/player_averages_for_teams.csv")
win_ratio = pd.read_csv("Features/ratio_played_vs_won.csv")
delta_pts = pd.read_csv("Features/total_score_margins.csv")
games = pd.read_csv("Features/adv_df_pivot.csv")
player_top_features = pd.read_csv("Features/player_top_features_country.csv")

# -- OTHER FEATURES --
df_matches = pd.read_csv("Features/wc_matches.csv")
schedule = pd.read_csv('Data/schedule.csv')

# IDENTIFY 2022 WORLD CUP TEAMS
teams_2022 = ['Qatar', 'Netherlands', 'Senegal', 'Ecuador', 
            'England', 'United States', 'Wales', 'Iran', 
            'Argentina', 'Poland', 'Mexico', 'Saudi Arabia', 
            'France', 'Denmark', 'Tunisia', 'Australia', 
            'Germany', 'Spain', 'Japan', 'Costa Rica', 
            'Belgium', 'Croatia', 'Canada', 'Morocco', 
            'Brazil', 'Switzerland', 'Serbia', 'Cameroon', 
            'Portugal', 'Uruguay', 'Ghana', 'Korea Republic']

#import dataset that you will be predicting upon (Target = predicting a win, draw, or loss)
#But also, the match data needs to be organized differently.
# df_matches = pd.read_csv("Features/wc_matches.csv")
# df_matches

In [4]:
# CLEAN UP

rank = rank.drop(['Unnamed: 0', 'Points'], axis=1)
player_avg = player_avg.rename(columns={"Nationality": "Country"})
win_ratio = win_ratio.drop(['Unnamed: 0'], axis=1)
delta_pts = delta_pts.drop(['Unnamed: 0'], axis=1)
games = games.rename(columns={"country": "Country"})
player_top_features = player_top_features.drop(['Unnamed: 0'], axis=1)

In [5]:
# VIEW DATAFRAMES

# rank.head()
# player_avg.head()
# win_ratio.head()
# delta_pts.head()
# games.head()
# player_top_features.head()

In [6]:
# MERGE DATAFRAMES ON "COUNTRY"

 # list of dataframes
# dfs = [rank, player_avg, win_ratio, delta_pts, games]
dfs = [rank, player_top_features, win_ratio, delta_pts, games]

# MERGING
compiled_df = reduce(lambda  left,right: pd.merge(left,right,on=["Country"],
                                            how='outer'), dfs)
# FILLING NULLS WITH 0
compiled_df = compiled_df.fillna(0)

In [7]:
# limit the dataframe rows to the relevant teams playing in the 2022 world cup
compiled_df = compiled_df[(compiled_df['Country'].isin(teams_2022))]
compiled_df.head()

Unnamed: 0,Rank,Country,avg_Composure,avg_Overall,avg_Potential,avg_Volleys,avg_Reactions,avg_FKAcc,avg_Jumping,avg_Penalties,avg_Aggression,avg_SprintSpeed,avg_Crossing,avg_LongPass,avg_Age,avg_GKPositioning,avg_Marking,avg_Agility,avg_GKKicking,avg_BallControl,avg_Curve,avg_SlideTackle,med_Composure,med_Overall,med_Potential,med_Volleys,med_Reactions,med_FKAcc,med_Jumping,med_Penalties,med_Aggression,med_SprintSpeed,med_Crossing,med_LongPass,med_Age,med_GKPositioning,med_Marking,med_Agility,med_GKKicking,med_BallControl,med_Curve,med_SlideTackle,max_Composure,max_Overall,max_Potential,max_Volleys,max_Reactions,max_FKAcc,max_Jumping,max_Penalties,max_Aggression,max_SprintSpeed,max_Crossing,max_LongPass,max_Age,max_GKPositioning,max_Marking,max_Agility,max_GKKicking,max_BallControl,max_Curve,max_SlideTackle,min_Composure,min_Overall,min_Potential,min_Volleys,min_Reactions,min_FKAcc,min_Jumping,min_Penalties,min_Aggression,min_SprintSpeed,min_Crossing,min_LongPass,min_Age,min_GKPositioning,min_Marking,min_Agility,min_GKKicking,min_BallControl,min_Curve,min_SlideTackle,std_Composure,std_Overall,std_Potential,std_Volleys,std_Reactions,std_FKAcc,std_Jumping,std_Penalties,std_Aggression,std_SprintSpeed,std_Crossing,std_LongPass,std_Age,std_GKPositioning,std_Marking,std_Agility,std_GKKicking,std_BallControl,std_Curve,std_SlideTackle,total_wins,total_games,ratio_won,win_margins,lose_margins,final,group stage,quarter-finals,round of 16,second group stage,semi-finals,third-place match
0,1,Brazil,64.8362,70.9672,73.1028,49.5706,68.139,49.5435,65.8554,54.7412,60.4011,66.4497,55.0915,57.7311,26.1627,15.8542,49.2576,65.4712,15.8927,64.1299,52.9774,49.1616,66.0,71.0,72.0,53.0,68.0,50.0,67.0,57.0,63.0,69.0,60.0,61.0,26.0,11.0,56.0,68.0,11.0,68.0,56.0,56.0,93.0,91.0,93.0,87.0,91.0,89.0,92.0,92.0,94.0,95.0,89.0,87.0,43.0,91.0,88.0,96.0,93.0,95.0,90.0,87.0,15.0,51.0,56.0,4.0,34.0,7.0,30.0,10.0,13.0,16.0,6.0,9.0,18.0,3.0,6.0,21.0,3.0,8.0,7.0,7.0,10.729,5.4933,5.5457,18.1326,7.6497,17.9845,12.5662,15.7556,17.2814,14.3189,18.0939,15.5242,4.2563,17.179,23.7528,14.7489,16.3274,15.7501,18.5041,22.9348,56.0,77.0,0.727273,3.790698,-1.8,4.0,39.0,9.0,9.0,8.0,5.0,3.0
1,2,Belgium,60.6567,67.6767,73.9033,44.65,62.58,45.2967,64.9667,48.0433,55.3533,63.55,52.84,56.4,24.4967,18.2667,45.99,64.53,18.0867,60.6033,48.9433,46.4567,62.0,67.0,74.0,47.0,63.0,46.0,67.0,49.0,60.5,67.0,59.0,61.0,24.0,11.0,52.0,67.5,12.0,65.0,53.0,55.0,91.0,91.0,91.0,83.0,91.0,83.0,92.0,87.0,88.0,92.0,94.0,93.0,41.0,85.0,88.0,95.0,80.0,92.0,85.0,90.0,21.0,51.0,54.0,4.0,39.0,8.0,31.0,10.0,14.0,15.0,8.0,12.0,17.0,5.0,6.0,22.0,5.0,10.0,10.0,7.0,14.1556,6.8573,5.3868,19.9948,9.7427,19.465,11.1613,16.7004,18.2724,15.0786,20.0845,15.6835,4.953,19.5782,21.7907,14.8343,18.8649,17.8913,20.0689,21.9221,21.0,42.0,0.5,2.5,-1.903846,0.0,27.0,3.0,6.0,2.0,2.0,2.0
2,3,Argentina,60.8926,67.8063,72.7284,45.2558,63.9053,44.3305,65.6379,51.4263,57.4863,64.3337,51.1758,54.6558,26.4421,16.5453,47.12,64.4547,16.3621,60.9432,49.6242,46.1274,62.0,68.0,73.0,47.0,64.0,43.0,66.0,53.0,60.0,67.0,55.0,59.0,26.0,11.0,51.0,67.0,11.0,65.0,51.0,52.0,96.0,93.0,93.0,88.0,94.0,94.0,94.0,88.0,95.0,94.0,88.0,91.0,42.0,82.0,84.0,94.0,83.0,96.0,93.0,81.0,14.0,52.0,57.0,5.0,35.0,8.0,17.0,10.0,13.0,15.0,8.0,12.0,17.0,2.0,5.0,23.0,3.0,9.0,8.0,8.0,12.5168,6.3838,5.5847,17.7896,8.469,17.0677,11.8576,15.5284,18.4255,14.4028,17.9087,15.4327,5.156,17.6885,19.9285,14.8388,17.2703,16.9946,18.633,21.4892,39.0,65.0,0.6,2.823529,-2.059524,4.0,36.0,6.0,8.0,8.0,3.0,0.0
3,4,France,60.0427,67.2386,73.5875,43.5765,62.4125,43.6123,64.7535,49.8111,56.7038,64.4016,51.8549,55.3529,24.6382,15.7197,47.7575,63.1789,15.4831,61.005,49.329,48.0308,61.0,67.0,73.0,44.0,62.0,42.0,65.0,51.0,59.5,67.0,56.0,59.0,24.0,11.0,54.0,65.0,11.0,65.0,52.0,57.0,90.0,90.0,95.0,87.0,92.0,87.0,92.0,86.0,90.0,96.0,88.0,91.0,39.0,84.0,90.0,93.0,82.0,90.0,86.0,88.0,20.0,50.0,58.0,3.0,30.0,6.0,26.0,7.0,11.0,15.0,9.0,13.0,17.0,3.0,4.0,21.0,2.0,10.0,6.0,8.0,11.7619,6.9645,6.0742,17.8418,9.532,17.1862,12.328,15.5527,16.9628,14.4239,18.1683,15.0856,4.7396,16.5057,21.2941,14.5739,15.8776,16.4098,18.166,21.962,29.0,50.0,0.58,3.57619,-1.4,3.0,27.0,5.0,5.0,2.0,5.0,2.0
4,5,England,56.4206,63.2728,70.3153,40.3194,59.032,40.427,64.7312,46.6166,54.6713,64.9692,47.6219,50.5922,24.4671,15.452,46.1722,63.5364,15.4561,56.2565,45.0157,45.1978,57.0,63.0,70.0,41.0,59.0,39.0,65.0,48.0,57.0,68.0,51.0,52.0,24.0,11.0,52.0,66.0,11.0,60.0,46.0,52.0,91.0,89.0,93.0,85.0,91.0,91.0,93.0,92.0,95.0,94.0,93.0,89.0,41.0,83.0,84.0,94.0,87.0,91.0,92.0,90.0,22.0,47.0,48.0,5.0,31.0,7.0,30.0,7.0,12.0,16.0,8.0,11.0,17.0,2.0,5.0,21.0,5.0,7.0,9.0,10.0,10.8998,7.0593,6.1797,15.9093,8.767,16.0784,11.1274,14.6914,16.4375,13.9696,17.0172,13.847,4.6874,15.5678,19.0616,13.8629,15.4897,14.9067,16.5647,19.8853,22.0,30.0,0.733333,3.232143,-1.064706,0.0,30.0,6.0,7.0,2.0,2.0,2.0


In [8]:
df_matches = df_matches.reset_index(drop=True)
df_matches.loc[df_matches.winning_team == df_matches.home_team,'winning_team']=2
df_matches.loc[df_matches.winning_team == 'draw', 'winning_team']=1
df_matches.loc[df_matches.winning_team == df_matches.away_team, 'winning_team']=0

df_matches.columns

Index(['Unnamed: 0', 'date', 'home_team', 'away_team', 'home_score',
       'away_score', 'tournament', 'city', 'country', 'neutral',
       'winning_team', 'goal_difference', 'year'],
      dtype='object')

In [9]:
df_matches_hometeams = df_matches.drop(columns = ["Unnamed: 0", "date", "away_team", "home_score", "away_score", "tournament", "city", "country", "neutral"], axis = 1)
df_matches_awayteams = df_matches.drop(columns = ["Unnamed: 0", "date", "home_team", "home_score", "away_score", "tournament", "city", "country", "neutral"], axis = 1)

In [10]:
df_matches_awayteams = df_matches_awayteams.rename(columns = {"away_team":"Country"})
df_matches_awayteams

Unnamed: 0,Country,winning_team,goal_difference,year
0,United States,0,3,1930
1,Mexico,2,3,1930
2,Yugoslavia,0,1,1930
3,France,2,1,1930
4,Mexico,2,3,1930
...,...,...,...,...
4177,United States,2,2,2022
4178,Canada,2,1,2022
4179,Ukraine,2,1,2022
4180,Peru,1,0,2022


In [11]:
df_matches_hometeams = df_matches_hometeams.rename(columns = {"home_team":"Country"})
df_matches_hometeams

Unnamed: 0,Country,winning_team,goal_difference,year
0,Belgium,0,3,1930
1,France,2,3,1930
2,Brazil,0,1,1930
3,Argentina,2,1,1930
4,Chile,2,3,1930
...,...,...,...,...
4177,Costa Rica,2,2,2022
4178,Panama,2,1,2022
4179,Wales,2,1,2022
4180,Australia,1,0,2022


In [12]:
hist_country_matches = df_matches_hometeams.append(df_matches_awayteams, ignore_index=True)
hist_country_matches

Unnamed: 0,Country,winning_team,goal_difference,year
0,Belgium,0,3,1930
1,France,2,3,1930
2,Brazil,0,1,1930
3,Argentina,2,1,1930
4,Chile,2,3,1930
...,...,...,...,...
8359,United States,2,2,2022
8360,Canada,2,1,2022
8361,Ukraine,2,1,2022
8362,Peru,1,0,2022


In [13]:
# Create new columns with ranking position of each team
# schedule.insert(1, 'first_position', schedule['Home Team'].map(rank.set_index('Country')['Rank']))
# schedule.insert(2, 'second_position', schedule['Away Team'].map(rank.set_index('Country')['Rank']))

# We only need the group stage games, so we have to slice the dataset
schedule_grp = schedule.iloc[:48, :]
# schedule_grp

# MERGING COMPILED_DF WITH SCHEDULE_GRP TO GET THE COUNTRY FEATURES PER MATCHUP
compiled_histmatches_beta = pd.merge(hist_country_matches, compiled_df, how="left", left_on = "Country", right_on = "Country")
# compiled_histmatches_gamma = pd.merge(compiled_histmatches_beta, compiled_df, how="left", left_on = "Country", right_on = "Country")

compiled_histmatches = compiled_histmatches_beta
compiled_histmatches.info()
# unpivot to have row by row of matches, qtar v ecaudor and equador v qatar

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8364 entries, 0 to 8363
Columns: 117 entries, Country to third-place match
dtypes: float64(113), int64(2), object(2)
memory usage: 7.5+ MB


In [14]:
compiled_histmatches_beta

Unnamed: 0,Country,winning_team,goal_difference,year,Rank,avg_Composure,avg_Overall,avg_Potential,avg_Volleys,avg_Reactions,avg_FKAcc,avg_Jumping,avg_Penalties,avg_Aggression,avg_SprintSpeed,avg_Crossing,avg_LongPass,avg_Age,avg_GKPositioning,avg_Marking,avg_Agility,avg_GKKicking,avg_BallControl,avg_Curve,avg_SlideTackle,med_Composure,med_Overall,med_Potential,med_Volleys,med_Reactions,med_FKAcc,med_Jumping,med_Penalties,med_Aggression,med_SprintSpeed,med_Crossing,med_LongPass,med_Age,med_GKPositioning,med_Marking,med_Agility,med_GKKicking,med_BallControl,med_Curve,med_SlideTackle,max_Composure,max_Overall,max_Potential,max_Volleys,max_Reactions,max_FKAcc,max_Jumping,max_Penalties,max_Aggression,max_SprintSpeed,max_Crossing,max_LongPass,max_Age,max_GKPositioning,max_Marking,max_Agility,max_GKKicking,max_BallControl,max_Curve,max_SlideTackle,min_Composure,min_Overall,min_Potential,min_Volleys,min_Reactions,min_FKAcc,min_Jumping,min_Penalties,min_Aggression,min_SprintSpeed,min_Crossing,min_LongPass,min_Age,min_GKPositioning,min_Marking,min_Agility,min_GKKicking,min_BallControl,min_Curve,min_SlideTackle,std_Composure,std_Overall,std_Potential,std_Volleys,std_Reactions,std_FKAcc,std_Jumping,std_Penalties,std_Aggression,std_SprintSpeed,std_Crossing,std_LongPass,std_Age,std_GKPositioning,std_Marking,std_Agility,std_GKKicking,std_BallControl,std_Curve,std_SlideTackle,total_wins,total_games,ratio_won,win_margins,lose_margins,final,group stage,quarter-finals,round of 16,second group stage,semi-finals,third-place match
0,Belgium,0,3,1930,2.0,60.6567,67.6767,73.9033,44.6500,62.5800,45.2967,64.9667,48.0433,55.3533,63.5500,52.8400,56.4000,24.4967,18.2667,45.9900,64.5300,18.0867,60.6033,48.9433,46.4567,62.0,67.0,74.0,47.0,63.0,46.0,67.0,49.0,60.5,67.0,59.0,61.0,24.0,11.0,52.0,67.5,12.0,65.0,53.0,55.0,91.0,91.0,91.0,83.0,91.0,83.0,92.0,87.0,88.0,92.0,94.0,93.0,41.0,85.0,88.0,95.0,80.0,92.0,85.0,90.0,21.0,51.0,54.0,4.0,39.0,8.0,31.0,10.0,14.0,15.0,8.0,12.0,17.0,5.0,6.0,22.0,5.0,10.0,10.0,7.0,14.1556,6.8573,5.3868,19.9948,9.7427,19.4650,11.1613,16.7004,18.2724,15.0786,20.0845,15.6835,4.9530,19.5782,21.7907,14.8343,18.8649,17.8913,20.0689,21.9221,21.0,42.0,0.500000,2.500000,-1.903846,0.0,27.0,3.0,6.0,2.0,2.0,2.0
1,France,2,3,1930,4.0,60.0427,67.2386,73.5875,43.5765,62.4125,43.6123,64.7535,49.8111,56.7038,64.4016,51.8549,55.3529,24.6382,15.7197,47.7575,63.1789,15.4831,61.0050,49.3290,48.0308,61.0,67.0,73.0,44.0,62.0,42.0,65.0,51.0,59.5,67.0,56.0,59.0,24.0,11.0,54.0,65.0,11.0,65.0,52.0,57.0,90.0,90.0,95.0,87.0,92.0,87.0,92.0,86.0,90.0,96.0,88.0,91.0,39.0,84.0,90.0,93.0,82.0,90.0,86.0,88.0,20.0,50.0,58.0,3.0,30.0,6.0,26.0,7.0,11.0,15.0,9.0,13.0,17.0,3.0,4.0,21.0,2.0,10.0,6.0,8.0,11.7619,6.9645,6.0742,17.8418,9.5320,17.1862,12.3280,15.5527,16.9628,14.4239,18.1683,15.0856,4.7396,16.5057,21.2941,14.5739,15.8776,16.4098,18.1660,21.9620,29.0,50.0,0.580000,3.576190,-1.400000,3.0,27.0,5.0,5.0,2.0,5.0,2.0
2,Brazil,0,1,1930,1.0,64.8362,70.9672,73.1028,49.5706,68.1390,49.5435,65.8554,54.7412,60.4011,66.4497,55.0915,57.7311,26.1627,15.8542,49.2576,65.4712,15.8927,64.1299,52.9774,49.1616,66.0,71.0,72.0,53.0,68.0,50.0,67.0,57.0,63.0,69.0,60.0,61.0,26.0,11.0,56.0,68.0,11.0,68.0,56.0,56.0,93.0,91.0,93.0,87.0,91.0,89.0,92.0,92.0,94.0,95.0,89.0,87.0,43.0,91.0,88.0,96.0,93.0,95.0,90.0,87.0,15.0,51.0,56.0,4.0,34.0,7.0,30.0,10.0,13.0,16.0,6.0,9.0,18.0,3.0,6.0,21.0,3.0,8.0,7.0,7.0,10.7290,5.4933,5.5457,18.1326,7.6497,17.9845,12.5662,15.7556,17.2814,14.3189,18.0939,15.5242,4.2563,17.1790,23.7528,14.7489,16.3274,15.7501,18.5041,22.9348,56.0,77.0,0.727273,3.790698,-1.800000,4.0,39.0,9.0,9.0,8.0,5.0,3.0
3,Argentina,2,1,1930,3.0,60.8926,67.8063,72.7284,45.2558,63.9053,44.3305,65.6379,51.4263,57.4863,64.3337,51.1758,54.6558,26.4421,16.5453,47.1200,64.4547,16.3621,60.9432,49.6242,46.1274,62.0,68.0,73.0,47.0,64.0,43.0,66.0,53.0,60.0,67.0,55.0,59.0,26.0,11.0,51.0,67.0,11.0,65.0,51.0,52.0,96.0,93.0,93.0,88.0,94.0,94.0,94.0,88.0,95.0,94.0,88.0,91.0,42.0,82.0,84.0,94.0,83.0,96.0,93.0,81.0,14.0,52.0,57.0,5.0,35.0,8.0,17.0,10.0,13.0,15.0,8.0,12.0,17.0,2.0,5.0,23.0,3.0,9.0,8.0,8.0,12.5168,6.3838,5.5847,17.7896,8.4690,17.0677,11.8576,15.5284,18.4255,14.4028,17.9087,15.4327,5.1560,17.6885,19.9285,14.8388,17.2703,16.9946,18.6330,21.4892,39.0,65.0,0.600000,2.823529,-2.059524,4.0,36.0,6.0,8.0,8.0,3.0,0.0
4,Chile,2,3,1930,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8359,United States,2,2,2022,14.0,55.0921,63.5703,70.2583,38.8338,59.1867,36.7238,64.3683,44.1867,53.7596,62.4757,43.5473,49.8645,24.8338,19.5601,43.3657,60.9923,20.0537,53.5294,43.4194,42.0128,56.0,64.0,70.0,41.0,60.0,36.0,65.0,45.0,57.0,66.0,48.0,53.0,24.0,11.0,50.0,64.0,12.0,60.0,47.0,49.0,83.0,81.0,88.0,79.0,82.0,73.0,94.0,76.0,85.0,92.0,79.0,79.0,38.0,79.0,75.0,91.0,73.0,83.0,80.0,76.0,19.0,49.0,54.0,4.0,24.0,8.0,30.0,10.0,15.0,15.0,7.0,15.0,17.0,5.0,5.0,20.0,5.0,9.0,8.0,6.0,11.4126,6.1138,5.6602,17.2003,8.8198,15.8861,12.0725,15.6416,17.4941,15.9639,18.3742,14.7307,4.8601,20.2187,21.3712,15.6009,19.9565,18.1314,18.6485,20.9510,5.0,18.0,0.277778,2.500000,-2.166667,0.0,21.0,1.0,4.0,0.0,0.0,0.0
8360,Canada,2,1,2022,43.0,55.2597,62.8312,69.7273,38.7273,58.1039,34.9610,64.6623,44.0779,55.5844,62.2857,42.7792,48.7662,24.5974,18.0649,43.1429,60.8831,16.7403,52.9221,43.4156,42.3377,56.0,63.0,69.0,39.0,58.0,34.0,66.0,45.0,59.0,65.0,44.0,52.0,23.0,12.0,52.0,63.0,10.0,59.0,45.0,52.0,83.0,81.0,89.0,71.0,79.0,69.0,92.0,78.0,82.0,96.0,73.0,75.0,41.0,74.0,77.0,92.0,72.0,79.0,72.0,79.0,23.0,50.0,56.0,5.0,30.0,9.0,30.0,10.0,18.0,16.0,11.0,17.0,18.0,5.0,6.0,22.0,5.0,12.0,9.0,10.0,11.8925,6.7149,5.8279,17.4941,8.9613,15.4370,12.3668,16.1797,17.2225,18.4311,17.8227,14.2643,4.9425,18.3721,21.1801,17.6762,17.4758,18.1142,18.5319,20.4371,0.0,3.0,0.000000,0.000000,0.000000,0.0,3.0,0.0,0.0,0.0,0.0,0.0
8361,Ukraine,2,1,2022,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
8362,Peru,1,0,2022,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [15]:
# MERGE COMPLIED DATA WITH DF_MATCHES
# compiled_matches_df = pd.merge(df_matches,compiled_df, left_on = "winning_team", right_on = "Country", how = "inner")
# compiled_matches_df.tail()

In [16]:
# final = pd.get_dummies(compiled_matches_df)

# # Separate X and y sets
# X = final.drop(['winning_team'], axis=1)
# y = final["winning_team"]

# y=y.astype('int')

# # Separate train and test sets
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42)

# from sklearn.ensemble import RandomForestRegressor
# classifier = RandomForestRegressor(n_estimators = 1000, random_state = 42)

# classifier.fit(X_train, y_train)

# score = classifier.score(X_train, y_train)
# score2 = classifier.score(X_test, y_test)
# print("Training set accuracy: ", '%.3f'%(score))
# print("Test set accuracy: ", '%.3f'%(score2))


In [17]:
#remove data that we consider irrelevant to predicting the feature
# df_matches = df_matches.drop(['date', 'Unnamed: 0', 'home_score', 'away_score', 'tournament', 'city', 'country', 'neutral','year'], axis=1)

In [18]:
#if we are going to split the data per country, we will need to define the match data below differently:
#each row is one of the 32 countries
#previous match data would be a column
#ie. Belgium on row, and United States, Mexico, Yugoslavia, etc on y axis
#not exactly sure if this will work

In [19]:
# define home team, away team, and winning team
# The winning team column will show "2" if the home team has won, "1" if it was a tie, and "0" if the away team has won
# The model will be builT to predict the "winning_team"

df_matches = df_matches.reset_index(drop=True)
df_matches.loc[df_matches.winning_team == df_matches.home_team,'winning_team']=2
df_matches.loc[df_matches.winning_team == 'draw', 'winning_team']=1
df_matches.loc[df_matches.winning_team == df_matches.away_team, 'winning_team']=0

# df_matches

In [20]:
# final = pd.get_dummies(df_matches, prefix=['home_team', 'away_team'], columns=['home_team', 'away_team'])

# # Separate X and y sets
# X = final.drop(['winning_team'], axis=1)
# y = final["winning_team"]

# y=y.astype('int')

# # Separate train and test sets
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42)

# from sklearn.ensemble import RandomForestRegressor
# classifier = RandomForestRegressor(n_estimators = 1000, random_state = 42)

# classifier.fit(X_train, y_train)

# score = classifier.score(X_train, y_train)
# score2 = classifier.score(X_test, y_test)
# print("Training set accuracy: ", '%.3f'%(score))
# print("Test set accuracy: ", '%.3f'%(score2))


# Running (up that hill) the model

In [21]:
compiled_histmatches

Unnamed: 0,Country,winning_team,goal_difference,year,Rank,avg_Composure,avg_Overall,avg_Potential,avg_Volleys,avg_Reactions,avg_FKAcc,avg_Jumping,avg_Penalties,avg_Aggression,avg_SprintSpeed,avg_Crossing,avg_LongPass,avg_Age,avg_GKPositioning,avg_Marking,avg_Agility,avg_GKKicking,avg_BallControl,avg_Curve,avg_SlideTackle,med_Composure,med_Overall,med_Potential,med_Volleys,med_Reactions,med_FKAcc,med_Jumping,med_Penalties,med_Aggression,med_SprintSpeed,med_Crossing,med_LongPass,med_Age,med_GKPositioning,med_Marking,med_Agility,med_GKKicking,med_BallControl,med_Curve,med_SlideTackle,max_Composure,max_Overall,max_Potential,max_Volleys,max_Reactions,max_FKAcc,max_Jumping,max_Penalties,max_Aggression,max_SprintSpeed,max_Crossing,max_LongPass,max_Age,max_GKPositioning,max_Marking,max_Agility,max_GKKicking,max_BallControl,max_Curve,max_SlideTackle,min_Composure,min_Overall,min_Potential,min_Volleys,min_Reactions,min_FKAcc,min_Jumping,min_Penalties,min_Aggression,min_SprintSpeed,min_Crossing,min_LongPass,min_Age,min_GKPositioning,min_Marking,min_Agility,min_GKKicking,min_BallControl,min_Curve,min_SlideTackle,std_Composure,std_Overall,std_Potential,std_Volleys,std_Reactions,std_FKAcc,std_Jumping,std_Penalties,std_Aggression,std_SprintSpeed,std_Crossing,std_LongPass,std_Age,std_GKPositioning,std_Marking,std_Agility,std_GKKicking,std_BallControl,std_Curve,std_SlideTackle,total_wins,total_games,ratio_won,win_margins,lose_margins,final,group stage,quarter-finals,round of 16,second group stage,semi-finals,third-place match
0,Belgium,0,3,1930,2.0,60.6567,67.6767,73.9033,44.6500,62.5800,45.2967,64.9667,48.0433,55.3533,63.5500,52.8400,56.4000,24.4967,18.2667,45.9900,64.5300,18.0867,60.6033,48.9433,46.4567,62.0,67.0,74.0,47.0,63.0,46.0,67.0,49.0,60.5,67.0,59.0,61.0,24.0,11.0,52.0,67.5,12.0,65.0,53.0,55.0,91.0,91.0,91.0,83.0,91.0,83.0,92.0,87.0,88.0,92.0,94.0,93.0,41.0,85.0,88.0,95.0,80.0,92.0,85.0,90.0,21.0,51.0,54.0,4.0,39.0,8.0,31.0,10.0,14.0,15.0,8.0,12.0,17.0,5.0,6.0,22.0,5.0,10.0,10.0,7.0,14.1556,6.8573,5.3868,19.9948,9.7427,19.4650,11.1613,16.7004,18.2724,15.0786,20.0845,15.6835,4.9530,19.5782,21.7907,14.8343,18.8649,17.8913,20.0689,21.9221,21.0,42.0,0.500000,2.500000,-1.903846,0.0,27.0,3.0,6.0,2.0,2.0,2.0
1,France,2,3,1930,4.0,60.0427,67.2386,73.5875,43.5765,62.4125,43.6123,64.7535,49.8111,56.7038,64.4016,51.8549,55.3529,24.6382,15.7197,47.7575,63.1789,15.4831,61.0050,49.3290,48.0308,61.0,67.0,73.0,44.0,62.0,42.0,65.0,51.0,59.5,67.0,56.0,59.0,24.0,11.0,54.0,65.0,11.0,65.0,52.0,57.0,90.0,90.0,95.0,87.0,92.0,87.0,92.0,86.0,90.0,96.0,88.0,91.0,39.0,84.0,90.0,93.0,82.0,90.0,86.0,88.0,20.0,50.0,58.0,3.0,30.0,6.0,26.0,7.0,11.0,15.0,9.0,13.0,17.0,3.0,4.0,21.0,2.0,10.0,6.0,8.0,11.7619,6.9645,6.0742,17.8418,9.5320,17.1862,12.3280,15.5527,16.9628,14.4239,18.1683,15.0856,4.7396,16.5057,21.2941,14.5739,15.8776,16.4098,18.1660,21.9620,29.0,50.0,0.580000,3.576190,-1.400000,3.0,27.0,5.0,5.0,2.0,5.0,2.0
2,Brazil,0,1,1930,1.0,64.8362,70.9672,73.1028,49.5706,68.1390,49.5435,65.8554,54.7412,60.4011,66.4497,55.0915,57.7311,26.1627,15.8542,49.2576,65.4712,15.8927,64.1299,52.9774,49.1616,66.0,71.0,72.0,53.0,68.0,50.0,67.0,57.0,63.0,69.0,60.0,61.0,26.0,11.0,56.0,68.0,11.0,68.0,56.0,56.0,93.0,91.0,93.0,87.0,91.0,89.0,92.0,92.0,94.0,95.0,89.0,87.0,43.0,91.0,88.0,96.0,93.0,95.0,90.0,87.0,15.0,51.0,56.0,4.0,34.0,7.0,30.0,10.0,13.0,16.0,6.0,9.0,18.0,3.0,6.0,21.0,3.0,8.0,7.0,7.0,10.7290,5.4933,5.5457,18.1326,7.6497,17.9845,12.5662,15.7556,17.2814,14.3189,18.0939,15.5242,4.2563,17.1790,23.7528,14.7489,16.3274,15.7501,18.5041,22.9348,56.0,77.0,0.727273,3.790698,-1.800000,4.0,39.0,9.0,9.0,8.0,5.0,3.0
3,Argentina,2,1,1930,3.0,60.8926,67.8063,72.7284,45.2558,63.9053,44.3305,65.6379,51.4263,57.4863,64.3337,51.1758,54.6558,26.4421,16.5453,47.1200,64.4547,16.3621,60.9432,49.6242,46.1274,62.0,68.0,73.0,47.0,64.0,43.0,66.0,53.0,60.0,67.0,55.0,59.0,26.0,11.0,51.0,67.0,11.0,65.0,51.0,52.0,96.0,93.0,93.0,88.0,94.0,94.0,94.0,88.0,95.0,94.0,88.0,91.0,42.0,82.0,84.0,94.0,83.0,96.0,93.0,81.0,14.0,52.0,57.0,5.0,35.0,8.0,17.0,10.0,13.0,15.0,8.0,12.0,17.0,2.0,5.0,23.0,3.0,9.0,8.0,8.0,12.5168,6.3838,5.5847,17.7896,8.4690,17.0677,11.8576,15.5284,18.4255,14.4028,17.9087,15.4327,5.1560,17.6885,19.9285,14.8388,17.2703,16.9946,18.6330,21.4892,39.0,65.0,0.600000,2.823529,-2.059524,4.0,36.0,6.0,8.0,8.0,3.0,0.0
4,Chile,2,3,1930,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8359,United States,2,2,2022,14.0,55.0921,63.5703,70.2583,38.8338,59.1867,36.7238,64.3683,44.1867,53.7596,62.4757,43.5473,49.8645,24.8338,19.5601,43.3657,60.9923,20.0537,53.5294,43.4194,42.0128,56.0,64.0,70.0,41.0,60.0,36.0,65.0,45.0,57.0,66.0,48.0,53.0,24.0,11.0,50.0,64.0,12.0,60.0,47.0,49.0,83.0,81.0,88.0,79.0,82.0,73.0,94.0,76.0,85.0,92.0,79.0,79.0,38.0,79.0,75.0,91.0,73.0,83.0,80.0,76.0,19.0,49.0,54.0,4.0,24.0,8.0,30.0,10.0,15.0,15.0,7.0,15.0,17.0,5.0,5.0,20.0,5.0,9.0,8.0,6.0,11.4126,6.1138,5.6602,17.2003,8.8198,15.8861,12.0725,15.6416,17.4941,15.9639,18.3742,14.7307,4.8601,20.2187,21.3712,15.6009,19.9565,18.1314,18.6485,20.9510,5.0,18.0,0.277778,2.500000,-2.166667,0.0,21.0,1.0,4.0,0.0,0.0,0.0
8360,Canada,2,1,2022,43.0,55.2597,62.8312,69.7273,38.7273,58.1039,34.9610,64.6623,44.0779,55.5844,62.2857,42.7792,48.7662,24.5974,18.0649,43.1429,60.8831,16.7403,52.9221,43.4156,42.3377,56.0,63.0,69.0,39.0,58.0,34.0,66.0,45.0,59.0,65.0,44.0,52.0,23.0,12.0,52.0,63.0,10.0,59.0,45.0,52.0,83.0,81.0,89.0,71.0,79.0,69.0,92.0,78.0,82.0,96.0,73.0,75.0,41.0,74.0,77.0,92.0,72.0,79.0,72.0,79.0,23.0,50.0,56.0,5.0,30.0,9.0,30.0,10.0,18.0,16.0,11.0,17.0,18.0,5.0,6.0,22.0,5.0,12.0,9.0,10.0,11.8925,6.7149,5.8279,17.4941,8.9613,15.4370,12.3668,16.1797,17.2225,18.4311,17.8227,14.2643,4.9425,18.3721,21.1801,17.6762,17.4758,18.1142,18.5319,20.4371,0.0,3.0,0.000000,0.000000,0.000000,0.0,3.0,0.0,0.0,0.0,0.0,0.0
8361,Ukraine,2,1,2022,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
8362,Peru,1,0,2022,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [24]:
final = compiled_histmatches.dropna().reset_index(drop=True)
final

Unnamed: 0,Country,winning_team,goal_difference,year,Rank,avg_Composure,avg_Overall,avg_Potential,avg_Volleys,avg_Reactions,avg_FKAcc,avg_Jumping,avg_Penalties,avg_Aggression,avg_SprintSpeed,avg_Crossing,avg_LongPass,avg_Age,avg_GKPositioning,avg_Marking,avg_Agility,avg_GKKicking,avg_BallControl,avg_Curve,avg_SlideTackle,med_Composure,med_Overall,med_Potential,med_Volleys,med_Reactions,med_FKAcc,med_Jumping,med_Penalties,med_Aggression,med_SprintSpeed,med_Crossing,med_LongPass,med_Age,med_GKPositioning,med_Marking,med_Agility,med_GKKicking,med_BallControl,med_Curve,med_SlideTackle,max_Composure,max_Overall,max_Potential,max_Volleys,max_Reactions,max_FKAcc,max_Jumping,max_Penalties,max_Aggression,max_SprintSpeed,max_Crossing,max_LongPass,max_Age,max_GKPositioning,max_Marking,max_Agility,max_GKKicking,max_BallControl,max_Curve,max_SlideTackle,min_Composure,min_Overall,min_Potential,min_Volleys,min_Reactions,min_FKAcc,min_Jumping,min_Penalties,min_Aggression,min_SprintSpeed,min_Crossing,min_LongPass,min_Age,min_GKPositioning,min_Marking,min_Agility,min_GKKicking,min_BallControl,min_Curve,min_SlideTackle,std_Composure,std_Overall,std_Potential,std_Volleys,std_Reactions,std_FKAcc,std_Jumping,std_Penalties,std_Aggression,std_SprintSpeed,std_Crossing,std_LongPass,std_Age,std_GKPositioning,std_Marking,std_Agility,std_GKKicking,std_BallControl,std_Curve,std_SlideTackle,total_wins,total_games,ratio_won,win_margins,lose_margins,final,group stage,quarter-finals,round of 16,second group stage,semi-finals,third-place match
0,Belgium,0,3,1930,2.0,60.6567,67.6767,73.9033,44.6500,62.5800,45.2967,64.9667,48.0433,55.3533,63.5500,52.8400,56.4000,24.4967,18.2667,45.9900,64.5300,18.0867,60.6033,48.9433,46.4567,62.0,67.0,74.0,47.0,63.0,46.0,67.0,49.0,60.5,67.0,59.0,61.0,24.0,11.0,52.0,67.5,12.0,65.0,53.0,55.0,91.0,91.0,91.0,83.0,91.0,83.0,92.0,87.0,88.0,92.0,94.0,93.0,41.0,85.0,88.0,95.0,80.0,92.0,85.0,90.0,21.0,51.0,54.0,4.0,39.0,8.0,31.0,10.0,14.0,15.0,8.0,12.0,17.0,5.0,6.0,22.0,5.0,10.0,10.0,7.0,14.1556,6.8573,5.3868,19.9948,9.7427,19.4650,11.1613,16.7004,18.2724,15.0786,20.0845,15.6835,4.9530,19.5782,21.7907,14.8343,18.8649,17.8913,20.0689,21.9221,21.0,42.0,0.500000,2.500000,-1.903846,0.0,27.0,3.0,6.0,2.0,2.0,2.0
1,France,2,3,1930,4.0,60.0427,67.2386,73.5875,43.5765,62.4125,43.6123,64.7535,49.8111,56.7038,64.4016,51.8549,55.3529,24.6382,15.7197,47.7575,63.1789,15.4831,61.0050,49.3290,48.0308,61.0,67.0,73.0,44.0,62.0,42.0,65.0,51.0,59.5,67.0,56.0,59.0,24.0,11.0,54.0,65.0,11.0,65.0,52.0,57.0,90.0,90.0,95.0,87.0,92.0,87.0,92.0,86.0,90.0,96.0,88.0,91.0,39.0,84.0,90.0,93.0,82.0,90.0,86.0,88.0,20.0,50.0,58.0,3.0,30.0,6.0,26.0,7.0,11.0,15.0,9.0,13.0,17.0,3.0,4.0,21.0,2.0,10.0,6.0,8.0,11.7619,6.9645,6.0742,17.8418,9.5320,17.1862,12.3280,15.5527,16.9628,14.4239,18.1683,15.0856,4.7396,16.5057,21.2941,14.5739,15.8776,16.4098,18.1660,21.9620,29.0,50.0,0.580000,3.576190,-1.400000,3.0,27.0,5.0,5.0,2.0,5.0,2.0
2,Brazil,0,1,1930,1.0,64.8362,70.9672,73.1028,49.5706,68.1390,49.5435,65.8554,54.7412,60.4011,66.4497,55.0915,57.7311,26.1627,15.8542,49.2576,65.4712,15.8927,64.1299,52.9774,49.1616,66.0,71.0,72.0,53.0,68.0,50.0,67.0,57.0,63.0,69.0,60.0,61.0,26.0,11.0,56.0,68.0,11.0,68.0,56.0,56.0,93.0,91.0,93.0,87.0,91.0,89.0,92.0,92.0,94.0,95.0,89.0,87.0,43.0,91.0,88.0,96.0,93.0,95.0,90.0,87.0,15.0,51.0,56.0,4.0,34.0,7.0,30.0,10.0,13.0,16.0,6.0,9.0,18.0,3.0,6.0,21.0,3.0,8.0,7.0,7.0,10.7290,5.4933,5.5457,18.1326,7.6497,17.9845,12.5662,15.7556,17.2814,14.3189,18.0939,15.5242,4.2563,17.1790,23.7528,14.7489,16.3274,15.7501,18.5041,22.9348,56.0,77.0,0.727273,3.790698,-1.800000,4.0,39.0,9.0,9.0,8.0,5.0,3.0
3,Argentina,2,1,1930,3.0,60.8926,67.8063,72.7284,45.2558,63.9053,44.3305,65.6379,51.4263,57.4863,64.3337,51.1758,54.6558,26.4421,16.5453,47.1200,64.4547,16.3621,60.9432,49.6242,46.1274,62.0,68.0,73.0,47.0,64.0,43.0,66.0,53.0,60.0,67.0,55.0,59.0,26.0,11.0,51.0,67.0,11.0,65.0,51.0,52.0,96.0,93.0,93.0,88.0,94.0,94.0,94.0,88.0,95.0,94.0,88.0,91.0,42.0,82.0,84.0,94.0,83.0,96.0,93.0,81.0,14.0,52.0,57.0,5.0,35.0,8.0,17.0,10.0,13.0,15.0,8.0,12.0,17.0,2.0,5.0,23.0,3.0,9.0,8.0,8.0,12.5168,6.3838,5.5847,17.7896,8.4690,17.0677,11.8576,15.5284,18.4255,14.4028,17.9087,15.4327,5.1560,17.6885,19.9285,14.8388,17.2703,16.9946,18.6330,21.4892,39.0,65.0,0.600000,2.823529,-2.059524,4.0,36.0,6.0,8.0,8.0,3.0,0.0
4,Uruguay,2,1,1930,13.0,58.3902,67.3844,71.9769,44.1040,64.0751,44.5289,65.3786,50.4913,57.3671,64.1474,49.7486,54.1965,26.2919,15.2688,47.8671,63.2514,15.0954,61.3179,47.4884,46.0289,59.0,67.0,72.0,44.0,65.0,43.0,66.0,51.0,59.0,66.0,52.0,57.0,26.0,11.0,54.0,67.0,11.0,64.0,48.0,52.0,85.0,87.0,90.0,90.0,92.0,82.0,92.0,90.0,89.0,91.0,81.0,84.0,39.0,82.0,88.0,92.0,77.0,85.0,87.0,86.0,22.0,52.0,55.0,5.0,36.0,6.0,27.0,10.0,15.0,15.0,8.0,12.0,17.0,2.0,6.0,22.0,4.0,10.0,8.0,9.0,12.5712,6.9857,6.0934,17.6233,9.3811,17.5615,11.0332,15.4562,17.7258,13.7309,18.1837,15.2504,5.0411,16.2038,20.7559,14.1515,15.4464,15.2434,17.9830,21.4734,13.0,21.0,0.619048,2.619048,-2.361538,0.0,24.0,3.0,5.0,0.0,2.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4960,Argentina,1,0,2022,3.0,60.8926,67.8063,72.7284,45.2558,63.9053,44.3305,65.6379,51.4263,57.4863,64.3337,51.1758,54.6558,26.4421,16.5453,47.1200,64.4547,16.3621,60.9432,49.6242,46.1274,62.0,68.0,73.0,47.0,64.0,43.0,66.0,53.0,60.0,67.0,55.0,59.0,26.0,11.0,51.0,67.0,11.0,65.0,51.0,52.0,96.0,93.0,93.0,88.0,94.0,94.0,94.0,88.0,95.0,94.0,88.0,91.0,42.0,82.0,84.0,94.0,83.0,96.0,93.0,81.0,14.0,52.0,57.0,5.0,35.0,8.0,17.0,10.0,13.0,15.0,8.0,12.0,17.0,2.0,5.0,23.0,3.0,9.0,8.0,8.0,12.5168,6.3838,5.5847,17.7896,8.4690,17.0677,11.8576,15.5284,18.4255,14.4028,17.9087,15.4327,5.1560,17.6885,19.9285,14.8388,17.2703,16.9946,18.6330,21.4892,39.0,65.0,0.600000,2.823529,-2.059524,4.0,36.0,6.0,8.0,8.0,3.0,0.0
4961,Brazil,0,4,2022,1.0,64.8362,70.9672,73.1028,49.5706,68.1390,49.5435,65.8554,54.7412,60.4011,66.4497,55.0915,57.7311,26.1627,15.8542,49.2576,65.4712,15.8927,64.1299,52.9774,49.1616,66.0,71.0,72.0,53.0,68.0,50.0,67.0,57.0,63.0,69.0,60.0,61.0,26.0,11.0,56.0,68.0,11.0,68.0,56.0,56.0,93.0,91.0,93.0,87.0,91.0,89.0,92.0,92.0,94.0,95.0,89.0,87.0,43.0,91.0,88.0,96.0,93.0,95.0,90.0,87.0,15.0,51.0,56.0,4.0,34.0,7.0,30.0,10.0,13.0,16.0,6.0,9.0,18.0,3.0,6.0,21.0,3.0,8.0,7.0,7.0,10.7290,5.4933,5.5457,18.1326,7.6497,17.9845,12.5662,15.7556,17.2814,14.3189,18.0939,15.5242,4.2563,17.1790,23.7528,14.7489,16.3274,15.7501,18.5041,22.9348,56.0,77.0,0.727273,3.790698,-1.800000,4.0,39.0,9.0,9.0,8.0,5.0,3.0
4962,Uruguay,0,2,2022,13.0,58.3902,67.3844,71.9769,44.1040,64.0751,44.5289,65.3786,50.4913,57.3671,64.1474,49.7486,54.1965,26.2919,15.2688,47.8671,63.2514,15.0954,61.3179,47.4884,46.0289,59.0,67.0,72.0,44.0,65.0,43.0,66.0,51.0,59.0,66.0,52.0,57.0,26.0,11.0,54.0,67.0,11.0,64.0,48.0,52.0,85.0,87.0,90.0,90.0,92.0,82.0,92.0,90.0,89.0,91.0,81.0,84.0,39.0,82.0,88.0,92.0,77.0,85.0,87.0,86.0,22.0,52.0,55.0,5.0,36.0,6.0,27.0,10.0,15.0,15.0,8.0,12.0,17.0,2.0,6.0,22.0,4.0,10.0,8.0,9.0,12.5712,6.9857,6.0934,17.6233,9.3811,17.5615,11.0332,15.4562,17.7258,13.7309,18.1837,15.2504,5.0411,16.2038,20.7559,14.1515,15.4464,15.2434,17.9830,21.4734,13.0,21.0,0.619048,2.619048,-2.361538,0.0,24.0,3.0,5.0,0.0,2.0,2.0
4963,United States,2,2,2022,14.0,55.0921,63.5703,70.2583,38.8338,59.1867,36.7238,64.3683,44.1867,53.7596,62.4757,43.5473,49.8645,24.8338,19.5601,43.3657,60.9923,20.0537,53.5294,43.4194,42.0128,56.0,64.0,70.0,41.0,60.0,36.0,65.0,45.0,57.0,66.0,48.0,53.0,24.0,11.0,50.0,64.0,12.0,60.0,47.0,49.0,83.0,81.0,88.0,79.0,82.0,73.0,94.0,76.0,85.0,92.0,79.0,79.0,38.0,79.0,75.0,91.0,73.0,83.0,80.0,76.0,19.0,49.0,54.0,4.0,24.0,8.0,30.0,10.0,15.0,15.0,7.0,15.0,17.0,5.0,5.0,20.0,5.0,9.0,8.0,6.0,11.4126,6.1138,5.6602,17.2003,8.8198,15.8861,12.0725,15.6416,17.4941,15.9639,18.3742,14.7307,4.8601,20.2187,21.3712,15.6009,19.9565,18.1314,18.6485,20.9510,5.0,18.0,0.277778,2.500000,-2.166667,0.0,21.0,1.0,4.0,0.0,0.0,0.0


In [51]:
# final = pd.get_dummies(compiled_histmatches, prefix=['Home Team', 'Away Team'], columns=['home_team', 'away_team'])
# final = compiled_histmatches

# Separate X and y sets
X = final.drop(['winning_team', "Country"], axis=1)
y = final["winning_team"]

y=y.astype('int')

# Separate train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42)

In [72]:
#classify data as 0, 1, or 2 (home team lost, drew, or won)
# classifier = LogisticRegression()

# RAN THIS MODEL AS A RANDOM FOREST REGRESSOR. TRAINING SCORE SIMILAR TO LOGISTIC REGRESSION, BUT ACCURACY SUFFERED
# from sklearn.ensemble import RandomForestRegressor
classifier = RandomForestClassifier(n_estimators = 1000, random_state = 42)

classifier.fit(X_train, y_train)

score = classifier.score(X_train, y_train)
score2 = classifier.score(X_test, y_test)

print("Training set accuracy: ", '%.3f'%(score))
print("Test set accuracy: ", '%.3f'%(score2))

Training set accuracy:  0.906
Test set accuracy:  0.649


Index(['Match Number', 'Round Number', 'Home Team', 'Away Team', 'Group',
       'Result'],
      dtype='object')

In [73]:
# schedule_grp[["Home Team", "Away Team", "Group"]].groupby("Group").
# print(X)

In [74]:
# pred_set= []

# for index, row in final.iterrows():
#     if row['first_position'] < row['second_position']:
#         pred_set.append({'home_team': row['Home Team'], 'away_team': row['Away Team'], 'winning_team': None})
#     else:
#         pred_set.append({'home_team': row['Away Team'], 'away_team': row['Home Team'], 'winning_team': None})
        
# pred_set = pd.DataFrame(pred_set)
# backup_pred_set = pred_set

# pred_set.head(25)

In [97]:
X_train.columns

Index(['goal_difference', 'year', 'Rank', 'avg_Composure', 'avg_Overall',
       'avg_Potential', 'avg_Volleys', 'avg_Reactions', 'avg_FKAcc',
       'avg_Jumping',
       ...
       'ratio_won', 'win_margins', 'lose_margins', 'final', 'group stage',
       'quarter-finals', 'round of 16', 'second group stage', 'semi-finals',
       'third-place match'],
      dtype='object', length=115)

In [98]:

pred_set = pred_set.drop(columns = ["Country", "winning_team"])

In [100]:
pred_set.columns

Index(['goal_difference', 'year', 'Rank', 'avg_Composure', 'avg_Overall',
       'avg_Potential', 'avg_Volleys', 'avg_Reactions', 'avg_FKAcc',
       'avg_Jumping',
       ...
       'ratio_won', 'win_margins', 'lose_margins', 'final', 'group stage',
       'quarter-finals', 'round of 16', 'second group stage', 'semi-finals',
       'third-place match'],
      dtype='object', length=115)

In [124]:
# # Get dummy variables and drop winning_team column
# pred_set = pred_set.drop(columns = ["Match Number", "Round Number", "Group", "Result"], axis = 1)

# from sklearn.preprocessing import OneHotEncoder
# oneh = OneHotEncoder(handle_unknown="ignore")
# oneh.fit(final)
# X_test = oneh.transform(schedule_grp)

pred_set = pd.get_dummies(schedule_grp, prefix=['Home Team', 'Away Team'], columns=['Home Team', 'Away Team'])

# Add missing columns compared to the model's training dataset
missing_cols = set(final.columns) - set(pred_set.columns)
for c in missing_cols:
    pred_set[c] = 0
pred_set = pred_set[final.columns]

pred_set = pred_set.drop(columns = ["Country", "winning_team"])

backup_pred_set = schedule_grp.drop(columns = ["Match Number", "Round Number", "Group"])

# Remove winning team column
# pred_set = pred_set.drop(['winning_team'], axis=1)

# pred_set = pred_set.dropna().reset_index(drop=True)
# pred_set.info(50)
backup_pred_set

  pred_set[c] = 0


Unnamed: 0,Home Team,Away Team,Result
0,Senegal,Netherlands,
1,England,Iran,
2,Qatar,Ecuador,
3,USA,Wales,
4,Argentina,Saudi Arabia,
5,Denmark,Tunisia,
6,Mexico,Poland,
7,France,Australia,
8,Morocco,Croatia,
9,Germany,Japan,


In [132]:
# compiled_schedule_grp
# schedule_grp = schedule_grp.drop(["Date", "Location"], axis = 1)
# schedule_grp.head()
final.columns
# pred_set.head()
# backup_pred_set.head()

Index(['Country', 'winning_team', 'goal_difference', 'year', 'Rank',
       'avg_Composure', 'avg_Overall', 'avg_Potential', 'avg_Volleys',
       'avg_Reactions',
       ...
       'ratio_won', 'win_margins', 'lose_margins', 'final', 'group stage',
       'quarter-finals', 'round of 16', 'second group stage', 'semi-finals',
       'third-place match'],
      dtype='object', length=117)

In [131]:
# group matches
predictions = classifier.predict(pred_set)
# predictions = classifier.predict(final)

for i in range(final.shape[0]):
    print(backup_pred_set.iloc[i, 1] + " and " + backup_pred_set.iloc[i, 0])
    if predictions[i] == 2:
        print("Winner: " + backup_pred_set.iloc[i, 1])
    elif predictions[i] == 1:
        print("Draw")
    elif predictions[i] == 0:
        print("Winner: " + backup_pred_set.iloc[i, 0])
    print('Probability of ' + backup_pred_set.iloc[i, 1] + ' winning: ', '%.3f'%(classifier.predict_proba(pred_set)[i][2]))
    print('Probability of Draw: ', '%.3f'%(classifier.predict_proba(pred_set)[i][1]))
    print('Probability of ' + backup_pred_set.iloc[i, 0] + ' winning: ', '%.3f'%(classifier.predict_proba(pred_set)[i][0]))
    print("")

Netherlands and Senegal
Draw
Probability of Netherlands winning:  0.135
Probability of Draw:  0.763
Probability of Senegal winning:  0.102

Iran and England
Draw
Probability of Iran winning:  0.135
Probability of Draw:  0.763
Probability of England winning:  0.102

Ecuador and Qatar
Draw
Probability of Ecuador winning:  0.135
Probability of Draw:  0.763
Probability of Qatar winning:  0.102

Wales and USA
Draw
Probability of Wales winning:  0.135
Probability of Draw:  0.763
Probability of USA winning:  0.102

Saudi Arabia and Argentina
Draw
Probability of Saudi Arabia winning:  0.135
Probability of Draw:  0.763
Probability of Argentina winning:  0.102

Tunisia and Denmark
Draw
Probability of Tunisia winning:  0.135
Probability of Draw:  0.763
Probability of Denmark winning:  0.102

Poland and Mexico
Draw
Probability of Poland winning:  0.135
Probability of Draw:  0.763
Probability of Mexico winning:  0.102

Australia and France
Draw
Probability of Australia winning:  0.135
Probability o

IndexError: single positional indexer is out-of-bounds

In [None]:
# I THINK WE NEED TO SAVE THIS OUTPUT AS A VARIABLE. 
# FOR GAMES WHERE THE PROBABILITY OF WINNING IS THE SAME FOR EITHER TEAM, CALL THAT A TRUE DRAW
# OTHERWISE, CAN WE ASSUME THAT THE TEAM WITH THE HIGHER PROBABILITY OF WINNING WOULD ADVANCE TO KNOCKOUT?

# GROUPSTAGE_PTS = 0

# FOR EACH IN OUTCOMES:
#     IF DF[FIRST-COUNTRY] WIN PROBABILITY > DF[SECOND-COUNTRY] WIN PROBABILITY:
#         GROUPSTAGE_PTS =+ 3
#     IF DF[SECOND-COUNTRY] WIN PROBABILITY > DF[FIRST-COUNTRY] WIN PROBABILITY:
#         GROUPSTAGE_PTS =+3
#     ELSEIF DF[SECOND-COUNTRY] WIN PROBABILITY == DF[FIRST-COUNTRY] WIN PROBABILITY:
#         GROUPSTAGE_PTS += 1
#        
# then sum the groupstage points per group to get the top 2 teams from each, which advance to the the round of 16, and so on

In [None]:
# WINNERS OF GROUP A MATCHES
schedule_grpA = schedule[schedule["Group"]=="Group A"]
schedule_grpA = schedule_grpA.drop(columns = ["Match Number", "Round Number", "Date", "Location", "Result"], axis = 1)
schedule_grpA

grpA_hometeams = schedule_grpA["Home Team"]
# grpA_hometeams

# PREDICT IF HOME TEAM WILL BEAT AWAY TEAM
predictions = classifier.predict(grpA_hometeams)
for i in range(schedule.shape[0]):
    print(backup_pred_set.iloc[i, 1] + " and " + backup_pred_set.iloc[i, 0])
    if predictions[i] == 2:
        print("Winner: " + backup_pred_set.iloc[i, 1])
    elif predictions[i] == 1:
        print("Draw")
    elif predictions[i] == 0:
        print("Winner: " + backup_pred_set.iloc[i, 0])
    print('Probability of ' + backup_pred_set.iloc[i, 1] + ' winning: ', '%.3f'%(classifier.predict_proba(pred_set)[i][2]))
    print('Probability of Draw: ', '%.3f'%(classifier.predict_proba(pred_set)[i][1]))
    print('Probability of ' + backup_pred_set.iloc[i, 0] + ' winning: ', '%.3f'%(classifier.predict_proba(pred_set)[i][0]))
    print("")

# pred_setA = []

# for index, row in schedule_grpA.iterrows():
    
#     if row['first_position'] < row['second_position']:
#         pred_set.append({'home_team': row['Home Team'], 'away_team': row['Away Team'], 'winning_team': None})
#     else:
#         pred_set.append({'home_team': row['Away Team'], 'away_team': row['Home Team'], 'winning_team': None})
        
# pred_setA = pd.DataFrame(pred_setA)
# backup_pred_setA = pred_setA

# pred_setA.head(25)




# # # Get dummy variables and drop winning_team column
# pred_set = pd.get_dummies(pred_set, prefix=['home_team', 'away_team'], columns=['home_team', 'away_team'])

# # Add missing columns compared to the model's training dataset
# missing_cols = set(final.columns) - set(pred_set.columns)
# for c in missing_cols:
#     pred_set[c] = 0
# pred_set = pred_set[final.columns]

# # Remove winning team column
# pred_set = pred_set.drop(['winning_team'], axis=1)

# pred_set.head()






# predictions_grpA = classifier.predict(pred_set)
# for i in range(schedule.shape[0]):
#     print(backup_pred_set.iloc[i, 1] + " and " + backup_pred_set.iloc[i, 0])
#     if predictions[i] == 2:
#         print("Winner: " + backup_pred_set.iloc[i, 1])
#     elif predictions[i] == 1:
#         print("Draw")
#     elif predictions[i] == 0:
#         print("Winner: " + backup_pred_set.iloc[i, 0])
#     print('Probability of ' + backup_pred_set.iloc[i, 1] + ' winning: ', '%.3f'%(classifier.predict_proba(pred_set)[i][2]))
#     print('Probability of Draw: ', '%.3f'%(classifier.predict_proba(pred_set)[i][1]))
#     print('Probability of ' + backup_pred_set.iloc[i, 0] + ' winning: ', '%.3f'%(classifier.predict_proba(pred_set)[i][0]))
#     print("")