In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import matplotlib.ticker as plticker

from functools import reduce
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor


from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler

In [3]:
# IMPORT CSV FILES 

# -- PER COUNTRY FEATURES --
rank = pd.read_csv("Features/fifa_rank.csv")
player_avg = pd.read_csv("Features/player_averages_for_teams.csv")
win_ratio = pd.read_csv("Features/ratio_played_vs_won.csv")
delta_pts = pd.read_csv("Features/total_score_margins.csv")
games = pd.read_csv("Features/adv_df_pivot.csv")
player_top_features = pd.read_csv("Features/player_top_features_country.csv")

# -- OTHER FEATURES --
df_matches = pd.read_csv("Features/wc_matches.csv")
schedule = pd.read_csv('Data/schedule.csv')

# IDENTIFY 2022 WORLD CUP TEAMS
teams_2022 = ['Qatar', 'Netherlands', 'Senegal', 'Ecuador', 
            'England', 'United States', 'Wales', 'Iran', 
            'Argentina', 'Poland', 'Mexico', 'Saudi Arabia', 
            'France', 'Denmark', 'Tunisia', 'Australia', 
            'Germany', 'Spain', 'Japan', 'Costa Rica', 
            'Belgium', 'Croatia', 'Canada', 'Morocco', 
            'Brazil', 'Switzerland', 'Serbia', 'Cameroon', 
            'Portugal', 'Uruguay', 'Ghana', 'Korea Republic']

## Data Clean UP and Merging

In [4]:
# CLEAN UP

rank = rank.drop(['Unnamed: 0', 'Points'], axis=1)
player_avg = player_avg.rename(columns={"Nationality": "Country"})
win_ratio = win_ratio.drop(['Unnamed: 0'], axis=1)
delta_pts = delta_pts.drop(['Unnamed: 0'], axis=1)
games = games.rename(columns={"country": "Country"})
player_top_features = player_top_features.drop(['Unnamed: 0'], axis=1)

In [5]:
# MERGE DATAFRAMES ON "COUNTRY"

 # list of dataframes
# dfs = [rank, player_avg, win_ratio, delta_pts, games]
dfs = [rank, player_top_features, win_ratio, delta_pts, games]

# MERGING
compiled_df = reduce(lambda  left,right: pd.merge(left,right,on=["Country"],
                                            how='outer'), dfs)
# FILLING NULLS WITH 0
compiled_df = compiled_df.fillna(0)
# compiled_df["Country"].unique()

In [6]:
# limit the dataframe rows to the relevant teams playing in the 2022 world cup
compiled_df = compiled_df[(compiled_df['Country'].isin(teams_2022))]
compiled_df.head()

Unnamed: 0,Rank,Country,avg_Composure,avg_Overall,avg_Potential,avg_Volleys,avg_Reactions,total_wins,total_games,ratio_won,win_margins,lose_margins,final,group stage,quarter-finals,round of 16,second group stage,semi-finals,third-place match
0,1.0,Brazil,64.8362,70.9672,73.1028,49.5706,68.139,56.0,77.0,0.727273,3.790698,-1.8,4.0,39.0,9.0,9.0,8.0,5.0,3.0
1,2.0,Belgium,60.6567,67.6767,73.9033,44.65,62.58,21.0,42.0,0.5,2.5,-1.903846,0.0,27.0,3.0,6.0,2.0,2.0,2.0
2,3.0,Argentina,60.8926,67.8063,72.7284,45.2558,63.9053,39.0,65.0,0.6,2.823529,-2.059524,4.0,36.0,6.0,8.0,8.0,3.0,0.0
3,4.0,France,60.0427,67.2386,73.5875,43.5765,62.4125,29.0,50.0,0.58,3.57619,-1.4,3.0,27.0,5.0,5.0,2.0,5.0,2.0
4,5.0,England,56.4206,63.2728,70.3153,40.3194,59.032,22.0,30.0,0.733333,3.232143,-1.064706,0.0,30.0,6.0,7.0,2.0,2.0,2.0


In [7]:
df_matches = df_matches.replace("South Korea", "Korea Republic")
# df_matches["home_team"].unique()

In [8]:
df_matches_home = df_matches[df_matches['home_team'].isin(teams_2022)]
df_matches_away = df_matches[df_matches['away_team'].isin(teams_2022)]
df_matches2 = pd.concat((df_matches_home, df_matches_away))
df_matches2.drop_duplicates()
df_matches2.count()

Unnamed: 0         5150
date               5150
home_team          5150
away_team          5150
home_score         5150
away_score         5150
tournament         5150
city               5150
country            5150
neutral            5150
winning_team       5150
goal_difference    5150
year               5150
dtype: int64

In [9]:
df_matches2 = df_matches2.drop(columns = "Unnamed: 0", axis = 1)
# df_matches2.head()

In [10]:
df_matches2 = df_matches2.drop(["date", "home_score", "away_score", "tournament", "city", "country", "goal_difference", "neutral", "year"], axis = 1)
df_matches2.head()

Unnamed: 0,home_team,away_team,winning_team
0,Belgium,United States,United States
1,France,Mexico,France
2,Brazil,Yugoslavia,Yugoslavia
3,Argentina,France,Argentina
6,Uruguay,Peru,Uruguay


In [11]:
#Building the model
#the prediction label: 
# The winning_team column will show "2" if the home team has won, "1" if it was a tie, and "0" if the away team has won.

# df_matches2 = df_matches2.reset_index(drop=True)
df_matches2.loc[df_matches2.winning_team == df_matches2.home_team,'winning_team']=2
df_matches2.loc[df_matches2.winning_team == 'draw', 'winning_team']=1
df_matches2.loc[df_matches2.winning_team == df_matches2.away_team, 'winning_team']=0

df_matches2.head()

Unnamed: 0,home_team,away_team,winning_team
0,Belgium,United States,0
1,France,Mexico,2
2,Brazil,Yugoslavia,0
3,Argentina,France,2
6,Uruguay,Peru,2


## ADD IN COUNTRY FEATURES HERE?

In [12]:
all_feautres_home = pd.merge(df_matches2, compiled_df, left_on  = "home_team", right_on = "Country", how ="left" )
all_feautres_away = pd.merge(df_matches2, compiled_df, left_on  = "away_team", right_on = "Country", how ="left" )


# GROUP STAGE
all_features = all_feautres_home.append(all_feautres_away, ignore_index = True)
all_features = all_features.dropna()
all_features["Country"].nunique()



32

In [13]:
final = pd.get_dummies(all_features, prefix=['home_team', 'away_team'], columns=['home_team', 'away_team'])
final.head()

Unnamed: 0,winning_team,Rank,Country,avg_Composure,avg_Overall,avg_Potential,avg_Volleys,avg_Reactions,total_wins,total_games,...,away_team_Uruguay,away_team_Uzbekistan,away_team_Vanuatu,away_team_Venezuela,away_team_Vietnam,away_team_Wales,away_team_Yemen,away_team_Yugoslavia,away_team_Zambia,away_team_Zimbabwe
0,0,2.0,Belgium,60.6567,67.6767,73.9033,44.65,62.58,21.0,42.0,...,0,0,0,0,0,0,0,0,0,0
1,2,4.0,France,60.0427,67.2386,73.5875,43.5765,62.4125,29.0,50.0,...,0,0,0,0,0,0,0,0,0,0
2,0,1.0,Brazil,64.8362,70.9672,73.1028,49.5706,68.139,56.0,77.0,...,0,0,0,0,0,0,0,1,0,0
3,2,3.0,Argentina,60.8926,67.8063,72.7284,45.2558,63.9053,39.0,65.0,...,0,0,0,0,0,0,0,0,0,0
4,2,13.0,Uruguay,58.3902,67.3844,71.9769,44.104,64.0751,13.0,21.0,...,0,0,0,0,0,0,0,0,0,0


In [14]:
final = pd.get_dummies(all_features, prefix=['home_team', 'away_team'], columns=['home_team', 'away_team'])
final.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6842 entries, 0 to 10299
Columns: 403 entries, winning_team to away_team_Zimbabwe
dtypes: float64(18), object(2), uint8(383)
memory usage: 3.6+ MB


In [15]:
#convert home team and away team from categorical variables to continous inputs 
# Get dummy variables

# Separate X and y sets
X = final.drop(['winning_team', "Country"], axis=1)
y = final["winning_team"]
y = y.astype('int')

# Separate train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42)

# final.info()
# y.value_counts()

In [16]:
final.info()
y.value_counts()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6842 entries, 0 to 10299
Columns: 403 entries, winning_team to away_team_Zimbabwe
dtypes: float64(18), object(2), uint8(383)
memory usage: 3.6+ MB


2    3372
0    1834
1    1636
Name: winning_team, dtype: int64

In [17]:

scaler = preprocessing.StandardScaler().fit(X_train)
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [18]:
RandomForestClassifier = RandomForestClassifier()
RandomForestClassifier.fit(X_train, y_train)
score = RandomForestClassifier.score(X_train_scaled, y_train)
score2 = RandomForestClassifier.score(X_test_scaled, y_test)

print("Training set accuracy: ", '%.3f'%(score))
print("Test set accuracy: ", '%.3f'%(score2))

Training set accuracy:  0.573
Test set accuracy:  0.559


In [19]:
#adding Fifa rankings
#the team which is positioned higher on the FIFA Ranking will be considered "favourite" for the match
#and therefore, will be positioned under the "home_teams" column
#since there are no "home" or "away" teams in World Cup games. 

# Loading new datasets
# ranking = pd.read_csv('datasets/fifa_rankings.csv') 
fixtures = pd.read_csv('Data/schedule.csv')

# List for storing the group stage games

pred_set_16 = []
pred_set_qtrs = []
pred_set_semi = []
pred_set_final = []

## Group Stages

In [20]:
# loading datasets
country_features = compiled_df
# country_features_qtrs = compiled_df
# country_features_semi = compiled_df
# country_features_final = compiled_df


fixtures = schedule

# list for storing group stage games
pred_set = []



In [21]:
# may not need the .inster and .map functions as we're trying to use more than one feature
# cannot not specify a single columns as currently written

# ------
# Create new columns with ranking position of each team
fixtures.insert(1, 'first_position', fixtures['Home Team'].map(country_features .set_index('Country')['avg_Composure']))
fixtures.insert(2, 'second_position', fixtures['Away Team'].map(country_features .set_index('Country')['avg_Composure']))

# We only need the group stage games, so we have to slice the dataset
fixtures = fixtures.iloc[:48, :]
fixtures.tail()

Unnamed: 0,Match Number,first_position,second_position,Round Number,Date,Location,Home Team,Away Team,Group,Result
43,44,61.9655,56.1481,3,1/12/2022 19:00,Al Bayt Stadium,Costa Rica,Germany,Group E,
44,45,60.9836,58.3902,3,2/12/2022 15:00,Al Janoub Stadium,Ghana,Uruguay,Group H,
45,46,55.5819,64.4681,3,2/12/2022 15:00,Education City Stadium,Korea Republic,Portugal,Group H,
46,47,62.2406,57.6938,3,2/12/2022 19:00,Stadium 974,Serbia,Switzerland,Group G,
47,48,60.5294,64.8362,3,2/12/2022 19:00,Lusail Stadium,Cameroon,Brazil,Group G,


In [22]:

# Loop to add teams to new prediction dataset based on the ranking position of each team
for index, row in fixtures.iterrows():
    if row['first_position'] < row['second_position']:
        pred_set.append({'home_team': row['Home Team'], 'away_team': row['Away Team'], 'winning_team': None})
    else:
        pred_set.append({'home_team': row['Away Team'], 'away_team': row['Home Team'], 'winning_team': None})
        
pred_set = pd.DataFrame(pred_set)
backup_pred_set = pred_set

pred_set.head()

Unnamed: 0,home_team,away_team,winning_team
0,Netherlands,Senegal,
1,England,Iran,
2,Qatar,Ecuador,
3,Wales,USA,
4,Saudi Arabia,Argentina,


In [23]:
# Get dummy variables and drop winning_team column
pred_set = pd.get_dummies(pred_set, prefix=['home_team', 'away_team'], columns=['home_team', 'away_team'])

# Add missing columns compared to the model's training dataset
missing_cols = set(final.columns) - set(pred_set.columns)
for c in missing_cols:
    pred_set[c] = 0
pred_set = pred_set[final.columns]

# Remove winning team column
pred_set = pred_set.drop(['winning_team', "Country"], axis=1)

pred_set.head()

  pred_set[c] = 0


Unnamed: 0,Rank,avg_Composure,avg_Overall,avg_Potential,avg_Volleys,avg_Reactions,total_wins,total_games,ratio_won,win_margins,...,away_team_Uruguay,away_team_Uzbekistan,away_team_Vanuatu,away_team_Venezuela,away_team_Vietnam,away_team_Wales,away_team_Yemen,away_team_Yugoslavia,away_team_Zambia,away_team_Zimbabwe
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [24]:
fixtures2 = fixtures.drop(columns = ["first_position", "second_position", "Date", "Location", "Result"], axis =1)

## Group Stage Predictions

In [56]:
#group matches 
predictions = RandomForestClassifier.predict(pred_set)

outcome_df = []



for i in range(fixtures.shape[0]):
    print(backup_pred_set.iloc[i, 1] + " and " + backup_pred_set.iloc[i, 0])
    
    if predictions[i] == 2:
        print("Winner: " + backup_pred_set.iloc[i, 1])
    elif predictions[i] == 1:
        print("Draw")
    elif predictions[i] == 0:
        print("Winner: " + backup_pred_set.iloc[i, 0])
        
    
    print('Probability of ' + backup_pred_set.iloc[i, 1] + ' winning: ', '%.3f'%(RandomForestClassifier.predict_proba(pred_set)[i][2]))
    print('Probability of Draw: ', '%.3f'%(RandomForestClassifier.predict_proba(pred_set)[i][1]))
    print('Probability of ' + backup_pred_set.iloc[i, 0] + ' winning: ', '%.3f'%(RandomForestClassifier.predict_proba(pred_set)[i][0]))
    print("")
    
    outcome_df.append({"Team_1": backup_pred_set.iloc[i, 1], "Team_1_Prob": RandomForestClassifier.predict_proba(pred_set)[i][2], "Team_2":backup_pred_set.iloc[i, 0], "Team_2_Prob":RandomForestClassifier.predict_proba(pred_set)[i][0], "Draw_Prob":RandomForestClassifier.predict_proba(pred_set)[i][1]})


array([2, 1, 2, 2, 2, 2, 2, 2, 2, 0, 2, 2, 2, 1, 2, 0, 0, 2, 0, 0, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 2, 0, 2, 2, 2, 2, 2, 2, 0, 2,
       2, 2, 2, 0])

## Round of 16

In [34]:
# IDENTIFYING WHO IS IN THE ROUND OF 16
# PER THE WORLD CUP, AT THE GORUP STAGE, EACH TEAM PLAYS 3 GAMES:
    # THREE POINTS FOR A WIN
    # ONE POINT FOR A DRAW
    # ZERO POINTS FOR LOSING
# THE TOP TWO TEAMS FROM EACH GROUP ADVANCE

outcome_df = pd.DataFrame(outcome_df)
outcome_df

def find_winner(row):
    if row["Team_1_Prob"] > row["Team_2_Prob"] and row["Team_1_Prob"] > row["Draw_Prob"]:
        val = row["Team_1"]
    elif row["Team_2_Prob"] > row["Team_1_Prob"] and row["Team_2_Prob"] > row["Draw_Prob"]:
        val = row["Team_2"]
    else:
        val = "Draw"
    return val

def assign_points(row):
    if row["Team"] == row["outcome"]:
        val = 3
    elif row["outcome"] == "Draw":
        val = 1
    else:
        val = 0
    return val


outcome_df["outcome"] = outcome_df.apply(find_winner, axis = 1)

outcome_df_team1 = outcome_df.drop(columns = ["Team_1_Prob", "Team_2", "Team_2_Prob", "Draw_Prob"], axis = 1)
outcome_df_team1 = outcome_df_team1.rename(columns = {"Team_1":"Team"})
outcome_df_team1["outcome_points"] = outcome_df_team1.apply(assign_points, axis = 1)
outcome_df_team1 = outcome_df_team1.groupby("Team").sum("outcome_points")


outcome_df_team2 = outcome_df.drop(columns = ["Team_2_Prob", "Team_1", "Team_1_Prob", "Draw_Prob"], axis = 1)
outcome_df_team2 = outcome_df_team2.rename(columns = {"Team_2":"Team"})
outcome_df_team2["outcome_points"] = outcome_df_team2.apply(assign_points, axis = 1)
outcome_df_team2 = outcome_df_team2.groupby("Team").sum("outcome_points")

all_outcomes = pd.concat([outcome_df_team1,outcome_df_team2])
all_outcomes = all_outcomes.groupby("Team").sum("outcome_points")
all_outcomes

Unnamed: 0_level_0,outcome_points
Team,Unnamed: 1_level_1
Argentina,9
Australia,0
Belgium,3
Brazil,3
Cameroon,6
Canada,0
Costa Rica,9
Croatia,6
Denmark,3
Ecuador,6


In [42]:
# GETTING THE GROUP ASSIGNMENTS - THE TOP 2 TEAMS IN EACH GROUP WILL ADVANCE TO THE KNOCK-OUT STAGES
fixtures_grps_h = fixtures.drop(columns = ["Match Number","Away Team", "first_position", "second_position", "Round Number", "Date", "Location", "Result"], axis = 1)
fixtures_grps_h = fixtures_grps_h.rename(columns = {"Home Team": "Team"})

fixtures_grps_a = fixtures.drop(columns = ["Match Number","Home Team", "first_position", "second_position", "Round Number", "Date", "Location", "Result"], axis = 1)
fixtures_grps_a = fixtures_grps_a.rename(columns = {"Away Team": "Team"})

fixtures_grps = pd.concat([fixtures_grps_h,fixtures_grps_a ]).drop_duplicates()
# len(fixtures_grps)

In [36]:
outcomes_by_grp = all_outcomes.merge(fixtures_grps, left_on = "Team", right_on = "Team", how = "outer")
outcomes_by_grp.sort_values(["Group", "outcome_points"])

# SEGMENTING THE TOP TWO TEAMS FROM EACH GROUP AND ASSIGNING POSITIONS FOR ROUND OF 16
outcomes_A = outcomes_by_grp[outcomes_by_grp.iloc[:,2] == "Group A"].nlargest(2, ["outcome_points"])
outcomes_B = outcomes_by_grp[outcomes_by_grp.iloc[:,2] == "Group B"].nlargest(2, ["outcome_points"])
outcomes_C = outcomes_by_grp[outcomes_by_grp.iloc[:,2] == "Group C"].nlargest(2, ["outcome_points"])
outcomes_D = outcomes_by_grp[outcomes_by_grp.iloc[:,2] == "Group D"].nlargest(2, ["outcome_points"])
outcomes_E = outcomes_by_grp[outcomes_by_grp.iloc[:,2] == "Group E"].nlargest(2, ["outcome_points"])
outcomes_F = outcomes_by_grp[outcomes_by_grp.iloc[:,2] == "Group F"].nlargest(2, ["outcome_points"])
outcomes_G = outcomes_by_grp[outcomes_by_grp.iloc[:,2] == "Group G"].nlargest(2, ["outcome_points"])
outcomes_H = outcomes_by_grp[outcomes_by_grp.iloc[:,2] == "Group H"].nlargest(2, ["outcome_points"])

# THIS ONLY WORKS BECAUSE THE DATAFRAME IS ALREADY SORTED CORRECTLY 
# It's only way I can think of to handle when both of the top two teams have the same point values
round_16_nums = pd.Series(["1A", "2A", "1B", "2B", "1C", "2C", "1D", "2D", "1E", "2E", "1F", "2F", "1G", "2G", "1H", "2H"], name = "R16_num")

outcomes_top_two = pd.concat([outcomes_A,outcomes_B, outcomes_C,outcomes_D, outcomes_E, outcomes_F, outcomes_G, outcomes_H])
outcomes_top_two.reset_index(inplace = True)
outcomes_top_two.drop(columns = ["index"])
round_16_df = pd.merge(outcomes_top_two, round_16_nums, left_index=True, right_index=True)
# round_16_df
round_16_list = round_16_df["Team"].tolist()
round_16_list

['Senegal',
 'Ecuador',
 'USA',
 'Wales',
 'Argentina',
 'Mexico',
 'Tunisia',
 'France',
 'Costa Rica',
 'Japan',
 'Morocco',
 'Croatia',
 'Serbia',
 'Cameroon',
 'Portugal',
 'Ghana']

In [37]:
fixtures_16 = schedule[schedule["Round Number"] == "Round of 16"]
fixtures_16_h = pd.merge(fixtures_16, round_16_df[["Team", "R16_num"]], left_on = "Home Team", right_on = "R16_num", how = "left")
fixtures_16_h = fixtures_16_h.rename(columns = {"Home Team": "Matchup_H", "Team":"Home Team"})
fixtures_16_a = pd.merge(fixtures_16_h, round_16_df[["Team", "R16_num"]], left_on = "Away Team", right_on = "R16_num", how = "left")
fixtures_16_a = fixtures_16_a.rename(columns = {"Away Team": "Matchup_A", "Team":"Away Team"})
fixtures_16_a = fixtures_16_a.drop(columns = ["Round Number","first_position", "second_position", "Date", "Location", "Matchup_H", "Matchup_A", "Group", "R16_num_x", "R16_num_y"], axis = 1)
fixtures_16_a

Unnamed: 0,Match Number,Result,Home Team,Away Team
0,49,,Senegal,Wales
1,50,,Argentina,France
2,52,,Tunisia,Mexico
3,51,,USA,Ecuador
4,53,,Costa Rica,Croatia
5,54,,Serbia,Ghana
6,55,,Morocco,Japan
7,56,,Portugal,Cameroon


In [44]:
pred_16 = fixtures_16_a.drop("Match Number", axis='columns')
pred_16.columns
backup_pred_16 = pred_16
pred_16

Unnamed: 0,Result,Home Team,Away Team
0,,Senegal,Wales
1,,Argentina,France
2,,Tunisia,Mexico
3,,USA,Ecuador
4,,Costa Rica,Croatia
5,,Serbia,Ghana
6,,Morocco,Japan
7,,Portugal,Cameroon


In [45]:
# Get dummy variables and drop winning_team column
pred_16 = pd.get_dummies(pred_16, prefix=['Home Team', 'Away Team'], columns=['Home Team', 'Away Team'])

# Add missing columns compared to the model's training dataset
missing_cols = set(final.columns) - set(pred_16.columns)
for c in missing_cols:
    pred_16[c] = 0
pred_16 = pred_16[final.columns]
pred_16
# # Remove winning team column
pred_16 = pred_16.drop(['winning_team', "Country"], axis=1)



  pred_16[c] = 0


In [46]:
#group matches 
predictions = RandomForestClassifier.predict(pred_16)

outcome_df_16 = []

predictions
# for i in range(backup_pred_16.shape[0]):
#     print(backup_pred_16.iloc[i, 1] + " and " + backup_pred_16.iloc[i, 0])
    
#     if predictions[i] == 2:
#         print("Winner: " + backup_pred_16.iloc[i, 1])
#     elif predictions[i] == 1:
#         print("Draw")
#     elif predictions[i] == 0:
#         print("Winner: " + backup_pred_16.iloc[i, 0])
        
    
#     print('Probability of ' + backup_pred_16.iloc[i, 1] + ' winning: ', '%.3f'%(RandomForestClassifier.predict_proba(pred_16)[i][2]))
#     print('Probability of Draw: ', '%.3f'%(RandomForestClassifier.predict_proba(pred_16)[i][1]))
#     print('Probability of ' + backup_pred_16.iloc[i, 0] + ' winning: ', '%.3f'%(RandomForestClassifier.predict_proba(pred_16)[i][0]))
#     print("")
    
#     outcome_df_16.append({"Team_1": backup_pred_16.iloc[i, 1], "Team_1_Prob": RandomForestClassifier.predict_proba(pred_16)[i][2], "Team_2":backup_pred_16.iloc[i, 0], "Team_2_Prob":RandomForestClassifier.predict_proba(pred_16)[i][0], "Draw_Prob":RandomForestClassifier.predict_proba(pred_16)[i][1]})
    

array([2, 2, 2, 2, 2, 2, 2, 2])

In [47]:
# List of matches
pred_8 = [('Senegal', 'Argentina'),
            ('Tunisia', 'USA'),
            ('Costa Rica', 'Serbia'),
            ('Morocco', 'Portugal')]

In [48]:
# Get dummy variables and drop winning_team column
pred_8 = pd.get_dummies(pred_8, prefix=['Home Team', 'Away Team'], columns=['Home Team', 'Away Team'])

# Add missing columns compared to the model's training dataset
missing_cols = set(final.columns) - set(pred_8.columns)
for c in missing_cols:
    pred_8[c] = 0
pred_8 = pred_8[final.columns]
pred_8
# # Remove winning team column
pred_8 = pred_8.drop(['winning_team', "Country"], axis=1)
pred_8

  pred_8[c] = 0


Unnamed: 0,Rank,avg_Composure,avg_Overall,avg_Potential,avg_Volleys,avg_Reactions,total_wins,total_games,ratio_won,win_margins,...,away_team_Uruguay,away_team_Uzbekistan,away_team_Vanuatu,away_team_Venezuela,away_team_Vietnam,away_team_Wales,away_team_Yemen,away_team_Yugoslavia,away_team_Zambia,away_team_Zimbabwe
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [49]:
#group matches 
predictions = RandomForestClassifier.predict(pred_8)

outcome_df_8 = []

predictions
# for i in range(backup_pred_8.shape[0]):
#     print(backup_pred_8.iloc[i, 1] + " and " + backup_pred_8.iloc[i, 0])
    
#     if predictions[i] == 2:
#         print("Winner: " + backup_pred_8.iloc[i, 1])
#     elif predictions[i] == 1:
#         print("Draw")
#     elif predictions[i] == 0:
#         print("Winner: " + backup_pred_8.iloc[i, 0])
        
    
#     print('Probability of ' + backup_pred_8.iloc[i, 1] + ' winning: ', '%.3f'%(RandomForestClassifier.predict_proba(pred_8)[i][2]))
#     print('Probability of Draw: ', '%.3f'%(RandomForestClassifier.predict_proba(pred_8)[i][1]))
#     print('Probability of ' + backup_pred_8.iloc[i, 0] + ' winning: ', '%.3f'%(RandomForestClassifier.predict_proba(pred_8)[i][0]))
#     print("")
    
#     outcome_df_8.append({"Team_1": backup_pred_8.iloc[i, 1], "Team_1_Prob": RandomForestClassifier.predict_proba(pred_8)[i][2], "Team_2":backup_pred_8.iloc[i, 0], "Team_2_Prob":RandomForestClassifier.predict_proba(pred_8)[i][0], "Draw_Prob":RandomForestClassifier.predict_proba(pred_8)[i][1]})
    

array([2, 2, 2, 2])

In [50]:
# List of matches
pred_4 = [('Senegal', 'Tunisia'),
            ('Costa Rica', 'Morocca')]

In [51]:
# Get dummy variables and drop winning_team column
pred_4 = pd.get_dummies(pred_4, prefix=['Home Team', 'Away Team'], columns=['Home Team', 'Away Team'])

# Add missing columns compared to the model's training dataset
missing_cols = set(final.columns) - set(pred_4.columns)
for c in missing_cols:
    pred_4[c] = 0
pred_4 = pred_4[final.columns]
pred_4
# # Remove winning team column
pred_4 = pred_4.drop(['winning_team', "Country"], axis=1)
pred_4

  pred_4[c] = 0


Unnamed: 0,Rank,avg_Composure,avg_Overall,avg_Potential,avg_Volleys,avg_Reactions,total_wins,total_games,ratio_won,win_margins,...,away_team_Uruguay,away_team_Uzbekistan,away_team_Vanuatu,away_team_Venezuela,away_team_Vietnam,away_team_Wales,away_team_Yemen,away_team_Yugoslavia,away_team_Zambia,away_team_Zimbabwe
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [52]:
#group matches 
predictions = RandomForestClassifier.predict(pred_4)

outcome_df_4 = []

predictions
# for i in range(backup_pred_4.shape[0]):
#     print(backup_pred_4.iloc[i, 1] + " and " + backup_pred_4.iloc[i, 0])
    
#     if predictions[i] == 2:
#         print("Winner: " + backup_pred_4.iloc[i, 1])
#     elif predictions[i] == 1:
#         print("Draw")
#     elif predictions[i] == 0:
#         print("Winner: " + backup_pred_4.iloc[i, 0])
        
    
#     print('Probability of ' + backup_pred_4.iloc[i, 1] + ' winning: ', '%.3f'%(RandomForestClassifier.predict_proba(pred_4)[i][2]))
#     print('Probability of Draw: ', '%.3f'%(RandomForestClassifier.predict_proba(pred_4)[i][1]))
#     print('Probability of ' + backup_pred_4.iloc[i, 0] + ' winning: ', '%.3f'%(RandomForestClassifier.predict_proba(pred_4)[i][0]))
#     print("")
    
#     outcome_df_4.append({"Team_1": backup_pred_4.iloc[i, 1], "Team_1_Prob": RandomForestClassifier.predict_proba(pred_4)[i][2], "Team_2":backup_pred_4.iloc[i, 0], "Team_2_Prob":RandomForestClassifier.predict_proba(pred_4)[i][0], "Draw_Prob":RandomForestClassifier.predict_proba(pred_4)[i][1]})
    

array([2, 2])

In [53]:
# List of matches
pred_2 = [('Senegal', 'Costa Rica')]

In [54]:
# Get dummy variables and drop winning_team column
pred_2 = pd.get_dummies(pred_2, prefix=['Home Team', 'Away Team'], columns=['Home Team', 'Away Team'])

# Add missing columns compared to the model's training dataset
missing_cols = set(final.columns) - set(pred_2.columns)
for c in missing_cols:
    pred_2[c] = 0
pred_2 = pred_2[final.columns]
pred_2
# # Remove winning team column
pred_2 = pred_2.drop(['winning_team', "Country"], axis=1)
pred_2

  pred_2[c] = 0


Unnamed: 0,Rank,avg_Composure,avg_Overall,avg_Potential,avg_Volleys,avg_Reactions,total_wins,total_games,ratio_won,win_margins,...,away_team_Uruguay,away_team_Uzbekistan,away_team_Vanuatu,away_team_Venezuela,away_team_Vietnam,away_team_Wales,away_team_Yemen,away_team_Yugoslavia,away_team_Zambia,away_team_Zimbabwe
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [55]:
#group matches 
predictions = RandomForestClassifier.predict(pred_2)

outcome_df_2 = []

predictions
# for i in range(backup_pred_4.shape[0]):
#     print(backup_pred_4.iloc[i, 1] + " and " + backup_pred_4.iloc[i, 0])
    
#     if predictions[i] == 2:
#         print("Winner: " + backup_pred_4.iloc[i, 1])
#     elif predictions[i] == 1:
#         print("Draw")
#     elif predictions[i] == 0:
#         print("Winner: " + backup_pred_4.iloc[i, 0])
        
    
#     print('Probability of ' + backup_pred_4.iloc[i, 1] + ' winning: ', '%.3f'%(RandomForestClassifier.predict_proba(pred_4)[i][2]))
#     print('Probability of Draw: ', '%.3f'%(RandomForestClassifier.predict_proba(pred_4)[i][1]))
#     print('Probability of ' + backup_pred_4.iloc[i, 0] + ' winning: ', '%.3f'%(RandomForestClassifier.predict_proba(pred_4)[i][0]))
#     print("")
    
#     outcome_df_4.append({"Team_1": backup_pred_4.iloc[i, 1], "Team_1_Prob": RandomForestClassifier.predict_proba(pred_4)[i][2], "Team_2":backup_pred_4.iloc[i, 0], "Team_2_Prob":RandomForestClassifier.predict_proba(pred_4)[i][0], "Draw_Prob":RandomForestClassifier.predict_proba(pred_4)[i][1]})
    

array([2])

In [41]:
#Winner is Senegal

In [None]:
# # ROUND OF 16 -----------------------------------------------------------------------------------


# all_features_16 = all_feautres_home.append(all_feautres_away, ignore_index = True).isin(round_16_list)
# all_features_16 = all_features_16.dropna()
# all_features_16["Country"].nunique()

# final_16 = pd.get_dummies(all_features_16, prefix=['home_team', 'away_team'], columns=['home_team', 'away_team'])

# # Separate X and y sets
# X16 = final_16.drop(['winning_team', "Country"], axis=1)
# y16 = final_16["winning_team"]
# y16 = y16.astype('int')

In [None]:
# # Separate train and test sets
# X16_train, X16_test, y16_train, y16_test = train_test_split(X16, y16, test_size=0.30, random_state=42)

In [None]:
# # SCALING THE DATA
# scaler16 = preprocessing.StandardScaler().fit(X16_train)
# scaler16 = StandardScaler().fit(X16_train)
# X16_train_scaled = scaler16.transform(X16_train)
# X16_test_scaled = scaler16.transform(X16_test)

# # FITTING THE MODEL
# logreg_16 = LogisticRegression()
# logreg_16.fit(X16_train, y16_train)
# score_16 = logreg_16.score(X16_train_scaled, y16_train)
# score2_16 = logreg_16.score(X16_test_scaled, y16_test)

# # SCORING
# print("Training set accuracy: ", '%.3f'%(score_16))
# print("Test set accuracy: ", '%.3f'%(score2_16))

In [None]:
# country_features_16 = compiled_df[(compiled_df['Country'].isin(round_16_list))]
# all_features_16["winning_team"]

In [None]:
# # Create new columns with ranking position of each team
# fixtures_16_a.insert(1, 'first_position', fixtures_16_a['Home Team'].map(country_features_16 .set_index('Country')['avg_Composure']))
# fixtures_16_a.insert(2, 'second_position', fixtures_16_a['Away Team'].map(country_features_16 .set_index('Country')['avg_Composure']))

In [None]:
# # Loop to add teams to new prediction dataset based on the ranking position of each team
# for index, row in fixtures_16_a.iterrows():
#     if row['first_position'] < row['second_position']:
#         pred_set_16.append({'home_team': row['Home Team'], 'away_team': row['Away Team'], 'winning_team': None})
#     else:
#         pred_set_16.append({'home_team': row['Away Team'], 'away_team': row['Home Team'], 'winning_team': None})
        
# pred_set_16 = pd.DataFrame(pred_set_16)
# backup_pred_set_16 = pred_set_16

# pred_set_16.head()

In [None]:
# final_16

In [None]:
# # Get dummy variables and drop winning_team column
# pred_set_16 = pd.get_dummies(pred_set_16, prefix=['home_team', 'away_team'], columns=['home_team', 'away_team'])

# # Add missing columns compared to the model's training dataset
# missing_cols = set(final_16.columns) - set(pred_set_16.columns)
# for c in missing_cols:
#     pred_set[c] = 0
# pred_set_16 = pred_set_16[final_16.columns]

# # Remove winning team column
# pred_set_16 = pred_set_16.drop(['winning_team', "Country"], axis=1)

# pred_set_16.head()

# pred_set_16 = pred_set_16.dropna()

In [None]:
# # ROUND OF 16 MATCHES 
# predictions_16 = logreg.predict(pred_set_16)

# outcome16_df = []

# for i in range(fixtures_16_a.shape[0]):
#     print(backup_pred_set_16.iloc[i, 1] + " and " + backup_pred_set_16.iloc[i, 0])
    
#     if predictions_16[i] == 2:
#         print("Winner: " + backup_pred_set_16.iloc[i, 1])
#     elif predictions_16[i] == 1:
#         print("Draw")
#     elif predictions_16[i] == 0:
#         print("Winner: " + backup_pred_set_16.iloc[i, 0])
        
    
#     print('Probability of ' + backup_pred_set_16.iloc[i, 1] + ' winning: ', '%.3f'%(logreg.predict_proba(pred_set_16)[i][2]))
#     print('Probability of Draw: ', '%.3f'%(logreg.predict_proba(pred_set_16)[i][1]))
#     print('Probability of ' + backup_pred_set_16.iloc[i, 0] + ' winning: ', '%.3f'%(logreg.predict_proba(pred_set_16)[i][0]))
#     print("")
    
#     outcome16_df.append({"Team_1": backup_pred_set_16.iloc[i, 1], "Team_1_Prob": logreg.predict_proba(pred_set_16)[i][2], "Team_2":backup_pred_set_16.iloc[i, 0], "Team_2_Prob":logreg.predict_proba(pred_set_16)[i][0], "Draw_Prob":logreg.predict_proba(pred_set_16)[i][1]})
    