In [1]:
import numpy as np
import pandas as pd
from sklearn import *
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

# Initial CSV reads.
# Paths are written with forward slashes so the relative locations resolve on
# Windows, macOS and Linux alike (backslash separators are only understood on
# Windows). This is consistent with the forward-slash path used when the
# predictions are exported.
# NOTE(review): encoding='latin-1' is kept as-is; confirm it matches how the
# CSV files were saved.

xGS_Final = pd.read_csv('CSVs/International/xGS_Final.csv', encoding='latin-1')
xGA_Final = pd.read_csv('CSVs/International/xGA_Final.csv', encoding='latin-1')
World_Cup_Teams = pd.read_csv('CSVs/World_Cup_Teams.csv', encoding='latin-1')
xGS_Predictors = pd.read_csv('CSVs/xGS_Predictors.csv', encoding='latin-1')
xGA_Predictors = pd.read_csv('CSVs/xGA_Predictors.csv', encoding='latin-1')

# Loading regression models (Ridge is linear; GradientBoostingRegressor is a tree-based ensemble)
from sklearn.linear_model import Ridge
from sklearn.ensemble import GradientBoostingRegressor


# Train/test splitting utility
from sklearn.model_selection import train_test_split

# Display options: remove pandas' column and row limits so frames are shown
# without truncation.
for _display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(_display_option, None)

In [2]:
# Get features and target for both models.
# The identifier columns ('Key', 'Squad') and the target column are removed from
# each feature matrix. A later cell writes the fitted predictions back onto
# these same frames as 'xGS' / 'xGA'; those columns are dropped here as well
# (only when present) so that re-running this cell after fitting does not leak
# the predictions into the features or change the number of feature columns.

xGS_X = (
    xGS_Final
    .drop(columns=['Key', 'Squad', 'Gls'])
    .drop(columns=['xGS'], errors='ignore')
    .values
)
xGS_y = xGS_Final['Gls'].values

xGA_X = (
    xGA_Final
    .drop(columns=['Key', 'Squad', 'O. Gls'])
    .drop(columns=['xGA'], errors='ignore')
    .values
)
xGA_y = xGA_Final['O. Gls'].values

# Use the Sklearn `train_test_split()` function to split the data into
# training (80%) and testing (20%) data, with a fixed seed for repeatability.
xGS_X_train, xGS_X_test, xGS_y_train, xGS_y_test = train_test_split(
    xGS_X, xGS_y, random_state=25, test_size=.2
)

xGA_X_train, xGA_X_test, xGA_y_train, xGA_y_test = train_test_split(
    xGA_X, xGA_y, random_state=25, test_size=.2
)

In [3]:
# Initiate ML models - both the xGS and the xGA model are Ridge regressors.
# A GradientBoostingRegressor(random_state=25, n_estimators=600,
# learning_rate=.05) configuration was left commented out beside the xGS model
# in the original cell; it is not the active model.

xGS_ML = Ridge(random_state=22)
xGA_ML = Ridge(random_state=22)

# Fit each model and get predictions, then add the predictions to each full
# DataFrame.
# NOTE(review): the models are fitted on, and predict for, the complete
# feature matrices - the train/test split is not used here, so these are
# in-sample predictions rather than an out-of-sample check.

xGS_ML_Fit = xGS_ML.fit(xGS_X, xGS_y)
xGA_ML_Fit = xGA_ML.fit(xGA_X, xGA_y)

xGS_preds = xGS_ML_Fit.predict(xGS_X)
xGA_preds = xGA_ML_Fit.predict(xGA_X)

xGS_Final['xGS'] = xGS_preds
xGA_Final['xGA'] = xGA_preds

# Add xGS and xGA to the World Cup teams DataFrame, matching rows on 'Key'.
# Any 'xGS' / 'xGA' columns left over from a previous run of this cell are
# dropped first: merging onto a frame that already carries them would create
# suffixed duplicates ('xGS_x', 'xGS_y', ...) instead of refreshing the values.
# The inner joins keep only teams whose 'Key' is present in both model tables.
World_Cup_Teams = World_Cup_Teams.drop(columns=['xGS', 'xGA'], errors='ignore')
World_Cup_Teams = pd.merge(World_Cup_Teams, xGS_Final[['Key', 'xGS']], on='Key', how='inner')
World_Cup_Teams = pd.merge(World_Cup_Teams, xGA_Final[['Key', 'xGA']], on='Key', how='inner')

In [4]:
# Score the predictor tables with the fitted models.
# 'Squad' and the target column are removed; everything else is passed to the
# model as a plain array.
# NOTE(review): the arrays carry no column names, so the remaining predictor
# columns are assumed to be in the same order as the training features - confirm.
xGS_Simulation_vals = xGS_Predictors.drop(columns=['Squad', 'Gls']).values
xGS_Simulation_Preds = xGS_ML_Fit.predict(xGS_Simulation_vals)

xGA_Simulation_vals = xGA_Predictors.drop(columns=['Squad', 'O. Gls']).values
xGA_Simulation_Preds = xGA_ML_Fit.predict(xGA_Simulation_vals)

In [5]:
# Overwrite the xGS / xGA columns with the predictions made from the predictor
# tables.
# NOTE(review): the arrays are assigned by position, so the predictor tables
# are assumed to list the squads in the same row order as World_Cup_Teams -
# confirm.
World_Cup_Teams = World_Cup_Teams.assign(
    xGS=xGS_Simulation_Preds,
    xGA=xGA_Simulation_Preds,
)

In [6]:
# Display the World Cup teams table, including the xGS / xGA columns set above.
World_Cup_Teams

Unnamed: 0,Squad,Association,Date qualified,Key Competition,Key,Group,xGS,xGA
0,Qatar,AFC,2-Dec-10,2021 Gold Cup,2021-Gold Cup-Qatar,A,1.471731,1.045786
1,Iran,AFC,27-Jan-22,2019 AFC Asian Cup,2019-AFC Asian Cup-Iran,B,1.520622,0.759413
2,South Korea,AFC,1-Feb-22,2019 AFC Asian Cup,2019-AFC Asian Cup-South Korea,H,1.470596,0.646379
3,Japan,AFC,24-Mar-22,2019 AFC Asian Cup,2019-AFC Asian Cup-Japan,E,1.585592,1.152138
4,Saudi Arabia,AFC,24-Mar-22,2019 AFC Asian Cup,2019-AFC Asian Cup-Saudi Arabia,C,1.17796,0.765093
5,Australia,AFC,13-Jun-22,2019 AFC Asian Cup,2019-AFC Asian Cup-Australia,D,1.758087,0.976595
6,Ghana,CAF,29-Mar-22,2021 Africa Cup of Nations,2021-Africa Cup of Nations-Ghana,H,0.747516,0.796257
7,Senegal,CAF,29-Mar-22,2021 Africa Cup of Nations,2021-Africa Cup of Nations-Senegal,A,1.449238,0.534193
8,Tunisia,CAF,29-Mar-22,2021 Africa Cup of Nations,2021-Africa Cup of Nations-Tunisia,D,1.072997,0.642787
9,Cameroon,CAF,29-Mar-22,2021 Africa Cup of Nations,2021-Africa Cup of Nations-Cameroon,G,1.244722,0.577642


In [None]:
# Predict xGS for the test-split feature rows.
# The original line referenced `x_GS_ML_Fit` and `test`, which are not defined
# in the cells above and would raise NameError; the fitted xGS model and the
# xGS test features are used instead.
# NOTE(review): assumed intent - confirm. The model was fitted on the full data
# set, which includes these rows, so this is not a true held-out evaluation.
x_GS_preds = xGS_ML_Fit.predict(xGS_X_test)

In [None]:
# Preview the xGS training table. The original `xGS_Final[]` is an empty
# subscript, which is a SyntaxError; show the first rows instead.
xGS_Final.head()

In [None]:
# Display the xGA predictor table that feeds the xGA simulation predictions.
xGA_Predictors

In [7]:
# Create new predicted Values for each team
Squad_Predicted_Values = World_Cup_Teams[['Squad','Group','xGS','xGA']].copy()

Squad_Predicted_Values['% of Points Taken'] = (np.power(Squad_Predicted_Values['xGS'],1.2))/((np.power(Squad_Predicted_Values['xGS'],1.2)) + (np.power(Squad_Predicted_Values['xGA'],1.2)))

In [8]:
# Show squads ordered from highest to lowest '% of Points Taken' (returns a
# sorted copy; Squad_Predicted_Values itself is not reordered).
Squad_Predicted_Values.sort_values(by='% of Points Taken', ascending=False)

Unnamed: 0,Squad,Group,xGS,xGA,% of Points Taken
28,Netherlands,A,2.454842,0.588678,0.847293
21,Belgium,F,2.429896,0.654077,0.828474
26,England,B,2.243788,0.604074,0.828448
24,Spain,E,2.182467,0.592435,0.827036
19,Germany,E,2.634662,0.745795,0.819723
15,Brazil,G,1.848081,0.535112,0.815674
10,Morocco,F,1.514196,0.49393,0.793196
29,Portugal,H,2.273236,0.763673,0.787343
22,France,D,1.847393,0.642302,0.78036
20,Denmark,D,2.099563,0.736278,0.778587


In [9]:
# Write the per-squad predictions to CSV, leaving out the DataFrame index.
predictions_csv_path = 'CSVs/Predictions/Predictions_Final.csv'
Squad_Predicted_Values.to_csv(predictions_csv_path, index=False)

In [None]:
# Build a nested dict keyed by squad name: {squad: {column: value, ...}}.
# The frame is indexed by 'Squad' and transposed so each squad becomes a
# column, which `to_dict('dict')` turns into one inner dict per squad.
squads_by_name = Squad_Predicted_Values.set_index('Squad')
final_dict = squads_by_name.T.to_dict('dict')

In [None]:
# Preview the xGS training table. The original `xGS_Final[]` is an empty
# subscript, which is a SyntaxError; show the first rows instead.
xGS_Final.head()

In [10]:
# Show squads ordered from lowest to highest predicted xGS (returns a sorted
# copy; Squad_Predicted_Values itself is not reordered).
Squad_Predicted_Values.sort_values(by='xGS')

Unnamed: 0,Squad,Group,xGS,xGA,% of Points Taken
6,Ghana,H,0.747516,0.796257,0.481059
8,Tunisia,D,1.072997,0.642787,0.649053
17,Ecuador,A,1.100619,1.088802,0.503238
14,Costa Rica,E,1.134287,1.039307,0.526211
4,Saudi Arabia,C,1.17796,0.765093,0.626645
18,Uruguay,H,1.240581,1.054904,0.548486
9,Cameroon,G,1.244722,0.577642,0.715298
31,Wales,B,1.334231,1.150966,0.54421
7,Senegal,A,1.449238,0.534193,0.768105
2,South Korea,H,1.470596,0.646379,0.728385


In [None]:
# Show the xGS_Final rows whose 'Confederation_CONMEBOL' column equals 1.
conmebol_mask = xGS_Final['Confederation_CONMEBOL'].eq(1)
xGS_Final[conmebol_mask]

In [None]:
# Display the current state of the World Cup teams table.
World_Cup_Teams