In [1]:
import numpy as np
import pandas as pd
from sklearn import *
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

# Initial CSV reads.
# Forward slashes are used as the path separator so the same relative paths
# resolve on Windows, macOS and Linux (a backslash is only a separator on
# Windows). This matches the forward-slash style of the export path used
# further down in this notebook. The files are decoded as latin-1.

xGS_Final = pd.read_csv('CSVs/International/xGS_Final.csv', encoding='latin-1')
xGA_Final = pd.read_csv('CSVs/International/xGA_Final.csv', encoding='latin-1')
World_Cup_Teams = pd.read_csv('CSVs/World_Cup_Teams.csv', encoding='latin-1')

# Load regression models (Ridge is linear; GradientBoostingRegressor is a tree ensemble)
from sklearn.linear_model import Ridge
from sklearn.ensemble import GradientBoostingRegressor


# machine learning processing and metrics
from sklearn.model_selection import train_test_split

# Display options: remove the pandas caps on displayed columns and rows (None = no limit)
for _display_opt in ('display.max_columns', 'display.max_rows'):
    pd.set_option(_display_opt, None)

In [2]:
# Build the feature matrix (X) and target vector (y) for each model.
# The 'Key' and 'Squad' columns and the target column itself are left out of the features.

xGS_X = xGS_Final.drop(columns=['Key', 'Squad', 'Gls']).values
xGS_y = xGS_Final['Gls'].values

xGA_X = xGA_Final.drop(columns=['Key', 'Squad', 'O. Gls']).values
xGA_y = xGA_Final['O. Gls'].values

# Hold out 20% of each dataset with sklearn's `train_test_split()`;
# the fixed random_state keeps the split reproducible between runs.
split_options = {'random_state': 22, 'test_size': .2}

xGS_X_train, xGS_X_test, xGS_y_train, xGS_y_test = train_test_split(xGS_X, xGS_y, **split_options)
xGA_X_train, xGA_X_test, xGA_y_train, xGA_y_test = train_test_split(xGA_X, xGA_y, **split_options)

In [4]:
# Initiate ML Models - GradientBoostingRegressor for the xGS Model, Ridge model for xGA

xGS_ML = GradientBoostingRegressor(random_state=22, n_estimators=600, learning_rate=.05)
xGA_ML = Ridge(random_state=22)

# Fit Model & Get Predictions. Add predictions to each total Dataframe
# NOTE(review): both models are fit on, and then predict for, the full feature
# matrices (xGS_X / xGA_X), so these are in-sample predictions; the train/test
# splits are not used in this cell.

x_GS_ML_Fit = xGS_ML.fit(xGS_X, xGS_y)
x_GA_ML_Fit = xGA_ML.fit(xGA_X, xGA_y)

x_GS_preds = x_GS_ML_Fit.predict(xGS_X)
x_GA_preds = x_GA_ML_Fit.predict(xGA_X)

xGS_Final['xGS'] = x_GS_preds
xGA_Final['xGA'] = x_GA_preds

# Add xGS and xGA to World Cup Teams DF
# Drop any xGS/xGA columns left over from an earlier run of this cell first.
# Without this, re-running the cell makes the merges emit suffixed duplicates
# (xGS_x / xGS_y, xGA_x / xGA_y) and later lookups of 'xGS' / 'xGA' raise KeyError.
# On a first run the columns do not exist yet and errors='ignore' makes this a no-op.
World_Cup_Teams = World_Cup_Teams.drop(columns=['xGS', 'xGA'], errors='ignore')
# Inner joins: only teams whose 'Key' appears in both frames are kept.
World_Cup_Teams = pd.merge(World_Cup_Teams, xGS_Final[['Key', 'xGS']], on='Key', how='inner')
World_Cup_Teams = pd.merge(World_Cup_Teams, xGA_Final[['Key', 'xGA']], on='Key', how='inner')

In [6]:
# Derive each team's predicted share of points from its xGS and xGA
Squad_Predicted_Values = World_Cup_Teams[['Squad','Group','xGS','xGA']].copy()

# share = xGS^1.2 / (xGS^1.2 + xGA^1.2)
xGS_powered = np.power(Squad_Predicted_Values['xGS'], 1.2)
xGA_powered = np.power(Squad_Predicted_Values['xGA'], 1.2)
Squad_Predicted_Values['% of Points Taken'] = xGS_powered / (xGS_powered + xGA_powered)

In [9]:
# Write the predicted values to a CSV file, omitting the row index
predictions_csv = 'CSVs/Predictions/Predictions_8-19.csv'
Squad_Predicted_Values.to_csv(predictions_csv, index=False)

In [12]:
# Nested mapping: squad name -> {column name: value} for that squad's row
squad_rows = Squad_Predicted_Values.set_index('Squad')
final_dict = squad_rows.T.to_dict('dict')

In [13]:
# Show the per-squad predictions dictionary as this cell's output
final_dict

{'Qatar': {'Group': 'A',
  'xGS': 2.354670431332299,
  'xGA': 1.182060478690003,
  '% of Points Taken': 0.6957138771119763},
 'Iran': {'Group': 'B',
  'xGS': 1.7596787950608648,
  'xGA': 0.7827720086189838,
  '% of Points Taken': 0.7255287361107485},
 'South Korea': {'Group': 'H',
  'xGS': 1.2104446749095974,
  'xGA': 0.6077907841584096,
  '% of Points Taken': 0.6956556145094872},
 'Japan': {'Group': 'E',
  'xGS': 1.6748679334566163,
  'xGA': 0.7708848698626459,
  '% of Points Taken': 0.7173066455004384},
 'Saudi Arabia': {'Group': 'C',
  'xGS': 1.4752336090487181,
  'xGA': 0.7081324947360901,
  '% of Points Taken': 0.7069731621739035},
 'Australia': {'Group': 'D',
  'xGS': 1.2727016571697993,
  'xGA': 0.9713387028139151,
  '% of Points Taken': 0.5803636482334932},
 'Ghana': {'Group': 'H',
  'xGS': 1.0626600182454051,
  'xGA': 0.7701435237830707,
  '% of Points Taken': 0.5954023607074348},
 'Senegal': {'Group': 'A',
  'xGS': 1.2059173795967055,
  'xGA': 0.6217795864853027,
  '% of Poin