In [1]:
import numpy as np
import pandas as pd
from sklearn import *
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

#Initial CSV Read
# Forward slashes keep these relative paths valid on Windows, macOS and Linux;
# backslash separators are only treated as path separators on Windows.

xGS_Total = pd.read_csv('CSVs/International/Int_xGS_Total.csv', encoding='latin-1')
xGA_Total = pd.read_csv('CSVs/International/Int_xGA_Total.csv', encoding='latin-1')

# loading linear algorithms
from sklearn.linear_model import LinearRegression, ElasticNet, Lasso, Ridge

# machine learning processing and metrics
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

In [74]:
# Remove pandas' column and row display limits so frames render in full.
for _display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(_display_option, None)

In [2]:
# Evaluate Model function

def _report_metrics(label, y_true, y_pred):
    """Print the heading ``label`` followed by RMSE, R-squared and MAE for one split."""
    # The heading is printed before the metrics are computed so it still appears
    # if one of the metric calls raises.
    print(label)

    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    r2 = r2_score(y_true, y_pred)
    mae = mean_absolute_error(y_true, y_pred)

    print(f"root mean squared error (RMSE): {rmse}")
    print(f"R-squared (R2 ): {r2}")
    print(f"MAE {mae}")


def evaluateModel(model, X_train, y_train, X_test, y_test):
    """Fit ``model`` on the training data, report metrics and plot diagnostics.

    The model is fitted in place on ``(X_train, y_train)``. RMSE, R-squared and
    MAE are printed for the training split and then for the testing split.
    Two figures are shown for the testing split: actual vs. predicted values
    (with the ``y = x`` reference line) and residuals vs. predictions.

    Parameters
    ----------
    model : estimator exposing ``fit(X, y)`` and ``predict(X)``
    X_train, y_train : training features and target
    X_test, y_test : testing features and target

    Returns
    -------
    The same ``model`` object, after fitting.
    """
    # FIT THE MODEL
    model.fit(X_train, y_train)

    # PREDICT on both splits
    train_preds = model.predict(X_train)
    test_preds = model.predict(X_test)

    # PRINT the METRICS (RMSE, R2 and MAE) for each split
    _report_metrics("TRAINING SET", y_train, train_preds)
    print()
    _report_metrics("Testing SET", y_test, test_preds)

    # Actual vs. predicted; plotting y_test against itself draws the y = x line
    # on which a perfect prediction would fall.
    plt.figure(figsize=(10,6))
    plt.scatter(y_test, test_preds)
    plt.title("Actual vs Predicted Plot")
    plt.xlabel("Actual")
    plt.ylabel("Predicted")
    plt.plot(y_test, y_test)
    plt.show()

    # Plot Residuals (prediction minus actual) against the prediction,
    # with a horizontal reference line at zero error.
    plt.figure(figsize=(10,6))
    plt.scatter(test_preds, test_preds - y_test)
    plt.hlines(y=0, xmin=test_preds.min(), xmax=test_preds.max(), color="r")
    plt.title("Residuals")
    plt.xlabel("Prediction")
    plt.ylabel("Error")
    plt.show()

    return model

In [3]:
#Model Prep

# Columns removed from each raw table before modelling.
xGS_Column_Drop_List = ['Key','Year','Competition','Squad','Age','MP','G/SoT','TklW','O. TklW','G/Sh']
xGA_Column_Drop_List = ['Key','Year','Competition','Squad','O. Age','O. MP','O. G/SoT','O. Fld', 'O. TklW', 'TklW','O. G/Sh']

xGS_Model = xGS_Total.drop(columns=xGS_Column_Drop_List)
xGA_Model = xGA_Total.drop(columns=xGA_Column_Drop_List)


def _standardize_features(model_df, target_col, passthrough_col='Confederation'):
    """Standard-scale every column except ``target_col`` and ``passthrough_col``.

    Returns a tuple ``(features, scaler, scaled_df)``:
    ``features`` is ``model_df`` without the two excluded columns, ``scaler`` is
    the ``StandardScaler`` fitted on ``features``, and ``scaled_df`` holds the
    scaled features followed by the untouched target and passthrough columns.
    """
    features = model_df.drop(columns=[passthrough_col, target_col])
    scaler = StandardScaler().fit(features)

    # Reusing features.index keeps the row labels identical to model_df, so the
    # index-aligned column assignments below match row-for-row even when the
    # source frame does not carry a default 0..n-1 index.
    scaled_df = pd.DataFrame(
        scaler.transform(features),
        columns=features.columns,
        index=features.index,
    )
    scaled_df[target_col] = model_df[target_col]
    scaled_df[passthrough_col] = model_df[passthrough_col]
    return features, scaler, scaled_df


xGS_Model_features, xGS_scaler, xGS_scaler_df = _standardize_features(xGS_Model, 'Gls')
xGA_Model_features, xGA_scaler, xGA_scaler_df = _standardize_features(xGA_Model, 'O. Gls')


In [4]:
# Create final model variable and one-hot encode Confederation.
# dtype=float keeps the indicator columns numeric: pandas >= 2.0 defaults to bool,
# and mixing bool with float columns makes a later `.values` call return an
# object-dtype array instead of a float array.

xGS_Model_Final = pd.get_dummies(xGS_scaler_df, columns=['Confederation'], dtype=float)
xGA_Model_Final = pd.get_dummies(xGA_scaler_df, columns=['Confederation'], dtype=float)

In [5]:
# Split each final frame into a feature matrix (X) and a target vector (y).

# Goals-scored model: target column is 'Gls'.
xGS_y = xGS_Model_Final['Gls'].to_numpy()
xGS_X = xGS_Model_Final.drop(columns='Gls').to_numpy()

# Goals-against model: target column is 'O. Gls'.
xGA_y = xGA_Model_Final['O. Gls'].to_numpy()
xGA_X = xGA_Model_Final.drop(columns='O. Gls').to_numpy()

In [6]:
# Hold out 20% of the rows of each data set for testing, using a fixed seed
# so the split is reproducible.
(xGS_X_train, xGS_X_test,
 xGS_y_train, xGS_y_test) = train_test_split(xGS_X, xGS_y, test_size=.2, random_state=42)

(xGA_X_train, xGA_X_test,
 xGA_y_train, xGA_y_test) = train_test_split(xGA_X, xGA_y, test_size=.2, random_state=42)

In [7]:
# One default Ridge regressor per target, each fitted on the full data set.
ridge1, ridge2 = Ridge(), Ridge()

x_GS_ridge = ridge1.fit(X=xGS_X, y=xGS_y)
x_GA_ridge = ridge2.fit(X=xGA_X, y=xGA_y)

In [11]:
# Predict on the same rows each model was fitted on.
x_GS_preds, x_GA_preds = (
    x_GS_ridge.predict(xGS_X),
    x_GA_ridge.predict(xGA_X),
)

In [19]:
# Remove a stray 'xGA' column from the goals-scored table if one is present.
# errors='ignore' keeps this cell safe on a fresh kernel and on re-runs, where
# the column does not exist and a plain drop would raise KeyError.
xGS_Total = xGS_Total.drop(columns='xGA', errors='ignore')

In [23]:
# Store each model's predictions as a new column on its source table.
xGS_Total['xGS'], xGA_Total['xGA'] = x_GS_preds, x_GA_preds

In [71]:
# Spot-check: rows of the goals-scored table for the United States.
xGS_Total[xGS_Total['Squad'].eq('United States')]

Unnamed: 0,Key,Year,Competition,Squad,Age,Poss,Gls,Sh,SoT,SoT%,...,CrdY,CrdR,Fld,Int,TklW,Confederation,O. CrdY,O. TklW,MP,xGS
242,2016-Copa America-United States,2016,Copa America,United States,28.2,42.0,1.17,8.67,2.5,28.8,...,1.33,0.17,12.8,12.5,12.0,CONCACAF,1.67,11.7,6,0.649485
243,2021-Gold Cup-United States,2021,Gold Cup,United States,23.8,56.0,1.59,11.4,5.08,44.4,...,0.63,0.0,12.2,9.84,7.78,CONCACAF,1.27,10.3,6,1.52784
244,2019-Gold Cup-United States,2019,Gold Cup,United States,25.2,55.0,2.5,15.3,6.0,39.1,...,0.17,0.0,11.2,7.17,9.83,CONCACAF,1.5,10.3,6,1.780832
245,2017-Gold Cup-United States,2017,Gold Cup,United States,27.6,59.5,2.17,14.3,7.5,52.3,...,0.33,0.0,17.3,12.7,11.7,CONCACAF,2.0,13.5,6,2.498949


In [76]:
# Spot-check: rows of the goals-against table for Morocco.
xGA_Total[xGA_Total['Squad'].eq('Morocco')]

Unnamed: 0,Key,Year,Competition,Squad,O. Age,O. Poss,O. MP,O. Gls,O. Sh,O. SoT,O. SoT%,O. G/Sh,O. G/SoT,O. PKatt,O. CrdY,O. CrdR,O. Fld,O. Int,Confederation,O. TklW,CrdY,TklW,xGA
180,2018-World Cup-Morocco,2018,World Cup,Morocco,28.1,51.0,3,1.0,12.7,2.33,18.4,0.08,0.43,0.0,1.33,0.0,20.0,8.0,CAF,10.0,2.67,9.33,0.624803
181,2021-Africa Cup of Nations-Morocco,2021,Africa Cup of Nations,Morocco,26.6,39.4,5,0.75,7.74,2.26,29.3,0.1,0.33,0.0,2.45,0.0,14.7,7.55,CAF,7.55,1.13,10.4,0.519608
182,2019-Africa Cup of Nations-Morocco,2019,Africa Cup of Nations,Morocco,27.6,38.8,4,0.23,3.72,0.7,18.8,0.06,0.33,0.0,2.09,0.23,13.7,9.07,CAF,8.37,0.23,12.6,0.316543
183,2017-Africa Cup of Nations-Morocco,2017,Africa Cup of Nations,Morocco,27.4,51.0,4,0.75,8.25,3.5,42.4,0.09,0.21,0.0,2.0,0.25,23.5,13.5,CAF,9.75,0.0,15.0,0.809253


In [58]:
# Forward slashes keep this relative path valid on Windows, macOS and Linux;
# a backslash separator is only treated as a path separator on Windows.
World_Cup_Teams = pd.read_csv('CSVs/World_Cup_Teams.csv', encoding='latin-1')

In [60]:
# Attach the xGS predictions to each team via its 'Key'.
# Dropping any existing 'xGS' column first keeps the cell re-runnable: without
# it, a second execution would merge again and leave 'xGS_x'/'xGS_y' columns.
# The inner join keeps only teams whose Key is present in xGS_Total.
World_Cup_Teams = (
    World_Cup_Teams
    .drop(columns='xGS', errors='ignore')
    .merge(xGS_Total[['Key', 'xGS']], on='Key', how='inner')
)

In [61]:
# Attach the xGA predictions to each team via its 'Key'.
# Dropping any existing 'xGA' column first keeps the cell re-runnable: without
# it, a second execution would merge again and leave 'xGA_x'/'xGA_y' columns.
# The inner join keeps only teams whose Key is present in xGA_Total.
World_Cup_Teams = (
    World_Cup_Teams
    .drop(columns='xGA', errors='ignore')
    .merge(xGA_Total[['Key', 'xGA']], on='Key', how='inner')
)

In [62]:
# Display the team table, which now carries the xGS and xGA columns added by
# the two merges on 'Key'.
World_Cup_Teams

Unnamed: 0,Squad,Association,Date qualified,Most Recent Competition,Key,Group,xGS,xGA
0,Qatar,AFC,2-Dec-10,2021 Gold Cup,2021-Gold Cup-Qatar,A,1.958577,1.1817
1,Iran,AFC,27-Jan-22,2019 AFC Asian Cup,2019-AFC Asian Cup-Iran,B,1.520378,0.784702
2,South Korea,AFC,1-Feb-22,2019 AFC Asian Cup,2019-AFC Asian Cup-South Korea,H,1.47615,0.593052
3,Japan,AFC,24-Mar-22,2019 AFC Asian Cup,2019-AFC Asian Cup-Japan,E,1.805929,0.765247
4,Saudi Arabia,AFC,24-Mar-22,2019 AFC Asian Cup,2019-AFC Asian Cup-Saudi Arabia,C,1.176455,0.711999
5,Australia,AFC,13-Jun-22,2019 AFC Asian Cup,2019-AFC Asian Cup-Australia,D,1.74681,0.985206
6,Ghana,CAF,29-Mar-22,2021 Africa Cup of Nations,2021-Africa Cup of Nations-Ghana,H,0.9393,0.778277
7,Senegal,CAF,29-Mar-22,2021 Africa Cup of Nations,2021-Africa Cup of Nations-Senegal,A,1.315233,0.579346
8,Tunisia,CAF,29-Mar-22,2021 Africa Cup of Nations,2021-Africa Cup of Nations-Tunisia,D,1.419231,0.679135
9,Cameroon,CAF,29-Mar-22,2021 Africa Cup of Nations,2021-Africa Cup of Nations-Cameroon,G,1.506329,0.90107


In [63]:
# Work on an independent copy holding only the columns needed for the
# points-share calculation.
predicted_value_columns = ['Squad', 'Group', 'xGS', 'xGA']
Squad_Predicted_Values = World_Cup_Teams.loc[:, predicted_value_columns].copy()

In [64]:
# Print the row count, per-column non-null counts and dtypes of the selection.
Squad_Predicted_Values.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 32 entries, 0 to 31
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Squad   32 non-null     object 
 1   Group   32 non-null     object 
 2   xGS     32 non-null     float64
 3   xGA     32 non-null     float64
dtypes: float64(2), object(2)
memory usage: 1.2+ KB


In [65]:
# '% of Points Taken' = xGS^1.2 / (xGS^1.2 + xGA^1.2), computed element-wise.
xGS_powered = np.power(Squad_Predicted_Values['xGS'], 1.2)
xGA_powered = np.power(Squad_Predicted_Values['xGA'], 1.2)
Squad_Predicted_Values['% of Points Taken'] = xGS_powered / (xGS_powered + xGA_powered)

In [66]:
# Display the per-squad table including the new '% of Points Taken' column.
Squad_Predicted_Values

Unnamed: 0,Squad,Group,xGS,xGA,% of Points Taken
0,Qatar,A,1.958577,1.1817,0.6471
1,Iran,B,1.520378,0.784702,0.688624
2,South Korea,H,1.47615,0.593052,0.749189
3,Japan,E,1.805929,0.765247,0.736985
4,Saudi Arabia,C,1.176455,0.711999,0.646256
5,Australia,D,1.74681,0.985206,0.665352
6,Ghana,H,0.9393,0.778277,0.556178
7,Senegal,A,1.315233,0.579346,0.72787
8,Tunisia,D,1.419231,0.679135,0.707746
9,Cameroon,G,1.506329,0.90107,0.649449


In [55]:
# Remove the scratch 'Test' column if it is present.
# errors='ignore' keeps this cell safe on a fresh kernel and on re-runs, where
# the column does not exist and a plain drop would raise KeyError.
Squad_Predicted_Values = Squad_Predicted_Values.drop(columns=['Test'], errors='ignore')