# In [45]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
# Display settings: allow up to 1000 columns and 1000 rows to be printed before
# pandas truncates the output, and render floats in fixed-point notation
# ('{:f}') instead of scientific notation.
pd.options.display.max_columns = 1000
pd.options.display.max_rows = 1000
pd.options.display.float_format = '{:f}'.format
from catboost import CatBoostRegressor
from sklearn import preprocessing
from xgboost import XGBRegressor
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
import math

# Data

# Read one CSV per season and stack them into a single frame.
# ignore_index=True gives every match a unique row label. Without it each file
# restarts its labels at 0, so the id_game key derived from the index below
# repeats across seasons and any later join on id_game matches several
# unrelated games.
data = pd.concat(
    map(pd.read_csv, ['2017.csv','2018.csv', '2019.csv','2020.csv', '2021.csv','2022.csv']),
    ignore_index=True,
)
# NOTE(review): dates are parsed with pandas' default inference. If the files
# store them day-first (dd/mm/yyyy) this needs dayfirst=True -- confirm against
# the CSVs.
data['Date_'] = pd.to_datetime(data['Date'])
data = data.sort_values(by = 'Date_')
# Unique per-match key (the pre-sort row label).
data['id_game'] = data.index
# Keep only matches in which Tottenham played, home or away. The explicit
# .copy() makes dataT an independent frame so new columns can be added to it
# without pandas warning about writing into a slice of `data`.
dataT = data[(data['HomeTeam']=='Tottenham') |(data['AwayTeam']=='Tottenham')].copy()


def Tottenham_res_Y(HomeTeam, FTR):
    """Return the points Tottenham took from one match (3, 1 or 0).

    Args:
        HomeTeam: Name of the home side. Tottenham is treated as the home
            team only when this equals 'Tottenham' exactly; any other value
            means Tottenham is scored as the away team.
        FTR: Result code of the match. The function distinguishes 'H', 'A'
            and 'D' (presumably home win / away win / draw).

    Returns:
        3 when the result code matches Tottenham's side ('H' at home, 'A'
        away), 1 for a draw, 0 otherwise.

    Note:
        Unrecognised or missing codes are not treated symmetrically: at home
        anything other than 'H' or 'A' scores 1, while away anything other
        than 'A' or 'D' scores 0.
    """
    if HomeTeam == 'Tottenham':
        if FTR == 'H':
            return 3
        if FTR == 'A':
            return 0
        # Every remaining code is counted as a draw for the home case.
        return 1

    # Tottenham is the away side.
    if FTR == 'A':
        return 3
    if FTR == 'D':
        return 1
    return 0

# Target: points Tottenham earned in each match (see Tottenham_res_Y).
# assign() builds a new frame instead of writing a column into dataT in place,
# which avoids pandas' SettingWithCopyWarning when dataT is a filtered slice.
# Zipping the two needed columns replaces the row-wise apply and also works
# when dataT is empty.
dataT = dataT.assign(
    Tottenham_score=[
        Tottenham_res_Y(home_team, result)
        for home_team, result in zip(dataT['HomeTeam'], dataT['FTR'])
    ]
)


# Columns used by the models: the two team names plus the three B365 columns
# as features, and the points column as the target.
cols_for_model_x =  [ 'HomeTeam','AwayTeam', 'B365H','B365D','B365A']
cols_for_model_y =  ['Tottenham_score']
# X and y start as independent full copies of dataT; the model columns are
# selected from them at fit/predict time.
X = dataT.copy()
y = dataT.copy()

# Label
# Integer-encode every object-typed feature column, keeping the fitted encoder
# per column in labelersDictionary so the codes can be mapped back to the
# original strings later.
labelersDictionary = {}
for col in cols_for_model_x:
    print(col)
    if X[col].dtype=='O':
        le = preprocessing.LabelEncoder()
        # Missing values become the empty string so they get a code of their own.
        X[col] = X[col].fillna('').astype(str)
        # fit_transform fits and encodes in one pass; a separate fit() on the
        # same data beforehand only repeats the work.
        X[col] = le.fit_transform(X[col])
        labelersDictionary[col] = le


# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=130)
# Positional hold-out split: the first 200 rows train the models and the last
# 28 rows are kept for evaluation. X and y are copies of the same frame, so the
# same positions select the same matches in both.
# NOTE(review): the sizes are hard-coded. With more than 228 rows the middle
# rows are unused; with fewer, train and test overlap -- confirm the row count.
X_train, y_train = X.iloc[:200], y.iloc[:200]

X_test, y_test = X.iloc[-28:], y.iloc[-28:]

#

## Build Models

# Feature and target frames shared by both regressors.
train_features = X_train[cols_for_model_x]
train_target = y_train[['Tottenham_score']]
test_features = X_test[cols_for_model_x]

# CatBoost regressor trained with a Tweedie loss (variance power 1.9).
my_modelRisk_1_cat = CatBoostRegressor(
    loss_function='Tweedie:variance_power=1.9',
    silent=True,
    max_depth=8,
    n_estimators=500,
    learning_rate=0.01,
)
my_modelRisk_1_cat.fit(train_features, train_target)

# XGBoost regressor with its default objective.
my_modelRisk_1_xg = XGBRegressor(
    max_depth=6,
    n_estimators=250,
    learning_rate=0.1,
    gamma=1,
)
my_modelRisk_1_xg.fit(train_features, train_target)

# Predicted points for the held-out matches, one array per model.
Predictions_cat = my_modelRisk_1_cat.predict(test_features)
Predictions_xg = my_modelRisk_1_xg.predict(test_features)

## Results

# Put the predictions next to the original (un-encoded) match rows.
# X is a copy of dataT and X_test is its tail, so the last len(X_test) rows of
# dataT are the test matches in the same order as the prediction arrays.
# Attaching the predictions positionally avoids joining on id_game: that key is
# taken from the row index, and when the index repeats across the concatenated
# season files a merge on it duplicates rows and pairs predictions with the
# wrong games.
df_res = dataT.tail(len(X_test)).copy().reset_index(drop=True)
df_res['Predictions_cat'] = Predictions_cat
df_res['Predictions_xg'] = Predictions_xg
df_res= df_res.sort_values(by='Date_')
# Last 20 held-out matches: actual result and points next to both predictions
# (shown as the cell output when run in a notebook).
df_res.tail(20)[[ 'Date_','HomeTeam','AwayTeam','FTHG','FTAG','FTR', 'Tottenham_score' ,'Predictions_cat', 'Predictions_xg']]

# Scatter of actual points (x) against predicted points (y) for both models.
# The seaborn style sheets were renamed with a 'seaborn-v0_8-' prefix in newer
# matplotlib releases and the old names no longer exist there, so use whichever
# name this installation provides.
for style_name in ('seaborn-v0_8-whitegrid', 'seaborn-whitegrid'):
    if style_name in plt.style.available:
        plt.style.use(style_name)
        break
x = df_res.index
# NOTE(review): this rebinds the module-level name `y` (previously the target
# frame) to a plain list; kept because later cells may rely on it -- confirm.
y = [0,1,3]
sizes = 50
# Take the current Axes and colour it BEFORE plotting. Calling plt.axes() after
# the scatters adds a new, opaque Axes on top of the figure and hides the points.
ax = plt.gca()
ax.set_facecolor('#a7a7a0')
ax.scatter(df_res['Tottenham_score'], df_res['Predictions_cat'], c='red', s=sizes, label='Predictions_cat')
ax.scatter(df_res['Tottenham_score'], df_res['Predictions_xg'],c='#fdfdfc', s=sizes, label='Predictions_xg')
ax.set_xlabel('Tottenham_score')
ax.set_ylabel('Prediction')
# Both series use one fixed colour, so a colorbar has no data to describe;
# a legend identifies the two models instead.
ax.legend()