In [1]:
import pandas as pd
import numpy as np

folder_path = 'C:/Users/99451/Desktop/MODEL/old/eng_prem'

import os
# Step 2: Get a list of all CSV files in the folder
csv_files = [file for file in os.listdir(folder_path) if file.endswith('.csv')]

# Step 3: Read each CSV file into a DataFrame and store it in a list
dfs = []
for file in csv_files:
    file_path = os.path.join(folder_path, file)
    df = pd.read_csv(file_path, on_bad_lines = "skip", encoding='latin-1')
    dfs.append(df)

# Step 4: Concatenate all DataFrames in the list into a single DataFrame
df = pd.concat(dfs, ignore_index=True)

if folder_path == 'C:/Users/99451/Desktop/MODEL/old/eng_conf':
    df = df[['Date', 'HomeTeam', 'AwayTeam', 'FTHG','FTAG',	'FTR','HTHG','HTAG','HTR','HY', 'AY','HR','AR','PSH','PSD','PSA']]
else:
    df = df[['Date', 'HomeTeam', 'AwayTeam', 'FTHG','FTAG',	'FTR',	'HTHG',	'HTAG',	'HTR',
                'HS', 'AS',	'HST',	'AST',	'HF', 'AF',	'HC', 'AC',	'HY', 'AY',	'HR','AR','PSH','PSD','PSA', 'P>2.5', 'P<2.5']]
df.dropna(inplace = True)
print("Number of rows:", df.shape[0])
print("Number of columns:", df.shape[1])

df.head()

Number of rows: 1891
Number of columns: 26


Unnamed: 0,Date,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,HTR,HS,...,AC,HY,AY,HR,AR,PSH,PSD,PSA,P>2.5,P<2.5
6371,09/08/2019,Liverpool,Norwich,4.0,1.0,H,4.0,0.0,H,15.0,...,2.0,0.0,2.0,0.0,0.0,1.15,9.59,18.05,1.4,3.11
6372,10/08/2019,West Ham,Man City,0.0,5.0,A,0.0,1.0,A,5.0,...,1.0,2.0,2.0,0.0,0.0,11.68,6.53,1.26,1.49,2.77
6373,10/08/2019,Bournemouth,Sheffield United,1.0,1.0,D,0.0,0.0,D,13.0,...,4.0,2.0,1.0,0.0,0.0,2.04,3.57,3.9,1.96,1.96
6374,10/08/2019,Burnley,Southampton,3.0,0.0,H,0.0,0.0,D,10.0,...,7.0,0.0,0.0,0.0,0.0,2.71,3.31,2.81,2.17,1.77
6375,10/08/2019,Crystal Palace,Everton,0.0,0.0,D,0.0,0.0,D,6.0,...,2.0,2.0,1.0,0.0,1.0,3.21,3.37,2.39,2.23,1.74


In [2]:
last_games = 3
df['Total'] = df['FTHG'] + df['FTAG']

#Calculating average save for home/away teams
df['HSV'] = df['AST'] - df['FTAG']
df['ASV'] = df['HST'] - df['FTHG']

df['HTSV'] = df.groupby('HomeTeam')['HSV'].transform(lambda x: x.rolling(last_games, min_periods=1).mean().shift(1))
df['ATSV'] = df.groupby('HomeTeam')['ASV'].transform(lambda x: x.rolling(last_games, min_periods=1).mean().shift(1))

#Calculating over and under odds
total = 1 / df['P>2.5'] + 1 / df['P<2.5']
over25 = (1 / df['P>2.5']) / total
under25 = (1 / df['P<2.5']) / total
df['2.5Dif'] = over25 - under25

#Calculating scored and conceded goals for each team to calculate attack and defence powers
df['HTSG'] = df.groupby('HomeTeam')['FTHG'].transform(lambda x: x.rolling(last_games, min_periods=1).mean().shift(1))
df['HTCG'] = df.groupby('HomeTeam')['FTAG'].transform(lambda x: x.rolling(last_games, min_periods=1).mean().shift(1))
df['LHGS'] = df['FTHG'].transform(lambda x: x.rolling(50, min_periods=1).mean().shift(1))

df['ATSG'] = df.groupby('AwayTeam')['FTAG'].transform(lambda x: x.rolling(last_games, min_periods=1).mean().shift(1))
df['ATCG'] = df.groupby('AwayTeam')['FTHG'].transform(lambda x: x.rolling(last_games, min_periods=1).mean().shift(1))
df['LAGS'] = df['FTAG'].transform(lambda x: x.rolling(50, min_periods=1).mean().shift(1))


df['HATT'] = df['HTSG'] / df['LHGS']
df['HDEF'] = df['HTCG'] / df['LAGS']
df['AATT'] = df['ATSG'] / df['LAGS']
df['ADEF'] = df['ATCG'] / df['LHGS']

#Calculating expected goals regarding to team attack and defence (poisson similar)
df['HXG'] = df['HATT'] * df['ADEF'] * df['LHGS']
df['AXG'] = df['HDEF'] * df['AATT'] * df['LAGS']
df['XGTotal'] = df['HXG'] + df['AXG']

#Calculating shots on target for home/away teams
df['HSTS'] = df.groupby('HomeTeam')['HST'].transform(lambda x: x.rolling(last_games, min_periods=1).mean().shift(1))
df['HSTC'] = df.groupby('HomeTeam')['AST'].transform(lambda x: x.rolling(last_games, min_periods=1).mean().shift(1))

df['ASTS'] = df.groupby('AwayTeam')['AST'].transform(lambda x: x.rolling(last_games, min_periods=1).mean().shift(1))
df['ASTC'] = df.groupby('AwayTeam')['HST'].transform(lambda x: x.rolling(last_games, min_periods=1).mean().shift(1))

df['HSTT'] = (df['HSTS'] + df['ASTC']) / 2
df['ASTT'] = (df['ASTS'] + df['HSTC']) / 2
df['TSTT'] = df['HSTT'] + df['ASTT']

#Calculating scored and conceded goals for half time for each team to calculate attack and defence powers
df['HTHSG'] = df.groupby('HomeTeam')['HTHG'].transform(lambda x: x.rolling(last_games, min_periods=1).mean().shift(1))
df['HTHCG'] = df.groupby('HomeTeam')['HTAG'].transform(lambda x: x.rolling(last_games, min_periods=1).mean().shift(1))
df['HTLHGS'] = df['HTHG'].transform(lambda x: x.rolling(50, min_periods=1).mean().shift(1))

df['HTASG'] = df.groupby('AwayTeam')['HTAG'].transform(lambda x: x.rolling(last_games, min_periods=1).mean().shift(1))
df['HTACG'] = df.groupby('AwayTeam')['HTHG'].transform(lambda x: x.rolling(last_games, min_periods=1).mean().shift(1))
df['HTLAGS'] = df['HTAG'].transform(lambda x: x.rolling(50, min_periods=1).mean().shift(1))


df['HTHATT'] = df['HTHSG'] / df['HTLHGS']
df['HTHDEF'] = df['HTHCG'] / df['HTLAGS']
df['HTAATT'] = df['HTASG'] / df['HTLAGS']
df['HTADEF'] = df['HTACG'] / df['HTLHGS']

#Calculating expected goals regarding to team attack and defence (poisson similar)
df['HTHXG'] = df['HTHATT'] * df['HTADEF'] * df['HTLHGS']
df['HTAXG'] = df['HTHDEF'] * df['HTAATT'] * df['HTLAGS']
df['HTXGTotal'] = df['HTHXG'] + df['HTAXG']

#Calculating odds for over and under 1.5, 2.5 and 3.5
df['1.5O'] = [1 if x > 1.5 else 0 for x in df['Total']]
df['2.5O'] = [1 if x > 2.5 else 0 for x in df['Total']]
df['3.5O'] = [1 if x > 3.5 else 0 for x in df['Total']]

#Calculating over and under probability according to last games
phover_15 = df.groupby('HomeTeam')['1.5O'].transform(lambda x: x.rolling(last_games, min_periods=1).mean().shift(1))
paover_15 = df.groupby('AwayTeam')['1.5O'].transform(lambda x: x.rolling(last_games, min_periods=1).mean().shift(1))
df['P1.5O'] = (phover_15 + paover_15) / 2

phover_25 = df.groupby('HomeTeam')['2.5O'].transform(lambda x: x.rolling(last_games, min_periods=1).mean().shift(1))
paover_25 = df.groupby('AwayTeam')['2.5O'].transform(lambda x: x.rolling(last_games, min_periods=1).mean().shift(1))
df['P2.5O'] = (phover_25 + paover_25) / 2

phover_35 = df.groupby('HomeTeam')['3.5O'].transform(lambda x: x.rolling(last_games, min_periods=1).mean().shift(1))
paover_35 = df.groupby('AwayTeam')['3.5O'].transform(lambda x: x.rolling(last_games, min_periods=1).mean().shift(1))
df['P3.5O'] = (phover_35 + paover_35) / 2

#Preparing new_df with necessary columns only
new_df = df[['HomeTeam', 'AwayTeam', 'Total', 'XGTotal','HTXGTotal', 'TSTT', 
             '1.5O', '2.5O', '3.5O','P1.5O', 'P2.5O', 'P3.5O', '2.5Dif',
             'HTSV', 'ATSV']].dropna()

new_df.tail(7)

Unnamed: 0,HomeTeam,AwayTeam,Total,XGTotal,HTXGTotal,TSTT,1.5O,2.5O,3.5O,P1.5O,P2.5O,P3.5O,2.5Dif,HTSV,ATSV
8260,Man United,Newcastle,5.0,2.358046,0.877193,12.5,1,1,1,0.666667,0.333333,0.333333,0.417559,4.0,6.666667
8261,Arsenal,Everton,3.0,4.074904,1.666667,9.5,1,1,0,1.0,0.5,0.333333,0.388027,1.0,5.0
8262,Brentford,Newcastle,6.0,0.70922,0.0,10.333333,1,1,1,0.666667,0.333333,0.333333,0.38255,3.666667,2.666667
8263,Brighton,Man United,2.0,3.29416,1.709402,10.0,1,0,0,0.833333,0.833333,0.666667,0.426124,2.333333,3.666667
8264,Burnley,Nott'm Forest,3.0,2.751068,1.596941,10.666667,1,1,0,1.0,0.5,0.5,0.253659,5.0,3.666667
8266,Crystal Palace,Aston Villa,5.0,3.577408,1.848291,9.666667,1,1,1,0.833333,0.5,0.5,0.373874,1.0,4.333333
8268,Luton,Fulham,6.0,2.190171,0.867117,9.5,1,1,1,0.833333,0.5,0.333333,0.309693,2.0,4.333333


In [3]:
#Checking some classification models
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from imblearn.ensemble import BalancedRandomForestClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import roc_auc_score

X = new_df.drop(columns=['Total', 'HomeTeam', 'AwayTeam', '1.5O', '2.5O', '3.5O', 'TSTT'], axis = 1)
y = new_df['1.5O']

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Initialize models
log_model = LogisticRegression(penalty = 'l1', max_iter = 1000, C = 1, solver = 'liblinear')
svm_model = SVC(probability = True, C = 10, degree = 2, gamma = 'scale', kernel = 'linear')
gb_model = GradientBoostingClassifier(learning_rate = 0.01, max_depth = 3, min_samples_leaf = 1, 
                                                    min_samples_split = 10, n_estimators = 100)
X.head()

Unnamed: 0,XGTotal,HTXGTotal,P1.5O,P2.5O,P3.5O,2.5Dif,HTSV,ATSV
6391,0.0,0.0,0.5,0.5,0.0,0.056122,2.0,6.0
6392,8.129032,1.75,1.0,1.0,1.0,0.237864,2.0,5.0
6393,2.0,0.0,1.0,0.5,0.0,-0.089059,2.0,3.0
6394,2.787879,0.0,0.5,0.5,0.5,0.083969,7.0,1.0
6395,0.705882,0.0,0.5,0.0,0.0,-0.174129,4.0,2.0


In [4]:
# Train and evaluate models
log_scores = cross_val_score(log_model, X, y, cv = 5, scoring = 'roc_auc').mean()
svm_scores = cross_val_score(svm_model, X, y, cv = 5, scoring = 'roc_auc').mean()
gb_scores = cross_val_score(gb_model, X, y, cv = 5, scoring = 'roc_auc').mean()

print('Logistic Regression: ROC AUC = ', log_scores)
print('SVM: ROC AUC = ', svm_scores)
print('Gradient Boosting: ROC AUC = ', gb_scores)

Logistic Regression: ROC AUC =  0.5825555894810375
SVM: ROC AUC =  0.5415184845930365
Gradient Boosting: ROC AUC =  0.5735200934745098
