In [1]:
import os

import numpy as np
import pandas as pd

# Folder that holds the CSV files for one league.
# NOTE(review): absolute, machine-specific path — edit it here when running elsewhere.
folder_path = 'C:/Users/99451/Desktop/MODEL/old/eng_champ'

# Column groups that make up the working frame (order matters for the display below).
result_columns = ['Date', 'HomeTeam', 'AwayTeam', 'FTHG', 'FTAG', 'FTR', 'HTHG', 'HTAG', 'HTR']
match_stat_columns = ['HS', 'AS', 'HST', 'AST', 'HF', 'AF', 'HC', 'AC']
card_and_odds_columns = ['HY', 'AY', 'HR', 'AR', 'PSH', 'PSD', 'PSA']

# The 'eng_conf' folder is loaded without the match-stat columns
# (presumably its files do not provide them — confirm against the data).
# Comparing the folder *name* rather than the full path string keeps this check
# working when the parent directory changes or the path gains a trailing slash.
league_name = os.path.basename(os.path.normpath(folder_path))
if league_name == 'eng_conf':
    selected_columns = result_columns + card_and_odds_columns
else:
    selected_columns = result_columns + match_stat_columns + card_and_odds_columns

# os.listdir() returns names in arbitrary order; sort them so the concatenated
# row order is the same on every run. The rolling features computed later depend
# on row order.
# NOTE(review): assumes alphabetical file order is also chronological — confirm.
csv_files = sorted(name for name in os.listdir(folder_path) if name.endswith('.csv'))
if not csv_files:
    # Same exception type pd.concat would raise on an empty list, but with a
    # message that points at the actual cause.
    raise ValueError(f"No CSV files found in {folder_path!r}")

# Read every file; malformed lines are skipped instead of aborting the load.
dfs = [
    pd.read_csv(os.path.join(folder_path, file_name), on_bad_lines="skip", encoding='latin-1')
    for file_name in csv_files
]

# Stack all files into one frame with a fresh 0..n-1 index.
raw_df = pd.concat(dfs, ignore_index=True)

# Keep only the needed columns and drop incomplete rows. The explicit .copy()
# gives an independent frame, so adding columns to `df` later does not trigger
# SettingWithCopyWarning. The index is intentionally not reset.
df = raw_df[selected_columns].dropna().copy()

print("Number of rows:", df.shape[0])
print("Number of columns:", df.shape[1])

df.head()

Number of rows: 6620
Number of columns: 24


Unnamed: 0,Date,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,HTR,HS,...,AF,HC,AC,HY,AY,HR,AR,PSH,PSD,PSA
5328,17/08/12,Cardiff,Huddersfield,1.0,0.0,H,0.0,0.0,D,13.0,...,8.0,7.0,4.0,1.0,3.0,0.0,0.0,1.78,3.71,5.26
5329,18/08/12,Barnsley,Middlesbrough,1.0,0.0,H,1.0,0.0,H,16.0,...,9.0,11.0,3.0,3.0,2.0,0.0,0.0,3.33,3.45,2.3
5330,18/08/12,Birmingham,Charlton,1.0,1.0,D,0.0,0.0,D,14.0,...,10.0,3.0,7.0,0.0,1.0,0.0,0.0,1.83,3.6,4.95
5331,18/08/12,Burnley,Bolton,2.0,0.0,H,1.0,0.0,H,14.0,...,11.0,7.0,9.0,1.0,2.0,0.0,0.0,2.95,3.49,2.49
5332,18/08/12,Crystal Palace,Watford,2.0,3.0,A,2.0,1.0,H,4.0,...,8.0,4.0,4.0,0.0,2.0,0.0,0.0,2.55,3.27,3.11


In [2]:
# Window sizes used by the rolling features below.
last_games = 3        # per-team form window, in that team's own home (or away) games
league_window = 50    # league-wide goal average window, in rows of df


def _prior_rolling_mean(values, window):
    """Rolling mean over `window` rows (at least 1), shifted down one row.

    The shift means each row sees the mean of the window that ends at the
    previous row, so the current row's own value is never included.
    """
    return values.rolling(window, min_periods=1).mean().shift(1)


def _team_form(frame, team_col, value_col, window):
    """Apply `_prior_rolling_mean` to `value_col` separately for each `team_col` group."""
    return frame.groupby(team_col)[value_col].transform(
        lambda series: _prior_rolling_mean(series, window)
    )


# Odds -> probabilities: invert each of the three odds and normalise by their sum.
inv_home = 1 / df['PSH']
inv_draw = 1 / df['PSD']
inv_away = 1 / df['PSA']
total = inv_home + inv_draw + inv_away

df['FTH'] = inv_home / total
draw_percentage = inv_draw / total
away_percentage = inv_away / total
df['FTDA'] = draw_percentage + away_percentage

# Full-time result masks, computed once and reused below.
home_win = df['FTR'] == 'H'
draw = df['FTR'] == 'D'
away_win = df['FTR'] == 'A'
outcome_masks = [home_win, draw, away_win]

# Binary target: 1 for a home win, 0 otherwise.
df['FTRT'] = home_win.astype('int64')

# Points earned by each side in the match (3 / 1 / 0).
df['HPTS'] = np.select(outcome_masks, [3, 1, 0], default=0)
df['APTS'] = np.select(outcome_masks, [0, 1, 3], default=0)

# Points per game over each side's previous home / away matches.
df['HPPG'] = _team_form(df, 'HomeTeam', 'HPTS', last_games)
df['APPG'] = _team_form(df, 'AwayTeam', 'APTS', last_games)
df['PPGDif'] = df['HPPG'] - df['APPG']

# Goals scored / conceded by the home side at home, plus the league-wide
# home-goal average over the preceding rows.
df['HTSG'] = _team_form(df, 'HomeTeam', 'FTHG', last_games)
df['HTCG'] = _team_form(df, 'HomeTeam', 'FTAG', last_games)
df['LHGS'] = _prior_rolling_mean(df['FTHG'], league_window)

# Same for the away side away from home, plus the league-wide away-goal average.
df['ATSG'] = _team_form(df, 'AwayTeam', 'FTAG', last_games)
df['ATCG'] = _team_form(df, 'AwayTeam', 'FTHG', last_games)
df['LAGS'] = _prior_rolling_mean(df['FTAG'], league_window)

# Attack / defence strength relative to the league averages.
df['HATT'] = df['HTSG'] / df['LHGS']
df['HDEF'] = df['HTCG'] / df['LAGS']
df['AATT'] = df['ATSG'] / df['LAGS']
df['ADEF'] = df['ATCG'] / df['LHGS']

# Expected goals from the strengths (Poisson-style). The product is kept in this
# exact form: when a league average is 0 the result is NaN, which the dropna
# below removes.
df['HXG'] = df['HATT'] * df['ADEF'] * df['LHGS']
df['AXG'] = df['HDEF'] * df['AATT'] * df['LAGS']
df['XGDif'] = df['HXG'] - df['AXG']
df['PRBDif'] = df['FTH'] - df['FTDA']

# Modelling frame: identifiers, target and the three difference features,
# without rows that lack history.
model_columns = ['HomeTeam', 'AwayTeam', 'FTRT', 'PRBDif', 'PPGDif', 'XGDif']
new_df = df[model_columns].dropna()

new_df.tail(7)

Unnamed: 0,HomeTeam,AwayTeam,FTRT,PRBDif,PPGDif,XGDif
11946,Middlesbrough,Watford,1,0.133731,1.333333,0.849078
11947,Plymouth,Hull,1,-0.449597,-1.0,-0.86011
11948,Rotherham,Cardiff,1,-0.440622,-0.666667,-0.262108
11949,Stoke,Bristol City,1,-0.07497,0.0,-0.213675
11950,Sunderland,Sheffield Weds,0,-0.317694,-1.666667,-3.001876
11951,Swansea,Millwall,0,-0.061108,1.0,0.638121
11952,West Brom,Preston,1,0.301192,1.0,1.154401


In [3]:
#Checking some classification models
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import roc_auc_score

# One seed shared by every stochastic step so the scores below are reproducible.
RANDOM_STATE = 42

# Features are the three engineered difference columns; the target is the home-win flag.
X = new_df.drop(columns=['FTRT', 'HomeTeam', 'AwayTeam'])  # Features
y = new_df['FTRT']  # Target variable

# Hold-out split.
# NOTE(review): the cross-validation below runs on the full X / y, not on X_train.
# If X_test is later used as a final hold-out, switch the CV to X_train / y_train
# to avoid selecting models on test data.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=RANDOM_STATE)

# Candidate models. random_state is set wherever the estimator is stochastic:
# liblinear shuffles the data, SVC's probability calibration uses internal
# shuffling, and gradient boosting uses it when choosing splits.
models = {
    'Logistic Regression': LogisticRegression(penalty='l1', max_iter=1000, C=1, solver='liblinear',
                                              random_state=RANDOM_STATE),
    # degree and gamma have no effect with kernel='linear'; kept as originally configured.
    'Support Vector Machine': SVC(probability=True, C=10, degree=2, gamma='scale', kernel='linear',
                                  random_state=RANDOM_STATE),
    'Naive Bayes': GaussianNB(var_smoothing=1e-09),
    'Gradient Boosting': GradientBoostingClassifier(learning_rate=0.01, max_depth=3, min_samples_leaf=1,
                                                    min_samples_split=10, n_estimators=100,
                                                    random_state=RANDOM_STATE),
}

# 5-fold cross-validated ROC AUC per model; keep the mean across folds.
results = {}
for name, model in models.items():
    cv_scores = cross_val_score(model, X, y, cv=5, scoring='roc_auc')
    results[name] = cv_scores.mean()

# Print ROC AUC scores
for name, roc_auc in results.items():
    print(f'{name}: ROC AUC = {roc_auc}')

Logistic Regression: ROC AUC = 0.6417918742015545
Support Vector Machine: ROC AUC = 0.6419622749849149
Naive Bayes: ROC AUC = 0.627430792761541
Gradient Boosting: ROC AUC = 0.641231608432177
