In [13]:
import pandas as pd
import numpy as np
from scipy.stats import poisson

last_games = 3

df = pd.read_csv('china.csv')
df.head()

Unnamed: 0,Country,League,Season,Date,Time,Home,Away,HG,AG,Res,...,PSCA,MaxCH,MaxCD,MaxCA,AvgCH,AvgCD,AvgCA,BFECH,BFECD,BFECA
0,China,Super League,2014,07/03/2014,11:00,Shandong Luneng,Zhejiang Yiteng,1,0,H,...,,1.3,6.5,13.0,1.21,5.72,9.77,,,
1,China,Super League,2014,08/03/2014,08:30,Guangzhou Evergrande,Henan Songshan Longmen,3,0,H,...,17.5,1.19,8.8,23.0,1.14,6.64,15.35,,,
2,China,Super League,2014,08/03/2014,11:30,Beijing Guoan,Changchun Yatai,1,0,H,...,9.7,1.45,4.8,9.7,1.39,4.27,7.18,,,
3,China,Super League,2014,08/03/2014,11:35,Hangzhou Greentown,Dalian Yifang F.C.,1,1,D,...,2.63,3.0,3.25,2.78,2.7,3.1,2.48,,,
4,China,Super League,2014,08/03/2014,11:35,Jiangsu Suning,Beijing Renhe,0,0,D,...,4.38,2.38,3.5,4.38,2.01,3.23,3.48,,,


In [14]:
#Calculating Total Goals
df['TG'] = df['HG'] + df['AG']
df['1.5O'] = [1 if goal > 1.5 else 0 for goal in df['TG']]
df['2.5O'] = [1 if goal > 2.5 else 0 for goal in df['TG']]
df['3.5O'] = [1 if goal > 3.5 else 0 for goal in df['TG']]

#Calculating percentages for over and under goals for home and away teams
df['H1.5O'] = df.groupby('Home')['1.5O'].transform(lambda x: x.rolling(last_games, min_periods=1).mean().shift(1))
df['A1.5O'] = df.groupby('Home')['1.5O'].transform(lambda x: x.rolling(last_games, min_periods=1).mean().shift(1))
df['P1.5O'] = (df['H1.5O'] + df['A1.5O']) / 2

df['H2.5O'] = df.groupby('Home')['2.5O'].transform(lambda x: x.rolling(last_games, min_periods=1).mean().shift(1))
df['A2.5O'] = df.groupby('Home')['2.5O'].transform(lambda x: x.rolling(last_games, min_periods=1).mean().shift(1))
df['P2.5O'] = (df['H2.5O'] + df['A2.5O']) / 2

df['H3.5O'] = df.groupby('Home')['3.5O'].transform(lambda x: x.rolling(last_games, min_periods=1).mean().shift(1))
df['A3.5O'] = df.groupby('Home')['3.5O'].transform(lambda x: x.rolling(last_games, min_periods=1).mean().shift(1))
df['P3.5O'] = (df['H3.5O'] + df['A3.5O']) / 2

#Calculating scored and conceded goals for each team to calculate attack and defence powers
df['HTSG'] = df.groupby('Home')['HG'].transform(lambda x: x.rolling(last_games, min_periods=1).mean().shift(1))
df['HTCG'] = df.groupby('Home')['AG'].transform(lambda x: x.rolling(last_games, min_periods=1).mean().shift(1))
df['LHGS'] = df['HG'].transform(lambda x: x.rolling(50, min_periods=1).mean().shift(1))

df['ATSG'] = df.groupby('Away')['AG'].transform(lambda x: x.rolling(last_games, min_periods=1).mean().shift(1))
df['ATCG'] = df.groupby('Away')['HG'].transform(lambda x: x.rolling(last_games, min_periods=1).mean().shift(1))
df['LAGS'] = df['AG'].transform(lambda x: x.rolling(50, min_periods=1).mean().shift(1))

df['HATT'] = df['HTSG'] / df['LHGS']
df['HDEF'] = df['HTCG'] / df['LAGS']
df['AATT'] = df['ATSG'] / df['LAGS']
df['ADEF'] = df['ATCG'] / df['LHGS']

#Calculating expected goals regarding to team attack and defence (poisson similar)
df['HXG'] = df['HATT'] * df['ADEF'] * df['LHGS']
df['AXG'] = df['HDEF'] * df['AATT'] * df['LAGS']
df['TXG'] = df['HXG'] + df['AXG']

#Calculating expected goals according to poisson distribution
goals = range(10)
home_poisson = [] 
away_poisson = []

for i in range(len(df)):
    home_xg = [poisson.pmf(goal, df['HXG'][i]) for goal in goals]
    away_xg = [poisson.pmf(goal, df['AXG'][i]) for goal in goals]
    home_poisson.append(home_xg.index(max(home_xg)))
    away_poisson.append(away_xg.index(max(away_xg)))

df['HP'] = home_poisson 
df['AP'] = away_poisson
df['TP'] = df['HP'] + df['AP']

new_df = df[['TG','P1.5O','P2.5O','P3.5O','TXG', 'TP', '1.5O', '2.5O', '3.5O']].dropna()
new_df.head(7)

Unnamed: 0,TG,P1.5O,P2.5O,P3.5O,TXG,TP,1.5O,2.5O,3.5O
11,5,1.0,1.0,0.0,3.0,2,1,1,1
15,4,1.0,0.0,0.0,2.727273,2,1,1,1
17,3,1.0,0.0,0.0,1.80163,1,1,1,0
18,3,0.0,0.0,0.0,0.0,0,1,1,0
19,3,1.0,0.0,0.0,0.0,0,1,1,0
21,4,1.0,1.0,0.5,2.625,2,1,1,1
22,2,0.0,0.0,0.0,0.0,0,1,0,0


In [15]:
#Checking some classification models
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import roc_auc_score

# Split the data into training and testing sets
X = new_df.drop(columns=['TG','1.5O', '2.5O', '3.5O'], axis = 1)  # Features
y = new_df['3.5O']  # Target variable

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [16]:
# Initialize models
models = {
    'Logistic Regression': LogisticRegression(penalty = 'l1', max_iter = 1000, C = 1, solver = 'liblinear'),
    'Support Vector Machine': SVC(probability = True, C = 10, degree = 2, gamma = 'scale', kernel = 'linear'),
    'Naive Bayes': GaussianNB(var_smoothing = 1e-09),
    'Gradient Boosting': GradientBoostingClassifier(learning_rate = 0.01, max_depth = 3, min_samples_leaf = 1, 
                                                    min_samples_split = 10, n_estimators = 100),
}

# Train and evaluate models
results = {}
for name, model in models.items():
    # Perform cross-validation
    cv_scores = cross_val_score(model, X, y, cv=5, scoring='roc_auc')
    # Store the mean ROC AUC score
    results[name] = cv_scores.mean()

# Print ROC AUC scores
for name, roc_auc in results.items():
    print(f'{name}: ROC AUC = {roc_auc}')

Logistic Regression: ROC AUC = 0.5671665835632022
Support Vector Machine: ROC AUC = 0.5673293850292793
Naive Bayes: ROC AUC = 0.5615214185837646
Gradient Boosting: ROC AUC = 0.5449239158555819
