In [10]:
import pandas as pd
import numpy as np

In [11]:
df = pd.read_csv('china.csv')
df.head()

Unnamed: 0,Country,League,Season,Date,Time,Home,Away,HG,AG,Res,...,PSCA,MaxCH,MaxCD,MaxCA,AvgCH,AvgCD,AvgCA,BFECH,BFECD,BFECA
0,China,Super League,2014,07/03/2014,11:00,Shandong Luneng,Zhejiang Yiteng,1,0,H,...,,1.3,6.5,13.0,1.21,5.72,9.77,,,
1,China,Super League,2014,08/03/2014,08:30,Guangzhou Evergrande,Henan Songshan Longmen,3,0,H,...,17.5,1.19,8.8,23.0,1.14,6.64,15.35,,,
2,China,Super League,2014,08/03/2014,11:30,Beijing Guoan,Changchun Yatai,1,0,H,...,9.7,1.45,4.8,9.7,1.39,4.27,7.18,,,
3,China,Super League,2014,08/03/2014,11:35,Hangzhou Greentown,Dalian Yifang F.C.,1,1,D,...,2.63,3.0,3.25,2.78,2.7,3.1,2.48,,,
4,China,Super League,2014,08/03/2014,11:35,Jiangsu Suning,Beijing Renhe,0,0,D,...,4.38,2.38,3.5,4.38,2.01,3.23,3.48,,,


In [12]:
#Calculating odds probabilities
total = 1 / df['PSCH'] + 1 / df['PSCD'] + 1 / df['PSCA']
last_games = 3

df['FTH'] = (1 / df['PSCH']) / total
draw_percentage = (1 / df['PSCD']) / total
away_percentage = (1 / df['PSCA']) / total
df['FTDA'] = draw_percentage + away_percentage

#Encoding FTR as 1 or 0 to FTRT
df['FTRT'] = [1 if x == 'H' else 0 for x in df['Res']]

#Calculating Points Per Game for Home and Away Teams
df['HPTS'] = np.select([df['Res'] == 'H', df['Res'] == 'D', df['Res'] == 'A'], [3, 1, 0], default=0)
df['APTS'] = np.select([df['Res'] == 'H', df['Res'] == 'D', df['Res'] == 'A'], [0, 1, 3], default=0)

df['HPPG'] = df.groupby('Home')['HPTS'].transform(lambda x: x.rolling(last_games, min_periods=1).mean().shift(1))
df['APPG'] = df.groupby('Away')['APTS'].transform(lambda x: x.rolling(last_games, min_periods=1).mean().shift(1))
df['PPGDif'] = df['HPPG'] - df['APPG']

#Calculating scored and conceded goals for each team to calculate attack and defence powers
df['HTSG'] = df.groupby('Home')['HG'].transform(lambda x: x.rolling(last_games, min_periods=1).mean().shift(1))
df['HTCG'] = df.groupby('Home')['AG'].transform(lambda x: x.rolling(last_games, min_periods=1).mean().shift(1))
df['LHGS'] = df['HG'].transform(lambda x: x.rolling(50, min_periods=1).mean().shift(1))

df['ATSG'] = df.groupby('Away')['AG'].transform(lambda x: x.rolling(last_games, min_periods=1).mean().shift(1))
df['ATCG'] = df.groupby('Away')['HG'].transform(lambda x: x.rolling(last_games, min_periods=1).mean().shift(1))
df['LAGS'] = df['AG'].transform(lambda x: x.rolling(50, min_periods=1).mean().shift(1))


df['HATT'] = df['HTSG'] / df['LHGS']
df['HDEF'] = df['HTCG'] / df['LAGS']
df['AATT'] = df['ATSG'] / df['LAGS']
df['ADEF'] = df['ATCG'] / df['LHGS']

#Calculating expected goals regarding to team attack and defence (poisson similar)
df['HXG'] = df['HATT'] * df['ADEF'] * df['LHGS']
df['AXG'] = df['HDEF'] * df['AATT'] * df['LAGS']
df['XGDif'] = df['HXG'] - df['AXG']
df['PRBDif'] = df['FTH'] - df['FTDA']

#Preparing new_df with necessary columns only
new_df = df[['Home', 'Away', 'FTRT', 'PRBDif', 'PPGDif', 'XGDif']].dropna()

new_df.tail(7)

Unnamed: 0,Home,Away,FTRT,PRBDif,PPGDif,XGDif
2513,Tianjin Jinmen Tiger,Wuhan Three Towns,1,-0.076859,1.0,0.086806
2514,Beijing Guoan,Zhejiang Professional,0,0.077456,2.0,1.200419
2515,Nantong Zhiyun,Qingdao West Coast,0,-0.101365,0.333333,-0.405167
2516,Qingdao Hainiu,Shenzhen Xinpengcheng,1,-0.27305,1.666667,1.986175
2517,Meizhou Hakka,Changchun Yatai,1,-0.071215,2.0,0.841346
2518,Shanghai Shenhua,Shanghai Port,1,-0.443377,0.0,-1.041667
2519,Shandong Taishan,Chengdu Rongcheng,1,-0.717638,-1.0,-0.803755


In [13]:
#Checking some classification models
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from imblearn.ensemble import BalancedRandomForestClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import roc_auc_score

# Split the data into training and testing sets
X = new_df.drop(columns=['FTRT', 'Home', 'Away'], axis = 1)  # Features
y = new_df['FTRT']  # Target variable

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=42)

# Initialize models
models = {
    'Logistic Regression': LogisticRegression(penalty = 'l1', max_iter = 1000, C = 1, solver = 'liblinear'),
    'Support Vector Machine': SVC(probability = True, C = 10, degree = 2, gamma = 'scale', kernel = 'linear'),
    'Naive Bayes': GaussianNB(var_smoothing = 1e-09),
    'Gradient Boosting': GradientBoostingClassifier(learning_rate = 0.01, max_depth = 3, min_samples_leaf = 1, 
                                                    min_samples_split = 10, n_estimators = 100)
}

# Train and evaluate models
results = {}
for name, model in models.items():
    # Perform cross-validation
    cv_scores = cross_val_score(model, X, y, cv=5, scoring='roc_auc')
    # Store the mean ROC AUC score
    results[name] = cv_scores.mean()

# Print ROC AUC scores
for name, roc_auc in results.items():
    print(f'{name}: ROC AUC = {roc_auc}')

Logistic Regression: ROC AUC = 0.7494196131655482
Support Vector Machine: ROC AUC = 0.7494467271312685
Naive Bayes: ROC AUC = 0.7228718833542462
Gradient Boosting: ROC AUC = 0.742070053595891


In [14]:
#Hyperparameter tuning best model Logistic Regression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_auc_score

# Define hyperparameter tuning space for logistic regression
param_grid = {
    'penalty': ['l1','l2'],
    'C': [0.1, 1, 10],
    'max_iter': [500, 1000, 2000],
    'solver': ['liblinear']
}

# Define logistic regression model
log_reg = LogisticRegression()

# Perform grid search with cross-validation
grid_search = GridSearchCV(log_reg, param_grid, cv=5, scoring='roc_auc')
grid_search.fit(X_train, y_train)

# Get the best model parameters and ROC-AUC score
best_params = grid_search.best_params_
best_score = grid_search.best_score_

print("Best model parameters:", best_params)
print("Best ROC-AUC score:", best_score)

# Train the best model on the entire training set
best_model = LogisticRegression(**best_params)
best_model.fit(X_train, y_train)

# Evaluate the best model on the test set
y_pred_proba = best_model.predict_proba(X_test)[:, 1]
roc_auc = roc_auc_score(y_test, y_pred_proba)
print("ROC-AUC score on test set:", roc_auc)

Best model parameters: {'C': 0.1, 'max_iter': 500, 'penalty': 'l1', 'solver': 'liblinear'}
Best ROC-AUC score: 0.7494725887462131
ROC-AUC score on test set: 0.7525684637322614


In [15]:
#Trying MLP model for the prediction
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.callbacks import EarlyStopping

# Define the neural network model
model = Sequential()
model.add(Dense(64, activation='relu', input_shape=(X.shape[1],)))
model.add(Dense(32, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

# Compile the model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Define early stopping callback
early_stopping = EarlyStopping(monitor='val_loss', patience=5, min_delta=0.001)

# Train the model with early stopping
model.fit(X_train, y_train, epochs=50, batch_size=32, validation_data=(X_test, y_test), 
          callbacks=[early_stopping], verbose=2)

# Evaluate the model on the test set
y_pred_proba = model.predict(X_test)
roc_auc = roc_auc_score(y_test, y_pred_proba)
print(f'Test ROC-AUC score: {roc_auc:.3f}')

Epoch 1/50


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


61/61 - 2s - 32ms/step - accuracy: 0.6436 - loss: 0.6468 - val_accuracy: 0.6825 - val_loss: 0.6121
Epoch 2/50
61/61 - 0s - 3ms/step - accuracy: 0.6756 - loss: 0.6124 - val_accuracy: 0.6928 - val_loss: 0.5970
Epoch 3/50
61/61 - 0s - 3ms/step - accuracy: 0.6797 - loss: 0.5971 - val_accuracy: 0.6845 - val_loss: 0.5896
Epoch 4/50
61/61 - 0s - 3ms/step - accuracy: 0.6880 - loss: 0.5943 - val_accuracy: 0.6804 - val_loss: 0.5953
Epoch 5/50
61/61 - 0s - 3ms/step - accuracy: 0.6911 - loss: 0.5903 - val_accuracy: 0.6742 - val_loss: 0.5922
Epoch 6/50
61/61 - 0s - 3ms/step - accuracy: 0.6911 - loss: 0.5879 - val_accuracy: 0.6763 - val_loss: 0.5913
Epoch 7/50
61/61 - 0s - 3ms/step - accuracy: 0.6880 - loss: 0.5883 - val_accuracy: 0.6804 - val_loss: 0.5938
Epoch 8/50
61/61 - 0s - 3ms/step - accuracy: 0.6900 - loss: 0.5860 - val_accuracy: 0.6866 - val_loss: 0.5954
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step
Test ROC-AUC score: 0.743
