# Winning prediction from bettings odds of all 4 companies

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [3]:
df = pd.read_csv('../data/matches.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22467 entries, 0 to 22466
Data columns (total 19 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   id                22467 non-null  int64  
 1   date              22467 non-null  object 
 2   season_name       22467 non-null  object 
 3   match_api_id      22467 non-null  int64  
 4   home_team_api_id  22467 non-null  int64  
 5   away_team_api_id  22467 non-null  int64  
 6   win_lose_draw     22467 non-null  object 
 7   B365H             22467 non-null  float64
 8   B365D             22467 non-null  float64
 9   B365A             22467 non-null  float64
 10  BWH               22467 non-null  float64
 11  BWD               22467 non-null  float64
 12  BWA               22467 non-null  float64
 13  IWH               22467 non-null  float64
 14  IWD               22467 non-null  float64
 15  IWA               22467 non-null  float64
 16  LBH               22467 non-null  float6

## Prepare data and try different shallow models

In [4]:
feature_cols = list(
    set(df.columns.tolist()) - 
    {'id', 'date', 'match_api_id', 'home_team_api_id', 'away_team_api_id', 'win_lose_draw', 'season_name'}
    )
target_label = 'win_lose_draw'

X = df[feature_cols]
y = df[target_label]

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)


### Logistic Regression

In [6]:
from sklearn.linear_model import LogisticRegression

logreg = LogisticRegression()
logreg.fit(X_train, y_train)

print('Accuracy of Logistic regression classifier on training set: {:.2f}'
     .format(logreg.score(X_train, y_train)))
print('Accuracy of Logistic regression classifier on test set: {:.2f}'
     .format(logreg.score(X_test, y_test)))

Accuracy of Logistic regression classifier on training set: 0.53
Accuracy of Logistic regression classifier on test set: 0.53


#### A shot at 10 fold cross validation

In [6]:
from numpy import mean, std
from sklearn.model_selection import KFold, cross_val_score

scores = cross_val_score(LogisticRegression(), X, y, cv=KFold(n_splits=10, random_state=42, shuffle=True))

print('Accuracy: %.3f (%.3f)' % (mean(scores), std(scores)))

Accuracy: 0.532 (0.011)


### Decision Tree

In [7]:
from sklearn.tree import DecisionTreeClassifier

clf = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)

print('Accuracy of Decision Tree classifier on training set: {:.2f}'
     .format(clf.score(X_train, y_train)))
print('Accuracy of Decision Tree classifier on test set: {:.2f}'
     .format(clf.score(X_test, y_test)))

Accuracy of Decision Tree classifier on training set: 1.00
Accuracy of Decision Tree classifier on test set: 0.41


In [8]:
scores = cross_val_score(DecisionTreeClassifier(), X, y, cv=KFold(n_splits=10, random_state=42, shuffle=True))
print('Accuracy: %.3f (%.3f)' % (mean(scores), std(scores)))

Accuracy: 0.417 (0.008)


### K-nearest neighbors

In [9]:
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(X_train, y_train)

print('Accuracy of K-NN classifier on training set: {:.2f}'
     .format(knn.score(X_train, y_train)))
print('Accuracy of K-NN classifier on test set: {:.2f}'
     .format(knn.score(X_test, y_test)))

Accuracy of K-NN classifier on training set: 0.69
Accuracy of K-NN classifier on test set: 0.45


In [10]:
scores = cross_val_score(KNeighborsClassifier(), X, y, cv=KFold(n_splits=10, random_state=42, shuffle=True))
print('Accuracy: %.3f (%.3f)' % (mean(scores), std(scores)))

Accuracy: 0.452 (0.010)


### Linear Discriminant Analysis

In [11]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

lda = LinearDiscriminantAnalysis()
lda.fit(X_train, y_train)

print('Accuracy of LDA classifier on training set: {:.2f}'
     .format(lda.score(X_train, y_train)))
print('Accuracy of LDA classifier on test set: {:.2f}'
     .format(lda.score(X_test, y_test)))

Accuracy of LDA classifier on training set: 0.53
Accuracy of LDA classifier on test set: 0.53


In [12]:
scores = cross_val_score(LinearDiscriminantAnalysis(), X, y, cv=KFold(n_splits=10, random_state=42, shuffle=True))
print('Accuracy: %.3f (%.3f)' % (mean(scores), std(scores)))

Accuracy: 0.527 (0.012)


### Gaußian Naive Bayes

In [13]:
from sklearn.naive_bayes import GaussianNB

gnb = GaussianNB()
gnb.fit(X_train, y_train)

print('Accuracy of GNB classifier on training set: {:.2f}'
     .format(gnb.score(X_train, y_train)))
print('Accuracy of GNB classifier on test set: {:.2f}'
     .format(gnb.score(X_test, y_test)))

Accuracy of GNB classifier on training set: 0.42
Accuracy of GNB classifier on test set: 0.41


In [14]:
scores = cross_val_score(GaussianNB(), X, y, cv=KFold(n_splits=10, random_state=42, shuffle=True))
print('Accuracy: %.3f (%.3f)' % (mean(scores), std(scores)))

Accuracy: 0.419 (0.009)


### Support Vector Machine

In [None]:
from sklearn.svm import SVC

svm = SVC(kernel='linear')
svm.fit(X_train, y_train)

print('Accuracy of SVM classifier on training set: {:.2f}'
     .format(svm.score(X_train, y_train)))
print('Accuracy of SVM classifier on test set: {:.2f}'
     .format(svm.score(X_test, y_test)))

Accuracy of SVM classifier on training set: 0.52
Accuracy of SVM classifier on test set: 0.52


In [None]:
scores = cross_val_score(SVC(), X, y, cv=KFold(n_splits=10, random_state=42, shuffle=True))
print('Accuracy: %.3f (%.3f)' % (mean(scores), std(scores)))

Accuracy: 0.530 (0.013)


## Multi-layer perceptron

### Linear MLP, single hidden layer

#### One layer of 1 neuron

In [15]:
from sklearn.neural_network import MLPClassifier

mlp = MLPClassifier(random_state=42, activation='identity', hidden_layer_sizes=(1,))
mlp.fit(X_train, y_train)

print('Accuracy of MLP classifier on training set: {:.2f}'
     .format(mlp.score(X_train, y_train)))
print('Accuracy of MLP classifier on test set: {:.2f}'
     .format(mlp.score(X_test, y_test)))

Accuracy of MLP classifier on training set: 0.53
Accuracy of MLP classifier on test set: 0.53


In [16]:
scores = cross_val_score(mlp, X, y, cv=KFold(n_splits=10, random_state=42, shuffle=True))
print('Accuracy: %.3f (%.3f)' % (mean(scores), std(scores)))

Accuracy: 0.532 (0.011)


#### One layer of 10 neurons

In [20]:
mlp = MLPClassifier(random_state=42, activation='identity', hidden_layer_sizes=(10,))
mlp.fit(X_train, y_train)

print('Accuracy of MLP classifier on training set: {:.2f}'
     .format(mlp.score(X_train, y_train)))
print('Accuracy of MLP classifier on test set: {:.2f}'
     .format(mlp.score(X_test, y_test)))

Accuracy of MLP classifier on training set: 0.53
Accuracy of MLP classifier on test set: 0.53


In [21]:
scores = cross_val_score(mlp, X, y, cv=KFold(n_splits=10, random_state=42, shuffle=True))
print('Accuracy: %.3f (%.3f)' % (mean(scores), std(scores)))

Accuracy: 0.529 (0.010)


### Linear MLP, two hidden layers

In [22]:
mlp = MLPClassifier(random_state=42, activation='identity', hidden_layer_sizes=(1,1))
mlp.fit(X_train, y_train)

print('Accuracy of MLP classifier on training set: {:.2f}'
     .format(mlp.score(X_train, y_train)))
print('Accuracy of MLP classifier on test set: {:.2f}'
     .format(mlp.score(X_test, y_test)))

Accuracy of MLP classifier on training set: 0.53
Accuracy of MLP classifier on test set: 0.53


In [23]:
scores = cross_val_score(mlp, X, y, cv=KFold(n_splits=10, random_state=42, shuffle=True))
print('Accuracy: %.3f (%.3f)' % (mean(scores), std(scores)))

Accuracy: 0.530 (0.012)


In [24]:
mlp = MLPClassifier(random_state=42, activation='identity', hidden_layer_sizes=(10,10))
mlp.fit(X_train, y_train)

print('Accuracy of MLP classifier on training set: {:.2f}'
     .format(mlp.score(X_train, y_train)))
print('Accuracy of MLP classifier on test set: {:.2f}'
     .format(mlp.score(X_test, y_test)))

Accuracy of MLP classifier on training set: 0.53
Accuracy of MLP classifier on test set: 0.53


### Non-linear MLP, one hidden layer

#### One layer of 1 neuron

In [25]:
from sklearn.neural_network import MLPClassifier

mlp = MLPClassifier(random_state=42, activation='relu', hidden_layer_sizes=(1,))
mlp.fit(X_train, y_train)

print('Accuracy of MLP classifier on training set: {:.2f}'
     .format(mlp.score(X_train, y_train)))
print('Accuracy of MLP classifier on test set: {:.2f}'
     .format(mlp.score(X_test, y_test)))

Accuracy of MLP classifier on training set: 0.53
Accuracy of MLP classifier on test set: 0.53


In [26]:
scores = cross_val_score(mlp, X, y, cv=KFold(n_splits=10, random_state=42, shuffle=True))
print('Accuracy: %.3f (%.3f)' % (mean(scores), std(scores)))

Accuracy: 0.531 (0.011)


#### One layer of 10 neurons

In [27]:
mlp = MLPClassifier(random_state=42, activation='relu', hidden_layer_sizes=(10,))
mlp.fit(X_train, y_train)

print('Accuracy of MLP classifier on training set: {:.2f}'
     .format(mlp.score(X_train, y_train)))
print('Accuracy of MLP classifier on test set: {:.2f}'
     .format(mlp.score(X_test, y_test)))

Accuracy of MLP classifier on training set: 0.53
Accuracy of MLP classifier on test set: 0.53


In [None]:
scores = cross_val_score(mlp, X, y, cv=KFold(n_splits=10, random_state=42, shuffle=True))
print('Accuracy: %.3f (%.3f)' % (mean(scores), std(scores)))

Accuracy: 0.529 (0.010)


### Non-Linear MLP, two hidden layers

In [28]:
mlp = MLPClassifier(random_state=42, activation='relu', hidden_layer_sizes=(1,1))
mlp.fit(X_train, y_train)

print('Accuracy of MLP classifier on training set: {:.2f}'
     .format(mlp.score(X_train, y_train)))
print('Accuracy of MLP classifier on test set: {:.2f}'
     .format(mlp.score(X_test, y_test)))

Accuracy of MLP classifier on training set: 0.46
Accuracy of MLP classifier on test set: 0.46


In [None]:
scores = cross_val_score(mlp, X, y, cv=KFold(n_splits=10, random_state=42, shuffle=True))
print('Accuracy: %.3f (%.3f)' % (mean(scores), std(scores)))

Accuracy: 0.530 (0.012)


In [29]:
mlp = MLPClassifier(random_state=42, activation='relu', hidden_layer_sizes=(10,10))
mlp.fit(X_train, y_train)

print('Accuracy of MLP classifier on training set: {:.2f}'
     .format(mlp.score(X_train, y_train)))
print('Accuracy of MLP classifier on test set: {:.2f}'
     .format(mlp.score(X_test, y_test)))

Accuracy of MLP classifier on training set: 0.53
Accuracy of MLP classifier on test set: 0.53


In [30]:
mlp = MLPClassifier(random_state=42, activation='relu', hidden_layer_sizes=(100,100))
mlp.fit(X_train, y_train)

print('Accuracy of MLP classifier on training set: {:.2f}'
     .format(mlp.score(X_train, y_train)))
print('Accuracy of MLP classifier on test set: {:.2f}'
     .format(mlp.score(X_test, y_test)))

Accuracy of MLP classifier on training set: 0.56
Accuracy of MLP classifier on test set: 0.51


## Non-linear MLP, three hidden layers and more

In [31]:
mlp = MLPClassifier(random_state=42, activation='relu', hidden_layer_sizes=(10, 10, 10))
mlp.fit(X_train, y_train)

print('Accuracy of MLP classifier on training set: {:.2f}'
     .format(mlp.score(X_train, y_train)))
print('Accuracy of MLP classifier on test set: {:.2f}'
     .format(mlp.score(X_test, y_test)))

Accuracy of MLP classifier on training set: 0.53
Accuracy of MLP classifier on test set: 0.53


In [32]:
mlp = MLPClassifier(random_state=42, activation='relu', hidden_layer_sizes=(10, 10, 10, 10, 10))
mlp.fit(X_train, y_train)

print('Accuracy of MLP classifier on training set: {:.2f}'
     .format(mlp.score(X_train, y_train)))
print('Accuracy of MLP classifier on test set: {:.2f}'
     .format(mlp.score(X_test, y_test)))

Accuracy of MLP classifier on training set: 0.53
Accuracy of MLP classifier on test set: 0.53


In [33]:
mlp = MLPClassifier(random_state=42, activation='relu', hidden_layer_sizes=(100, 100, 100, 100, 100))
mlp.fit(X_train, y_train)

print('Accuracy of MLP classifier on training set: {:.2f}'
     .format(mlp.score(X_train, y_train)))
print('Accuracy of MLP classifier on test set: {:.2f}'
     .format(mlp.score(X_test, y_test)))

Accuracy of MLP classifier on training set: 0.74
Accuracy of MLP classifier on test set: 0.45


## Confusion matrix of Logistic Regression

Logistic Regression was the best performing simplest model.

In [7]:
from sklearn.metrics import confusion_matrix, classification_report

pred = logreg.predict(X_test)
print(confusion_matrix(y_test, pred))
print(classification_report(y_test, pred))

[[ 568    1  744]
 [ 265    1  831]
 [ 259    0 1825]]
              precision    recall  f1-score   support

           A       0.52      0.43      0.47      1313
           D       0.50      0.00      0.00      1097
           H       0.54      0.88      0.67      2084

    accuracy                           0.53      4494
   macro avg       0.52      0.44      0.38      4494
weighted avg       0.52      0.53      0.45      4494

