In [1]:
# Importing the dependencies
import pandas as pd
import joblib

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

from keras.utils import to_categorical
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix

Using TensorFlow backend.


In [2]:
# Loading the dataset
df = pd.read_csv('dataset/football_data.csv', index_col=False)
df.head().T

Unnamed: 0,0,1,2,3,4
Div,E0,E0,E0,E0,E0
Date,10-08-2018,11-08-2018,11-08-2018,11-08-2018,11-08-2018
HomeTeam,Man United,Bournemouth,Fulham,Huddersfield,Newcastle
AwayTeam,Leicester,Cardiff,Crystal Palace,Chelsea,Tottenham
FTHG,2,2,0,0,1
FTAG,1,0,2,3,2
FTR,H,H,A,A,A
HTHG,1,1,0,0,1
HTAG,0,0,1,2,2
HTR,H,H,A,A,A


In [3]:
# Finding for any null values
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 380 entries, 0 to 379
Data columns (total 23 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Div       380 non-null    object
 1   Date      380 non-null    object
 2   HomeTeam  380 non-null    object
 3   AwayTeam  380 non-null    object
 4   FTHG      380 non-null    int64 
 5   FTAG      380 non-null    int64 
 6   FTR       380 non-null    object
 7   HTHG      380 non-null    int64 
 8   HTAG      380 non-null    int64 
 9   HTR       380 non-null    object
 10  Referee   380 non-null    object
 11  HS        380 non-null    int64 
 12  AS        380 non-null    int64 
 13  HST       380 non-null    int64 
 14  AST       380 non-null    int64 
 15  HF        380 non-null    int64 
 16  AF        380 non-null    int64 
 17  HC        380 non-null    int64 
 18  AC        380 non-null    int64 
 19  HY        380 non-null    int64 
 20  AY        380 non-null    int64 
 21  HR        380 no

In [4]:
df.columns

Index(['Div', 'Date', 'HomeTeam', 'AwayTeam', 'FTHG', 'FTAG', 'FTR', 'HTHG',
       'HTAG', 'HTR', 'Referee', 'HS', 'AS', 'HST', 'AST', 'HF', 'AF', 'HC',
       'AC', 'HY', 'AY', 'HR', 'AR'],
      dtype='object')

In [5]:
# Dropping unnecessary columns
df.drop(['Div', 'Date', 'HomeTeam', 'AwayTeam', 'Referee', 'HTR'],
        axis=1,
        inplace=True)

In [6]:
# Finding total trainable features -1, as one feature is used as our label for prediction
print('Trainable features: ', df.shape[1] - 1)

Trainable features:  16


In [7]:
# Now to normalize the data, we will drop our label and use Min-Max normalization
data = df.drop(['FTR'], axis=1)
x = data.values
scaled = (data - data.min()) / (data.max() - data.min())
data = pd.DataFrame(scaled)
data.head()

Unnamed: 0,FTHG,FTAG,HTHG,HTAG,HS,AS,HST,AST,HF,AF,HC,AC,HY,AY,HR,AR
0,0.333333,0.166667,0.25,0.0,0.222222,0.478261,0.428571,0.333333,0.478261,0.277778,0.125,0.357143,0.333333,0.2,0.0,0.0
1,0.333333,0.0,0.25,0.0,0.333333,0.347826,0.285714,0.083333,0.478261,0.333333,0.4375,0.285714,0.166667,0.2,0.0,0.0
2,0.0,0.333333,0.0,0.333333,0.416667,0.347826,0.428571,0.75,0.391304,0.444444,0.3125,0.357143,0.166667,0.4,0.0,0.0
3,0.0,0.5,0.0,0.666667,0.166667,0.478261,0.071429,0.333333,0.391304,0.277778,0.125,0.357143,0.333333,0.2,0.0,0.0
4,0.166667,0.333333,0.25,0.666667,0.416667,0.565217,0.142857,0.416667,0.478261,0.5,0.1875,0.357143,0.333333,0.4,0.0,0.0


In [8]:
# Now we will join our label back again to the new dataframe and use this dataframe as our dataset for building the model
data = data.join(other=df['FTR'], how='left')
data.head()

Unnamed: 0,FTHG,FTAG,HTHG,HTAG,HS,AS,HST,AST,HF,AF,HC,AC,HY,AY,HR,AR,FTR
0,0.333333,0.166667,0.25,0.0,0.222222,0.478261,0.428571,0.333333,0.478261,0.277778,0.125,0.357143,0.333333,0.2,0.0,0.0,H
1,0.333333,0.0,0.25,0.0,0.333333,0.347826,0.285714,0.083333,0.478261,0.333333,0.4375,0.285714,0.166667,0.2,0.0,0.0,H
2,0.0,0.333333,0.0,0.333333,0.416667,0.347826,0.428571,0.75,0.391304,0.444444,0.3125,0.357143,0.166667,0.4,0.0,0.0,A
3,0.0,0.5,0.0,0.666667,0.166667,0.478261,0.071429,0.333333,0.391304,0.277778,0.125,0.357143,0.333333,0.2,0.0,0.0,A
4,0.166667,0.333333,0.25,0.666667,0.416667,0.565217,0.142857,0.416667,0.478261,0.5,0.1875,0.357143,0.333333,0.4,0.0,0.0,A


In [9]:
data['FTR'].unique()

array(['H', 'A', 'D'], dtype=object)

In [10]:
data['FTR'].replace(to_replace='H', value=0, inplace=True)
data['FTR'].replace(to_replace='A', value=1, inplace=True)
data['FTR'].replace(to_replace='D', value=2, inplace=True)

In [11]:
encoded = to_categorical(data['FTR'].values)
encoded

array([[1., 0., 0.],
       [1., 0., 0.],
       [0., 1., 0.],
       ...,
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 1., 0.]], dtype=float32)

In [12]:
data['FTR'] = encoded

In [13]:
X = data.drop(['FTR'], axis=1)
Y = data['FTR']

In [14]:
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    Y,
                                                    test_size=0.2,
                                                    random_state=0)

In [15]:
# Here we will consider 4 classification algorithms for building the model
models = [LogisticRegression(), RandomForestClassifier(), SVC()]

for m in models:
    print(f'{m.__class__.__name__}')
    model = m.fit(X_train, y_train)
    print(f'Accuracy: {model.score(X_test, y_test)}')
    print(
        f'Confusion matrix:\n{confusion_matrix(y_true=y_test, y_pred=model.predict(X_test))}\n'
    )

LogisticRegression
Accuracy: 0.9210526315789473
Confusion matrix:
[[31  4]
 [ 2 39]]

RandomForestClassifier
Accuracy: 0.9736842105263158
Confusion matrix:
[[34  1]
 [ 1 40]]

SVC
Accuracy: 0.9605263157894737
Confusion matrix:
[[33  2]
 [ 1 40]]



In [16]:
random_forest_model = RandomForestClassifier()
model = random_forest_model.fit(X_train, y_train)

print(f'Accuracy: {model.score(X_test, y_test)}')

Accuracy: 0.9605263157894737


In [17]:
joblib.dump(model, 'Predict_Winner_App/model/model.joblib', compress=True)

['Predict_Winner_App/model/model.joblib']