In [189]:
import pandas as pd
import numpy as np 
import os

In [190]:
# matches.csv is read relative to the working directory. The original location
# is kept as the default, but it is machine-specific, so it can be overridden
# by setting the MATCHES_DATA_DIR environment variable before starting the kernel.
os.chdir(os.environ.get('MATCHES_DATA_DIR', '/Users/manishrathor/Documents/Data Science Projects'))

In [191]:
# load the match data, using the first CSV column as the row index
df = pd.read_csv(filepath_or_buffer='matches.csv', index_col=0)
df.head(5)

Unnamed: 0,Date,Time,Comp,Round,Day,Venue,Result,GF,GA,Opponent,...,Match Report,Notes,Sh,SoT,Dist,FK,PK,PKatt,Season,Team
1,2022-08-07,16:30,Premier League,Matchweek 1,Sun,Away,W,2,0,West Ham,...,Match Report,,13.0,1.0,18.7,1.0,1,1,2023,Manchester City
2,2022-08-13,15:00,Premier League,Matchweek 2,Sat,Home,W,4,0,Bournemouth,...,Match Report,,19.0,7.0,17.5,0.0,0,0,2023,Manchester City
3,2022-08-21,16:30,Premier League,Matchweek 3,Sun,Away,D,3,3,Newcastle Utd,...,Match Report,,21.0,10.0,16.2,1.0,0,0,2023,Manchester City
4,2022-08-27,15:00,Premier League,Matchweek 4,Sat,Home,W,4,2,Crystal Palace,...,Match Report,,18.0,5.0,14.1,0.0,0,0,2023,Manchester City
5,2022-08-31,19:30,Premier League,Matchweek 5,Wed,Home,W,6,0,Nott'ham Forest,...,Match Report,,17.0,9.0,14.8,0.0,0,0,2023,Manchester City


In [192]:
# sanity check on the row count: 38 games per team x 20 teams x 3 seasons
# means we expect 2280 rows (one row per team per match)
print(df.shape)
3 * 20 * 38
# the row count matches the expected total, so no rows are missing

(2280, 27)


2280

In [193]:
# listing the available columns - our model can only take numeric types,
# so we first need to see what we have before deciding what to keep and encode
df.columns

Index(['Date', 'Time', 'Comp', 'Round', 'Day', 'Venue', 'Result', 'GF', 'GA',
       'Opponent', 'xG', 'xGA', 'Poss', 'Attendance', 'Captain', 'Formation',
       'Referee', 'Match Report', 'Notes', 'Sh', 'SoT', 'Dist', 'FK', 'PK',
       'PKatt', 'Season', 'Team'],
      dtype='object')

In [194]:
# removing the columns we will not use
df = df.drop(
    columns=['Comp', 'Round', 'Captain', 'Formation', 'Referee', 'Match Report', 'Notes', 'Season']
)
df

Unnamed: 0,Date,Time,Day,Venue,Result,GF,GA,Opponent,xG,xGA,Poss,Attendance,Sh,SoT,Dist,FK,PK,PKatt,Team
1,2022-08-07,16:30,Sun,Away,W,2,0,West Ham,2.2,0.5,75.0,62443.0,13.0,1.0,18.7,1.0,1,1,Manchester City
2,2022-08-13,15:00,Sat,Home,W,4,0,Bournemouth,1.7,0.1,67.0,53453.0,19.0,7.0,17.5,0.0,0,0,Manchester City
3,2022-08-21,16:30,Sun,Away,D,3,3,Newcastle Utd,2.1,1.8,69.0,52258.0,21.0,10.0,16.2,1.0,0,0,Manchester City
4,2022-08-27,15:00,Sat,Home,W,4,2,Crystal Palace,2.2,0.1,74.0,53112.0,18.0,5.0,14.1,0.0,0,0,Manchester City
5,2022-08-31,19:30,Wed,Home,W,6,0,Nott'ham Forest,3.3,0.7,74.0,53409.0,17.0,9.0,14.8,0.0,0,0,Manchester City
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
38,2021-05-02,19:15,Sun,Away,L,0,4,Tottenham,0.5,2.2,34.0,,8.0,1.0,18.2,0.0,0,0,Sheffield United
39,2021-05-08,15:00,Sat,Home,L,0,2,Crystal Palace,0.7,2.0,50.0,,7.0,0.0,13.4,1.0,0,0,Sheffield United
40,2021-05-16,19:00,Sun,Away,W,1,0,Everton,1.2,1.4,38.0,,10.0,3.0,18.5,0.0,0,0,Sheffield United
41,2021-05-19,18:00,Wed,Away,L,0,1,Newcastle Utd,0.9,1.6,49.0,10000.0,11.0,1.0,18.3,1.0,0,0,Sheffield United


In [195]:
# checking the dtype of each remaining column - the object columns are not numeric yet
df.dtypes

Date           object
Time           object
Day            object
Venue          object
Result         object
GF              int64
GA              int64
Opponent       object
xG            float64
xGA           float64
Poss          float64
Attendance    float64
Sh            float64
SoT           float64
Dist          float64
FK            float64
PK              int64
PKatt           int64
Team           object
dtype: object

In [196]:
# parse the date strings into datetime values
df['Date'] = pd.to_datetime(df['Date'])

# keep only the hour of the time: strip everything from the first ':' onward, then cast to int
df['Time'] = df['Time'].replace(":.+", "", regex=True).astype(int)

# encode venue through its category codes (0 = Away, 1 = Home)
df['Venue'] = df['Venue'].astype('category').cat.codes

# encode the day of the week as Mon = 1 ... Sun = 7
day_codes = {
    day: code
    for code, day in enumerate(['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun'], start=1)
}
df['Day'] = df['Day'].map(day_codes)

# encode the result as win = 2, draw = 1, loss = 0
result_codes = dict(W=2, D=1, L=0)
df['Result'] = df['Result'].map(result_codes)

In [197]:
# previewing the frame after encoding Date, Time, Day, Venue and Result
df.head()

Unnamed: 0,Date,Time,Day,Venue,Result,GF,GA,Opponent,xG,xGA,Poss,Attendance,Sh,SoT,Dist,FK,PK,PKatt,Team
1,2022-08-07,16,7,0,2,2,0,West Ham,2.2,0.5,75.0,62443.0,13.0,1.0,18.7,1.0,1,1,Manchester City
2,2022-08-13,15,6,1,2,4,0,Bournemouth,1.7,0.1,67.0,53453.0,19.0,7.0,17.5,0.0,0,0,Manchester City
3,2022-08-21,16,7,0,1,3,3,Newcastle Utd,2.1,1.8,69.0,52258.0,21.0,10.0,16.2,1.0,0,0,Manchester City
4,2022-08-27,15,6,1,2,4,2,Crystal Palace,2.2,0.1,74.0,53112.0,18.0,5.0,14.1,0.0,0,0,Manchester City
5,2022-08-31,19,3,1,2,6,0,Nott'ham Forest,3.3,0.7,74.0,53409.0,17.0,9.0,14.8,0.0,0,0,Manchester City


## Encoding Summary: 

1. Time: I simply stripped the minutes from the time, keeping only the hour as an integer (0-23)
2. Venue: 0 = Away, 1 = Home

The rest are self-explanatory from the code above: Day is encoded Mon = 1 through Sun = 7, and Result is encoded L = 0, D = 1, W = 2.

In [198]:
# counting missing values per column so we can decide what to drop
df.isna().sum()

Date            0
Time            0
Day             0
Venue           0
Result          0
GF              0
GA              0
Opponent        0
xG              0
xGA             0
Poss            0
Attendance    698
Sh              0
SoT             0
Dist            1
FK              0
PK              0
PKatt           0
Team            0
dtype: int64

In [199]:
# Attendance is missing in 698 rows, so the whole column is dropped instead of those rows
df = df.drop(columns=['Attendance'])
print(df.shape)
df.head(5)

(2280, 18)


Unnamed: 0,Date,Time,Day,Venue,Result,GF,GA,Opponent,xG,xGA,Poss,Sh,SoT,Dist,FK,PK,PKatt,Team
1,2022-08-07,16,7,0,2,2,0,West Ham,2.2,0.5,75.0,13.0,1.0,18.7,1.0,1,1,Manchester City
2,2022-08-13,15,6,1,2,4,0,Bournemouth,1.7,0.1,67.0,19.0,7.0,17.5,0.0,0,0,Manchester City
3,2022-08-21,16,7,0,1,3,3,Newcastle Utd,2.1,1.8,69.0,21.0,10.0,16.2,1.0,0,0,Manchester City
4,2022-08-27,15,6,1,2,4,2,Crystal Palace,2.2,0.1,74.0,18.0,5.0,14.1,0.0,0,0,Manchester City
5,2022-08-31,19,3,1,2,6,0,Nott'ham Forest,3.3,0.7,74.0,17.0,9.0,14.8,0.0,0,0,Manchester City


In [200]:
# Dist has a single missing value, so drop just that row. The explicit .copy()
# makes df an independent frame rather than a slice of the previous one, so
# later column assignments (e.g. to Opponent) do not trigger pandas'
# SettingWithCopyWarning.
df = df[df.Dist.notna()].copy()
df.shape

(2279, 18)

In [201]:
# number of distinct opponent names, followed by the names themselves
print(df['Opponent'].unique().size)
df['Opponent'].unique()

25


array(['West Ham', 'Bournemouth', 'Newcastle Utd', 'Crystal Palace',
       "Nott'ham Forest", 'Aston Villa', 'Wolves', 'Manchester Utd',
       'Southampton', 'Liverpool', 'Brighton', 'Leicester City', 'Fulham',
       'Brentford', 'Leeds United', 'Everton', 'Chelsea', 'Tottenham',
       'Arsenal', 'Manchester City', 'Norwich City', 'Burnley', 'Watford',
       'Sheffield Utd', 'West Brom'], dtype=object)

In [202]:
# number of distinct team names, followed by the names themselves
# (compare the spellings with the Opponent column)
print(df['Team'].unique().size)
df['Team'].unique()

25


array(['Manchester City', 'Arsenal', 'Manchester United',
       'Newcastle United', 'Liverpool', 'Brighton and Hove Albion',
       'Aston Villa', 'Tottenham Hotspur', 'Brentford', 'Fulham',
       'Crystal Palace', 'Chelsea', 'Wolverhampton Wanderers',
       'West Ham United', 'Bournemouth', 'Nottingham Forest', 'Everton',
       'Leicester City', 'Leeds United', 'Southampton', 'Burnley',
       'Watford', 'Norwich City', 'West Bromwich Albion',
       'Sheffield United'], dtype=object)

In [203]:
# standardizing team names across team/opponent columns

class MissingDict(dict):
    """dict whose item lookup returns the key itself when the key is absent."""

    def __missing__(self, key):
        # called by dict.__getitem__ for unknown keys; hand the key back unchanged
        return key

# Opponent uses shortened club names while Team uses the full names. Map every
# shortened spelling onto the full one so both columns share one vocabulary.
# Names that are already identical in both columns (e.g. 'Arsenal') are not
# listed here; MissingDict passes them through unchanged.
club_names = {
    'West Ham' : 'West Ham United',
    'Newcastle Utd' : 'Newcastle United',
    'Nott\'ham Forest' : 'Nottingham Forest',
    'Wolves' : 'Wolverhampton Wanderers',
    'Manchester Utd' : 'Manchester United',
    'Brighton' : 'Brighton and Hove Albion',
    'Tottenham' : 'Tottenham Hotspur',
    # these two were previously missing, which left both clubs spelled
    # differently in Opponent than in Team
    'Sheffield Utd' : 'Sheffield United',
    'West Brom' : 'West Bromwich Albion'
}

# named club_name_lookup rather than `map` so the built-in map() is not shadowed
club_name_lookup = MissingDict(**club_names)
df.Opponent = df.Opponent.map(club_name_lookup)

In [204]:
# previewing the frame after standardizing the Opponent names
df.head()

Unnamed: 0,Date,Time,Day,Venue,Result,GF,GA,Opponent,xG,xGA,Poss,Sh,SoT,Dist,FK,PK,PKatt,Team
1,2022-08-07,16,7,0,2,2,0,West Ham United,2.2,0.5,75.0,13.0,1.0,18.7,1.0,1,1,Manchester City
2,2022-08-13,15,6,1,2,4,0,Bournemouth,1.7,0.1,67.0,19.0,7.0,17.5,0.0,0,0,Manchester City
3,2022-08-21,16,7,0,1,3,3,Newcastle United,2.1,1.8,69.0,21.0,10.0,16.2,1.0,0,0,Manchester City
4,2022-08-27,15,6,1,2,4,2,Crystal Palace,2.2,0.1,74.0,18.0,5.0,14.1,0.0,0,0,Manchester City
5,2022-08-31,19,3,1,2,6,0,Nottingham Forest,3.3,0.7,74.0,17.0,9.0,14.8,0.0,0,0,Manchester City


In [205]:
# re-checking for missing values after dropping Attendance and the row with a
# missing Dist - every column should now report 0
df.isna().sum()

Date        0
Time        0
Day         0
Venue       0
Result      0
GF          0
GA          0
Opponent    0
xG          0
xGA         0
Poss        0
Sh          0
SoT         0
Dist        0
FK          0
PK          0
PKatt       0
Team        0
dtype: int64

In [206]:
# model inputs: the encoded Time/Day/Venue columns plus the numeric match statistics
# NOTE(review): xG, Sh, SoT etc. come from the same row (match) as Result - confirm this is intended
features = df.loc[:, ['Time', 'Day', 'Venue', 'xG', 'xGA', 'Poss', 'Sh', 'SoT', 'Dist', 'FK', 'PK', 'PKatt']]
# model output: the encoded result (0 = loss, 1 = draw, 2 = win)
target = df.Result

In [207]:
# converting the target to categorical (one-hot) type for classification

from tensorflow.keras.utils import to_categorical

# Encode from df['Result'] instead of from `target` itself, so that re-running
# this cell does not one-hot encode an array that is already one-hot encoded.
# Column order follows the integer codes: 0 = loss, 1 = draw, 2 = win.
target = to_categorical(df['Result'], num_classes= 3)

In [208]:
from sklearn.model_selection import train_test_split

In [209]:
# hold out 20% of the rows as a test set; the fixed random_state makes the split repeatable
x_train, x_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.2,
    random_state=42,
)

In [210]:
# print the shape of each piece of the split, one per line
split = [x_train, x_test, y_train, y_test]

print('\n'.join(str(part.shape) for part in split))

(1823, 12)
(456, 12)
(1823, 3)
(456, 3)


### Creating and Training the Neural Network

In [211]:
# importing dependencies
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam

In [212]:
# implementing the model architecture

# a Sequential model is a plain stack of layers, given here as a list;
# every hidden layer uses ReLU activation
model = Sequential([
    # first hidden layer: 128 neurons, taking one input per feature column
    Dense(128, input_shape = (x_train.shape[1],), activation = 'relu'),
    # second hidden layer: 64 neurons
    Dense(64, activation = 'relu'),
    # third hidden layer: 32 neurons
    Dense(32, activation = 'relu'),
    # output layer: 3 neurons with softmax, one probability per result class
    Dense(3, activation = 'softmax'),
])

In [213]:
# compiling the model

# optimizer: Adam with a learning rate of 0.001
optimizer = Adam(learning_rate = 1e-3)

# loss: categorical cross-entropy (the target is one-hot encoded)
# metric: classification accuracy
model.compile(
    loss = 'categorical_crossentropy',
    optimizer = optimizer,
    metrics = ['accuracy'],
)



In [214]:
# training the model: 10 epochs, batches of 32 rows,
# with 20% of the training rows held out for validation

model_fit = model.fit(
    x_train,
    y_train,
    batch_size = 32,
    epochs = 10,
    validation_split = 0.2,
)

Epoch 1/10


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [215]:
# score the trained model on the held-out test set and report its accuracy
loss, accuracy = model.evaluate(x_test, y_test)
print("Testing Accuracy: {}".format(accuracy))

Testing Accuracy: 0.5899122953414917
