In [105]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, confusion_matrix, accuracy_score
from keras.models import Sequential
from keras.layers import LSTM, Dense
import matplotlib.pyplot as plt

In [121]:
df = pd.read_csv('data/england-premier-league-matches-2017-to-2018-stats.csv')

In [62]:
df.columns

Index(['timestamp', 'date_GMT', 'status', 'attendance', 'home_team_name',
       'away_team_name', 'referee', 'Game Week', 'Pre-Match PPG (Home)',
       'Pre-Match PPG (Away)', 'home_ppg', 'away_ppg', 'home_team_goal_count',
       'away_team_goal_count', 'total_goal_count', 'total_goals_at_half_time',
       'home_team_goal_count_half_time', 'away_team_goal_count_half_time',
       'home_team_goal_timings', 'away_team_goal_timings',
       'home_team_corner_count', 'away_team_corner_count',
       'home_team_yellow_cards', 'home_team_red_cards',
       'away_team_yellow_cards', 'away_team_red_cards',
       'home_team_first_half_cards', 'home_team_second_half_cards',
       'away_team_first_half_cards', 'away_team_second_half_cards',
       'home_team_shots', 'away_team_shots', 'home_team_shots_on_target',
       'away_team_shots_on_target', 'home_team_shots_off_target',
       'away_team_shots_off_target', 'home_team_fouls', 'away_team_fouls',
       'home_team_possession', 'away_te

In [85]:
df.describe()

Unnamed: 0,timestamp,attendance,Game Week,Pre-Match PPG (Home),Pre-Match PPG (Away),home_ppg,away_ppg,home_team_goal_count,away_team_goal_count,total_goal_count,...,average_cards_per_match_pre_match,odds_ft_home_team_win,odds_ft_draw,odds_ft_away_team_win,odds_ft_over15,odds_ft_over25,odds_ft_over35,odds_ft_over45,odds_btts_yes,odds_btts_no
count,380.0,380.0,380.0,380.0,380.0,380.0,380.0,380.0,380.0,380.0,...,380.0,380.0,380.0,380.0,380.0,380.0,380.0,380.0,380.0,380.0
mean,1514574000.0,38316.707895,19.5,1.509105,1.105105,1.626,1.1125,1.531579,1.147368,2.678947,...,3.228684,3.055632,4.425474,4.586474,1.263368,1.880211,3.243632,6.732974,1.878553,1.827105
std,6827494.0,17802.226732,10.980313,0.755472,0.793324,0.517686,0.534331,1.340087,1.177793,1.665125,...,1.197882,2.27288,1.619347,2.986443,0.209504,0.40469,1.020293,2.88521,0.356175,0.34772
min,1502477000.0,10242.0,1.0,0.0,0.0,0.95,0.58,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,1509199000.0,24102.5,10.0,1.105,0.6075,1.33,0.74,1.0,0.0,1.0,...,2.83,1.6375,3.33,2.5475,1.1875,1.61,2.5,4.5,1.75,1.69
50%,1514175000.0,31545.0,19.5,1.4,0.865,1.42,0.865,1.0,1.0,3.0,...,3.3,2.265,3.715,3.545,1.27,1.87,3.2,6.45,1.91,1.83
75%,1520089000.0,53272.25,29.0,2.0,1.64,2.0275,1.5225,2.0,2.0,4.0,...,3.905,2.955,5.1425,5.8275,1.38,2.15,3.9275,8.6,2.05,2.0
max,1526220000.0,83222.0,38.0,3.0,3.0,2.63,2.63,7.0,6.0,9.0,...,11.0,9.99,9.99,9.99,1.61,2.9,6.0,15.0,3.13,2.65


In [87]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 380 entries, 0 to 379
Data columns (total 66 columns):
 #   Column                               Non-Null Count  Dtype  
---  ------                               --------------  -----  
 0   timestamp                            380 non-null    int64  
 1   date_GMT                             380 non-null    object 
 2   status                               380 non-null    object 
 3   attendance                           380 non-null    int64  
 4   home_team_name                       380 non-null    object 
 5   away_team_name                       380 non-null    object 
 6   referee                              373 non-null    object 
 7   Game Week                            380 non-null    int64  
 8   Pre-Match PPG (Home)                 380 non-null    float64
 9   Pre-Match PPG (Away)                 380 non-null    float64
 10  home_ppg                             380 non-null    float64
 11  away_ppg                        

In [64]:
data = df.copy()

In [67]:
df.shape

(380, 66)

In [68]:
df.isnull().sum()[data.isnull().sum() > 0]

referee                     7
home_team_goal_timings     90
away_team_goal_timings    136
dtype: int64

In [69]:
data = df.drop(['referee', 'home_team_goal_timings', 'away_team_goal_timings'], axis=1)

In [71]:
data.shape

(380, 63)

In [72]:
data.isnull().sum().sum()

0

In [63]:
target_columns = ['home_team_goal_count', 'away_team_goal_count']

In [91]:
non_numeric_columns = ['date_GMT', 'status', 'home_team_name', 'away_team_name', 'referee',
                        'home_team_goal_timings', 'away_team_goal_timings', 'stadium_name']

X = df.drop(target_columns + non_numeric_columns, axis=1)
y = df[target_columns]

In [92]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [93]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [96]:
def create_sequences(data, target, sequence_length=10):
    sequences, targets = [], []
    for i in range(len(data) - sequence_length + 1):
        seq = data[i:i+sequence_length]
        label = target[i+sequence_length-1]
        sequences.append(seq)
        targets.append(label)
    return np.array(sequences), np.array(targets)

In [97]:
sequence_length = 10
X_train_seq, y_train_seq = create_sequences(X_train_scaled, y_train.values, sequence_length)
X_test_seq, y_test_seq = create_sequences(X_test_scaled, y_test.values, sequence_length)

In [98]:
model = Sequential()
model.add(LSTM(50, input_shape=(X_train_seq.shape[1], X_train_seq.shape[2])))
model.add(Dense(2, activation='linear'))
model.compile(loss='mean_squared_error', optimizer='adam')





In [112]:
history = model.fit(X_train_seq, y_train_seq, epochs=50, batch_size=32, validation_split=0.1, verbose=1)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [100]:
y_pred = model.predict(X_test_seq)



In [109]:
rmse = np.sqrt(np.mean((y_test_seq - y_pred)**2))
print(f'Root Mean Squared Error on Test Set: {rmse}')

Root Mean Squared Error on Test Set: 0.6566698497246275


In [108]:
y_pred_discrete = np.round(y_pred).astype(int)
confusion_matrix(y_test_seq.flatten(), y_pred_discrete.flatten())

array([[ 0,  0,  0,  0,  0,  0,  0,  0,  0],
       [ 1, 23, 14,  0,  0,  0,  0,  0,  0],
       [ 0,  7, 36,  5,  1,  0,  0,  0,  0],
       [ 0,  1,  8, 19,  1,  0,  0,  0,  0],
       [ 0,  0,  1,  5,  4,  1,  0,  0,  0],
       [ 0,  0,  0,  1,  2,  0,  0,  0,  0],
       [ 0,  0,  0,  0,  0,  2,  0,  0,  0],
       [ 0,  0,  0,  0,  0,  1,  0,  0,  0],
       [ 0,  0,  0,  0,  0,  0,  0,  1,  0]], dtype=int64)

In [107]:
accuracy_score(y_test_seq.flatten(), y_pred_discrete.flatten())

0.6119402985074627