In [11]:
# Import libraries for pre-processing

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder

# Import libraries for model selection

from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

In [12]:
# Create the dataframe

df = pd.read_csv('../Preprocessing/masterupdate.csv')
df.head()

Unnamed: 0,Date,Time,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,HTR,...,Avg>2.5,AHh,B365AHH,B365AHA,PAHH,PAHA,MaxAHH,MaxAHA,AvgAHH,AvgAHA
0,03/08/07,,Mechelen,Anderlecht,0.0,1.0,A,0.0,0.0,D,...,,,,,,,,,,
1,04/08/07,,Charleroi,Roeselare,1.0,1.0,D,1.0,0.0,H,...,,,,,,,,,,
2,04/08/07,,Club Brugge,Bergen,2.0,1.0,H,1.0,1.0,D,...,,,,,,,,,,
3,04/08/07,,Dender,Germinal,1.0,2.0,A,0.0,0.0,D,...,,,,,,,,,,
4,04/08/07,,FC Brussels,Westerlo,0.0,1.0,A,0.0,0.0,D,...,,,,,,,,,,


In [13]:
df.drop('Time', axis=1, inplace=True)


In [14]:
# Filling missing values

# with mode for categorical columns



# with kNN for numerical columns

from sklearn.impute import KNNImputer

imputer = KNNImputer(n_neighbors=2)
df_filled = imputer.fit_transform(df)
df_filled = pd.DataFrame(df_filled, columns=df.columns)



In [15]:
# Rename the columns 
def clean_data(df):
    # Rename column 'FTHG' to ' FTHG and HG = Full Time Home Team Goals'
    df = df.rename(columns={'FTHG': ' FTHG and HG = Full Time Home Team Goals'})
    # Rename column 'FTAG' to 'FTAG and AG = Full Time Away Team Goals'
    df = df.rename(columns={'FTAG': 'FTAG and AG = Full Time Away Team Goals'})
    # Rename column 'FTR' to 'FTR and Res = Full Time Result (H=Home Win, D=Draw, A=Away Win)'
    df = df.rename(columns={'FTR': 'FTR and Res = Full Time Result (H=Home Win, D=Draw, A=Away Win)'})
    # Rename column 'HTHG' to 'HTHG = Half Time Home Team Goals'
    df = df.rename(columns={'HTHG': 'HTHG = Half Time Home Team Goals'})
    # Rename column 'HTAG' to 'HTAG = Half Time Away Team Goals'
    df = df.rename(columns={'HTAG': 'HTAG = Half Time Away Team Goals'})
    # Rename column 'HTR' to 'HTR = Half Time Result (H=Home Win, D=Draw, A=Away Win)'
    df = df.rename(columns={'HTR': 'HTR = Half Time Result (H=Home Win, D=Draw, A=Away Win)'})
    # Rename column 'B365H' to ' B365H = Bet365 home win odds'
    df = df.rename(columns={'B365H': ' B365H = Bet365 home win odds'})
    # Rename column 'B365D' to ' B365D = Bet365 draw odds'
    df = df.rename(columns={'B365D': ' B365D = Bet365 draw odds'})
    # Rename column 'B365A' to 'B365A = Bet365 away win odds'
    df = df.rename(columns={'B365A': 'B365A = Bet365 away win odds'})
    # Rename column 'BWD' to 'BWH = Bet&Win home win odds'
    df = df.rename(columns={'BWD': 'BWH = Bet&Win home win odds'})
    # Rename column 'BWA' to 'BWD = Bet&Win draw odds'
    df = df.rename(columns={'BWA': 'BWD = Bet&Win draw odds'})
    # Rename column 'PSH' to 'PSH and PH = Pinnacle home win odds'
    df = df.rename(columns={'PSH': 'PSH and PH = Pinnacle home win odds'})
    # Rename column 'PSD' to ' PSD and PD = Pinnacle draw odds'
    df = df.rename(columns={'PSD': ' PSD and PD = Pinnacle draw odds'})
    # Rename column 'PSA' to ' PSA and PA = Pinnacle away win odds'
    df = df.rename(columns={'PSA': ' PSA and PA = Pinnacle away win odds'})
    # Rename column 'WHH' to ' WHH = William Hill home win odds'
    df = df.rename(columns={'WHH': ' WHH = William Hill home win odds'})
    # Rename column 'WHD' to 'WHD = William Hill draw odds'
    df = df.rename(columns={'WHD': 'WHD = William Hill draw odds'})
    # Rename column 'WHA' to 'WHA = William Hill away win odds'
    df = df.rename(columns={'WHA': 'WHA = William Hill away win odds'})
    return df

df_clean = clean_data(df.copy())
df_clean.head()


Unnamed: 0,Date,HomeTeam,AwayTeam,FTHG and HG = Full Time Home Team Goals,FTAG and AG = Full Time Away Team Goals,"FTR and Res = Full Time Result (H=Home Win, D=Draw, A=Away Win)",HTHG = Half Time Home Team Goals,HTAG = Half Time Away Team Goals,"HTR = Half Time Result (H=Home Win, D=Draw, A=Away Win)",HS,...,Avg>2.5,AHh,B365AHH,B365AHA,PAHH,PAHA,MaxAHH,MaxAHA,AvgAHH,AvgAHA
0,03/08/07,Mechelen,Anderlecht,0.0,1.0,A,0.0,0.0,D,,...,,,,,,,,,,
1,04/08/07,Charleroi,Roeselare,1.0,1.0,D,1.0,0.0,H,,...,,,,,,,,,,
2,04/08/07,Club Brugge,Bergen,2.0,1.0,H,1.0,1.0,D,,...,,,,,,,,,,
3,04/08/07,Dender,Germinal,1.0,2.0,A,0.0,0.0,D,,...,,,,,,,,,,
4,04/08/07,FC Brussels,Westerlo,0.0,1.0,A,0.0,0.0,D,,...,,,,,,,,,,


In [16]:
# General statistics

df.describe()

Unnamed: 0,FTHG,FTAG,HTHG,HTAG,HS,AS,HST,AST,HF,AF,...,Avg>2.5,AHh,B365AHH,B365AHA,PAHH,PAHA,MaxAHH,MaxAHA,AvgAHH,AvgAHA
count,7823.0,7823.0,7816.0,7816.0,1983.0,1983.0,1983.0,1983.0,1503.0,1503.0,...,1508.0,1508.0,1602.0,1602.0,1504.0,1504.0,1508.0,1508.0,1508.0,1508.0
mean,1.652052,1.234693,0.704708,0.532369,12.668684,10.570348,5.026223,4.258699,11.667997,12.463074,...,1.689211,-0.281996,1.927803,1.92196,1.945306,1.930359,1.988176,1.979881,1.918428,1.908528
std,1.356297,1.17325,0.854214,0.731156,5.108081,4.556625,2.503543,2.397473,3.832662,3.912806,...,0.148253,0.745544,0.087702,0.089082,0.089526,0.089338,0.087221,0.087794,0.0817,0.081354
min,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,3.0,...,1.26,-2.5,1.7,1.65,1.71,1.7,1.8,1.8,1.73,1.72
25%,1.0,0.0,0.0,0.0,9.0,7.0,3.0,2.0,9.0,10.0,...,1.59,-0.75,1.85,1.85,1.88,1.86,1.92,1.91,1.85,1.84
50%,1.0,1.0,0.0,0.0,12.0,10.0,5.0,4.0,11.0,12.0,...,1.68,-0.25,1.93,1.93,1.94,1.93,1.99,1.98,1.92,1.91
75%,2.0,2.0,1.0,1.0,16.0,13.0,7.0,6.0,14.0,15.0,...,1.79,0.25,2.0,2.0,2.02,2.0,2.06,2.05,1.99,1.97
max,9.0,9.0,6.0,5.0,33.0,29.0,17.0,22.0,25.0,26.0,...,2.27,2.0,2.25,2.15,2.19,2.18,2.23,2.23,2.12,2.12


In [17]:
df.info()
df.shape

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7823 entries, 0 to 7822
Data columns (total 55 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Date      7823 non-null   object 
 1   HomeTeam  7823 non-null   object 
 2   AwayTeam  7823 non-null   object 
 3   FTHG      7823 non-null   float64
 4   FTAG      7823 non-null   float64
 5   FTR       7823 non-null   object 
 6   HTHG      7816 non-null   float64
 7   HTAG      7816 non-null   float64
 8   HTR       7816 non-null   object 
 9   HS        1983 non-null   float64
 10  AS        1983 non-null   float64
 11  HST       1983 non-null   float64
 12  AST       1983 non-null   float64
 13  HF        1503 non-null   float64
 14  AF        1503 non-null   float64
 15  HC        1983 non-null   float64
 16  AC        1983 non-null   float64
 17  HY        1984 non-null   float64
 18  AY        1984 non-null   float64
 19  HR        1984 non-null   float64
 20  AR        1984 non-null   floa

(7823, 55)

In [18]:
df.columns

Index(['Date', 'HomeTeam', 'AwayTeam', 'FTHG', 'FTAG', 'FTR', 'HTHG', 'HTAG',
       'HTR', 'HS', 'AS', 'HST', 'AST', 'HF', 'AF', 'HC', 'AC', 'HY', 'AY',
       'HR', 'AR', 'B365H', 'B365D', 'B365A', 'BWH', 'BWD', 'BWA', 'PSH',
       'PSD', 'PSA', 'WHH', 'WHD', 'WHA', 'MaxH', 'MaxD', 'MaxA', 'AvgH',
       'AvgD', 'AvgA', 'B365>2.5', 'B365<2.5', 'P>2.5', 'P<2.5', 'Max>2.5',
       'Max<2.5', 'Avg>2.5', 'AHh', 'B365AHH', 'B365AHA', 'PAHH', 'PAHA',
       'MaxAHH', 'MaxAHA', 'AvgAHH', 'AvgAHA'],
      dtype='object')

In [19]:
# Data vizualization

# Plot FTR = Full Time Result (H=Home Win, D=Draw, A=Away Win)

plt.figure(figsize=(10, 6))
sns.countplot(x='FTR and Res = Full Time Result (H=Home Win, D=Draw, A=Away Win)', data=df)  # This counts occurrences of each result
plt.title('Full Time Result (H=Home Win, D=Draw, A=Away Win)')
plt.xlabel('Full Time Result (H=Home Win, D=Draw, A=Away Win)')
plt.ylabel('Count')
plt.show()

# Plot FTHG and HG = Full Time Home Team Goals

plt.figure(figsize=(10, 6))
sns.histplot(df['FTHG and HG = Full Time Home Team Goals'], kde=True)
plt.title('Full Time Home Team Goals')
plt.xlabel('Goals')
plt.ylabel('Count')
plt.show()

# Plot FTAG and AG = Full Time Away Team Goals
plt.figure(figsize=(10, 6))
sns.histplot(df['FTAG and AG = Full Time Away Team Goals'], kde=True)
plt.title('Full Time Away Team Goals')
plt.xlabel('Goals')
plt.ylabel('Count')
plt.show()

# Plot HTHG = Half Time Home Team Goals

plt.figure(figsize=(10, 6))
sns.histplot(df['HTHG = Half Time Home Team Goals'], kde=True)
plt.title('Half Time Home Team Goals')
plt.xlabel('Goals')
plt.ylabel('Count')
plt.show()

# Plot HTAG = Half Time Away Team Goals

plt.figure(figsize=(10, 6))
sns.histplot(df['HTAG = Half Time Away Team Goals'], kde=True)
plt.title('Half Time Away Team Goals')
plt.xlabel('Goals')
plt.ylabel('Count')
plt.show()

# Plot HTR = Half Time Result (H=Home Win, D=Draw, A=Away Win)

plt.figure(figsize=(10, 6))
sns.countplot(x='HTR = Half Time Result (H=Home Win, D=Draw, A=Away Win)', data=df)
plt.title('Half Time Result (H=Home Win, D=Draw, A=Away Win)')
plt.xlabel('Half Time Result (H=Home Win, D=Draw, A=Away Win)')
plt.ylabel('Count')
plt.show()


# Plot average wins and losses by home team and away team

plt.figure(figsize=(10, 6))
sns.countplot(x='FTR and Res = Full Time Result (H=Home Win, D=Draw, A=Away Win)', data=df, hue='HTR = Half Time Result (H=Home Win, D=Draw, A=Away Win)')
plt.title('Full Time Result')
plt.xlabel('Full Time Result')
plt.ylabel('Count')
plt.show()


# Plot average goals by HomeTeam

plt.figure(figsize=(10, 6))
sns.boxplot(x='HomeTeam', y='FTHG and HG = Full Time Home Team Goals', data=df)
plt.title('Average Goals by Home Team')
plt.xlabel('Home Team')
plt.ylabel('Goals')
plt.xticks(rotation=90)
plt.show()

# Plot average goals by AwayTeam

plt.figure(figsize=(10, 6))
sns.boxplot(x='AwayTeam', y='FTAG and AG = Full Time Away Team Goals', data=df)   
plt.title('Average Goals by Away Team')
plt.xlabel('Away Team')
plt.ylabel('Goals')
plt.xticks(rotation=90)
plt.show()

# Plot FTR by HomeTeam

plt.figure(figsize=(10, 6))
sns.countplot(x='HomeTeam', data=df, hue='FTR and Res = Full Time Result (H=Home Win, D=Draw, A=Away Win)')
plt.title('Full Time Result by Home Team')
plt.xlabel('Home Team')
plt.ylabel('Count')
plt.xticks(rotation=90)
plt.show()

# Plot FTR by AwayTeam

plt.figure(figsize=(10, 6))
sns.countplot(x='AwayTeam', data=df, hue='FTR and Res = Full Time Result (H=Home Win, D=Draw, A=Away Win)')
plt.title('Full Time Result by Away Team')
plt.xlabel('Away Team')
plt.ylabel('Count')
plt.xticks(rotation=90)
plt.show()



# Plot FTR by AY

plt.figure(figsize=(10, 6))
sns.countplot(x='AY = Away Team Yellow Cards', data=df, hue='FTR and Res = Full Time Result (H=Home Win, D=Draw, A=Away Win)')
plt.title('Full Time Result by Away Team Yellow Cards')
plt.xlabel('Away Team Yellow Cards')
plt.ylabel('Count')
plt.xticks(rotation=90)
plt.show()

plt.figure(figsize=(10, 6))
sns.boxplot(x='FTR and Res = Full Time Result (H=Home Win, D=Draw, A=Away Win)', y='AY = Away Team Yellow Cards', data=df)
plt.title('Distribution of Away Team Yellow Cards by Full Time Result')
plt.xlabel('Full Time Result')
plt.ylabel('Away Team Yellow Cards')
plt.xticks(rotation=45)
plt.show()


# Plot FTR by HY

plt.figure(figsize=(10, 6))
sns.countplot(x='HY = Home Team Yellow Cards', data=df, hue='FTR and Res = Full Time Result (H=Home Win, D=Draw, A=Away Win)')
plt.title('Full Time Result by Home Team Yellow Cards')
plt.xlabel('Home Team Yellow Cards')
plt.ylabel('Count')
plt.xticks(rotation=90)
plt.show()

plt.figure(figsize=(10, 6))
sns.boxplot(x='FTR and Res = Full Time Result (H=Home Win, D=Draw, A=Away Win)', y='HY = Home Team Yellow Cards', data=df)
plt.title('Distribution of Home Team Yellow Cards by Full Time Result')
plt.xlabel('Full Time Result')
plt.ylabel('Home Team Yellow Cards')
plt.xticks(rotation=45)
plt.show()


# Plot FTR by HR

plt.figure(figsize=(10, 6))
sns.countplot(x='HR = Home Team Red Cards', data=df, hue='FTR and Res = Full Time Result (H=Home Win, D=Draw, A=Away Win)')
plt.title('Full Time Result by Home Team Red Cards')
plt.xlabel('Home Team Red Cards')
plt.ylabel('Count')
plt.xticks(rotation=90)
plt.show()

plt.figure(figsize=(10, 6))
sns.boxplot(x='FTR and Res = Full Time Result (H=Home Win, D=Draw, A=Away Win)', y='HR = Home Team Red Cards', data=df)
plt.title('Distribution of Home Team Red Cards by Full Time Result')
plt.xlabel('Full Time Result')
plt.ylabel('Home Team Red Cards')
plt.xticks(rotation=45)
plt.show()


# Plot FTR by AR

plt.figure(figsize=(10, 6))
sns.countplot(x='AR = Away Team Red Cards', data=df, hue='FTR and Res = Full Time Result (H=Home Win, D=Draw, A=Away Win)')
plt.title('Full Time Result by Away Team Red Cards')
plt.xlabel('Away Team Red Cards')
plt.ylabel('Count')
plt.xticks(rotation=90)
plt.show()

plt.figure(figsize=(10, 6))
sns.boxplot(x='FTR and Res = Full Time Result (H=Home Win, D=Draw, A=Away Win)', y='AR = Away Team Red Cards', data=df)
plt.title('Distribution of Away Team Red Cards by Full Time Result')
plt.xlabel('Full Time Result')
plt.ylabel('Away Team Red Cards')
plt.xticks(rotation=45)
plt.show()



# Plot FTR by HS

plt.figure(figsize=(10, 6))
sns.countplot(x='HS = Home Team Shots', data=df, hue='FTR and Res = Full Time Result (H=Home Win, D=Draw, A=Away Win)')
plt.title('Full Time Result by Home Team Shots')
plt.xlabel('Home Team Shots')
plt.ylabel('Count')
plt.xticks(rotation=90)
plt.show()

plt.figure(figsize=(10, 6))
sns.boxplot(x='FTR and Res = Full Time Result (H=Home Win, D=Draw, A=Away Win)', y='HS = Home Team Shots', data=df)
plt.title('Distribution of Home Team Shots by Full Time Result')
plt.xlabel('Full Time Result')
plt.ylabel('Home Team Shots')
plt.xticks(rotation=45)
plt.show()


# Plot FTR by AS

plt.figure(figsize=(10, 6))
sns.countplot(x='AS = Away Team Shots', data=df, hue='FTR and Res = Full Time Result (H=Home Win, D=Draw, A=Away Win)')
plt.title('Full Time Result by Away Team Shots')
plt.xlabel('Away Team Shots')
plt.ylabel('Count')
plt.xticks(rotation=90)
plt.show()

plt.figure(figsize=(10, 6)) 
sns.boxplot(x='FTR and Res = Full Time Result (H=Home Win, D=Draw, A=Away Win)', y='AS = Away Team Shots', data=df)
plt.title('Distribution of Away Team Shots by Full Time Result')
plt.xlabel('Full Time Result')
plt.ylabel('Away Team Shots')
plt.xticks(rotation=45)
plt.show()




# Plot FTR by HST

plt.figure(figsize=(10, 6))
sns.countplot(x='HST = Home Team Shots on Target', data=df, hue='FTR and Res = Full Time Result (H=Home Win, D=Draw, A=Away Win)')
plt.title('Full Time Result by Home Team Shots on Target')
plt.xlabel('Home Team Shots on Target')
plt.ylabel('Count')
plt.xticks(rotation=90)
plt.show()

plt.figure(figsize=(10, 6))
sns.boxplot(x='FTR and Res = Full Time Result (H=Home Win, D=Draw, A=Away Win)', y='HST = Home Team Shots on Target', data=df)
plt.title('Distribution of Home Team Shots on Target by Full Time Result')
plt.xlabel('Full Time Result')
plt.ylabel('Home Team Shots on Target')
plt.xticks(rotation=45)
plt.show()



# Plot FTR by HF

plt.figure(figsize=(10, 6))
sns.countplot(x='HF = Home Team Fouls Committed', data=df, hue='FTR and Res = Full Time Result (H=Home Win, D=Draw, A=Away Win)')
plt.title('Full Time Result by Home Team Fouls Committed')
plt.xlabel('Home Team Fouls Committed')
plt.ylabel('Count')
plt.xticks(rotation=90)
plt.show()

plt.figure(figsize=(10, 6))
sns.boxplot(x='FTR and Res = Full Time Result (H=Home Win, D=Draw, A=Away Win)', y='HF = Home Team Fouls Committed', data=df)
plt.title('Distribution of Home Team Fouls Committed by Full Time Result')
plt.xlabel('Full Time Result')
plt.ylabel('Home Team Fouls Committed')
plt.xticks(rotation=45)
plt.show()


# Plot FTR by AF

plt.figure(figsize=(10, 6))
sns.countplot(x='AF = Away Team Fouls Committed', data=df, hue='FTR and Res = Full Time Result (H=Home Win, D=Draw, A=Away Win)')
plt.title('Full Time Result by Away Team Fouls Committed')
plt.xlabel('Away Team Fouls Committed')
plt.ylabel('Count')
plt.xticks(rotation=90)
plt.show()

plt.figure(figsize=(10, 6))
sns.boxplot(x='FTR and Res = Full Time Result (H=Home Win, D=Draw, A=Away Win)', y='AF = Away Team Fouls Committed', data=df)
plt.title('Distribution of Away Team Fouls Committed by Full Time Result')
plt.xlabel('Full Time Result')
plt.ylabel('Away Team Fouls Committed')
plt.xticks(rotation=45)
plt.show()




ValueError: Could not interpret value `FTR and Res = Full Time Result (H=Home Win, D=Draw, A=Away Win)` for `x`. An entry with this name does not appear in `data`.

<Figure size 1000x600 with 0 Axes>