# Transforming the dataframe 

Machine learning models use predictors, and these predictors need to be numeric for the model to use them, so the dataframe needs to be transformed, changing the data types.

### Imports

In [1]:
import pandas as pd

In [6]:
# Load the match records from matches.csv (path is relative to the notebook)
# and preview the first five rows.
matches = pd.read_csv("matches.csv")
matches.head()

Unnamed: 0,date,time,comp,round,day,venue,result,gf,ga,opponent,...,match report,notes,sh,sot,dist,fk,pk,pkatt,season,team
0,2024-08-17,12:30,Premier League,Matchweek 1,Sat,Away,W,2.0,0.0,Ipswich Town,...,Match Report,,18.0,5.0,14.8,0.0,0,0,2025,Liverpool
1,2024-08-25,16:30,Premier League,Matchweek 2,Sun,Home,W,2.0,0.0,Brentford,...,Match Report,,19.0,8.0,13.6,1.0,0,0,2025,Liverpool
2,2024-09-01,16:00,Premier League,Matchweek 3,Sun,Away,W,3.0,0.0,Manchester Utd,...,Match Report,,11.0,3.0,13.4,0.0,0,0,2025,Liverpool
3,2024-09-14,15:00,Premier League,Matchweek 4,Sat,Home,L,0.0,1.0,Nott'ham Forest,...,Match Report,,14.0,5.0,14.9,0.0,0,0,2025,Liverpool
4,2024-09-21,15:00,Premier League,Matchweek 5,Sat,Home,W,3.0,0.0,Bournemouth,...,Match Report,,19.0,12.0,16.6,0.0,0,0,2025,Liverpool


In [7]:
# Inspect the current column dtypes: the model needs numeric predictors, so the
# object (string) columns used as features have to be encoded first.
matches.dtypes

date              object
time              object
comp              object
round             object
day               object
venue             object
result            object
gf               float64
ga               float64
opponent          object
xg               float64
xga              float64
poss             float64
attendance       float64
captain           object
formation         object
opp formation     object
referee           object
match report      object
notes            float64
sh               float64
sot              float64
dist             float64
fk               float64
pk                 int64
pkatt              int64
season             int64
team              object
dtype: object

In [8]:
# Parse the date strings into datetime64 values; the .dt accessor and the
# date-based train/test split further down rely on this.
matches['date'] = pd.to_datetime(matches['date'])

In [9]:
# Venue (home/away) encoded as an integer category code.
# Codes follow the sorted category labels, so "Away" -> 0 and "Home" -> 1.
matches['venue_code'] = matches['venue'].astype("category").cat.codes

In [10]:
# Opponent encoded as an integer category code (one code per distinct opponent name).
# NOTE: "opponent" uses shortened club names, so these codes are not the same
# numbering as team_code.
matches['opp_code'] = matches['opponent'].astype("category").cat.codes

In [11]:
# Kick-off hour as an int: the regex removes the colon and everything after it
# ("16:30" -> "16"), then the remaining hour string is cast to int.
matches["hour"] = matches["time"].str.replace(
    ":.+", "", regex=True).astype("int")

In [12]:
# Day of the week as an integer (pandas convention: Monday = 0 ... Sunday = 6).
# Requires "date" to have been converted to datetime first.
matches["day_code"] = matches["date"].dt.dayofweek

In [13]:
# Binary label for the classifier: 1 when the result is a win ("W"),
# 0 for anything else (draw or loss).
matches['target'] = matches["result"].eq("W").astype("int")

In [14]:
# Team encoded as an integer category code (codes follow the sorted team names,
# e.g. Arsenal -> 0 in the output below).
matches['team_code'] = matches['team'].astype("category").cat.codes

# Training the Model

In this project we will be using the **Random forest** model (from sklearn), a commonly-used machine learning algorithm that combines the output of multiple decision trees to reach a single result. This is good for non-linear relationships.

### Imports

In [17]:
from sklearn.ensemble import RandomForestClassifier

## The Parameters:
- **n_estimators** = the number of individual decision trees to train. The higher the number, the longer the model takes to run, but the more accurate it generally is.
- **min_samples_split** = the minimum number of samples a node must contain before it can be split. The higher the value, the less likely the model is to overfit, but the lower the accuracy on the training data.
- **random_state** = a fixed seed, so that running the forest multiple times gives the same result.


In [18]:
# 50 trees, a node needs at least 10 samples before it may be split,
# and a fixed seed so repeated runs give the same forest.
rf = RandomForestClassifier(
    n_estimators=50,
    min_samples_split=10,
    random_state=1,
)

In [19]:
# Chronological split: matches up to and including 2024-05-05 are used for
# training (around 77% of the rows); everything after that date is held out.
# This date will change if you extract the matches yourself.
train = matches.loc[matches["date"] <= '2024-05-05']
test = matches.loc[matches["date"] > '2024-05-05']

The predictors we will use to start with

In [20]:
# Baseline feature set: the integer-encoded columns created in the first section.
predictors = ["venue_code", "opp_code", "hour", "day_code", "team_code"]

Fitting the model to our training data

In [21]:
# Train the forest on the pre-cutoff matches, using only the baseline predictors.
rf.fit(train[predictors], train['target'])

Predicting on the test data

In [22]:
# Predicted label (1 = win, 0 = not a win) for every held-out match row.
preds = rf.predict(test[predictors])

## Metrics to look at Accuracy/Precision

**Accuracy:** the ratio of correct predictions to total number of input samples

**Precision:** the fraction of retrieved results that are relevant — here, the share of predicted wins that were actual wins:
$\text{precision} = \frac{TP}{TP + FP}$

### Imports


In [23]:
from sklearn.metrics import accuracy_score, precision_score

In [24]:
# Share of held-out rows whose predicted label equals the actual label.
acc = accuracy_score(test['target'], preds)
acc

0.7087378640776699

### Accuracy: **70.87%**

A two way table to see when we predicted what actually happened

In [25]:
# Put actual and predicted labels side by side, then count every
# (actual, prediction) combination: rows are actual, columns are predicted.
combined = pd.DataFrame({"actual": test['target'], "prediction": preds})
pd.crosstab(index=combined["actual"], columns=combined["prediction"])

prediction,0,1
actual,Unnamed: 1_level_1,Unnamed: 2_level_1
0,108,18
1,42,38


In [26]:
# Of the rows predicted as a win (1), the share that really were wins.
prec = precision_score(test['target'], preds)
prec

np.float64(0.6785714285714286)

### Precision: **67.86%**

## Creating Rolling Averages
We can create rolling averages of certain metrics (e.g. shots, penalty kicks) over each team's previous 3 matches to give the model more predictors. 

In [27]:
def rolling_averages(group, cols, new_cols, window=3):
    """Add rolling means of the preceding matches to a block of match rows.

    Parameters
    ----------
    group : pandas.DataFrame
        The rows to process (the notebook passes one team's matches at a
        time). Must contain a "date" column and every column in ``cols``.
    cols : list of str
        Columns to average.
    new_cols : list of str
        Names of the columns that receive the averages, matched to ``cols``
        by position.
    window : int, default 3
        Number of preceding rows (matches, not calendar weeks) to average.

    Returns
    -------
    pandas.DataFrame
        A date-sorted copy of ``group`` with ``new_cols`` added. Rows where
        any rolling value is missing (for example the first ``window`` rows,
        which do not have enough history) are dropped.
    """
    group = group.sort_values("date")
    # closed='left' excludes the current row from its own window, so each
    # average only uses matches played before the one being predicted.
    rolling_stats = group[cols].rolling(window, closed='left').mean()
    group[new_cols] = rolling_stats
    # Rows with missing values are removed so the algorithm can work.
    return group.dropna(subset=new_cols)

The columns to find rolling averages for

In [28]:
# Per-match statistics to average, and the matching names of the derived columns.
cols = "gf ga sh sot dist fk pk pkatt".split()
new_cols = [stat + "_rolling" for stat in cols]

In [29]:
# Build the rolling features one team at a time and stack the results.
# Iterating over the groups (instead of groupby.apply) keeps the "team" column
# in every frame and avoids the DeprecationWarning pandas raises when apply()
# operates on the grouping column. Passing a dict to concat with names=["team"]
# reproduces the (team, original row) MultiIndex that apply() returned.
matches_rolling = pd.concat(
    {
        team: rolling_averages(team_matches, cols, new_cols)
        for team, team_matches in matches.groupby("team")
    },
    names=["team"],
)

  matches_rolling = matches.groupby("team").apply(


In [30]:
# Preview: rows are indexed by (team, original row number) and carry the new *_rolling columns.
matches_rolling.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,date,time,comp,round,day,venue,result,gf,ga,opponent,...,target,team_code,gf_rolling,ga_rolling,sh_rolling,sot_rolling,dist_rolling,fk_rolling,pk_rolling,pkatt_rolling
team,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
Arsenal,199,2023-09-03,16:30,Premier League,Matchweek 4,Sun,Home,W,3.0,1.0,Manchester Utd,...,1,0,1.666667,1.0,15.333333,6.0,16.433333,0.0,0.666667,0.666667
Arsenal,200,2023-09-17,16:30,Premier League,Matchweek 5,Sun,Away,W,1.0,0.0,Everton,...,1,0,2.0,1.0,16.0,5.333333,15.066667,0.0,0.666667,0.666667
Arsenal,201,2023-09-24,14:00,Premier League,Matchweek 6,Sun,Home,D,2.0,2.0,Tottenham,...,0,0,2.0,1.0,16.0,6.0,15.4,0.0,0.333333,0.333333
Arsenal,202,2023-09-30,15:00,Premier League,Matchweek 7,Sat,Away,W,4.0,0.0,Bournemouth,...,1,0,2.0,1.0,14.0,4.333333,16.433333,0.0,0.333333,0.333333
Arsenal,203,2023-10-08,16:30,Premier League,Matchweek 8,Sun,Home,W,1.0,0.0,Manchester City,...,1,0,2.333333,0.666667,12.666667,4.666667,16.6,0.0,1.0,1.0


We can drop extra index level of 'team' and make a new range of indices.

In [31]:
# Discard the (team, original row) index and renumber the rows 0..n-1 in one
# step. The result is the same as dropping the 'team' level and assigning a
# range index, but this form can be re-run safely: a second run does not fail
# because the 'team' level is already gone.
matches_rolling = matches_rolling.reset_index(drop=True)

In [32]:
# Confirm the index is now a plain 0..n-1 range.
matches_rolling.head()

Unnamed: 0,date,time,comp,round,day,venue,result,gf,ga,opponent,...,target,team_code,gf_rolling,ga_rolling,sh_rolling,sot_rolling,dist_rolling,fk_rolling,pk_rolling,pkatt_rolling
0,2023-09-03,16:30,Premier League,Matchweek 4,Sun,Home,W,3.0,1.0,Manchester Utd,...,1,0,1.666667,1.0,15.333333,6.0,16.433333,0.0,0.666667,0.666667
1,2023-09-17,16:30,Premier League,Matchweek 5,Sun,Away,W,1.0,0.0,Everton,...,1,0,2.0,1.0,16.0,5.333333,15.066667,0.0,0.666667,0.666667
2,2023-09-24,14:00,Premier League,Matchweek 6,Sun,Home,D,2.0,2.0,Tottenham,...,0,0,2.0,1.0,16.0,6.0,15.4,0.0,0.333333,0.333333
3,2023-09-30,15:00,Premier League,Matchweek 7,Sat,Away,W,4.0,0.0,Bournemouth,...,1,0,2.0,1.0,14.0,4.333333,16.433333,0.0,0.333333,0.333333
4,2023-10-08,16:30,Premier League,Matchweek 8,Sun,Home,W,1.0,0.0,Manchester City,...,1,0,2.333333,0.666667,12.666667,4.666667,16.6,0.0,1.0,1.0


### Creating a function to fit the model and return metrics

In [33]:
def make_predictions(data, predictors, cutoff='2024-05-05'):
    """Fit the forest on the earlier matches and score it on the later ones.

    Parameters
    ----------
    data : pandas.DataFrame
        Match rows with a datetime "date" column, a 0/1 "target" column and
        every column named in ``predictors``.
    predictors : list of str
        Feature columns passed to the model.
    cutoff : str or datetime-like, default '2024-05-05'
        Rows dated on or before ``cutoff`` are used for training; rows dated
        after it are used for testing. Adjust it when working with a
        different extract of the match data.

    Returns
    -------
    tuple
        ``(combined, precision, accuracy)``: ``combined`` is a DataFrame with
        "actual" and "predicted" columns indexed like the test rows,
        followed by the precision and accuracy scores on the test rows.

    Notes
    -----
    The module-level ``rf`` classifier is re-fitted in place, so after the
    call it holds the model trained here.
    """
    train = data[data["date"] <= cutoff]
    test = data[data["date"] > cutoff]
    rf.fit(train[predictors], train["target"])
    preds = rf.predict(test[predictors])
    # Keep the test rows' index so the predictions can be joined back to the
    # match details later.
    combined = pd.DataFrame(
        dict(actual=test["target"], predicted=preds), index=test.index)
    precision = precision_score(test["target"], preds)
    accuracy = accuracy_score(test["target"], preds)
    return combined, precision, accuracy

Fitting the model using the new rolling averages as predictors.

In [34]:
# Re-train and score with the baseline predictors plus the *_rolling columns.
combined, precision, accuracy = make_predictions(matches_rolling, predictors + new_cols)

In [35]:
# Precision of the model that also uses the rolling-average predictors.
precision

np.float64(0.6595744680851063)

### Precision with Rolling Averages: **65.96%**
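This is a decrease compared with the precision without the rolling averages (67.86%). Note that the comparison is not exactly like-for-like: rows without three previous matches were dropped when the rolling averages were created.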

In [36]:
# Accuracy of the model that also uses the rolling-average predictors.
accuracy

0.6700507614213198

### Accuracy with Rolling Averages: **67.01%**
This is also a decrease compared with the accuracy without the rolling averages (70.87%).

In [37]:
# Actual vs predicted label per test row; the index is the row's index in matches_rolling.
combined

Unnamed: 0,actual,predicted
33,1,1
34,1,1
35,1,1
36,1,0
37,0,1
...,...,...
844,0,0
845,0,0
846,0,0
847,0,0


Adding the date, team name, opponent name, and result to see more information.

In [38]:
# Attach the match details (date, team, opponent, result) to each prediction
# by joining on the row index shared with matches_rolling.
combined = combined.merge(
    matches_rolling[["date", "team", "opponent", "result"]],
    left_index=True,
    right_index=True,
)

In [39]:
# Check that the match details were attached to the predictions.
combined.head()

Unnamed: 0,actual,predicted,date,team,opponent,result
33,1,1,2024-05-12,Arsenal,Manchester Utd,W
34,1,1,2024-05-19,Arsenal,Everton,W
35,1,1,2024-08-17,Arsenal,Wolves,W
36,1,0,2024-08-24,Arsenal,Aston Villa,W
37,0,1,2024-08-31,Arsenal,Brighton,D


The model would have predicted both sides of the match, so now we will look at both predictions.

First, the 'team' name and 'opponent' name need to be normalised as they are slightly different.

In [40]:
# Club names as they are spelled in the "team" column.
combined['team'].unique()

array(['Arsenal', 'Aston Villa', 'Bournemouth', 'Brentford',
       'Brighton and Hove Albion', 'Burnley', 'Chelsea', 'Crystal Palace',
       'Everton', 'Fulham', 'Ipswich Town', 'Leicester City', 'Liverpool',
       'Luton Town', 'Manchester City', 'Manchester United',
       'Newcastle United', 'Nottingham Forest', 'Sheffield United',
       'Southampton', 'Tottenham Hotspur', 'West Ham United',
       'Wolverhampton Wanderers'], dtype=object)

In [41]:
# Club names as they are spelled in the "opponent" column (several are shortened).
combined['opponent'].unique()

array(['Manchester Utd', 'Everton', 'Wolves', 'Aston Villa', 'Brighton',
       'Tottenham', 'Manchester City', 'Leicester City', 'Southampton',
       'Bournemouth', 'Liverpool', 'Crystal Palace', 'West Ham',
       'Arsenal', 'Ipswich Town', 'Fulham', 'Brentford', 'Chelsea',
       "Nott'ham Forest", 'Newcastle Utd', 'Sheffield Utd', 'Luton Town',
       'Burnley'], dtype=object)

Creating a new class: a type of dictionary that, when a key is missing, simply returns the key itself instead of raising an error.

In [42]:
class MissingDict(dict):
    """A dict whose item lookups fall back to the key itself when it is absent."""

    def __missing__(self, key):
        # Invoked by dict item lookup for unknown keys; the key is handed
        # back unchanged and is not stored in the dictionary.
        return key


# Full club names (as used in "team") paired with the shortened spelling
# used in "opponent". Clubs not listed here are spelled the same in both.
map_values = dict([
    ("Brighton and Hove Albion", "Brighton"),
    ("Manchester United", "Manchester Utd"),
    ("Newcastle United", "Newcastle Utd"),
    ("Nottingham Forest", "Nott'ham Forest"),
    ("Sheffield United", "Sheffield Utd"),
    ("Tottenham Hotspur", "Tottenham"),
    ("West Ham United", "West Ham"),
    ("Wolverhampton Wanderers", "Wolves"),
])
# Wrap the pairs in MissingDict so unlisted names map to themselves.
mapping = MissingDict(map_values)

In [43]:
# new_team holds the team name in the spelling used by the "opponent" column.
# Names without an entry in `mapping` pass through unchanged, because Series.map
# uses the dict subclass's __missing__ for keys it does not contain.
combined['new_team'] = combined['team'].map(mapping)

In [44]:
# Self-join: pair each prediction row with the row for the other side of the
# same match (same date, and this side's normalised name equals the other
# side's "opponent"). Columns present on both sides get _x / _y suffixes.
merged = combined.merge(
    combined,
    left_on=["date", "new_team"],
    right_on=["date", "opponent"],
)

In [45]:
# _x columns describe one side of the match, _y columns the opposing side.
merged.head()

Unnamed: 0,actual_x,predicted_x,date,team_x,opponent_x,result_x,new_team_x,actual_y,predicted_y,team_y,opponent_y,result_y,new_team_y
0,1,1,2024-05-12,Arsenal,Manchester Utd,W,Arsenal,0,0,Manchester United,Arsenal,L,Manchester Utd
1,1,1,2024-05-19,Arsenal,Everton,W,Arsenal,0,0,Everton,Arsenal,L,Everton
2,1,1,2024-08-17,Arsenal,Wolves,W,Arsenal,0,0,Wolverhampton Wanderers,Arsenal,L,Wolves
3,1,0,2024-08-24,Arsenal,Aston Villa,W,Arsenal,0,0,Aston Villa,Arsenal,L,Aston Villa
4,0,1,2024-08-31,Arsenal,Brighton,D,Arsenal,0,0,Brighton and Hove Albion,Arsenal,D,Brighton


Now we will look where one team was predicted to win and the other was predicted to lose. This is where the algorithm had more confidence.

In [46]:
# Among matches where side x was predicted to win and side y was not,
# count how often side x actually won (1) or did not (0).
merged.loc[
    (merged["predicted_x"] == 1) & (merged["predicted_y"] == 0), "actual_x"
].value_counts()

actual_x
1    25
0    12
Name: count, dtype: int64

In [47]:
# Fraction of the "one side predicted to win, the other not" matches in which
# the predicted winner actually won. It is computed from `merged` instead of
# being typed in by hand, so it stays correct when the data changes.
# `actual_x` is 0/1, so its mean is the share of wins.
merged.loc[
    (merged["predicted_x"] == 1) & (merged["predicted_y"] == 0), "actual_x"
].mean()

0.6756756756756757

### In these matches, the model was correct **67.57%** of the time.