### Load and Investigate Data

In [1]:
import pandas as pd

In [2]:
# Read the match log, using the CSV's first column as the row index
# rather than loading it as an ordinary data column.
pl_matches = pd.read_csv("pl_matches.csv", index_col=0)
pl_matches.head()

Unnamed: 0,date,time,comp,round,day,venue,result,gf,ga,opponent,...,match report,notes,sh,sot,dist,fk,pk,pkatt,season,team
0,2024-08-17,12:30,Premier League,Matchweek 1,Sat,Away,W,2,0,Ipswich Town,...,Match Report,,18.0,5.0,14.8,0.0,0,0,2025,Liverpool
1,2024-08-25,16:30,Premier League,Matchweek 2,Sun,Home,W,2,0,Brentford,...,Match Report,,19.0,8.0,13.6,1.0,0,0,2025,Liverpool
2,2024-09-01,16:00,Premier League,Matchweek 3,Sun,Away,W,3,0,Manchester Utd,...,Match Report,,11.0,3.0,13.4,0.0,0,0,2025,Liverpool
3,2024-09-14,15:00,Premier League,Matchweek 4,Sat,Home,L,0,1,Nott'ham Forest,...,Match Report,,14.0,5.0,14.9,0.0,0,0,2025,Liverpool
5,2024-09-21,15:00,Premier League,Matchweek 5,Sat,Home,W,3,0,Bournemouth,...,Match Report,,19.0,12.0,16.6,0.0,0,0,2025,Liverpool


In [3]:
# (rows, columns) of the loaded frame; compared against the expected row count below
pl_matches.shape

(2280, 28)

### Investigate Potential Missing Data

In [4]:
# Expected number of rows in the frame. Each row is one team's view of a match
# (there is a `team` and an `opponent` column), so every fixture appears twice:
# 20 teams x 38 matches per team x 3 seasons.
print(f"Total matches over three seasons should be: {20 * 38 * 3}") # 20 teams, 38 matches/team, 3 seasons

# See how many matches each team has played; anything below 114 (= 38 x 3)
# means the club was not in the league for all three seasons.
pl_matches["team"].value_counts()

Total matches over three seasons should be: 2280


team
Liverpool                   114
Brentford                   114
Tottenham Hotspur           114
Wolverhampton Wanderers     114
Manchester United           114
West Ham United             114
Arsenal                     114
Crystal Palace              114
Fulham                      114
Everton                     114
Bournemouth                 114
Brighton and Hove Albion    114
Nottingham Forest           114
Aston Villa                 114
Newcastle United            114
Chelsea                     114
Manchester City             114
Leicester City               76
Southampton                  76
Ipswich Town                 38
Luton Town                   38
Burnley                      38
Sheffield United             38
Leeds United                 38
Name: count, dtype: int64

Data looks good:
- 5 teams played in the PL for only 1 of the past 3 seasons (38 matches each)
- 2 teams played in the PL for 2 of the past 3 seasons (76 matches each)
- The remaining 17 teams played in the PL for all three seasons (114 matches each)

### Clean Data for ML

In [5]:
# Most ML estimators expect numeric features, so list the dtypes to see
# which columns are still `object` (string) and need encoding or dropping.
pl_matches.dtypes

date              object
time              object
comp              object
round             object
day               object
venue             object
result            object
gf                 int64
ga                 int64
opponent          object
xg               float64
xga              float64
poss             float64
attendance       float64
captain           object
formation         object
opp formation     object
referee           object
match report      object
notes            float64
sh               float64
sot              float64
dist             float64
fk               float64
pk                 int64
pkatt              int64
season             int64
team              object
dtype: object

In [6]:
# Remove the "comp" and "notes" columns, which are not used as features.
# NOTE(review): in the rows displayed above "comp" is always "Premier League"
# and "notes" is empty — TODO confirm this holds for the full frame.
#
# drop(..., errors="ignore") is used instead of `del` so the cell is safe to
# re-run: once the columns are gone a second execution is a no-op, whereas
# `del pl_matches["comp"]` would raise KeyError.
pl_matches = pl_matches.drop(columns=["comp", "notes"], errors="ignore")

In [9]:
# Turn the date strings into datetime64 values so that the `.dt` accessor
# can be used on this column, then re-check the dtypes.
pl_matches["date"] = pl_matches["date"].pipe(pd.to_datetime)
pl_matches.dtypes

date             datetime64[ns]
time                     object
round                    object
day                      object
venue                    object
result                   object
gf                        int64
ga                        int64
opponent                 object
xg                      float64
xga                     float64
poss                    float64
attendance              float64
captain                  object
formation                object
opp formation            object
referee                  object
match report             object
sh                      float64
sot                     float64
dist                    float64
fk                      float64
pk                        int64
pkatt                     int64
season                    int64
team                     object
dtype: object

### Create Predictors for Basic ML Model
Remember, ML algorithms can only work with numeric data

In [19]:
# Label-encode the string columns: string -> category -> integer code.
# For venue the codes come out as Away = 0, Home = 1.
pl_matches["venue_code"] = (
    pl_matches["venue"]
    .astype("category")
    .cat.codes
)
pl_matches["opp_code"] = (
    pl_matches["opponent"]
    .astype("category")
    .cat.codes
)

# Kick-off hour: strip the colon and everything after it ("16:30" -> "16"),
# then cast the remaining text to an integer.
pl_matches["hour"] = (
    pl_matches["time"]
    .str.replace(":.+", "", regex=True)
    .astype("int")
)

# Day of the week as an integer (Monday = 0 ... Sunday = 6)
pl_matches["day_code"] = pl_matches["date"].dt.weekday

In [21]:
# Preview the frame to confirm the new venue_code, opp_code, hour and day_code columns
pl_matches.head()

Unnamed: 0,date,time,round,day,venue,result,gf,ga,opponent,xg,...,dist,fk,pk,pkatt,season,team,venue_code,opp_code,hour,day_code
0,2024-08-17,12:30,Matchweek 1,Sat,Away,W,2,0,Ipswich Town,2.6,...,14.8,0.0,0,0,2025,Liverpool,0,10,12,5
1,2024-08-25,16:30,Matchweek 2,Sun,Home,W,2,0,Brentford,2.5,...,13.6,1.0,0,0,2025,Liverpool,1,3,16,6
2,2024-09-01,16:00,Matchweek 3,Sun,Away,W,3,0,Manchester Utd,1.8,...,13.4,0.0,0,0,2025,Liverpool,0,16,16,6
3,2024-09-14,15:00,Matchweek 4,Sat,Home,L,0,1,Nott'ham Forest,0.9,...,14.9,0.0,0,0,2025,Liverpool,1,18,15,5
5,2024-09-21,15:00,Matchweek 5,Sat,Home,W,3,0,Bournemouth,2.0,...,16.6,0.0,0,0,2025,Liverpool,1,2,15,5


### Create the Target Our Model Will Try to Predict
Did the team win the match?

In [22]:
# Binary label for the classifier: 1 when the team won, 0 for a draw or a loss
pl_matches["target"] = pl_matches["result"].eq("W").astype("int")
pl_matches.head()

Unnamed: 0,date,time,round,day,venue,result,gf,ga,opponent,xg,...,fk,pk,pkatt,season,team,venue_code,opp_code,hour,day_code,target
0,2024-08-17,12:30,Matchweek 1,Sat,Away,W,2,0,Ipswich Town,2.6,...,0.0,0,0,2025,Liverpool,0,10,12,5,1
1,2024-08-25,16:30,Matchweek 2,Sun,Home,W,2,0,Brentford,2.5,...,1.0,0,0,2025,Liverpool,1,3,16,6,1
2,2024-09-01,16:00,Matchweek 3,Sun,Away,W,3,0,Manchester Utd,1.8,...,0.0,0,0,2025,Liverpool,0,16,16,6,1
3,2024-09-14,15:00,Matchweek 4,Sat,Home,L,0,1,Nott'ham Forest,0.9,...,0.0,0,0,2025,Liverpool,1,18,15,5,0
5,2024-09-21,15:00,Matchweek 5,Sat,Home,W,3,0,Bournemouth,2.0,...,0.0,0,0,2025,Liverpool,1,2,15,5,1
