In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import os
import glob
import time 

In [4]:
BASE_URL = "https://www.football-data.co.uk/englandm.php"

headers = {
    "User-Agent": "Mozilla/5.0"
}

response = requests.get(BASE_URL, headers=headers)
soup = BeautifulSoup(response.text, "html.parser")

csv_links = []

for link in soup.find_all("a", href=True):
    href = link["href"]
    if "mmz4281" in href and href.endswith("E0.csv"):
        full_url = "https://www.football-data.co.uk/" + href
        csv_links.append(full_url)

print(f"Found {len(csv_links)} Premier League CSV files")
csv_links[:5]


Found 33 Premier League CSV files


['https://www.football-data.co.uk/mmz4281/2526/E0.csv',
 'https://www.football-data.co.uk/mmz4281/2425/E0.csv',
 'https://www.football-data.co.uk/mmz4281/2324/E0.csv',
 'https://www.football-data.co.uk/mmz4281/2223/E0.csv',
 'https://www.football-data.co.uk/mmz4281/2122/E0.csv']

In [5]:
os.makedirs("data/epl", exist_ok=True)

for url in csv_links:
    season = url.split("/")[-2]   # e.g. 1516
    filename = f"data/epl/EPL_{season}.csv"

    if os.path.exists(filename):
        continue  # avoid re-downloading

    r = requests.get(url, headers=headers)
    with open(filename, "wb") as f:
        f.write(r.content)

    print("Saved:", filename)
    time.sleep(1)  # polite delay


Saved: data/epl/EPL_2526.csv
Saved: data/epl/EPL_2425.csv
Saved: data/epl/EPL_2324.csv
Saved: data/epl/EPL_2223.csv
Saved: data/epl/EPL_2122.csv
Saved: data/epl/EPL_2021.csv
Saved: data/epl/EPL_1920.csv
Saved: data/epl/EPL_1819.csv
Saved: data/epl/EPL_1718.csv
Saved: data/epl/EPL_1617.csv
Saved: data/epl/EPL_1516.csv
Saved: data/epl/EPL_1415.csv
Saved: data/epl/EPL_1314.csv
Saved: data/epl/EPL_1213.csv
Saved: data/epl/EPL_1112.csv
Saved: data/epl/EPL_1011.csv
Saved: data/epl/EPL_0910.csv
Saved: data/epl/EPL_0809.csv
Saved: data/epl/EPL_0708.csv
Saved: data/epl/EPL_0607.csv
Saved: data/epl/EPL_0506.csv
Saved: data/epl/EPL_0405.csv
Saved: data/epl/EPL_0304.csv
Saved: data/epl/EPL_0203.csv
Saved: data/epl/EPL_0102.csv
Saved: data/epl/EPL_0001.csv
Saved: data/epl/EPL_9900.csv
Saved: data/epl/EPL_9899.csv
Saved: data/epl/EPL_9798.csv
Saved: data/epl/EPL_9697.csv
Saved: data/epl/EPL_9596.csv
Saved: data/epl/EPL_9495.csv
Saved: data/epl/EPL_9394.csv


In [8]:
dfs = []

for file in glob.glob("data/epl/EPL_*.csv"):
    season = file.split("_")[-1].replace(".csv", "")
    df = pd.read_csv(
        file,
        encoding="latin-1",
        engine="python",
        on_bad_lines="skip",
    )
    df["Season"] = season
    dfs.append(df)

matches = pd.concat(dfs, ignore_index=True)

matches = matches[[
    "Season", "Date",
    "HomeTeam", "AwayTeam",
    "FTHG", "FTAG", "FTR"
 ]]

matches.head()

Unnamed: 0,Season,Date,HomeTeam,AwayTeam,FTHG,FTAG,FTR
0,2324,11/08/2023,Burnley,Man City,0.0,3.0,A
1,2324,12/08/2023,Arsenal,Nott'm Forest,2.0,1.0,H
2,2324,12/08/2023,Bournemouth,West Ham,1.0,1.0,D
3,2324,12/08/2023,Brighton,Luton,4.0,1.0,H
4,2324,12/08/2023,Everton,Fulham,0.0,1.0,A


In [9]:
print("Total matches:", len(matches))
print("Teams:", matches["HomeTeam"].nunique())
print("Seasons:", matches["Season"].nunique())

Total matches: 13171
Teams: 51
Seasons: 33


In [10]:
def head_to_head_stats(df, team_a, team_b):
    h2h = df[
        ((df.HomeTeam == team_a) & (df.AwayTeam == team_b)) |
        ((df.HomeTeam == team_b) & (df.AwayTeam == team_a))
    ].copy()

    if h2h.empty:
        return None

    h2h["A_goals"] = h2h.apply(
        lambda r: r.FTHG if r.HomeTeam == team_a else r.FTAG, axis=1
    )
    h2h["B_goals"] = h2h.apply(
        lambda r: r.FTAG if r.HomeTeam == team_a else r.FTHG, axis=1
    )

    return {
        "matches": len(h2h),
        "A_wins": (h2h.A_goals > h2h.B_goals).sum(),
        "B_wins": (h2h.A_goals < h2h.B_goals).sum(),
        "draws": (h2h.A_goals == h2h.B_goals).sum(),
        "A_avg_goals": round(h2h.A_goals.mean(), 2),
        "B_avg_goals": round(h2h.B_goals.mean(), 2)
    }


In [11]:
stats = head_to_head_stats(matches, "Arsenal", "Chelsea")
stats


{'matches': 65,
 'A_wins': np.int64(26),
 'B_wins': np.int64(19),
 'draws': np.int64(20),
 'A_avg_goals': np.float64(1.43),
 'B_avg_goals': np.float64(1.31)}