In [7]:
import requests  # HTTP requests
from bs4 import BeautifulSoup  # HTML parsing
import pandas as pd  # data manipulation
import os  # filesystem utilities
import glob  # filename pattern matching
import time  # sleep/delays

In [8]:
BASE_URL = "https://www.football-data.co.uk/englandm.php"  # page with league CSV links

headers = {  # set a simple User-Agent to avoid basic blocking
    "User-Agent": "Mozilla/5.0"
}

response = requests.get(BASE_URL, headers=headers)  # GET the page
soup = BeautifulSoup(response.text, "html.parser")  # parse HTML

csv_links = []  # collect CSV URLs

for link in soup.find_all("a", href=True):  # iterate anchor tags
    href = link["href"]  # extract href attribute
    if "mmz4281" in href and href.endswith("E0.csv"):  # filter league CSVs
        full_url = "https://www.football-data.co.uk/" + href  # build absolute URL
        csv_links.append(full_url)  # store URL

print(f"Found {len(csv_links)} Premier League CSV files")  # report count
csv_links[:5]  # display first few links


Found 33 Premier League CSV files


['https://www.football-data.co.uk/mmz4281/2526/E0.csv',
 'https://www.football-data.co.uk/mmz4281/2425/E0.csv',
 'https://www.football-data.co.uk/mmz4281/2324/E0.csv',
 'https://www.football-data.co.uk/mmz4281/2223/E0.csv',
 'https://www.football-data.co.uk/mmz4281/2122/E0.csv']

In [9]:
os.makedirs("data/epl", exist_ok=True)  # ensure data directory exists

for url in csv_links:  # download each CSV URL
    season = url.split("/")[-2]   # extract season code from URL, e.g. 1516
    filename = f"data/epl/EPL_{season}.csv"  # local filename

    if os.path.exists(filename):  # skip if already downloaded
        continue  # avoid re-downloading

    r = requests.get(url, headers=headers)  # download CSV
    with open(filename, "wb") as f:  # save to disk
        f.write(r.content)  # write bytes

    print("Saved:", filename)  # confirm save
    time.sleep(1)  # polite delay between requests


In [10]:
dfs = []  # list to hold DataFrames for each CSV

for file in glob.glob("data/epl/EPL_*.csv"):  # iterate local CSV files
    season = file.split("_")[-1].replace(".csv", "")  # parse season from filename
    df = pd.read_csv(  # read CSV into DataFrame
        file,
        encoding="latin-1",  # handle special characters
        engine="python",  # use python engine for parsing
        on_bad_lines="skip",  # skip malformed rows
    )
    df["Season"] = season  # add season column
    dfs.append(df)  # collect DataFrame

matches = pd.concat(dfs, ignore_index=True)  # combine all seasons

matches = matches[[  # keep only relevant columns
    "Season", "Date",
    "HomeTeam", "AwayTeam",
    "FTHG", "FTAG", "FTR"
 ]]

matches.head()  # show sample rows

Unnamed: 0,Season,Date,HomeTeam,AwayTeam,FTHG,FTAG,FTR
0,2324,11/08/2023,Burnley,Man City,0.0,3.0,A
1,2324,12/08/2023,Arsenal,Nott'm Forest,2.0,1.0,H
2,2324,12/08/2023,Bournemouth,West Ham,1.0,1.0,D
3,2324,12/08/2023,Brighton,Luton,4.0,1.0,H
4,2324,12/08/2023,Everton,Fulham,0.0,1.0,A


In [11]:
print("Total matches:", len(matches))  # number of rows/matches
print("Teams:", matches["HomeTeam"].nunique())  # unique home teams count
print("Seasons:", matches["Season"].nunique())  # number of seasons collected

Total matches: 13171
Teams: 51
Seasons: 33


In [12]:
def head_to_head_stats(df, team_a, team_b):  # compute head-to-head stats
    h2h = df[  # filter matches between the two teams
        ((df.HomeTeam == team_a) & (df.AwayTeam == team_b)) |
        ((df.HomeTeam == team_b) & (df.AwayTeam == team_a))
    ].copy()  # make a copy to avoid SettingWithCopyWarning

    if h2h.empty:  # no matches found
        return None

    h2h["A_goals"] = h2h.apply(  # goals scored by team A in each match
        lambda r: r.FTHG if r.HomeTeam == team_a else r.FTAG, axis=1
    )
    h2h["B_goals"] = h2h.apply(  # goals scored by team B in each match
        lambda r: r.FTAG if r.HomeTeam == team_a else r.FTHG, axis=1
    )

    return {  # aggregated statistics
        "matches": len(h2h),
        "A_wins": (h2h.A_goals > h2h.B_goals).sum(),  # A wins count
        "B_wins": (h2h.A_goals < h2h.B_goals).sum(),  # B wins count
        "draws": (h2h.A_goals == h2h.B_goals).sum(),  # draws count
        "A_avg_goals": round(h2h.A_goals.mean(), 2),  # A average goals
        "B_avg_goals": round(h2h.B_goals.mean(), 2)  # B average goals
    }


In [13]:
stats = head_to_head_stats(matches, "Arsenal", "Chelsea")  # example call
stats  # display resulting dictionary


{'matches': 65,
 'A_wins': np.int64(26),
 'B_wins': np.int64(19),
 'draws': np.int64(20),
 'A_avg_goals': np.float64(1.43),
 'B_avg_goals': np.float64(1.31)}