In [6]:
import csv
import os
from io import StringIO

import pandas as pd
import requests

# Define all seasons
years = [ '9394', '9495', '9596', '9697', '9798', '9899', '9900', '0001',
          '0102', '0203', '0304', '0405', '0506', '0607', '0708', '0809',
          '0910', '1011', '1112', '1213', '1314', '1415', '1516', '1617',
          '1718', '1819', '1920', '2021', '2122', '2223', '2324', '2425']

div = 'E0'
all_dfs = []
errors = []

def safe_read_csv(url):
    r = requests.get(url)
    r.raise_for_status()
    lines = r.content.decode("ISO-8859-1").splitlines()

    header = next(csv.reader([lines[0]]))
    expected_cols = len(header)

    df = pd.read_csv(StringIO("\n".join(lines)), usecols=range(expected_cols))
    df.columns = df.columns.str.strip()
    return df

# Process each season
for y in years:
    url = f"https://www.football-data.co.uk/mmz4281/{y}/{div}.csv"
    print(f"⬇️ Downloading {url}")
    try:
        df = safe_read_csv(url)

        # Clean and parse dates
        df.columns = df.columns.str.strip()
        if 'Date' not in df.columns:
            raise ValueError("Missing 'Date' column")
        df['Date'] = pd.to_datetime(df['Date'], dayfirst=True, errors='coerce')
        df = df.dropna(subset=['Date'])

        # Sort by date to find actual season year from last match
        df = df.sort_values('Date')
        season_year = df['Date'].iloc[-1].year

        df['Season'] = str(season_year)
        df['Season_Div'] = f"{season_year}_{div}"

        all_dfs.append(df)

    except Exception as e:
        print(f"❌ Error processing {url}: {e}")
        errors.append((y, str(e)))

# Combine and save
combined = pd.concat(all_dfs, ignore_index=True, sort=False)
combined.to_csv("../outputs/England_1_dload.csv", index=False)

print(f"\n✅ Saved {len(combined)} England_1_dload.csv")

if errors:
    print("\n⚠️ Issues with the following seasons:")
    for year, msg in errors:
        print(f"- {year}: {msg}")


⬇️ Downloading https://www.football-data.co.uk/mmz4281/9394/E0.csv


  df['Date'] = pd.to_datetime(df['Date'], dayfirst=True, errors='coerce')


⬇️ Downloading https://www.football-data.co.uk/mmz4281/9495/E0.csv


  df['Date'] = pd.to_datetime(df['Date'], dayfirst=True, errors='coerce')


⬇️ Downloading https://www.football-data.co.uk/mmz4281/9596/E0.csv


  df['Date'] = pd.to_datetime(df['Date'], dayfirst=True, errors='coerce')


⬇️ Downloading https://www.football-data.co.uk/mmz4281/9697/E0.csv


  df['Date'] = pd.to_datetime(df['Date'], dayfirst=True, errors='coerce')


⬇️ Downloading https://www.football-data.co.uk/mmz4281/9798/E0.csv


  df['Date'] = pd.to_datetime(df['Date'], dayfirst=True, errors='coerce')


⬇️ Downloading https://www.football-data.co.uk/mmz4281/9899/E0.csv


  df['Date'] = pd.to_datetime(df['Date'], dayfirst=True, errors='coerce')


⬇️ Downloading https://www.football-data.co.uk/mmz4281/9900/E0.csv


  df['Date'] = pd.to_datetime(df['Date'], dayfirst=True, errors='coerce')


⬇️ Downloading https://www.football-data.co.uk/mmz4281/0001/E0.csv


  df['Date'] = pd.to_datetime(df['Date'], dayfirst=True, errors='coerce')


⬇️ Downloading https://www.football-data.co.uk/mmz4281/0102/E0.csv


  df['Date'] = pd.to_datetime(df['Date'], dayfirst=True, errors='coerce')


⬇️ Downloading https://www.football-data.co.uk/mmz4281/0203/E0.csv
⬇️ Downloading https://www.football-data.co.uk/mmz4281/0304/E0.csv


  df['Date'] = pd.to_datetime(df['Date'], dayfirst=True, errors='coerce')


⬇️ Downloading https://www.football-data.co.uk/mmz4281/0405/E0.csv


  df['Date'] = pd.to_datetime(df['Date'], dayfirst=True, errors='coerce')


⬇️ Downloading https://www.football-data.co.uk/mmz4281/0506/E0.csv


  df['Date'] = pd.to_datetime(df['Date'], dayfirst=True, errors='coerce')


⬇️ Downloading https://www.football-data.co.uk/mmz4281/0607/E0.csv


  df['Date'] = pd.to_datetime(df['Date'], dayfirst=True, errors='coerce')


⬇️ Downloading https://www.football-data.co.uk/mmz4281/0708/E0.csv


  df['Date'] = pd.to_datetime(df['Date'], dayfirst=True, errors='coerce')


⬇️ Downloading https://www.football-data.co.uk/mmz4281/0809/E0.csv


  df['Date'] = pd.to_datetime(df['Date'], dayfirst=True, errors='coerce')


⬇️ Downloading https://www.football-data.co.uk/mmz4281/0910/E0.csv


  df['Date'] = pd.to_datetime(df['Date'], dayfirst=True, errors='coerce')


⬇️ Downloading https://www.football-data.co.uk/mmz4281/1011/E0.csv


  df['Date'] = pd.to_datetime(df['Date'], dayfirst=True, errors='coerce')


⬇️ Downloading https://www.football-data.co.uk/mmz4281/1112/E0.csv


  df['Date'] = pd.to_datetime(df['Date'], dayfirst=True, errors='coerce')


⬇️ Downloading https://www.football-data.co.uk/mmz4281/1213/E0.csv


  df['Date'] = pd.to_datetime(df['Date'], dayfirst=True, errors='coerce')


⬇️ Downloading https://www.football-data.co.uk/mmz4281/1314/E0.csv


  df['Date'] = pd.to_datetime(df['Date'], dayfirst=True, errors='coerce')


⬇️ Downloading https://www.football-data.co.uk/mmz4281/1415/E0.csv


  df['Date'] = pd.to_datetime(df['Date'], dayfirst=True, errors='coerce')


⬇️ Downloading https://www.football-data.co.uk/mmz4281/1516/E0.csv
⬇️ Downloading https://www.football-data.co.uk/mmz4281/1617/E0.csv


  df['Date'] = pd.to_datetime(df['Date'], dayfirst=True, errors='coerce')


⬇️ Downloading https://www.football-data.co.uk/mmz4281/1718/E0.csv
⬇️ Downloading https://www.football-data.co.uk/mmz4281/1819/E0.csv
⬇️ Downloading https://www.football-data.co.uk/mmz4281/1920/E0.csv
⬇️ Downloading https://www.football-data.co.uk/mmz4281/2021/E0.csv
⬇️ Downloading https://www.football-data.co.uk/mmz4281/2122/E0.csv
⬇️ Downloading https://www.football-data.co.uk/mmz4281/2223/E0.csv
⬇️ Downloading https://www.football-data.co.uk/mmz4281/2324/E0.csv
⬇️ Downloading https://www.football-data.co.uk/mmz4281/2425/E0.csv

✅ Saved 12324 rows to all_e0_seasons_combined.csv


In [7]:
rows_per_season = (
    combined.groupby("Season", as_index=False)
            .size()
            .rename(columns={"size": "Rows"})
)

print(rows_per_season)

rows_per_season.to_csv("../outputs/England_1_rows_per_season.csv", index=False)

   Season  Rows
0    1994   462
1    1995   462
2    1996   380
3    1997   380
4    1998   380
5    1999   380
6    2000   380
7    2001   380
8    2002   380
9    2003   380
10   2004   380
11   2005   380
12   2006   380
13   2007   380
14   2008   380
15   2009   380
16   2010   380
17   2011   380
18   2012   380
19   2013   380
20   2014   380
21   2015   380
22   2016   380
23   2017   380
24   2018   380
25   2019   380
26   2020   380
27   2021   380
28   2022   380
29   2023   380
30   2024   380
31   2025   380


In [10]:
import pandas as pd, os
from collections import OrderedDict

# Load combined data created earlier
df = pd.read_csv("../outputs/England_1_dload.csv")

# Remove stray unnamed columns
df = df.loc[:, ~df.columns.str.contains(r'^Unnamed')]

# Core column groups in intuitive order
base      = ['Season', 'Div', 'Season_Div', 'Date', 'HomeTeam', 'AwayTeam', 'FTHG', 'FTAG', 'FTR']
ht        = ['HTHG', 'HTAG', 'HTR']
match     = ['Referee', 'HS', 'AS', 'HST', 'AST', 'HC', 'AC', 'HF', 'AF']

wh_small  = ['WHH', 'WHD', 'WHA']
wh_large  = wh_small + ['WHCH', 'WHCD', 'WHCA']
vc_small  = ['VCH', 'VCD', 'VCA']
vc_large  = vc_small + ['VCCH', 'VCCD', 'VCCA']
b365_small= ['B365H', 'B365D', 'B365A']
b365_large= b365_small + ['B365CH', 'B365CD', 'B365CA']

p_small   = ['PSH', 'PSD', 'PSA', 'PSCH', 'PSCD', 'PSCA']
p_extra   = ['P>2.5', 'P<2.5', 'PAHH', 'PAHA', 'PC>2.5', 'PC<2.5', 'PCAHA']

datasets = {
    'Plankton'   : base,
    'Krill'      : base + ht,
    'Squid'      : base + ht + match,
    'Turtle'     : base + ht + match + wh_small + vc_small + b365_small,
    'Dolphin'    : base + ht + match + wh_small + vc_small + b365_small + p_small,
    'Orca'       : base + ht + match + wh_large + vc_large + b365_large + p_small + p_extra,
    'Sperm_whale': base + ht + match + wh_large + vc_large + b365_large + p_small + p_extra,
}

# Ensure target folder exists
os.makedirs("../outputs", exist_ok=True)

for creature, cols in datasets.items():
    # keep only columns present in the dataframe, preserve order, drop duplicates
    ordered = list(OrderedDict.fromkeys([c for c in cols if c in df.columns]))
    df[ordered].to_csv(f"../outputs/England_1_{creature}.csv", index=False)
    print(f"Saved England_1_{creature}.csv")


  df = pd.read_csv("../outputs/England_1_dload.csv")


Saved England_1_Plankton.csv
Saved England_1_Krill.csv
Saved England_1_Squid.csv
Saved England_1_Turtle.csv
Saved England_1_Dolphin.csv
Saved England_1_Orca.csv
Saved England_1_Sperm_whale.csv


In [13]:
import pandas as pd
from collections import OrderedDict

creatures = ['Plankton', 'Krill', 'Squid', 'Turtle', 'Dolphin', 'Orca', 'Sperm_whale']
records   = []

for c in creatures:
    path = f"../outputs/England_1_{c}.csv"
    df   = pd.read_csv(path)

    stats          = df.notna().sum().to_dict()   # non-null counts per column
    stats['TotalRows'] = len(df)
    stats['Dataset']   = c
    records.append(stats)

# build summary, place TotalRows first
summary = pd.DataFrame(records).set_index('Dataset')
summary = summary[['TotalRows'] + [col for col in summary.columns if col != 'TotalRows']]

print(summary)               # inspect before deciding to save
summary.to_csv("../outputs/England_1_summary.csv")

             TotalRows  Season    Div  Season_Div   Date  HomeTeam  AwayTeam  \
Dataset                                                                        
Plankton         12324   12324  11944       12324  12324     12324     12324   
Krill            12324   12324  11944       12324  12324     12324     12324   
Squid            12324   12324  11944       12324  12324     12324     12324   
Turtle           12324   12324  11944       12324  12324     12324     12324   
Dolphin          12324   12324  11944       12324  12324     12324     12324   
Orca             12324   12324  11944       12324  12324     12324     12324   
Sperm_whale      12324   12324  11944       12324  12324     12324     12324   

              FTHG   FTAG    FTR  ...  B365CH  B365CD  B365CA   P>2.5   P<2.5  \
Dataset                           ...                                           
Plankton     12324  12324  12324  ...     NaN     NaN     NaN     NaN     NaN   
Krill        12324  12324  12324  ..