# Merge LaLiga CSV Files (by season)

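This notebook merges all `SP1 *.csv` season files from the `../data/` directory in chronological order and saves the result as `merged.csv`. It then joins per-season Elo ratings from `elo_by_season.csv` onto `2015-2025.csv` (matching on team and season) and overwrites that file.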


In [1]:
import pandas as pd
from pathlib import Path

data_dir = Path('../data')
# File names such as 'SP1 2015.csv' sort lexicographically in season order,
# so a plain sort yields the files chronologically.
csv_files = sorted(data_dir.glob('SP1 *.csv'))
if not csv_files:
    # Fail early with an actionable message: pd.concat([]) would otherwise
    # raise a generic "No objects to concatenate" that hides the real cause
    # (wrong working directory or missing data files).
    raise FileNotFoundError(
        f"No files matching 'SP1 *.csv' found in {data_dir.resolve()}"
    )
print(f'Merging files: {[f.name for f in csv_files]}')

# Read every season file and stack them in order; ignore_index gives the
# combined frame one continuous 0..n-1 index instead of per-file indices.
dfs = [pd.read_csv(f) for f in csv_files]
merged = pd.concat(dfs, ignore_index=True)

# Write the combined table next to this notebook (without the index column)
merged_out = Path('merged.csv')
merged.to_csv(merged_out, index=False)
print(f'Merged CSV written to: {merged_out}')
merged.head()


Merging files: ['SP1 2015.csv', 'SP1 2016.csv', 'SP1 2017.csv', 'SP1 2018.csv', 'SP1 2019.csv', 'SP1 2020.csv', 'SP1 2021.csv', 'SP1 2022.csv', 'SP1 2023.csv', 'SP1 2024.csv', 'SP1 2025.csv']
Merged CSV written to: merged.csv


Unnamed: 0,Div,Date,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,HTR,...,BMGMCA,BVCH,BVCD,BVCA,CLCH,CLCD,CLCA,LBCH,LBCD,LBCA
0,SP1,21/08/15,Malaga,Sevilla,0,0,D,0,0,D,...,,,,,,,,,,
1,SP1,22/08/15,Ath Madrid,Las Palmas,1,0,H,1,0,H,...,,,,,,,,,,
2,SP1,22/08/15,Espanol,Getafe,1,0,H,1,0,H,...,,,,,,,,,,
3,SP1,22/08/15,La Coruna,Sociedad,0,0,D,0,0,D,...,,,,,,,,,,
4,SP1,22/08/15,Vallecano,Valencia,0,0,D,0,0,D,...,,,,,,,,,,


In [3]:
# Merge Elo ratings into 2015-2025.csv on team + season and overwrite the file
import pandas as pd
from pathlib import Path

# Both input files are resolved relative to this notebook's directory
# (data_engineering).
team_season_path, elo_path = Path("2015-2025.csv"), Path("elo_by_season.csv")

# Read the team-season table into `df` and the per-season Elo table into `elo`
df, elo = (pd.read_csv(csv_path) for csv_path in (team_season_path, elo_path))

# Normalize season key to two-digit format (e.g., 2015/16 -> 15/16)
def to_two_digit_season(s: str) -> str:
    """Normalise a season label to two-digit ``YY/YY`` form.

    The value is converted with ``str()`` and stripped of surrounding
    whitespace. If it contains a ``/``, it is split on the first slash and
    the last two characters of each (stripped) half are kept, so
    ``"2015/16"``, ``"2015/2016"`` and ``" 2015 / 16 "`` all become
    ``"15/16"``. Labels without a slash are returned stripped but otherwise
    unchanged.

    Args:
        s: Season label; non-string values are stringified first.

    Returns:
        The normalised season string.
    """
    s = str(s).strip()
    if "/" not in s:
        return s
    # Strip each half as well: stray spaces would otherwise end up inside the
    # two-character slices and silently break the team/season join.
    left, right = (part.strip() for part in s.split("/", 1))
    return f"{left[-2:]}/{right[-2:]}"

# Put the join key into the same two-digit form on both sides
df['season'] = df['season'].apply(to_two_digit_season)
elo['season'] = elo['season'].apply(to_two_digit_season)

# Elo columns to join, and the "_elo"-suffixed names they are stored under
elo_cols = ["elo_start_of_season", "elo_end_of_season", "elo_mean_of_season"]
keep_elo_cols = [f"{c}_elo" for c in elo_cols]

# Remove every Elo column already present in df, whether unsuffixed or left
# behind with the "_elo" suffix by an earlier run of this cell. Because the
# cell overwrites its own input file, a second run sees the suffixed columns;
# relying on merge(..., suffixes=...) only renames *overlapping* columns, so
# it would then drop the freshly joined values and keep the stale ones.
stale_cols = [c for c in elo_cols + keep_elo_cols if c in df.columns]

# Rename explicitly so the result always carries the "_elo" columns,
# regardless of which Elo columns df happened to contain beforehand.
elo_renamed = elo[["team", "season"] + elo_cols].rename(
    columns=dict(zip(elo_cols, keep_elo_cols))
)

# Left join keeps every team-season row; rows without a match get NaN Elo values
merged = df.drop(columns=stale_cols).merge(
    elo_renamed,
    on=["team", "season"],
    how="left",
)

# Overwrite the original 2015-2025.csv with Elo-filled dataset
merged.to_csv(team_season_path, index=False)
print("Updated", team_season_path, "with Elo ratings.")

# Quick preview of keys and Elo columns
display(merged[["team", "season"] + keep_elo_cols].head(20))


Updated 2015-2025.csv with Elo ratings.


Unnamed: 0,team,season,elo_start_of_season_elo,elo_end_of_season_elo,elo_mean_of_season_elo
0,Alaves,16/17,1300.0,1347.240342,1309.374489
1,Alaves,17/18,1347.240342,1306.381991,1284.339356
2,Alaves,18/19,1306.381991,1299.643613,1335.265079
3,Alaves,19/20,1299.643613,1249.263197,1285.863602
4,Alaves,20/21,1249.263197,1264.336068,1246.566436
5,Alaves,21/22,1264.336068,1204.569031,1230.119729
6,Alaves,23/24,1204.569031,1297.988635,1236.168108
7,Alaves,24/25,1297.988635,1311.2309,1276.435307
8,Alaves,25/26,1311.2309,1303.283878,1318.183679
9,Almeria,22/23,1300.0,1278.228583,1280.027728
