In [1]:
import pandas as pd
import json

In [2]:
with open("../data/matches.json", "r") as file:
    bl1_matches = json.load(file)

In [3]:
def get_end_result(row):
    results = row['matchResults']
    for result in results:
        if result["resultName"] == "Endergebnis":
            return result
    raise ValueError("Couldn't find end result of {row}")

def match_to_record(row):
    record = {
        "id": row['matchID'],
        "match_day": row['group']['groupOrderID'],
        "season": row['leagueSeason'],
        "host_id":row['team1']['teamId'],
        "host_name": row['team1']['shortName'] or row['team1']['teamName'],
        "guest_id": row['team2']['teamId'],
        "guest_name": row['team2']['shortName'] or row['team2']['teamName'],
    }
    if row['leagueSeason'] != 2024: 
        result = get_end_result(row)
        record['host_goals'] = result['pointsTeam1']
        record['guest_goals'] = result['pointsTeam2']
    else:
        record['host_goals'] = None
        record['guest_goals'] = None

    return record

records = [match_to_record(match) for match in bl1_matches]
df_matches_all = pd.DataFrame.from_records(records)
df_matches_all = df_matches_all.set_index("id")
df_matches_all.tail(2)

Unnamed: 0_level_0,match_day,season,host_id,host_name,guest_id,guest_name,host_goals,guest_goals
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
72518,34,2024,87,Gladbach,131,Wolfsburg,,
72519,34,2024,98,St. Pauli,129,Bochum,,


In [4]:
df_matches_all.loc[df_matches_all["host_name"] == "TSG 1899 Hoffenheim", "host_name"] = "Hoffenheim"
df_matches_all.loc[df_matches_all["guest_name"] == "TSG 1899 Hoffenheim", "guest_name"] = "Hoffenheim"

In [5]:
df_matches_all.isna().any()

match_day      False
season         False
host_id        False
host_name      False
guest_id       False
guest_name     False
host_goals      True
guest_goals     True
dtype: bool

In [6]:
df_matches_all.loc[df_matches_all["host_goals"].isna(), "season"].value_counts()

season
2024    306
Name: count, dtype: int64

In [7]:
df_matches_all.to_csv("../data/matches_all.csv")

# Teams

In [8]:
with open("../data/tables.json", "r") as file:
    bl1_tables = json.load(file)

In [9]:
columns = ['points', 'opponentGoals', 'goals', 'matches', 'won', 'lost', 'draw', 'goalDiff']
team_rows = []
for year, teams in bl1_tables.items():
    for team in teams:
        for col in columns:
            team_rows.append([col, year, team['teamInfoId'], team[col]])

df_teams_all = pd.DataFrame(team_rows, columns=['feature', 'year', 'team', 'value'])


In [10]:
df_teams_all = df_teams_all.groupby(["team", "year", "feature"]).sum()["value"].reset_index()
df_teams_all = pd.pivot_table(df_teams_all, values="value", index=["team", "year"], columns="feature")
df_teams_all = df_teams_all.rename(columns={"draw": "draws", "won": "wins", "lost": "defeats", "goalDiff": "goal_diff", "opponentGoals": "opponent_goals"})
df_teams_all = df_teams_all.astype(int)
df_teams_all.to_csv("../data/teams_all.csv")
df_teams_all.tail()

Unnamed: 0_level_0,feature,draws,goal_diff,goals,defeats,matches,opponent_goals,points,wins
team,year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1635,2019,12,44,81,4,34,37,66,18
1635,2020,8,28,60,7,34,32,65,19
1635,2021,7,35,72,10,34,37,58,17
1635,2022,6,23,64,8,34,41,66,20
1635,2023,8,38,77,7,34,39,65,19


# Combine

In [11]:
df_matches_all = pd.read_csv("../data/matches_all.csv", index_col="id")
df_teams_all = pd.read_csv("../data/teams_all.csv", index_col="team")

horizon = 3
historic_columns = ["points", "goals", "opponent_goals", "goal_diff", "wins", "draws", "defeats"]
test_years = range(df_teams_all["year"].min(), 2025)

historic_mean = []
for team in df_teams_all.index.unique():
    mask_team = df_teams_all.index == team
    years = df_teams_all.loc[mask_team, "year"].unique()
    for test_year in test_years:
        horizont_years = [y for y in range(test_year - horizon, test_year) if y in years]
        if len(horizont_years) == 0:
            continue
        
        mask_horizon = df_teams_all["year"].isin(horizont_years)

        masked = df_teams_all.loc[mask_team & mask_horizon, historic_columns].mean()

        record = {
            "team": team,
            "year": test_year,
            "horizon_years": horizont_years
        }
        for col in historic_columns:
            record["horizon_" + col] = masked[col]
        historic_mean.append(record)
    
df_teams_historic = pd.DataFrame.from_records(historic_mean)
df_teams_historic.to_csv("../data/teams_historic_all.csv")
df_teams_historic.tail(10)

Unnamed: 0,team,year,horizon_years,horizon_points,horizon_goals,horizon_opponent_goals,horizon_goal_diff,horizon_wins,horizon_draws,horizon_defeats
292,185,2022,[2019],30.0,36.0,67.0,-31.0,6.0,12.0,16.0
293,199,2024,[2023],42.0,50.0,55.0,-5.0,10.0,12.0,12.0
294,1635,2017,[2016],67.0,66.0,39.0,27.0,20.0,7.0,7.0
295,1635,2018,"[2016, 2017]",60.0,61.5,46.0,15.5,17.5,7.5,9.0
296,1635,2019,"[2016, 2017, 2018]",62.0,62.0,40.333333,21.666667,18.0,8.0,8.0
297,1635,2020,"[2017, 2018, 2019]",61.666667,67.0,39.666667,27.333333,17.333333,9.666667,7.0
298,1635,2021,"[2018, 2019, 2020]",65.666667,68.0,32.666667,35.333333,18.666667,9.666667,5.666667
299,1635,2022,"[2019, 2020, 2021]",63.0,71.0,35.333333,35.666667,18.0,9.0,7.0
300,1635,2023,"[2020, 2021, 2022]",63.0,65.333333,36.666667,28.666667,18.666667,7.0,8.333333
301,1635,2024,"[2021, 2022, 2023]",63.0,71.0,39.0,32.0,18.666667,7.0,8.333333


In [12]:
df_team_host = df_teams_all.reset_index()
df_team_host["year"] = df_team_host["year"] + 1
df_team_host.columns = "host_last_season_" + df_team_host.columns
df_full = pd.merge(left=df_matches_all.reset_index(), left_on=["host_id", "season"], right=df_team_host, right_on=["host_last_season_team", "host_last_season_year"], how="left", validate="m:1")

df_team_guest = df_teams_all.reset_index()
df_team_guest["year"] = df_team_guest["year"] + 1
df_team_guest.columns = "guest_last_season_" + df_team_guest.columns
df_full = pd.merge(left=df_full, left_on=["guest_id", "season"], right=df_team_guest, right_on=["guest_last_season_team", "guest_last_season_year"], how="left", validate="m:1")

df_full = df_full.drop(columns=["guest_last_season_team", "host_last_season_team", "guest_last_season_year", "host_last_season_year"])

df_team_host = df_teams_historic.copy()
df_team_host.columns = "host_" + df_team_host.columns
df_full = pd.merge(left=df_full, left_on=["host_id", "season"], right=df_team_host, right_on=["host_team", "host_year"], how="left", validate="m:1")

df_team_guest = df_teams_historic.copy()
df_team_guest.columns = "guest_" + df_team_guest.columns
df_full = pd.merge(left=df_full, left_on=["guest_id", "season"], right=df_team_guest, right_on=["guest_team", "guest_year"], how="left", validate="m:1")

df_full = df_full.drop(columns=["guest_team", "guest_year", "host_team", "host_year", "guest_horizon_years", "host_horizon_years"])
df_full = df_full.set_index("id")

cols_fill_zero = ["goals", "wins", "draws", "defeats", "points", "matches"]
for col in df_full.columns:
    if not any([c for c in cols_fill_zero if c in col]):
        continue
    df_full[col] = df_full[col].fillna(0)

cols_fill_min = ["goal_diff"]
for col in df_full.columns:
    if not any([c for c in cols_fill_min if c in col]):
        continue
    df_full[col] = df_full[col].fillna(-100)

df_full.tail(10)

Unnamed: 0_level_0,match_day,season,host_id,host_name,guest_id,guest_name,host_goals,guest_goals,host_last_season_draws,host_last_season_goal_diff,...,host_horizon_wins,host_horizon_draws,host_horizon_defeats,guest_horizon_points,guest_horizon_goals,guest_horizon_opponent_goals,guest_horizon_goal_diff,guest_horizon_wins,guest_horizon_draws,guest_horizon_defeats
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
72510,33,2024,104,Kiel,112,Freiburg,0.0,0.0,0.0,-100.0,...,0.0,0.0,0.0,52.0,51.333333,49.333333,2.0,14.333333,9.0,10.666667
72511,34,2024,1635,Leipzig,16,Stuttgart,0.0,0.0,8.0,38.0,...,18.666667,7.0,8.333333,46.333333,54.666667,51.666667,3.0,12.333333,9.333333,12.333333
72512,34,2024,7,BVB,104,Kiel,0.0,0.0,9.0,25.0,...,20.666667,5.666667,7.666667,0.0,0.0,0.0,-100.0,0.0,0.0,0.0
72513,34,2024,175,Hoffenheim,40,Bayern,0.0,0.0,7.0,0.0,...,12.0,6.666667,15.333333,73.333333,94.333333,40.0,54.333333,22.666667,5.333333,6.0
72514,34,2024,199,Heidenheim,134,Bremen,0.0,0.0,12.0,-5.0,...,10.0,12.0,12.0,39.0,49.5,59.0,-9.5,10.5,7.5,16.0
72515,34,2024,112,Freiburg,91,Frankfurt,0.0,0.0,9.0,-13.0,...,14.333333,9.0,10.666667,46.333333,51.333333,50.333333,1.0,11.333333,12.333333,10.333333
72516,34,2024,95,Augsburg,80,Union Berlin,0.0,0.0,9.0,-10.0,...,9.666667,8.0,16.333333,50.666667,44.666667,46.666667,-2.0,14.333333,7.666667,12.0
72517,34,2024,81,Mainz,6,Leverkusen,0.0,0.0,14.0,-12.0,...,10.666667,10.333333,13.0,68.0,75.333333,40.0,35.333333,20.333333,7.0,6.666667
72518,34,2024,87,Gladbach,131,Wolfsburg,0.0,0.0,13.0,-11.0,...,10.0,10.666667,13.333333,42.666667,47.0,52.666667,-5.666667,11.666667,7.666667,14.666667
72519,34,2024,98,St. Pauli,129,Bochum,0.0,0.0,0.0,-100.0,...,0.0,0.0,0.0,36.666667,40.0,66.0,-26.0,9.666667,7.666667,16.666667


In [13]:
cols = [col.replace("host_", "") for col in df_full.columns if col.startswith("host")]
cols = [col for col in cols if col not in ["id", "name", "goals"]]
print(cols)

for col in cols:
    df_full["diff_" + col] = df_full["host_" + col] - df_full["guest_" + col]

['last_season_draws', 'last_season_goal_diff', 'last_season_goals', 'last_season_defeats', 'last_season_matches', 'last_season_opponent_goals', 'last_season_points', 'last_season_wins', 'horizon_points', 'horizon_goals', 'horizon_opponent_goals', 'horizon_goal_diff', 'horizon_wins', 'horizon_draws', 'horizon_defeats']


In [14]:
df_full.tail(10)

Unnamed: 0_level_0,match_day,season,host_id,host_name,guest_id,guest_name,host_goals,guest_goals,host_last_season_draws,host_last_season_goal_diff,...,diff_last_season_opponent_goals,diff_last_season_points,diff_last_season_wins,diff_horizon_points,diff_horizon_goals,diff_horizon_opponent_goals,diff_horizon_goal_diff,diff_horizon_wins,diff_horizon_draws,diff_horizon_defeats
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
72510,33,2024,104,Kiel,112,Freiburg,0.0,0.0,0.0,-100.0,...,-58.0,-42.0,-11.0,-52.0,-51.333333,-49.333333,-102.0,-14.333333,-9.0,-10.666667
72511,34,2024,1635,Leipzig,16,Stuttgart,0.0,0.0,8.0,38.0,...,0.0,-8.0,-4.0,16.666667,16.333333,-12.666667,29.0,6.333333,-2.333333,-4.0
72512,34,2024,7,BVB,104,Kiel,0.0,0.0,9.0,25.0,...,43.0,63.0,18.0,67.666667,78.666667,46.333333,132.333333,20.666667,5.666667,7.666667
72513,34,2024,175,Hoffenheim,40,Bayern,0.0,0.0,7.0,0.0,...,21.0,-26.0,-10.0,-30.666667,-37.0,21.0,-58.0,-10.666667,1.333333,9.333333
72514,34,2024,199,Heidenheim,134,Bremen,0.0,0.0,12.0,-5.0,...,1.0,0.0,-1.0,3.0,0.5,-4.0,4.5,-0.5,4.5,-4.0
72515,34,2024,112,Freiburg,91,Frankfurt,0.0,0.0,9.0,-13.0,...,8.0,-5.0,0.0,5.666667,0.0,-1.0,1.0,3.0,-3.333333,0.333333
72516,34,2024,95,Augsburg,80,Union Berlin,0.0,0.0,9.0,-10.0,...,2.0,6.0,1.0,-13.666667,-1.0,13.0,-14.0,-4.666667,0.333333,4.333333
72517,34,2024,81,Mainz,6,Leverkusen,0.0,0.0,14.0,-12.0,...,27.0,-55.0,-21.0,-25.666667,-27.666667,10.333333,-38.0,-9.666667,3.333333,6.333333
72518,34,2024,87,Gladbach,131,Wolfsburg,0.0,0.0,13.0,-11.0,...,11.0,-3.0,-3.0,-2.0,7.0,8.333333,-1.333333,-1.666667,3.0,-1.333333
72519,34,2024,98,St. Pauli,129,Bochum,0.0,0.0,0.0,-100.0,...,-74.0,-33.0,-7.0,-36.666667,-40.0,-66.0,-74.0,-9.666667,-7.666667,-16.666667


In [15]:
df_full.to_csv("../data/matches_all_with_context.csv")

In [16]:
df_full.head(10)

Unnamed: 0_level_0,match_day,season,host_id,host_name,guest_id,guest_name,host_goals,guest_goals,host_last_season_draws,host_last_season_goal_diff,...,diff_last_season_opponent_goals,diff_last_season_points,diff_last_season_wins,diff_horizon_points,diff_horizon_goals,diff_horizon_opponent_goals,diff_horizon_goal_diff,diff_horizon_wins,diff_horizon_draws,diff_horizon_defeats
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
9998,1,2010,40,Bayern,131,Wolfsburg,2.0,1.0,0.0,-100.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10002,1,2010,175,Hoffenheim,134,Bremen,4.0,1.0,0.0,-100.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10003,1,2010,87,Gladbach,79,Nürnberg,1.0,1.0,0.0,-100.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10004,1,2010,65,Köln,76,Kaiserslautern,1.0,3.0,0.0,-100.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10005,1,2010,112,Freiburg,98,St. Pauli,1.0,3.0,0.0,-100.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10006,1,2010,55,Hannover,91,Frankfurt,2.0,1.0,0.0,-100.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10000,1,2010,100,HSV,9,Schalke,2.0,1.0,0.0,-100.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10001,1,2010,81,Mainz,16,Stuttgart,2.0,0.0,0.0,-100.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9999,1,2010,7,BVB,6,Leverkusen,0.0,2.0,0.0,-100.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10014,2,2010,76,Kaiserslautern,40,Bayern,2.0,0.0,0.0,-100.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Splitting

In [17]:
df_train = df_full[df_full["season"] != 2024]
df_train.to_csv("../data/matches.csv")

df_future = df_full[df_full["season"] == 2024]
df_future.to_csv("../data/matches_future.csv")