In [77]:
import pandas as pd
import requests

# Matches

In [78]:
# Download the match data of every bl1 season from 2010 through 2023 from the
# OpenLigaDB API and collect all matches in one flat list.
bl1_matches = []

for year in range(2010, 2024):
    # A timeout keeps a stalled connection from hanging the notebook forever.
    response = requests.get(f"https://api.openligadb.de/getmatchdata/bl1/{year}", timeout=30)
    # Fail loudly on an HTTP error instead of parsing an error body as match data.
    response.raise_for_status()
    years_matches = response.json()
    bl1_matches += years_matches
print(f"Scraped {len(bl1_matches)} bl1 matches")


Scraped 4284 bl1 matches


In [79]:
def get_end_result(row):
    """Return the final-score entry of a match.

    Parameters
    ----------
    row : dict
        A match whose 'matchResults' value is a list of result dicts, each
        carrying a 'resultName' key.

    Returns
    -------
    dict
        The first result whose 'resultName' equals "Endergebnis".

    Raises
    ------
    ValueError
        If no result with that name is present.
    """
    results = row['matchResults']
    for result in results:
        if result["resultName"] == "Endergebnis":
            return result
    # The f-prefix is required so the offending match is actually shown in the message.
    raise ValueError(f"Couldn't find end result of {row}")

def match_to_record(row):
    """Flatten one match dict into a flat record for a DataFrame.

    The record holds the match id, match day, season, id and name of both
    teams, and the goals taken from the result returned by `get_end_result`.
    A team's 'shortName' is used as its name; when that is empty or None
    the 'teamName' is used instead.
    """
    final_score = get_end_result(row)

    record = {
        "id": row['matchID'],
        "match_day": row['group']['groupOrderID'],
        "season": row['leagueSeason'],
    }

    host = row['team1']
    record["host_id"] = host['teamId']
    record["host_name"] = host['shortName'] or host['teamName']

    guest = row['team2']
    record["guest_id"] = guest['teamId']
    record["guest_name"] = guest['shortName'] or guest['teamName']

    record["host_goals"] = final_score['pointsTeam1']
    record["guest_goals"] = final_score['pointsTeam2']
    return record

# Flatten every scraped match into a record and index the resulting frame by match id.
records = list(map(match_to_record, bl1_matches))
df_matches = pd.DataFrame.from_records(records).set_index("id")
df_matches.tail(2)

Unnamed: 0_level_0,match_day,season,host_id,host_name,guest_id,guest_name,host_goals,guest_goals
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
66935,34,2023,16,Stuttgart,87,Gladbach,4,0
66936,34,2023,199,Heidenheim,65,Köln,4,1


In [80]:
# Replace the long club name with the short form in both name columns, then persist the matches.
for name_column in ("host_name", "guest_name"):
    has_long_name = df_matches[name_column] == "TSG 1899 Hoffenheim"
    df_matches.loc[has_long_name, name_column] = "Hoffenheim"
df_matches.to_csv("matches.csv")

# Teams

In [81]:
# Download the bl1 league table of every season from 2010 through 2023 from the
# OpenLigaDB API, keyed by the season year as a string.
bl1_tables = {}
for year in range(2010, 2024):
    # A timeout keeps a stalled connection from hanging the notebook forever.
    response = requests.get(f"https://api.openligadb.de/getbltable/bl1/{year}", timeout=30)
    # Fail loudly on an HTTP error instead of storing an error body as a table.
    response.raise_for_status()
    bl1_tables[str(year)] = response.json()

# Table fields to extract for every team and season.
columns = ['points', 'opponentGoals', 'goals', 'matches', 'won', 'lost', 'draw', 'goalDiff']

# Long format: one row per (feature, season, team) holding that feature's value.
team_rows = [
    [col, year, team['teamInfoId'], team[col]]
    for year, teams in bl1_tables.items()
    for team in teams
    for col in columns
]

df_teams_all = pd.DataFrame(team_rows, columns=['feature', 'year', 'team', 'value'])
n_teams = len(df_teams_all["team"].unique())
print(f"Scraped {n_teams} teams")


Scraped 32 teams


In [82]:
# Reshape the long feature table into one row per (team, year) with one integer
# column per feature, rename the features to snake_case, and persist the result.
feature_names = {
    "draw": "draws",
    "won": "wins",
    "lost": "defeats",
    "goalDiff": "goal_diff",
    "opponentGoals": "opponent_goals",
}
df_teams = (
    df_teams_all
    .groupby(["team", "year", "feature"])
    .sum()["value"]
    .reset_index()
    .pivot_table(values="value", index=["team", "year"], columns="feature")
    .rename(columns=feature_names)
    .astype(int)
)
df_teams.to_csv("teams.csv")
df_teams.tail()

Unnamed: 0_level_0,feature,draws,goal_diff,goals,defeats,matches,opponent_goals,points,wins
team,year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1635,2019,12,44,81,4,34,37,66,18
1635,2020,8,28,60,7,34,32,65,19
1635,2021,7,35,72,10,34,37,58,17
1635,2022,6,23,64,8,34,41,66,20
1635,2023,8,38,77,7,34,39,65,19


# Combine

In [86]:
# Reload the persisted frames so this section starts from the files on disk.
df_matches = pd.read_csv("matches.csv", index_col="id")
df_teams = pd.read_csv("teams.csv", index_col="team")

# Number of preceding seasons that are averaged for every (team, season) pair.
horizon = 3
historic_columns = ["points", "goals", "opponent_goals", "goal_diff", "wins", "draws", "defeats"]

historic_mean = []
for team in df_teams.index.unique():
    is_team = df_teams.index == team
    seasons_played = df_teams.loc[is_team, "year"].unique()

    for test_year in seasons_played:
        # Only seasons inside the look-back window for which the team has a table row.
        window_years = [
            candidate
            for candidate in range(test_year - horizon, test_year)
            if candidate in seasons_played
        ]
        if not window_years:
            # No earlier season available for this team, so there is nothing to average.
            continue

        in_window = df_teams["year"].isin(window_years)
        window_mean = df_teams.loc[is_team & in_window, historic_columns].mean()

        record = {"team": team, "year": test_year, "horizon_years": window_years}
        record.update({"horizon_" + col: window_mean[col] for col in historic_columns})
        historic_mean.append(record)

df_teams_historic = pd.DataFrame.from_records(historic_mean)
df_teams_historic.to_csv("teams_historic.csv")
df_teams_historic.head(1)

Unnamed: 0,team,year,horizon_years,horizon_points,horizon_goals,horizon_opponent_goals,horizon_goal_diff,horizon_wins,horizon_draws,horizon_defeats
0,6,2011,[2010],68.0,64.0,44.0,20.0,20.0,8.0,6.0


In [84]:
def _merge_team_context(df_left, df_context, side, prefix, extra_drop=()):
    """Left-join per-team context columns onto the matches for one side.

    Parameters
    ----------
    df_left : pandas.DataFrame
        Matches; must contain the columns "<side>_id" and "season".
    df_context : pandas.DataFrame
        One row per team and year, with "team" and "year" columns plus the
        context columns to attach. It is copied, not modified.
    side : str
        "host" or "guest"; selects the "<side>_id" column of `df_left`.
    prefix : str
        Prepended to every column name of `df_context` before merging.
    extra_drop : iterable of str
        Unprefixed names of context columns to discard after the merge.

    Returns
    -------
    pandas.DataFrame
        `df_left` with the prefixed context columns appended; the prefixed
        "team" and "year" join keys are removed again.

    Raises
    ------
    pandas.errors.MergeError
        If `df_context` holds more than one row for a (team, year) pair
        (enforced through validate="m:1").
    """
    df_side = df_context.copy()
    df_side.columns = prefix + df_side.columns
    key_columns = [prefix + "team", prefix + "year"]
    df_merged = pd.merge(
        left=df_left,
        left_on=[f"{side}_id", "season"],
        right=df_side,
        right_on=key_columns,
        how="left",
        validate="m:1",
    )
    return df_merged.drop(columns=key_columns + [prefix + col for col in extra_drop])


# Shift every table row one season forward so that a team's row for season Y
# joins to that team's matches of season Y + 1.
df_last_season = df_teams.reset_index()
df_last_season["year"] = df_last_season["year"] + 1

# NOTE: merging on columns yields a fresh RangeIndex, so the match `id` index
# of df_matches is not carried over into df_full.
df_full = df_matches
for side in ("host", "guest"):
    df_full = _merge_team_context(df_full, df_last_season, side, f"{side}_last_season_")

# Attach the multi-season means; the list of averaged years is not kept.
for side in ("host", "guest"):
    df_full = _merge_team_context(df_full, df_teams_historic, side, f"{side}_", extra_drop=["horizon_years"])

df_full.tail(2)

Unnamed: 0,match_day,season,host_id,host_name,guest_id,guest_name,host_goals,guest_goals,host_last_season_draws,host_last_season_goal_diff,...,host_horizon_wins,host_horizon_draws,host_horizon_defeats,guest_horizon_points,guest_horizon_goals,guest_horizon_opponent_goals,guest_horizon_goal_diff,guest_horizon_wins,guest_horizon_draws,guest_horizon_defeats
4282,34,2023,16,Stuttgart,87,Gladbach,4,0,12.0,-12.0,...,8.666667,11.0,14.333333,45.666667,56.666667,57.333333,-0.666667,12.0,9.666667,12.333333
4283,34,2023,199,Heidenheim,65,Köln,4,1,,,...,,,,42.333333,45.0,54.333333,-9.333333,10.666667,10.333333,13.0


In [85]:
# Persist the matches enriched with last-season and multi-season team context.
df_full.to_csv(path_or_buf="matches_with_context.csv")