# Data Acquisition

This notebook downloads match and league-table data from the _OpenLigaDB_ API (league shortcut `bl1`), stores the raw JSON under `data/`, and prepares pandas DataFrames from it.
The data consists of two types:
- matches
- tables

In [1]:
import requests
import json
from tqdm.notebook import trange
import os

# Ensure the output directory exists before any cell writes into "data/";
# exist_ok=True makes re-running this cell a no-op.
os.makedirs("data", exist_ok=True)

### Download matches

In [2]:
# Inclusive range of seasons to download.
SEASON_START = 2010
SEASON_END = 2024

# Collect the matches of every season into one flat list.
matches = []
for season in trange(SEASON_START, SEASON_END + 1):
    response = requests.get(
        f"https://api.openligadb.de/getmatchdata/bl1/{season}",
        timeout=30,  # seconds; without a timeout requests may block indefinitely
    )
    # Fail loudly on HTTP errors instead of parsing an error body as match data.
    response.raise_for_status()

    season_matches = response.json()
    # `list += dict` would silently append the dict's keys, so reject anything
    # that is not a list of matches.
    if not isinstance(season_matches, list):
        raise ValueError(
            f"Unexpected payload for season {season}: expected a list, "
            f"got {type(season_matches).__name__}"
        )
    matches.extend(season_matches)

# Persist the raw API payload so the preparation cells can run without re-downloading.
with open("data/matches.json", "w") as file:
    json.dump(matches, file)

print(f"Downloaded {len(matches)} matches")
# Free the raw list; it is re-read from disk in the preparation section.
del matches

  0%|          | 0/15 [00:00<?, ?it/s]

Downloaded 4590 matches


### Download tables

In [3]:
# Map each season (as a string key, so it survives the JSON round trip unchanged)
# to the league table returned by the API for that season.
tables = {}
for season in trange(SEASON_START, SEASON_END + 1):
    response = requests.get(
        f"https://api.openligadb.de/getbltable/bl1/{season}",
        timeout=30,  # seconds; without a timeout requests may block indefinitely
    )
    # Fail loudly on HTTP errors instead of storing an error body as a table.
    response.raise_for_status()

    season_table = response.json()
    # The preparation step iterates over the table's team entries, so anything
    # other than a list would only fail later with a confusing error.
    if not isinstance(season_table, list):
        raise ValueError(
            f"Unexpected payload for season {season}: expected a list, "
            f"got {type(season_table).__name__}"
        )
    tables[str(season)] = season_table


# Persist the raw API payload so the preparation cells can run without re-downloading.
with open("data/tables.json", "w") as file:
    json.dump(tables, file)

# Free the raw dict; it is re-read from disk in the preparation section.
del tables

  0%|          | 0/15 [00:00<?, ?it/s]

## Prepare Dataset

### Matches

In [4]:
import pandas as pd

# Read back the raw match list written by the download step.
with open("data/matches.json", "r") as file:
    matches = json.loads(file.read())


def retrieve_end_result(row: dict):
    """Return the final-score entry of a raw match dict.

    Scans ``row["matchResults"]`` in order and returns the first entry whose
    ``"resultName"`` equals ``"Endergebnis"`` (German for "final result").

    Raises:
        ValueError: if no entry with that result name is present.
    """
    for candidate in row["matchResults"]:
        if candidate.get("resultName") == "Endergebnis":
            return candidate

    raise ValueError(f"Couldn't retrieve end result for {row}")


def match_to_record(row):
    """Flatten one raw match dict into a flat record for the matches dataframe.

    The record contains the match id, match day, season, ids and names of both
    teams (the short name is preferred, falling back to the full team name when
    the short name is empty) and the final score as ``host_goals`` and
    ``guest_goals``.

    Matches of the last downloaded season (``SEASON_END``) may have no final
    result yet; for those the goals are set to ``None``.

    Raises:
        ValueError: if a match of any other season has no final result.
    """
    record = {
        "id": row["matchID"],
        "match_day": row["group"]["groupOrderID"],
        "season": row["leagueSeason"],
        "host_id": row["team1"]["teamId"],
        "host_name": row["team1"]["shortName"] or row["team1"]["teamName"],
        "guest_id": row["team2"]["teamId"],
        "guest_name": row["team2"]["shortName"] or row["team2"]["teamName"],
    }

    try:
        result = retrieve_end_result(row)
    except ValueError:
        # A missing result is only tolerated for the last downloaded season;
        # anywhere else it indicates broken data and must surface.
        if row["leagueSeason"] != SEASON_END:
            raise
        result = {"pointsTeam1": None, "pointsTeam2": None}

    record["host_goals"] = result["pointsTeam1"]
    record["guest_goals"] = result["pointsTeam2"]

    return record


# Flatten every raw match into a record, then index the resulting frame by match id.
records = list(map(match_to_record, matches))
del matches
df_matches = pd.DataFrame.from_records(records).set_index("id")
df_matches.tail(2)

Unnamed: 0_level_0,match_day,season,host_id,host_name,guest_id,guest_name,host_goals,guest_goals
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
72518,34,2024,87,Gladbach,131,Wolfsburg,,
72519,34,2024,98,St. Pauli,129,Bochum,,


In [5]:
# Replace the long team name with a shorter one, for the host and the guest column alike
for name_column in ("host_name", "guest_name"):
    is_long_name = df_matches[name_column] == "TSG 1899 Hoffenheim"
    df_matches.loc[is_long_name, name_column] = "Hoffenheim"

### Tables / Teams

In [6]:
# Read back the raw per-season tables written by the download step.
with open("data/tables.json", "r") as file:
    tables = json.load(file)

# Table fields to carry over for every team and season.
columns = ["points", "opponentGoals", "goals", "matches", "won", "lost", "draw", "goalDiff"]

# Long format: one [feature, year, team, value] row per team, season and field.
team_rows = []
for year, teams in tables.items():
    for team in teams:
        for col in columns:
            team_rows.append([col, year, team["teamInfoId"], team[col]])

df_teams = (
    pd.DataFrame(team_rows, columns=["feature", "year", "team", "value"])
    # Sum values that share the same (team, year, feature) key.
    .groupby(["team", "year", "feature"])["value"]
    .sum()
    .reset_index()
    # Wide format: one row per (team, year), one column per feature.
    .pipe(pd.pivot_table, values="value", index=["team", "year"], columns="feature")
    .rename(
        columns={
            "draw": "draws",
            "won": "wins",
            "lost": "defeats",
            "goalDiff": "goal_diff",
            "opponentGoals": "opponent_goals",
        }
    )
    .reset_index()
    # Cast every column (including the string season key) to int.
    .astype(int)
)
del team_rows
df_teams.tail(3)

feature,team,year,draws,goal_diff,goals,defeats,matches,opponent_goals,points,wins
267,1635,2022,6,23,64,8,34,41,66,20
268,1635,2023,8,38,77,7,34,39,65,19
269,1635,2024,1,2,4,0,3,2,7,2


In [7]:
# Persist both prepared dataframes to disk.
for target_path, frame in (
    ("data/df_matches.pickle", df_matches),
    ("data/df_teams.pickle", df_teams),
):
    frame.to_pickle(target_path)