In [1]:
from api import openligadb
import datetime

def extract_result(match_results):
    """Derive the status and score of a match from its list of result entries.

    Args:
        match_results: iterable of mappings, each with a 'resultName' key and,
            for the final result, 'pointsTeam1' / 'pointsTeam2'.

    Returns:
        ("finished", pointsTeam1, pointsTeam2) taken from the first entry whose
        'resultName' is 'Endergebnis', or ("future", None, None) when no such
        entry exists.
    """
    final_entry = next(
        (entry for entry in match_results if entry['resultName'] == 'Endergebnis'),
        None,
    )
    if final_entry is None:
        return "future", None, None
    return "finished", final_entry['pointsTeam1'], final_entry['pointsTeam2']



def extract_winner(match_data):
    """Return the id of the winning team, or None when there is no winner.

    Compares match_data["goalsHome"] with match_data["goalsAway"] and returns
    match_data["teamHomeId"] or match_data["teamAwayId"] accordingly. None is
    returned for equal scores and whenever the comparison raises TypeError
    (for example when the goals are None).
    """
    try:
        home_goals = match_data["goalsHome"]
        away_goals = match_data["goalsAway"]
        if home_goals > away_goals:
            return match_data["teamHomeId"]
        if home_goals < away_goals:
            return match_data["teamAwayId"]
    except TypeError:
        # Goals that cannot be compared are treated like "no winner".
        pass
    return None


def extract_match_data(json):
    """Flatten one match record into a plain dict.

    Args:
        json: mapping with the keys "matchID", "matchDateTime"
            (formatted as "%Y-%m-%dT%H:%M:%S"), "team1", "team2",
            "matchResults", "group" and "leagueSeason".

    Returns:
        dict with the keys id, date, teamHomeId, teamHomeName, teamAwayId,
        teamAwayName, status, goalsHome, goalsAway, winnerTeamId, matchDay and
        season, inserted in exactly that order.
    """
    match_data = {
        "id": json["matchID"],
        "date": datetime.datetime.strptime(json["matchDateTime"], "%Y-%m-%dT%H:%M:%S"),
        "teamHomeId": json["team1"]["teamId"],
        "teamHomeName": json["team1"]["teamName"],
        "teamAwayId": json["team2"]["teamId"],
        "teamAwayName": json["team2"]["teamName"],
    }
    status, goals_home, goals_away = extract_result(json["matchResults"])
    match_data.update(status=status, goalsHome=goals_home, goalsAway=goals_away)
    # The winner is derived from the goals and team ids stored just above.
    match_data["winnerTeamId"] = extract_winner(match_data)
    match_data.update(matchDay=json["group"]["groupOrderID"], season=json["leagueSeason"])
    return match_data

In [2]:
from tqdm import tqdm
import pickle
import pandas as pd

# Load the cached match table, or rebuild it from the API and cache it.
# NOTE: pickle.load can execute arbitrary code; only load cache files that
# were written by this notebook.
try:
    with open("match_df.pck", "rb") as cache_file:
        match_df = pickle.load(cache_file)
except Exception:
    # Any failure to read the cache (missing or unreadable file) falls back to
    # a full download. `Exception` rather than a bare `except` so that
    # KeyboardInterrupt / SystemExit are no longer swallowed.
    match_data = []
    for year in tqdm(range(2005, 2024)):
        json = openligadb.get_all_season_matches("bl1", year)
        for match_data_json in json:
            match_data.append(extract_match_data(match_data_json))
    match_df = pd.DataFrame(match_data)
    # Context manager guarantees the cache file is flushed and closed.
    with open("match_df.pck", "wb") as cache_file:
        pickle.dump(match_df, cache_file)

In [3]:
import numpy as np
from dateutil.relativedelta import relativedelta
import datetime

# Switch for a dry run: when True, matches finished within the last 14 days
# are turned back into "future" matches with their results blanked out.
testing = False

if testing:
    testing_date = datetime.datetime.today() + relativedelta(days=-14)
    # Rows that are finished and newer than the cut-off; computed once and
    # reused for both selecting and writing back.
    recent_finished = (match_df["status"] == "finished") & (match_df["date"] > testing_date)
    # Explicit copy of the selection so the column assignments below operate
    # on an independent frame.
    manip_df = match_df.loc[recent_finished].copy(deep=True)
    manip_df["status"] = "future"
    # np.nan instead of the alias np.NaN, which was removed in NumPy 2.0.
    manip_df[["goalsHome", "goalsAway", "winnerTeamId"]] = np.nan
    match_df[recent_finished] = manip_df

In [4]:
# Matches still marked "future" whose kick-off date is already in the past are
# re-fetched one by one and written back into match_df.
updatable_df = match_df[(match_df["status"] == "future") & (match_df["date"] < datetime.datetime.today())]
# iterrows() has no length, so pass the total explicitly for a real progress bar.
for _, row in tqdm(updatable_df.iterrows(), total=len(updatable_df)):
    row_match_data = openligadb.get_match_data(row["id"])
    # NOTE(review): the season download passes every API record through
    # extract_match_data() before it reaches the frame; here the response is
    # assigned as-is. Confirm get_match_data() already returns a row in the
    # frame's column layout, otherwise this should be
    # extract_match_data(row_match_data).
    match_df.loc[row.name] = row_match_data
if not updatable_df.empty:
    # Persist the refreshed table; the context manager closes the file handle.
    with open("match_df.pck", "wb") as cache_file:
        pickle.dump(match_df, cache_file)

9it [00:01,  8.89it/s]


In [5]:
# Number of fixtures a match day must have to be accepted as the next one
# (this was the bare literal 9 in the comparison below).
MATCHES_PER_MATCH_DAY = 9

# Walk through the future matches and stop at the first one whose match day
# (within its season) has exactly MATCHES_PER_MATCH_DAY rows in match_df.
# If no match day qualifies, the values of the last future match are kept.
next_match_day = 1
next_season = None
found_future_match = False
for _, row in match_df[match_df["status"] == "future"].iterrows():
    found_future_match = True
    next_match_day = row["matchDay"]
    next_season = row["season"]
    on_candidate_day = (match_df["matchDay"] == next_match_day) & (match_df["season"] == next_season)
    if on_candidate_day.sum() == MATCHES_PER_MATCH_DAY:
        break

if found_future_match:
    next_match_day_df = match_df[(match_df["matchDay"] == next_match_day) & (match_df["season"] == next_season)]
else:
    # Without any future match the loop variable `row` is never bound here
    # (or still holds a row from an earlier cell), so the season must not be
    # read from it; use an empty frame with the same columns instead.
    next_match_day_df = match_df.iloc[0:0]

In [6]:
# Keep only finished matches plus the upcoming match day; missing values in
# both parts are replaced by 0 before they are stacked (finished rows first).
next_match_day_df = next_match_day_df.fillna(0)
match_df = pd.concat(
    [
        match_df.loc[match_df["status"] == "finished"].fillna(0),
        next_match_day_df,
    ]
)

In [7]:
# Maps a team id to that team's per-match frame (filled in a later cell).
team_match_df_dict = {}

In [8]:
def get_team_match_df(teamId):
    """Build the match history of one team with rolling form features.

    Reads the global ``match_df`` and selects every row in which ``teamId`` is
    the home or the away team, ordered by ``date`` (original index labels are
    kept).

    Added columns:
        goalsTeam / goalsOpponent: goals from the team's point of view.
        teamPoints: 3 if ``winnerTeamId`` equals ``teamId``, 1 if
            ``winnerTeamId`` is 0, otherwise 0.
        avgScoredGoals{5,10}, avgGottenGoals{5,10}, avgTeamPoints{5,10}:
            rolling mean over the previous 5 / 10 rows, shifted by one row so
            a row's own values are excluded from its features.

    Args:
        teamId: team id as stored in ``teamHomeId`` / ``teamAwayId``.

    Returns:
        A new DataFrame (a copy; ``match_df`` is not modified).
    """
    involved = (match_df["teamHomeId"] == teamId) | (match_df["teamAwayId"] == teamId)
    # sort_values returns a new frame; the result has to be kept, otherwise
    # the rolling windows below follow the incidental row order of match_df.
    team_match_df = match_df[involved].copy(deep=True).sort_values(by="date")

    goalsTeam = []
    goalsOpponent = []
    teamPoints = []
    for _, row in team_match_df.iterrows():
        if row["teamHomeId"] == teamId:
            goalsTeam.append(row["goalsHome"])
            goalsOpponent.append(row["goalsAway"])
        else:
            goalsOpponent.append(row["goalsHome"])
            goalsTeam.append(row["goalsAway"])
        if row["winnerTeamId"] == teamId:
            teamPoints.append(3)
        elif row["winnerTeamId"] == 0:
            # A winner id of 0 is scored as one point for the team.
            teamPoints.append(1)
        else:
            teamPoints.append(0)
    team_match_df["goalsTeam"] = goalsTeam
    team_match_df["goalsOpponent"] = goalsOpponent
    team_match_df["teamPoints"] = teamPoints

    # Same column names and insertion order as writing the six lines out.
    feature_sources = (
        ("ScoredGoals", "goalsTeam"),
        ("GottenGoals", "goalsOpponent"),
        ("TeamPoints", "teamPoints"),
    )
    for feature_name, source_column in feature_sources:
        for window in (5, 10):
            team_match_df["avg" + feature_name + str(window)] = (
                team_match_df[source_column].rolling(window=window).mean().shift(1)
            )
    return team_match_df

In [9]:
# One history frame per team id that occurs in the teamHomeId column.
home_team_ids = set(match_df["teamHomeId"])
for teamId in tqdm(home_team_ids):
    team_match_df_dict.update({teamId: get_team_match_df(teamId)})

100%|██████████| 37/37 [00:00<00:00, 139.40it/s]


In [10]:
from api.transfermarkt import get_teams_market_values_threaded2
from collections import defaultdict

# Load the cached market values, or fetch them and write the cache.
# NOTE: pickle.load can execute arbitrary code; only load cache files that
# were written by this notebook.
try:
    with open("market_values_dict.pck", "rb") as cache_file:
        market_value_dict = pickle.load(cache_file)
except Exception:
    # Cache missing or unreadable: rebuild. `Exception` instead of a bare
    # `except` so KeyboardInterrupt / SystemExit are not swallowed.
    team_on_season_df = match_df.groupby(["teamHomeName", "season"]).size().reset_index(name='Freq')
    market_value_list = get_teams_market_values_threaded2(team_on_season_df)
    # Each entry is indexed as entry[0] -> outer key, entry[1] -> inner key,
    # entry[2] -> value.
    market_value_dict = defaultdict(lambda: defaultdict(float))
    for market_value in market_value_list:
        market_value_dict[market_value[0]][market_value[1]] = market_value[2]
    # The outer defaultdict carries a lambda factory, which pickle cannot
    # serialise; convert it to a plain dict before dumping.
    market_value_dict = dict(market_value_dict)
    with open("market_values_dict.pck", "wb") as cache_file:
        pickle.dump(market_value_dict, cache_file)

In [11]:
def get_market_value(team, season):
    """Look up market_value_dict[team][season] after normalising the team name.

    The substitutions below are applied to ``team`` in order before the
    lookup in the global ``market_value_dict``.
    """
    name_substitutions = (
        ("1. FC", "1.FC"),
        ("1. FSV", "1.FSV"),
        ("FC Kickers Würzburg", "Würzburger Kickers"),
    )
    for old_part, new_part in name_substitutions:
        team = team.replace(old_part, new_part)
    return market_value_dict[team][season]

In [12]:
from varname import  argname


def add_to_match_df(var):
    """Store ``var`` as a column of the global ``match_df``.

    The column name is whatever ``argname('var')`` reports for the argument
    at the call site, so callers should pass a plain variable.
    """
    global match_df
    column_name = argname('var')
    match_df[column_name] = var

In [13]:
# Order all matches by their date column (ascending, the default).
match_df = match_df.sort_values("date")

In [14]:
# Per-match feature lists, filled in match_df row order. The variable names
# matter: add_to_match_df() names each new column after the variable passed.
teamHomeValue, teamAwayValue = [], []

teamHomeAvgScoredGoals5, teamHomeAvgScoredGoals10 = [], []
teamHomeAvgGottenGoals5, teamHomeAvgGottenGoals10 = [], []
teamHomeAvgTeamPoints5, teamHomeAvgTeamPoints10 = [], []

teamAwayAvgScoredGoals5, teamAwayAvgScoredGoals10 = [], []
teamAwayAvgGottenGoals5, teamAwayAvgGottenGoals10 = [], []
teamAwayAvgTeamPoints5, teamAwayAvgTeamPoints10 = [], []
resultClass = []

for match_index, match_row in match_df.iterrows():
    teamHomeValue.append(get_market_value(match_row["teamHomeName"], match_row["season"]))
    teamAwayValue.append(get_market_value(match_row["teamAwayName"], match_row["season"]))

    # Rolling features are read from each team's own history frame, addressed
    # by the index label of the current match.
    home_history = team_match_df_dict[match_row["teamHomeId"]]
    teamHomeAvgScoredGoals5.append(home_history["avgScoredGoals5"].loc[match_index])
    teamHomeAvgScoredGoals10.append(home_history["avgScoredGoals10"].loc[match_index])
    teamHomeAvgGottenGoals5.append(home_history["avgGottenGoals5"].loc[match_index])
    teamHomeAvgGottenGoals10.append(home_history["avgGottenGoals10"].loc[match_index])
    teamHomeAvgTeamPoints5.append(home_history["avgTeamPoints5"].loc[match_index])
    teamHomeAvgTeamPoints10.append(home_history["avgTeamPoints10"].loc[match_index])

    away_history = team_match_df_dict[match_row["teamAwayId"]]
    teamAwayAvgScoredGoals5.append(away_history["avgScoredGoals5"].loc[match_index])
    teamAwayAvgScoredGoals10.append(away_history["avgScoredGoals10"].loc[match_index])
    teamAwayAvgGottenGoals5.append(away_history["avgGottenGoals5"].loc[match_index])
    teamAwayAvgGottenGoals10.append(away_history["avgGottenGoals10"].loc[match_index])
    teamAwayAvgTeamPoints5.append(away_history["avgTeamPoints5"].loc[match_index])
    teamAwayAvgTeamPoints10.append(away_history["avgTeamPoints10"].loc[match_index])

    # Class label of the form "<home goals>:<away goals>".
    resultClass.append("{}:{}".format(int(match_row["goalsHome"]), int(match_row["goalsAway"])))

add_to_match_df(teamHomeValue)
add_to_match_df(teamAwayValue)

add_to_match_df(teamHomeAvgScoredGoals5)
add_to_match_df(teamHomeAvgScoredGoals10)
add_to_match_df(teamHomeAvgGottenGoals5)
add_to_match_df(teamHomeAvgGottenGoals10)
add_to_match_df(teamHomeAvgTeamPoints5)
add_to_match_df(teamHomeAvgTeamPoints10)

add_to_match_df(teamAwayAvgScoredGoals5)
add_to_match_df(teamAwayAvgScoredGoals10)
add_to_match_df(teamAwayAvgGottenGoals5)
add_to_match_df(teamAwayAvgGottenGoals10)
add_to_match_df(teamAwayAvgTeamPoints5)
add_to_match_df(teamAwayAvgTeamPoints10)

add_to_match_df(resultClass)

In [15]:
# Discard every row that still contains a missing value (for instance rolling
# features that have no full window yet).
match_df = match_df.dropna()

In [16]:
# Rows of the upcoming match day, without identifier and result columns,
# written to next_matchday_df.pck.
next_match_day_df = (
    match_df[match_df["status"] == "future"]
    .copy(deep=True)
    .drop(["id", "winnerTeamId", "goalsHome", "goalsAway", "status"], axis=1)
)
# Context manager so the file is flushed and closed deterministically.
with open("next_matchday_df.pck", "wb") as out_file:
    pickle.dump(next_match_day_df, out_file)

In [17]:
# Finished matches only, reduced to the remaining feature/label columns,
# written to prepped_match_df.pck.
# drop() returns a new frame here instead of mutating a filtered selection
# in place.
match_df = match_df[match_df["status"] == "finished"].drop(
    ["id", "date", "teamHomeName", "teamAwayName", "winnerTeamId", "goalsHome", "goalsAway", "status"],
    axis=1,
)
# Context manager so the file is flushed and closed deterministically.
with open("prepped_match_df.pck", "wb") as out_file:
    pickle.dump(match_df, out_file)