# Feature Engineering

This notebook enriches each match with information about both teams, for example the points each team earned in the previous season.
In addition, the data is split into a training set (all seasons before 2023/24), a validation set (the 2023/24 season) and the current season (2024/25). A combined set of all seasons before 2024/25 is saved as well.

In [1]:
import pandas as pd

In [2]:
# Load the two input tables. Further down, `df_teams` is queried through its
# `team` and `year` columns and `df_matches` is re-indexed by `id`.
# NOTE(review): pickles execute code on load - only read files created by
# this project's own notebooks.
df_teams, df_matches = (
    pd.read_pickle(path)
    for path in ("data/df_teams.pickle", "data/df_matches.pickle")
)

### Create horizon

In [3]:
# Number of preceding seasons that are averaged into the "horizon" features.
HORIZON = 3

# Per-season team statistics that are averaged over the horizon window.
historic_columns = ["points", "goals", "opponent_goals", "goal_diff", "wins", "draws", "defeats"]
# Every season between the first and the last one present in `df_teams`.
test_years = range(df_teams["year"].min(), df_teams["year"].max() + 1)

historic_mean = []
# Iterate over the values of the `team` column - that is what the filter below
# compares against. Iterating over the frame's index labels only works when a
# label happens to equal a team id, and silently skips every other team.
# Sorting keeps the output order deterministic.
for team in sorted(df_teams["team"].unique().tolist()):
    team_data = df_teams[df_teams["team"] == team]
    years = set(team_data["year"].tolist())

    for test_year in test_years:
        # Seasons strictly before `test_year`, at most HORIZON back, for which
        # this team actually has data.
        horizon_years = [y for y in range(test_year - HORIZON, test_year) if y in years]
        if not horizon_years:
            # No usable history: no record is written for this team/season.
            continue

        # Build the mask from `team_data` itself so it shares its index.
        in_horizon = team_data["year"].isin(horizon_years)
        mean_values = team_data.loc[in_horizon, historic_columns].mean()

        record = {"team": team, "year": test_year, "horizon_years": horizon_years}
        record.update({f"horizon_{col}": mean_values[col] for col in historic_columns})
        historic_mean.append(record)


df_teams_historic = pd.DataFrame.from_records(historic_mean)
df_teams_historic.head(5)

Unnamed: 0,team,year,horizon_years,horizon_points,horizon_goals,horizon_opponent_goals,horizon_goal_diff,horizon_wins,horizon_draws,horizon_defeats
0,6,2011,[2010],68.0,64.0,44.0,20.0,20.0,8.0,6.0
1,6,2012,"[2010, 2011]",61.0,58.0,44.0,14.0,17.5,8.5,8.0
2,6,2013,"[2010, 2011, 2012]",62.333333,60.333333,42.333333,18.0,18.0,8.333333,7.666667
3,6,2014,"[2011, 2012, 2013]",60.0,59.0,41.333333,17.666667,17.666667,7.0,9.333333
4,6,2015,"[2012, 2013, 2014]",62.333333,62.333333,39.0,23.333333,18.333333,7.333333,8.333333


### Enrich matches with both teams' horizons

In [4]:
def _attach_team_features(matches, team_features, prefix, team_id_column, extra_drop=()):
    """Left-join per-team, per-season features onto the match table.

    Parameters
    ----------
    matches : pd.DataFrame
        Match table with a ``season`` column and the column named by
        ``team_id_column``.
    team_features : pd.DataFrame
        Feature table with ``team`` and ``year`` key columns; it must hold at
        most one row per (team, year), which the merge validates (``m:1``).
    prefix : str
        Prefix put in front of every feature column, e.g. ``"host_"``.
    team_id_column : str
        Column of ``matches`` that holds the team id to join on.
    extra_drop : iterable of str, optional
        Un-prefixed feature columns to discard after the merge.

    Returns
    -------
    pd.DataFrame
        ``matches`` with the prefixed feature columns appended; the prefixed
        join keys are removed again.
    """
    prefixed = team_features.add_prefix(prefix)
    key_columns = [f"{prefix}team", f"{prefix}year"]
    merged = pd.merge(
        left=matches,
        left_on=[team_id_column, "season"],
        right=prefixed,
        right_on=key_columns,
        how="left",
        validate="m:1",
    )
    return merged.drop(columns=key_columns + [f"{prefix}{col}" for col in extra_drop])


# Shifting `year` by one makes the statistics of season N join onto the matches
# of season N + 1, i.e. they become the "last season" features of that match.
# The old index is discarded instead of being carried along as a column.
last_season_stats = df_teams.reset_index(drop=True).assign(year=lambda stats: stats["year"] + 1)

df_full = df_matches.reset_index()

# Last-season features first, then horizon features; host before guest each
# time, which fixes the order of the resulting columns.
for side in ("host", "guest"):
    df_full = _attach_team_features(
        df_full, last_season_stats, prefix=f"{side}_last_season_", team_id_column=f"{side}_id"
    )
for side in ("host", "guest"):
    df_full = _attach_team_features(
        df_full,
        df_teams_historic,
        prefix=f"{side}_",
        team_id_column=f"{side}_id",
        extra_drop=["horizon_years"],  # list-valued helper column, not a feature
    )

df_full = df_full.set_index("id")
df_full.tail(5)

Unnamed: 0_level_0,match_day,season,host_id,host_name,guest_id,guest_name,host_goals,guest_goals,host_last_season_draws,host_last_season_goal_diff,...,host_horizon_wins,host_horizon_draws,host_horizon_defeats,guest_horizon_points,guest_horizon_goals,guest_horizon_opponent_goals,guest_horizon_goal_diff,guest_horizon_wins,guest_horizon_draws,guest_horizon_defeats
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
72515,34,2024,112,Freiburg,91,Frankfurt,,,9.0,-13.0,...,14.333333,9.0,10.666667,46.333333,51.333333,50.333333,1.0,11.333333,12.333333,10.333333
72516,34,2024,95,Augsburg,80,Union Berlin,,,9.0,-10.0,...,9.666667,8.0,16.333333,50.666667,44.666667,46.666667,-2.0,14.333333,7.666667,12.0
72517,34,2024,81,Mainz,6,Leverkusen,,,14.0,-12.0,...,10.666667,10.333333,13.0,68.0,75.333333,40.0,35.333333,20.333333,7.0,6.666667
72518,34,2024,87,Gladbach,131,Wolfsburg,,,13.0,-11.0,...,10.0,10.666667,13.333333,42.666667,47.0,52.666667,-5.666667,11.666667,7.666667,14.666667
72519,34,2024,98,St. Pauli,129,Bochum,,,,,...,,,,36.666667,40.0,66.0,-26.0,9.666667,7.666667,16.666667


## Imputation

In [5]:
# Match result columns. They are handled separately from the team features.
TARGET_COLUMNS = ["host_goals", "guest_goals"]

# Teams without usable history have NaN features after the left merges. They
# are imputed pessimistically. Rules are (substring of the column name,
# strategy); the first matching rule wins, so the more specific
# "opponent_goals" must come before "goals".
IMPUTATION_RULES = [
    # Goals conceded: the worst case is the column maximum. Matching this
    # column through the plain "goals" rule would impute 0 goals conceded,
    # i.e. the best possible defence.
    ("opponent_goals", "max"),
    ("goals", "zero"),
    ("wins", "zero"),
    ("points", "zero"),
    ("matches", "zero"),
    ("goal_diff", "min"),
    ("draws", "min"),
    ("defeats", "max"),
]

for col in df_full.columns:
    if col in TARGET_COLUMNS:
        continue
    strategy = next((how for pattern, how in IMPUTATION_RULES if pattern in col), None)
    if strategy is None:
        # Column is not covered by any rule (ids, names, season, ...).
        continue
    if strategy == "zero":
        fill_value = 0
    elif strategy == "min":
        fill_value = df_full[col].min()
    else:
        fill_value = df_full[col].max()
    df_full[col] = df_full[col].fillna(fill_value)

# Matches without a result get 0 goals for both sides so that the columns can
# be cast to int.
# NOTE(review): these zeros are placeholders, not real results - downstream
# code must not use them as labels for matches that have not been played.
df_full[TARGET_COLUMNS] = df_full[TARGET_COLUMNS].fillna(0).astype(int)

## Create _difference_ features

In [6]:
# Every host-side column except the id, the name and the match result has a
# guest-side counterpart; strip the side marker to get the shared feature name.
not_features = ["id", "name", "goals"]
cols = [
    stripped
    for stripped in (name.replace("host_", "") for name in df_full.columns if name.startswith("host"))
    if stripped not in not_features
]

# diff_<feature> = host value minus guest value, appended in column order.
df_full = df_full.assign(
    **{"diff_" + feature: df_full["host_" + feature] - df_full["guest_" + feature] for feature in cols}
)

## Splitting

In [7]:
# Each output file receives the rows selected by a condition on `season`.
season = df_full["season"]
season_splits = {
    "data/matches_train.pickle": season < 2023,
    "data/matches_test.pickle": season == 2023,
    "data/matches_train_all.pickle": season < 2024,
    "data/matches_future.pickle": season == 2024,
}

for target_path, selected_rows in season_splits.items():
    df_full[selected_rows].to_pickle(target_path)