In [1]:
import polars as pl
import os
import numpy as np

# Input / output locations, relative to the working directory
FOLDER = "../raw_data/"
SAVE_FOLDER = "../processed_data/"

# Weekly tracking CSVs to read: tracking_week_1.csv .. tracking_week_<N_TRACKING_FILES>.csv
N_TRACKING_FILES = 2
TRACKING_FILES = [
    FOLDER + f"tracking_week_{week}.csv"
    for week in range(1, N_TRACKING_FILES + 1)
]

# Data split ratios
TRAIN, TEST, VAL = .7, .2, .1

# Create the output directory up front; a no-op when it already exists
os.makedirs(SAVE_FOLDER, exist_ok=True)

# Read just the gameId / playId columns of each tracking file and collapse
# them to distinct pairs per file, so the frames being concatenated stay small.
plays_per_file = [
    pl.read_csv(path, columns=["gameId", "playId"]).unique()
    for path in TRACKING_FILES
]

# De-duplicate again after concatenation -> 1 row per (gameId, playId)
tracking_minimal = pl.concat(plays_per_file).unique()


# Split at the game level, so every play of a game lands in the same partition.
# `unique()` does not guarantee row order, so the IDs are sorted before the
# seeded shuffle; otherwise the shuffle would start from a different ordering
# from run to run and the split would not be reproducible.
unique_game_ids = (
    tracking_minimal.select("gameId").unique().sort("gameId").to_series().to_list()
)
np.random.seed(42)  # reproducible split
np.random.shuffle(unique_game_ids)

# The three ratios are expected to describe the whole data set.
assert abs(TRAIN + VAL + TEST - 1.0) < 1e-9, "TRAIN + VAL + TEST must sum to 1"

# Partition the shuffled IDs in train / val / test order:
#   train -> first TRAIN share, val -> next VAL share, test -> everything left.
# The val boundary is built from train_end rather than from TRAIN + VAL because
# summing the float ratios (0.7 + 0.1 == 0.7999999999999999) can truncate the
# boundary one game short.
n = len(unique_game_ids)
train_end = int(n * TRAIN)
val_end = train_end + int(n * VAL)

train_ids = set(unique_game_ids[:train_end])
val_ids = set(unique_game_ids[train_end:val_end])
test_ids = set(unique_game_ids[val_end:])

# Build the label expression: games in train_ids -> "train", games in
# val_ids -> "val", every remaining game -> "test".
game_id = pl.col("gameId")
split_label = (
    pl.when(game_id.is_in(train_ids))
    .then(pl.lit("train"))
    .when(game_id.is_in(val_ids))
    .then(pl.lit("val"))
    .otherwise(pl.lit("test"))
)

# Attach the label to every (gameId, playId) row as the "split" column
tracking_minimal = tracking_minimal.with_columns(split_label.alias("split"))

# Write the labelled (gameId, playId) table to the output folder as parquet
split_path = SAVE_FOLDER + "train_test_val.parquet"
tracking_minimal.write_parquet(split_path)
print("Saved minimal tracking data with split column.")


Saved minimal tracking data with split column.
