In [3]:
import logging
import os
import pickle
import sys
import time
from pathlib import Path
from statistics import mean

import numpy as np
import polars as pl
import yaml
from sklearn.calibration import LabelEncoder
from sklearn.preprocessing import MinMaxScaler


In [4]:
from pathlib import Path

import numpy as np
import tensorflow as tf
import yaml

from dvclive import Live
from dvclive.keras import DVCLiveCallback

# Hyper-parameters are read from params.yaml, one directory above the notebook.
# The path gets its own name so `params` only ever refers to the parsed mapping.
PARAMS_PATH = Path("../params.yaml")
with open(PARAMS_PATH, "r", encoding="utf-8") as file:
    params = yaml.safe_load(file)

# Everything the model/training helpers below need lives under the "train" key.
_train_params = params["train"]
DROPOUT = _train_params["dropout"]
PATIENCE = _train_params["patience"]
KERN_REG = _train_params["kernel_regularizer"]
BATCH_NORMALIZATION = _train_params["batch_normalization"]
BATCH_SIZE = _train_params["batch_size"]
EPOCHS = _train_params["epochs"]
LSTM_UNITS = _train_params["lstm_units"]


def compile_and_fit(model, X_train, y_train, X_val, y_val, pitcher_name):
    """Compile ``model`` with Adam and train it against a validation split.

    The model is compiled with sparse categorical cross-entropy as the loss
    and sparse categorical accuracy as the only metric. Training runs for at
    most ``EPOCHS`` epochs in batches of ``BATCH_SIZE`` with three callbacks:
    early stopping on validation accuracy (restoring the best weights), a
    learning-rate reduction when validation loss plateaus, and DVCLive logging
    into ``dvclive/<pitcher_name>_logs``.

    Args:
        model: Object exposing Keras-style ``compile`` and ``fit``.
        X_train: Training inputs, forwarded to ``model.fit``.
        y_train: Training labels, forwarded to ``model.fit``.
        X_val: Validation inputs.
        y_val: Validation labels.
        pitcher_name: Used to build the DVCLive log directory name.

    Returns:
        Whatever ``model.fit`` returns (the training history).
    """
    model.compile(
        optimizer=tf.keras.optimizers.Adam(
            learning_rate=1e-4,  # step size of each weight update
            clipnorm=1.0,  # cap on the gradient norm (guards against exploding gradients)
            weight_decay=1e-4,  # weight-decay strength handed to the optimizer
        ),
        loss="sparse_categorical_crossentropy",
        metrics=["sparse_categorical_accuracy"],
    )

    # Stop once validation accuracy has not improved for PATIENCE epochs.
    early_stopping = tf.keras.callbacks.EarlyStopping(
        monitor="val_sparse_categorical_accuracy",
        patience=PATIENCE,
        restore_best_weights=True,
    )
    # Multiply the learning rate by 0.2 (never below 1e-6) when val_loss stalls.
    lr_on_plateau = tf.keras.callbacks.ReduceLROnPlateau(
        monitor="val_loss",
        factor=0.2,
        patience=PATIENCE,
        min_lr=1e-6,
    )
    # One DVCLive run directory per pitcher.
    dvc_tracking = DVCLiveCallback(live=Live(f"dvclive/{pitcher_name}_logs"))

    return model.fit(
        X_train,
        y_train,
        validation_data=(X_val, y_val),
        epochs=EPOCHS,
        batch_size=BATCH_SIZE,
        callbacks=[early_stopping, lr_on_plateau, dvc_tracking],
    )


def create_model(input_shape, num_classes):
    """Build the (uncompiled) stacked-LSTM classifier.

    The network is an input layer, three LSTM layers of ``LSTM_UNITS`` units
    each (the first two return full sequences, the last one does not), an
    optional ``BatchNormalization`` after every LSTM when
    ``BATCH_NORMALIZATION`` is truthy, and a softmax ``Dense`` head.

    Args:
        input_shape: Shape passed to ``InputLayer(shape=...)``.
        num_classes: Number of units in the softmax output layer.

    Returns:
        The assembled ``tf.keras.models.Sequential`` instance.
    """
    network = tf.keras.models.Sequential()
    network.add(tf.keras.layers.InputLayer(shape=input_shape))

    # Only the final LSTM collapses the time axis before the Dense head.
    for emits_sequence in (True, True, False):
        network.add(
            tf.keras.layers.LSTM(
                LSTM_UNITS,
                return_sequences=emits_sequence,
                dropout=DROPOUT,
                kernel_regularizer=tf.keras.regularizers.l2(KERN_REG),
            )
        )
        if BATCH_NORMALIZATION:
            network.add(tf.keras.layers.BatchNormalization())

    network.add(tf.keras.layers.Dense(num_classes, activation="softmax"))
    return network


def calculate_class_weights(y, num_classes=None):
    """Return inverse-frequency class weights for integer labels ``y``.

    Each class ``i`` gets ``(1 / n_classes) / proportion_i``, where
    ``proportion_i`` is the share of samples labelled ``i`` and ``n_classes``
    is the number of classes counted. A class with no samples has its
    proportion floored at ``1e-6`` so the division stays finite.

    Args:
        y: 1-D sequence/array of non-negative integer class labels.
        num_classes: Optional total number of classes. When given, classes
            ``max(y) + 1 .. num_classes - 1`` that never occur in ``y`` still
            get an entry (with the floored proportion). When omitted, the
            classes counted are ``0 .. max(y)``.

    Returns:
        dict mapping class index (int) to weight (float). Empty when ``y``
        is empty, since there is nothing to weight.
    """
    labels = np.asarray(y)
    if labels.size == 0:
        # No samples at all: avoid the 1 / 0 the arithmetic below would hit.
        return {}

    counts = np.bincount(labels, minlength=num_classes or 0)
    proportions = counts / labels.size
    # Floor absent classes so their weight is large but finite.
    proportions[proportions == 0] = 1e-6

    weights = (1 / len(proportions)) / proportions
    return dict(enumerate(weights.tolist()))


In [5]:
# Input parquet location comes from params.yaml and is resolved relative to
# the directory above the notebook.
parquet_path = Path("../" + params["train"]["input_data_path"])
df = pl.read_parquet(parquet_path)

In [6]:
# Notebook-wide logger. `logging.getLogger` returns the same object for the
# same name, so re-running this cell used to stack one more StreamHandler per
# run and every record was then printed once per handler. Handlers left over
# from an earlier run are detached before the fresh one is attached.
logger = logging.getLogger("choo choo")
logger.setLevel(logging.INFO)
for stale_handler in list(logger.handlers):
    logger.removeHandler(stale_handler)

handler = logging.StreamHandler()
handler.setFormatter(
    logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")
)
logger.addHandler(handler)

In [None]:
def training_loop(df, params):
    """Iterate over the per-pitcher groups of ``df`` and return the results dict.

    Args:
        df: Frame exposing ``group_by("pitcher")`` and ``head()``
            (presumably the polars DataFrame loaded earlier -- confirm).
        params: Parsed params mapping; ``params["train"]["features_path"]``
            is read as a path relative to the parent directory.

    Returns:
        dict: ``pitcher_data``. Nothing in the loop fills it yet, so it is
        currently always empty; it is returned so the caller's assignment
        receives the dict instead of ``None``.
    """
    pitcher_data = {}

    # One feature name per line of the features file.
    # NOTE(review): `features` is loaded but not consumed yet.
    features_path = Path("../" + params["train"]["features_path"])
    with open(features_path, "r") as f:
        features = [line.strip() for line in f]

    # Iterating a group_by yields (group key, group frame) pairs.
    for pitcher_code, pitcher_df in df.group_by("pitcher"):
        # NOTE(review): this prints the head of the full frame once per
        # pitcher, not the group's own rows -- looks like a debugging
        # placeholder; confirm the intended per-pitcher work.
        print(df.head())

    return pitcher_data

In [22]:
# Run the per-pitcher loop on the loaded frame and keep whatever it returns.
pitcher_data = training_loop(df, params)

shape: (5, 63)
┌────────────┬──────────────┬───────────────┬───────────────┬───┬───────┬───────┬───────┬───────┐
│ pitch_type ┆ game_date    ┆ release_speed ┆ release_pos_x ┆ … ┆ BA    ┆ OBP   ┆ SLG   ┆ OPS   │
│ ---        ┆ ---          ┆ ---           ┆ ---           ┆   ┆ ---   ┆ ---   ┆ ---   ┆ ---   │
│ i64        ┆ datetime[μs] ┆ f64           ┆ f64           ┆   ┆ f64   ┆ f64   ┆ f64   ┆ f64   │
╞════════════╪══════════════╪═══════════════╪═══════════════╪═══╪═══════╪═══════╪═══════╪═══════╡
│ 5          ┆ 2024-09-26   ┆ 87.6          ┆ -0.22         ┆ … ┆ 0.244 ┆ 0.303 ┆ 0.409 ┆ 0.713 │
│            ┆ 00:00:00     ┆               ┆               ┆   ┆       ┆       ┆       ┆       │
│ 2          ┆ 2024-09-26   ┆ 79.4          ┆ -0.16         ┆ … ┆ 0.169 ┆ 0.242 ┆ 0.228 ┆ 0.469 │
│            ┆ 00:00:00     ┆               ┆               ┆   ┆       ┆       ┆       ┆       │
│ 3          ┆ 2024-09-26   ┆ 70.7          ┆ -0.04         ┆ … ┆ 0.169 ┆ 0.242 ┆ 0.228 ┆ 0.469 │
│    