In [1]:
from collections.abc import Callable, Mapping, Iterable, Sequence, Generator
from typing import Any, Optional

import numpy as np
import pandas as pd
import os

In [2]:
import tensorflow as tf, keras
from keras import layers
from keras.layers import StringLookup, IntegerLookup, Embedding, Normalization, Dense

from sklearn.model_selection import train_test_split

2024-08-18 20:25:00.172412: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-08-18 20:25:00.177150: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-08-18 20:25:00.191244: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-08-18 20:25:00.214051: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-08-18 20:25:00.220568: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-08-18 20:25:00.237539: I tensorflow/core/platform/cpu_feature_gu

In [3]:
path = os.path.normpath("practice_baseball_data.csv")

In [4]:
df = pd.read_csv(path)
def preprocess_data(df) -> pd.DataFrame:
    """Not really a part of practice. This is cleaning we would normally expect to be finished."""
    df = (
        df.rename(columns={"K%": "K", "BB%": "BB"})
        .assign(BB=lambda df: df['BB'].str.replace('%', ""), K=lambda df: df['K'].str.replace('%', ""))
        .astype({"BB": 'float', 'K': 'float'})
        .drop(columns=['xwOBA'])
    )
    df = df.merge(df.assign(Season=lambda df: df.Season + 1, label=lambda df: df.HR)[['Season', 'playerid','label']], on=['Season', 'playerid'], how='left').dropna()
    return df

df = preprocess_data(df)

In [5]:
df.dtypes

Season        int64
Name         object
Team         object
G             int64
PA            int64
HR            int64
R             int64
RBI           int64
SB            int64
BB          float64
K           float64
ISO         float64
BABIP       float64
AVG         float64
OBP         float64
SLG         float64
wOBA        float64
wRC+          int64
BsR         float64
Off         float64
Def         float64
WAR         float64
playerid      int64
label       float64
dtype: object

In [6]:
def split_data(df, seed=None, frac=None) -> tuple[pd.DataFrame, ...]:
    test_size = min(int(len(df) * .2), 5000)
    train, val = train_test_split(df, test_size=test_size, random_state=seed)
    print(len(train), len(val))
    return train, val


In [7]:
def make_dataset(df: pd.DataFrame, shuffle: bool = True, batch_size: int = 256) -> tf.data.Dataset:
    df = df.copy()
    labels = df.pop('label')
    df = {key: value.values[:, tf.newaxis] for key, value in df.items()}
    ds = tf.data.Dataset.from_tensor_slices((dict(df), labels))
    if shuffle:
        ds = ds.shuffle(buffer_size=len(df))
    ds = ds.batch(batch_size)
    ds = ds.prefetch(batch_size)
    return ds

In [8]:
def extract_feature_ds(ds: keras.Input, name) -> tuple:
    feature_ds = ds.map(lambda x, _: x[name])
    dtype = feature_ds.element_spec.dtype
    return feature_ds, dtype

def make_embedding(data: tf.data.Dataset, name) -> Callable[..., Embedding]:
    feature, dtype = extract_feature_ds(data, name)
    if dtype == tf.string:
        print(name)
        lookup = StringLookup()
    else:
        lookup = IntegerLookup()
    lookup.adapt(feature)
    n_tokens = len(lookup.get_vocabulary())
    output_dim = int(np.log(n_tokens))
    embedding = Embedding(n_tokens, max(3, output_dim))
    # return lookup
    return lambda x: embedding(lookup(x))

def make_normalization(data, name):
    feature, dtype = extract_feature_ds(data, name)
    encoder = Normalization()
    encoder.adapt(feature)
    return encoder

def make_inputs_and_encoded_features(data, cat_cols) -> tuple[list, dict]:
    all_inputs = []
    encoded_features = {}
    features = data.element_spec[0]
    for name in features:
        if name in ["Name", "Team"]:
            continue
        _input = keras.Input(shape=(1, ), name=name)
        if name in cat_cols:
            encoder = make_embedding(data, name)
        else:
            encoder = make_normalization(data, name)
        # print(name, encoder)
        encoded_input = encoder(_input)
        all_inputs.append(_input)
        encoded_features[name] = encoded_input
    return all_inputs, encoded_features


In [9]:
train, val = split_data(df, 42)
train_ds = make_dataset(train)
val_ds = make_dataset(val, shuffle=False)

16822 4205


2024-08-18 20:25:13.254021: E external/local_xla/xla/stream_executor/cuda/cuda_driver.cc:266] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected


In [10]:
cat_cols = [col for col in df.select_dtypes('object').columns] + ['Season', 'playerid']
cat_cols

['Name', 'Team', 'Season', 'playerid']

In [16]:
all_inputs, all_features = make_inputs_and_encoded_features(train_ds, cat_cols)


2024-08-18 20:27:41.355034: I tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


In [19]:

def model_topology(all_features, cat_cols):
    flat_categorical = [keras.layers.Flatten()(all_features[x]) for x in cat_cols if x in all_features]
    non_cat = [all_features[key] for key in all_features if key not in cat_cols]
    x = keras.layers.Concatenate()(flat_categorical + non_cat)
    x = Dense(8, activation='relu')(x)
    x = Dense(1, activation='sigmoid')(x)
    return x

model = keras.Model(all_inputs, model_topology(all_features, cat_cols))


In [23]:
all_inputs

[<KerasTensor shape=(None, 1), dtype=float32, sparse=False, name=Season>,
 <KerasTensor shape=(None, 1), dtype=float32, sparse=False, name=G>,
 <KerasTensor shape=(None, 1), dtype=float32, sparse=False, name=PA>,
 <KerasTensor shape=(None, 1), dtype=float32, sparse=False, name=HR>,
 <KerasTensor shape=(None, 1), dtype=float32, sparse=False, name=R>,
 <KerasTensor shape=(None, 1), dtype=float32, sparse=False, name=RBI>,
 <KerasTensor shape=(None, 1), dtype=float32, sparse=False, name=SB>,
 <KerasTensor shape=(None, 1), dtype=float32, sparse=False, name=BB>,
 <KerasTensor shape=(None, 1), dtype=float32, sparse=False, name=K>,
 <KerasTensor shape=(None, 1), dtype=float32, sparse=False, name=ISO>,
 <KerasTensor shape=(None, 1), dtype=float32, sparse=False, name=BABIP>,
 <KerasTensor shape=(None, 1), dtype=float32, sparse=False, name=AVG>,
 <KerasTensor shape=(None, 1), dtype=float32, sparse=False, name=OBP>,
 <KerasTensor shape=(None, 1), dtype=float32, sparse=False, name=SLG>,
 <KerasTens

In [20]:
model.summary()

In [21]:
model.compile('adam', keras.losses.MeanSquaredError())

In [22]:
model.fit(train_ds, epochs = 3)

Epoch 1/3


2024-08-18 20:33:10.985464: W tensorflow/core/framework/op_kernel.cc:1817] OP_REQUIRES failed at cast_op.cc:122 : UNIMPLEMENTED: Cast string to float is not supported


UnimplementedError: Graph execution error:

Detected at node functional_1_1/Cast_9 defined at (most recent call last):
  File "<frozen runpy>", line 198, in _run_module_as_main

  File "<frozen runpy>", line 88, in _run_code

  File "/home/lenhart/Repos/learning/.venv/lib/python3.11/site-packages/ipykernel_launcher.py", line 18, in <module>

  File "/home/lenhart/Repos/learning/.venv/lib/python3.11/site-packages/traitlets/config/application.py", line 1075, in launch_instance

  File "/home/lenhart/Repos/learning/.venv/lib/python3.11/site-packages/ipykernel/kernelapp.py", line 739, in start

  File "/home/lenhart/Repos/learning/.venv/lib/python3.11/site-packages/tornado/platform/asyncio.py", line 205, in start

  File "/usr/lib/python3.11/asyncio/base_events.py", line 608, in run_forever

  File "/usr/lib/python3.11/asyncio/base_events.py", line 1936, in _run_once

  File "/usr/lib/python3.11/asyncio/events.py", line 84, in _run

  File "/home/lenhart/Repos/learning/.venv/lib/python3.11/site-packages/ipykernel/kernelbase.py", line 545, in dispatch_queue

  File "/home/lenhart/Repos/learning/.venv/lib/python3.11/site-packages/ipykernel/kernelbase.py", line 534, in process_one

  File "/home/lenhart/Repos/learning/.venv/lib/python3.11/site-packages/ipykernel/kernelbase.py", line 437, in dispatch_shell

  File "/home/lenhart/Repos/learning/.venv/lib/python3.11/site-packages/ipykernel/ipkernel.py", line 362, in execute_request

  File "/home/lenhart/Repos/learning/.venv/lib/python3.11/site-packages/ipykernel/kernelbase.py", line 778, in execute_request

  File "/home/lenhart/Repos/learning/.venv/lib/python3.11/site-packages/ipykernel/ipkernel.py", line 449, in do_execute

  File "/home/lenhart/Repos/learning/.venv/lib/python3.11/site-packages/ipykernel/zmqshell.py", line 549, in run_cell

  File "/home/lenhart/Repos/learning/.venv/lib/python3.11/site-packages/IPython/core/interactiveshell.py", line 3075, in run_cell

  File "/home/lenhart/Repos/learning/.venv/lib/python3.11/site-packages/IPython/core/interactiveshell.py", line 3130, in _run_cell

  File "/home/lenhart/Repos/learning/.venv/lib/python3.11/site-packages/IPython/core/async_helpers.py", line 128, in _pseudo_sync_runner

  File "/home/lenhart/Repos/learning/.venv/lib/python3.11/site-packages/IPython/core/interactiveshell.py", line 3334, in run_cell_async

  File "/home/lenhart/Repos/learning/.venv/lib/python3.11/site-packages/IPython/core/interactiveshell.py", line 3517, in run_ast_nodes

  File "/home/lenhart/Repos/learning/.venv/lib/python3.11/site-packages/IPython/core/interactiveshell.py", line 3577, in run_code

  File "/tmp/ipykernel_28656/722438006.py", line 1, in <module>

  File "/home/lenhart/Repos/learning/.venv/lib/python3.11/site-packages/keras/src/utils/traceback_utils.py", line 117, in error_handler

  File "/home/lenhart/Repos/learning/.venv/lib/python3.11/site-packages/keras/src/backend/tensorflow/trainer.py", line 320, in fit

  File "/home/lenhart/Repos/learning/.venv/lib/python3.11/site-packages/keras/src/backend/tensorflow/trainer.py", line 121, in one_step_on_iterator

  File "/home/lenhart/Repos/learning/.venv/lib/python3.11/site-packages/keras/src/backend/tensorflow/trainer.py", line 108, in one_step_on_data

  File "/home/lenhart/Repos/learning/.venv/lib/python3.11/site-packages/keras/src/backend/tensorflow/trainer.py", line 51, in train_step

  File "/home/lenhart/Repos/learning/.venv/lib/python3.11/site-packages/keras/src/utils/traceback_utils.py", line 117, in error_handler

  File "/home/lenhart/Repos/learning/.venv/lib/python3.11/site-packages/keras/src/layers/layer.py", line 901, in __call__

  File "/home/lenhart/Repos/learning/.venv/lib/python3.11/site-packages/keras/src/utils/traceback_utils.py", line 117, in error_handler

  File "/home/lenhart/Repos/learning/.venv/lib/python3.11/site-packages/keras/src/ops/operation.py", line 46, in __call__

  File "/home/lenhart/Repos/learning/.venv/lib/python3.11/site-packages/keras/src/utils/traceback_utils.py", line 156, in error_handler

  File "/home/lenhart/Repos/learning/.venv/lib/python3.11/site-packages/keras/src/models/functional.py", line 167, in call

  File "/home/lenhart/Repos/learning/.venv/lib/python3.11/site-packages/keras/src/models/functional.py", line 258, in _standardize_inputs

  File "/home/lenhart/Repos/learning/.venv/lib/python3.11/site-packages/keras/src/models/functional.py", line 218, in _convert_inputs_to_tensors

  File "/home/lenhart/Repos/learning/.venv/lib/python3.11/site-packages/keras/src/ops/core.py", line 822, in convert_to_tensor

  File "/home/lenhart/Repos/learning/.venv/lib/python3.11/site-packages/keras/src/backend/tensorflow/core.py", line 132, in convert_to_tensor

Cast string to float is not supported
	 [[{{node functional_1_1/Cast_9}}]] [Op:__inference_one_step_on_iterator_78769]

In [None]:
make_embedding(train_ds, 'Name')

#### OLD

In [None]:
def make_inputs(df: pd.DataFrame, as_cat=set()) -> dict:
    encoders = {}
    for col in df:

        name=f"{col}_encoder"
        print(name)
        if df[col].dtype == 'object' or col in as_cat:
            # print(df[col])
            print(name)
            if isinstance(df[col][0], str):
                lookup_layer = keras.layers.StringLookup
                print(col, " is string type.")
            else:
                print(col, "is int type.")
                lookup_layer = keras.layers.IntegerLookup
            values =  df[col].unique().tolist()
            lookup = lookup_layer(vocabulary=values)
            embedding = keras.layers.Embedding(len(values), int(np.log(len(values))))
            # FIXME: just pass an input to the damned sequence.
            # and flatten
            # print(lookup)
            flatten = layers.Flatten()
            encoder = lambda x: flatten(embedding(lookup_layer(x), name=f"{col}_embedding"))
            encoders[col] = encoder
        else:
            # print(col, " is numeric.")
            encoders[col] = keras.layers.Normalization(name=name)
        # encoders[col] = map_to_encoder(args)
    return encoders

def make_inputs(df, as_cat=set()) -> dict:
    encoders = make_encoders(df, as_cat)
    return {k: encoder(keras.Input(shape=(1,), name=k)) for k, encoder in encoders.items()}


In [None]:
from sklearn.model_selection import train_test_split

def prepare_data(df: pd.DataFrame) -> tuple[pd.DataFrame, ...]:
    df = df.copy()
    test_size = int(min(len(df) / 5, 5_000))
    target = df.pop('target')
    split_data = train_test_split(df, target, test_size=test_size)
    return [dict(df) for df in split_data]

x_train, x_val, y_train, y_val = prepare_data(df.drop(columns='Name'))

In [None]:
StringLookup(vocabulary=['SEA'])(['se', 'SEA'])

In [None]:
make_encoders(x_train)

In [None]:
inputs = make_inputs(x_train, as_cat=['playerid', 'Season'])
inputs

In [None]:
list(inputs.values())

In [None]:
x = layers.Concatenate()(list(inputs.values()))
out = Dense(1, activation='sigmoid')(x)
model = keras.Model(inputs, out)

In [None]:
x_train.values

In [None]:
model.fit(x_train.values, y_train.values)

In [None]:
for i in range(12):
    print(f"    {i}: ( ),")