In [29]:
from collections.abc import Callable, Mapping, Iterable, Sequence, Generator
from typing import Any, Optional

import numpy as np
import pandas as pd
import os

In [30]:
import tensorflow as tf, keras
from keras import layers
from keras.layers import StringLookup, IntegerLookup, Embedding, Normalization, Dense

from sklearn.model_selection import train_test_split

In [31]:
path = os.path.normpath("C:\\Users\lenha\Dropbox\\baseball\\all_seasons_hitters.csv")

In [32]:
df = pd.read_csv(path)
df.to_csv("practice_baseball_data.csv")
def preprocess_data(df) -> pd.DataFrame:
    """Not really a part of practice. This is cleaning we would normally expect to be finished."""
    df = (
        df.rename(columns={"K%": "K", "BB%": "BB"})
        .assign(BB=lambda df: df['BB'].str.replace('%', ""), K=lambda df: df['K'].str.replace('%', ""))
        .astype({"BB": 'float', 'K': 'float'})
        .drop(columns=['xwOBA'])
    )
    df = df.merge(df.assign(Season=lambda df: df.Season + 1, label=lambda df: df.HR)[['Season', 'playerid','label']], on=['Season', 'playerid'], how='left').dropna()
    return df

df = preprocess_data(df)

In [33]:
def split_data(df, seed=None, frac=None) -> tuple[pd.DataFrame, ...]:
    test_size = min(int(len(df) * .2), 5000)
    train, val = train_test_split(df, test_size=test_size, random_state=seed)
    print(len(train), len(val))
    return train, val


In [34]:
def make_dataset(df: pd.DataFrame, shuffle: bool = True, batch_size: int = 256) -> tf.data.Dataset:
    df = df.copy()
    labels = df.pop('label')
    df = {key: value.values[:, tf.newaxis] for key, value in df.items()}
    ds = tf.data.Dataset.from_tensor_slices((dict(df), labels))
    if shuffle:
        ds = ds.shuffle(buffer_size=len(df))
    ds = ds.batch(batch_size)
    ds = ds.prefetch(batch_size)
    return ds

In [40]:
def extract_feature_ds(ds: tf.data.Dataset, name) -> tuple:
    feature_ds = ds.map(lambda x, _: x[name])
    dtype = feature_ds.element_spec.dtype
    return feature_ds, dtype

def make_embedding(data: tf.data.Dataset, name) -> Callable[..., Embedding]:
    feature, dtype = extract_feature_ds(data, name)
    if dtype == tf.string:
        print(name)
        lookup = StringLookup()
    else:
        lookup = IntegerLookup()
    lookup.adapt(feature)
    n_tokens = len(lookup.get_vocabulary())
    # return lookup
    return lambda x: Embedding(n_tokens, max(3, int(np.log(n_tokens))))(lookup(x))

def make_normalization(data, name):
    feature, dtype = extract_feature_ds(data, name)
    encoder = Normalization()
    encoder.adapt(feature)
    return encoder

def make_inputs_and_encoded_features(data, cat_cols) -> tuple[list, dict]:
    all_inputs = []
    encoded_features = {}
    features = data.element_spec[0]
    for name in features:
        _input = keras.Input(shape=(1, ), name=name)
        if name in cat_cols:
            encoder = make_embedding(data, name)
        else:
            encoder = make_normalization(data, name)
        print(name, encoder)
        encoded_input = encoder(_input)
        all_inputs.append(_input)
        encoded_features[name] = encoded_input
    return all_inputs, encoded_features


In [41]:
train, val = split_data(df, 42)
train_ds = make_dataset(train)
val_ds = make_dataset(val, shuffle=False)

16822 4205


In [42]:
cat_cols =[col for col in df.select_dtypes('object').columns] + ['Season', 'playerid']
cat_cols

['Name', 'Team', 'Season', 'playerid']

In [43]:
all_inputs, all_features = make_inputs_and_encoded_features(train_ds, cat_cols)


Season <function make_embedding.<locals>.<lambda> at 0x000002998698BC40>
Name
Name <function make_embedding.<locals>.<lambda> at 0x000002998394CD60>
Team
Team <function make_embedding.<locals>.<lambda> at 0x000002998394E3E0>
G <Normalization name=normalization_57, built=True>
PA <Normalization name=normalization_58, built=True>
HR <Normalization name=normalization_59, built=True>
R <Normalization name=normalization_60, built=True>
RBI <Normalization name=normalization_61, built=True>
SB <Normalization name=normalization_62, built=True>
BB <Normalization name=normalization_63, built=True>
K <Normalization name=normalization_64, built=True>
ISO <Normalization name=normalization_65, built=True>
BABIP <Normalization name=normalization_66, built=True>
AVG <Normalization name=normalization_67, built=True>
OBP <Normalization name=normalization_68, built=True>
SLG <Normalization name=normalization_69, built=True>
wOBA <Normalization name=normalization_70, built=True>
wRC+ <Normalization name=n

In [None]:

def model_topology(all_features, cat_cols):
    flat_categorical = [keras.layers.Flatten()(all_features[x]) for x in cat_cols]
    non_cat = [all_features[key] for key in all_features if key not in cat_cols]
    x = keras.layers.Concatenate()(flat_categorical + non_cat)
    x = Dense(8, activation='relu')(x)
    x = Dense(1, 'sigmoid')(x)
    return x

model = keras.Model(all_inputs, model_topology(all_features, cat_cols))


In [39]:
for i, (name, encoder) in enumerate(all_features.items()):
    feature = train_ds.map(lambda x, _: x[name])
    keras.Model(all_inputs[i], encoder)(feature)


ValueError: Inputs to a layer should be tensors. Got '<_MapDataset element_spec=TensorSpec(shape=(None, 1), dtype=tf.int64, name=None)>' (of type <class 'tensorflow.python.data.ops.map_op._MapDataset'>) as input for layer 'functional_4'.

In [None]:
model.summary()

In [None]:
model.compile('adam', keras.losses.MeanSquaredError())

In [None]:
model.fit(train_ds, epochs = 3)

In [None]:
make_embedding(train_ds, 'Name')

#### OLD

In [None]:
def make_inputs(df: pd.DataFrame, as_cat=set()) -> dict:
    encoders = {}
    for col in df:

        name=f"{col}_encoder"
        print(name)
        if df[col].dtype == 'object' or col in as_cat:
            # print(df[col])
            print(name)
            if isinstance(df[col][0], str):
                lookup_layer = keras.layers.StringLookup
                print(col, " is string type.")
            else:
                print(col, "is int type.")
                lookup_layer = keras.layers.IntegerLookup
            values =  df[col].unique().tolist()
            lookup = lookup_layer(vocabulary=values)
            embedding = keras.layers.Embedding(len(values), int(np.log(len(values))))
            # FIXME: just pass an input to the damned sequence.
            # and flatten
            # print(lookup)
            flatten = layers.Flatten()
            encoder = lambda x: flatten(embedding(lookup_layer(x), name=f"{col}_embedding"))
            encoders[col] = encoder
        else:
            # print(col, " is numeric.")
            encoders[col] = keras.layers.Normalization(name=name)
        # encoders[col] = map_to_encoder(args)
    return encoders

def make_inputs(df, as_cat=set()) -> dict:
    encoders = make_encoders(df, as_cat)
    return {k: encoder(keras.Input(shape=(1,), name=k)) for k, encoder in encoders.items()}


In [None]:
from sklearn.model_selection import train_test_split

def prepare_data(df: pd.DataFrame) -> tuple[pd.DataFrame, ...]:
    df = df.copy()
    test_size = int(min(len(df) / 5, 5_000))
    target = df.pop('target')
    split_data = train_test_split(df, target, test_size=test_size)
    return [dict(df) for df in split_data]

x_train, x_val, y_train, y_val = prepare_data(df.drop(columns='Name'))

In [None]:
StringLookup(vocabulary=['SEA'])(['se', 'SEA'])

In [None]:
make_encoders(x_train)

In [None]:
inputs = make_inputs(x_train, as_cat=['playerid', 'Season'])
inputs

In [None]:
list(inputs.values())

In [None]:
x = layers.Concatenate()(list(inputs.values()))
out = Dense(1, activation='sigmoid')(x)
model = keras.Model(inputs, out)

In [None]:
x_train.values

In [None]:
model.fit(x_train.values, y_train.values)

In [None]:
for i in range(12):
    print(f"    {i}: ( ),")