# Neural net approach

The goal of this notebook is to try using neural nets (in particular the [`tabular` nets from `fastai`](https://docs.fast.ai/tutorial.tabular)). While just straight-up using a NN is somewhat problematic in terms of explainability, my goal is just to see what kind of accuracy we can get. If there's no real improvement over what we're seeing with XGB then there's no point fussing with NNs anymore. If there _is_ an improvement, then we should turn our attention to this.

In [130]:
import pandas as pd
import numpy as np

from utils.data_loader import Dataset

import tensorflow as tf
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.layers.experimental import preprocessing

from sklearn.preprocessing import LabelEncoder

In [131]:
def load_ds(train_or_test='train'):
    if train_or_test == 'train':
        start_date = '2000-01-01'
        end_date = '2015-01-01'
    else:
        start_date = '2015-01-01'
        end_date = '2020-01-01'
        
    ds = Dataset('tf')
    ds.load_games(start_date, end_date)
    ds.add_team_stats(cols=['Avg_Attendance', 'W-L-pct'])
    ds.add_team_pitching_stats(cols=['WHIP', 'ERA'])
    ds.add_pitcher_stats(cols=['WHIP', 'ERA', 'IP'], game_offset=5);

    ds.data = ds.data.drop(['home_pitcher', 'away_pitcher', 'date'], axis='columns')
    
    embedding_cols = ['home_team', 'away_team', 'Y', 'M', 'D']
    numeric_cols = list(set(ds.data.columns) - set(embedding_cols))
    numeric_cols.remove('home_win')
    assert set(numeric_cols).intersection(set(embedding_cols)) == set()
    assert len(embedding_cols) + len(numeric_cols) + 1 == len(ds.data.columns)
    
    for c in ds.data.columns:
        if ds.data[c].isin([-np.inf, np.inf]).sum() > 0:
            ds.data[c] = ds.data[c].replace([-np.inf, np.inf], None)
        if ds.data[c].isna().sum() > 0:
            med = ds.data[c].median()
            ds.data[c] = ds.data[c].fillna(med)
            
    le = LabelEncoder()
    ds.data['away_team'] = le.fit_transform(ds.data['away_team'])
    ds.data['home_team'] = le.transform(ds.data['home_team'])
    
    for c in embedding_cols:
        ds.data[c] = ds.data[c].astype(int)
        
    assert ds.data.isna().sum().sum() == 0
    
    y = ds.data.pop('home_win')
    y = y.astype(int)
    X = ds.data
    
    tf_ds = tf.data.Dataset.from_tensor_slices((dict(X), y)).batch(128)
    return tf_ds, ds, le, embedding_cols, numeric_cols

In [132]:
train_ds, train_df, le, embedding_cols, numeric_cols = load_ds('train')
test_ds, test_df, _, _, _ = load_ds('test')

## Preprocessing layers

In [112]:
def get_normalization_layer(name, dataset):
    # Create a Normalization layer for our feature.
    normalizer = preprocessing.Normalization()

    # Prepare a Dataset that only yields our feature.
    feature_ds = dataset.map(lambda x, y: x[name])

    # Learn the statistics of the data.
    normalizer.adapt(feature_ds)

    return normalizer

In [113]:
def get_category_encoding_layer(name, dataset, dtype, max_tokens=None):
    # Create a StringLookup layer which will turn strings into integer indices
    if dtype == 'string':
        index = preprocessing.StringLookup(max_tokens=max_tokens)
    else:
        index = preprocessing.IntegerLookup(max_values=max_tokens)

    # Prepare a Dataset that only yields our feature
    feature_ds = dataset.map(lambda x, y: x[name])

    # Learn the set of possible values and assign them a fixed integer index.
    index.adapt(feature_ds)
    
    if name == 'home_team':
        print('home_team')
        print(index.get_vocabulary())
        
    if name == 'away_team':
        print('away_team')
        print(index.get_vocabulary())

    # Create a Discretization for our integer indices.
    encoder = preprocessing.CategoryEncoding(max_tokens=index.vocab_size())

    # Prepare a Dataset that only yields our feature.
    feature_ds = feature_ds.map(index)

    # Learn the space of possible indices.
    encoder.adapt(feature_ds)

    # Apply one-hot encoding to our indices. The lambda function captures the
    # layer so we can use them, or include them in the functional model later.
    return lambda feature: encoder(index(feature))

In [114]:
def prep_columns(dataset, embedding_dims=[10]):
    cont_inputs = []
    cat_inputs = []
    encoded_cont_features = []
    encoded_cat_features = []
    if isinstance(embedding_dims, int):
        embedding_dims = [embedding_dims] * len(embedding_cols)
    assert len(embedding_dims) == len(embedding_cols), 'embedding_dims must be an integer or a list with the same length as embedding_cols'

    # Numeric features.
    print(f'Numerical columns: {numeric_cols}')
    for header in numeric_cols:
        numeric_col = tf.keras.Input(shape=(1,), name=header)
        normalization_layer = get_normalization_layer(header, dataset)
        encoded_numeric_col = normalization_layer(numeric_col)
        encoded_numeric_col = tf.keras.layers.Dropout(0.1)(encoded_numeric_col)
        cont_inputs.append(numeric_col)
        encoded_cont_features.append(encoded_numeric_col)

    # Home and away teams
    print(f'Embedding home_team and away_team')
    for i, header in enumerate(['home_team', 'away_team']):
        categorical_col = tf.keras.Input(shape=(1,), name=header, dtype='int64')
        encoded_categorical_col = tf.keras.layers.Embedding(30, embedding_dims[i], name=f'{header}_embedding')(categorical_col)
        encoded_categorical_col = tf.keras.layers.Flatten()(encoded_categorical_col)
        cat_inputs.append(categorical_col)
        encoded_cat_features.append(encoded_categorical_col)
        
    # Categorical features encoded as ints.
    encoded_embedding_cols = embedding_cols.copy()
    encoded_embedding_cols.remove('home_team')
    encoded_embedding_cols.remove('away_team')
    print(f'Categorical columns: {encoded_embedding_cols}')
    for i, header in enumerate(encoded_embedding_cols):
        categorical_col = tf.keras.Input(shape=(1,), name=header, dtype='int64')
        encoding_layer = get_category_encoding_layer(header, dataset, 
                                                     dtype='int', 
                                                     max_tokens=20)
        encoded_categorical_col = encoding_layer(categorical_col)
        encoded_categorical_col = tf.keras.layers.Embedding(20, 5, name=f'{header}_embedding')(encoded_categorical_col)
        encoded_categorical_col = tf.keras.layers.Flatten()(encoded_categorical_col)
        cat_inputs.append(categorical_col)
        encoded_cat_features.append(encoded_categorical_col)
        
    all_inputs = cont_inputs + cat_inputs
        
    return all_inputs, encoded_cont_features, encoded_cat_features

In [115]:
all_inputs, encoded_cont_features, encoded_cat_features = prep_columns(train_ds, 10)

Numerical columns: ['obp_pct_diff', 'team_ERA_pct_diff', 'bayes_pct_diff', 'ops_pct_diff', 'RD_pct_diff', 'R_pct_diff', 'avg_pct_diff', 'pytha_pct_diff', 'team_W-L_pct_diff', 'team_WHIP_pct_diff', 'win_pct_diff', 'team_FP_pct_diff', 'slg_pct_diff', 'RA_pct_diff', 'team_Rank_pct_diff', 'pitcher_WHIP_pct_diff', 'WPA_pct_diff', 'FP_pct_diff', 'Rank_pct_diff', 'pitcher_IP_pct_diff', 'pitcher_ERA_pct_diff', 'log_5']
Embedding home_team and away_team
Categorical columns: ['Y', 'M']


## Creating the model

In [116]:
x = tf.keras.layers.concatenate(encoded_cat_features + encoded_cont_features)
x = tf.keras.layers.Dense(64, activation="relu")(x)
x = tf.keras.layers.BatchNormalization()(x)
x = tf.keras.layers.Dropout(0.5)(x)
x = tf.keras.layers.Dense(32, activation="relu")(x)
x = tf.keras.layers.BatchNormalization()(x)
x = tf.keras.layers.Dropout(0.5)(x)
output = tf.keras.layers.Dense(1)(x)
model = tf.keras.Model(all_inputs, output)
model.compile(optimizer='adam',
              loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
              metrics=["accuracy"])

In [117]:
model.fit(train_ds, epochs=50, validation_data=test_ds)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<tensorflow.python.keras.callbacks.History at 0x167345eb0>

In [74]:
loss, acc = model.evaluate(test_ds)
print(f'Test accuracy = {100*acc:.2f}%')

Test accuracy = 60.15%


## Word embeddings

Get the team embeddings from the embedding layers. Following [this tutorial](https://www.tensorflow.org/tutorials/text/word_embeddings#retrieve_the_trained_word_embeddings_and_save_them_to_disk).

In [75]:
home_team_weights = model.get_layer('home_team_embedding').get_weights()[0]
away_team_weights = model.get_layer('away_team_embedding').get_weights()[0]

In [76]:
vocab = list(le.classes_)

In [77]:
home_team_weights.shape

(30, 10)

In [78]:
import io

def save_embeddings(home_or_away, vocab, weights):
    out_v = io.open(f'{home_or_away}_team_vectors.tsv', 'w', encoding='utf-8')
    out_m = io.open(f'{home_or_away}_team_metadata.tsv', 'w', encoding='utf-8')

    for index, word in enumerate(vocab):
        vec = weights[index] 
        out_v.write('\t'.join([str(x) for x in vec]) + "\n")
        out_m.write(f'{word}\n')
    out_v.close()
    out_m.close()
    
    return dict(zip(vocab, weights))

In [79]:
home_mapping = save_embeddings('home', vocab, home_team_weights)
away_mapping = save_embeddings('away', vocab, away_team_weights)

## Cosine similarity

In [133]:
def cosine_similarity(a, b):
    return np.dot(a, b)/(np.linalg.norm(a)*np.linalg.norm(b))

In [142]:
team_top5 = dict()

for t1 in vocab:
    t1_distances = []
    for t2 in vocab:
        if t1 != t2:
            t1_weights = home_mapping[t1]
            t2_weights = home_mapping[t2]
            t1_distances.append((t2, cosine_similarity(t1_weights, t2_weights)))
    t1_distances = sorted(t1_distances, key=lambda x: x[-1])
    team_top5[t1] = [x[0] for x in t1_distances[-5:]]

In [143]:
team_top5

{'ANA': ['SFN', 'SEA', 'LAN', 'WAS', 'SLN'],
 'ARI': ['WAS', 'HOU', 'BOS', 'MIL', 'MIA'],
 'ATL': ['TBA', 'DET', 'ANA', 'SLN', 'COL'],
 'BAL': ['MIN', 'KCA', 'SDN', 'CLE', 'PHI'],
 'BOS': ['MIL', 'ARI', 'WAS', 'LAN', 'MIA'],
 'CHA': ['ARI', 'PIT', 'SLN', 'COL', 'WAS'],
 'CHN': ['PIT', 'MIL', 'TOR', 'TBA', 'CIN'],
 'CIN': ['KCA', 'TBA', 'TEX', 'CHN', 'DET'],
 'CLE': ['TEX', 'SDN', 'NYN', 'BAL', 'PHI'],
 'COL': ['TOR', 'CHA', 'TBA', 'SLN', 'ATL'],
 'DET': ['ATL', 'NYN', 'TBA', 'TEX', 'CIN'],
 'HOU': ['BOS', 'BAL', 'ARI', 'MIA', 'MIN'],
 'KCA': ['CIN', 'PHI', 'BAL', 'TEX', 'NYN'],
 'LAN': ['MIA', 'ANA', 'BOS', 'SFN', 'WAS'],
 'MIA': ['MIL', 'LAN', 'WAS', 'ARI', 'BOS'],
 'MIL': ['CHA', 'CHN', 'MIA', 'BOS', 'ARI'],
 'MIN': ['NYA', 'MIA', 'BAL', 'OAK', 'HOU'],
 'NYA': ['ATL', 'SEA', 'SLN', 'SFN', 'OAK'],
 'NYN': ['DET', 'SDN', 'PHI', 'KCA', 'TEX'],
 'OAK': ['SFN', 'ANA', 'MIN', 'SEA', 'NYA'],
 'PHI': ['KCA', 'TEX', 'NYN', 'CLE', 'BAL'],
 'PIT': ['SDN', 'ARI', 'CHN', 'CHA', 'TOR'],
 'SDN': ['

In [144]:
ana_2016_test_df = test_df.data[(test_df.data['home_team'] == le.transform(['ANA'])[0]) & (test_df.data['Y'] == 2016)]

In [145]:
ana_2016_test_df.head()

Unnamed: 0,Y,M,D,home_team,away_team,home_elo,away_elo,home_avg,away_avg,home_obp,...,away_WHIP_offset1year,away_ERA_offset1year,home_pitcher_season_game,home_pitcher_WHIP_avg_5games,home_pitcher_ERA_avg_5games,home_pitcher_IP_avg_5games,away_pitcher_season_game,away_pitcher_WHIP_avg_5games,away_pitcher_ERA_avg_5games,away_pitcher_IP_avg_5games
2439,2016,4,4,0,6,1509.867554,1526.674805,0.240166,0.239208,0.30189,...,1.151872,3.36,1.0,1.381221,3.826,5.6,1.0,1.382767,3.828,5.6
2448,2016,4,5,0,6,1505.534912,1531.007324,0.240166,0.239208,0.30189,...,1.151872,3.36,1.0,1.381221,3.826,5.6,1.0,1.382767,3.828,5.6
2469,2016,4,7,0,27,1502.591187,1510.292358,0.240166,0.249648,0.30189,...,1.363889,4.25,1.0,1.381221,3.826,5.6,1.0,1.382767,3.828,5.6
2482,2016,4,8,0,27,1503.895264,1508.988159,0.240166,0.249648,0.30189,...,1.363889,4.25,1.0,1.381221,3.826,5.6,1.0,1.382767,3.828,5.6
2496,2016,4,9,0,27,1501.109863,1511.77356,0.240166,0.249648,0.30189,...,1.363889,4.25,2.0,1.8,5.4,5.0,1.0,1.382767,3.828,5.6
