<a href="https://colab.research.google.com/github/ncorriveau/transformers_for_prediction/blob/main/attention_layers.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# Mount Google Drive so the .npy files under /content/drive can be read below.
from google.colab import drive

drive.mount(mountpoint='/content/drive')



Mounted at /content/drive


In [3]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf
from tensorflow import keras
import tensorflow_datasets as tfds

In [23]:
# Locations of the saved feature and label tensors on the mounted Drive.
features_path = '/content/drive/MyDrive/Data Mining/features.npy'
labels_path = '/content/drive/MyDrive/Data Mining/labels.npy'

# Read both arrays into memory; they are fed to Keras further down.
features, labels = (np.load(path) for path in (features_path, labels_path))

In [24]:
# Show the shape of each loaded array, one per line.
print(features.shape, labels.shape, sep="\n")


(21361, 22, 66)
(21361, 3)


In [25]:
# Pick out the numerical features that get normalized below: the first 35
# entries of the last axis (indices 0-34).
features_num = features[..., :35]
print(features_num.shape)

# Peek at the raw (un-normalized) values stored at position [0, 0].
features_num[0, 0]

(21361, 22, 35)


array([58., 67., 25., 25., 25., 25., 10., 25., 16., 12., 46., 35., 45.,
       41., 41., 57., 72., 25., 56., 40., 73., 25., 57., 40., 33., 50.,
       48., 25., 25., 35., 63., 51., 46., 53., 68.])

In [26]:
# Per-column statistics: reduce over the first two axes so that a single mean
# and a single standard deviation remain for each numerical column.
# NOTE(review): the statistics are taken over every sample, including those that
# are later sliced off as validation/test data — consider fitting them on the
# training slice only.
stat_axes = (0, 1)

mean = features_num.mean(axis=stat_axes)
print(f"shape of mean vector = {mean.shape}")

std = features_num.std(axis=stat_axes)
print(f"shape of std vector = {std.shape}")


shape of mean vector = (35,)
shape of std vector = (35,)


In [27]:
# Standardize the numerical columns: subtract the per-column mean and divide by
# the per-column standard deviation.
# A column that is constant over the whole dataset has std == 0; dividing by it
# would fill that column with NaN/inf and poison training. Dividing by 1 instead
# leaves such a column at exactly 0 and does not change any other column.
safe_std = std.copy()
safe_std[safe_std == 0] = 1.0

features_num = (features_num - mean) / safe_std
print(features_num.shape)
print(features_num[0][0])

# Write the standardized values back over the first 35 columns of the full tensor.
# NOTE: this overwrites `features` in place and rebinds `features_num`, so the
# cell is not idempotent — re-running it standardizes the data a second time.
features[:, :, :35] = features_num
print(features.shape)

(21361, 22, 35)
[-2.13402071 -1.37252412 -1.77469422 -1.23838184 -1.99655923 -2.73920252
 -2.04805005 -1.86919281 -1.97499507 -2.01346997 -0.99274355 -1.92213186
 -1.8076617  -2.24156397 -2.00765514 -1.48199584  0.46714265 -2.30508048
 -1.20626801 -2.30813745  0.28191452 -1.50510393 -0.48443304 -0.86434178
 -1.24425375 -0.70543806 -0.52144225 -1.16782289 -1.33347824 -0.76945225
  2.46779935  1.85987951  0.88080786  1.93442016  2.57236977]
(21361, 22, 66)


In [49]:
# Sizes of the 70% / 10% / 20% train / validation / test partitions.
# The sizes are kept as plain ints (np.round returns floats, which previously
# printed as e.g. "14953.0").
n_samples = features.shape[0]
train_num = int(np.round(n_samples * 0.7))
val_num = int(np.round(n_samples * 0.10))
# The test partition takes whatever is left, so the three sizes always add up to
# n_samples; rounding each fraction independently does not guarantee that.
test_num = n_samples - train_num - val_num

# Cumulative end offsets of each partition along the sample axis.
train_index = train_num
val_index = train_index + val_num
test_index = val_index + test_num  # equals n_samples by construction

print(f"Training sample size = {train_num}, Validation set size = {val_num}, Test set size = {test_num}")


Training sample size = 14953.0, Validation set size = 2136.0, Test set size = 4272.0


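We are working with time series data, i.e. each sample is a match from the match dataset, and our numpy array is ordered by match date. We therefore split it sequentially (train first, then validation, then test) rather than shuffling before the split.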

In [29]:
# Cut features and labels at the same two boundaries, preserving sample order:
#   [0, train_index)          -> training
#   [train_index, val_index)  -> validation
#   [val_index, end)          -> test
train_features, train_labels = features[:train_index], labels[:train_index]
val_features, val_labels = features[train_index:val_index], labels[train_index:val_index]
test_features, test_labels = features[val_index:], labels[val_index:]

In [30]:
# Sanity check: the test features and test labels must have the same number of samples.
print(*(part.shape for part in (test_features, test_labels)))

(4272, 22, 66) (4272, 3)


In [31]:
# Wrap each (features, labels) pair in a tf.data.Dataset that yields one
# (sample, label) element at a time; batching is applied later.
train_dataset, val_dataset, test_dataset = (
    tf.data.Dataset.from_tensor_slices(pair)
    for pair in (
        (train_features, train_labels),
        (val_features, val_labels),
        (test_features, test_labels),
    )
)

Ok, now we finally have data that's ready to go in TensorFlow. Let's run it through a very simple network and see what we get.

In [40]:
# Shuffle individual samples first and batch afterwards. Batching before
# shuffling only reorders whole batches, so each batch would always contain the
# same 32 samples. This order also matches the transformer training cell.
train_ds = train_dataset.shuffle(100).batch(32)

# Baseline: flatten each (22, 66) sample and map it straight to the three
# class probabilities with a single softmax layer.
simple_model = tf.keras.Sequential([
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(3, activation="softmax"),
])

simple_model.compile(optimizer='adam',
                     loss='categorical_crossentropy',
                     metrics=['accuracy'])

# The batch dimension is left unspecified with None; the previous `(,22,66)`
# was a syntax error and stopped the cell from running.
simple_model.build(input_shape=(None, 22, 66))
simple_model.summary()

In [42]:
# Train the baseline for ten epochs on the batched training data; the History
# object returned by fit() is the value displayed for this cell.
simple_model.fit(x=train_ds, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f1154ab5a10>

So we basically fed the data into a very simple (single dense layer) model, and it produced a train accuracy of ~53%, which is ok. This is a base case structure that we can build off of. Now we will implement a transformer architecture, starting with a linear projection layer, followed by a vanilla transformer encoder, and lastly a multiclass classification head that outputs our 3-part vector of [home win, home tie, home loss] probabilities.

In [43]:
from keras import layers

#create the transformer encoder 
class TransformerEncoder(layers.Layer):
    """One Transformer encoder block.

    The block applies multi-head self-attention to its input, adds the result
    back to the input and layer-normalizes it, then runs a two-layer dense
    projection with a second residual connection and layer normalization.
    The output has the same last dimension as the input, which must therefore
    equal `embed_dim`.

    Args:
        embed_dim: Size of the last axis of the input/output; also used as the
            per-head `key_dim` of the attention layer.
        dense_dim: Width of the hidden (ReLU) layer in the dense projection.
        num_heads: Number of attention heads.
        **kwargs: Forwarded to `layers.Layer`.
    """

    def __init__(self, embed_dim, dense_dim, num_heads, **kwargs):
        super().__init__(**kwargs)
        # Stored so that get_config() can reproduce the constructor call.
        self.embed_dim = embed_dim
        self.dense_dim = dense_dim
        self.num_heads = num_heads
        self.attention = layers.MultiHeadAttention(
            num_heads=num_heads, key_dim=embed_dim)
        # Feed-forward part: expand/contract to dense_dim, then project back to
        # embed_dim so the residual addition in call() is shape-compatible.
        self.dense_proj = keras.Sequential(
            [layers.Dense(dense_dim, activation="relu"),
             layers.Dense(embed_dim)]
        )
        self.layernorm_1 = layers.LayerNormalization()
        self.layernorm_2 = layers.LayerNormalization()

    def call(self, inputs, mask=None):
        """Run the encoder block on `inputs`.

        Args:
            inputs: Tensor whose last axis has size `embed_dim`.
            mask: Optional padding mask. Previously this argument was accepted
                but silently ignored; it is now forwarded to the attention
                layer. Assumed to have shape (batch, seq) as produced by Keras
                masking layers — TODO confirm if a mask is ever supplied.

        Returns:
            Tensor with the same shape as `inputs`.
        """
        attention_mask = None
        if mask is not None:
            # Add a query axis so the (batch, seq) mask broadcasts against the
            # (batch, query, key) attention scores.
            attention_mask = mask[:, tf.newaxis, :]
        # Self-attention: the same tensor serves as query, value and key.
        attention_output = self.attention(
            inputs, inputs, inputs, attention_mask=attention_mask)
        proj_input = self.layernorm_1(inputs + attention_output)
        proj_output = self.dense_proj(proj_input)
        return self.layernorm_2(proj_input + proj_output)

    def get_config(self):
        """Return the layer config including the constructor arguments.

        This is what allows the layer to be re-created by
        `keras.models.load_model(..., custom_objects=...)`.
        """
        config = super().get_config()
        config.update({
            "embed_dim": self.embed_dim,
            "num_heads": self.num_heads,
            "dense_dim": self.dense_dim,
        })
        return config


In [47]:
# Hyperparameters of the transformer classifier.
embed_dim = 256   # width of the linear projection and of every encoder block
num_heads = 4     # attention heads per encoder block
dense_dim = 32    # hidden width of each encoder's dense projection
num_layers = 12   # number of stacked encoder blocks
input_shape = (features.shape[1], features.shape[-1])
print(input_shape)

# Full flow: linear projection -> stacked encoders -> pooling -> softmax head.
# The features are standardized floating-point values, so the input is declared
# as float32. It was previously declared int64; Keras casts incoming tensors to
# the declared input dtype, which would truncate the features to integers.
inputs = keras.Input(shape=input_shape, dtype="float32")
x = layers.Dense(embed_dim, activation="relu")(inputs)

# Run through self-attention encoding num_layers times. Each iteration creates
# a fresh block with its own weights, exactly as the former 12 repeated lines did.
x = keras.Sequential(
    [TransformerEncoder(embed_dim, dense_dim, num_heads) for _ in range(num_layers)]
)(x)

# Collapse the sequence axis, then predict the three class probabilities.
x = layers.GlobalMaxPooling1D()(x)
outputs = layers.Dense(3, activation="softmax")(x)

model = keras.Model(inputs, outputs)
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])
model.summary()

(22, 66)
Model: "model_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_3 (InputLayer)        [(None, 22, 66)]          0         
                                                                 
 dense_56 (Dense)            (None, 22, 256)           17152     
                                                                 
 sequential_43 (Sequential)  (None, 22, 256)           12835200  
                                                                 
 global_max_pooling1d_1 (Glo  (None, 256)              0         
 balMaxPooling1D)                                                
                                                                 
 dense_81 (Dense)            (None, 3)                 771       
                                                                 
Total params: 12,853,123
Trainable params: 12,853,123
Non-trainable params: 0
______________________________________

In [None]:
# Only the training data is shuffled; validation and test data are just batched.
train_ds = train_dataset.shuffle(100).batch(32)
val_ds = val_dataset.batch(32)
test_ds = test_dataset.batch(32)

# Save the model to disk whenever the validation loss improves, so that the
# best epoch (not the last one) is the one evaluated below.
callbacks = [
    keras.callbacks.ModelCheckpoint(
        filepath="transformer_encoder.keras",
        monitor="val_loss",
        save_best_only=True,
    )
]

model.fit(train_ds, validation_data=val_ds, epochs=10,
          callbacks=callbacks)

# Reload the best checkpoint; the custom layer class has to be supplied so
# Keras can rebuild it from its saved config.
model = keras.models.load_model(
    "transformer_encoder.keras",
    custom_objects={"TransformerEncoder": TransformerEncoder})

# The model is compiled with an accuracy metric, so evaluate() returns
# [loss, accuracy]. Formatting that list directly with ":.3f" raises a
# TypeError, so it is unpacked first.
test_loss, test_acc = model.evaluate(test_ds)
print(f"Test acc: {test_acc:.3f}")
