<a href="https://colab.research.google.com/github/ncorriveau/transformers_for_prediction/blob/main/attention_layers.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#Data Processing 





In [1]:
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [2]:
!pip install tensorflow_addons

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting tensorflow_addons
  Downloading tensorflow_addons-0.18.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.1 MB)
[K     |████████████████████████████████| 1.1 MB 31.3 MB/s 
Installing collected packages: tensorflow-addons
Successfully installed tensorflow-addons-0.18.0


In [3]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf
from tensorflow import keras
import tensorflow_datasets as tfds
from keras import layers
import tensorflow_addons as tfa

In [4]:
#load our numpy tensors to load into keras 
features_path = '/content/drive/MyDrive/Data Mining/features.npy'
labels_path = '/content/drive/MyDrive/Data Mining/labels.npy'
features = np.load(features_path)
labels = np.load(labels_path)

In [5]:
print(features.shape)
print(labels.shape)


(21361, 22, 66)
(21361, 3)


In [6]:
#normalize the numerical features which are locatead in cols 0-35
features_num = features[:,:,:35]
print(features_num.shape)

features_num[0][0]

(21361, 22, 35)


array([58., 67., 25., 25., 25., 25., 10., 25., 16., 12., 46., 35., 45.,
       41., 41., 57., 72., 25., 56., 40., 73., 25., 57., 40., 33., 50.,
       48., 25., 25., 35., 63., 51., 46., 53., 68.])

In [7]:
mean = np.mean(features_num,axis=(0,1))
print(f"shape of mean vector = {mean.shape}")

std = np.std(features_num,axis=(0,1))
print(f"shape of std vector = {std.shape}")


shape of mean vector = (35,)
shape of std vector = (35,)


In [8]:
features_num = (features_num - mean) / std
print(features_num.shape)
print(features_num[0][0])

features[:,:,:35] = features_num 
print(features.shape)

(21361, 22, 35)
[-2.13402071 -1.37252412 -1.77469422 -1.23838184 -1.99655923 -2.73920252
 -2.04805005 -1.86919281 -1.97499507 -2.01346997 -0.99274355 -1.92213186
 -1.8076617  -2.24156397 -2.00765514 -1.48199584  0.46714265 -2.30508048
 -1.20626801 -2.30813745  0.28191452 -1.50510393 -0.48443304 -0.86434178
 -1.24425375 -0.70543806 -0.52144225 -1.16782289 -1.33347824 -0.76945225
  2.46779935  1.85987951  0.88080786  1.93442016  2.57236977]
(21361, 22, 66)


In [9]:
train_num = np.round(features.shape[0]*0.7)
val_num = np.round(features.shape[0]*0.15)
test_num = np.round(features.shape[0]*0.15)
train_index = int(train_num)
val_index = int(train_num+val_num)
test_index = int(val_index+test_num)

print(f"Training sample size = {train_num}, Validation set size = {val_num}, Test set size = {test_num}")


Training sample size = 14953.0, Validation set size = 3204.0, Test set size = 3204.0


We are working with time series data, i.e. each in the match dataset and thus our numpy array is in ordered by match date. 

In [10]:
train_features = features[:train_index,:,:]
train_labels = labels[:train_index,:]

val_features = features[train_index:val_index,:,:]
val_labels = labels[train_index:val_index,:]

test_features = features[val_index:,:,:]
test_labels = labels[val_index:,:]

In [11]:
print(test_features.shape, test_labels.shape)

(3204, 22, 66) (3204, 3)


In [12]:
#turn data into dataset objects 
train_dataset = tf.data.Dataset.from_tensor_slices((train_features, train_labels))
val_dataset = tf.data.Dataset.from_tensor_slices((val_features, val_labels))
test_dataset = tf.data.Dataset.from_tensor_slices((test_features, test_labels))

#Naive Implementations

We get a baseline model by implementing a naive dense model that just flattens all weights, and then we try a first stab at a transformer with 12 layers that self encodes all of the data and has a MLP classifier as a head



Ok, now we finally have data that's ready to go in tf. let's run it through a very simple network and see what we get. 

In [13]:
train_ds = train_dataset.shuffle(100).batch(32)
val_ds = val_dataset.batch(32)
test_ds = test_dataset.batch(32)

In [14]:

simple_model = tf.keras.Sequential([
  tf.keras.layers.Flatten(),
  tf.keras.layers.Dense(3,activation="softmax",)
])

simple_model.compile(optimizer='adam',
              loss=['categorical_crossentropy'] ,
              metrics=['accuracy'])

simple_model.build(input_shape=(None,22,66))
simple_model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 flatten (Flatten)           (None, 1452)              0         
                                                                 
 dense (Dense)               (None, 3)                 4359      
                                                                 
Total params: 4,359
Trainable params: 4,359
Non-trainable params: 0
_________________________________________________________________


In [15]:
simple_model.fit(train_ds, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f0c90217b90>

In [16]:
print(f"Test acc: {simple_model.evaluate(test_ds)}")

Test acc: [1.0827760696411133, 0.4697253406047821]


#Transformer Implementation
Here we are going to follow some more granular techniques to create a couple sub modules from Keras Layers to implement an architecture closely related to [Vision Transformers ](https://https://github.com/keras-team/keras-io/blob/master/examples/vision/image_classification_with_vision_transformer.py)

So far we ran our data, (which is a (sample x 22 players x 66 features) vector through both a simple flatten --> MLP layer, resulting in ~45% test accuracy, and a naive transformer implementation where we put it through a linear projection layer, 12 layers of 4 head multihead self attention, global max pooling and then a classification layer on top of that. 

We will be changing our implementation to make it more customizable and now try two different approaches: 
1.) Separate the data to learn a representation of each team through self attention (i.e. two (11xfeature length) inputs) 
2.) Same approach as before 

In both cases we will increase numper of MLP layers, and also flatten the output of the self attention blocks into (batch x (256*11)) vectors as inputs into the MLP. 

We will also take the position indicator in the feature vector (second to last position), and embed it and add it to back to the feature vector before putting it through the attention block. This is the same approach taken in the Vision Transformers paper linked above. 

In [17]:
def split_tensor(features,labels, index1=0,index2=-1):
  '''Start with our full feature tensor of shape (matches x 22 x feature_size) 
  and slice it into train, val, and test set with 
  splitting of teams and then creating a separate tensor for 
  positions in each match 
  Output will be:
   team1 (index1:index2 x 11 x feature_size-2)
   team2 (index1:index2 x 11 x feature_size-2)
   team1_pos (index1:index2 x 11 x 1)
   team2_pos (index1:index2 x 11 x 1)
   Slice the labels tensor to be (index1:index2, 3,) '''
  
  features_ = features[index1:index2,:,:-2]
  team_1 = features_[:,:11,:]
  team_2 = features_[:,11:,:]
  team_1_pos = features[index1:index2,:11,-2:-1]
  team_2_pos = features[index1:index2,11:,-2:-1]
  labels_ = labels[index1:index2,:]

  return team_1, team_2, team_1_pos, team_2_pos, labels_


train_team_1, train_team_2, train_team_1_pos, train_team_2_pos,\
 train_labels = split_tensor(features, labels, index2=train_index)

val_team_1, val_team_2, val_team_1_pos, val_team_2_pos,\
 val_labels = split_tensor(features,labels,index1=train_index,index2=val_index)


test_team_1, test_team_2, test_team_1_pos, test_team_2_pos,\
test_labels = split_tensor(features, labels, index1=val_index)

In [18]:
print(train_index, val_index, test_index)

14953 18157 21361


In [19]:
print(f"Team 1 Shape = {train_team_1.shape}, \nTeam 2 Shape = {train_team_2.shape}\
        \nTeam 1 Positions = {train_team_1_pos.shape}")

print(f"Team 1 Shape = {val_team_1.shape}, \nTeam 2 Shape = {val_team_2.shape}\
        \nTeam 1 Positions = {val_team_1_pos.shape}")



print(f"Team 1 Shape = {test_team_1.shape}, \nTeam 2 Shape = {test_team_2.shape}\
        \nTeam 1 Positions = {test_team_1_pos.shape}")


Team 1 Shape = (14953, 11, 64), 
Team 2 Shape = (14953, 11, 64)        
Team 1 Positions = (14953, 11, 1)
Team 1 Shape = (3204, 11, 64), 
Team 2 Shape = (3204, 11, 64)        
Team 1 Positions = (3204, 11, 1)
Team 1 Shape = (3203, 11, 64), 
Team 2 Shape = (3203, 11, 64)        
Team 1 Positions = (3203, 11, 1)


In [20]:
train_dataset = tf.data.Dataset.from_tensor_slices(({"input_1":train_team_1,"input_2":train_team_2}, train_labels))

val_dataset = tf.data.Dataset.from_tensor_slices(({"input_1":val_team_1,"input_2":val_team_2}, val_labels))

test_dataset = tf.data.Dataset.from_tensor_slices(({"input_1":test_team_1,"input_2":test_team_2}, test_labels))


In [21]:
num_classes = 3
input_shape = (11,64)
learning_rate = 0.001
weight_decay = 0.0001
batch_size = 32
num_epochs = 100
num_players = 11
projection_dim = 256
num_heads = 4
transformer_units = [
    projection_dim * 2,
    projection_dim,
] 

transformer_layers = 8
mlp_head_units = [2048, 1024]  # Size of the dense layers of the final classifier


In [22]:
'''Implement MLP layers'''
def mlp(x, hidden_units, dropout_rate):
    for units in hidden_units:
        x = layers.Dense(units, activation=tf.nn.gelu)(x)
        x = layers.Dropout(dropout_rate)(x)
    return x


class Encoder(layers.Layer):
  '''Takes the original input layer, projects it and then adds a positional encoding layer to it'''
  def __init__(self, num_players, projection_dim):
      super().__init__()
      self.num_players = num_players
      self.projection = layers.Dense(units=projection_dim)
      self.position_embedding = layers.Embedding(
          input_dim=num_players, output_dim=projection_dim
      )

  def call(self, players):
      positions = tf.range(start=0, limit=self.num_players, delta=1)
      encoded = self.projection(players) + self.position_embedding(positions)
      return encoded
  

In [23]:
def transformer_block(team):
    '''Input a particular team as a set of (batch_size, 11, feature size) vectors,
     and returns output of transformer block that will be used in classification downstream'''
    encoded_players = Encoder(num_players, projection_dim)(team)

    # Create multiple layers of the Transformer block.
    for _ in range(transformer_layers):
        # Layer normalization 1.
        x1 = layers.LayerNormalization(epsilon=1e-6)(encoded_players)
        # Create a multi-head attention layer.
        attention_output = layers.MultiHeadAttention(
            num_heads=num_heads, key_dim=projection_dim, dropout=0.1
        )(x1, x1)
        # Skip connection 1.
        x2 = layers.Add()([attention_output, encoded_players])
        # Layer normalization 2.
        x3 = layers.LayerNormalization(epsilon=1e-6)(x2)
        # MLP.
        x4 = mlp(x3, hidden_units=transformer_units, dropout_rate=0.1)
       
        # Skip connection 2.
        encoded_players1= layers.Add()([x3, x2])

    # Create a [batch_size, projection_dim] tensor.
    representation = layers.LayerNormalization(epsilon=1e-6)(encoded_players)

    representation = layers.Flatten()(representation)
    representation = layers.Dropout(0.5)(representation)
    return representation
    

In [24]:

input1 = layers.Input(shape=input_shape)
input2 = layers.Input(shape=input_shape)

representation1 = transformer_block(input1)
representation2 = transformer_block(input2)

representation = layers.Concatenate()([representation1,representation2])
features = mlp(representation, hidden_units=mlp_head_units, dropout_rate=0.5)
# Classify outputs.
logits = layers.Dense(num_classes)(features)
# Create the Keras model.
model = keras.Model(inputs=[input1, input2], outputs=logits)

In [27]:
def run_experiment(model):
    optimizer = tfa.optimizers.AdamW(
        learning_rate=learning_rate, weight_decay=weight_decay
    )

    model.compile(
        optimizer=optimizer,
        loss=keras.losses.CategoricalCrossentropy(from_logits=True),
        metrics=[keras.metrics.CategoricalAccuracy(name="accuracy")],
    )

    checkpoint_filepath = "/content/drive/MyDrive/Data Mining/"
    checkpoint_list = [
        keras.callbacks.ModelCheckpoint(
          checkpoint_filepath,
          monitor="val_accuracy",
          save_best_only=True,
          save_weights_only=True,
        ), 
        keras.callbacks.EarlyStopping(
          monitor="val_accuracy",
          patience=2,
        ),

        keras.callbacks.TensorBoard(
          log_dir="/content/drive/MyDrive/Data Mining/",
          ),
       ]

    history = model.fit(
        x = [train_team_1, train_team_2],
        y = train_labels,
        batch_size=batch_size,
        epochs=num_epochs,
        validation_split=0.1,
        validation_data = ([val_team_1,val_team_2],val_labels),
        callbacks=checkpoint_list,
    )

    return history

In [None]:
history = run_experiment(model)

In [36]:
checkpoint_filepath = "/content/drive/MyDrive/Data Mining/"
model.load_weights(checkpoint_filepath)
loss, accuracy = model.evaluate([test_team_1, test_team_2],test_labels)
print(f"Test accuracy: {accuracy*100:.2f}")


Test accuracy: 53.01


In [None]:
%reload_ext tensorboard
%tensorboard --logdir /checkpoint_filepath

In [40]:
!kill 553