In [None]:
import matplotlib.pyplot as plt
import numpy as np
 
! pip install -q pyyaml h5py  # Required to save models in HDF5 format

### Mount Google Drive

**Requires the `dataset_tensor.npy` file to be in the "Colab Notebooks/Tensorized Transformers/Data" folder of your Google Drive!**

In [None]:
from google.colab import drive

# Make Google Drive available under /content/drive.
drive.mount('/content/drive')

# Project root on Drive and the folder holding the dataset file.
PATH = '/content/drive/My Drive/Colab Notebooks/Tensorized Transformers/'
DATA_PATH = f'{PATH}Data/'

Mounted at /content/drive


### Clone the Tensorized Transformers GitHub repository

In [None]:
# Ask for the GitHub username used to authenticate the clone below.
# Read it with Python's input() instead of shelling out to `read`, so the
# value does not depend on how the shell command's output is captured.
print('Github username:')
git_username = input().strip()

In [None]:
from getpass import getpass

# Ask for the GitHub personal access token used to authenticate the clone below.
# getpass() reads the secret without echoing it, so the token is not left in
# the saved cell output.
print('Github access token (https://github.com/settings/tokens):')
git_token = getpass('').strip()

In [None]:
# Clone the entire repo.
%cd /content
!git clone -l -s https://$git_username:$git_token@github.com/onurbil/tensorized_transformers.git tensorized_transformers
%cd tensorized_transformers
!ls
%cd ..

REPO_PATH = '/content/tensorized_transformers'

In [None]:
import sys

# Make the cloned repository importable. The membership check keeps the cell
# idempotent: re-running it does not append the same entry again.
if REPO_PATH not in sys.path:
    sys.path.append(REPO_PATH)
print(sys.path)

## Experiments

In [None]:
import tensorflow as tf
import tensorflow.keras as kr
import numpy as np
import matplotlib.pyplot as plt

import model.tt_mode_weights_TPU as tt
import dataset_tools.split
from visualization_tools.visualization import visualize_pos_encoding, attention_plotter

from tensorflow.keras.callbacks import TensorBoard
import datetime

# %load_ext tensorboard
# %tensorboard --logdir '/content/drive/My Drive/Colab Notebooks/Tensorized Transformers/output/'

In [None]:
# Resolve the TPU cluster (no explicit address is passed) and report its workers.
tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
worker_addresses = tpu.cluster_spec().as_dict()['worker']
print('Running on TPU ', worker_addresses)

# Connect to the cluster and initialise the TPU system before building a strategy.
tf.config.experimental_connect_to_cluster(tpu)
tf.tpu.experimental.initialize_tpu_system(tpu)

# Distribution strategy used later to build and compile the model.
strategy = tf.distribute.TPUStrategy(tpu)
replica_count = strategy.num_replicas_in_sync
print("REPLICAS: ", replica_count)

In [None]:
# End-to-end experiment: load the dataset, build the tensorized transformer
# encoder under the TPU strategy, train with early stopping, and report the
# validation / test MAE.

# Timestamped output directory (only needed if the TensorBoard callback is enabled).
# NOTE(review): `dir` shadows the Python builtin; the name is kept in case other
# cells reference it.
dir = PATH + 'output/' + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")

# Load dataset:
filename = DATA_PATH + 'dataset_tensor.npy'
# file_path = os.path.join(common.paths.PROCESSED_DATASET_DIR, filename)
# NOTE(review): allow_pickle=True lets np.load unpickle arbitrary objects; only
# use it on files from a trusted source.
dataset = np.load(filename, allow_pickle=True)

print(dataset.shape)

###### ALL PARAMETERS HERE ######
softmax_type = 2
input_length = 16
lag = 4
epoch = 200

# Learning-rate schedule parameters (the optimizer uses tt.CustomSchedule below,
# not a constant learning rate).
d_model = 256
warmup_steps = 50
factor1 = -0.6
factor2 = -1.5

head_num = 64
dense_units = 512
batch_size = 4  # per-replica batch size passed to the encoder layers

num_examples = 10112
num_valid_examples = 512
initializer = 'RandomNormal'
patience = 60

# Batch size actually fed to fit/predict: one per-replica batch for every replica.
global_batch_size = batch_size * strategy.num_replicas_in_sync

# Round the example counts down to whole global batches so no partial batch is fed.
num_examples = (num_examples // global_batch_size) * global_batch_size
num_valid_examples = (num_valid_examples // global_batch_size) * global_batch_size

train, test = dataset_tools.split.split_train_test(dataset)
x_train, y_train = dataset_tools.split.get_xy(train, input_length=input_length, lag=lag)
x_test, y_test = dataset_tools.split.get_xy(test, input_length=input_length, lag=lag)

# Restore the two trailing dataset axes on inputs and targets.
#x_train = x_train.astype('float32')
x_train = tf.reshape(x_train, (x_train.shape[0], x_train.shape[1], dataset.shape[1], dataset.shape[2]))
y_train = tf.reshape(y_train, (y_train.shape[0], dataset.shape[1], dataset.shape[2]))
x_test = tf.reshape(x_test, (x_test.shape[0], x_test.shape[1], dataset.shape[1], dataset.shape[2]))
y_test = tf.reshape(y_test, (y_test.shape[0], dataset.shape[1], dataset.shape[2]))

# Choosing first 29 cities
x_train = x_train[:, :, :29, :]
y_train = y_train[:, :29, :]
x_test = x_test[:, :, :29, :]
y_test = y_test[:, :29, :]

print(f'FULL_x_train.shape: {x_train.shape}')

input_shape = (input_length, x_train.shape[-2], x_train.shape[-1])
output_shape = (1, 1)

# Choosing temperature as output (index 0 on the second-to-last axis, index 4 on the last).
y_train = y_train[..., 0, 4]
y_test = y_test[..., 0, 4]


def as_targets(y):
    """Reshape rank-1 targets to (num_examples, *output_shape).

    The model ends in Reshape(output_shape), so its predictions have shape
    (batch, 1, 1). Feeding rank-1 targets would let the element-wise loss
    broadcast every prediction against every target in the batch; giving the
    targets the same shape as the predictions avoids that.
    """
    return tf.reshape(y, (-1, *output_shape))


learning_rate = tt.CustomSchedule(d_model, warmup_steps=warmup_steps, factor1=factor1, factor2=factor2)
# Alternative schedules tried: tt.CustomSchedule(d_model) / warmup_steps=50, factor1=-0.84, factor2=-1.7
optimizer = tf.keras.optimizers.Adam(
    learning_rate,
    beta_1=0.9,
    beta_2=0.98,
    epsilon=1e-9,
)
lr_metric = tt.get_lr_metric(optimizer)

# optimizer = tf.keras.optimizers.Adadelta(learning_rate)
# optimizer = tf.keras.optimizers.Nadam(learning_rate)

# Plot the schedule; the line is labelled so plt.legend() has an artist to show.
temp_learning_rate_schedule = tt.CustomSchedule(d_model, warmup_steps=warmup_steps, factor1=factor1, factor2=factor2)
plt.plot(temp_learning_rate_schedule(tf.range(20000, dtype=tf.float32)), label='learning rate')
plt.ylabel("Learning Rate")
plt.xlabel("Train Step")
plt.legend()
plt.show()

##tf.keras.optimizers.Adadelta

with strategy.scope():
    model = kr.Sequential([
        kr.Input(shape=input_shape),
        tt.PositionalEncoding(broadcast=True),
        tt.EncoderLayer(input_length, d_model, head_num, dense_units, initializer, softmax_type, batch_size),
        tt.EncoderLayer(input_length, d_model, head_num, dense_units, initializer, softmax_type, batch_size),
        tt.EncoderLayer(input_length, d_model, head_num, dense_units, initializer, softmax_type, batch_size),
        # tt.EncoderLayer(input_length, d_model, head_num, dense_units, initializer, softmax_type, batch_size),
        # tt.EncoderLayer(input_length, d_model, head_num, dense_units, initializer, softmax_type, batch_size),
        # tt.EncoderLayer(input_length, d_model, head_num, dense_units, initializer, softmax_type, batch_size),
        kr.layers.Flatten(),
        # Dense expects an integer unit count, so compute it as a plain Python int.
        kr.layers.Dense(int(np.prod(output_shape)), activation='linear'),
        kr.layers.Reshape(output_shape),
    ])
    model.compile(optimizer=optimizer, loss='mae', metrics=['mse', lr_metric])
    #model.compile(optimizer=kr.optimizers.Adam(learning_rate=learning_rate), loss='mse', metrics=['mae'])

model.summary()

# Validation set: the num_valid_examples samples immediately before the last
# num_examples samples, which form the training set.
x_valid = x_train[-num_examples - num_valid_examples:-num_examples, ...]
y_valid = y_train[-num_examples - num_valid_examples:-num_examples]
print(f'x_valid.shape: {x_valid.shape}')

x_train = x_train[-num_examples:]
y_train = y_train[-num_examples:]

print(f'x_train.shape: {x_train.shape}')
print(f'x_test.shape: {x_test.shape}')

# Callbacks
# Prints the first encoder layer's attention weights when training ends
# (defined for convenience; not passed to fit() below).
print_attention_weights = kr.callbacks.LambdaCallback(
    on_train_end=lambda logs: print(model.layers[1].attention_weights))
early_stopping = kr.callbacks.EarlyStopping(monitor='val_loss',
                                            patience=patience,
                                            restore_best_weights=True,
                                            verbose=1)

model.fit(
    x_train, as_targets(y_train),
    epochs=epoch,
    batch_size=global_batch_size,
    validation_data=(x_valid, as_targets(y_valid)),
    callbacks=[early_stopping],
)
#TensorBoard(log_dir=dir),
# labels = np.arange(model.layers[1].attention_weights.shape[-2]).tolist()

# if (softmax_type == 1 or softmax_type == 2):
#     attention_plotter(tf.reshape(model.layers[1].attention_weights[1][0], (input_length,-1)), labels)
#     attention_plotter(tf.reshape(model.layers[1].attention_weights[2][0], (input_length,-1)), labels)
#     attention_plotter(tf.reshape(model.layers[1].attention_weights[3][0], (input_length,-1)), labels)
#     attention_plotter(tf.reshape(model.layers[1].attention_weights[4][0], (input_length,-1)), labels)

# elif softmax_type == 3:
#     # print(model.layers[1].attention_weights[0][3].numpy())
#     attention_3d_plotter(model.layers[1].attention_weights[0][3].numpy(), city_labels)
# else:
#     pass

# Validation MAE, computed on flattened arrays so the shapes line up element-wise.
pred = model.predict(x_valid, batch_size=global_batch_size)
mae = kr.metrics.mae(y_valid.numpy().flatten(), pred.flatten())
print(f'Figure mae: {np.mean(mae)}')

plt.figure(figsize=(20, 8))
plt.plot(range(pred.size), pred.flatten(), label='pred')
plt.plot(range(len(y_valid)), y_valid, label='true')
plt.legend()
plt.show()

print("\n\n######################## Model description ################################")
model.summary()
print("softmax_type = ", softmax_type)
print("Input_length = ", input_length)
print("Lag = ", lag)
print("Epoch = ", epoch)
print("warmup_steps = ", warmup_steps)
print("factor1 = ", factor1)
print("factor2 = ", factor2)

print("LR = ", learning_rate)
print("Head_num = ", head_num)
print("d_model = ", d_model)
print("dense_units = ", dense_units)
print("batch_size = ", batch_size)

print("num_examples = ", num_examples)
print("num_valid_examples = ", num_valid_examples)
print("input_shape = ", input_shape)
print("patience = ", patience)

# Evaluate on the trailing test samples, rounded down to a multiple of batch_size.
# NOTE(review): 8813 is hard-coded -- presumably the size of the test split; confirm.
num_test_examples = (8813 // batch_size) * batch_size
pred = model.predict(x_test[-num_test_examples:, ...], batch_size=global_batch_size)
mae = kr.metrics.mae(y_test[-num_test_examples:, ...].numpy().flatten(), pred.flatten())
print("\n\n######################## Results ##########################################")
print(f'test mae: {np.mean(mae)}')

### Saving Model:
# model.save('/content/drive/My Drive/Colab Notebooks/Model/' + datetime.datetime.now().strftime("%Y%m%d-%H%M%S"))
# To get it back:
# new_model = tf.keras.models.load_model('saved_model/my_model')

## **Test of the learning rate curve**

In [None]:
# Schedule hyper-parameters under test.
d_model, warmup_steps = 256, 50
factor1, factor2 = -0.6, -1.5

temp_learning_rate_schedule = tt.CustomSchedule(
    d_model,
    warmup_steps=warmup_steps,
    factor1=factor1,
    factor2=factor2,
)

# Evaluate the schedule over the first 10000 training steps and plot it.
train_steps = tf.range(10000, dtype=tf.float32)
fig, ax = plt.subplots(figsize=(20, 8))
ax.plot(temp_learning_rate_schedule(train_steps))
ax.set_ylabel("Learning Rate")
ax.set_xlabel("Train Step")
plt.show()

#### Old experiments