# PAT Conv Pretraining Notebook


Trained with Google TPU v2

# Set Up

In [None]:
# Mount Google Drive at /content/drive; the dataset paths and the `root`
# output folder used later in this notebook all live under that mount point.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# @title Importing

#Installs
!pip install pyarrow fastparquet

# Packages
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import random
import tensorflow as tf
import os
from IPython.display import clear_output

# Sklearn
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.utils import class_weight

# Keras
from keras.models import Sequential
from keras.layers import LSTM, Dense, Dropout
#from keras.layers.embeddings import Embedding
from keras.metrics import AUC

# Tf
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.models import Sequential
import random

In [None]:
#@title Random Seeds
import random
## SEEDS

# Hard Code Random Seeds.
r1 = 0  # seed for Python's `random` module and NumPy's global RNG
r2 = 1  # seed for TensorFlow's global RNG (drives tf.random.shuffle in MaskLayer)

# Set Random Seed
# Seed every RNG the notebook can touch so a Restart & Run All is repeatable;
# NumPy was previously left unseeded.
random.seed(r1)
np.random.seed(r1)
tf.random.set_seed(r2)

In [None]:
#@title Connect to TPU
print("TensorFlow version:", tf.__version__)

# Connect to the TPU cluster or fall back to CPU/GPU.
# The result is stored in the global `strategy`; later cells build, compile
# and fit the model inside `strategy.scope()`.
try:
  resolver = tf.distribute.cluster_resolver.TPUClusterResolver()  # Tries to connect to the TPU
  tf.config.experimental_connect_to_cluster(resolver)
  tf.tpu.experimental.initialize_tpu_system(resolver)
  strategy = tf.distribute.TPUStrategy(resolver)
  devices = tf.config.list_logical_devices('TPU')
  print('TPU devices:', devices)
except ValueError:
  # Only ValueError is treated as "no TPU available"; any other exception propagates.
  print("Could not connect to TPU; using CPU/GPU strategy instead.")
  strategy = tf.distribute.get_strategy()

# Example computation using the strategy
# Smoke test: multiply a 2x3 matrix by a 3x2 matrix through strategy.run and print the result.
with strategy.scope():
  a = tf.constant([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]])
  b = tf.constant([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]])

  @tf.function
  def matmul_fn(x, y):
    return tf.matmul(x, y)

  z = strategy.run(matmul_fn, args=(a, b))

print(z)

# Hyperparameters & Settings (Fill out)


In [None]:
# write where you want to save all your files
# `root` is the output folder; the encoder file name built below (`name`,
# which starts with "/") is appended to it, so the encoder is saved to root + name.
## CHANGE ##
root = "/content/drive/MyDrive/ActigraphyTransformer/A-NEW/PAT Experiments /PAT Conv Pretraining/Encoders"

In [None]:
"""
Please Fill out Parameters Below
"""
## Model size
# One of "small", "medium", "large", "huge"; selects the architecture and
# training presets defined in the cells below.
size = "small"

## Mask ratio
# Fraction of patches hidden from the encoder, e.g. .25, .50, .75 (currently .90).
mask_ratio = 0.90

## Smoothing
# True loads the smoothed parquet dataset, False loads the raw one.
smoothing = False

## Loss Function
# True: MSE on only the masked portion; False: MSE over the whole reconstruction.
mse_only_masked = False

In [None]:
# Model naming
# The encoder file name encodes the run configuration:
#   /conv_encoder_<size>_<mask %>_<smoothed|unsmoothed>_<mse_only_masked|mse_all>.h5
#
# round() before int(): mask_ratio * 100 is a float product, so a value such as
# 0.29 evaluates to 28.999999999999996 and a bare int() would truncate it to 28.
mask_name = int(round(mask_ratio * 100))

name = f"/conv_encoder_{size}_{mask_name}"

if smoothing == True:
  name = f"{name}_smoothed"
else:
  name = f"{name}_unsmoothed"

if mse_only_masked == True:
  name = f"{name}_mse_only_masked.h5"
else:
  name = f"{name}_mse_all.h5"

In [None]:
# Show the file name (relative to `root`) the encoder will be saved under.
print(name)

# Hyperparameter Additional Info

In [None]:
"""
Model Size
"""
## Model Size
# Architecture presets. Only three values differ between sizes; everything
# else is shared and assigned below.
#   size -> (patch_size, attention heads for encoder and decoder, encoder layers)
_MODEL_SIZE_PRESETS = {
    "small":  (18, 6, 1),
    "medium": (18, 12, 2),
    "large":  (9, 12, 4),
    "huge":   (5, 12, 8),
}

# Fail fast on a typo: previously an unknown size defined nothing here, so later
# cells either hit a NameError or silently reused values from an earlier run.
if size not in _MODEL_SIZE_PRESETS:
    raise ValueError(
        f"Unknown model size {size!r}; expected one of {sorted(_MODEL_SIZE_PRESETS)}"
    )

patch_size, _num_heads, encoder_num_layers = _MODEL_SIZE_PRESETS[size]
embed_dim = 96

# encoder
encoder_num_heads = _num_heads
encoder_ff_dim = 256
encoder_rate = 0.1

# decoder (always a single layer, same head count as the encoder)
decoder_num_heads = _num_heads
decoder_ff_dim = 256
decoder_num_layers = 1
decoder_rate = 0.1

In [None]:
"""
For Pretraining
"""

## Model Size
# Optimisation presets per model size:
#   size -> (learning_rate, early_stopping_patience, reduce_lr_patience, min_lr)
_TRAINING_PRESETS = {
    "small":  (0.001, 250, 75, 1e-4),
    "medium": (0.001, 250, 75, 1e-4),
    "large":  (0.0001, 500, 100, 1e-5),
    "huge":   (0.00001, 500, 100, 1e-6),
}

# Fail fast on a typo: previously an unknown size defined nothing here, so the
# training cell either hit a NameError or reused values from an earlier run.
if size not in _TRAINING_PRESETS:
    raise ValueError(
        f"Unknown model size {size!r}; expected one of {sorted(_TRAINING_PRESETS)}"
    )

learning_rate, early_stopping_patience, reduce_lr_patience, min_lr = _TRAINING_PRESETS[size]

In [None]:
"""
Smoothing
"""
# Pick the parquet file that matches the `smoothing` flag, then load it once.
_smoothed_dataset_path = '/content/drive/MyDrive/ActigraphyTransformer/A-NEW/Data Preprocessing/SelfSupervised Datasets/Smooth/[SelfSupervised][Smooth]WideSeqnActi_AndMeds_2013.parq'
_raw_dataset_path = '/content/drive/MyDrive/ActigraphyTransformer/A-NEW/Data Preprocessing/SelfSupervised Datasets/Raw/[SelfSupervised][Raw]WideSeqnActi_AndMeds_2013.parq'

data = pd.read_parquet(_smoothed_dataset_path if smoothing == True else _raw_dataset_path)

# Process Data

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 0.5% of the rows as a small test set (fixed seed, shuffled).
X_train, X_test = train_test_split(
    np.array(data),
    test_size=0.005,
    random_state=19,
    shuffle=True,
)

# Record the dimensions of the 2-D (participants, timesteps) arrays.
n_participants_train, n_timesteps = X_train.shape
n_participants_test = X_test.shape[0]
n_features = 1

# Append a trailing feature axis: (participants, timesteps, n_features).
X_train = X_train.reshape((n_participants_train, n_timesteps, n_features))
X_test = X_test.reshape((n_participants_test, n_timesteps, n_features))

In [None]:
# Expected layout after the reshape above: (n_participants_train, n_timesteps, 1).
X_train.shape

In [None]:
# Expected layout after the reshape above: (n_participants_test, n_timesteps, 1).
X_test.shape

# Autoencoder


# MAE

In [None]:
import tensorflow as tf
from tensorflow.keras import layers, models

# Modified Transformer Block to output attention weights with explicit layer names
def TransformerBlock(embed_dim, num_heads, ff_dim, rate=0.1, name_prefix="encoder"):
    """Build one post-norm transformer block as a standalone Keras model.

    Args:
        embed_dim: Width of the token embeddings; also used as the attention key_dim.
        num_heads: Number of attention heads.
        ff_dim: Hidden width of the feed-forward sub-layer.
        rate: Dropout rate applied after attention and after the feed-forward net.
        name_prefix: Prefix for every layer name and for the returned model's name.

    Returns:
        A Keras model named "<name_prefix>_transformer" mapping a
        (batch, seq, embed_dim) input to [block_output, attention_weights].
    """
    tokens = layers.Input(shape=(None, embed_dim), name=f"{name_prefix}_input")

    # Self-attention sub-layer (query and value are the same tensor).
    self_attention = layers.MultiHeadAttention(
        num_heads=num_heads,
        key_dim=embed_dim,
        name=f"{name_prefix}_attention",
    )
    attended, attention_weights = self_attention(tokens, tokens, return_attention_scores=True)
    attended = layers.Dropout(rate, name=f"{name_prefix}_dropout")(attended)

    # Residual connection followed by layer norm.
    attention_normed = layers.LayerNormalization(epsilon=1e-6, name=f"{name_prefix}_norm1")(tokens + attended)

    # Position-wise feed-forward sub-layer.
    hidden = layers.Dense(ff_dim, activation="relu", name=f"{name_prefix}_ff1")(attention_normed)
    projected = layers.Dense(embed_dim, name=f"{name_prefix}_ff2")(hidden)
    projected = layers.Dropout(rate, name=f"{name_prefix}_dropout2")(projected)

    # Second residual connection followed by layer norm.
    block_output = layers.LayerNormalization(epsilon=1e-6, name=f"{name_prefix}_norm2")(attention_normed + projected)

    return models.Model(
        inputs=tokens,
        outputs=[block_output, attention_weights],
        name=f"{name_prefix}_transformer",
    )

# Custom Layer to create and apply the mask for MAE
class MaskLayer(layers.Layer):
    """Randomly split patch embeddings into a visible and a masked subset.

    A single random permutation of the patch positions is drawn per call and
    applied along axis 1, so every sample in the batch shares the same mask.
    The layer also owns the trainable `mask_token` weight of shape
    (1, 1, embed_dim).

    Args:
        mask_ratio: Fraction of patches to mask, in [0, 1].
        embed_dim: Width of the patch embeddings (and of `mask_token`).
    """

    def __init__(self, mask_ratio, embed_dim, **kwargs):
        super(MaskLayer, self).__init__(**kwargs)
        # An out-of-range ratio would make the slices in call() silently
        # produce a nonsensical mask, so reject it up front.
        if not 0.0 <= mask_ratio <= 1.0:
            raise ValueError(f"mask_ratio must be within [0, 1], got {mask_ratio!r}")
        self.mask_ratio = mask_ratio
        self.embed_dim = embed_dim

    def build(self, input_shape):
        self.mask_token = self.add_weight(
            shape=(1, 1, self.embed_dim),
            initializer='random_normal',
            trainable=True,
            name='mask_token'
        )

    def call(self, patch_embeddings, positional_embeddings):
        """Shuffle the patch positions and split them by `mask_ratio`.

        Args:
            patch_embeddings: Tensor indexed as (batch, num_patches, ...).
            positional_embeddings: Tensor whose axis 0 is the patch position.

        Returns:
            Tuple of (visible_patches, masked_patches,
            visible_positional_embeddings, shuffled_indices, masked_indices).
            `shuffled_indices` lists the masked positions first, then the
            visible ones.
        """
        num_patches = tf.shape(patch_embeddings)[1]

        shuffled_indices = tf.random.shuffle(tf.range(num_patches))
        num_masked = tf.cast(tf.math.round(self.mask_ratio * tf.cast(num_patches, tf.float32)), tf.int32)

        masked_indices = shuffled_indices[:num_masked]
        visible_indices = shuffled_indices[num_masked:]

        visible_patches = tf.gather(patch_embeddings, indices=visible_indices, axis=1)
        masked_patches = tf.gather(patch_embeddings, indices=masked_indices, axis=1)
        visible_positional_embeddings = tf.gather(positional_embeddings, indices=visible_indices, axis=0)

        return visible_patches, masked_patches, visible_positional_embeddings, shuffled_indices, masked_indices

    def get_config(self):
        """Return the constructor arguments so the layer can be serialized and cloned."""
        config = super(MaskLayer, self).get_config()
        config.update({"mask_ratio": self.mask_ratio, "embed_dim": self.embed_dim})
        return config

# Custom Layer to create mask tokens for the decoder
class MaskTokenLayer(layers.Layer):
    """Tile a single mask token so there is one copy per masked patch."""

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    def call(self, mask_token, masked_patches):
        """Repeat `mask_token` along axes 0 and 1 to match `masked_patches`.

        Only the first two dimensions of `masked_patches` are read; its
        values are not used.
        """
        masked_shape = tf.shape(masked_patches)
        batch_size = masked_shape[0]
        num_masked = masked_shape[1]
        return tf.tile(mask_token, [batch_size, num_masked, 1])

# Custom Layer to concatenate tensors
class ConcatLayer(layers.Layer):
    """Concatenate a list of tensors along a fixed axis.

    Args:
        axis: Axis passed to `tf.concat`.
    """

    def __init__(self, axis, **kwargs):
        super(ConcatLayer, self).__init__(**kwargs)
        self.axis = axis

    def call(self, inputs):
        """Return `tf.concat(inputs, axis=self.axis)`."""
        return tf.concat(inputs, axis=self.axis)

    def get_config(self):
        """Return the constructor arguments so the layer can be serialized and cloned.

        Without this, rebuilding the layer from its config fails because the
        required `axis` argument is missing.
        """
        config = super(ConcatLayer, self).get_config()
        config.update({"axis": self.axis})
        return config

# Unshuffling Layer to revert the shuffle applied during masking
class UnshuffleLayer(layers.Layer):
    """Undo a permutation that was applied along axis 1 of a tensor."""

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    def call(self, patches, shuffle_indices):
        """Reorder `patches` along axis 1 using the inverse of `shuffle_indices`.

        The inverse is built by scattering 0..n-1 into the slots named by
        `shuffle_indices`, i.e. inverse[shuffle_indices[k]] = k.
        """
        length = tf.shape(shuffle_indices)[0]
        scatter_targets = tf.expand_dims(shuffle_indices, axis=-1)
        inverse_permutation = tf.scatter_nd(scatter_targets, tf.range(length), [length])
        return tf.gather(patches, indices=inverse_permutation, axis=1)

# Sine/Cosine positional embeddings
def get_positional_embeddings(num_patches, embed_dim):
    """Return fixed sine/cosine positional embeddings of shape (num_patches, embed_dim).

    The sine terms fill the first half of the feature axis and the cosine
    terms the second half (they are concatenated, not interleaved).

    Args:
        num_patches: Number of positions to encode.
        embed_dim: Width of each embedding.

    Returns:
        A float32 tensor of shape (num_patches, embed_dim).
    """
    position = tf.range(num_patches, dtype=tf.float32)[:, tf.newaxis]
    div_term = tf.exp(tf.range(0, embed_dim, 2, dtype=tf.float32) * (-tf.math.log(10000.0) / embed_dim))
    angles = position * div_term
    pos_embeddings = tf.concat([tf.sin(angles), tf.cos(angles)], axis=-1)
    # For an odd embed_dim the concat is one column too wide; trim it so the
    # result can always be added to (…, embed_dim) patch embeddings.
    # This is a no-op when embed_dim is even.
    return pos_embeddings[:, :embed_dim]

# Custom Layer to calculate MSE only on masked portions
class MaskedMSELayer(layers.Layer):
    """Mean-squared reconstruction error, optionally restricted to masked patches."""

    def __init__(self, **kwargs):
        super(MaskedMSELayer, self).__init__(**kwargs)

    def call(self, y_true, y_pred, masked_indices, mse_only_masked, patch_size=None):
        """Compute the scalar MSE loss.

        Args:
            y_true: Target signal, indexed as (batch, timesteps[, 1]).
            y_pred: Reconstruction with the same number of elements as `y_true`.
            masked_indices: 1-D tensor of masked *patch* indices.
            mse_only_masked: Python bool. False averages over every element;
                True restricts the average to the masked selection.
            patch_size: Optional patch length. When given, the signals are
                viewed as (batch, num_patches, patch_size) and whole masked
                patches are selected along axis 1.

        Returns:
            Scalar tensor with the mean squared error.

        NOTE(review): without `patch_size` the masked branch keeps the legacy
        behaviour of indexing the batch-flattened signal with patch indices,
        which selects individual elements near the start of the first sample
        rather than the masked patches. Pass `patch_size` to get a true
        masked-patch loss.
        """
        if not mse_only_masked:
            return tf.reduce_mean(tf.square(y_true - y_pred))

        if patch_size is not None:
            # Patch view: axis 1 is the patch index, so masked_indices select
            # complete patches for every sample in the batch.
            true_patches = tf.reshape(y_true, [tf.shape(y_true)[0], -1, patch_size])
            pred_patches = tf.reshape(y_pred, [tf.shape(y_pred)[0], -1, patch_size])
            y_true_masked = tf.gather(true_patches, masked_indices, axis=1)
            y_pred_masked = tf.gather(pred_patches, masked_indices, axis=1)
        else:
            # Legacy path, kept for callers that do not pass patch_size.
            y_true_flat = tf.reshape(y_true, [-1])
            y_pred_flat = tf.reshape(y_pred, [-1])
            y_true_masked = tf.gather(y_true_flat, masked_indices)
            y_pred_masked = tf.gather(y_pred_flat, masked_indices)

        return tf.reduce_mean(tf.square(y_true_masked - y_pred_masked))

# Model creation function
def create_model(input_size=10080, patch_size=patch_size, embed_dim=embed_dim,
                 encoder_num_heads=encoder_num_heads, encoder_ff_dim=encoder_ff_dim, encoder_num_layers=encoder_num_layers, encoder_rate=encoder_rate,
                 decoder_num_heads=decoder_num_heads, decoder_ff_dim=decoder_ff_dim, decoder_num_layers=decoder_num_layers, decoder_rate=decoder_rate,
                 mask_ratio=mask_ratio, mse_only_masked=mse_only_masked, return_attention=False):
    """Build the masked autoencoder with a Conv1D patch embedding.

    Pipeline: split the input into patches -> Conv1D + average pooling per
    patch -> randomly mask patches -> encode the visible patches -> append
    mask tokens and restore patch order -> decode -> dense projection back to
    `patch_size` values per patch (tanh scaled by 2) -> flatten.

    The reconstruction loss is attached with `add_loss`, so the returned model
    is compiled without a loss argument.

    Args:
        input_size: Number of timesteps per sample; must be a multiple of `patch_size`.
        patch_size: Timesteps per patch.
        embed_dim: Patch embedding width.
        encoder_num_heads, encoder_ff_dim, encoder_num_layers, encoder_rate:
            Encoder transformer-block settings.
        decoder_num_heads, decoder_ff_dim, decoder_num_layers, decoder_rate:
            Decoder transformer-block settings.
        mask_ratio: Fraction of patches hidden from the encoder.
        mse_only_masked: If True the loss covers only the masked patches,
            otherwise the whole reconstruction.
        return_attention: If True the model additionally outputs the attention
            weights of every encoder layer.

    Returns:
        A Keras model named "MAE_model", or "MAE_with_attention" when
        `return_attention` is True. Both carry the reconstruction loss.

    Raises:
        ValueError: If `input_size` is not divisible by `patch_size`.
    """
    if input_size % patch_size != 0:
        raise ValueError(
            f"input_size ({input_size}) must be divisible by patch_size ({patch_size})"
        )

    num_patches = input_size // patch_size
    inputs = layers.Input(shape=(input_size,), name="inputs")
    reshaped = layers.Reshape((num_patches, patch_size), name="reshape")(inputs)

    # Add a channel dimension: (num_patches, patch_size, 1)
    patches_with_channel = layers.Reshape((num_patches, patch_size, 1), name="add_channel_dim")(reshaped)

    # Apply Conv1D to each patch individually using TimeDistributed
    conv_layer = layers.TimeDistributed(
        layers.Conv1D(
            filters=embed_dim,
            kernel_size=3,
            padding='same',
            activation='relu'
        ),
        name="conv1d_patch_embedding"
    )

    # Output shape: (num_patches, patch_size, embed_dim)
    conv_output = conv_layer(patches_with_channel)

    # Pool over the patch dimension to get a fixed-size embedding per patch
    pooling_layer = layers.TimeDistributed(
        layers.GlobalAveragePooling1D(),
        name="global_avg_pooling"
    )

    # Output shape: (num_patches, embed_dim)
    patch_embeddings = pooling_layer(conv_output)

    positional_embeddings = get_positional_embeddings(num_patches, embed_dim)

    mask_layer = MaskLayer(mask_ratio, embed_dim, name="mask_layer")
    visible_patches, masked_patches, visible_positional_embeddings, shuffle_indices, masked_indices = mask_layer(patch_embeddings, positional_embeddings)

    # Encoder: sees only the visible patches.
    x = visible_patches + visible_positional_embeddings
    attention_weights = []

    for i in range(encoder_num_layers):
        x, weights = TransformerBlock(embed_dim, encoder_num_heads, encoder_ff_dim, encoder_rate, name_prefix=f"encoder_layer_{i+1}")(x)
        if return_attention:
            attention_weights.append(weights)

    mask_token_layer = MaskTokenLayer(name="mask_token_layer")
    mask_tokens = mask_token_layer(mask_layer.mask_token, masked_patches)

    decoder_input = ConcatLayer(axis=1, name="concat_layer")([x, mask_tokens])

    unshuffle_layer = UnshuffleLayer(name="unshuffle_layer")
    decoder_input_unshuffled = unshuffle_layer(decoder_input, shuffle_indices)

    decoder_input_with_pos = decoder_input_unshuffled + positional_embeddings

    # Decoder: sized by decoder_ff_dim (this previously used encoder_ff_dim,
    # so the decoder_ff_dim argument was silently ignored).
    y = decoder_input_with_pos
    for i in range(decoder_num_layers):
        y, _ = TransformerBlock(embed_dim, decoder_num_heads, decoder_ff_dim, decoder_rate, name_prefix=f"decoder_layer_{i+1}")(y)

    patch_predictions = layers.Dense(patch_size, activation='tanh', name="decoder_dense")(y)
    patch_predictions = 2 * patch_predictions #nothing above 2 standard deviations we'll just categorize as the same
    outputs = layers.Reshape((input_size,), name="decoder_reshape")(patch_predictions)

    masked_mse_layer = MaskedMSELayer(name="masked_mse_layer")
    if mse_only_masked:
        # masked_indices are patch indices, so select whole patches along
        # axis 1 of the (batch, num_patches, patch_size) views and average over
        # just those. Indexing the flattened signal with patch indices, as was
        # done before, picks unrelated individual timesteps.
        true_masked = tf.gather(reshaped, masked_indices, axis=1)
        pred_masked = tf.gather(patch_predictions, masked_indices, axis=1)
        loss = masked_mse_layer(true_masked, pred_masked, masked_indices, False)
    else:
        loss = masked_mse_layer(inputs, outputs, masked_indices, mse_only_masked)

    if return_attention:
        model = models.Model(inputs=inputs, outputs=[outputs] + attention_weights, name="MAE_with_attention")
    else:
        model = models.Model(inputs=inputs, outputs=outputs, name="MAE_model")

    # Attach the loss to whichever model is returned; the attention variant
    # used to be returned without it.
    model.add_loss(loss)
    return model


# Build and compile inside the distribution strategy's scope.
# No loss is passed to compile(): the model carries its own loss via add_loss.
with strategy.scope():
    model = create_model()

    adam_optimizer = tf.keras.optimizers.Adam(
        learning_rate=learning_rate,
        beta_1=0.9,
        beta_2=0.999,
        epsilon=1e-07,
        amsgrad=False,
    )
    model.compile(optimizer=adam_optimizer)

model.summary()

# Train Model

In [None]:
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau, TensorBoard

# Define your custom callback
class CustomCallback(tf.keras.callbacks.Callback):
    """Keras callback that invokes a user function every `frequency` epochs.

    Args:
        function: Callable taking (epoch_number, logs), where epoch_number is
            1-based.
        frequency: Call `function` whenever the 1-based epoch number is a
            multiple of this value.
    """

    def __init__(self, function, frequency):
        super().__init__()
        self.function = function
        self.frequency = frequency

    def on_epoch_end(self, epoch, logs=None):
        """Run the user function on every `frequency`-th completed epoch."""
        completed_epochs = epoch + 1  # Keras passes a 0-based epoch index
        if completed_epochs % self.frequency != 0:
            return
        self.function(completed_epochs, logs)

# Define your custom function
def my_custom_function(epoch, logs):
    """Plot the first test sample against its reconstruction and save the figure.

    Reads the notebook globals `model` and `X_test`. The figure is written to
    `output_epoch_<epoch, zero-padded to 6 digits>.png` in the working directory.

    Args:
        epoch: 1-based epoch number, used in the title and the file name.
        logs: Keras logs dict; accepted for the callback signature but unused.
    """
    print(f"\n plotting.. output {epoch}")

    # Reconstruct X_test[0]; reshape to a batch of one and flatten the result.
    sample = X_test[0]
    reconstruction = model.predict(sample.reshape(1, -1, 1)).flatten()

    # Overlay the reconstruction on the input.
    fig, ax = plt.subplots(figsize=(12, 6))
    ax.plot(sample, label='Input')
    ax.plot(reconstruction, label='Output', color='orange')
    ax.set_title(f'Input and Output Data at Epoch {epoch}')
    ax.set_xlabel('Time Steps')
    ax.set_ylabel('Value')
    ax.legend()

    fig.tight_layout()

    # Zero-padded epoch number keeps the saved files in order when sorted by name.
    fig.savefig(f'output_epoch_{epoch:06d}.png')
    plt.show()
    plt.close(fig)


# Define the callbacks
# Stop when val_loss has not improved for `early_stopping_patience` epochs
# and roll back to the best weights.
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=early_stopping_patience,
    restore_best_weights=True,
    verbose=1,
)
# Halve the learning rate after `reduce_lr_patience` stagnant epochs, down to `min_lr`.
reduce_lr = ReduceLROnPlateau(
    monitor='val_loss',
    factor=0.5,
    patience=reduce_lr_patience,
    min_lr=min_lr,
    verbose=1,
)
# tensorboard = TensorBoard(log_dir='./logs', histogram_freq=1, write_graph=True, write_images=True)
# Plot a reconstruction of X_test[0] every 10 epochs.
custom_callback = CustomCallback(function=my_custom_function, frequency=10)

training_callbacks = [early_stopping, reduce_lr, custom_callback]

# Train as an autoencoder: the input is also passed as the target.
with strategy.scope():
    history = model.fit(
        X_train, X_train,
        epochs=10000,
        batch_size=128,
        validation_split=0.1,
        verbose=1,
        callbacks=training_callbacks,
    )

# Save model

In [None]:
# Function to save just the encoder part of the model
def save_encoder_only(model, encoder_num_layers, embed_dim, save_path=root+name):
    """Extract the encoder from a trained MAE model and save it.

    The encoder reuses the trained Conv1D patch embedding, the pooling layer
    and the encoder transformer blocks, but bypasses the mask layer, so it
    encodes every patch of the input.

    Args:
        model: Trained model containing the layers "conv1d_patch_embedding",
            "global_avg_pooling" and "encoder_layer_<i>_transformer".
        encoder_num_layers: Number of encoder transformer blocks to chain.
        embed_dim: Patch embedding width, used for the positional embeddings.
        save_path: Destination passed to `Model.save`.

    Raises:
        ValueError: If the number of patches is not statically known.
    """
    encoder_input = model.input

    # Trained patch embedding: Conv1D per patch, then average pooling per patch.
    conv_features = model.get_layer(name="conv1d_patch_embedding").output
    patch_embeddings = model.get_layer(name="global_avg_pooling")(conv_features)

    # Take the patch count from the model itself rather than assuming a
    # 10080-step input and the notebook-global patch_size, so the function
    # also works for models built with other sizes.
    num_patches = patch_embeddings.shape[1]
    if num_patches is None:
        raise ValueError("Cannot infer the number of patches from the model's patch embeddings")
    positional_embeddings = get_positional_embeddings(num_patches, embed_dim)

    x = patch_embeddings + positional_embeddings

    # Chain the trained encoder blocks, collecting each layer's attention weights.
    attention_weights = []
    for i in range(encoder_num_layers):
        transformer_block = model.get_layer(name=f"encoder_layer_{i+1}_transformer")
        x, weights = transformer_block(x)
        attention_weights.append(weights)

    # Outputs: [encoded patches, attention weights of layer 1, layer 2, ...]
    encoder_model = models.Model(inputs=encoder_input, outputs=[x] + attention_weights, name="encoder_model")
    encoder_model.save(save_path)
    print(f"Encoder model saved to {save_path}")

# Save the encoder model
# Uses the default save_path, i.e. root + name as they were when save_encoder_only was defined.
save_encoder_only(model, encoder_num_layers, embed_dim=embed_dim)

In [None]:
# Reload the encoder from root + name, the same path save_encoder_only writes to by default.
encoder_model = tf.keras.models.load_model(root+name, custom_objects={'TransformerBlock': TransformerBlock, 'get_positional_embeddings': get_positional_embeddings})

In [None]:
# Print the layer-by-layer summary of the reloaded encoder.
encoder_model.summary()

In [None]:
import matplotlib.pyplot as plt
import numpy as np
from tensorflow.keras import Model

def plot_input_mask_output(model, X_test, num_samples=20, patch_size=patch_size):
    """
    Creates a separate figure for each input from X_test, showing the original
    input, the input with the masked patches covered, the model's output, and
    a combination of original and output data, stacked vertically.

    The mask and the reconstruction are taken from the same forward pass, so
    the highlighted patches are the ones the model actually had hidden.

    Parameters:
    - model: The trained model; must contain a layer named 'mask_layer'.
    - X_test: Test data, shape should be (num_samples, input_size, 1).
    - num_samples: Number of samples to visualize; capped at len(X_test).
    - patch_size: Size of the patches used by the model.
    """
    # Sub-model that exposes the mask layer's outputs next to the reconstruction.
    intermediate_model = Model(inputs=model.input,
                               outputs=[model.get_layer('mask_layer').output, model.output])

    # Never index past the end of X_test (previously an IndexError when
    # num_samples exceeded the number of test rows).
    num_samples = min(num_samples, len(X_test))

    for i in range(num_samples):
        original_input = X_test[i].flatten()  # Flatten the original input for easy handling

        # One forward pass gives both the mask and the reconstruction.
        mask_layer_output, model_output = intermediate_model.predict(X_test[i:i+1])
        _, _, _, _, masked_indices = mask_layer_output
        masked_indices = np.array(masked_indices)
        model_output = model_output.flatten()

        # Timestep range [start, end) covered by each masked patch.
        masked_spans = [(int(idx) * patch_size, (int(idx) + 1) * patch_size) for idx in masked_indices]

        # Original signal with the masked patches replaced by the model output.
        combined_data = original_input.copy()
        for start, end in masked_spans:
            combined_data[start:end] = model_output[start:end]

        # Create a new figure for each input
        fig, axes = plt.subplots(4, 1, figsize=(30, 20))

        # Plot original input
        axes[0].plot(original_input, color='blue')
        axes[0].set_title('Original Input')

        # Plot the input and overlay gray highlight on non-visible patches
        axes[1].plot(original_input, color='blue')
        for start, end in masked_spans:
            axes[1].axvspan(start, end, color='gray', zorder=10)  # Gray overlay in the foreground with high zorder
        axes[1].set_title('Input with Non-Visible Patches Covered')

        # Plot model output
        axes[2].plot(original_input, color='lightgray', linestyle='dashed')
        axes[2].plot(model_output, color='green')
        axes[2].set_title('Model Output')

        # Plot combined data with original input and light red background for masked areas
        axes[3].plot(original_input, color='lightgray', linestyle='dashed')
        for start, end in masked_spans:
            axes[3].axvspan(start, end, color='lightcoral', alpha=1, zorder=1)  # Light red background
        axes[3].plot(combined_data, color='green')
        axes[3].set_title('Combined Data with Original and Masked Highlight')

        # Adjust layout and show the plot
        plt.tight_layout()
        plt.show()
        # Release the figure so looping over many samples does not accumulate memory.
        plt.close(fig)


In [None]:
# Plot results: one four-panel figure per test sample.
# NOTE(review): num_samples=50 assumes X_test has at least 50 rows — confirm against X_test.shape.
plot_input_mask_output(model, X_test, patch_size=patch_size, num_samples=50)

In [None]:
#inspect the data rq