In [405]:
import os
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder

In [421]:
# Encoding legend for translation after prediction
encoding_legend = {
    1: 'MRI_CCS_11', 2: 'MRI_EXU_95', 3: 'MRI_FRR_18', 4: 'MRI_FRR_257',
    5: 'MRI_FRR_264', 6: 'MRI_FRR_3', 7: 'MRI_FRR_34', 8: 'MRI_MPT_1005',
    9: 'MRI_MSR_100', 10: 'MRI_MSR_104', 11: 'MRI_MSR_21', 12: 'MRI_MSR_34',
    0: 'PADDED',  # Add a padding category
    10: 'START',  # Start token
    9: 'END'      # End token
}

In [407]:
# Directory containing the 150 CSV data blocks
data_directory = "../data/filtered_blocks_padded/"

In [422]:
# Preprocessing function to load and prepare data blocks from CSV files
def preprocess_blocks_from_directory(directory):
    all_blocks = []
    all_source_ids = np.array(list(encoding_legend.keys())).reshape(-1, 1)
    encoder = OneHotEncoder(sparse=False)
    encoder.fit(all_source_ids)  # Fit once on all possible sourceIDs
    
    for filename in os.listdir(directory):
        if filename.endswith(".csv"):
            file_path = os.path.join(directory, filename)
            block = pd.read_csv(file_path)

            assert block.shape == (36, 5), f"Block {filename} has an unexpected shape {block.shape}"

            source_ids = block[['sourceID']].values.reshape(-1, 1)
            one_hot_encoded_sourceID = encoder.transform(source_ids)

            timediff = block[['timediff']].values
            ptab = np.nan_to_num(block[['PTAB']].values)
            timediff = (timediff - np.mean(timediff)) / np.std(timediff)  # Normalize all rows
            ptab = (ptab - np.mean(ptab)) / np.std(ptab)  # Normalize all rows
            
            body_group_from = block[['BodyGroup_from']].values
            body_group_to = block[['BodyGroup_to']].values

            # Adding start and end tokens to the sequence
            X_block = np.concatenate((one_hot_encoded_sourceID, timediff, ptab, body_group_from, body_group_to), axis=1)
            X_block_with_tokens = np.vstack((np.zeros((1, X_block.shape[1])), X_block, np.zeros((1, X_block.shape[1]))))  # Start and end tokens
            all_blocks.append(X_block_with_tokens)

    all_blocks = np.stack(all_blocks, axis=0)  # Shape: (num_blocks, 38, feature_size)
    return all_blocks, encoder

# Preprocess the blocks from the directory
data_blocks, encoder = preprocess_blocks_from_directory(data_directory)

# Convert processed blocks to tensor
X_train = torch.tensor(data_blocks, dtype=torch.float32)

# Model input shape: (batch_size, sequence_length, feature_size)
print(f"Processed input shape (X_train): {X_train.shape}")



Processed input shape (X_train): torch.Size([150, 38, 17])


In [423]:
class TransformerModel(nn.Module):
    def __init__(self, input_size, d_model=128, nhead=8, num_encoder_layers=6):
        super(TransformerModel, self).__init__()
        self.embedding = nn.Linear(input_size, d_model)
        self.transformer = nn.Transformer(d_model=d_model, nhead=nhead, num_encoder_layers=num_encoder_layers, batch_first=True)
        self.fc_out = nn.Linear(d_model, input_size)

    def forward(self, src, tgt=None, teacher_forcing_ratio=0.5):
        src_emb = self.embedding(src)
        
        if tgt is not None and torch.rand(1).item() < teacher_forcing_ratio:
            tgt_emb = self.embedding(tgt)
            output = self.transformer(src_emb, tgt_emb)
        else:
            output = self.transformer(src_emb, src_emb)  # Use src as tgt

        return self.fc_out(output)

In [424]:
# Initialize model
input_size = X_train.shape[2]
model = TransformerModel(input_size)
print("Transformer model initialized.")

Transformer model initialized.


In [425]:
# Step 3: Training loop setup
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)
num_epochs = 100

In [429]:
for epoch in range(num_epochs):
    model.train()
    optimizer.zero_grad()

    # Prepare the input and target data with start and end tokens
    src = X_train[:, :-1, :]  # Source: All but the last time step
    tgt = X_train[:, 1:, :]    # Target: All but the first time step

    # Forward pass with teacher forcing
    output = model(src, tgt=tgt, teacher_forcing_ratio=0.5)
    
    # Flatten for cross-entropy
    output_flat = output.reshape(-1, input_size)  # Use reshape here
    target_flat = tgt.reshape(-1, input_size)      # Use reshape here

    # Calculate loss
    loss = criterion(output_flat, target_flat)

    # Backward pass
    loss.backward()
    optimizer.step()

    if (epoch + 1) % 10 == 0:
        print(f"Epoch [{epoch + 1}/{num_epochs}], Loss: {loss.item()}")


Epoch [10/100], Loss: 7.281587600708008
Epoch [20/100], Loss: 6.8644938468933105
Epoch [30/100], Loss: 6.831202030181885
Epoch [40/100], Loss: 6.805388450622559
Epoch [50/100], Loss: 6.807685852050781
Epoch [60/100], Loss: 6.799750328063965
Epoch [70/100], Loss: 6.80089807510376
Epoch [80/100], Loss: 6.796604156494141
Epoch [90/100], Loss: 6.790532112121582
Epoch [100/100], Loss: 6.793975353240967


In [427]:
# Step 5: Predict cloned sequence
model.eval()
with torch.no_grad():
    prediction = model(X_train)

print("Predicted (cloned) data block: \n", prediction)

Predicted (cloned) data block: 
 tensor([[[ 1.2248, -0.2410, -0.2538,  ..., -0.6112, -0.6305, -0.1204],
         [ 1.1301, -0.2366, -0.3000,  ..., -0.6501, -0.6588, -0.1044],
         [ 1.1371, -0.2254, -0.3106,  ..., -0.6334, -0.6828, -0.0684],
         ...,
         [ 1.1945, -0.2190, -0.3097,  ..., -0.6167, -0.6305, -0.0748],
         [ 1.1945, -0.2190, -0.3097,  ..., -0.6167, -0.6305, -0.0748],
         [ 1.2248, -0.2410, -0.2538,  ..., -0.6112, -0.6305, -0.1204]],

        [[ 1.1532, -0.1896, -0.3096,  ..., -0.6297, -0.6930, -0.2331],
         [ 1.0652, -0.1876, -0.3523,  ..., -0.6704, -0.7301, -0.2277],
         [ 1.0748, -0.1685, -0.3631,  ..., -0.6468, -0.7519, -0.1770],
         ...,
         [ 1.1386, -0.1695, -0.3611,  ..., -0.6325, -0.6962, -0.1852],
         [ 1.1386, -0.1695, -0.3611,  ..., -0.6325, -0.6962, -0.1852],
         [ 1.1532, -0.1896, -0.3096,  ..., -0.6297, -0.6930, -0.2331]],

        [[ 1.2306, -0.2353, -0.2576,  ..., -0.6039, -0.6234, -0.1055],
         [ 1

In [414]:
# Step 5: Predict cloned sequence
model.eval()
with torch.no_grad():
    prediction = model(X_train)
    
print("Predicted (cloned) data block: \n", prediction)

Predicted (cloned) data block: 
 tensor([[[ 2.8260,  0.4137, -1.6006,  ..., -3.5471,  3.3198,  4.9645],
         [ 2.8260,  0.4137, -1.6006,  ..., -3.5471,  3.3198,  4.9645],
         [ 2.8260,  0.4137, -1.6006,  ..., -3.5471,  3.3198,  4.9645],
         ...,
         [ 2.8260,  0.4137, -1.6006,  ..., -3.5471,  3.3198,  4.9645],
         [ 2.8260,  0.4137, -1.6006,  ..., -3.5471,  3.3198,  4.9645],
         [ 2.8260,  0.4137, -1.6006,  ..., -3.5471,  3.3198,  4.9645]],

        [[ 2.8260,  0.4137, -1.6006,  ..., -3.5471,  3.3198,  4.9645],
         [ 2.8260,  0.4137, -1.6006,  ..., -3.5471,  3.3198,  4.9645],
         [ 2.8260,  0.4137, -1.6006,  ..., -3.5471,  3.3198,  4.9645],
         ...,
         [ 2.8260,  0.4137, -1.6006,  ..., -3.5471,  3.3198,  4.9645],
         [ 2.8260,  0.4137, -1.6006,  ..., -3.5471,  3.3198,  4.9645],
         [ 2.8260,  0.4137, -1.6006,  ..., -3.5471,  3.3198,  4.9645]],

        [[ 2.8260,  0.4137, -1.6006,  ..., -3.5471,  3.3198,  4.9645],
         [ 2

In [428]:
# Decode the predicted output using the encoding legend
def decode_predictions(pred, encoder):
    pred_source_ids = np.argmax(pred.numpy()[:, :, :len(encoder.categories_[0])], axis=2)
    decoded_blocks = [[encoding_legend[id] for id in block] for block in pred_source_ids]
    return decoded_blocks

decoded_predictions = decode_predictions(prediction, encoder)
print(f"Decoded predicted source IDs: {decoded_predictions}")

Decoded predicted source IDs: [['PADDED', 'PADDED', 'PADDED', 'PADDED', 'PADDED', 'PADDED', 'PADDED', 'PADDED', 'PADDED', 'PADDED', 'PADDED', 'PADDED', 'PADDED', 'PADDED', 'PADDED', 'PADDED', 'PADDED', 'PADDED', 'PADDED', 'PADDED', 'PADDED', 'PADDED', 'PADDED', 'PADDED', 'PADDED', 'PADDED', 'PADDED', 'PADDED', 'PADDED', 'PADDED', 'PADDED', 'PADDED', 'PADDED', 'PADDED', 'PADDED', 'PADDED', 'PADDED', 'PADDED'], ['PADDED', 'PADDED', 'PADDED', 'PADDED', 'PADDED', 'PADDED', 'PADDED', 'PADDED', 'PADDED', 'PADDED', 'PADDED', 'PADDED', 'PADDED', 'PADDED', 'PADDED', 'PADDED', 'PADDED', 'PADDED', 'PADDED', 'PADDED', 'PADDED', 'PADDED', 'PADDED', 'PADDED', 'PADDED', 'PADDED', 'PADDED', 'PADDED', 'PADDED', 'PADDED', 'PADDED', 'PADDED', 'PADDED', 'PADDED', 'PADDED', 'PADDED', 'PADDED', 'PADDED'], ['PADDED', 'PADDED', 'PADDED', 'PADDED', 'PADDED', 'PADDED', 'PADDED', 'PADDED', 'PADDED', 'PADDED', 'PADDED', 'PADDED', 'PADDED', 'PADDED', 'PADDED', 'PADDED', 'PADDED', 'PADDED', 'PADDED', 'PADDED', 'PAD