# Small Language Model

## Set Up

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import numpy as np
from tqdm.notebook import tqdm
from transformers import GPT2Tokenizer 

In [2]:
# Initialize tokenizer
# Loads the pretrained "gpt2" tokenizer. Its vocabulary size is reused by later
# cells to size the embedding table and the output projection.

tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
vocab_size = tokenizer.vocab_size

print(vocab_size)

50257


## Dataset Preparation

In [3]:
# prepare the dataset
class TextDataset(Dataset):
    """Sliding-window next-token dataset over one tokenized text.

    Item ``i`` is the pair ``(x, y)`` where ``x`` holds the ``seq_length`` token
    ids starting at position ``i`` and ``y`` holds the same window shifted one
    token to the right, i.e. the next-token target for every position of ``x``.

    Args:
        text: Raw text; it is encoded once with ``tokenizer.encode``.
        seq_length: Number of tokens in every input/target window.
        tokenizer: Object exposing ``encode(text)`` (returning a sequence of
            token ids) and a ``vocab_size`` attribute.
    """

    def __init__(self, text, seq_length, tokenizer):
        self.seq_length = seq_length
        self.tokenizer = tokenizer
        self.data = tokenizer.encode(text)
        self.vocab_size = tokenizer.vocab_size

    def __len__(self):
        """Return the number of complete (input, target) windows.

        A window needs ``seq_length + 1`` tokens. Texts shorter than that yield
        an empty dataset rather than a negative length, which Python rejects.
        """
        return max(0, len(self.data) - self.seq_length)

    def __getitem__(self, index):
        """Return the ``(x, y)`` pair of ``torch.long`` tensors for window ``index``.

        Raises:
            IndexError: If ``index`` is outside ``[-len(self), len(self))``.
                Raising here is what stops plain iteration
                (``for x, y in dataset``); list slicing alone would silently
                return truncated windows forever.
        """
        num_windows = len(self)
        position = index + num_windows if index < 0 else index
        if not 0 <= position < num_windows:
            raise IndexError(
                f"index {index} is out of range for a dataset with {num_windows} windows"
            )

        stop = position + self.seq_length
        x = self.data[position:stop]
        y = self.data[position + 1:stop + 1]
        return torch.tensor(x, dtype=torch.long), torch.tensor(y, dtype=torch.long)

## Example steps

### Tokenize

In [4]:
# Tokenize a short prompt. return_tensors="pt" asks for PyTorch tensors; the
# printed shape shows a leading batch dimension of size 1.
x_text = "This thesis addresses the critical need"
x = tokenizer(x_text, return_tensors="pt")["input_ids"]
print(x, x.size())

tensor([[ 1212, 21554,  9405,   262,  4688,   761]]) torch.Size([1, 6])


### Embedding

In [5]:
# Look up one learnable embed_size-dimensional vector per token id.
embed_size = 64
embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embed_size)
x_emb = embedding(x)
print(x_emb.size())

torch.Size([1, 6, 64])


### Positional Encoding

In [6]:
# Learnable position table, initialised to zeros, with one row per position
# up to seq_length.
seq_length = 10
positional_encoding = nn.Parameter(torch.zeros((1, seq_length, embed_size)))
print(positional_encoding.size())

torch.Size([1, 10, 64])


In [7]:
# The prompt is shorter than the position table, so only the first
# x_emb.size(1) rows are added to the token embeddings.
x_emb_pos_encoding = x_emb + positional_encoding[:, :x_emb.size(1)]
print(x_emb_pos_encoding.size())

torch.Size([1, 6, 64])


## Transformer Layer

In [8]:
num_heads, hidden_dim = 4, 128

# One encoder block: multi-head self-attention with num_heads heads followed by
# a feed-forward network of width hidden_dim. batch_first=True means inputs are
# laid out as (batch, sequence, feature).
transformer_layer = nn.TransformerEncoderLayer(
    embed_size,
    num_heads,
    dim_feedforward=hidden_dim,
    batch_first=True,
)

In [9]:
# Total number of scalar parameters held by the encoder layer.
num_weights = sum(param.numel() for param in transformer_layer.parameters())
print(num_weights)

33472


In [10]:
# Causal attention mask: -inf strictly above the diagonal and 0 elsewhere, so
# a position cannot attend to the positions that follow it.
mask = torch.full((x_emb.size(1),) * 2, float("-inf")).triu(diagonal=1).to(x.device)
mask.shape

torch.Size([6, 6])

In [11]:
# Run the encoder layer with the causal mask applied to its self-attention.
x_contextual_emb = transformer_layer(x_emb_pos_encoding, src_mask=mask)
print(x_contextual_emb.size())

torch.Size([1, 6, 64])


### Final Layer

In [12]:
# Project every contextual embedding to one logit per vocabulary entry.
fc = nn.Linear(in_features=embed_size, out_features=vocab_size)
x_logits = fc(x_contextual_emb)
print(x_logits.size())

torch.Size([1, 6, 50257])


## Loss Calculation

In [13]:
# Training corpus: a single abstract, written as adjacent string literals that
# Python joins into one string (no newlines are inserted between the pieces).
text = (
    "This thesis addresses the critical need for effective Fault Detection and Isolation (FDI) in green hydrogen (GH2) production, "
    "a key player in mitigating the greenhouse effect. To tackle this challenge, this thesis introduces a hybrid strategy for FDI. "
    "Extensive reviews of FDI algorithms reveal a gap in existing literature, emphasizing accuracy but neglecting the need for labeled data. "
    "Additionally, explainability in Hybrid-FDI is often overlooked. The proposed hybrid approach aims to be efficient in data usage and "
    "explainable, leveraging physics-based models and Artificial Intelligence (AI). This study introduces Bond Graph-Convolutional Neural Net "
    "(BG-CNN), a novel hybrid FDI method addressing AI model training challenges for fault diagnosis. BG-CNN combines BG residual generation "
    "and CNN-based fault classification, particularly in scenarios with limited labeled data. Additionally, a Self-Supervised Learning (SSL) "
    "method enhances FDI in such situations. The study also discusses Bond Graph-eXplainable AI (BG-XAI), an occlusion-based method, "
    "emphasizing the importance of meaningful explanations for fault predictions, showcasing its effectiveness through visualizations. "
    "The BG-CNN method with SSL was employed for the FDI of the Proton Exchange Membrane (PEM) electrolyzer and railway tracks, "
    "surpassing the performance of traditional methods. Comparative analysis demonstrated the superior performance of the proposed method, "
    "particularly in scenarios with limited labeled data, outperforming state-of-the-art SSL methods. The BG-XAI method was used to "
    "provide explanations for predictions in accordance with structural analysis."
)

# Tokenize the corpus, then take the first seq_length ids as the input window
# and the same window shifted one token to the right as next-token targets.
tokenized_text = tokenizer(text, return_tensors='pt')["input_ids"]

seq_length = 10
x = tokenized_text[:, 0:seq_length]
y = tokenized_text[:, 1:seq_length + 1]

# Show the raw ids first, then the decoded strings, for inputs and targets.
for window in (x, y):
    print(window, window.shape)

for window in (x, y):
    print(tokenizer.batch_decode(window))

tensor([[ 1212, 21554,  9405,   262,  4688,   761,   329,  4050, 40050, 46254]]) torch.Size([1, 10])
tensor([[21554,  9405,   262,  4688,   761,   329,  4050, 40050, 46254,   290]]) torch.Size([1, 10])
['This thesis addresses the critical need for effective Fault Detection']
[' thesis addresses the critical need for effective Fault Detection and']


In [14]:
seq_length, batch_size = 10, 16

# Each example is a seq_length-token window of the corpus. shuffle=True draws
# the windows in a new random order every epoch.
dataset = TextDataset(text=text, seq_length=seq_length, tokenizer=tokenizer)
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

In [15]:
# Re-run the step-by-step pipeline on the current window x, then score the
# resulting logits against the shifted targets y.
x_emb = embedding(x)
x_emb_pos_encoding = x_emb + positional_encoding[:, :x_emb.size(1)]
mask = torch.full((x_emb.size(1),) * 2, float("-inf")).triu(diagonal=1).to(x.device)
x_contextual_emb = transformer_layer(x_emb_pos_encoding, src_mask=mask)
x_logits = fc(x_contextual_emb)

print(x_logits.shape)

loss_fn = nn.CrossEntropyLoss()

# Most likely token id at every position (the layers here are untrained).
print(x_logits.argmax(dim=-1))

# Merge the batch and sequence dimensions: the loss takes one row of logits
# and one target id per predicted token.
loss = loss_fn(x_logits.reshape(-1, dataset.vocab_size), y.reshape(-1))

print(loss)

torch.Size([1, 10, 50257])
tensor([[ 9385, 46044, 21010, 26934,  1942, 22825,  8601,  9184, 32086, 31447]])
tensor(11.0865, grad_fn=<NllLossBackward0>)


## Create the Model

In [16]:
# define the model
class TransformerLanguageModel(nn.Module):
    """Single-layer Transformer that maps token ids to per-position vocabulary logits.

    Args:
        vocab_size: Number of embedding rows and width of the output logits.
        embed_size: Embedding (model) dimension.
        num_heads: Number of attention heads in the encoder layer.
        hidden_dim: Width of the encoder layer's feed-forward network.
        seq_length: Number of positions in the learned positional table; inputs
            must not contain more tokens than this.
    """

    def __init__(self, vocab_size, embed_size, num_heads, hidden_dim, seq_length):
        super().__init__()
        # Token lookup table plus a zero-initialised, learnable position table.
        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embed_size)
        self.positional_encoding = nn.Parameter(torch.zeros((1, seq_length, embed_size)))

        self.encoder_layer = nn.TransformerEncoderLayer(
            embed_size,
            num_heads,
            dim_feedforward=hidden_dim,
            batch_first=True,
        )

        # Maps every contextual embedding to one logit per vocabulary entry.
        self.fc = nn.Linear(in_features=embed_size, out_features=vocab_size)

    def forward(self, x, mask=None):
        """Compute logits for a batch of token ids.

        Args:
            x: Integer tensor of token ids, indexed as (batch, tokens).
            mask: Optional attention mask, passed to the encoder layer as
                ``src_mask``.

        Returns:
            Tensor of logits with a trailing dimension of size ``vocab_size``.
        """
        num_tokens = x.shape[1]
        token_vectors = self.embedding(x)
        # Use only as many position rows as there are tokens in the input.
        position_vectors = self.positional_encoding[:, :num_tokens]
        hidden = self.encoder_layer(token_vectors + position_vectors, src_mask=mask)
        return self.fc(hidden)

## Train the Model

In [17]:
num_epochs, learning_rate = 20, 0.001

# Model, loss, optimizer
# The model reuses the sizes chosen in the step-by-step cells above.
model = TransformerLanguageModel(
    dataset.vocab_size, embed_size, num_heads, hidden_dim, seq_length
)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(params=model.parameters(), lr=learning_rate)

In [18]:
# The causal mask depends only on the window length, which is the same for
# every batch, so build it once instead of once per optimisation step.
mask = torch.triu(torch.full((seq_length, seq_length), float("-inf")), diagonal=1)

for epoch in tqdm(range(num_epochs)):
    model.train()
    total_loss = 0
    for x, y in dataloader:
        optimizer.zero_grad()
        mask = mask.to(x.device)  # returns the same tensor when already on that device
        output = model(x, mask)
        # Merge batch and sequence dimensions: one row of logits and one target
        # id per predicted token. The class count is read from the logits.
        loss = criterion(output.reshape(-1, output.size(-1)), y.reshape(-1))
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    # Mean of the per-batch losses for this epoch.
    avg_loss = total_loss / len(dataloader)
    print(f"Epoch {epoch + 1}/{num_epochs}, Loss: {avg_loss:.4f}")

  0%|          | 0/20 [00:00<?, ?it/s]

Epoch 1/20, Loss: 10.3945
Epoch 2/20, Loss: 8.6460
Epoch 3/20, Loss: 6.2310
Epoch 4/20, Loss: 4.4847
Epoch 5/20, Loss: 3.6225
Epoch 6/20, Loss: 2.9358
Epoch 7/20, Loss: 2.3371
Epoch 8/20, Loss: 1.8033
Epoch 9/20, Loss: 1.3875
Epoch 10/20, Loss: 1.0885
Epoch 11/20, Loss: 0.8781
Epoch 12/20, Loss: 0.7237
Epoch 13/20, Loss: 0.6022
Epoch 14/20, Loss: 0.5258
Epoch 15/20, Loss: 0.4494
Epoch 16/20, Loss: 0.4023
Epoch 17/20, Loss: 0.3540
Epoch 18/20, Loss: 0.3278
Epoch 19/20, Loss: 0.2996
Epoch 20/20, Loss: 0.2738


## Inference

In [19]:
# Generate text
# Greedy decoding: repeatedly feed the most recent tokens to the model and
# append the highest-scoring next token.
model.eval()
input_text = "a key player in mitigating the greenhouse effect"
input_seq = torch.tensor(tokenizer.encode(input_text), dtype=torch.long).unsqueeze(0)
generated = input_text

# The model's positional table only covers seq_length positions, so trim the
# prompt before the first forward pass; a longer prompt would otherwise fail.
input_seq = input_seq[:, -seq_length:]

with torch.no_grad():
    for _ in range(50):
        # Use the same causal mask as the training loop so the model sees the
        # attention pattern it was trained with.
        context_length = input_seq.size(1)
        mask = torch.triu(
            torch.full((context_length, context_length), float("-inf")), diagonal=1
        ).to(input_seq.device)
        output = model(input_seq, mask)

        # Only the logits at the last position predict the next token.
        next_token = output[0, -1].argmax(dim=-1).item()
        generated += tokenizer.decode([next_token])

        # Append the new token and keep only the last seq_length tokens.
        input_seq = torch.cat([input_seq, torch.tensor([[next_token]])], dim=1)
        input_seq = input_seq[:, -seq_length:]

print("Genearted Text:", generated)

Genearted Text: a key player in mitigating the greenhouse effect. To tackle this challenge, this thesis introduces a hybrid strategy for FDI. Extensive reviews of FDI algorithms reveal a gap in existing literature, emphasizing accuracy but neglecting the need for labeled data. Additionally, explainability in Hybrid-FDI
