In [1]:
!pip install torch datasets



# **GPT-Like architecture**

In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import math

# -----------------------------------------
# Self Attention (1 head)
# -----------------------------------------
class Selfattention(nn.Module):
    def __init__(self, d_model, head_dim):
        super().__init__()
        self.head_dim = head_dim

        self.q = nn.Linear(d_model, head_dim, bias=False)
        self.k = nn.Linear(d_model, head_dim, bias=False)
        self.v = nn.Linear(d_model, head_dim, bias=False)

    def forward(self, x, mask=None):
        # x: (B,T,d_model)
        q = self.q(x)   # (B,T,head_dim)
        k = self.k(x)   # (B,T,head_dim)
        v = self.v(x)   # (B,T,head_dim)

        # attention scores
        scores = q @ k.transpose(-2, -1) / math.sqrt(self.head_dim)  # (B,T,T)

        if mask is not None:
            scores = scores.masked_fill(mask == 0, float('-inf'))

        attWeights = torch.softmax(scores, dim=-1)   # (B,T,T)
        output = attWeights @ v                      # (B,T,head_dim)
        return output


# -----------------------------------------
# Multi Heads
# -----------------------------------------
class Multiheads(nn.Module):
    def __init__(self, d_model, num_heads):
        super().__init__()

        assert d_model % num_heads == 0
        head_dim = d_model // num_heads

        self.heads = nn.ModuleList([
            Selfattention(d_model, head_dim) for _ in range(num_heads)
        ])
        self.out = nn.Linear(num_heads * head_dim, d_model)

    def forward(self, x, mask=None):
        # compute each head
        outs = [h(x, mask) for h in self.heads]  # list of (B,T,head_dim)
        out = torch.cat(outs, dim=-1)            # (B,T,d_model)
        out = self.out(out)
        return out


# -----------------------------------------
# Feed Forward MLP
# -----------------------------------------
class MLP(nn.Module):
    def __init__(self, d_model, expansion=4):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(d_model, expansion * d_model),
            nn.ReLU(),
            nn.Linear(expansion * d_model, d_model),
            nn.Dropout(0.1)
        )

    def forward(self, x):
        return self.net(x)


# -----------------------------------------
# Decoder Block
# -----------------------------------------
class Decoderblock(nn.Module):
    def __init__(self, d_model, num_heads):
        super().__init__()
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.att = Multiheads(d_model, num_heads)
        self.ffn = MLP(d_model)

    def forward(self, x, mask=None):
        x1 = x + self.att(self.norm1(x), mask)
        x2 = x1 + self.ffn(self.norm2(x1))
        return x2


# -----------------------------------------
# Decoder
# -----------------------------------------
class Decoder(nn.Module):
    def __init__(self, dict_size, d_model, num_heads, num_layers, seq_length):
        super().__init__()

        self.emb = nn.Embedding(dict_size, d_model)
        self.pos = nn.Embedding(seq_length, d_model)
        self.norm = nn.LayerNorm(d_model)

        self.blocks = nn.ModuleList(
            [Decoderblock(d_model, num_heads) for _ in range(num_layers)]
        )

    def forward(self, x):
        B, T = x.shape

        x = self.emb(x) + self.pos(torch.arange(T, device=x.device))

        # causal mask shape: (1,1,T,T)
        mask = torch.tril(torch.ones(T, T, device=x.device)).unsqueeze(0).unsqueeze(0)

        for block in self.blocks:
            x = block(x, mask)

        return self.norm(x)


# -----------------------------------------
# Causal LM
# -----------------------------------------
class CausalLm(nn.Module):
    def __init__(self, d_model, dict_size, num_heads, num_layers, seq_length):
        super().__init__()
        self.lm_head = nn.Linear(d_model, dict_size)
        self.Decoder = Decoder(dict_size, d_model, num_heads, num_layers, seq_length)

    def forward(self, x, outputs=None):
        x = self.Decoder(x)
        logits = self.lm_head(x)
        loss = None

        if outputs is not None:
            # Before unpacking, ensure logits has the expected 3 dimensions.
            # An extra dimension of size 1 might have been introduced unexpectedly.
            # logits = logits.squeeze()
            if logits.dim() == 4:  # If shape is (1, B, T, V) or similar
               logits = logits.squeeze(0)

            # flatten for CE
            B, T, V = logits.shape
            logits_flat = logits.reshape(B * T, V)
            outputs_flat = outputs.reshape(B * T)
            loss = F.cross_entropy(logits_flat, outputs_flat)

        return logits, loss
    def generate(self, x, max_len=576, temperature=1.0, top_k=None):
        """
        Generate a sequence given a starting input tensor `x`.


        """
        B, T = x.shape
        generated = x  # Initialize generated sequence with the input

        # Set the model to evaluation mode
        self.eval()

        print('hi')

        # Generate tokens one by one
        for i in range(max_len):
            logits, _ = self.forward(generated)
            # print('logits shape:', logits.shape)  # Debug print
            while logits.dim() > 3:
                logits = logits.squeeze(1)  # Remove dimension at index 1
            logits = logits[:, -1, :]  # Get logits for the last token (T-1)
            # print('logits after slicing:', logits.shape)  # Debug print

            # Apply temperature scaling
            logits = logits / temperature

            if top_k is not None:
                # Apply top_k filtering
                values, indices = torch.topk(logits, top_k)
                logits = torch.full_like(logits, float('-inf'))
                logits.scatter_(-1, indices, values)

            # Sample from the logits (softmax for probabilities)
            probs = torch.softmax(logits, dim=-1)
            # print(f'Step {i}: probs shape:', probs.shape)

            next_token = torch.multinomial(probs, 1)
            # print(f'Step {i}: next_token shape:', next_token.shape)
            # print(f'Step {i}: generated shape before cat:', generated.shape)

            # Append the sampled token to the generated sequence
            generated = torch.cat([generated, next_token], dim=1)
            # print(f'Step {i}: generated shape after cat:', generated.shape)

        return generated


In [None]:
# Creating my GPT-like model
# i'm making sure d_model % num_heads = 0
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = CausalLm(
    d_model=256,
    dict_size=30000,
    num_heads=4,
    num_layers=3,
    seq_length=576
)
model.to(device)


# **Tokenizing My Training Dataset**

In [4]:
from datasets import load_dataset
import os

# Load the full 'train' split of the dataset as the initial dataset
raw_train_dataset = load_dataset('wikimedia/wikipedia',"20231101.en", split="train")

# Split the raw_train_dataset into a new training set (80%) and a temporary set (20%)
# The 'temp' set will then be split into validation and test.
# Using a fixed seed for reproducibility.
train_val_test_split = raw_train_dataset.train_test_split(test_size=0.2, seed=42)

# The new 'train' split
train_dataset = train_val_test_split['train'][:10000] # Corrected slicing

# Split the 'test' part of train_val_test_split (which is 20% of original)
# into validation (10% of original) and test (10% of original)
val_test_dataset = train_val_test_split['test'].train_test_split(test_size=0.5, seed=42)

validation_dataset = val_test_dataset['train'][:1000] # Corrected slicing
test_dataset = val_test_dataset['test'][:1000] # Corrected slicing

print(f"Train dataset size: {len(train_dataset)}")
print(f"Validation dataset size: {len(validation_dataset)}")
print(f"Test dataset size: {len(test_dataset)}")

from tokenizers import Tokenizer, models, trainers, pre_tokenizers

# Creating a tokenizer with BPE model
tokenizer = Tokenizer(models.BPE())

# Pre-tokenizer (split on whitespace)
tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()

# Trainer
trainer = trainers.BpeTrainer(vocab_size=30000, min_frequency=2, special_tokens=["[UNK]", "[PAD]", "[CLS]", "[SEP]", "[MASK]"])


corpus_file_path = "wiki_train_corpus.txt"
with open(corpus_file_path, "w", encoding="utf-8") as f:
    # Iterate directly over the 'text' list within the train_dataset dictionary
    for text_entry in train_dataset['text']:
        if text_entry is not None:
            f.write(text_entry + "\n")

# Training on corpus files
tokenizer.train(files=[corpus_file_path], trainer=trainer)

# Saving the tokenizer for later use
tokenizer_path = "my_tokenizer.json"
tokenizer.save(tokenizer_path)
print(f"Tokenizer saved to {tokenizer_path}")

# Encoding text with the trained tokenizer for testing
encoded = tokenizer.encode("I went to the bank")
print(f"Encoded IDs for 'I went to the bank': {encoded.ids}")
print(f"Decoded text: {tokenizer.decode(encoded.ids)}")

# Cleaning up the temporary corpus file
os.remove(corpus_file_path)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

Resolving data files:   0%|          | 0/41 [00:00<?, ?it/s]

Downloading data:   0%|          | 0/41 [00:00<?, ?files/s]

20231101.en/train-00000-of-00041.parquet:   0%|          | 0.00/420M [00:00<?, ?B/s]

20231101.en/train-00001-of-00041.parquet:   0%|          | 0.00/351M [00:00<?, ?B/s]

20231101.en/train-00002-of-00041.parquet:   0%|          | 0.00/329M [00:00<?, ?B/s]

20231101.en/train-00003-of-00041.parquet:   0%|          | 0.00/331M [00:00<?, ?B/s]

20231101.en/train-00004-of-00041.parquet:   0%|          | 0.00/307M [00:00<?, ?B/s]

20231101.en/train-00005-of-00041.parquet:   0%|          | 0.00/244M [00:00<?, ?B/s]

20231101.en/train-00006-of-00041.parquet:   0%|          | 0.00/266M [00:00<?, ?B/s]

20231101.en/train-00007-of-00041.parquet:   0%|          | 0.00/228M [00:00<?, ?B/s]

20231101.en/train-00008-of-00041.parquet:   0%|          | 0.00/248M [00:00<?, ?B/s]

20231101.en/train-00009-of-00041.parquet:   0%|          | 0.00/227M [00:00<?, ?B/s]

20231101.en/train-00010-of-00041.parquet:   0%|          | 0.00/234M [00:00<?, ?B/s]

20231101.en/train-00011-of-00041.parquet:   0%|          | 0.00/232M [00:00<?, ?B/s]

20231101.en/train-00012-of-00041.parquet:   0%|          | 0.00/239M [00:00<?, ?B/s]

20231101.en/train-00013-of-00041.parquet:   0%|          | 0.00/241M [00:00<?, ?B/s]

20231101.en/train-00014-of-00041.parquet:   0%|          | 0.00/223M [00:00<?, ?B/s]

20231101.en/train-00015-of-00041.parquet:   0%|          | 0.00/235M [00:00<?, ?B/s]

20231101.en/train-00016-of-00041.parquet:   0%|          | 0.00/503M [00:00<?, ?B/s]

20231101.en/train-00017-of-00041.parquet:   0%|          | 0.00/231M [00:00<?, ?B/s]

20231101.en/train-00018-of-00041.parquet:   0%|          | 0.00/231M [00:00<?, ?B/s]

20231101.en/train-00019-of-00041.parquet:   0%|          | 0.00/195M [00:00<?, ?B/s]

20231101.en/train-00020-of-00041.parquet:   0%|          | 0.00/225M [00:00<?, ?B/s]

20231101.en/train-00021-of-00041.parquet:   0%|          | 0.00/216M [00:00<?, ?B/s]

20231101.en/train-00022-of-00041.parquet:   0%|          | 0.00/202M [00:00<?, ?B/s]

20231101.en/train-00023-of-00041.parquet:   0%|          | 0.00/213M [00:00<?, ?B/s]

20231101.en/train-00024-of-00041.parquet:   0%|          | 0.00/221M [00:00<?, ?B/s]

20231101.en/train-00025-of-00041.parquet:   0%|          | 0.00/221M [00:00<?, ?B/s]

20231101.en/train-00026-of-00041.parquet:   0%|          | 0.00/208M [00:00<?, ?B/s]

20231101.en/train-00027-of-00041.parquet:   0%|          | 0.00/214M [00:00<?, ?B/s]

20231101.en/train-00028-of-00041.parquet:   0%|          | 0.00/188M [00:00<?, ?B/s]

20231101.en/train-00029-of-00041.parquet:   0%|          | 0.00/218M [00:00<?, ?B/s]

20231101.en/train-00030-of-00041.parquet:   0%|          | 0.00/204M [00:00<?, ?B/s]

20231101.en/train-00031-of-00041.parquet:   0%|          | 0.00/215M [00:00<?, ?B/s]

20231101.en/train-00032-of-00041.parquet:   0%|          | 0.00/214M [00:00<?, ?B/s]

20231101.en/train-00033-of-00041.parquet:   0%|          | 0.00/203M [00:00<?, ?B/s]

20231101.en/train-00034-of-00041.parquet:   0%|          | 0.00/219M [00:00<?, ?B/s]

20231101.en/train-00035-of-00041.parquet:   0%|          | 0.00/224M [00:00<?, ?B/s]

20231101.en/train-00036-of-00041.parquet:   0%|          | 0.00/610M [00:00<?, ?B/s]

20231101.en/train-00037-of-00041.parquet:   0%|          | 0.00/674M [00:00<?, ?B/s]

20231101.en/train-00038-of-00041.parquet:   0%|          | 0.00/538M [00:00<?, ?B/s]

20231101.en/train-00039-of-00041.parquet:   0%|          | 0.00/465M [00:00<?, ?B/s]

20231101.en/train-00040-of-00041.parquet:   0%|          | 0.00/422M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/6407814 [00:00<?, ? examples/s]

Loading dataset shards:   0%|          | 0/41 [00:00<?, ?it/s]

Train dataset size: 4
Validation dataset size: 4
Test dataset size: 4
Tokenizer saved to my_tokenizer.json
Encoded IDs for 'I went to the bank': [45, 3668, 2032, 2015, 5057]
Decoded text: I went to the bank


In [5]:
# making sure train_dataset is well set
print(train_dataset['text'][:1])


['The 1992 Akron Zips football team represented Akron University in the 1992 NCAA Division I-A football season as members of the Mid-American Conference. They were led by seventh–year head coach Gerry Faust. The Zips played their home games at the Rubber Bowl in Akron, Ohio. They finished the season with a record of 7–3–1, 5–3 in MAC play to finish in a three-way tie for third place.\n\nSchedule\n\nReferences\n\nAkron\nAkron Zips football seasons\nAkron Zips football']


In [6]:
from torch.utils.data import DataLoader
import torch
from tokenizers import Tokenizer

# Define a custom Dataset class (moved definition before usage)
class MyDataset(torch.utils.data.Dataset):
    def __init__(self, x, y):
        self.x = x
        self.y = y

    def __len__(self):
        return len(self.x)

    def __getitem__(self, idx):
        return self.x[idx], self.y[idx]

# Load the tokenizer trained in the previous cell
tokenizer = Tokenizer.from_file("my_tokenizer.json")

# Define the sequence length, matching the model's seq_length (from cell 8AR8vdQqvSKs)
SEQ_LENGTH = 576

# Get the PAD token ID from the tokenizer
pad_token_id = tokenizer.token_to_id("[PAD]")
if pad_token_id is None:
    raise ValueError("PAD token ID not found. Ensure '[PAD]' is in tokenizer's special tokens.")

# Tokenize and prepare the data for X_tensor and y_tensor (TRAINING DATA)
tokenized_sequences_train = []
for text_entry in train_dataset['text']:
    if text_entry is not None:
        encoded = tokenizer.encode(text_entry)
        ids = encoded.ids

        # We need a sequence of length SEQ_LENGTH + 1 to create input and output sequences of SEQ_LENGTH
        target_len = SEQ_LENGTH + 1

        if len(ids) >= target_len:
            processed_ids = ids[:target_len]
        else: # Pad if shorter
            processed_ids = ids + [pad_token_id] * (target_len - len(ids))
        tokenized_sequences_train.append(processed_ids)

# Convert the list of tokenized sequences to a PyTorch tensor
if not tokenized_sequences_train:
    raise ValueError("No tokenized sequences generated for training. train_dataset['text'] might be empty or contain only None values.")

idx_tensor_train = torch.tensor(tokenized_sequences_train, dtype=torch.long)

# Apply the shifting logic to create X_tensor and y_tensor
X_tensor_train = idx_tensor_train[:, :-1]
y_tensor_train = idx_tensor_train[:, 1:]

# Create the training dataset and DataLoader
train_dataset_obj = MyDataset(X_tensor_train, y_tensor_train)
train_loader = DataLoader(train_dataset_obj, batch_size=32, shuffle=True)

print(f"X_tensor_train shape: {X_tensor_train.shape}")
print(f"y_tensor_train shape: {y_tensor_train.shape}")

# Tokenize and prepare the data for X_tensor and y_tensor (VALIDATION DATA)
tokenized_sequences_val = []
for text_entry in validation_dataset['text']:
    if text_entry is not None:
        encoded = tokenizer.encode(text_entry)
        ids = encoded.ids

        target_len = SEQ_LENGTH + 1

        if len(ids) >= target_len:
            processed_ids = ids[:target_len]
        else:
            processed_ids = ids + [pad_token_id] * (target_len - len(ids))
        tokenized_sequences_val.append(processed_ids)

if not tokenized_sequences_val:
    raise ValueError("No tokenized sequences generated for validation. validation_dataset['text'] might be empty or contain only None values.")

idx_tensor_val = torch.tensor(tokenized_sequences_val, dtype=torch.long)

X_tensor_val = idx_tensor_val[:, :-1]
y_tensor_val = idx_tensor_val[:, 1:]

# Create the validation dataset and DataLoader
val_dataset_obj = MyDataset(X_tensor_val, y_tensor_val)
validation_loader = DataLoader(val_dataset_obj, batch_size=32, shuffle=False) # No shuffle for validation

print(f"X_tensor_val shape: {X_tensor_val.shape}")
print(f"y_tensor_val shape: {y_tensor_val.shape}")





X_tensor_train shape: torch.Size([10000, 576])
y_tensor_train shape: torch.Size([10000, 576])
X_tensor_val shape: torch.Size([1000, 576])
y_tensor_val shape: torch.Size([1000, 576])


In [7]:
from tokenizers import Tokenizer

tokenizer = Tokenizer.from_file("my_tokenizer.json")

# checking vocab size making sure its same as we set it in our model
vocab_size = tokenizer.get_vocab_size()
print("Vocab size:", vocab_size)

Vocab size: 30000


# **Training From Scratch**

In [None]:
import torch
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)
# Training loop
# epochs = 10
prev_valloss = float('inf')
i = 0
overfitting = 0

# We would train until it start overfitting with no max epoches
while True:
    model.train()  # Setting the model to training mode
    running_loss = 0.0

    # print(''*1000)
    print('new epoch')


    for batch in train_loader:

        X_batch, y_batch = batch
        # Moving batches to the appropriate device
        X_batch = X_batch.to(device)
        y_batch = y_batch.to(device)

        # Forward pass
        predictions,loss = model(X_batch,y_batch)
        # loss = criterion(predictions, y_batch)

        # Backward pass
        optimizer.zero_grad()  # Clearinf previous gradients
        loss.backward()        # Computinf gradients
        optimizer.step()       # Updating parameters

        running_loss += loss.item()
        # print('new batch with loss:', loss.item())

    # Calculate average training loss
    avg_train_loss = running_loss / len(train_loader)

    # Validation step
    model.eval()  # Set the model to evaluation mode
    val_loss = 0.0
    correct = 0
    total = 0

    with torch.no_grad():  # Disable gradient computation for evaluation
        for batch in validation_loader:
            X_batch, y_batch = batch
            # Move batches to the appropriate device
            X_batch = X_batch.to(device)
            y_batch = y_batch.to(device)

            predictions,loss = model(X_batch,y_batch)
            val_loss += loss.item()

            # # Calculate accuracy
            # predicted_labels = (predictions > 0.5).float()
            # correct += (predicted_labels == y_batch).sum().item()
            # total += y_batch.size(0)

    avg_val_loss = val_loss / len(validation_loader)
    # accuracy = correct / total


    print(f"Epoch {i+1}/{i}")
    print(f"  Training Loss: {avg_train_loss:.4f}")
    print(f"  Validation Loss: {avg_val_loss:.4f}")
    i += 1
    if avg_val_loss >= prev_valloss:
        overfitting += 1

    else:
        prev_valloss = avg_val_loss
        overfitting = 0  # optional: reset if improved

    if overfitting == 2:
        break



In [None]:

prompt = "prompt"

# Get token IDs as plain Python ints
encoded = tokenizer.encode(prompt)

token_ids = [int(t) for t in encoded.ids]

# print('token_ids',token_ids)

# Convert to tensor, add batch dim, move to GPU
token_tensor = torch.tensor(token_ids, dtype=torch.long).unsqueeze(0).to(device)

# print('token_tensor',token_tensor)
# print("token_tensor dtype:", token_tensor.dtype)
# print("token_tensor shape:", token_tensor.shape)


generated_sequence = model.generate(token_tensor, max_len=50, temperature=0.7, top_k=50)
tokens = generated_sequence[0].tolist()
text = tokenizer.decode(tokens)

print(text)
