In [1]:
%load_ext autoreload
%autoreload 2


In [2]:
import torch
import tiktoken
import pandas as pd
from pathlib import Path
from torch.utils.data import Dataset, DataLoader, random_split

tokenizer = tiktoken.get_encoding("gpt2")

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

### Load data

In [3]:
file_path = Path('data/SMSSpamCollection.tsv')

df = pd.read_csv(file_path, sep ="\t", header = None, names =["Label", "Text"] )

In [4]:
df

Unnamed: 0,Label,Text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


### Create dataset

In [5]:


class SPAMSmsDataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_length=128):
        """
        Args:
            dataframe (pd.DataFrame): The dataset containing 'Label' and 'Text' columns.
            tokenizer (tiktoken.Encoding): A pre-initialized tiktoken tokenizer.
            max_length (int): The maximum sequence length for tokenization.
        """
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_length = max_length
        # Convert labels to numerical values (spam: 1, ham: 0)
        self.labels = self.data['Label'].map({'ham': 0, 'spam': 1}).values
        self.texts = self.data['Text'].values

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        # Get the text and label for the current index
        text = self.texts[idx]
        label = self.labels[idx]
        
        # Tokenize the text using tiktoken
        tokens = self.tokenizer.encode(text)
        
        # Truncate or pad tokens to max_length
        if len(tokens) > self.max_length:
            tokens = tokens[:self.max_length]
        else:
            tokens = tokens + [0] * (self.max_length - len(tokens))  # Padding with 0s

        # Convert to PyTorch tensor
        input_ids = torch.tensor(tokens, dtype=torch.long)
        
        return (input_ids, torch.tensor(label, dtype=torch.long))



In [6]:
full_dataset = SPAMSmsDataset(df, tokenizer, max_length=128)
train_dataset, val_dataset, test_dataset = random_split(
    full_dataset, lengths=[0.6, 0.2, 0.2]
)

train_loader = DataLoader(
    dataset=train_dataset,
    batch_size=8,
    shuffle=True,
)

val_loader = DataLoader(
    dataset=val_dataset,
    batch_size=8,
    shuffle=True,
)

test_loader = DataLoader(
    dataset=test_dataset,
    batch_size=8,
    shuffle=True,
)

In [7]:
import torch
from collections import Counter

def value_counts(data_loader, normalize=False):
    """
    Count the occurrences of each label in a PyTorch DataLoader, similar to pd.value_counts().
    
    Args:
        data_loader (DataLoader): A PyTorch DataLoader instance.
        normalize (bool): If True, return the relative frequencies of the labels. Default is False.
    
    Returns:
        dict: A dictionary with label counts or relative frequencies (if normalize=True), sorted by label.
    """
    label_counts = Counter()

    total_samples = 0  # To calculate relative frequencies if normalize=True

    # Iterate through the DataLoader
    for batch in data_loader:
        _, labels = batch  # We only care about the labels (input_ids, labels)
        
        # Update the Counter with the labels
        label_counts.update(labels.tolist())
        total_samples += labels.size(0)  # Update total sample count

    # If normalize=True, convert counts to relative frequencies
    if normalize:
        label_counts = {label: count / total_samples for label, count in label_counts.items()}

    # Sort by label (for consistency with pd.value_counts())
    label_counts = dict(sorted(label_counts.items()))

    return label_counts

In [8]:
print(f"number of training batches: {len(train_loader)}")
print(f"number of validation batches: {len(val_loader)}")
print(f"number of test batches: {len(test_loader)}")

number of training batches: 418
number of validation batches: 140
number of test batches: 140


In [9]:
train_label_normalized = value_counts(train_loader, normalize=True)
val_label_normalized = value_counts(val_loader, normalize=True)
test_label_normalized = value_counts(test_loader, normalize=True)

print(f"class distribution in training loader: {train_label_normalized}")
print(f"class distribution in validation loader: {val_label_normalized}")
print(f"class distribution in test loader: {test_label_normalized}")

class distribution in training loader: {0: 0.8708133971291866, 1: 0.1291866028708134}
class distribution in validation loader: {0: 0.8545780969479354, 1: 0.14542190305206462}
class distribution in test loader: {0: 0.8626570915619389, 1: 0.13734290843806105}


In [10]:
for input_batch, target_batch in train_loader:
    print(input_batch, target_batch)

tensor([[14150,  1635,   922,  ...,     0,     0,     0],
        [20266,   569,   280,  ...,     0,     0,     0],
        [10814,   345,   991,  ...,     0,     0,     0],
        ...,
        [   40, 28329,   651,  ...,     0,     0,     0],
        [18465,    11,  1312,  ...,     0,     0,     0],
        [ 3673,  1654,  1865,  ...,     0,     0,     0]]) tensor([0, 1, 0, 0, 0, 0, 0, 0])
tensor([[  127,   250,  1444,  ...,     0,     0,     0],
        [ 5308, 14720,  2822,  ...,     0,     0,     0],
        [ 2215,   345,  1625,  ...,     0,     0,     0],
        ...,
        [  818,  1339,   345,  ...,     0,     0,     0],
        [ 5297, 21752,     0,  ...,     0,     0,     0],
        [ 1135,   836,   869,  ...,     0,     0,     0]]) tensor([0, 0, 0, 0, 0, 0, 0, 0])
tensor([[26417,  1312, 13140,  ...,     0,     0,     0],
        [18649, 46714,     0,  ...,     0,     0,     0],
        [   56,   283,   300,  ...,     0,     0,     0],
        ...,
        [ 6090,  1282, 

In [11]:
import torch
from transformers import GPT2Model
from src.model import MODEL_ARCHITECTURES, GPTModel
from src.utils import load_weights

gpt_pretrained = GPT2Model.from_pretrained("openai-community/gpt2", cache_dir="checkpoints")
base_configs = MODEL_ARCHITECTURES['gpt2-small']
custom_configs = {
    "vocab_size": 50257,    # Vocabulary size
    "context_length": 1024, # Context length
    "drop_rate": 0.0,  # Dropout rate
    "qkv_bias": True,  # Query-Key-Value bias
}

model_configs = base_configs | custom_configs

gpt = GPTModel(model_configs)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
load_weights(gpt, gpt_pretrained, model_configs)
gpt.to(device)

  from .autonotebook import tqdm as notebook_tqdm


GPTModel(
  (tok_emb): Embedding(50257, 768)
  (pos_emb): Embedding(1024, 768)
  (drop_emb): Dropout(p=0.0, inplace=False)
  (transformer_decoders): Sequential(
    (0): TransformerDecoder(
      (multi_head_attention): MultiHeadAttention(
        (W_query): Linear(in_features=768, out_features=768, bias=True)
        (W_key): Linear(in_features=768, out_features=768, bias=True)
        (W_value): Linear(in_features=768, out_features=768, bias=True)
        (out_proj): Linear(in_features=768, out_features=768, bias=True)
        (dropout): Dropout(p=0.0, inplace=False)
      )
      (norm_layer1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (norm_layer2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (feed_foward): FeedForward(
        (layers): Sequential(
          (0): Linear(in_features=768, out_features=3072, bias=True)
          (1): GELU(approximate='none')
          (2): Linear(in_features=3072, out_features=768, bias=True)
        )
      )
      

In [12]:
from src.utils import generate_sequence
from src.token import token_ids_to_text, text_to_token_ids

text_1 = "Every effort moves you"

token_ids = generate_sequence(
    model=gpt,
    idx=text_to_token_ids(text_1, tokenizer).to(device),
    max_new_tokens=15,
    context_size=model_configs['context_length']
)

print(token_ids_to_text(token_ids, tokenizer))

Every effort moves you forward.

The first step is to understand the importance of your work


In [13]:
text_2 = (
    "Is the following text 'spam'? Answer with 'yes' or 'no':" 
    " 'You are a winner you have been specially" 
    " selected to receive $ 1000 cash or a $ 2000 award.'"
)

token_ids = generate_sequence(
    model=gpt,
    idx=text_to_token_ids(text_2, tokenizer).to(device),
    max_new_tokens=23,
    context_size=model_configs['context_length']
)

print(token_ids_to_text(token_ids, tokenizer))

Is the following text 'spam'? Answer with 'yes' or 'no': 'You are a winner you have been specially selected to receive $ 1000 cash or a $ 2000 award.'

The following text 'spam'? Answer with 'yes' or 'no': 'You are a winner


In [14]:
num_classses = 2

gpt.out = torch.nn.Linear(
    in_features=model_configs['emb_dim'],
    out_features=num_classses
).to(device)

for param in gpt.transformer_decoders[-1].parameters():
    param.requires_grad = True

for param in gpt.final_norm.parameters():
    param.requires_grad = True

In [15]:
gpt

GPTModel(
  (tok_emb): Embedding(50257, 768)
  (pos_emb): Embedding(1024, 768)
  (drop_emb): Dropout(p=0.0, inplace=False)
  (transformer_decoders): Sequential(
    (0): TransformerDecoder(
      (multi_head_attention): MultiHeadAttention(
        (W_query): Linear(in_features=768, out_features=768, bias=True)
        (W_key): Linear(in_features=768, out_features=768, bias=True)
        (W_value): Linear(in_features=768, out_features=768, bias=True)
        (out_proj): Linear(in_features=768, out_features=768, bias=True)
        (dropout): Dropout(p=0.0, inplace=False)
      )
      (norm_layer1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (norm_layer2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (feed_foward): FeedForward(
        (layers): Sequential(
          (0): Linear(in_features=768, out_features=3072, bias=True)
          (1): GELU(approximate='none')
          (2): Linear(in_features=3072, out_features=768, bias=True)
        )
      )
      

In [16]:
inputs = tokenizer.encode("Do you have time")
inputs = torch.tensor(inputs).unsqueeze(0).to(device)
print("Inputs:", inputs)
print("Inputs dimensions:", inputs.shape)

Inputs: tensor([[5211,  345,  423,  640]], device='cuda:0')
Inputs dimensions: torch.Size([1, 4])


In [17]:
with torch.no_grad():
    outputs = gpt(inputs)
print("Outputs:\n", outputs)
print("Outpus dimensions:", outputs.shape)

Outputs:
 tensor([[[1.0705, 0.5988],
         [2.7577, 2.5812],
         [2.3211, 1.1269],
         [3.2813, 0.2346]]], device='cuda:0')
Outpus dimensions: torch.Size([1, 4, 2])


In [18]:
from src.train_classifier import (
    calc_precision_loader,
    calc_loss_loader,
    calc_accuracy_loader,
    train_classifier_simple,
)

In [19]:
train_accuracy = calc_accuracy_loader(
    train_loader, gpt, device, num_batches=10
)
val_accuracy = calc_accuracy_loader(
    val_loader, gpt, device, num_batches=10
)
test_accuracy = calc_accuracy_loader(
    test_loader, gpt, device, num_batches=10
)

print(f"Training accuracy: {train_accuracy*100:.2f}%")
print(f"Validation accuracy: {val_accuracy*100:.2f}%")
print(f"Validation accuracy: {test_accuracy*100:.2f}%")

Training accuracy: 8.75%
Validation accuracy: 10.00%
Validation accuracy: 12.50%


In [20]:
train_precision = calc_precision_loader(
    train_loader, gpt, device, num_batches=10
)
val_precision = calc_precision_loader(
    val_loader, gpt, device, num_batches=10
)
test_precision = calc_precision_loader(
    test_loader, gpt, device, num_batches=10
)

print(f"Training precision: {train_precision*100:.4f}%")
print(f"Validation precision: {val_precision*100:.4f}%")
print(f"Test precision: {test_precision*100:.4f}%")

Training precision: 12.5000%
Validation precision: 15.0000%
Test precision: 16.4557%


In [21]:
with torch.no_grad():
    train_loss = calc_loss_loader(
        train_loader, gpt, device, num_batches=5
    )
    val_loss = calc_loss_loader(val_loader, gpt, device, num_batches=5)
    test_loss = calc_loss_loader(test_loader, gpt, device, num_batches=5)

print(f"Training loss: {train_loss:.3f}")
print(f"Validation loss: {val_loss:.3f}")
print(f"Test loss: {test_loss:.3f}")

Training loss: 10.263
Validation loss: 8.888
Test loss: 10.250


In [22]:
import time
import torch 
start_time = time.time()
torch.manual_seed(123)

optimizer = torch.optim.AdamW(gpt.parameters(), lr=5e-4, weight_decay=0.1)
num_epochs = 5

train_losses, val_losses, examples_seen = train_classifier_simple(
    gpt, train_loader, val_loader, optimizer, device, num_epochs=num_epochs, eval_freq=50,
    eval_iter=5
)

end_time = time.time()
execution_time_minuter = (end_time - start_time) / 60

print(f"Training completed in {execution_time_minuter: .2f} minutes.")

  import pkg_resources


Epoch 1 (Step 000000): Train loss 0.582, Val loss 1.334
Epoch 1 (Step 000050): Train loss 0.293, Val loss 0.451
Epoch 1 (Step 000100): Train loss 0.150, Val loss 0.268
Epoch 1 (Step 000150): Train loss 0.045, Val loss 0.036
Epoch 1 (Step 000200): Train loss 0.080, Val loss 0.361
Epoch 1 (Step 000250): Train loss 0.137, Val loss 0.110
Epoch 1 (Step 000300): Train loss 0.013, Val loss 0.035
Epoch 1 (Step 000350): Train loss 0.175, Val loss 0.412
Epoch 1 (Step 000400): Train loss 0.008, Val loss 0.263
Training precision: 100.00% | Validation precision: 100.00%
Epoch 2 (Step 000450): Train loss 0.004, Val loss 0.002
Epoch 2 (Step 000500): Train loss 0.001, Val loss 0.217
Epoch 2 (Step 000550): Train loss 0.016, Val loss 0.017
Epoch 2 (Step 000600): Train loss 0.001, Val loss 0.002
Epoch 2 (Step 000650): Train loss 0.038, Val loss 0.090
Epoch 2 (Step 000700): Train loss 0.002, Val loss 0.171
Epoch 2 (Step 000750): Train loss 0.002, Val loss 0.378
Epoch 2 (Step 000800): Train loss 0.055, Val

In [25]:
train_precision = calc_precision_loader(train_loader, gpt, device)
val_precision = calc_precision_loader(val_loader, gpt, device)
test_precision = calc_precision_loader(test_loader, gpt, device)

In [26]:
print(f"Training precision: {train_precision*100:.4f}%")
print(f"Validation precision: {val_precision*100:.4f}%")
print(f"Test precision: {test_precision*100:.4f}%")

Training precision: 100.0000%
Validation precision: 97.4522%
Test precision: 98.6577%
