In [21]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim

In [22]:
train_df = pd.read_csv('../data/train_set.csv')
print('Shape of training data:', train_df.shape)
train_df
validation_df = pd.read_csv('../data/validation_set.csv')

Shape of training data: (4543, 4)


In [23]:
# from positional_encoding import positional_encoding
# import matplotlib.pyplot as plt
# pe = positional_encoding(d_model=512, max_len=1000)
# print(pe.shape)
# print(pe[0].shape)
# plt.pcolormesh(pe[0], cmap='RdBu')
# plt.xlabel('depth')
# plt.xlim((0,512))
# plt.ylabel(('position'))
# plt.colorbar()
# plt.show()

In [24]:
!pip install tiktoken



In [25]:
from torch.utils.data import Dataset, DataLoader
import tiktoken
import math

tokenizer = tiktoken.get_encoding('gpt2')
class TextDataset(Dataset):
    def __init__(self, dataframe, tokenizer):
        self.titles = dataframe['title'].str.lower().values
        self.labels = dataframe['label_numeric'].values
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.titles)

    def __getitem__(self, idx):
        title = self.titles[idx]
        label = self.labels[idx]
        encoding = self.tokenizer.encode(title)
        input_ids = torch.tensor(encoding, dtype=torch.long)
        return input_ids, label

# Collate function to pad sequences
def collate_fn(batch):
    input_ids = [item[0] for item in batch]
    labels = [item[1] for item in batch]
    max_length = max(len(ids) for ids in input_ids)
    input_ids = torch.stack([torch.cat([ids, torch.zeros(max_length - len(ids), dtype=torch.long)]) for ids in input_ids])
    labels = torch.tensor(labels, dtype=torch.long)
    return input_ids, labels

train_dataset = TextDataset(train_df, tokenizer)
validation_dataset = TextDataset(validation_df, tokenizer)

batch_size=32
train_dataloader = DataLoader(train_dataset, batch_size, shuffle=True, collate_fn=collate_fn)
validation_dataloader = DataLoader(validation_dataset, batch_size, shuffle=True, collate_fn=collate_fn)

In [26]:
#early stopping

class EarlyStopping():
    def __init__(self, patience, delta):
        super().__init__()
        self.patience = patience
        self.delta = delta
        self.best_score = None
        self.best_model = None
        self.stop_early = False
        self.counter = 0

    def __call__(self, val_loss, model):
        score = -val_loss
        if self.best_score is None:
            self.best_score = score
        elif score<self.best_score + self.delta:
            self.counter+=1
            if self.counter>=self.patience:
                self.stop_early=True
        else: 
            self.best_score = score
            self.best_model = model.state_dict()
            self.counter=0
    def load_best_model(self, updated_model):
        updated_model.load_state_dict(self.best_model)    
    

In [28]:
from embedding import EmbeddingLayer
from encoder_layer import EncoderLayer
from transformer import Transformer
vocab_size = tokenizer.n_vocab
d_model = 256
num_heads = 8
d_ff = 512
output_size = len(train_df['label_numeric'].unique())
num_layers = 6
dropout_rate = 0.1
model = Transformer(vocab_size=vocab_size, d_model=d_model, num_layers=num_layers, max_len=5000, dropout_rate=dropout_rate, num_heads=num_heads, d_ff=d_ff, output_size=output_size)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.0005)
early_stopping = EarlyStopping(5, 0.001)

In [29]:
num_epochs = 20
for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    for iter, (input_ids, labels) in enumerate(train_dataloader, 1):
        optimizer.zero_grad()
        outputs = model(input_ids)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
        #print(f"[Epoch {epoch+1}/{num_epochs}] [Iter {iter}/{len(train_dataloader)}], Loss: {loss.item()}")
    avg_loss = total_loss / len(train_dataloader)
    #print(f"Epoch {epoch+1} completed, Average loss: {avg_loss:.4f}")

    # Validation step
    model.eval()
    correct = 0
    total = 0
    valid_loss = 0.0
    with torch.no_grad():
        for input_ids, labels in (validation_dataloader):
            outputs = model(input_ids)
            _, predicted = torch.max(outputs, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
            loss = criterion(outputs, labels)
            valid_loss+=loss.item()
    accuracy = 100 * correct / total
    avg_valid_loss = valid_loss/len(validation_dataloader)
    early_stopping(avg_valid_loss, model)
    if early_stopping.stop_early:
        print(f"Early stopping at the epoch {epoch}")
        early_stopping.load_best_model(model)
        break
    #print(f"Validation Accuracy and Validation Loss after Epoch {epoch+1}: {accuracy:.2f}%")
    print(f"[Epoch {epoch+1}/{num_epochs}], Loss: {avg_loss:.2f}, Validation Loss: {avg_valid_loss:.2f}, Validation Accuracy: {accuracy:.2f}%")


[Epoch 1/20], Loss: 1.45, Validation Loss: 1.07, Validation Accuracy: 67.04%
[Epoch 2/20], Loss: 1.09, Validation Loss: 0.81, Validation Accuracy: 73.00%
[Epoch 3/20], Loss: 0.86, Validation Loss: 0.77, Validation Accuracy: 74.33%
[Epoch 4/20], Loss: 0.74, Validation Loss: 0.71, Validation Accuracy: 79.06%
[Epoch 5/20], Loss: 0.64, Validation Loss: 0.68, Validation Accuracy: 77.31%
[Epoch 6/20], Loss: 0.57, Validation Loss: 0.69, Validation Accuracy: 77.21%
[Epoch 7/20], Loss: 0.51, Validation Loss: 0.66, Validation Accuracy: 78.54%
[Epoch 8/20], Loss: 0.45, Validation Loss: 0.72, Validation Accuracy: 76.80%
[Epoch 9/20], Loss: 0.41, Validation Loss: 0.72, Validation Accuracy: 76.80%
[Epoch 10/20], Loss: 0.36, Validation Loss: 0.68, Validation Accuracy: 78.34%
[Epoch 11/20], Loss: 0.32, Validation Loss: 0.84, Validation Accuracy: 75.98%
Early stopping at the epoch 11
