In [None]:
!pip install tqdm
!pip install torchtext==0.6.0
!pip install spacy
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.7.1
  Using cached https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import math

class PositionalEncoding(nn.Module):
    """Fixed sinusoidal positional encoding added to a sequence-first input.

    The table is precomputed with shape ``(max_len, 1, d_model)`` and sliced
    along dimension 0 of the input, so ``forward`` expects ``x`` laid out as
    ``(seq_len, batch, d_model)``. Batch-first callers must transpose first.

    Args:
        d_model: Size of the feature dimension (odd values are supported).
        max_len: Number of positions to precompute.
    """

    def __init__(self, d_model, max_len=500):
        super(PositionalEncoding, self).__init__()
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * -(math.log(10000.0) / d_model))
        encoding = torch.zeros(max_len, d_model)
        encoding[:, 0::2] = torch.sin(position * div_term)
        # With an odd d_model there is one fewer odd column than even column,
        # so the cosine term uses only the first d_model // 2 frequencies.
        encoding[:, 1::2] = torch.cos(position * div_term[: d_model // 2])
        # Non-persistent buffer: follows the module across .to()/.cuda() calls
        # but is not written to state_dict, so existing checkpoints still load.
        self.register_buffer('encoding', encoding.unsqueeze(1), persistent=False)

    def forward(self, x):
        """Return ``x`` plus the encoding for its first ``x.size(0)`` positions.

        The encoding is divided by ``sqrt(d_model)`` before being added.
        """
        scale = math.sqrt(self.encoding.size(-1))
        # .to() is a no-op once the module lives on the same device as x.
        positional = self.encoding[:x.size(0)].to(x.device) / scale
        return x + positional

class MultiHeadAttention(nn.Module):
    """Multi-head scaled dot-product self-attention.

    The first dimension of the input is treated as the batch and the last as
    the feature dimension, which is split evenly across ``num_heads`` heads.
    Queries, keys and values are all projected from the same input.
    """

    def __init__(self, input_dim, num_heads):
        super().__init__()
        # The feature dimension has to divide evenly across the heads.
        assert input_dim % num_heads == 0
        self.num_heads = num_heads
        self.dim_per_head = input_dim // num_heads
        self.W_q = nn.Linear(input_dim, input_dim)
        self.W_k = nn.Linear(input_dim, input_dim)
        self.W_v = nn.Linear(input_dim, input_dim)
        self.fc_out = nn.Linear(input_dim, input_dim)

    def _split_heads(self, projected, n):
        # (n, seq, heads * dim_per_head) -> (n, heads, seq, dim_per_head)
        return projected.view(n, -1, self.num_heads, self.dim_per_head).transpose(1, 2)

    def forward(self, x):
        """Attend every position of ``x`` to every other and return the mixed result."""
        n = x.size(0)
        queries = self._split_heads(self.W_q(x), n)
        keys = self._split_heads(self.W_k(x), n)
        values = self._split_heads(self.W_v(x), n)

        # Dot-product similarities, scaled by sqrt of the per-head width.
        logits = torch.matmul(queries, keys.transpose(-2, -1)) / math.sqrt(self.dim_per_head)
        weights = F.softmax(logits, dim=-1)
        mixed = torch.matmul(weights, values)

        # Merge the heads back into a single feature dimension.
        merged = mixed.transpose(1, 2).contiguous().view(n, -1, self.num_heads * self.dim_per_head)
        return self.fc_out(merged)

class MyLayerNorm(nn.Module):
    """Normalises the last dimension and applies a learnable scale and shift.

    The statistics come from ``Tensor.mean`` / ``Tensor.std`` with their
    default settings, and the 1e-6 stabiliser is added to the standard
    deviation itself before dividing.
    """

    def __init__(self, input_dim):
        super().__init__()
        self.gamma = nn.Parameter(torch.ones(input_dim))   # learnable scale
        self.beta = nn.Parameter(torch.zeros(input_dim))   # learnable shift

    def forward(self, x):
        """Return ``gamma * (x - mean) / (std + 1e-6) + beta`` over the last dim."""
        centered = x - x.mean(dim=-1, keepdim=True)
        spread = x.std(dim=-1, keepdim=True) + 1e-6
        return self.gamma * centered / spread + self.beta

class MyTransformerBlock(nn.Module):
    """One encoder block: self-attention, then a two-layer ReLU feed-forward.

    Each sub-layer's output goes through dropout, is added to the sub-layer's
    input (residual connection) and is then normalised with ``MyLayerNorm``.

    Args:
        input_dim: Feature dimension of the input; also the width of both
            feed-forward layers.
        num_heads: Number of attention heads.
        dropout: Dropout probability used after both sub-layers. Defaults to
            0.3, the value that was previously hard-coded.
    """

    def __init__(self, input_dim, num_heads, dropout=0.3):
        super(MyTransformerBlock, self).__init__()
        self.attention = MultiHeadAttention(input_dim, num_heads)
        self.norm1 = MyLayerNorm(input_dim)
        self.fc1 = nn.Linear(input_dim, input_dim)
        self.fc2 = nn.Linear(input_dim, input_dim)
        self.dropout = nn.Dropout(dropout)
        self.norm2 = MyLayerNorm(input_dim)

    def forward(self, x):
        """Apply the attention and feed-forward sub-layers to ``x``."""
        # Attention sub-layer with residual connection.
        out = self.attention(x)
        x = self.norm1(self.dropout(out) + x)
        # Feed-forward sub-layer with residual connection.
        out = self.fc2(F.relu(self.fc1(x)))
        out = self.norm2(self.dropout(out) + x)
        return out

class MyTransformer(nn.Module):
    """Binary sequence classifier built on pretrained, fine-tunable embeddings.

    Token ids are embedded with the vectors stored in ``vocab.vectors``,
    passed through ``num_of_blocks`` transformer blocks, averaged over
    dimension 1 and projected to a single logit per example.

    A ``PositionalEncoding`` module is constructed but is not applied in
    ``forward`` (the call is left disabled, as in the original code).
    """

    def __init__(self, vocab, max_len, num_of_blocks, num_heads):
        super().__init__()
        # freeze=False keeps the pretrained vectors trainable.
        self.embedding = nn.Embedding.from_pretrained(vocab.vectors, freeze=False)
        width = self.embedding.embedding_dim
        self.positional_encoding = PositionalEncoding(width, max_len)
        block_list = [MyTransformerBlock(width, num_heads) for _ in range(num_of_blocks)]
        self.blocks = nn.ModuleList(block_list)
        self.fc = nn.Linear(width, 1)

    def forward(self, x):
        """Map a batch of token-id sequences to one logit each."""
        hidden = self.embedding(x)
        # Positional encoding is intentionally left switched off here:
        # hidden = self.positional_encoding(hidden)
        for block in self.blocks:
            hidden = block(hidden)
        pooled = hidden.mean(dim=1)
        return self.fc(pooled)


In [None]:
# NOTE(review): this install is unpinned, whereas the first cell installs
# torchtext==0.6.0 — confirm which version the notebook is meant to run on.
!pip install torchtext

In [None]:
import torch
import sys
print(sys.version)
import torchtext
import spacy
from torchtext.data import get_tokenizer
from torch.utils.data import random_split
import torchtext.experimental as d
from torchtext.experimental.datasets import IMDB
from torch.utils.data import DataLoader
from tqdm import tqdm
import torch.nn.functional as F
import os
from matplotlib import pyplot as plt
import numpy as np
from torch.optim.lr_scheduler import StepLR

def pad_trim(data):
    ''' Collate function that brings every sample to exactly MAX_SEQ_LEN tokens.

    Samples longer than MAX_SEQ_LEN are cut off; shorter ones are extended
    with <pad_id> at the end.

    Arguments:
        data: batch of samples; element 0 of each sample is the target label
              and element 1 is the token-id sequence
    Returns:
        new_input (torch.Tensor): stacked padded/trimmed token ids
        labels (torch.Tensor): float target labels, moved to `device`
    '''
    # Transpose the batch: columns[0] holds all labels, columns[1] all sequences
    columns = list(zip(*data))
    labels = torch.tensor(columns[0]).float().to(device)

    padded_rows = []
    for tokens in columns[1]:
        # Number of pad tokens still needed (zero when the sample is long enough)
        missing = max(0, MAX_SEQ_LEN - len(tokens))
        filler = torch.tensor([pad_id] * missing).long()
        padded_rows.append(torch.cat((tokens[:MAX_SEQ_LEN], filler)))
    new_input = torch.stack(padded_rows)

    return new_input, labels

def split_train_val(train_set):
    ''' Randomly partitions a dataset into train and validation subsets.

    The first int(SPLIT_RATIO * len(train_set)) samples of a random
    permutation go to the train subset and the remainder to validation.
    The permutation is drawn from a generator seeded with SEED.

    Arguments:
        train_set: set to split
    Returns:
        train_set: train dataset
        valid_set: validation dataset
    '''
    train_num = int(SPLIT_RATIO * len(train_set))
    valid_num = len(train_set) - train_num
    # Dedicated generator so the split does not depend on the global RNG state
    rng = torch.Generator().manual_seed(SEED)
    train_part, valid_part = random_split(
        train_set,
        lengths=[train_num, valid_num],
        generator=rng,
    )
    return train_part, valid_part

def load_imdb_data():
    """
    Loads the IMDB dataset and creates train, validation and test sets.

    Steps: build an initial vocab from the IMDB train split, rebuild it with
    GloVe '6B' vectors (limited by MAX_VOCAB_SIZE / MIN_FREQ), re-load IMDB with
    that vocab and the spaCy tokenizer, split train into train/validation and
    wrap all three sets in DataLoaders that use `pad_trim` as collate function.

    The first run is slow because the GloVe vectors and the dataset are downloaded;
    the GloVe files are cached under the Drive folder so later runs can reuse them.

    :return: train_set, valid_set, test_set, train_loader, valid_loader, test_loader, vocab, pad_id
    """
    drive_root = "/content/drive/MyDrive"
    vector_cache_path = os.path.join(drive_root, ".vector_cache")
    data_path = os.path.join(drive_root, ".data")
    print(drive_root)
    # exist_ok avoids the check-then-create race and is a no-op on later runs.
    os.makedirs(vector_cache_path, exist_ok=True)
    # NOTE(review): the IMDB loaders below are called without a root argument, so
    # this directory is not handed to them — confirm whether it should be.
    os.makedirs(data_path, exist_ok=True)

    # Extract the initial vocab from the IMDB dataset
    vocab = IMDB(data_select='train')[0].get_vocab()
    print(vocab.vectors)

    # Create GloVe embeddings based on original vocab word frequencies.
    # The cache directory is passed explicitly: without it the vectors are looked
    # up under a relative '.vector_cache' folder, which is where the recorded
    # FileNotFoundError for 'glove.6B.zip' came from.
    glove_vectors = torchtext.vocab.GloVe(name='6B', cache=vector_cache_path)
    glove_vocab = torchtext.vocab.Vocab(counter=vocab.freqs,
                                        max_size=MAX_VOCAB_SIZE,
                                        min_freq=MIN_FREQ,
                                        vectors=glove_vectors)
    print(glove_vocab.vectors)

    # Acquire 'Spacy' tokenizer for the vocab words
    tokenizer = get_tokenizer('spacy', 'en_core_web_sm')

    # Acquire train and test IMDB sets with the GloVe vocab and 'Spacy' tokenizer
    train_set, test_set = IMDB(tokenizer=tokenizer, vocab=glove_vocab)
    vocab = train_set.get_vocab()  # Extract the vocab of the acquired train set
    pad_id = vocab['<pad>']  # Extract the token used for padding

    # Split the train set into train and validation sets according to SPLIT_RATIO
    train_set, valid_set = split_train_val(train_set)
    print(vocab.vectors)

    train_loader = DataLoader(train_set, batch_size=batch_size, collate_fn=pad_trim)
    valid_loader = DataLoader(valid_set, batch_size=batch_size, collate_fn=pad_trim)
    test_loader = DataLoader(test_set, batch_size=batch_size, collate_fn=pad_trim)
    return train_set, valid_set, test_set, train_loader, valid_loader, test_loader, vocab, pad_id

# Reproducibility: one seed for the NumPy and PyTorch global generators and
# for the train/validation split.
SEED = 0
np.random.seed(SEED)
torch.manual_seed(SEED)

# Run on the GPU when one is available.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# VOCAB AND DATASET HYPERPARAMETERS, DO NOT CHANGE
MAX_VOCAB_SIZE = 25_000  # Maximum number of words in the vocabulary
MIN_FREQ = 10            # Only words occurring at least this often are kept
MAX_SEQ_LEN = 500        # Every review is trimmed/padded to this many tokens
SPLIT_RATIO = 1          # Fraction of the train set kept for training (1 leaves the validation split empty)

# YOUR HYPERPARAMETERS
num_of_blocks = 1        # number of stacked transformer blocks
batch_size = 32          # samples per DataLoader batch
num_of_epochs = 10       # passes over the training set
learning_rate = 1e-4     # optimizer learning rate
dropout_rate = 0.4       # NOTE(review): not passed to the model below — confirm it is meant to be used
num_heads = 1            # number of attention heads per block
# Load the IMDB dataset
train_set, valid_set, test_set, train_loader, valid_loader, test_loader, vocab, pad_id = load_imdb_data()
test_acc = 0
model = MyTransformer(vocab, MAX_SEQ_LEN, num_of_blocks, num_heads).to(device)

# Checkpoint files on the mounted Drive. Every existence check below tests the
# same file that is then loaded, so a check can never pass for one file and
# crash on another that is missing.
FINAL_MODEL_PATH = '/content/drive/MyDrive/model.pth'   # written at the end of every run
BEST_MODEL_PATH = '/content/drive/MyDrive/model2.pth'   # written when test accuracy exceeds 0.9

# Warm-start from the best checkpoint of an earlier run, if there is one.
if os.path.exists(BEST_MODEL_PATH):
    print("using model2")
    # map_location lets a checkpoint saved on a GPU load on a CPU-only runtime.
    model.load_state_dict(torch.load(BEST_MODEL_PATH, map_location=device))

optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=1)
# The scheduler is created but its step() call is disabled in the loop below.
scheduler = StepLR(optimizer, step_size=5, gamma=0.1)
loss_function = torch.nn.BCEWithLogitsLoss()
# NOTE(review): l1_lambda / l2_lambda are not used by the loss below — confirm
# whether explicit penalty terms were intended.
l1_lambda =0.0001
l2_lambda=0.01

# Train only if the final checkpoint does not exist yet; otherwise load it.
if not os.path.exists(FINAL_MODEL_PATH):
    for epoch in range(num_of_epochs):
        model.train()
        batch_losses = []  # per-batch training loss of this epoch, kept for inspection
        total_correct = 0
        for batch in tqdm(train_loader, desc='Train', total=len(train_loader)):
            inputs_embeddings, labels = batch
            inputs_embeddings = inputs_embeddings.to(device)
            labels = labels.to(device)
            optimizer.zero_grad()
            outputs = model(inputs_embeddings).squeeze(1) # (batch_size, 1) -> (batch_size)
            # A logit above 0 corresponds to a predicted probability above 0.5.
            total_correct += ((outputs > 0) == labels).sum().item()
            loss = loss_function(outputs, labels)
            batch_losses.append(loss.item())
            loss.backward()
            optimizer.step()
        # Log gradient norms once per epoch. They are read after backward(): the
        # gradients of the last batch are still attached to the parameters here,
        # whereas reading them right after zero_grad() yields nothing useful.
        grad_norms = {name: p.grad.norm().item() for name, p in model.named_parameters() if p.grad is not None}
        print("Gradient norms:", grad_norms)
        #scheduler.step()  # Adjust the learning rate
        print(f'Accuracy: {total_correct / len(train_set)}')

        # NOTE(review): the test set is used for the early-stop decision because
        # SPLIT_RATIO = 1 leaves the validation split empty.
        model.eval()
        with torch.no_grad():
            total_correct = 0
            for batch in tqdm(test_loader, desc='Test', total=len(test_loader)):
                inputs_embeddings, labels = batch
                inputs_embeddings = inputs_embeddings.to(device)
                labels = labels.to(device)
                outputs = model(inputs_embeddings).squeeze(1)
                total_correct += ((outputs > 0) == labels).sum().item()
            test_acc = total_correct / len(test_set)
            if test_acc > 0.9:
                print("found one!")
                print(f'Accuracy: {test_acc}')
                torch.save(model.state_dict(), BEST_MODEL_PATH)
                break
            print(f'Accuracy: {test_acc}')

else:
    model.load_state_dict(torch.load(FINAL_MODEL_PATH, map_location=device))

# Save the model
torch.save(model.state_dict(), FINAL_MODEL_PATH)


3.10.12 (main, Nov 20 2023, 15:14:05) [GCC 11.4.0]
/content/drive/MyDrive


25000lines [00:05, 4448.50lines/s]


None


.vector_cache/glove.6B.zip: 0.00B [00:09, ?B/s]


FileNotFoundError: [Errno 2] No such file or directory: '.vector_cache/glove.6B.zip'

In [None]:
# Mount Google Drive at /content/drive so that the '/content/drive/MyDrive/...'
# paths used elsewhere in this notebook are available.
# NOTE(review): this cell sits below the first cell that uses those paths; on a
# fresh runtime it presumably has to run before that cell — confirm the cell order.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import math

class PositionalEncoding(nn.Module):
    """Fixed sinusoidal positional encoding added to a sequence-first input.

    The table is precomputed with shape ``(max_len, 1, d_model)`` and sliced
    along dimension 0 of the input, so ``forward`` expects ``x`` laid out as
    ``(seq_len, batch, d_model)``. Batch-first callers must transpose first.

    Args:
        d_model: Size of the feature dimension (odd values are supported).
        max_len: Number of positions to precompute.
    """

    def __init__(self, d_model, max_len=500):
        super(PositionalEncoding, self).__init__()
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * -(math.log(10000.0) / d_model))
        encoding = torch.zeros(max_len, d_model)
        encoding[:, 0::2] = torch.sin(position * div_term)
        # With an odd d_model there is one fewer odd column than even column,
        # so the cosine term uses only the first d_model // 2 frequencies.
        encoding[:, 1::2] = torch.cos(position * div_term[: d_model // 2])
        # Non-persistent buffer: follows the module across .to()/.cuda() calls
        # but is not written to state_dict, so existing checkpoints still load.
        self.register_buffer('encoding', encoding.unsqueeze(1), persistent=False)

    def forward(self, x):
        """Return ``x`` plus the encoding for its first ``x.size(0)`` positions.

        The encoding is divided by ``sqrt(d_model)`` before being added.
        """
        scale = math.sqrt(self.encoding.size(-1))
        # .to() is a no-op once the module lives on the same device as x.
        positional = self.encoding[:x.size(0)].to(x.device) / scale
        return x + positional

class MultiHeadAttention(nn.Module):
    """Multi-head scaled dot-product self-attention.

    The first dimension of the input is treated as the batch and the last as
    the feature dimension, which is split evenly across ``num_heads`` heads.
    Queries, keys and values are all projected from the same input.
    """

    def __init__(self, input_dim, num_heads):
        super().__init__()
        # The feature dimension has to divide evenly across the heads.
        assert input_dim % num_heads == 0
        self.num_heads = num_heads
        self.dim_per_head = input_dim // num_heads
        self.W_q = nn.Linear(input_dim, input_dim)
        self.W_k = nn.Linear(input_dim, input_dim)
        self.W_v = nn.Linear(input_dim, input_dim)
        self.fc_out = nn.Linear(input_dim, input_dim)

    def _split_heads(self, projected, n):
        # (n, seq, heads * dim_per_head) -> (n, heads, seq, dim_per_head)
        return projected.view(n, -1, self.num_heads, self.dim_per_head).transpose(1, 2)

    def forward(self, x):
        """Attend every position of ``x`` to every other and return the mixed result."""
        n = x.size(0)
        queries = self._split_heads(self.W_q(x), n)
        keys = self._split_heads(self.W_k(x), n)
        values = self._split_heads(self.W_v(x), n)

        # Dot-product similarities, scaled by sqrt of the per-head width.
        logits = torch.matmul(queries, keys.transpose(-2, -1)) / math.sqrt(self.dim_per_head)
        weights = F.softmax(logits, dim=-1)
        mixed = torch.matmul(weights, values)

        # Merge the heads back into a single feature dimension.
        merged = mixed.transpose(1, 2).contiguous().view(n, -1, self.num_heads * self.dim_per_head)
        return self.fc_out(merged)

class MyLayerNorm(nn.Module):
    """Normalises the last dimension and applies a learnable scale and shift.

    The statistics come from ``Tensor.mean`` / ``Tensor.std`` with their
    default settings, and the 1e-6 stabiliser is added to the standard
    deviation itself before dividing.
    """

    def __init__(self, input_dim):
        super().__init__()
        self.gamma = nn.Parameter(torch.ones(input_dim))   # learnable scale
        self.beta = nn.Parameter(torch.zeros(input_dim))   # learnable shift

    def forward(self, x):
        """Return ``gamma * (x - mean) / (std + 1e-6) + beta`` over the last dim."""
        centered = x - x.mean(dim=-1, keepdim=True)
        spread = x.std(dim=-1, keepdim=True) + 1e-6
        return self.gamma * centered / spread + self.beta

class MyTransformerBlock(nn.Module):
    """One encoder block: self-attention, then a two-layer ReLU feed-forward.

    Each sub-layer's output goes through dropout, is added to the sub-layer's
    input (residual connection) and is then normalised with ``MyLayerNorm``.

    Args:
        input_dim: Feature dimension of the input; also the width of both
            feed-forward layers.
        num_heads: Number of attention heads.
        dropout: Dropout probability used after both sub-layers. Defaults to
            0.3, the value that was previously hard-coded.
    """

    def __init__(self, input_dim, num_heads, dropout=0.3):
        super(MyTransformerBlock, self).__init__()
        self.attention = MultiHeadAttention(input_dim, num_heads)
        self.norm1 = MyLayerNorm(input_dim)
        self.fc1 = nn.Linear(input_dim, input_dim)
        self.fc2 = nn.Linear(input_dim, input_dim)
        self.dropout = nn.Dropout(dropout)
        self.norm2 = MyLayerNorm(input_dim)

    def forward(self, x):
        """Apply the attention and feed-forward sub-layers to ``x``."""
        # Attention sub-layer with residual connection.
        out = self.attention(x)
        x = self.norm1(self.dropout(out) + x)
        # Feed-forward sub-layer with residual connection.
        out = self.fc2(F.relu(self.fc1(x)))
        out = self.norm2(self.dropout(out) + x)
        return out

class MyTransformer(nn.Module):
    """Binary sequence classifier with embeddings trained from scratch.

    Token ids are embedded, combined with positional encodings, passed
    through ``num_of_blocks`` transformer blocks, averaged over the sequence
    dimension and projected to a single logit per example.

    Args:
        vocab: Any object supporting ``len()``; its length is the number of
            rows in the embedding table.
        max_len: Number of positions the positional encoding precomputes.
        num_of_blocks: Number of stacked transformer blocks.
        num_heads: Attention heads per block.
        embed_dim: Embedding/feature width. Defaults to 768, the value that
            was previously repeated as a literal.
    """

    def __init__(self, vocab, max_len, num_of_blocks, num_heads, embed_dim=768):
        super(MyTransformer, self).__init__()
        self.embedding = nn.Embedding(len(vocab), embed_dim)
        self.positional_encoding = PositionalEncoding(embed_dim, max_len)
        self.blocks = nn.ModuleList([MyTransformerBlock(embed_dim, num_heads) for _ in range(num_of_blocks)])
        self.fc = nn.Linear(embed_dim, 1)

    def forward(self, x):
        """Map a ``(batch, seq_len)`` tensor of token ids to ``(batch, 1)`` logits."""
        x = self.embedding(x)  # (batch, seq_len, embed_dim)
        # PositionalEncoding indexes positions along dimension 0, while the rest of
        # the model is batch-first. Feeding it the batch-first tensor directly would
        # give every token of sample i the encoding of position i, so the tensor is
        # presented sequence-first and transposed back afterwards.
        x = self.positional_encoding(x.transpose(0, 1)).transpose(0, 1).contiguous()
        for block in self.blocks:
            x = block(x)
        x = torch.mean(x, dim=1)  # average over the sequence dimension
        x = self.fc(x)
        return x


In [None]:
import torch
from torch.utils.data import DataLoader, random_split
from torchtext.data.functional import to_map_style_dataset
from torchtext.datasets import IMDB
from transformers import RobertaTokenizer
from tqdm import tqdm
import os

# Pretrained 'roberta-base' tokenizer from Hugging Face transformers; it is used
# by tokenize_and_encode to turn raw review text into token ids.
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

# Tensors are placed on the GPU when one is available, otherwise on the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

def tokenize_and_encode(sentences):
    """Tokenize text with the module-level tokenizer and return its input ids.

    Every sequence is truncated or padded to exactly 512 tokens and the ids
    are returned as a PyTorch tensor.
    """
    encoded = tokenizer(
        sentences,
        padding='max_length',
        truncation=True,
        max_length=512,
        return_tensors="pt",
    )
    return encoded.input_ids

def load_imdb_data(train_fraction=0.8, batch_size=32, seed=None):
    """Load IMDB and build train/validation/test splits with their DataLoaders.

    Args:
        train_fraction: Share of the original train split kept for training;
            the remainder becomes the validation set. Defaults to 0.8.
        batch_size: Batch size of all three DataLoaders. Defaults to 32.
        seed: Optional seed for the train/validation split. With the default
            ``None`` the split uses PyTorch's global random generator.

    Returns:
        train_data, valid_data, test_list, train_loader, valid_loader, test_loader
    """
    # Load the IMDB dataset
    train_iter, test_iter = IMDB()

    # Materialise the iterators so the data can be indexed and re-used
    train_list = list(train_iter)
    test_list = list(test_iter)

    # Split train dataset into train and validation
    train_size = int(len(train_list) * train_fraction)
    valid_size = len(train_list) - train_size
    # Only pass a generator when a seed was requested, so the default call
    # behaves exactly as before.
    split_kwargs = {} if seed is None else {'generator': torch.Generator().manual_seed(seed)}
    train_data, valid_data = random_split(train_list, [train_size, valid_size], **split_kwargs)

    # Create a DataLoader for each split; only the training data is shuffled
    train_loader = DataLoader(train_data, batch_size=batch_size, collate_fn=collate_batch, shuffle=True)
    valid_loader = DataLoader(valid_data, batch_size=batch_size, collate_fn=collate_batch)
    test_loader = DataLoader(test_list, batch_size=batch_size, collate_fn=collate_batch)

    return train_data, valid_data, test_list, train_loader, valid_loader, test_loader

def collate_batch(batch):
    """Turn a list of ``(label, text)`` pairs into model-ready tensors.

    Returns:
        inputs: token ids from ``tokenize_and_encode``, one row per review,
            moved to ``device``.
        labels: float32 tensor with 1.0 for positive and 0.0 for negative
            reviews, moved to ``device``.
    """
    label_list, text_list = [], []
    for (_label, _text) in batch:
        # Accept both label encodings: the string 'pos' and the integer 2, which
        # newer torchtext releases are documented to use for positive reviews
        # (1 = negative). Comparing against 'pos' alone would silently label
        # every review as negative with integer labels.
        label_list.append(1 if _label in ('pos', 2) else 0)
        text_list.append(_text)
    labels = torch.tensor(label_list, dtype=torch.float32).to(device)
    # Tokenize the whole batch in one call instead of once per review; every row
    # is padded/truncated to the same length, so the result is already stacked.
    inputs = tokenize_and_encode(text_list).to(device)
    return inputs, labels

# Names this cell needs that its own import block does not provide.
import numpy as np
from torch.optim.lr_scheduler import StepLR

# Seed before loading so the train/validation split and the shuffling order are
# reproducible.
np.random.seed(0)
torch.manual_seed(0)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

num_of_blocks = 1      # number of stacked transformer blocks
batch_size = 32        # matches the default batch size of load_imdb_data
num_of_epochs = 10     # passes over the training set
learning_rate = 0.0001 # optimizer learning rate
dropout_rate = 0.4     # NOTE(review): not passed to the model below — confirm it is meant to be used
num_heads = 1          # number of attention heads per block
MAX_TOKENS = 512       # must equal max_length in tokenize_and_encode

# Load the data once (the dataset was previously loaded and split twice).
train_set, valid_set, test_set, train_loader, valid_loader, test_loader = load_imdb_data()
# Aliases for the names the first, now removed, load call used to define.
train_data, valid_data, test_data = train_set, valid_set, test_set

test_acc = 0
# The embedding table must cover every id the RoBERTa tokenizer can emit and the
# positional table every position of a padded sequence, so both sizes are taken
# from the tokenizer setup instead of variables left over from another cell.
model = MyTransformer(tokenizer, MAX_TOKENS, num_of_blocks, num_heads).to(device)

# Checkpoints of this model get their own files: its parameters are built from a
# different vocabulary and embedding width than the model trained in the earlier
# cell, so sharing 'model.pth' / 'model2.pth' would make loading fail.
FINAL_MODEL_PATH = '/content/drive/MyDrive/model_roberta_tok.pth'
BEST_MODEL_PATH = '/content/drive/MyDrive/model_roberta_tok_best.pth'

# Warm-start from the best checkpoint of an earlier run, if there is one.
if os.path.exists(BEST_MODEL_PATH):
    print("using model2")
    # map_location lets a checkpoint saved on a GPU load on a CPU-only runtime.
    model.load_state_dict(torch.load(BEST_MODEL_PATH, map_location=device))

optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=1)
# The scheduler is created but its step() call is disabled in the loop below.
scheduler = StepLR(optimizer, step_size=5, gamma=0.1)
loss_function = torch.nn.BCEWithLogitsLoss()
# NOTE(review): l1_lambda / l2_lambda are not used by the loss below — confirm
# whether explicit penalty terms were intended.
l1_lambda =0.0001
l2_lambda=0.01

# Train only if the final checkpoint does not exist yet; otherwise load it.
if not os.path.exists(FINAL_MODEL_PATH):
    for epoch in range(num_of_epochs):
        model.train()
        batch_losses = []  # per-batch training loss of this epoch, kept for inspection
        total_correct = 0
        for batch in tqdm(train_loader, desc='Train', total=len(train_loader)):
            inputs_embeddings, labels = batch
            inputs_embeddings = inputs_embeddings.to(device)
            labels = labels.to(device)
            optimizer.zero_grad()
            outputs = model(inputs_embeddings).squeeze(1) # (batch_size, 1) -> (batch_size)
            # A logit above 0 corresponds to a predicted probability above 0.5.
            total_correct += ((outputs > 0) == labels).sum().item()
            loss = loss_function(outputs, labels)
            batch_losses.append(loss.item())
            loss.backward()
            optimizer.step()
        # Log gradient norms once per epoch. They are read after backward(): the
        # gradients of the last batch are still attached to the parameters here,
        # whereas reading them right after zero_grad() yields nothing useful.
        grad_norms = {name: p.grad.norm().item() for name, p in model.named_parameters() if p.grad is not None}
        print("Gradient norms:", grad_norms)
        #scheduler.step()  # Adjust the learning rate
        print(f'Accuracy: {total_correct / len(train_set)}')

        # NOTE(review): the early-stop decision is made on the test set although a
        # validation loader exists — confirm this is intended.
        model.eval()
        with torch.no_grad():
            total_correct = 0
            for batch in tqdm(test_loader, desc='Test', total=len(test_loader)):
                inputs_embeddings, labels = batch
                inputs_embeddings = inputs_embeddings.to(device)
                labels = labels.to(device)
                outputs = model(inputs_embeddings).squeeze(1)
                total_correct += ((outputs > 0) == labels).sum().item()
            test_acc = total_correct / len(test_set)
            if test_acc > 0.9:
                print("found one!")
                print(f'Accuracy: {test_acc}')
                torch.save(model.state_dict(), BEST_MODEL_PATH)
                break
            print(f'Accuracy: {test_acc}')

else:
    model.load_state_dict(torch.load(FINAL_MODEL_PATH, map_location=device))

# Save the model
torch.save(model.state_dict(), FINAL_MODEL_PATH)


ImportError: cannot import name 'to_map_style_dataset' from 'torchtext.data.functional' (/usr/local/lib/python3.10/dist-packages/torchtext/data/functional.py)

In [None]:
# Show the installed torchtext version, then upgrade it to the latest release.
# NOTE(review): the upgrade replaces the torchtext==0.6.0 pin from the first cell
# (the recorded output shows torchtext 0.18.0 and torch 2.3.0 being downloaded) —
# confirm which version the notebook is meant to run on.
!pip show torchtext
!pip install --upgrade torchtext

Name: torchtext
Version: 0.6.0
Summary: Text utilities and datasets for PyTorch
Home-page: https://github.com/pytorch/text
Author: PyTorch core devs and James Bradbury
Author-email: jekbradbury@gmail.com
License: BSD
Location: /usr/local/lib/python3.10/dist-packages
Requires: numpy, requests, sentencepiece, six, torch, tqdm
Required-by: 
Collecting torchtext
  Downloading torchtext-0.18.0-cp310-cp310-manylinux1_x86_64.whl (2.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m16.4 MB/s[0m eta [36m0:00:00[0m
Collecting torch>=2.3.0 (from torchtext)
  Downloading torch-2.3.0-cp310-cp310-manylinux1_x86_64.whl (779.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m779.1/779.1 MB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-nccl-cu12==2.20.5 (from torch>=2.3.0->torchtext)
  Downloading nvidia_nccl_cu12-2.20.5-py3-none-manylinux2014_x86_64.whl (176.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [3

In [None]:
# Upgrade torch to the latest release.
# NOTE(review): unpinned; a runtime restart is presumably needed before the new
# version is picked up — confirm.
!pip install --upgrade torch


