In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F

from torch.optim import Adam
from torch.utils.data import TensorDataset, DataLoader

In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
#device = torch.device("cpu")
device

device(type='cuda')

In [3]:
import pandas as pd

# Load the CSV file
df = pd.read_csv('./datasets/IMDBDataset.csv')
df = df.head(4096)
df.shape[0]

4096

In [4]:
from collections import Counter

def build_vocab(texts, min_freq=1):
    counter = Counter()
    for text in texts:
        words = text.lower().split()
        counter.update(words)
    # Only keep words with a frequency greater than min_freq
    vocab = {word: idx + 2 for idx, (word, freq) in enumerate(counter.items()) if freq >= min_freq}
    vocab['<PAD>'] = 0
    vocab['<UNK>'] = 1
    return vocab

# Build vocabulary from reviews
vocab = build_vocab(df['review'].tolist())

In [5]:
def tokenize(text, vocab):
    return [vocab.get(word, vocab['<UNK>']) for word in text.lower().split()]

df['tokenized_review'] = df['review'].apply(lambda x: tokenize(x, vocab))

In [6]:
def tokenize(text, vocab):
    return [vocab.get(word, vocab['<UNK>']) for word in text.lower().split()]

df['tokenized_review'] = df['review'].apply(lambda x: tokenize(x, vocab))

In [19]:
import torch
from torch.utils.data import Dataset

class IMDBDataset(Dataset):
    def __init__(self, reviews, sentiments, vocab, max_length=100):
        self.reviews = reviews
        self.sentiments = sentiments
        self.vocab = vocab
        self.max_length = max_length

    def __len__(self):
        return len(self.reviews)

    def __getitem__(self, idx):
        review = self.reviews[idx]
        sentiment = self.sentiments[idx]

        # Pad or truncate the review to max_length
        if len(review) > self.max_length:
            review = review[:self.max_length]
        else:
            review = review + [self.vocab['<PAD>']] * (self.max_length - len(review))

        #print(f"sentiment: {sentiment}")

        sentiment_value = 0
        if sentiment == 'positive':
            sentiment_value = 1

        return {
            'input_ids': torch.tensor(review, dtype=torch.long),
            'sentiment': torch.tensor(sentiment_value, dtype=torch.long)
        }

In [20]:
from torch.utils.data import DataLoader, random_split

# Create the dataset
dataset = IMDBDataset(df['tokenized_review'].tolist(), df['sentiment'].tolist(), vocab)

# Split the data into training and testing sets
train_size = int(0.8 * len(dataset))
test_size = len(dataset) - train_size
train_dataset, test_dataset = random_split(dataset, [train_size, test_size])

# Create DataLoaders
train_dataloader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=32, shuffle=False)

In [21]:
for batch in train_dataloader:
    input_tokens = batch['input_ids']
    labels = batch['sentiment']
    print(f"Input Tokens shape: {input_tokens.shape}")
    print(f"Labels shape: {labels.shape}")
    break

Input Tokens shape: torch.Size([32, 100])
Labels shape: torch.Size([32])


In [22]:
# Iterate through the DataLoader
for batch in train_dataloader:
    input_seq = batch['input_ids']
    target_seq = batch['sentiment']
    
    print("Input Sequence:")
    print(input_seq)  # (batch_size, sequence_length)
    
    print("Target Sequence:")
    print(target_seq)  # (batch_size,)
    
    # Example break to print only one batch
    break

token_to_id = dataset.vocab
print(len(token_to_id))
id_to_token = dict(map(reversed, token_to_id.items()))
print(len(id_to_token))

Input Sequence:
tensor([[   10,   540,   156,  ...,  1198,   806,  7488],
        [  469,   249,   383,  ...,    22,   796, 36823],
        [   12,   161,   768,  ...,  4148,    95,  3587],
        ...,
        [  147,   945,  1243,  ...,  9206,  1054,    54],
        [69694, 16464,  3112,  ..., 42296,  1504,     4],
        [  313,   705,    95,  ...,   897, 27141,    56]])
Target Sequence:
tensor([1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1,
        0, 0, 0, 0, 0, 1, 1, 0])
80921
80921


## using transformer architecture for sentiment analysis

In [71]:
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_len):
        super().__init__()
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(start=0, end=max_len, step=1).float().unsqueeze(1)
        embedding_index = torch.arange(start=0, end=d_model, step=2).float()
        div_term = 1 / torch.tensor(10000.0)**(embedding_index / d_model)
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        self.register_buffer('pe',pe)

    def forward(self, word_embeddings):
        print(word_embeddings.shape)
        return word_embeddings + self.pe[:word_embeddings.size(0), :]

class Attention(nn.Module):
    def __init__(self, d_model=2):
        super().__init__()
        self.W_q = nn.Linear(in_features=d_model, out_features=d_model, bias=False)
        self.W_k = nn.Linear(in_features=d_model, out_features=d_model, bias=False)
        self.W_v = nn.Linear(in_features=d_model, out_features=d_model, bias=False)
        self.row_dim = 0
        self.col_dim = 1

    def forward(self, encodings, mask=None):
        q = self.W_q(encodings)
        k = self.W_k(encodings)
        v = self.W_v(encodings)

        # Ensure k has the same shape as q before transpose
        assert k.shape == q.shape
        
        # Transpose k to align with q for dot product
        k_transposed = k.transpose(-1, -2)
        
        sims = torch.matmul(q, k_transposed)
        scaled_sims = sims / torch.tensor(k.size(1)**0.5)

        if mask is not None:
            #mask = mask.to(device)
            scaled_sims = scaled_sims.masked_fill(mask=mask, value=-1e9)

        attention_percents = F.softmax(scaled_sims)
        attention_scores = torch.matmul(attention_percents, v)
        return attention_scores

class DecoderOnlyTransformer(nn.Module):
    def __init__(self, num_tokens, d_model, max_len, using_mask=True):
        super().__init__()
        self.we = nn.Embedding(num_embeddings=num_tokens, embedding_dim=d_model)
        #self.pe = PositionalEncoding(d_model=d_model, max_len=max_len)
        self.self_attention = Attention(d_model=d_model)
        self.fc_layer = nn.Linear(in_features=d_model, out_features=1)
        self.loss = nn.CrossEntropyLoss()
        self.using_mask = using_mask

    def forward(self, token_ids):
        # calculate word embeddings out of the tokens
        word_embeddings = self.we(token_ids)
        # apply positional encoding (using the PositionalEncoder layer) to the word embeddings
        # position_encoded = self.pe(word_embeddings)
        # create mask for decoder only transformer so it can not cheat
        if (self.using_mask == True):
            mask_ones = torch.ones((token_ids.size(dim=0), token_ids.size(dim=0)))
            #print(mask_ones)
            mask = torch.tril(mask_ones)
            #print(mask)
            mask = mask == 0
            #print(mask)
            # calculate self attention with the Attention Layer
            self_attention_values = self.self_attention(word_embeddings, mask=mask)
        else:
            self_attention_values = self.self_attention(word_embeddings)    
        # add original position_encoded values to the calculated self attention values (residual connection)
        # residual_connection_values = position_encoded + self_attention_values
        # use the final linear layer to calculate the output probabilities
        return self.fc_layer(self_attention_values.squeeze(0))

In [150]:
vocab_size = len(dataset.vocab)
print(vocab_size)
embedding_dim = 100
hidden_dim = 256
output_dim = 1
max_len = 200
sentiment_model = DecoderOnlyTransformer(num_tokens=vocab_size, d_model=8, max_len=max_len, using_mask=False)

sentiment_model.to(device)
optimizer = Adam(sentiment_model.parameters(), lr=0.001)
criterion = nn.BCEWithLogitsLoss()
sentiment_model.train()
epochs = 45

for epoch in range(epochs):
    epoch_loss = 0
    total_loss = 0
    for data in train_dataloader:
        optimizer.zero_grad()
        input_tokens = data['input_ids']
        #print(f"input_ids: {input_tokens}")
        labels = data['sentiment']
        input_tokens = input_tokens.to(device)  # Move inputs to GPU if available
        labels = labels.to(device)  # Move labels to GPU if available
        prediction = sentiment_model(input_tokens).squeeze(1)
        prediction = prediction[:, -1, 0]
        labels = labels.view(-1).float()  # [batch_size * seq_length
        #print(prediction.shape)
        #print(prediction)
        #print(labels.shape)
        #print(labels)
        loss = criterion(prediction, labels) 
        loss.backward()
        optimizer.step()
        total_loss += loss.item() * input_tokens.size(0)
        epoch_loss += loss.item()
    average_loss = total_loss / len(train_dataloader.dataset)
    print(f"Epoch {epoch}, Train Loss: {average_loss}")

80921


  attention_percents = F.softmax(scaled_sims)


Epoch 0, Train Loss: 0.6992613198701861
Epoch 1, Train Loss: 0.6901238225173019
Epoch 2, Train Loss: 0.681618024650802
Epoch 3, Train Loss: 0.6676254936918089
Epoch 4, Train Loss: 0.6408633167635972
Epoch 5, Train Loss: 0.5960057391279056
Epoch 6, Train Loss: 0.5351371246234256
Epoch 7, Train Loss: 0.46966583533747003
Epoch 8, Train Loss: 0.40255854971097355
Epoch 9, Train Loss: 0.3397397503748045
Epoch 10, Train Loss: 0.281528045151551
Epoch 11, Train Loss: 0.2287954952435907
Epoch 12, Train Loss: 0.18438456768126787
Epoch 13, Train Loss: 0.1468717411970117
Epoch 14, Train Loss: 0.1155766293344512
Epoch 15, Train Loss: 0.09001529990025638
Epoch 16, Train Loss: 0.0702984923944878
Epoch 17, Train Loss: 0.0547681837679559
Epoch 18, Train Loss: 0.04235404078770197
Epoch 19, Train Loss: 0.03257558666204397
Epoch 20, Train Loss: 0.025428431159374616
Epoch 21, Train Loss: 0.019941985949810174
Epoch 22, Train Loss: 0.016098586900980736
Epoch 23, Train Loss: 0.012808822915063186
Epoch 24, Trai

In [151]:
import torch

# Sample tensor of shape [32, 100, 1]
output_tensor = torch.randn(32, 100, 1)

# Using the value from the last time step
output_tensor_last_step = output_tensor[:, -1, 0]

# The resulting tensor will have shape [32]
print(output_tensor_last_step.shape)
print(output_tensor_last_step)


torch.Size([32])
tensor([ 2.6792e-01,  3.4408e-01,  2.6303e+00,  3.6051e-01,  7.4766e-01,
         1.2125e+00, -4.2127e-01, -6.9262e-01,  4.0398e-01,  1.2544e-01,
         6.1825e-01,  7.1614e-02,  2.0520e+00, -1.3205e+00, -7.8493e-01,
         2.6596e-01,  2.2210e-01,  1.6068e+00, -2.4850e-04, -2.0339e-01,
        -8.1538e-02,  5.6586e-01,  3.1171e-01,  5.5484e-01,  1.0787e+00,
         2.7716e-01,  1.3991e-01,  1.4559e-01,  7.0210e-01,  3.1345e-01,
         4.5697e-01,  9.2115e-01])


In [152]:
test_datas = [
    {"text": "I love this product!", "label": 1},
    {"text": "This product is terrible.", "label": 0},
    {"text": "I'm so happy with this purchase!", "label": 1},
    {"text": "I regret buying this.", "label": 0},
    {"text": "This is the best thing I've ever bought!", "label": 1},
    {"text": "I'm disappointed with this product.", "label": 0},
    {"text": "I would definitely recommend this!", "label": 1},
    {"text": "This is a waste of money.", "label": 0},
    {"text": "I'm so impressed with this!", "label": 1},
    {"text": "I don't like this at all.", "label": 0},
    {"text": "I don't like the movie.", "label": 0},
    {"text": "I love this.", "label": 1},
    {"text": "This movie is bad.", "label": 0},
    {"text": "This is a waste of time.", "label": 0},
    {"text": "I regret looking this.", "label": 0},
    {"text": "I movie is crap.", "label": 0},
    {"text": "I love the movie.", "label": 1},
    {"text": "The movie is a good movie.", "label": 1},
    {"text": "I loved the new Marvel movie, it was so action-packed and exciting!", "label": 1},
    {"text": "The latest rom-com I saw was so cheesy and predictable, I hated it.", "label": 0},
    {"text": "The special effects in the new Star Wars movie were mind-blowing!", "label": 1},
    {"text": "I was really disappointed with the ending of the latest Game of Thrones season.", "label": 0},
    {"text": "The new Pixar movie was so heartwarming and funny.", "label": 1},
    {"text": "The latest horror movie I saw was so boring.", "label": 0},
    {"text": "The acting in the new biopic was incredible, the lead actor deserved an Oscar.", "label": 1},
    {"text": "The plot of the latest sci-fi movie was so confusing and convoluted, I got lost.", "label": 0},
    {"text": "The new comedy movie was hilarious, I laughed out loud the whole time!", "label": 1},
    {"text": "The latest drama movie I saw was so depressing and slow, I didn't enjoy it at all.", "label": 0}
]

In [153]:
test_texts = [sample['text'] for sample in test_datas]
print(test_texts)

['I love this product!', 'This product is terrible.', "I'm so happy with this purchase!", 'I regret buying this.', "This is the best thing I've ever bought!", "I'm disappointed with this product.", 'I would definitely recommend this!', 'This is a waste of money.', "I'm so impressed with this!", "I don't like this at all.", "I don't like the movie.", 'I love this.', 'This movie is bad.', 'This is a waste of time.', 'I regret looking this.', 'I movie is crap.', 'I love the movie.', 'The movie is a good movie.', 'I loved the new Marvel movie, it was so action-packed and exciting!', 'The latest rom-com I saw was so cheesy and predictable, I hated it.', 'The special effects in the new Star Wars movie were mind-blowing!', 'I was really disappointed with the ending of the latest Game of Thrones season.', 'The new Pixar movie was so heartwarming and funny.', 'The latest horror movie I saw was so boring.', 'The acting in the new biopic was incredible, the lead actor deserved an Oscar.', 'The pl

In [154]:
min_len = 10
for test_data in test_datas:
    test_input_text = test_data['text']
    test_label = test_data['label']
    
    test_indices = [vocab.get(word, vocab['<UNK>']) for word in test_input_text.lower().split()]
    if len(test_indices) < min_len:
        test_indices += [0] * (min_len - len(test_indices))  # Use 0 instead of self.vocab['<PAD>']
    test_input_ids = torch.tensor(test_indices)

    # Convert label to tensor
    test_label = torch.tensor(test_label)

    test_input_ids = test_input_ids.to(device)  # Move inputs to GPU if available
    test_label = test_label.to(device)  # Move labels to GPU if available
    #print(test_input_ids)
    #print(test_label)

    # Adding a new dimension at the beginning to change the shape to [1, 10]
    reshaped_test_input_ids = test_input_ids.unsqueeze(0)

    #print(f"Original Tensor shape: {test_input_ids.shape}")  # Output: torch.Size([10])
    #print(f"Reshaped Tensor shape: {reshaped_test_input_ids.shape}")  # Output: torch.Size([1, 10])
    #print(reshaped_test_input_ids)
    
    test_prediction = sentiment_model(reshaped_test_input_ids).squeeze(1)
    test_prediction = test_prediction.mean()
    threshold = 0
    binary_prediction = (test_prediction >= threshold).long()
    #print(f"{test_input_text} - test_label: {test_label}")
    print(f"{test_label == int(binary_prediction)},  {test_input_text} - test_label: {test_label}, binary predition: {int(binary_prediction)}, test_prediction: {test_prediction}")
    

True,  I love this product! - test_label: 1, binary predition: 1, test_prediction: 23.75636863708496
True,  This product is terrible. - test_label: 0, binary predition: 0, test_prediction: -31.529279708862305
True,  I'm so happy with this purchase! - test_label: 1, binary predition: 1, test_prediction: 19.55344581604004
True,  I regret buying this. - test_label: 0, binary predition: 0, test_prediction: -47.1557731628418
True,  This is the best thing I've ever bought! - test_label: 1, binary predition: 1, test_prediction: 44.837825775146484
True,  I'm disappointed with this product. - test_label: 0, binary predition: 0, test_prediction: -12.945472717285156
False,  I would definitely recommend this! - test_label: 1, binary predition: 0, test_prediction: -23.153711318969727
True,  This is a waste of money. - test_label: 0, binary predition: 0, test_prediction: -79.17842864990234
False,  I'm so impressed with this! - test_label: 1, binary predition: 0, test_prediction: -19.659509658813477


  attention_percents = F.softmax(scaled_sims)


## using LSTM and GRU for sentiment analyse

In [155]:
class SentimentLSTMModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.rnn = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, text):
        embedding = self.embedding(text)
        _, (hidden, _) = self.rnn(embedding)
        #print(hidden.shape)
        output = self.fc(hidden.squeeze(0))
        return output

In [156]:
class SentimentGRUModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.rnn = nn.GRU(embedding_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, text):
        embedding = self.embedding(text)
        _, hidden = self.rnn(embedding)
        #print(hidden.shape)
        output = self.fc(hidden.squeeze(0))
        return output

In [None]:
vocab_size = len(dataset.vocab)
print(vocab_size)
embedding_dim = 100
hidden_dim = 256
output_dim = 1

#sentiment_model = SentimentModel(vocab_size, embedding_dim, hidden_dim, output_dim)
#sentiment_model = SentimentRnnModel(vocab_size, embedding_dim, hidden_dim, output_dim)
#sentiment_model = SentimentLSTMModel(vocab_size, embedding_dim, hidden_dim, output_dim)
sentiment_model = SentimentGRUModel(vocab_size, embedding_dim, hidden_dim, output_dim)

sentiment_model.to(device)
optimizer = Adam(sentiment_model.parameters(), lr=0.01)
criterion = nn.BCEWithLogitsLoss()
sentiment_model.train()
epochs = 30

for epoch in range(epochs):
    epoch_loss = 0
    total_loss = 0
    for data in train_dataloader:
        optimizer.zero_grad()
        input_tokens = data['input_ids']
        #print(f"input_ids: {input_tokens}")
        labels = data['sentiment']
        input_tokens = input_tokens.to(device)  # Move inputs to GPU if available
        labels = labels.to(device)  # Move labels to GPU if available
        prediction = sentiment_model(input_tokens).squeeze(1)
        labels = labels.view(-1).float()  # [batch_size * seq_length
        #print(prediction.shape)
        #print(prediction)
        #print(labels.shape)
        #print(labels)
        loss = criterion(prediction, labels) 
        loss.backward()
        optimizer.step()
        total_loss += loss.item() * input_tokens.size(0)
        epoch_loss += loss.item()
    average_loss = total_loss / len(train_dataloader.dataset)
    print(f"Epoch {epoch}, Train Loss: {average_loss}")

In [176]:
test_datas = [
    {"text": "I love this product!", "label": 1},
    {"text": "This product is terrible.", "label": 0},
    {"text": "I'm so happy with this purchase!", "label": 1},
    {"text": "I regret buying this.", "label": 0},
    {"text": "This is the best thing I've ever bought!", "label": 1},
    {"text": "I'm disappointed with this product.", "label": 0},
    {"text": "I would definitely recommend this!", "label": 1},
    {"text": "This is a waste of money.", "label": 0},
    {"text": "I'm so impressed with this!", "label": 1},
    {"text": "I don't like this at all.", "label": 0},
    {"text": "I don't like it.", "label": 0},
    {"text": "I love this.", "label": 1},
    {"text": "This movie is bad.", "label": 0},
    {"text": "This is a waste of time.", "label": 0},
    {"text": "I regret looking this.", "label": 0},
    {"text": "I movie is crap.", "label": 0},
    {"text": "I love the movie.", "label": 1},
    {"text": "The movie is a good movie.", "label": 1},
    {"text": "I loved the new Marvel movie, it was so action-packed and exciting!", "label": 1},
    {"text": "The latest rom-com I saw was so cheesy and predictable, I hated it.", "label": 0},
    {"text": "The special effects in the new Star Wars movie were mind-blowing, I was on the edge of my seat!", "label": 1},
    {"text": "I was really disappointed with the ending of the latest Game of Thrones season, it was so rushed.", "label": 0},
    {"text": "The new Pixar movie was so heartwarming and funny, I cried and laughed at the same time!", "label": 1},
    {"text": "The latest horror movie I saw was so boring and not scary at all, I fell asleep.", "label": 0},
    {"text": "The acting in the new biopic was incredible, the lead actor deserved an Oscar.", "label": 1},
    {"text": "The plot of the latest sci-fi movie was so confusing and convoluted, I got lost.", "label": 0},
    {"text": "The new comedy movie was hilarious, I laughed out loud the whole time!", "label": 1},
    {"text": "The latest drama movie I saw was so depressing and slow, I didn't enjoy it at all.", "label": 0}
]

In [177]:
min_len = 10
for test_data in test_datas:
    test_input_text = test_data['text']
    test_label = test_data['label']
    
    test_indices = [vocab.get(word, vocab['<UNK>']) for word in test_input_text.lower().split()]
    if len(test_indices) < min_len:
        test_indices += [0] * (min_len - len(test_indices))  # Use 0 instead of self.vocab['<PAD>']
    test_input_ids = torch.tensor(test_indices)

    # Convert label to tensor
    test_label = torch.tensor(test_label)

    test_input_ids = test_input_ids.to(device)  # Move inputs to GPU if available
    test_label = test_label.to(device)  # Move labels to GPU if available
    #print(test_input_ids)
    #print(test_label)

    # Adding a new dimension at the beginning to change the shape to [1, 10]
    reshaped_test_input_ids = test_input_ids.unsqueeze(0)

    #print(f"Original Tensor shape: {test_input_ids.shape}")  # Output: torch.Size([10])
    #print(f"Reshaped Tensor shape: {reshaped_test_input_ids.shape}")  # Output: torch.Size([1, 10])
    #print(reshaped_test_input_ids)
    
    test_prediction = sentiment_model(reshaped_test_input_ids).squeeze(1)
    test_prediction = test_prediction.mean()
    threshold = 0
    binary_prediction = (test_prediction >= threshold).long()
    print(f"{test_label == int(binary_prediction)},  {test_input_text} - test_label: {test_label}, binary predition: {int(binary_prediction)}, test_prediction: {test_prediction}")
    

  attention_percents = F.softmax(scaled_sims)


False,  I love this product! - test_label: 1, binary predition: 0, test_prediction: -8.277002780232579e-05
True,  This product is terrible. - test_label: 0, binary predition: 0, test_prediction: -8.277002780232579e-05
False,  I'm so happy with this purchase! - test_label: 1, binary predition: 0, test_prediction: -8.277002780232579e-05
True,  I regret buying this. - test_label: 0, binary predition: 0, test_prediction: -8.277002780232579e-05
False,  This is the best thing I've ever bought! - test_label: 1, binary predition: 0, test_prediction: -8.277002780232579e-05
True,  I'm disappointed with this product. - test_label: 0, binary predition: 0, test_prediction: -8.277002780232579e-05
False,  I would definitely recommend this! - test_label: 1, binary predition: 0, test_prediction: -8.277002780232579e-05
True,  This is a waste of money. - test_label: 0, binary predition: 0, test_prediction: -8.277002780232579e-05
False,  I'm so impressed with this! - test_label: 1, binary predition: 0, te

## using decoder only transfomer for sentiment analyse

In [186]:
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_len):
        super().__init__()
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(start=0, end=max_len, step=1).float().unsqueeze(1)
        embedding_index = torch.arange(start=0, end=d_model, step=2).float()
        div_term = 1 / torch.tensor(10000.0)**(embedding_index / d_model)
        #print(div_term)
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        self.register_buffer('pe',pe)

    def forward(self, word_embeddings):
        pe_temp = self.pe[:word_embeddings.size(0), :]
        pe_temp_expanded = pe_temp.unsqueeze(1)
        #print(f"word_embeddings.shape: {word_embeddings.shape}, self.pe.shape: {pe_temp_expanded.shape}, ")
        return word_embeddings + pe_temp_expanded

class Attention(nn.Module):
    def __init__(self, d_model=2):
        super().__init__()
        self.W_q = nn.Linear(in_features=d_model, out_features=d_model, bias=False)
        self.W_k = nn.Linear(in_features=d_model, out_features=d_model, bias=False)
        self.W_v = nn.Linear(in_features=d_model, out_features=d_model, bias=False)
        self.row_dim = 0
        self.col_dim = 1

    def forward(self, encodings, mask=None):
        q = self.W_q(encodings)
        k = self.W_k(encodings)
        v = self.W_v(encodings)

        # Ensure k has the same shape as q before transpose
        assert k.shape == q.shape

        # Transpose k to align with q for dot product
        k_transposed = k.transpose(-1, -2)

        # Check shapes
        #print("Shape of q:", q.shape)  # [1, 5, 2]
        #print("Shape of k_transposed:", k_transposed.shape)  # [1, 2, 5]
        sims = torch.matmul(q, k_transposed)
        scaled_sims = sims / torch.tensor(k.size(1)**0.5)

        if mask is not None:
            mask = mask.to(device)
            scaled_sims = scaled_sims.masked_fill(mask=mask, value=-1e9)

        attention_percents = F.softmax(scaled_sims)
        attention_scores = torch.matmul(attention_percents, v)
        return attention_scores

class MultiHeadAttention(nn.Module):
    def __init__(self, d_model=2,heads=2):
        super().__init__()

        self.W_qs = []
        self.W_ks = []
        self.W_vs = []

        for index in range(heads):
            W_q = nn.Linear(in_features=d_model, out_features=d_model, bias=False)
            W_k = nn.Linear(in_features=d_model, out_features=d_model, bias=False)
            W_v = nn.Linear(in_features=d_model, out_features=d_model, bias=False)

            self.W_qs.append(W_q)
            self.W_ks.append(W_k)
            self.W_vs.append(W_v)

        self.unify_heads = nn.Linear(d_model * heads, d_model)

        self.row_dim = 0
        self.col_dim = 1
        self.heads = heads

    def forward(self, encodings, mask=None):
        attentionscores = []
        #encodings.to(device)
        for index in range(self.heads):
            W_q = self.W_qs[index].to(device)
            W_k = self.W_ks[index].to(device)
            W_v = self.W_vs[index].to(device)

            q = W_q(encodings.to(device))
            k = W_k(encodings.to(device))
            v = W_v(encodings.to(device))

            k_transposed = k.transpose(-1, -2)
            sims = torch.matmul(q, k_transposed)
            scaled_sims = sims / torch.tensor(k.size(1)**0.5)

            if mask is not None:
                mask = mask.to(device)
                scaled_sims = scaled_sims.masked_fill(mask=mask, value=-1e9)

            attention_percents = F.softmax(scaled_sims)
            attention_scores = torch.matmul(attention_percents, v)
            attentionscores.append(attention_scores)

        combined_attention_scores = torch.cat(attentionscores, dim=-1)
        combined_output = self.unify_heads(combined_attention_scores)

        return combined_output

class DecoderBlock(nn.Module):
    def __init__(self, d_model, num_heads, num_tokens, using_mask=True):
        super(DecoderBlock, self).__init__()
        self.self_attention = MultiHeadAttention(d_model=d_model, heads=num_heads)
        self.layer_norm1 = nn.LayerNorm(d_model)
        self.relu = nn.ReLU()
        self.fc_layer = nn.Linear(in_features=d_model, out_features=d_model)
        self.layer_norm2 = nn.LayerNorm(d_model)
        self.fc_layer2 = nn.Linear(in_features=d_model, out_features=d_model)
        dropout=0.1
        self.dropout = nn.Dropout(dropout)
        self.using_mask = using_mask

    def forward(self, position_encoded, mask=None):
        if self.using_mask:
            self_attention_values = self.self_attention(position_encoded, mask=mask)
        else:
            self_attention_values = self.self_attention(position_encoded)

        residual_connection_values = position_encoded + self_attention_values
        normalized_values1 = self.layer_norm1(residual_connection_values)

        fc_layer_output_relu = self.relu(self.fc_layer(normalized_values1))
        #fc_layer_output_dropout = self.dropout(fc_layer_output_relu)
        #fc_layer_output = self.fc_layer2(fc_layer_output_dropout)
        #final_output = self.layer_norm2(normalized_values1 + fc_layer_output)
        #fc_layer_output = self.fc_layer2(self.dropout(self.relu(self.fc_layer(normalized_values1))))
        #return final_output
        #fc_layer_output = self.fc_layer(normalized_values1)
        fc_layer_output = self.relu(self.fc_layer(normalized_values1))
        return fc_layer_output

class DecoderOnlyTransformerBlockTransformer(nn.Module):
    def __init__(self, num_tokens, d_model, max_len, using_mask=True):
        super(DecoderOnlyTransformerBlockTransformer, self).__init__()
        self.number_heads = 4
        self.we = nn.Embedding(num_embeddings=num_tokens, embedding_dim=d_model)
        self.pe = PositionalEncoding(d_model=d_model, max_len=max_len)
        self.decoder_block1 = DecoderBlock(d_model=d_model, num_heads=self.number_heads, num_tokens=num_tokens, using_mask=using_mask)
        #self.decoder_block2 = DecoderBlock(d_model=d_model, num_heads=self.number_heads, num_tokens=num_tokens, using_mask=using_mask)
        #self.decoder_block3 = DecoderBlock(d_model=d_model, num_heads=self.number_heads, num_tokens=num_tokens, using_mask=using_mask)
        #self.decoder_block4 = DecoderBlock(d_model=d_model, num_heads=self.number_heads, num_tokens=num_tokens, using_mask=using_mask)
        #self.decoder_block5 = DecoderBlock(d_model=d_model, num_heads=self.number_heads, num_tokens=num_tokens, using_mask=using_mask)
        #self.decoder_block6 = DecoderBlock(d_model=d_model, num_heads=self.number_heads, num_tokens=num_tokens, using_mask=using_mask)

        self.fc_layer = nn.Linear(in_features=d_model, out_features=num_tokens)
        self.loss = nn.CrossEntropyLoss()

    def forward(self, token_ids):
        word_embeddings = self.we(token_ids)
        position_encoded = self.pe(word_embeddings)

        if self.decoder_block1.using_mask:
            mask_ones = torch.ones((token_ids.size(dim=1), token_ids.size(dim=1)))
            mask = torch.tril(mask_ones)
            mask = mask == 0
        else:
            mask = None

        output_block1 = self.decoder_block1(position_encoded, mask=mask)
        #output_block2 = self.decoder_block2(output_block1, mask=mask)
        #output_block3 = self.decoder_block3(output_block2, mask=mask)
        #output_block4 = self.decoder_block4(output_block3, mask=mask)
        #output_block5 = self.decoder_block5(output_block4, mask=mask)
        #output_block6 = self.decoder_block6(output_block5, mask=mask)

        fc_layer_output = self.fc_layer(output_block1)

        return fc_layer_output

In [188]:
vocab_size = len(dataset.vocab)
print(vocab_size)
embedding_dim = 100
d_model = 256
output_dim = 1
max_len = 100
sentiment_model = DecoderOnlyTransformerBlockTransformer(num_tokens=vocab_size, d_model=d_model, max_len=max_len, using_mask=False)

sentiment_model.to(device)
optimizer = Adam(sentiment_model.parameters(), lr=0.001)
criterion = nn.BCEWithLogitsLoss()
sentiment_model.train()
epochs = 30

for epoch in range(epochs):
    epoch_loss = 0
    total_loss = 0
    for data in train_dataloader:
        optimizer.zero_grad()
        input_tokens = data['input_ids']
        #print(f"input_ids: {input_tokens}")
        labels = data['sentiment']
        input_tokens = input_tokens.to(device)  # Move inputs to GPU if available
        labels = labels.to(device)  # Move labels to GPU if available
        prediction = sentiment_model(input_tokens).squeeze(1)
        prediction = prediction[:, -1, 0]
        labels = labels.view(-1).float()  # [batch_size * seq_length
        #print(prediction.shape)
        #print(prediction)
        #print(labels.shape)
        #print(labels)
        loss = criterion(prediction, labels) 
        loss.backward()
        optimizer.step()
        total_loss += loss.item() * input_tokens.size(0)
        epoch_loss += loss.item()
    average_loss = total_loss / len(train_dataloader.dataset)
    print(f"Epoch {epoch}, Train Loss: {average_loss}")

80921


  attention_percents = F.softmax(scaled_sims)


Epoch 0, Train Loss: 0.7029317706204742
Epoch 1, Train Loss: 0.6932171292066283
Epoch 2, Train Loss: 0.6807057850235694
Epoch 3, Train Loss: 0.5996656423668867
Epoch 4, Train Loss: 0.4255859572372157
Epoch 5, Train Loss: 0.21689187275912153
Epoch 6, Train Loss: 0.1025290111118475
Epoch 7, Train Loss: 0.0818839395721244
Epoch 8, Train Loss: 0.0294594296296292
Epoch 9, Train Loss: 0.021285514667646117
Epoch 10, Train Loss: 0.013430541948675745
Epoch 11, Train Loss: 0.013425597444709569
Epoch 12, Train Loss: 0.0017188184128193852
Epoch 13, Train Loss: 0.011529531356486548
Epoch 14, Train Loss: 0.007095039225910241
Epoch 15, Train Loss: 0.00317407891453714
Epoch 16, Train Loss: 0.003044310553399937
Epoch 17, Train Loss: 0.0039697438784255515
Epoch 18, Train Loss: 0.004232281420894333
Epoch 19, Train Loss: 0.0011917188496165395
Epoch 20, Train Loss: 0.00028811999287639274
Epoch 21, Train Loss: 0.00024084853033931255
Epoch 22, Train Loss: 0.013255235374079513
Epoch 23, Train Loss: 0.00449025

In [181]:
test_datas = [
    {"text": "I love this product!", "label": 1},
    {"text": "This product is terrible.", "label": 0},
    {"text": "I'm so happy with this purchase!", "label": 1},
    {"text": "I regret buying this.", "label": 0},
    {"text": "This is the best thing I've ever bought!", "label": 1},
    {"text": "I'm disappointed with this product.", "label": 0},
    {"text": "I would definitely recommend this!", "label": 1},
    {"text": "This is a waste of money.", "label": 0},
    {"text": "I'm so impressed with this!", "label": 1},
    {"text": "I don't like this at all.", "label": 0},
    {"text": "I don't like the movie.", "label": 0},
    {"text": "I love this.", "label": 1},
    {"text": "This movie is bad.", "label": 0},
    {"text": "This is a waste of time.", "label": 0},
    {"text": "I regret looking this.", "label": 0},
    {"text": "I movie is crap.", "label": 0},
    {"text": "I love the movie.", "label": 1},
    {"text": "The movie is a good movie.", "label": 1},
    {"text": "I loved the new Marvel movie, it was so action-packed and exciting!", "label": 1},
    {"text": "The latest rom-com I saw was so cheesy and predictable, I hated it.", "label": 0},
    {"text": "The special effects in the new Star Wars movie were mind-blowing!", "label": 1},
    {"text": "I was really disappointed with the ending of the latest Game of Thrones season.", "label": 0},
    {"text": "The new Pixar movie was so heartwarming and funny.", "label": 1},
    {"text": "The latest horror movie I saw was so boring.", "label": 0},
    {"text": "The acting in the new biopic was incredible, the lead actor deserved an Oscar.", "label": 1},
    {"text": "The plot of the latest sci-fi movie was so confusing and convoluted, I got lost.", "label": 0},
    {"text": "The new comedy movie was hilarious, I laughed out loud the whole time!", "label": 1},
    {"text": "The latest drama movie I saw was so depressing and slow, I didn't enjoy it at all.", "label": 0}
]

In [189]:
min_len = 10
sentiment_model.eval()
for test_data in test_datas:
    test_input_text = test_data['text']
    test_label = test_data['label']
    
    test_indices = [vocab.get(word, vocab['<UNK>']) for word in test_input_text.lower().split()]
    if len(test_indices) < min_len:
        test_indices += [0] * (min_len - len(test_indices))  # Use 0 instead of self.vocab['<PAD>']
    test_input_ids = torch.tensor(test_indices)

    # Convert label to tensor
    test_label = torch.tensor(test_label)

    test_input_ids = test_input_ids.to(device)  # Move inputs to GPU if available
    test_label = test_label.to(device)  # Move labels to GPU if available
    #print(test_input_ids)
    #print(test_label)

    # Adding a new dimension at the beginning to change the shape to [1, 10]
    reshaped_test_input_ids = test_input_ids.unsqueeze(0)

    #print(f"Original Tensor shape: {test_input_ids.shape}")  # Output: torch.Size([10])
    #print(f"Reshaped Tensor shape: {reshaped_test_input_ids.shape}")  # Output: torch.Size([1, 10])
    #print(reshaped_test_input_ids)
    
    test_prediction = sentiment_model(reshaped_test_input_ids).squeeze(1)
    test_prediction = test_prediction.mean()
    threshold = 0
    binary_prediction = (test_prediction >= threshold).long()
    print(f"{test_label == int(binary_prediction)},  {test_input_text} - test_label: {test_label}, binary predition: {int(binary_prediction)}, test_prediction: {test_prediction}")

  attention_percents = F.softmax(scaled_sims)


True,  I love this product! - test_label: 1, binary predition: 1, test_prediction: 0.000282477616565302
True,  This product is terrible. - test_label: 0, binary predition: 0, test_prediction: -0.0010507262777537107
False,  I'm so happy with this purchase! - test_label: 1, binary predition: 0, test_prediction: -0.00031125196255743504
True,  I regret buying this. - test_label: 0, binary predition: 0, test_prediction: -0.00023693035473115742
True,  This is the best thing I've ever bought! - test_label: 1, binary predition: 1, test_prediction: 0.0006155656883493066
True,  I'm disappointed with this product. - test_label: 0, binary predition: 0, test_prediction: -0.0018173090647906065
True,  I would definitely recommend this! - test_label: 1, binary predition: 1, test_prediction: 0.0005897162482142448
True,  This is a waste of money. - test_label: 0, binary predition: 0, test_prediction: -0.0020566012244671583
False,  I'm so impressed with this! - test_label: 1, binary predition: 0, test_pr