## create a simple self attention mechanism to train sentiment analysis at the imdb dataset

In [31]:
import torch
from datasets import load_dataset
from torch.utils.data import DataLoader, TensorDataset
from transformers import AutoTokenizer

max_length = 80

# Load the IMDb dataset from Hugging Face
imdb_dataset = load_dataset("imdb")

# Convert the dataset to PyTorch format
imdb_dataset.set_format("torch")

# Get the training and test datasets
train_dataset = imdb_dataset["train"]
test_dataset = imdb_dataset["test"]

#counter = 0
#for item in test_dataset:
    #print("item ", item)
    #print("text ", item["text"])
    #counter += 1
    #if counter > 10:
        #break

# Initialize the tokenizer
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")  # Replace with your desired tokenizer

# Tokenize the dataset
def tokenize_function(examples):
    #return tokenizer(examples["text"], padding="max_length", truncation=True, padding=True, max_length=maxlen)
    #return tokenizer(examples["text"], return_tensors="pt", truncation=False, padding=True, max_length=maxlen)
    return tokenizer(examples["text"], padding='max_length', truncation=True, max_length=max_length) 

imdb_dataset = imdb_dataset.map(tokenize_function, batched=False)

train_dataset = imdb_dataset["train"]
test_dataset = imdb_dataset["test"]

# Extract features (x) and labels (y) from the datasets
x_train = train_dataset["input_ids"]
y_train = train_dataset["label"]
x_test = test_dataset["input_ids"]
y_test = test_dataset["label"]

# Convert labels to torch tensors
y_train = torch.tensor(y_train)
y_test = torch.tensor(y_test)

vocab_size = len(tokenizer.vocab)
print('Vocabulary size:', vocab_size)

print("type(x_train)",type(x_train))
print("type(x_train[0])",type(x_train[0]))
print("type(x_test)",type(x_test))
print("type(x_test[0])",type(x_test[0]))
print("y_train.shape ",y_train.shape)
print("y_test.shape ",y_test.shape)

text  I love sci-fi and am willing to put up with a lot. Sci-fi movies/TV are usually underfunded, under-appreciated and misunderstood. I tried to like this, I really did, but it is to good TV sci-fi as Babylon 5 is to Star Trek (the original). Silly prosthetics, cheap cardboard sets, stilted dialogues, CG that doesn't match the background, and painfully one-dimensional characters cannot be overcome with a 'sci-fi' setting. (I'm sure there are those of you out there who think Babylon 5 is good sci-fi TV. It's not. It's clichéd and uninspiring.) While US viewers might like emotion and character development, sci-fi is a genre that does not take itself seriously (cf. Star Trek). It may treat important issues, yet not as a serious philosophy. It's really difficult to care about the characters here as they are not simply foolish, just missing a spark of life. Their actions and reactions are wooden and predictable, often painful to watch. The makers of Earth KNOW it's rubbish as they have to

Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

Vocabulary size: 30522
type(x_train) <class 'torch.Tensor'>
type(x_train[0]) <class 'torch.Tensor'>
type(x_test) <class 'torch.Tensor'>
type(x_test[0]) <class 'torch.Tensor'>
y_train.shape  torch.Size([25000])
y_test.shape  torch.Size([25000])


  y_train = torch.tensor(y_train)
  y_test = torch.tensor(y_test)


In [32]:
print("type(x_train)",type(x_train))
print("type(x_train[0])",type(x_train[0]))
print("type(x_test)",type(x_test))
print("type(x_test[0])",type(x_test[0]))
print("y_train.shape ",y_train.shape)
print("y_test.shape ",y_test.shape)

type(x_train) <class 'torch.Tensor'>
type(x_train[0]) <class 'torch.Tensor'>
type(x_test) <class 'torch.Tensor'>
type(x_test[0]) <class 'torch.Tensor'>
y_train.shape  torch.Size([25000])
y_test.shape  torch.Size([25000])


In [33]:
x_train_tensor = x_train
x_test_tensor = x_test
y_train_tensor = y_train
y_test_tensor = y_test  # Output: torch.Size([3, 3])

# Print the shape of the resulting tensor
print(x_train_tensor.shape)  # Output: torch.Size([3, 3])
print(x_test_tensor.shape)  # Output: torch.Size([3, 3])
print(x_test_tensor.shape)  # Output: torch.Size([3, 3])
print(x_train_tensor.shape)  # Output: torch.Size([3, 3])

torch.Size([25000, 80])
torch.Size([25000, 80])
torch.Size([25000, 80])
torch.Size([25000, 80])


In [34]:
# Get the tokenizer used for the IMDb dataset
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
print(tokenizer)  # Output: transformers.BertTokenizer

test_sentence = "This movie is amazing and I loved it."
inputs = tokenizer(test_sentence, return_tensors="pt", padding=True, truncation=True, max_length=max_length)
test_sentence_tensor = inputs.input_ids

vocab_size = len(tokenizer.vocab)
print('Vocabulary size:', vocab_size)

#Text: "This movie was absolutely amazing! The acting, the story, everything was perfect." Label: Positive
#Text: "The worst movie I've ever seen. The plot was nonsensical and the acting was terrible." Label: Negative
#Text: "I loved the cinematography in this film. It was visually stunning." Label: Positive
#Text: "I was so bored during this movie. It felt like it would never end." Label: Negative
#Text: "The special effects in this movie were incredible. I was blown away." Label: Positive
#Text: "I can't believe I wasted my time on this movie. It was a complete disaster." Label: Negative
#Text: "The performances in this film were outstanding. I was truly impressed." Label: Positive
#Text: "This movie was a total snooze fest. I couldn't even make it to the end." Label: Negative
#Text: "The soundtrack in this film was fantastic. It really added to the overall experience." Label: Positive
#Text: "The dialogue in this movie was so cringeworthy. I couldn't take it seriously." Label: Negative
movie_sentiments = [
    {"text": "This movie was absolutely amazing! The acting, the story, everything was perfect.", "label": "Positive"},
    {"text": "The worst movie I've ever seen. The plot was nonsensical and the acting was terrible.", "label": "Negative"},
    {"text": "I loved the cinematography in this film. It was visually stunning.", "label": "Positive"},
    {"text": "I was so bored during this movie. It felt like it would never end.", "label": "Negative"},
    {"text": "The special effects in this movie were incredible. I was blown away.", "label": "Positive"},
    {"text": "I can't believe I wasted my time on this movie. It was a complete disaster.", "label": "Negative"},
    {"text": "The performances in this film were outstanding. I was truly impressed.", "label": "Positive"},
    {"text": "This movie was a total snooze fest. I couldn't even make it to the end.", "label": "Negative"},
    {"text": "The soundtrack in this film was fantastic. It really added to the overall experience.", "label": "Positive"},
    {"text": "The dialogue in this movie was so cringeworthy. I couldn't take it seriously.", "label": "Negative"}
]


DistilBertTokenizerFast(name_or_path='distilbert-base-uncased', vocab_size=30522, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}
Vocabulary size: 30522


In [52]:
x = torch.randn(2, 3)
print(x)
x_transposed = torch.transpose(x, 0, 1)
print(x_transposed)

tensor([[-1.0366, -0.1959, -0.6939],
        [ 0.0432, -0.1044,  0.2539]])
tensor([[-1.0366,  0.0432],
        [-0.1959, -0.1044],
        [-0.6939,  0.2539]])


In [63]:
import torch.nn as nn
import math

class SelfAttention(nn.Module):
    def __init__(self, input_dim):
        super(SelfAttention, self).__init__()
        self.input_dim = input_dim
        self.w_query = nn.Linear(input_dim, input_dim)
        self.w_key = nn.Linear(input_dim, input_dim)
        self.w_value = nn.Linear(input_dim, input_dim)
        self.softmax = nn.Softmax(dim=2)

    def forward(self, x):
        queries = self.w_query(x)
        keys = self.w_key(x)
        values = self.w_value(x)
        keys_transposed = torch.transpose(keys, 0, 1)
        keys_transposed = keys_transposed.permute(0, 2, 1)
        #print(queries.shape)
        #print(keys.shape)
        #print(keys_transposed.shape)                
        matmul_queries_keys = torch.matmul(queries, keys_transposed)
        print(matmul_queries_keys.shape)
        matmul_queries_keys_normalized = matmul_queries_keys / math.sqrt(self.input_dim)
        attention = self.softmax(matmul_queries_keys_normalized)
        print(matmul_queries_keys_normalized.shape)
        print(values.shape)
        print(attention.shape)
        # Matrix multiplication using @ operator (PyTorch 1.10 and later)
        result = attention @ values
        print(result)  # Output: tensor([[19, 22], [43, 50]])
        # weighted_attention = torch.matmul(attention, values)
        return weighted_attention

class SimpleSentimentClassifier(nn.Module):
    def __init__(self, vocab_size, embed_dim, max_length):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.attention = SelfAttention(embed_dim)
        #self.flatten = nn.Flatten()
        self.relu = nn.ReLU()
        self.linear = nn.Linear(embed_dim * max_length, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        x = self.embedding(x)
        #x = self.flatten(x)
        #x = self.relu(x)
        x = self.attention(x)
        x = self.linear(x)
        x = self.sigmoid(x)
        return x

    def get_embedding_shape(self):
        return self.embedding.embedding_dim, self.embedding.weight.size(0)

In [64]:
import torch
import torch.optim as optim


# Set the device (CPU or GPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("device: ", device)

# Instantiate the model, loss function, and optimizer
model = SimpleSentimentClassifier(vocab_size=vocab_size, embed_dim=8, max_length=max_length).to(device)
embedding_shape = model.get_embedding_shape()
print("Embedding shape:", embedding_shape)
criterion = nn.BCELoss()
#criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Convert the data to the device (CPU or GPU)
x_train_tensor = x_train_tensor.to(device)
y_train_tensor = y_train_tensor.to(device)
x_test_tensor = x_test_tensor.to(device)
y_test_tensor = y_test_tensor.to(device)

# Training loop without batches
num_epochs = 10

for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    for i in range(len(x_train_tensor)):
        optimizer.zero_grad()
        input_sample = x_train_tensor[i].unsqueeze(0)
        target_sample = y_train_tensor[i].unsqueeze(0).float()
        output = model(input_sample)
        loss = criterion(output.squeeze(), target_sample.squeeze())        
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
        #print("training loss ", loss.item())
    model.eval()
    with torch.no_grad():
        for i in range(len(x_test_tensor)):
            test_input_sample = x_test_tensor[i].unsqueeze(0)
            test_target_sample = y_test_tensor[i].unsqueeze(0).float()
            test_outputs = model(test_input_sample)
            #print("test_outputs[0] ", test_outputs[0])
            #print("y_test_tensor ", test_target_sample[0])
            test_loss = criterion(test_outputs.squeeze(), test_target_sample.squeeze())
            #print("test loss ", test_loss.item())                
        test_outputs = model(x_test_tensor)
        #print("test_outputs.shape ",test_outputs.shape)
        #print("y_train_tensor.shape ",y_train_tensor.shape)
        #print("test_outputs ",test_outputs[0])
        #print("y_train_tensor ",y_train_tensor[0])
        y_test_tensor = y_train_tensor.unsqueeze(1).float()       
        #print("test_outputs[0] ", test_outputs[0])
        #print("y_test_tensor ", y_test_tensor[0])
        test_loss = criterion(test_outputs, y_test_tensor)
        test_accuracy = ((test_outputs > 0.5).float() == y_test_tensor).sum().item() / len(y_test_tensor)
    # print(f"Epoch [{epoch + 1}/{num_epochs}], Loss: {running_loss / len(x_train_tensor)}")
    print(f"Epoch: {epoch + 1}, Train Loss: {loss:.8f}, Test Loss: {test_loss:.8f}, Test Accuracy: {test_accuracy:.8f}")

device:  cuda
Embedding shape: (8, 30522)
torch.Size([80, 80, 1])
torch.Size([80, 80, 1])
torch.Size([1, 80, 8])
torch.Size([80, 80, 1])


RuntimeError: mat1 and mat2 shapes cannot be multiplied (6400x1 and 80x8)

In [19]:
model.eval()
with torch.no_grad():
    test_outputs = model(x_test_tensor)
    y_test_tensor = y_train_tensor.unsqueeze(1).float() 
    print(x_test_tensor)
    print("test_outputs[0] ", test_outputs[0])
    print("y_test_tensor ", y_test_tensor[0])    
    test_loss = criterion(test_outputs, y_test_tensor)
    test_accuracy = ((test_outputs > 0.5).float() == y_test_tensor).sum().item() / len(y_test_tensor)
print(f"Test Loss: {test_loss:.3f}, Test Accuracy: {test_accuracy:.3f}")

tensor([[  101,  1045,  2293,  ..., 22580,  1010,   102],
        [  101,  4276,  1996,  ...,  7987,  1013,   102],
        [  101,  2049,  1037,  ...,  1996,  2927,   102],
        ...,
        [  101,  1045,  2288,  ...,  1998,  7134,   102],
        [  101,  2274,  2781,  ...,  1012,  2059,   102],
        [  101,  1045,  3236,  ...,  2000,  2377,   102]], device='cuda:0')
test_outputs[0]  tensor([1.], device='cuda:0')
y_test_tensor  tensor([0.], device='cuda:0')
Test Loss: 45.253, Test Accuracy: 0.500


In [39]:
movie_sentiments = [
    {"text": "This movie was absolutely amazing! The acting, the story, everything was perfect.", "label": "Positive"},
    {"text": "The worst movie I've ever seen. The plot was nonsensical and the acting was terrible.", "label": "Negative"},
    {"text": "I loved the cinematography in this film. It was visually stunning.", "label": "Positive"},
    {"text": "I was so bored during this movie. It felt like it would never end.", "label": "Negative"},
    {"text": "The special effects in this movie were incredible. I was blown away.", "label": "Positive"},
    {"text": "I can't believe I wasted my time on this movie. It was a complete disaster.", "label": "Negative"},
    {"text": "The performances in this film were outstanding. I was truly impressed.", "label": "Positive"},
    {"text": "This movie was a total snooze fest. I couldn't even make it to the end.", "label": "Negative"},
    {"text": "The soundtrack in this film was fantastic. It really added to the overall experience.", "label": "Positive"},
    {"text": "The dialogue in this movie was so cringeworthy. I couldn't take it seriously.", "label": "Negative"},
    {"text": "The action scenes in this movie were breathtaking. I was on the edge of my seat the whole time.", "label": "Positive"},
    {"text": "This movie was so predictable. I saw the ending coming from a mile away.", "label": "Negative"},
    {"text": "The humor in this film was spot on. I was laughing the whole time.", "label": "Positive"},
    {"text": "The pacing in this movie was terrible. It felt like it dragged on forever.", "label": "Negative"},
    {"text": "The chemistry between the lead actors was incredible. I was completely invested in their story.", "label": "Positive"},
    {"text": "The plot twists in this movie were ridiculous. I couldn't believe how unrealistic they were.", "label": "Negative"},
    {"text": "The costumes in this film were amazing. They really added to the overall aesthetic.", "label": "Positive"},
    {"text": "This movie was so confusing. I had no idea what was going on.", "label": "Negative"},
    {"text": "The direction in this film was brilliant. The use of lighting and camera angles was very effective.", "label": "Positive"},
    {"text": "The editing in this movie was awful. The transitions between scenes were jarring and disorienting.", "label": "Negative"}
]

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
print(tokenizer)  # Output: transformers.BertTokenizer
vocab_size = len(tokenizer.vocab)
print('Vocabulary size:', vocab_size)
true_label = 0
label_counter = 0
for item in movie_sentiments:
    test_sentence = item["text"]
    test_label = item["label"]
    
    inputs = tokenizer(test_sentence, return_tensors="pt", padding='max_length', truncation=True, max_length=max_length)
    test_sentence_tensor = inputs.input_ids
    test_sentence_tensor = test_sentence_tensor.to(device)    
    
    prediction = model(test_sentence_tensor).item()
    predictiontext = ""
    if prediction > 0.5:
        predictiontext = "Positive"
    else:
        predictiontext = "Negative"
    if predictiontext == test_label:
        true_label +=1
    label_counter = 1    
    print(f"Sentiment for '{test_sentence}': {predictiontext} / {test_label}")
accuracy = true_label / label_counter
print(f"Accuracy {accuracy}")


DistilBertTokenizerFast(name_or_path='distilbert-base-uncased', vocab_size=30522, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}
Vocabulary size: 30522
Sentiment for 'This movie was absolutely amazing! The acting, th

In [17]:
# Get the tokenizer used for the IMDb dataset
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
print(tokenizer)  # Output: transformers.BertTokenizer

test_sentence = "This movie is amazing and I loved it."
test_sentence = "This movie started great but then it got worse and worse."

inputs = tokenizer(test_sentence, return_tensors="pt", padding='max_length', truncation=True, max_length=max_length)
test_sentence_tensor = inputs.input_ids
test_sentence_tensor = test_sentence_tensor.to(device)

vocab_size = len(tokenizer.vocab)
print('Vocabulary size:', vocab_size)
print("type(inputs) ", type(inputs))
print("type(test_sentence_tensor) ", type(test_sentence_tensor))

prediction = model(test_sentence_tensor).item()
predictiontext = ""
if prediction > 0.5:
    predictiontext = "Positive"
else:
    predictiontext = "Negative"

print(f"Sentiment for '{test_sentence}': {predictiontext}")

DistilBertTokenizerFast(name_or_path='distilbert-base-uncased', vocab_size=30522, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}
Vocabulary size: 30522
type(inputs)  <class 'transformers.tokenization_utils_base.Batch

In [13]:
import numpy as np
import scipy
# encoder representations of four different words
word_1 = np.array([1, 0, 0])
word_2 = np.array([0, 1, 0])
word_3 = np.array([1, 1, 0])
word_4 = np.array([0, 0, 1])
print(word_1)

[1 0 0]


In [14]:
import numpy as np
# generating the weight matrices
random.seed(42) # to allow us to reproduce the same attention values
W_Q = np.random.randint(3, size=(3, 3))
W_K = np.random.randint(3, size=(3, 3))
W_V = np.random.randint(3, size=(3, 3))
print(W_Q)
print(W_K)
print(W_V)

[[1 0 1]
 [0 0 1]
 [2 1 1]]
[[2 0 0]
 [2 0 2]
 [2 2 0]]
[[0 1 1]
 [1 2 1]
 [0 0 2]]


In [17]:
# generating the queries, keys and values
query_1 = word_1 @ W_Q
key_1 = word_1 @ W_K
value_1 = word_1 @ W_V

query_2 = word_2 @ W_Q
key_2 = word_2 @ W_K
value_2 = word_2 @ W_V

query_3 = word_3 @ W_Q
key_3 = word_3 @ W_K
value_3 = word_3 @ W_V

query_4 = word_4 @ W_Q
key_4 = word_4 @ W_K
value_4 = word_4 @ W_V

print(word_1)
print('*')
print(W_Q)
print('=')
print(query_1)

[1 0 0]
*
[[1 0 1]
 [0 0 1]
 [2 1 1]]
=
[1 0 1]


In [21]:
# scoring the first query vector against all key vectors
scores = np.array([np.dot(query_1, key_1), np.dot(query_1, key_2), np.dot(query_1, key_3), np.dot(query_1, key_4)])
print(scores)

[2 4 6 2]


In [25]:
from scipy.special import softmax
# computing the weights by a softmax operation
weights = softmax(scores / key_1.shape[0] ** 0.5)
print(weights)

[0.06561049 0.20818687 0.66059215 0.06561049]


In [26]:
# computing the attention by a weighted sum of the value vectors
attention = (weights[0] * value_1) + (weights[1] * value_2) + (weights[2] * value_3) + (weights[3] * value_4)
print(attention)

[0.86877902 2.46376069 1.72620264]
