## Create template for Sentiment Analysis

https://github.com/bentrevett/pytorch-sentiment-analysis/blob/main/1%20-%20Neural%20Bag%20of%20Words.ipynb

In [5]:
import collections

import datasets
import matplotlib.pyplot as plt
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torchtext
import tqdm

In [21]:
train_data, test_data = datasets.load_dataset("imdb", split=["train", "test"])
train_data, test_data

(Dataset({
     features: ['text', 'label'],
     num_rows: 25000
 }),
 Dataset({
     features: ['text', 'label'],
     num_rows: 25000
 }))

In [22]:
import torch
from datasets import load_dataset
from torch.utils.data import DataLoader, TensorDataset
from transformers import AutoTokenizer

max_length = 256

# Load the IMDb dataset from Hugging Face
imdb_dataset = load_dataset("imdb")

# Convert the dataset to PyTorch format
imdb_dataset.set_format("torch")

# Get the training and test datasets
train_dataset = imdb_dataset["train"]
test_dataset = imdb_dataset["test"]

# Initialize the tokenizer
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")  # Replace with your desired tokenizer

# Tokenize the dataset
def tokenize_function(examples):
    #return tokenizer(examples["text"], padding="max_length", truncation=True, padding=True, max_length=maxlen)
    #return tokenizer(examples["text"], return_tensors="pt", truncation=False, padding=True, max_length=maxlen)
    return tokenizer(examples["text"], padding='max_length', truncation=True, max_length=max_length) 

imdb_dataset = imdb_dataset.map(tokenize_function, batched=False)

train_dataset = imdb_dataset["train"]
test_dataset = imdb_dataset["test"]

# Extract features (x) and labels (y) from the datasets
x_train = train_dataset["input_ids"]
y_train = train_dataset["label"]
x_test = test_dataset["input_ids"]
y_test = test_dataset["label"]

# Convert labels to torch tensors
y_train = torch.tensor(y_train)
y_test = torch.tensor(y_test)

vocab_size = len(tokenizer.vocab)
print('Vocabulary size:', vocab_size)

print("type(x_train)",type(x_train))
print("type(x_train[0])",type(x_train[0]))
print("type(x_test)",type(x_test))
print("type(x_test[0])",type(x_test[0]))
print("y_train.shape ",y_train.shape)
print("y_test.shape ",y_test.shape)

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

Vocabulary size: 30522
type(x_train) <class 'torch.Tensor'>
type(x_train[0]) <class 'torch.Tensor'>
type(x_test) <class 'torch.Tensor'>
type(x_test[0]) <class 'torch.Tensor'>
y_train.shape  torch.Size([25000])
y_test.shape  torch.Size([25000])


  y_train = torch.tensor(y_train)
  y_test = torch.tensor(y_test)


In [23]:
x_train_tensor = x_train
x_test_tensor = x_test
y_train_tensor = y_train
y_test_tensor = y_test  # Output: torch.Size([3, 3])

# Print the shape of the resulting tensor
print(x_train_tensor.shape)  # Output: torch.Size([3, 3])
print(x_test_tensor.shape)  # Output: torch.Size([3, 3])
print(y_train_tensor.shape)  # Output: torch.Size([3, 3])
print(y_test_tensor.shape)  # Output: torch.Size([3, 3])

torch.Size([25000, 256])
torch.Size([25000, 256])
torch.Size([25000])
torch.Size([25000])


In [1]:
import torch
from datasets import load_dataset
from torch.utils.data import DataLoader, TensorDataset
from transformers import AutoTokenizer
import collections

import datasets
import matplotlib.pyplot as plt
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torchtext
import tqdm

max_length = 256
BATCH_SIZE = 32

# Load the IMDb dataset from Hugging Face
imdb_dataset = load_dataset("imdb")

# Convert the dataset to PyTorch format
imdb_dataset.set_format("torch")

# Get the training and test datasets
train_dataset = imdb_dataset["train"]
test_dataset = imdb_dataset["test"]

# Initialize the tokenizer
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")  # Replace with your desired tokenizer

# Tokenize the dataset
def tokenize_function(examples):
    #return tokenizer(examples["text"], padding="max_length", truncation=True, padding=True, max_length=maxlen)
    #return tokenizer(examples["text"], return_tensors="pt", truncation=False, padding=True, max_length=maxlen)
    return tokenizer(examples["text"], padding='max_length', truncation=True, max_length=max_length) 

imdb_dataset = imdb_dataset.map(tokenize_function, batched=False)

train_dataset = imdb_dataset["train"]
test_dataset = imdb_dataset["test"]

vocab_size = len(tokenizer.vocab)
print('Vocabulary size:', vocab_size)

x_train = train_dataset["input_ids"]
y_train = train_dataset["label"]
x_test = test_dataset["input_ids"]
y_test = test_dataset["label"]

train_data = TensorDataset(x_train, y_train)
train_loader = DataLoader(train_data, shuffle=True, batch_size=BATCH_SIZE)

test_data = TensorDataset(x_test, y_test)
test_loader = DataLoader(test_data, shuffle=False, batch_size=BATCH_SIZE)

Vocabulary size: 30522


In [2]:
class NBoW(nn.Module):
    def __init__(self, vocab_size, embedding_dim, output_dim):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.fc = nn.Linear(embedding_dim, output_dim)

    def forward(self, ids):
        # ids = [batch size, seq len]
        embedded = self.embedding(ids)
        # embedded = [batch size, seq len, embedding dim]
        pooled = embedded.mean(dim=1)
        # pooled = [batch size, embedding dim]
        prediction = self.fc(pooled)
        # prediction = [batch size, output dim]
        return prediction

In [3]:
embedding_dim = 300
output_dim = 1

model = NBoW(vocab_size, embedding_dim, output_dim)

In [4]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)


print(f"The model has {count_parameters(model):,} trainable parameters")

The model has 9,156,901 trainable parameters


In [5]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [23]:
optimizer = optim.Adam(model.parameters(), lr=0.01)
#criterion = nn.CrossEntropyLoss()
criterion = nn.BCEWithLogitsLoss()

model = model#.to(device)
criterion = criterion#.to(device)

In [46]:
def get_accuracy(prediction, label):
    for item in prediction:
        item_argmax = 0
        if item < 0.0:
            item_argmax = 0
        else:
            item_argmax = 1
        print(item_argmax)
    predicted_classes = prediction.argmax(dim=-1)
    print(predicted_classes)
    print(label)
    correct_predictions = predicted_classes.eq(label).sum()
    print(correct_predictions)
    accuracy = correct_predictions / batch_size
    return accuracy

In [47]:
n_epochs = 10
metrics = collections.defaultdict(list)
for epoch in range(n_epochs):
    model.train()
    epoch_losses = []
    epoch_accs = []
    #for batch in tqdm.tqdm(train_dataloader, desc="training..."):
    #for step, batch in enumerate(train_dataloader):    
    for batch_idx, (input_ids, labels) in enumerate(train_loader):
        optimizer.zero_grad()
        ids = input_ids#.to(device)
        label = labels#.to(device)
        prediction = model(ids)#.to(device)
        prediction = prediction.squeeze(-1)
        #print(prediction.shape)
        label = label.to(torch.float32)
        #print(label.shape)
        #print(prediction.dtype)
        #print(label.dtype)
        loss = criterion(prediction, label)
        accuracy = get_accuracy(prediction, label)
        loss.backward()
        optimizer.step()
        epoch_losses.append(loss.item())
        epoch_accs.append(accuracy.item())
    epoch_losses = np.mean(epoch_losses)
    epoch_accs = np.mean(epoch_accs)
    metrics["train_losses"].append(epoch_losses)
    metrics["train_accs"].append(epoch_accs)
    print(f"Epoch: {epoch + 1}, Train Loss: {epoch_losses:.8f}")
    #, Train Accuracy: {epoch_accs} Test Loss: {test_loss:.8f}, Test Accuracy: {test_accuracy:.8f}")

0
1
0
0
1
0
0
0
1
1
0
0
0
0
1
0
1
0
1
0
1
0
0
0
0
0
0
1
0
0
0
0
tensor(18)
tensor([0., 1., 0., 0., 1., 0., 0., 0., 1., 1., 0., 0., 0., 0., 1., 0., 1., 0.,
        1., 0., 1., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0.])
tensor(0)


NameError: name 'batch_size' is not defined

## Decoder Architekture with masked attention for mlmatzeGPT