In [36]:
import torch.nn as nn
import torch.optim as optim
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import ConcatDataset, random_split, DataLoader
import pandas as pd
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F
import torch
from sklearn.metrics import accuracy_score, classification_report


In [37]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# device = torch.device("cpu")

In [38]:
# from collections import Counter
# import pandas as pd

# # Load the training dataset
# train_csv = 'imdb_train.csv'  # Replace with actual path
# train_data = pd.read_csv(train_csv)

# # Count unique tokens in the 'tokenized' column
# # Assuming 'tokenized' column contains lists of tokens as strings, e.g., "[1, 23, 456]"
# all_tokens = []
# for tokens in train_data['tokenized']:
#     token_list = eval(tokens)  # Convert the string representation to a list
#     all_tokens.extend(token_list)

# # Calculate the vocabulary size
# vocab_size = len(set(all_tokens))
# print(vocab_size)

# # Load the training dataset
# train_csv = 'imdb_train.csv'  # Replace with actual path
# train_data = pd.read_csv(train_csv)

# # Find the maximum sequence length in the 'tokenized' column
# # Assuming 'tokenized' column contains lists of tokens as strings, e.g., "[1, 23, 456]"
# max_seq_length = max(len(eval(tokens)) for tokens in train_data['tokenized'])
# print("Maximum Sequence Length:", max_seq_length)

In [39]:
# Custom dataset class with padding
class SentimentAnalysisDataset(Dataset):
    def __init__(self, csv_file, max_length=2494, vocab_size=88585):
        # Load data from CSV
        self.data = pd.read_csv(csv_file)
        self.max_length = max_length  # Set max length for padding
        self.vocab_size = vocab_size  # Maximum vocab index
        
    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        # Get the row by index
        review = self.data.loc[idx, 'review']
        tokenized = eval(self.data.loc[idx, 'tokenized'])  # assuming tokenized is stored as a string of list
        label = self.data.loc[idx, 'label']
        
        # Ensure token indices are within the vocab_size range
        tokenized = [min(token, self.vocab_size - 1) for token in tokenized]
        
        # Convert to tensor and pad, move to the appropriate device
        tokenized_tensor = torch.tensor(tokenized, dtype=torch.long).to(device)
        tokenized_tensor = F.pad(
            tokenized_tensor, (0, self.max_length - len(tokenized_tensor)), value=0
        )  # Pad with zeros up to max_length
        
        # Convert label to tensor and move to device
        label_tensor = torch.tensor(label, dtype=torch.long).to(device)
        
        return tokenized_tensor, label_tensor

# Paths to CSV files
train_csv = 'imdb_train.csv'
test_csv = 'imdb_test.csv'

# Load datasets
train_dataset = SentimentAnalysisDataset(train_csv)
test_dataset = SentimentAnalysisDataset(test_csv)

# Combine train and test datasets
combined_dataset = ConcatDataset([train_dataset, test_dataset])

# Define split proportions
train_ratio = 0.7
val_ratio = 0.15
test_ratio = 0.15

# Calculate dataset sizes
train_size = int(train_ratio * len(combined_dataset))
val_size = int(val_ratio * len(combined_dataset))
test_size = len(combined_dataset) - train_size - val_size

# Split combined dataset
new_train_dataset, new_val_dataset, new_test_dataset = random_split(
    combined_dataset, [train_size, val_size, test_size]
)

# # Define desired subset sizes
# train_subset_size = 1000
# val_subset_size = 100
# test_subset_size = 100

# # Create smaller subsets of the original datasets
# new_train_dataset, _ = random_split(new_train_dataset, [train_subset_size, len(new_train_dataset) - train_subset_size])
# new_val_dataset, _ = random_split(new_val_dataset, [val_subset_size, len(new_val_dataset) - val_subset_size])
# new_test_dataset, _ = random_split(new_test_dataset, [test_subset_size, len(new_test_dataset) - test_subset_size])


# Create data loaders
batch_size = 32
train_loader = DataLoader(new_train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(new_val_dataset, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(new_test_dataset, batch_size=batch_size, shuffle=False)

In [40]:
# Print one data row from the training dataset
sample_index = 0  # Replace with any valid index to print a different row
tokenized_tensor, label_tensor = train_dataset[sample_index]

print("Tokenized Text:", tokenized_tensor.shape)
print("Label:", label_tensor)

Tokenized Text: torch.Size([2494])
Label: tensor(1, device='cuda:0')


<div align="center">
  <img src="image.png" alt="image description"/>
</div>

<div align="center">
  <img src="image-1.png" alt="image description"/>
</div>

<div align="center">
  <img src="image-2.png" alt="image description"/>
</div>


In [41]:
# Define the RNN model for sentiment analysis
class SimpleRNN(nn.Module):
    def __init__(self, vocab_size, embed_size, hidden_size):
        super(SimpleRNN, self).__init__()
        """
        N: num batches (sentences).
        D: Each token is represented by a D-dimensional embedding vector (embed_size).
        T: Maximum sequence length (number of words in each sequence)
        H: hidden_size

        shape input: (N, T)
        shape embeded input: (N, T, D)
        
        for each word in the sentence:
            next_h = torch.tanh(x.mm(Wx) + prev_h.mm(Wh) + b)
            {
                where:
                x:      (N,D)
                Wx:     (D,H)
                Wh:     (H,H)
                b:      (H,)
                next_h: (N,H)
            }
            (This is one step in the image above.)
        
        This process repeats for all the words in the sentence, so the number of output or hidden states at the end is T.

        output: (N, T, H)
        """
        self.embedding = nn.Embedding(vocab_size, embed_size) # assigne a vector of embec_size to each word
        self.rnn = nn.RNN(embed_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, 2)
        self.hidden_dim = hidden_size
        
    def forward(self, x):
        """
        hidden: (N, H)
        """        
        x = self.embedding(x)
        output, hidden = self.rnn(x)
        prediction = self.fc(output[:, -1, :]) 
        return prediction
    

# Define the RNN model for sentiment analysis
class EncoderRNN(nn.Module):
    def __init__(self, vocab_size, embed_size, hidden_size):
        super(EncoderRNN, self).__init__()
        """
        N: num batches (sentences).
        D: Each token is represented by a D-dimensional embedding vector (embed_size).
        T: Maximum sequence length (number of words in each sequence)
        H: hidden_size

        shape input: (N, T)
        shape embeded input: (N, T, D)
        
        for each word in the sentence:
            next_h = torch.tanh(x.mm(Wx) + prev_h.mm(Wh) + b)
            {
                where:
                x:      (N,D)
                Wx:     (D,H)
                Wh:     (H,H)
                b:      (H,)
                next_h: (N,H)
            }
            (This is one step in the image above.)
        
        This process repeats for all the words in the sentence, so the number of output or hidden states at the end is T.

        output: (N, T, H)
        """
        self.embedding = nn.Embedding(vocab_size, embed_size) # assigne a vector of embec_size to each word
        self.rnn = nn.RNN(embed_size, hidden_size, batch_first=True)

        self.hidden_dim = hidden_size
        
    def forward(self, x):
        """
        hidden: (N, H)
        """        
        x = self.embedding(x)
        output, hidden = self.rnn(x)
        return hidden
    

class DecoderRNN(nn.Module):
    def __init__(self, output_dim, hidden_dim):
        super(DecoderRNN, self).__init__()
        self.rnn = nn.RNN(hidden_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)
        
        self.hidden_dim = hidden_dim

    def forward(self, hidden):
        batch_size = hidden.size(1)
        input = torch.zeros(batch_size, 1, self.hidden_dim).to(hidden.device)  # [batch_size, 1, hidden_dim]
        outputs, hidden = self.rnn(input, hidden)
        
        # Pass final RNN output to linear layer
        prediction = self.fc(outputs.squeeze(1))  # prediction = [batch_size, output_dim]
        return prediction
    

class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder):
        super(Seq2Seq, self).__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, src):
        hidden = self.encoder(src)
        output = self.decoder(hidden)
        return output

In [42]:
# Set parameters
vocab_size = 88585  # As per your dataset
embed_size = 128
hidden_size = 64
output_size = 2  # Assuming binary classification: positive or negative sentiment
num_epochs = 10
learning_rate = 0.001


# Instantiate the model, define the loss function and optimizer
encoder = EncoderRNN(vocab_size, embed_size, hidden_size)
decoder = DecoderRNN(output_size, hidden_size)
model = Seq2Seq(encoder, decoder).to(device)
model = SimpleRNN(vocab_size, embed_size, hidden_size).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

def train(model, train_loader, criterion, optimizer, device):
    model.train()  # Set the model to training mode
    epoch_loss = 0

    for batch in train_loader:
        inputs, labels = batch
        inputs, labels = inputs.to(device), labels.to(device)

        optimizer.zero_grad()
        
        # Forward pass
        outputs = model(inputs)
        
        # Compute loss
        loss = criterion(outputs, labels)
        loss.backward()
        
        # Update weights
        optimizer.step()
        
        epoch_loss += loss.item()

    avg_loss = epoch_loss / len(train_loader)
    return avg_loss

def evaluate(model, val_loader, criterion, device):
    model.eval()  # Set the model to evaluation mode
    epoch_loss = 0
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for batch in val_loader:
            inputs, labels = batch
            inputs, labels = inputs.to(device), labels.to(device)

            # Forward pass
            outputs = model(inputs)
            
            # Compute loss
            loss = criterion(outputs, labels)
            epoch_loss += loss.item()

            # Save predictions and true labels for accuracy calculation
            preds = outputs.argmax(dim=1)
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    avg_loss = epoch_loss / len(val_loader)
    accuracy = accuracy_score(all_labels, all_preds)
    return avg_loss, accuracy

# Set number of epochs
num_epochs = 2
for epoch in range(num_epochs):
    # Training phase
    train_loss = train(model, train_loader, criterion, optimizer, device)
    
    # Validation phase
    val_loss, val_accuracy = evaluate(model, val_loader, criterion, device)
    
    # Print training and validation metrics
    print(f"Epoch {epoch+1}/{num_epochs}")
    print(f"Train Loss: {train_loss:.4f}")
    print(f"Val Loss: {val_loss:.4f} | Val Accuracy: {val_accuracy:.4f}")

Epoch 1/2
Train Loss: 0.6960
Val Loss: 0.6945 | Val Accuracy: 0.4993
Epoch 2/2
Train Loss: 0.6957
Val Loss: 0.6952 | Val Accuracy: 0.4993
