In [1]:
import torch.nn as nn
import torch.optim as optim
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import ConcatDataset, random_split, DataLoader
import pandas as pd
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F
import torch
from sklearn.metrics import accuracy_score, classification_report


In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# device = torch.device("cpu")

In [3]:
# from collections import Counter
# import pandas as pd

# # Load the training dataset
# train_csv = 'imdb_train.csv'  # Replace with actual path
# train_data = pd.read_csv(train_csv)

# # Count unique tokens in the 'tokenized' column
# # Assuming 'tokenized' column contains lists of tokens as strings, e.g., "[1, 23, 456]"
# all_tokens = []
# for tokens in train_data['tokenized']:
#     token_list = eval(tokens)  # Convert the string representation to a list
#     all_tokens.extend(token_list)

# # Calculate the vocabulary size
# vocab_size = len(set(all_tokens))
# print(vocab_size)

# # Load the training dataset
# train_csv = 'imdb_train.csv'  # Replace with actual path
# train_data = pd.read_csv(train_csv)

# # Find the maximum sequence length in the 'tokenized' column
# # Assuming 'tokenized' column contains lists of tokens as strings, e.g., "[1, 23, 456]"
# max_seq_length = max(len(eval(tokens)) for tokens in train_data['tokenized'])
# print("Maximum Sequence Length:", max_seq_length)

In [4]:
# Custom dataset class with padding
class SentimentAnalysisDataset(Dataset):
    def __init__(self, csv_file, max_length=2494, vocab_size=88585):
        # Load data from CSV
        self.data = pd.read_csv(csv_file)
        self.max_length = max_length  # Set max length for padding
        self.vocab_size = vocab_size  # Maximum vocab index
        
    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        # Get the row by index
        review = self.data.loc[idx, 'review']
        tokenized = eval(self.data.loc[idx, 'tokenized'])  # assuming tokenized is stored as a string of list
        label = self.data.loc[idx, 'label']
        
        # Ensure token indices are within the vocab_size range
        tokenized = [min(token, self.vocab_size - 1) for token in tokenized]
        
        # Convert to tensor and pad, move to the appropriate device
        tokenized_tensor = torch.tensor(tokenized, dtype=torch.long).to(device)
        tokenized_tensor = F.pad(
            tokenized_tensor, (0, self.max_length - len(tokenized_tensor)), value=0
        )  # Pad with zeros up to max_length
        
        # Convert label to tensor and move to device
        label_tensor = torch.tensor(label, dtype=torch.long).to(device)
        
        return tokenized_tensor, label_tensor

# Paths to CSV files
train_csv = 'imdb_train.csv'
test_csv = 'imdb_test.csv'

# Load datasets
train_dataset = SentimentAnalysisDataset(train_csv)
test_dataset = SentimentAnalysisDataset(test_csv)

# Combine train and test datasets
combined_dataset = ConcatDataset([train_dataset, test_dataset])

# Define split proportions
train_ratio = 0.7
val_ratio = 0.15
test_ratio = 0.15

# Calculate dataset sizes
train_size = int(train_ratio * len(combined_dataset))
val_size = int(val_ratio * len(combined_dataset))
test_size = len(combined_dataset) - train_size - val_size

# Split combined dataset
new_train_dataset, new_val_dataset, new_test_dataset = random_split(
    combined_dataset, [train_size, val_size, test_size]
)

# Define desired subset sizes
train_subset_size = 20000
val_subset_size = 2000
test_subset_size = 2000

# Create smaller subsets of the original datasets
new_train_dataset, _ = random_split(new_train_dataset, [train_subset_size, len(new_train_dataset) - train_subset_size])
new_val_dataset, _ = random_split(new_val_dataset, [val_subset_size, len(new_val_dataset) - val_subset_size])
new_test_dataset, _ = random_split(new_test_dataset, [test_subset_size, len(new_test_dataset) - test_subset_size])


# Create data loaders
batch_size = 32
train_loader = DataLoader(new_train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(new_val_dataset, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(new_test_dataset, batch_size=batch_size, shuffle=False)

In [5]:
# Print one data row from the training dataset
sample_index = 0  # Replace with any valid index to print a different row
tokenized_tensor, label_tensor = train_dataset[sample_index]

print("Tokenized Text:", tokenized_tensor.shape)
print("Label:", label_tensor)

Tokenized Text: torch.Size([2494])
Label: tensor(1, device='cuda:0')


<div align="center">
  <img src="image.png" alt="image description"/>
</div>

<div align="center">
  <img src="image-1.png" alt="image description"/>
</div>

<div align="center">
  <img src="image-2.png" alt="image description"/>
</div>


In [6]:
# Define the RNN model for sentiment analysis
class SimpleRNN(nn.Module):
    def __init__(self, vocab_size, embed_size, hidden_size):
        super(SimpleRNN, self).__init__()
        """
        N: num batches (sentences).
        D: Each token is represented by a D-dimensional embedding vector (embed_size).
        T: Maximum sequence length (number of words in each sequence)
        H: hidden_size

        shape input: (N, T)
        shape embeded input: (N, T, D)
        
        for each word in the sentence:
            next_h = torch.tanh(x.mm(Wx) + prev_h.mm(Wh) + b)
            {
                where:
                x:      (N,D)
                Wx:     (D,H)
                Wh:     (H,H)
                b:      (H,)
                next_h: (N,H)
            }
            (This is one step in the image above.)
        
        This process repeats for all the words in the sentence, so the number of output or hidden states at the end is T.

        output: (N, T, H)
        """
        self.embedding = nn.Embedding(vocab_size, embed_size) # assigne a vector of embec_size to each word
        self.rnn = nn.RNN(embed_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, 2)
        self.hidden_dim = hidden_size
        
    def forward(self, x):
        """
        hidden: (N, H)
        """        
        x = self.embedding(x)
        output, hidden = self.rnn(x)
        prediction = self.fc(output[:, -1, :]) 
        return prediction
    

# Define the RNN model for sentiment analysis
class EncoderRNN(nn.Module):
    def __init__(self, vocab_size, embed_size, hidden_size):
        super(EncoderRNN, self).__init__()
        """
        N: num batches (sentences).
        D: Each token is represented by a D-dimensional embedding vector (embed_size).
        T: Maximum sequence length (number of words in each sequence)
        H: hidden_size

        shape input: (N, T)
        shape embeded input: (N, T, D)
        
        for each word in the sentence:
            next_h = torch.tanh(x.mm(Wx) + prev_h.mm(Wh) + b)
            {
                where:
                x:      (N,D)
                Wx:     (D,H)
                Wh:     (H,H)
                b:      (H,)
                next_h: (N,H)
            }
            (This is one step in the image above.)
        
        This process repeats for all the words in the sentence, so the number of output or hidden states at the end is T.

        output: (N, T, H)
        """
        self.embedding = nn.Embedding(vocab_size, embed_size) # assigne a vector of embec_size to each word
        self.rnn = nn.RNN(embed_size, hidden_size, batch_first=True)

        self.hidden_dim = hidden_size
        
    def forward(self, x):
        """
        hidden: (N, H)
        """        
        x = self.embedding(x)
        output, hidden = self.rnn(x)
        return hidden
    

class DecoderRNN(nn.Module):
    def __init__(self, output_dim, hidden_dim):
        super(DecoderRNN, self).__init__()
        self.rnn = nn.RNN(hidden_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)
        
        self.hidden_dim = hidden_dim

    def forward(self, hidden):
        batch_size = hidden.size(1)
        input = torch.zeros(batch_size, 1, self.hidden_dim).to(hidden.device)  # [batch_size, 1, hidden_dim]
        outputs, hidden = self.rnn(input, hidden)
        
        # Pass final RNN output to linear layer
        prediction = self.fc(outputs.squeeze(1))  # prediction = [batch_size, output_dim]
        return prediction
    

class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder):
        super(Seq2Seq, self).__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, src):
        hidden = self.encoder(src)
        output = self.decoder(hidden)
        return output

In [7]:
# Set parameters
vocab_size = 88585  # As per your dataset
embed_size = 64
hidden_size = 256
output_size = 2  # Assuming binary classification: positive or negative sentiment
learning_rate = 0.001


# Instantiate the model, define the loss function and optimizer
encoder = EncoderRNN(vocab_size, embed_size, hidden_size)
decoder = DecoderRNN(output_size, hidden_size)
model = Seq2Seq(encoder, decoder).to(device)
# model = SimpleRNN(vocab_size, embed_size, hidden_size).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Training function
def train_model(model, train_loader, val_loader, criterion, optimizer, device, num_epochs=2):
    print('********* train model **********')
    for epoch in range(num_epochs):
        # Training phase
        model.train()
        total_loss = 0
        for inputs, labels in train_loader:
            inputs, labels = inputs.to(device), labels.to(device)
            
            optimizer.zero_grad()
            
            # Forward pass
            outputs = model(inputs)
            
            # Compute loss and backward pass
            loss = criterion(outputs, labels.long())
            loss.backward()
            optimizer.step()
            
            total_loss += loss.item()

        # Validation phase after each epoch
        model.eval()
        all_preds, all_labels = [], []
        with torch.no_grad():
            for inputs, labels in val_loader:
                inputs, labels = inputs.to(device), labels.to(device)
                
                # Forward pass
                outputs = model(inputs)
                
                # Get predictions
                _, preds = torch.max(outputs, 1)
                all_preds.extend(preds.tolist())
                all_labels.extend(labels.tolist())
        
        # Calculate validation accuracy
        val_accuracy = accuracy_score(all_labels, all_preds)
        
        # Print epoch metrics
        print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {total_loss / len(train_loader):.4f}, Val Accuracy: {val_accuracy:.4f}')

# Evaluation function
def evaluate_model(model, test_loader, criterion, device):
    print('********* evaluate model **********')
    model.eval()
    all_preds, all_labels = [], []
    total_loss = 0

    with torch.no_grad():
        for inputs, labels in test_loader:
            inputs, labels = inputs.to(device), labels.to(device)
            
            # Forward pass
            outputs = model(inputs)
            loss = criterion(outputs, labels.long())
            total_loss += loss.item()
            
            # Get predictions
            print(outputs)
            _, preds = torch.max(outputs, 1)
            all_preds.extend(preds.tolist())
            all_labels.extend(labels.tolist())

    # Calculate test accuracy
    test_accuracy = accuracy_score(all_labels, all_preds)
    
    # Print classification report
    report = classification_report(all_labels, all_preds, target_names=["Negative", "Positive"])
    print(f'Test Loss: {total_loss / len(test_loader):.4f}')
    print(f'Test Accuracy: {test_accuracy:.4f}')
    print("Classification Report:\n", report)

# Train the model
train_model(model, train_loader, val_loader, criterion, optimizer, device, num_epochs=20)

# Evaluate the model on test data
evaluate_model(model, test_loader, criterion, device)

********* train model **********
Epoch [1/20], Loss: 0.6951, Val Accuracy: 0.5175
Epoch [2/20], Loss: 0.6937, Val Accuracy: 0.5175
Epoch [3/20], Loss: 0.6933, Val Accuracy: 0.5175
Epoch [4/20], Loss: 0.7064, Val Accuracy: 0.5175
Epoch [5/20], Loss: 0.7048, Val Accuracy: 0.5175
Epoch [6/20], Loss: 0.7017, Val Accuracy: 0.5175
Epoch [7/20], Loss: 0.7022, Val Accuracy: 0.4825
Epoch [8/20], Loss: 0.7012, Val Accuracy: 0.4825
Epoch [9/20], Loss: 0.7038, Val Accuracy: 0.5175
Epoch [10/20], Loss: 0.7015, Val Accuracy: 0.4825
Epoch [11/20], Loss: 0.7048, Val Accuracy: 0.5175
Epoch [12/20], Loss: 0.7039, Val Accuracy: 0.5175
Epoch [13/20], Loss: 0.7032, Val Accuracy: 0.5175
Epoch [14/20], Loss: 0.7026, Val Accuracy: 0.5175
Epoch [15/20], Loss: 0.7045, Val Accuracy: 0.4825
Epoch [16/20], Loss: 0.7029, Val Accuracy: 0.4825
Epoch [17/20], Loss: 0.7026, Val Accuracy: 0.5175
Epoch [18/20], Loss: 0.7041, Val Accuracy: 0.5175
Epoch [19/20], Loss: 0.7026, Val Accuracy: 0.5175
Epoch [20/20], Loss: 0.702

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [8]:
# Save the model's state_dict (weights)
torch.save(model.state_dict(), 'model.pth')

# Optionally, you can also save the optimizer's state_dict if you want to resume training:
torch.save({
    'model_state_dict': model.state_dict(),
    'optimizer_state_dict': optimizer.state_dict(),
    'epoch': 20,
    'loss': 0.7,  # or any other info you want to save
}, 'checkpoint.pth')