In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence

In [28]:
import csv
import numpy as np

# Replace 'file.csv' with the path to your CSV file
file_path = 'imdb_train.csv'

# Initialize an empty list to store rows
data = []

# Open and read the CSV file
with open(file_path, 'r') as file:
    reader = csv.reader(file)
    for row in reader:
        data.append(row)

data = data[1:]
# Display the first few rows of data
for row in data[:1]:  # Show the first 5 rows
    print(len(row))

print(len(data))

3
25000


In [None]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader

# Custom dataset class
class SentimentAnalysisDataset(Dataset):
    def __init__(self, csv_file):
        # Load data from CSV
        self.data = pd.read_csv(csv_file)
        
    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        # Get the row by index
        review = self.data.loc[idx, 'review']
        tokenized = eval(self.data.loc[idx, 'tokenized'])  # assuming tokenized is stored as a string of list
        label = self.data.loc[idx, 'label']
        
        # Convert to tensor
        tokenized_tensor = torch.tensor(tokenized, dtype=torch.long)
        label_tensor = torch.tensor(label, dtype=torch.long)
        
        return tokenized_tensor, label_tensor

# Collate function to pad sequences in each batch
def collate_fn(batch):
    # Separate tokenized sequences and labels
    tokenized_sequences = [item[0] for item in batch]
    labels = [item[1] for item in batch]
    
    # Pad sequences
    padded_sequences = pad_sequence(tokenized_sequences, batch_first=True, padding_value=0)
    
    # Stack labels into a single tensor
    labels = torch.stack(labels)
    
    return padded_sequences, labels

# Paths to CSV files
train_csv = 'imdb_train.csv'
test_csv = 'imdb_test.csv'

# Load datasets
train_dataset = SentimentAnalysisDataset(train_csv)
test_dataset = SentimentAnalysisDataset(test_csv)

# Create data loaders
batch_size = 32
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)


In [30]:
# Print one data row from the training dataset
sample_index = 0  # Replace with any valid index to print a different row
tokenized_tensor, label_tensor = train_dataset[sample_index]

print("Tokenized Text:", tokenized_tensor.shape)
print("Label:", label_tensor)

Tokenized Text: torch.Size([218])
Label: tensor(1)


In [None]:
# Define the RNN model
class SentimentRNN(nn.Module):
    def __init__(self, vocab_size, embed_size, hidden_size, output_size, num_layers=1):
        super(SentimentRNN, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.rnn = nn.RNN(embed_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)
    
    def forward(self, x):
        x = self.embedding(x)
        out, _ = self.rnn(x)
        out = out[:, -1, :]  # Take the output from the last RNN cell
        out = self.fc(out)
        return out

In [36]:
from collections import Counter
import pandas as pd

# Load the training dataset
train_csv = 'imdb_train.csv'  # Replace with actual path
train_data = pd.read_csv(train_csv)

# Count unique tokens in the 'tokenized' column
# Assuming 'tokenized' column contains lists of tokens as strings, e.g., "[1, 23, 456]"
all_tokens = []
for tokens in train_data['tokenized']:
    token_list = eval(tokens)  # Convert the string representation to a list
    all_tokens.extend(token_list)

# Calculate the vocabulary size
vocab_size = len(set(all_tokens))
print(vocab_size)

88585


In [37]:
# Load the training dataset
train_csv = 'imdb_train.csv'  # Replace with actual path
train_data = pd.read_csv(train_csv)

# Find the maximum sequence length in the 'tokenized' column
# Assuming 'tokenized' column contains lists of tokens as strings, e.g., "[1, 23, 456]"
max_seq_length = max(len(eval(tokens)) for tokens in train_data['tokenized'])
print("Maximum Sequence Length:", max_seq_length)

Maximum Sequence Length: 2494


In [None]:
# Parameters
vocab_size = 88585     # Adjust based on vocabulary size
embed_size = 128      # Size of the embedding layer
hidden_size = 2494      # Size of RNN hidden state
output_size = 2       # Number of output classes (e.g., positive and negative)
num_layers = 1        # Number of RNN layers
num_epochs = 5        # Number of epochs
learning_rate = 0.001 # Learning rate

# Initialize model, loss function, and optimizer
model = SentimentRNN(vocab_size, embed_size, hidden_size, output_size, num_layers)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Training loop
for epoch in range(num_epochs):
    for tokenized_tensor, label_tensor in train_loader:
        # Move tensors to the appropriate device if using GPU
        tokenized_tensor = tokenized_tensor
        label_tensor = label_tensor
        
        # Forward pass
        outputs = model(tokenized_tensor)
        loss = criterion(outputs, label_tensor)
        
        # Backward and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    
    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')

# Evaluation
model.eval()  # Set the model to evaluation mode
with torch.no_grad():
    correct = 0
    total = 0
    for tokenized_tensor, label_tensor in test_loader:
        outputs = model(tokenized_tensor)
        _, predicted = torch.max(outputs.data, 1)
        total += label_tensor.size(0)
        correct += (predicted == label_tensor).sum().item()

    print(f'Accuracy of the model on the test data: {100 * correct / total:.2f}%')