In [13]:
# Step 1: Import libraries
# Always start by importing all necessary libraries for data handling, processing, and modeling.
from datasets import load_dataset
import torch
from torch.utils.data import DataLoader, Dataset
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

In [14]:
# Step 2: Load Dataset
# Fetch the training split of the Amazon Polarity dataset (positive/negative review classification).
dataset = load_dataset("mteb/amazon_polarity", split="train")
first_entry = dataset[0]
print("Sample dataset entry:", first_entry)  # Peek at one record to see its fields

Sample dataset entry: {'label': 1, 'text': 'Stuning even for the non-gamer\n\nThis sound track was beautiful! It paints the senery in your mind so well I would recomend it even to people who hate vid. game music! I have played the game Chrono Cross but out of all of the games I have ever played it has the best music! It backs away from crude keyboarding and takes a fresher step with grate guitars and soulful orchestras. It would impress anyone who cares to listen! ^_^', 'label_text': 'positive'}


In [16]:
# Step 2.1: Check keys in the dataset
sample_keys = dataset[0].keys()
print(sample_keys)

dict_keys(['label', 'text', 'label_text'])


In [17]:
# Step 3: Preprocess Data
# Pull the review texts and their labels out as plain Python lists for later
# tokenization and encoding. Reading each column once avoids walking the whole
# dataset row-by-row twice (once per field), which builds a full dict per sample.
# list(...) guarantees a real list regardless of what column access returns.
texts = list(dataset["text"])
labels = list(dataset["label"])

In [21]:
# Step 4: Split Data into Training and Validation Sets
# Hold out 10% of the reviews for validation; the fixed random_state keeps the split repeatable.
split_parts = train_test_split(texts, labels, test_size=0.1, random_state=42)
texts_train, texts_val, labels_train, labels_val = split_parts

In [22]:
# Step 5: Tokenization & Vocabulary
# For basic models (like LSTM), tokenize each review using a simple tokenizer.
tokenizer = get_tokenizer("basic_english")

# Reserve dedicated entries for padding and unknown words. Without specials, index 0
# belongs to the most frequent training token, so both the zero-padding used when
# encoding reviews and every out-of-vocabulary word would be read as that real token.
PAD_TOKEN, UNK_TOKEN = "<pad>", "<unk>"
vocab = build_vocab_from_iterator(
    map(tokenizer, texts_train),
    specials=[PAD_TOKEN, UNK_TOKEN],  # Specials are placed first: "<pad>" -> 0, "<unk>" -> 1
)
vocab.set_default_index(vocab[UNK_TOKEN])  # Unknown tokens map to "<unk>", not to a real word

In [23]:
# Step 6: Custom PyTorch Dataset Class
# Define a PyTorch Dataset to process and encode each review as tensors for training.
class AmazonReviewDataset(Dataset):
    """Encode (text, label) pairs as fixed-length index tensors.

    Args:
        texts: Sequence of raw review strings.
        labels: Sequence of labels aligned one-to-one with ``texts``.
        tokenizer: Callable turning a string into a list of tokens.
        vocab: Object supporting ``vocab[token] -> int``.
        max_length: Each encoded review is truncated or padded to exactly
            this many indices.
        pad_index: Index appended to reviews shorter than ``max_length``.
            Defaults to 0, the value that was previously hard-coded.

    Raises:
        ValueError: If ``texts`` and ``labels`` have different lengths.
    """

    def __init__(self, texts, labels, tokenizer, vocab, max_length=100, pad_index=0):
        if len(texts) != len(labels):
            raise ValueError(
                f"texts and labels must be the same length, got {len(texts)} and {len(labels)}"
            )
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.vocab = vocab
        self.max_length = max_length
        self.pad_index = pad_index

    def __len__(self):
        """Return the number of reviews."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return ``(indices, label)`` for review ``idx`` as two ``torch.long`` tensors.

        ``indices`` has exactly ``max_length`` entries: the review's token
        indices, truncated if too long and right-padded with ``pad_index``
        if too short.
        """
        tokens = self.tokenizer(self.texts[idx])
        indices = [self.vocab[token] for token in tokens][:self.max_length]
        indices.extend([self.pad_index] * (self.max_length - len(indices)))
        # Explicit dtype: an empty index list would otherwise produce a float tensor,
        # and both embedding lookups and CrossEntropyLoss targets need integer tensors.
        return (
            torch.tensor(indices, dtype=torch.long),
            torch.tensor(self.labels[idx], dtype=torch.long),
        )

In [24]:
# Step 7: DataLoader Setup
# Wrap each split in the encoding Dataset, then batch it; only the training
# batches are shuffled.
train_dataset = AmazonReviewDataset(
    texts=texts_train, labels=labels_train, tokenizer=tokenizer, vocab=vocab
)
val_dataset = AmazonReviewDataset(
    texts=texts_val, labels=labels_val, tokenizer=tokenizer, vocab=vocab
)

train_dataloader = DataLoader(dataset=train_dataset, batch_size=32, shuffle=True)
val_dataloader = DataLoader(dataset=val_dataset, batch_size=32, shuffle=False)

In [25]:
# Step 8: Model Definition (Simple LSTM)
# Build a simple neural network using PyTorch, suitable for sequence modeling tasks like text classification.
class SimpleLSTMClassifier(nn.Module):
    """Embedding -> single-layer LSTM -> linear classifier over token-index sequences.

    Args:
        vocab_size: Number of rows in the embedding table.
        embed_dim: Size of each token embedding.
        hidden_dim: Size of the LSTM hidden state.
        output_dim: Number of logits produced per sequence.
        padding_idx: Optional index used to right-pad sequences. When given,
            its embedding row is fixed at zero and the classifier reads the
            LSTM output at each sequence's last non-padding position.
            Defaults to ``None``, which classifies from the final timestep
            whatever it holds (for right-padded input, that is the state
            after all the padding has been consumed).
    """

    def __init__(self, vocab_size, embed_dim, hidden_dim, output_dim, padding_idx=None):
        super().__init__()
        # Plain attribute, not a parameter/buffer: state_dict keys stay unchanged.
        self.padding_idx = padding_idx
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=padding_idx)
        self.lstm = nn.LSTM(embed_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        """Map a ``(batch, seq_len)`` index tensor to ``(batch, output_dim)`` logits."""
        embedded = self.embedding(x)
        out, _ = self.lstm(embedded)
        if self.padding_idx is None:
            last = out[:, -1, :]  # Use the last output for classification
        else:
            # Count real tokens per row (assumes padding is trailing); an all-padding
            # row is clamped to length 1 so the index stays valid.
            lengths = (x != self.padding_idx).sum(dim=1).clamp(min=1)
            rows = torch.arange(x.size(0), device=x.device)
            last = out[rows, lengths - 1]
        return self.fc(last)

# Build the classifier: vocabulary-sized embedding table, 128-d embeddings and
# hidden state, two output logits.
model = SimpleLSTMClassifier(
    vocab_size=len(vocab),
    embed_dim=128,
    hidden_dim=128,
    output_dim=2,
)

In [26]:
# Step 9: Training Setup
# Cross-entropy over the model's output logits, minimized with Adam at a 1e-3 learning rate.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(params=model.parameters(), lr=1e-3)

In [None]:
# Step 10: Training Loop
# Make several full passes over the training batches, reporting the mean batch loss per epoch.
num_epochs = 3
for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    for batch_inputs, batch_targets in train_dataloader:
        optimizer.zero_grad()  # Drop gradients accumulated by the previous batch
        batch_loss = criterion(model(batch_inputs), batch_targets)
        batch_loss.backward()
        optimizer.step()
        running_loss += batch_loss.item()
    mean_loss = running_loss / len(train_dataloader)
    print(f"Epoch {epoch+1}, Loss: {mean_loss:.4f}")

In [None]:
# Step 11: Evaluation
# Score the model on the validation batches: accuracy plus binary precision/recall/F1.
model.eval()
all_preds, all_labels = [], []

with torch.no_grad():  # Inference only, so no gradient bookkeeping is needed
    for batch_inputs, batch_targets in val_dataloader:
        logits = model(batch_inputs)
        predicted = logits.argmax(dim=1)  # Highest-scoring class per review
        all_preds.extend(predicted.cpu().numpy())
        all_labels.extend(batch_targets.cpu().numpy())

accuracy = accuracy_score(all_labels, all_preds)
precision, recall, f1, _ = precision_recall_fscore_support(
    all_labels, all_preds, average='binary'
)

print(f"Validation Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}, Recall: {recall:.4f}, F1-Score: {f1:.4f}")

In [None]:
# Step 12: Save Model
# Persist only the learned weights (the state dict) so they can be reloaded into
# a freshly constructed model for deployment.
trained_state = model.state_dict()
torch.save(trained_state, "amazon_polarity_lstm_model.pth")

In [None]:
# Load saved model state
# Rebuild the architecture with the same hyperparameters used for training, restore
# the saved weights into it, then switch it to evaluation mode.
model_loaded = SimpleLSTMClassifier(
    vocab_size=len(vocab),
    embed_dim=128,
    hidden_dim=128,
    output_dim=2,
)
saved_state = torch.load("amazon_polarity_lstm_model.pth")
model_loaded.load_state_dict(saved_state)
model_loaded.eval()

def predict_sentiment(text, max_length=100, pad_index=0, model=None, tokenize=None, vocabulary=None):
    """Classify a single review as "Positive" or "Negative".

    The text is tokenized, mapped to indices, truncated or right-padded to
    ``max_length``, and passed through the model as a batch of one.

    Args:
        text: Raw review string.
        max_length: Number of token indices fed to the model. Defaults to
            100, the value that was previously hard-coded here.
        pad_index: Index used to fill short reviews. Defaults to 0, the value
            that was previously hard-coded here.
        model: Callable mapping a ``(1, max_length)`` index tensor to logits.
            Defaults to the notebook-level ``model_loaded``.
        tokenize: Callable turning a string into tokens. Defaults to the
            notebook-level ``tokenizer``.
        vocabulary: Object supporting ``vocabulary[token] -> int``. Defaults
            to the notebook-level ``vocab``.

    Returns:
        "Positive" when the highest-scoring class is 1, otherwise "Negative".
    """
    # Fall back to the notebook globals at call time so existing one-argument calls
    # keep working while the function can also be used with explicit components.
    if model is None:
        model = model_loaded
    if tokenize is None:
        tokenize = tokenizer
    if vocabulary is None:
        vocabulary = vocab

    tokens = tokenize(text)
    indices = [vocabulary[token] for token in tokens][:max_length]
    indices.extend([pad_index] * (max_length - len(indices)))
    # Explicit dtype keeps the tensor integer-typed even when there are no indices.
    input_tensor = torch.tensor(indices, dtype=torch.long).unsqueeze(0)  # Add batch dim

    with torch.no_grad():
        output = model(input_tensor)
        prediction = torch.argmax(output, dim=1).item()

    return "Positive" if prediction == 1 else "Negative"

# Example usage:
# Run the reloaded model on one hand-written review and show its predicted label.
sample_review = "This product exceeded my expectations and works perfectly!"
print(f"Review: {sample_review}")
predicted_label = predict_sentiment(sample_review)
print(f"Predicted Sentiment: {predicted_label}")