In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import numpy as np
from collections import Counter
import re
from sklearn.model_selection import train_test_split
from tqdm import tqdm
from datasets import load_dataset
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

class TextPreprocessor:
    """Clean raw text and convert it to fixed-length sequences of word indices.

    The vocabulary always reserves index 0 for ``<PAD>`` and index 1 for
    ``<UNK>``; the remaining slots (up to ``max_vocab_size`` in total) are
    filled with the most frequent words seen by ``build_vocabulary``.
    """

    PAD_TOKEN = '<PAD>'
    UNK_TOKEN = '<UNK>'

    def __init__(self, max_vocab_size=10000, max_seq_length=128):
        """Store the size limits; the vocabulary itself starts out empty."""
        self.max_vocab_size = max_vocab_size
        self.max_seq_length = max_seq_length
        self.word2idx = {}
        self.idx2word = {}
        self.vocab_size = 0

    def clean_text(self, text):
        """Clean and normalize text.

        Lower-cases the input, replaces HTML tags, URLs and non-word
        characters with spaces, and collapses whitespace. Returns the
        ``<UNK>`` token when nothing is left after cleaning.
        """
        text = str(text).lower()
        text = re.sub(r'<[^>]+>', ' ', text)  # Replace HTML tags with space
        text = re.sub(r'http\S+|www\S+', ' ', text)  # Replace URLs with space
        text = re.sub(r'[^\w\s]', ' ', text)  # Replace special chars with space
        text = ' '.join(text.split())  # Remove extra whitespace

        # Return '<UNK>' if text is empty after cleaning
        return text if text else self.UNK_TOKEN

    def build_vocabulary(self, texts):
        """Build vocabulary from list of texts.

        Any previously built vocabulary is discarded, so calling this method
        again never leaves stale or colliding indices behind.
        """
        print("Building vocabulary...")
        word_counts = Counter()
        for text in tqdm(texts):
            word_counts.update(self.clean_text(text).split())

        self._index_vocabulary(word_counts)
        print(f"Vocabulary size: {self.vocab_size}")

    def _index_vocabulary(self, word_counts):
        """Assign indices to the special tokens and the most frequent words.

        ``word_counts`` is a ``Counter`` of token frequencies. Special tokens
        that appear in it (``clean_text`` emits ``<UNK>`` for empty reviews)
        are skipped: re-inserting them would overwrite their reserved index
        and make the following word share an index with ``<UNK>``.
        """
        word2idx = {self.PAD_TOKEN: 0, self.UNK_TOKEN: 1}

        for word, _ in word_counts.most_common():
            if len(word2idx) >= self.max_vocab_size:
                break
            if word in word2idx:  # special token counted as a word
                continue
            word2idx[word] = len(word2idx)

        self.word2idx = word2idx
        self.idx2word = {idx: word for word, idx in word2idx.items()}
        self.vocab_size = len(word2idx)

    def text_to_sequence(self, text):
        """Convert text to sequence of word indices.

        Returns ``(sequence, length)`` where ``sequence`` has exactly
        ``max_seq_length`` entries (truncated or right-padded with the
        ``<PAD>`` index) and ``length`` is the number of real tokens in it.
        Words missing from the vocabulary map to the ``<UNK>`` index.
        """
        words = self.clean_text(text).split()

        # Handle empty sequences by adding an unknown token
        if not words:
            words = [self.UNK_TOKEN]

        unk_idx = self.word2idx[self.UNK_TOKEN]
        pad_idx = self.word2idx[self.PAD_TOKEN]

        # Truncate first so we never look up words that would be dropped
        sequence = [self.word2idx.get(word, unk_idx)
                    for word in words[:self.max_seq_length]]

        # Actual sequence length before padding
        length = len(sequence)
        sequence.extend([pad_idx] * (self.max_seq_length - length))

        return sequence, length

class ReviewDataset(Dataset):
    """Dataset pairing review texts with their star ratings.

    Each item is a triple of long tensors: the index sequence produced by
    ``preprocessor.text_to_sequence``, the rating shifted down by one, and
    the sequence length reported by the preprocessor.
    """

    def __init__(self, texts, ratings, preprocessor):
        self.texts, self.ratings = texts, ratings  # ratings: 1-5 stars
        self.preprocessor = preprocessor

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        review, stars = self.texts[idx], self.ratings[idx]
        token_ids, n_tokens = self.preprocessor.text_to_sequence(review)

        def as_long(value):
            return torch.tensor(value, dtype=torch.long)

        # The label is shifted so that 1-5 stars become class ids 0-4
        return as_long(token_ids), as_long(stars - 1), as_long(n_tokens)

import torch
import torch.nn as nn
import gensim.downloader as api
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
import numpy as np

class MulticlassSentimentRNN(nn.Module):
    """Bidirectional RNN classifier on top of frozen pre-trained word vectors.

    The embedding table is initialised from gensim's
    ``word2vec-google-news-300`` vectors and kept frozen. The final forward
    and backward hidden states of the top RNN layer are concatenated and fed
    through a small feed-forward head that produces ``n_classes`` logits.
    """

    # Index of the padding token; its embedding row stays all-zero.
    PAD_IDX = 0

    def __init__(self, vocab_size, vocab_to_idx, hidden_dim, n_classes, n_layers, dropout, device):
        """Build the network.

        Args:
            vocab_size: number of rows in the embedding table.
            vocab_to_idx: mapping from word to embedding row index.
            hidden_dim: hidden size of each RNN direction.
            n_classes: number of output classes.
            n_layers: number of stacked RNN layers.
            dropout: dropout probability (between RNN layers only when
                ``n_layers > 1``, and on embeddings / pooled hidden state).
            device: accepted for interface compatibility; not used here,
                callers move the model with ``.to(device)``.
        """
        super().__init__()

        # Load pre-trained Word2Vec embeddings from gensim
        print("Loading pre-trained word vectors...")
        word2vec = api.load('word2vec-google-news-300')
        embedding_dim = word2vec.vector_size

        # Initialize embedding matrix with pre-trained embeddings
        print("Initializing embedding matrix...")
        weight_matrix, words_found = self._build_embedding_matrix(
            vocab_size, vocab_to_idx, word2vec, pad_idx=self.PAD_IDX)

        print(f"Found pre-trained vectors for {words_found}/{vocab_size} words")

        # Create embedding layer with pre-trained, frozen weights
        self.embedding = nn.Embedding.from_pretrained(
            weight_matrix, freeze=True, padding_idx=self.PAD_IDX)

        # RNN layers
        self.rnn = nn.RNN(embedding_dim, hidden_dim, num_layers=n_layers,
                          bidirectional=True, dropout=dropout if n_layers > 1 else 0,
                          batch_first=True)

        self.fc = nn.Linear(hidden_dim * 2, hidden_dim)
        self.dropout = nn.Dropout(dropout)
        self.classifier = nn.Linear(hidden_dim, n_classes)

    @staticmethod
    def _build_embedding_matrix(vocab_size, vocab_to_idx, word_vectors, pad_idx=0):
        """Return ``(weight_matrix, words_found)`` for the given vocabulary.

        ``word_vectors`` must expose ``vector_size`` and raise ``KeyError``
        from ``word_vectors[word]`` for unknown words. Known words get their
        pre-trained vector, unknown words a small random vector, and the
        padding row is left at zero.
        """
        embedding_dim = word_vectors.vector_size
        weight_matrix = torch.zeros((vocab_size, embedding_dim))
        words_found = 0

        for word, idx in vocab_to_idx.items():
            if idx == pad_idx:
                continue  # keep the padding embedding all-zero
            try:
                vector = word_vectors[word]
            except KeyError:
                # Random initialization for unknown words
                weight_matrix[idx] = torch.randn(embedding_dim) * 0.1
            else:
                # Copy first: the stored vector may be a read-only numpy
                # array, which torch warns about when wrapped directly.
                weight_matrix[idx] = torch.from_numpy(
                    np.array(vector, dtype=np.float32))
                words_found += 1

        return weight_matrix, words_found

    def forward(self, text, lengths):
        """Return class logits for a batch of padded index sequences.

        ``text`` is a batch-first tensor of word indices and ``lengths``
        holds the number of real tokens per row; positions beyond each
        length are skipped by packing the sequence before the RNN.
        """
        embedded = self.dropout(self.embedding(text))
        # pack_padded_sequence requires the lengths on the CPU
        packed_embedded = pack_padded_sequence(embedded, lengths.cpu(),
                                               batch_first=True, enforce_sorted=False)
        _, hidden = self.rnn(packed_embedded)
        # Last layer's forward (-2) and backward (-1) final hidden states
        hidden = torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1)
        hidden = self.dropout(hidden)
        hidden = torch.relu(self.fc(hidden))
        return self.classifier(hidden)

def evaluate_model(model, data_loader, criterion, device):
    """Return ``(mean_loss, accuracy)`` of ``model`` over ``data_loader``.

    The model is switched to eval mode and run without gradients. The loss
    is averaged per sample (each batch loss is weighted by its batch size,
    which assumes ``criterion`` returns a per-batch mean), so a short final
    batch does not skew the result.

    Raises:
        ValueError: if ``data_loader`` yields no samples.
    """
    model.eval()
    loss_sum = 0.0
    n_correct = 0
    n_samples = 0

    with torch.no_grad():
        for texts, ratings, lengths in data_loader:
            texts = texts.to(device)
            ratings = ratings.to(device)
            lengths = lengths.to(device)

            predictions = model(texts, lengths)
            batch_size = ratings.size(0)

            loss_sum += criterion(predictions, ratings).item() * batch_size
            n_correct += (predictions.argmax(dim=1) == ratings).sum().item()
            n_samples += batch_size

    if n_samples == 0:
        raise ValueError("data_loader yielded no samples to evaluate")

    return loss_sum / n_samples, n_correct / n_samples


def train_model(model, train_loader, valid_loader, optimizer, device, n_epochs,
                checkpoint_path='best_model.pt'):
    """Train the model and checkpoint the weights with the best validation loss.

    Args:
        model: module called as ``model(texts, lengths)`` returning logits.
        train_loader: iterable of ``(texts, ratings, lengths)`` batches.
        valid_loader: iterable of validation batches in the same format.
        optimizer: optimizer over the model's parameters.
        device: device the batches are moved to.
        n_epochs: number of passes over ``train_loader``.
        checkpoint_path: where the best ``state_dict`` is saved.

    Returns:
        A list with one dict per epoch holding ``train_loss``,
        ``train_accuracy``, ``valid_loss`` and ``valid_accuracy``.

    Raises:
        ValueError: if ``valid_loader`` yields no samples.
    """
    criterion = nn.CrossEntropyLoss()
    best_valid_loss = float('inf')
    history = []

    for epoch in range(n_epochs):
        # Training phase
        model.train()
        loss_sum = 0.0
        n_correct = 0
        n_samples = 0

        progress_bar = tqdm(train_loader, desc=f'Epoch {epoch+1}/{n_epochs}')
        for texts, ratings, lengths in progress_bar:
            texts = texts.to(device)
            ratings = ratings.to(device)
            lengths = lengths.to(device)

            optimizer.zero_grad()
            predictions = model(texts, lengths)
            loss = criterion(predictions, ratings)

            loss.backward()
            optimizer.step()

            batch_size = ratings.size(0)
            loss_sum += loss.item() * batch_size
            n_correct += (predictions.argmax(dim=1) == ratings).sum().item()
            n_samples += batch_size

            # Update progress bar with running per-sample averages
            progress_bar.set_postfix({'loss': f'{loss_sum / n_samples:.4f}',
                                      'accuracy': f'{n_correct / n_samples:.4f}'})

        # Validation phase
        avg_valid_loss, valid_accuracy = evaluate_model(
            model, valid_loader, criterion, device)

        print(f'\nValidation Loss: {avg_valid_loss:.4f}')
        print(f'Validation Accuracy: {valid_accuracy:.4f}')

        # Save the best model
        if avg_valid_loss < best_valid_loss:
            best_valid_loss = avg_valid_loss
            torch.save(model.state_dict(), checkpoint_path)
            print('Best model saved!')

        history.append({
            'train_loss': loss_sum / max(n_samples, 1),
            'train_accuracy': n_correct / max(n_samples, 1),
            'valid_loss': avg_valid_loss,
            'valid_accuracy': valid_accuracy,
        })

    return history

def predict_rating(model, preprocessor, text, device):
    """Score one text and return ``(rating, confidence)``.

    ``rating`` is the arg-max class index plus one, and ``confidence`` is
    the softmax probability the model assigns to that class.
    """
    model.eval()

    token_ids, n_tokens = preprocessor.text_to_sequence(text)
    # Wrap in a list to form a batch of size one
    batch = torch.tensor([token_ids], dtype=torch.long).to(device)
    batch_lengths = torch.tensor([n_tokens], dtype=torch.long).to(device)

    with torch.no_grad():
        logits = model(batch, batch_lengths)
        class_probs = torch.softmax(logits, dim=1)
        best_class = torch.argmax(logits, dim=1).item()
        confidence = class_probs[0][best_class].item()

    return best_class + 1, confidence

def main():
    """Train the Yelp star-rating classifier end to end and print sample predictions.

    Loads a 50,000-review subset of ``yelp_review_full``, splits it 80/20
    with stratification, builds the vocabulary on the training split only,
    trains the model, restores the best checkpoint and scores a few
    hand-written reviews.
    """
    import os  # local import: only needed to check for the saved checkpoint

    # Set random seeds for reproducibility
    torch.manual_seed(42)
    np.random.seed(42)

    # Load Yelp reviews dataset
    print("Loading dataset...")
    dataset = load_dataset("yelp_review_full")

    # Use a subset of the training data
    subset = dataset["train"].shuffle(seed=42).select(range(50000))

    # Convert to lists
    texts = subset['text']
    ratings = [r + 1 for r in subset['label']]  # Convert from 0-4 to 1-5

    # Print dataset statistics (count once instead of scanning per star)
    print(f"\nTotal samples: {len(texts)}")
    print("\nRating distribution:")
    rating_counts = Counter(ratings)
    for i in range(1, 6):
        count = rating_counts[i]
        print(f"{i} stars: {count} reviews ({count/len(ratings)*100:.1f}%)")

    # Split data
    train_texts, valid_texts, train_ratings, valid_ratings = train_test_split(
        texts, ratings, test_size=0.2, random_state=42, stratify=ratings
    )

    # Initialize preprocessor
    preprocessor = TextPreprocessor(max_vocab_size=25000, max_seq_length=128)
    preprocessor.build_vocabulary(train_texts)

    # Create datasets
    train_dataset = ReviewDataset(train_texts, train_ratings, preprocessor)
    valid_dataset = ReviewDataset(valid_texts, valid_ratings, preprocessor)

    # Create data loaders
    train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
    valid_loader = DataLoader(valid_dataset, batch_size=32)

    # Initialize model
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(f"\nUsing device: {device}")

    model = MulticlassSentimentRNN(
        vocab_size=preprocessor.vocab_size,
        vocab_to_idx=preprocessor.word2idx,
        hidden_dim=256,
        n_classes=5,
        n_layers=3,
        dropout=0.5,
        device=device).to(device)

    # Initialize optimizer
    optimizer = optim.Adam(model.parameters(), lr=0.001)

    # Train model
    train_model(
        model=model,
        train_loader=train_loader,
        valid_loader=valid_loader,
        optimizer=optimizer,
        device=device,
        n_epochs=5
    )

    # train_model saves the lowest-validation-loss weights to 'best_model.pt'
    # but leaves the model at its last-epoch state; restore the best weights
    # so the predictions below come from the best model, not the final one.
    checkpoint_path = 'best_model.pt'
    if os.path.exists(checkpoint_path):
        model.load_state_dict(torch.load(checkpoint_path, map_location=device))

    # Test the model with some example reviews
    test_reviews = [
        "The food was absolutely amazing! The service was impeccable and the atmosphere was perfect. Will definitely come back!",
        "Terrible experience. Rude staff, cold food, and overpriced. Would not recommend to anyone.",
        "It's an okay place. The food is decent but nothing special. Prices are reasonable.",
        "Good restaurant with friendly staff. The food could be better but overall a pleasant experience.",
        "Average place. Service was slow but the food was decent. Might give it another try."
    ]

    print("\nTesting model with example reviews:")
    for review in test_reviews:
        rating, confidence = predict_rating(model, preprocessor, review, device)
        print(f"\nReview: {review}")
        print(f"Predicted Rating: {rating} stars (confidence: {confidence:.2f})")

# Run the full training/prediction pipeline only when this file is executed
# directly (script or notebook cell), not when it is imported.
if __name__ == "__main__":
    main()

Loading dataset...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/6.72k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/299M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/23.5M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/650000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/50000 [00:00<?, ? examples/s]


Total samples: 50000

Rating distribution:
1 stars: 10102 reviews (20.2%)
2 stars: 9955 reviews (19.9%)
3 stars: 9901 reviews (19.8%)
4 stars: 10128 reviews (20.3%)
5 stars: 9914 reviews (19.8%)
Building vocabulary...


100%|██████████| 40000/40000 [00:02<00:00, 15336.12it/s]


Vocabulary size: 24999

Using device: cuda
Loading pre-trained word vectors...
Initializing embedding matrix...


  weight_matrix[idx] = torch.FloatTensor(word2vec[word])


Found pre-trained vectors for 21500/24999 words


Epoch 1/5: 100%|██████████| 1250/1250 [00:45<00:00, 27.67it/s, loss=1.5885, accuracy=0.2404]



Validation Loss: 1.5710
Validation Accuracy: 0.2493
Best model saved!


Epoch 2/5: 100%|██████████| 1250/1250 [00:43<00:00, 28.62it/s, loss=1.5599, accuracy=0.2573]



Validation Loss: 1.5388
Validation Accuracy: 0.2743
Best model saved!


Epoch 3/5: 100%|██████████| 1250/1250 [00:39<00:00, 31.34it/s, loss=1.5244, accuracy=0.2893]



Validation Loss: 1.5420
Validation Accuracy: 0.2826


Epoch 4/5: 100%|██████████| 1250/1250 [00:41<00:00, 30.33it/s, loss=1.5533, accuracy=0.2643]



Validation Loss: 1.5349
Validation Accuracy: 0.2768
Best model saved!


Epoch 5/5: 100%|██████████| 1250/1250 [00:43<00:00, 28.64it/s, loss=1.5546, accuracy=0.2616]



Validation Loss: 1.5448
Validation Accuracy: 0.2780

Testing model with example reviews:

Review: The food was absolutely amazing! The service was impeccable and the atmosphere was perfect. Will definitely come back!
Predicted Rating: 2 stars (confidence: 0.21)

Review: Terrible experience. Rude staff, cold food, and overpriced. Would not recommend to anyone.
Predicted Rating: 1 stars (confidence: 0.99)

Review: It's an okay place. The food is decent but nothing special. Prices are reasonable.
Predicted Rating: 2 stars (confidence: 0.22)

Review: Good restaurant with friendly staff. The food could be better but overall a pleasant experience.
Predicted Rating: 4 stars (confidence: 0.27)

Review: Average place. Service was slow but the food was decent. Might give it another try.
Predicted Rating: 3 stars (confidence: 0.21)


In [None]:
!pip install datasets
!pip install gensim