In [1]:
!pip install transformers torch datasets pandas numpy

Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec (from torch)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.1.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m11.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m10.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl (179 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

## Feature Extraction

In [9]:
import numpy as np
import torch
from tqdm import tqdm
from torch.utils.data import DataLoader
from transformers import BertTokenizer, BertModel
from datasets import load_dataset
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Pretrained BERT tokenizer and encoder (same checkpoint for both).
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

# Only when the tokenizer ships without a padding token: register one and grow
# the embedding matrix so the new token id has a row.
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    model.resize_token_embeddings(len(tokenizer))

# The encoder is used for inference only, so switch it to evaluation mode.
model.eval()

# Prefer the GPU when one is visible, otherwise stay on the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

print(f'Using device: {device}')

# IMDb reviews; only the training split is used below.
dataset = load_dataset('imdb')
train_dataset = dataset['train']

# Peek at one raw record.
print(train_dataset[0])

# Every review is padded/truncated to this many tokens.
max_length = 128

# Tokenization function
def tokenize(batch):
    """Tokenize the 'text' entries of `batch` with the module-level `tokenizer`.

    Sequences are padded to, and truncated at, the module-level `max_length`.
    Returns whatever the tokenizer call returns.
    """
    encoding_options = {
        'padding': 'max_length',
        'truncation': True,
        'max_length': max_length,
    }
    return tokenizer(batch['text'], **encoding_options)

# Number of examples per batch, used both for tokenization and for the loader.
batch_size = 32

# Tokenize the whole training split in batches.
tokenized_train = train_dataset.map(
    tokenize,
    batched=True,
    batch_size=batch_size,
)

# Show which columns the tokenized dataset now carries.
print(tokenized_train.features)

# Expose only the model inputs and the label, as PyTorch tensors.
tokenized_train.set_format(
    'torch',
    columns=['input_ids', 'attention_mask', 'label'],
)

# Keep the original order (no shuffling) while iterating for feature extraction.
train_loader = DataLoader(
    tokenized_train,
    batch_size=batch_size,
    shuffle=False,
)

# Function to extract features
def extract_features(loader, model, device):
    """Run `model` over every batch of `loader` and collect first-position embeddings.

    The model is called under `torch.no_grad()`. This function does not change
    the model's train/eval mode; the caller is expected to have set it.

    Args:
        loader: Iterable of batches. Each batch is indexed with the keys
            'input_ids', 'attention_mask' and 'label'.
        model: Callable invoked as `model(input_ids=..., attention_mask=...)`
            whose result exposes a `last_hidden_state` tensor.
        device: Device the input tensors are moved to before the forward pass.

    Returns:
        tuple: `(features, labels)` as NumPy arrays. `features` stacks
        `last_hidden_state[:, 0, :]` of every batch along axis 0 (the [CLS]
        position for the BERT tokenizer used in this notebook); `labels` holds
        the corresponding 'label' values in loader order.

    Raises:
        ValueError: If `loader` yields no batches.
    """
    feature_chunks = []
    label_chunks = []

    # Inference only: no autograd bookkeeping needed.
    with torch.no_grad():
        for batch in tqdm(loader, desc="Extracting Features"):
            # Inputs must live on the same device as the model.
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)

            # Keep only the hidden state at sequence position 0 for each example.
            feature_chunks.append(outputs.last_hidden_state[:, 0, :].cpu().numpy())

            # Keep labels as one array per batch; they are joined once at the end.
            label_chunks.append(batch['label'].cpu().numpy())

    # np.concatenate on an empty list fails with an unhelpful message; fail
    # early with one that names the actual problem.
    if not feature_chunks:
        raise ValueError("loader yielded no batches; cannot extract features")

    all_features = np.concatenate(feature_chunks, axis=0)
    all_labels = np.concatenate(label_chunks, axis=0)

    return all_features, all_labels

# Embed the whole tokenized training split with the BERT encoder.
features, labels = extract_features(train_loader, model, device)

print(f'Feature shape: {features.shape}')  # (25000, 768)
print(f'Labels shape: {labels.shape}')      # (25000,)

# Hold out 20% of the embedded examples for validation; the seed fixes the split.
X_train, X_val, y_train, y_val = train_test_split(
    features,
    labels,
    test_size=0.2,
    random_state=42,
)

print(f'Training set size: {X_train.shape}')
print(f'Validation set size: {X_val.shape}')

# Fit a logistic-regression head on top of the extracted embeddings.
print("Training the classifier...")
classifier = LogisticRegression(max_iter=1000).fit(X_train, y_train)

# Score the held-out examples.
y_pred = classifier.predict(X_val)
accuracy = accuracy_score(y_val, y_pred)
print(f'Validation Accuracy: {accuracy * 100:.2f}%')

# Per-class precision / recall / F1.
print("\nClassification Report:")
print(
    classification_report(
        y_val,
        y_pred,
        target_names=['Negative', 'Positive'],
    )
)

Using device: cuda
{'text': 'I rented I AM CURIOUS-YELLOW from my video store because of all the controversy that surrounded it when it was first released in 1967. I also heard that at first it was seized by U.S. customs if it ever tried to enter this country, therefore being a fan of films considered "controversial" I really had to see this for myself.<br /><br />The plot is centered around a young Swedish drama student named Lena who wants to learn everything she can about life. In particular she wants to focus her attentions to making some sort of documentary on what the average Swede thought about certain political issues such as the Vietnam War and race issues in the United States. In between asking politicians and ordinary denizens of Stockholm about their opinions on politics, she has sex with her drama teacher, classmates, and married men.<br /><br />What kills me about I AM CURIOUS-YELLOW is that 40 years ago, this was considered pornographic. Really, the sex and nudity scenes

Extracting Features: 100%|██████████| 782/782 [02:48<00:00,  4.64it/s]


Feature shape: (25000, 768)
Labels shape: (25000,)
Training set size: (20000, 768)
Validation set size: (5000, 768)
Training the classifier...
Validation Accuracy: 81.08%

Classification Report:
              precision    recall  f1-score   support

    Negative       0.81      0.82      0.81      2515
    Positive       0.81      0.81      0.81      2485

    accuracy                           0.81      5000
   macro avg       0.81      0.81      0.81      5000
weighted avg       0.81      0.81      0.81      5000



## Testing

In [12]:
import torch
from transformers import BertTokenizer, BertModel
import pickle
import numpy as np


def predict_label(text, tokenizer, model, classifier, device, max_length=128):
    """
    Classify one piece of text via its first-position BERT embedding.

    The text is tokenized (padded/truncated to `max_length`), passed through
    `model` without gradient tracking, and the hidden state at position 0 is
    handed to `classifier.predict`.

    Args:
        text (str): Text to classify.
        tokenizer: Callable tokenizer returning 'input_ids' and 'attention_mask'.
        model: Encoder called as `model(input_ids=..., attention_mask=...)`;
            its result must expose `last_hidden_state`.
        classifier: Object with a `predict` method taking the embedding array.
        device: Device the input tensors are moved to.
        max_length (int): Sequence length used for padding and truncation.

    Returns:
        str: 'Negative' for class 0, 'Positive' for class 1, and 'Unknown'
        for any other predicted value.
    """
    encoded = tokenizer(
        text,
        padding='max_length',
        truncation=True,
        max_length=max_length,
        return_tensors='pt',
    )

    # The tensors must sit on the same device as the model.
    ids = encoded['input_ids'].to(device)
    mask = encoded['attention_mask'].to(device)

    # Forward pass without autograd; keep the position-0 vector as a NumPy array.
    with torch.no_grad():
        encoder_output = model(input_ids=ids, attention_mask=mask)
        cls_vector = encoder_output.last_hidden_state[:, 0, :].cpu().numpy()

    # One input text -> one prediction; take it out of the returned array.
    predicted_class = classifier.predict(cls_vector)[0]

    class_names = {0: 'Negative', 1: 'Positive'}
    return class_names.get(predicted_class, 'Unknown')

# Example usage. Relies on `tokenizer`, `model`, `classifier` and `device`
# already existing in the kernel from the feature-extraction cell.
if __name__ == "__main__":
    # A clearly positive, a clearly negative, and a lukewarm review.
    texts = [
        "I absolutely loved this movie! The performances were outstanding.",
        "This was the worst film I have ever seen. It was a complete waste of time.",
        "An average movie with some good moments but overall not impressive.",
    ]

    # Classify each review and show it next to its predicted label.
    for text in texts:
        label = predict_label(text, tokenizer, model, classifier, device)
        print(f"Text: {text}\nPredicted Label: {label}\n")


Text: I absolutely loved this movie! The performances were outstanding.
Predicted Label: Positive

Text: This was the worst film I have ever seen. It was a complete waste of time.
Predicted Label: Negative

Text: An average movie with some good moments but overall not impressive.
Predicted Label: Positive

