In [2]:
from transformers import pipeline
import os

os.environ['TF_ENABLE_ONEDNN_OPTS'] = '0'

In [3]:
classifier = pipeline("sentiment-analysis")

No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision 714eb0f (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.





Device set to use cpu


In [4]:
classifier("We are very happy to show you the 🤗 Transformers library.")

[{'label': 'POSITIVE', 'score': 0.9997795224189758}]

In [34]:
from datasets import load_dataset
from transformers import BertTokenizer, set_seed
import random
import numpy as np
import torch

In [35]:
seed = 42
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
set_seed(seed)

In [36]:
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
torch.use_deterministic_algorithms(True)

In [37]:
dataset = load_dataset("imdb")
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'label'],
        num_rows: 50000
    })
})


In [38]:
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')

In [39]:
def tokenize(examples):
    return tokenizer(examples['text'], padding='max_length', truncation=True)

In [40]:
tokenized = dataset.map(tokenize, batched=True)

{'text': 'I rented I AM CURIOUS-YELLOW from my video store because of all the controversy that surrounded it when it was first released in 1967. I also heard that at first it was seized by U.S. customs if it ever tried to enter this country, therefore being a fan of films considered "controversial" I really had to see this for myself.<br /><br />The plot is centered around a young Swedish drama student named Lena who wants to learn everything she can about life. In particular she wants to focus her attentions to making some sort of documentary on what the average Swede thought about certain political issues such as the Vietnam War and race issues in the United States. In between asking politicians and ordinary denizens of Stockholm about their opinions on politics, she has sex with her drama teacher, classmates, and married men.<br /><br />What kills me about I AM CURIOUS-YELLOW is that 40 years ago, this was considered pornographic. Really, the sex and nudity scenes are few and far be

In [41]:
train_test_val = tokenized['train'].train_test_split(test_size=0.2)
train_dataset = train_test_val['train']
val_dataset = train_test_val['test']

In [42]:
from torch.utils.data import DataLoader

train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=8)
val_dataloader = DataLoader(val_dataset, batch_size=8)

In [45]:
from transformers import BertForSequenceClassification, AutoModelForSequenceClassification, Trainer, TrainingArguments, AdamW, get_linear_schedule_with_warmup

#model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
model = AutoModelForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=2)

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [46]:
optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8)

num_epochs = 2
batch_size = 16

total_steps = len(train_dataset) * num_epochs // batch_size
warmup_steps = int(0.1 * total_steps)  # 10% of steps for warmup

# Create scheduler
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=warmup_steps,
    num_training_steps=total_steps
)



In [47]:
# Define training arguments
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy="epoch",
    save_strategy="epoch",
  #  load_best_model_at_end=True,
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=num_epochs,
    fp16=True,
    weight_decay=0.01,
)
# Define Trainer with model, arguments, and datasets
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset
)
# Start training
trainer.train()

Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 

In [49]:
import numpy as np
import pandas as pd
from datasets import load_dataset
from transformers import pipeline, AutoTokenizer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
import torch
import warnings
warnings.filterwarnings("ignore")

# Load the IMDB dataset
imdb_dataset = load_dataset("imdb")

# Select model name
model_name = "distilbert-base-uncased-finetuned-sst-2-english"

# Load tokenizer to help with truncation
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Create sentiment analysis pipeline using pre-trained model
sentiment_pipeline = pipeline(
    "sentiment-analysis",
    model=model_name,
    tokenizer=tokenizer,
    device=0 if torch.cuda.is_available() else -1,  # Use GPU if available
    truncation=True,
    max_length=512  # Explicitly set max length
)

def analyze_sentiment(text):
    """Analyze sentiment and convert to binary (0 for negative, 1 for positive)"""
    try:
        result = sentiment_pipeline(text)[0]
        # The labels are typically NEGATIVE/POSITIVE
        if result['label'] == 'POSITIVE':
            return 1
        else:
            return 0
    except Exception as e:
        print(f"Error processing text: {str(e)}")
        # Return the most likely class as fallback
        return 1  # Assuming positive is more likely, adjust based on your dataset

# Define a function to evaluate the model on a subset of data
def evaluate_model(dataset, num_samples=None):
    # If num_samples is not specified or is None, use the entire dataset
    if num_samples is None or num_samples >= len(dataset):
        subset = dataset
        print(f"Using the entire dataset of {len(dataset)} samples")
    else:
        # Select a subset for evaluation
        subset = dataset.shuffle(seed=42).select(range(num_samples))
        print(f"Using a subset of {num_samples} samples")

    # Get predictions
    texts = subset["text"]
    true_labels = subset["label"]

    # Process in batches to avoid memory issues
    batch_size = 16  # Smaller batch size to handle memory better
    all_predictions = []

    print(f"Processing {len(texts)} reviews in batches of {batch_size}...")

    for i in range(0, len(texts), batch_size):
        if i % 100 == 0:
            print(f"Processing batch {i//batch_size + 1}/{len(texts)//batch_size + 1}")

        batch_texts = texts[i:i+batch_size]

        # Truncate texts that are too long (optional pre-processing)
        # batch_texts = [text[:10000] for text in batch_texts]  # Simple truncation if needed

        predictions = [analyze_sentiment(text) for text in batch_texts]
        all_predictions.extend(predictions)

    # Calculate metrics
    accuracy = accuracy_score(true_labels, all_predictions)
    precision = precision_score(true_labels, all_predictions)
    recall = recall_score(true_labels, all_predictions)
    f1 = f1_score(true_labels, all_predictions)

    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1 Score: {f1:.4f}")

    # Print detailed classification report
    print("\nClassification Report:")
    print(classification_report(true_labels, all_predictions, target_names=["Negative", "Positive"]))

    return {
        "accuracy": accuracy,
        "precision": precision,
        "recall": recall,
        "f1": f1
    }

# Evaluate on the entire test set
print(f"Evaluating on the entire IMDB test set ({len(imdb_dataset['test'])} samples)...")
results = evaluate_model(imdb_dataset["test"], num_samples=len(imdb_dataset["test"]))

# Display some example predictions
def show_example_predictions(dataset, num_examples=5):
    subset = dataset["test"].shuffle(seed=42).select(range(num_examples))

    for i, example in enumerate(subset):
        text = example["text"]
        true_label = "Positive" if example["label"] == 1 else "Negative"

        # Get model prediction
        prediction = sentiment_pipeline(text)[0]
        pred_label = prediction['label']
        confidence = prediction['score']

        print(f"\nExample {i+1}:")
        print(f"Text: {text[:200]}...")  # Show first 200 chars
        print(f"True label: {true_label}")
        print(f"Predicted: {pred_label} (confidence: {confidence:.4f})")

# Uncomment to see example predictions
# show_example_predictions(imdb_dataset)

Device set to use cpu


Evaluating on the entire IMDB test set (25000 samples)...
Using the entire dataset of 25000 samples
Processing 25000 reviews in batches of 16...
Processing batch 1/1563


KeyboardInterrupt: 