In [1]:
!tar xf aclImdb_v1.tar.gz

In [2]:
!rm -rf aclImdb/train/unsup

In [3]:
import os
from transformers import pipeline
from tqdm import tqdm
import torch

# 1. Initialize the Zero-Shot Pipeline
# Using 'facebook/bart-large-mnli' as it is the standard for zero-shot tasks
device = 0 if torch.cuda.is_available() else -1
classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli", device=device)

def get_local_imdb_data(split='test', num_samples=100):
    """
    Reads reviews and labels from the local aclImdb directory.
    """
    base_path = f"aclImdb/{split}"
    texts = []
    labels = []
    
    # Each folder (pos/neg) contains 12,500 .txt files
    for label_str in ["pos", "neg"]:
        dir_path = os.path.join(base_path, label_str)
        files = os.listdir(dir_path)[:num_samples // 2]  # Get balanced samples
        
        for fname in files:
            with open(os.path.join(dir_path, fname), 'r', encoding='utf-8') as f:
                texts.append(f.read())
                labels.append("positive" if label_str == "pos" else "negative")
                
    return texts, labels

# 2. Load the local data
test_texts, true_labels = get_local_imdb_data(split='test', num_samples=100)

# 3. Predict and Compute Accuracy
candidate_labels = ["positive", "negative"]
correct = 0

print(f"Running zero-shot classification on {len(test_texts)} local files...")

for text, true_label in tqdm(zip(test_texts, true_labels), total=len(test_texts)):
    # Truncation is necessary because IMDB reviews often exceed 512 tokens
    result = classifier(text, candidate_labels, truncation=True)
    
    # The label with the highest score is at index 0
    if result['labels'][0] == true_label:
        correct += 1

accuracy = (correct / len(test_texts)) * 100
print(f"\nFinal Accuracy: {accuracy:.2f}%")

2026-01-03 13:27:38.284072: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2026-01-03 13:27:38.284105: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2026-01-03 13:27:38.285254: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2026-01-03 13:27:38.291429: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
Device set to use cuda:0


Running zero-shot classification on 100 local files...


 10%|█         | 10/100 [00:01<00:13,  6.85it/s]You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
100%|██████████| 100/100 [00:14<00:00,  6.73it/s]


Final Accuracy: 89.00%





In [4]:
import os
import torch
import numpy as np
from pathlib import Path
from transformers import (
    AutoTokenizer, 
    AutoModelForSequenceClassification, 
    TrainingArguments, 
    Trainer, 
    DataCollatorWithPadding
)
import evaluate

# --- 1. Load Local Data ---
def read_imdb_split(split_dir):
    split_dir = Path(split_dir)
    texts = []
    labels = []
    for label_dir in ["pos", "neg"]:
        dir_path = split_dir / label_dir
        for text_file in dir_path.iterdir():
            texts.append(text_file.read_text(encoding="utf-8"))
            labels.append(1 if label_dir == "pos" else 0)
    return texts, labels

# Assumes you extracted the archive to a folder named 'aclImdb'
train_texts, train_labels = read_imdb_split('aclImdb/train')
test_texts, test_labels = read_imdb_split('aclImdb/test')

# --- 2. Tokenization ---
model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize_function(texts):
    return tokenizer(texts, truncation=True, padding=True)

train_encodings = tokenize_function(train_texts)
test_encodings = tokenize_function(test_texts)

# --- 3. Create Dataset Object ---
class IMDBDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = IMDBDataset(train_encodings, train_labels)
test_dataset = IMDBDataset(test_encodings, test_labels)

# --- 4. Define Metrics ---
accuracy_metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return accuracy_metric.compute(predictions=predictions, references=labels)

# --- 5. Initialize Model & Trainer ---
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [5]:
training_args = TrainingArguments(
    output_dir="./results",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    num_train_epochs=2,        # 2-3 epochs is usually enough for IMDB
    weight_decay=0.01,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

In [6]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# --- 6. Train ---
trainer.train()

  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy
1,0.2342,0.197972,0.92764
2,0.1538,0.230086,0.9322


TrainOutput(global_step=3126, training_loss=0.20515121379420304, metrics={'train_runtime': 3970.9059, 'train_samples_per_second': 12.592, 'train_steps_per_second': 0.787, 'total_flos': 6623369932800000.0, 'train_loss': 0.20515121379420304, 'epoch': 2.0})

In [7]:
# Final Evaluation
results = trainer.evaluate()
print(f"Final Accuracy: {results['eval_accuracy']:.4f}")

Final Accuracy: 0.9276
