In [129]:
import numpy as np
import random
import torch
import os
from transformers import set_seed

# Define a single seed value to use throughout the code
SEED = 42

# Set seeds for Python's random module
random.seed(SEED)

# Set seed for NumPy
np.random.seed(SEED)

# Set seed for PyTorch
torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

# Set seed for Hugging Face Transformers
set_seed(SEED)

# For some operations on Apple Silicon (MPS)
os.environ["PYTHONHASHSEED"] = str(SEED)

print(f"All random seeds have been set to {SEED} for reproducibility")

All random seeds have been set to 42 for reproducibility


In [130]:
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification

# Load the DistilBERT model and tokenizer
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=2)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [131]:
# Create a fresh instance of the base DistilBERT model
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification

# Load the base model and tokenizer (will not use your fine-tuned weights)
base_tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
base_model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=2)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [132]:
from datasets import load_dataset

dataset = load_dataset("stanfordnlp/sst2")

In [133]:
import numpy as np

# Display basic information about the dataset
print("Dataset info:")
print(f"Dataset type: {type(dataset)}")
print(f"Dataset splits: {dataset.keys()}")
print("\nNumber of examples per split:")
for split in dataset.keys():
  print(f"  {split}: {len(dataset[split])}")

# Examine features in the dataset
print("\nFeatures in the dataset:")
for feature in dataset['train'].features:
  print(f"  {feature}: {dataset['train'].features[feature]}")

# Look at label distribution
print("\nLabel distribution:")
for split in dataset.keys():
  label_counts = {}
  for label in dataset[split]['label']:
    if label not in label_counts:
      label_counts[label] = 0
    label_counts[label] += 1
  print(f"  {split}: {label_counts}")

Dataset info:
Dataset type: <class 'datasets.dataset_dict.DatasetDict'>
Dataset splits: dict_keys(['train', 'validation', 'test'])

Number of examples per split:
  train: 67349
  validation: 872
  test: 1821

Features in the dataset:
  idx: Value(dtype='int32', id=None)
  sentence: Value(dtype='string', id=None)
  label: ClassLabel(names=['negative', 'positive'], id=None)

Label distribution:
  train: {0: 29780, 1: 37569}
  validation: {1: 444, 0: 428}
  test: {-1: 1821}


In [134]:
import numpy as np
from datasets import concatenate_datasets

# Step 1: Balance the training set
# Split dataset into positive and negative samples for the training set
train_positive = dataset['train'].filter(lambda example: example['label'] == 1)
train_negative = dataset['train'].filter(lambda example: example['label'] == 0)

print(f"Original training positive samples: {len(train_positive)}")
print(f"Original training negative samples: {len(train_negative)}")

# Downsample the positive class to match negative class size
np.random.seed(42)
positive_indices = np.random.choice(len(train_positive), len(train_negative), replace=False)
downsampled_train_positive = train_positive.select(positive_indices)

# Combine the balanced datasets for training
balanced_train = concatenate_datasets([downsampled_train_positive, train_negative])
balanced_train = balanced_train.shuffle(seed=42)

print(f"Total samples in balanced training set: {len(balanced_train)}")

# Step 2: Balance the validation set
val_positive = dataset['validation'].filter(lambda example: example['label'] == 1)
val_negative = dataset['validation'].filter(lambda example: example['label'] == 0)

print(f"Validation positive samples: {len(val_positive)}")
print(f"Validation negative samples: {len(val_negative)}")

# Balance validation set
if len(val_positive) > len(val_negative):
    val_positive_indices = np.random.choice(len(val_positive), len(val_negative), replace=False)
    balanced_val_positive = val_positive.select(val_positive_indices)
    balanced_validation = concatenate_datasets([balanced_val_positive, val_negative])
else:
    val_negative_indices = np.random.choice(len(val_negative), len(val_positive), replace=False)
    balanced_val_negative = val_negative.select(val_negative_indices)
    balanced_validation = concatenate_datasets([val_positive, balanced_val_negative])

balanced_validation = balanced_validation.shuffle(seed=42)
print(f"Total samples in balanced validation set: {len(balanced_validation)}")

# Step 3: Combine balanced train and validation into a single dataset
combined_dataset = concatenate_datasets([balanced_train, balanced_validation])
combined_dataset = combined_dataset.shuffle(seed=42)

print(f"\nTotal samples in combined dataset: {len(combined_dataset)}")

# Check the final class distribution
combined_balance = {}
for label in combined_dataset['label']:
    if label not in combined_balance:
        combined_balance[label] = 0
    combined_balance[label] += 1
print(f"Combined dataset label distribution: {combined_balance}")

Original training positive samples: 37569
Original training negative samples: 29780
Total samples in balanced training set: 59560
Validation positive samples: 444
Validation negative samples: 428
Total samples in balanced validation set: 856

Total samples in combined dataset: 60416
Combined dataset label distribution: {1: 30208, 0: 30208}


In [135]:
import numpy as np

# Calculate text length statistics
print("\nText length statistics (characters):")
lengths = [len(text) for text in combined_dataset['sentence']]
print(f"  Min: {min(lengths)}")
print(f"  Max: {max(lengths)}")
print(f"  Mean: {np.mean(lengths):.2f}")
print(f"  Median: {np.median(lengths):.2f}")

# Show word count statistics
print("\nWord count statistics:")
word_counts = [len(text.split()) for text in combined_dataset['sentence']]
print(f"  Min: {min(word_counts)}")
print(f"  Max: {max(word_counts)}")
print(f"  Mean: {np.mean(word_counts):.2f}")
print(f"  Median: {np.median(word_counts):.2f}")

# Show some examples from the combined dataset
print("\nSample examples from combined dataset:")
for i in range(3):  # Show 3 examples
    print(f"  Example {i+1}:")
    print(f"    Text: {combined_dataset['sentence'][i]}")
    print(f"    Label: {combined_dataset['label'][i]} ({['Negative', 'Positive'][combined_dataset['label'][i]]})")
    print()


Text length statistics (characters):
  Min: 2
  Max: 268
  Mean: 54.43
  Median: 40.00

Word count statistics:
  Min: 1
  Max: 52
  Mean: 9.60
  Median: 7.00

Sample examples from combined dataset:
  Example 1:
    Text: the emotion is impressively true for being so hot-blooded 
    Label: 1 (Positive)

  Example 2:
    Text: botches 
    Label: 0 (Negative)

  Example 3:
    Text: tricky and satisfying as any of david 
    Label: 1 (Positive)



In [136]:
# Split the combined dataset into train (70%), validation (15%), dev (10%), and test (5%)
dataset_size = len(combined_dataset)

# Calculate split sizes
train_size = int(dataset_size * 0.7)
val_size = int(dataset_size * 0.15)
dev_size = int(dataset_size * 0.1)
# The test_size will be the remainder (approximately 5%)
test_size = dataset_size - train_size - val_size - dev_size

print(f"Split sizes: Train={train_size}, Val={val_size}, Dev={dev_size}, Test={test_size}")

# Create splits - combined_dataset is already shuffled with seed=42
train_dataset = combined_dataset.select(range(train_size))
val_dataset = combined_dataset.select(range(train_size, train_size + val_size))
dev_dataset = combined_dataset.select(range(train_size + val_size, train_size + val_size + dev_size))
test_dataset = combined_dataset.select(range(train_size + val_size + dev_size, dataset_size))

# Verify sizes
print(f"\nFinal dataset sizes:")
print(f"Train: {len(train_dataset)} samples ({len(train_dataset)/dataset_size*100:.1f}%)")
print(f"Validation: {len(val_dataset)} samples ({len(val_dataset)/dataset_size*100:.1f}%)")
print(f"Dev: {len(dev_dataset)} samples ({len(dev_dataset)/dataset_size*100:.1f}%)")
print(f"Test: {len(test_dataset)} samples ({len(test_dataset)/dataset_size*100:.1f}%)")

# Check label distribution in each split
for split_name, split_dataset in [("Train", train_dataset), ("Validation", val_dataset), 
                                 ("Dev", dev_dataset), ("Test", test_dataset)]:
    label_counts = {}
    for label in split_dataset['label']:
        if label not in label_counts:
            label_counts[label] = 0
        label_counts[label] += 1
    print(f"{split_name} label distribution: {label_counts}")

Split sizes: Train=42291, Val=9062, Dev=6041, Test=3022

Final dataset sizes:
Train: 42291 samples (70.0%)
Validation: 9062 samples (15.0%)
Dev: 6041 samples (10.0%)
Test: 3022 samples (5.0%)
Train label distribution: {1: 21226, 0: 21065}
Validation label distribution: {1: 4508, 0: 4554}
Dev label distribution: {0: 3107, 1: 2934}
Test label distribution: {1: 1540, 0: 1482}


In [137]:
# Tokenize the dataset
def tokenize_function(examples):
    return tokenizer(examples['sentence'], padding='max_length', truncation=True)

tokenized_train = train_dataset.map(tokenize_function, batched=True, num_proc=4)
tokenized_val = val_dataset.map(tokenize_function, batched=True, num_proc=4)
tokenized_dev = dev_dataset.map(tokenize_function, batched=True, num_proc=4)
tokenized_test = test_dataset.map(tokenize_function, batched=True, num_proc=4)

In [138]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    
    accuracy = accuracy_score(labels, predictions)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, predictions, average='binary')
    
    return {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1
    }

In [139]:
from transformers import Trainer, TrainingArguments

# Set training arguments
training_args = TrainingArguments(
    output_dir='./results',
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=2,
    weight_decay=0.01,
    eval_strategy="epoch",
    use_mps_device=True,
    seed=SEED,
    data_seed=SEED
)

In [140]:
# Initialize the trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train.select(range(200)),  # Use a subset for faster training
    eval_dataset=tokenized_val.select(range(50)),     # Use a subset for evaluation
    compute_metrics=compute_metrics
)

In [141]:
# Fine-tune the model
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,No log,0.714729,0.46,0.46,1.0,0.630137
2,No log,0.704896,0.46,0.46,1.0,0.630137




TrainOutput(global_step=26, training_loss=0.6835443056546725, metrics={'train_runtime': 159.2995, 'train_samples_per_second': 2.511, 'train_steps_per_second': 0.163, 'total_flos': 52986959462400.0, 'train_loss': 0.6835443056546725, 'epoch': 2.0})

In [142]:
# Evaluate the model on validation set
val_results = trainer.evaluate()
print("\nValidation Results:")
for key, value in val_results.items():
  if isinstance(value, float):
    print(f"  {key}: {value:.4f}")
  else:
    print(f"  {key}: {value}")




Validation Results:
  eval_loss: 0.7049
  eval_accuracy: 0.4600
  eval_precision: 0.4600
  eval_recall: 1.0000
  eval_f1: 0.6301
  eval_runtime: 3.6534
  eval_samples_per_second: 13.6860
  eval_steps_per_second: 1.0950
  epoch: 2.0000


In [143]:
# Evaluate on the dev set
dev_trainer = Trainer(
    model=model,
    args=training_args,
    eval_dataset=tokenized_dev.select(range(50)),
    compute_metrics=compute_metrics,
)

dev_results = dev_trainer.evaluate()
print("\nDev set evaluation results:")
for key, value in dev_results.items():
  if isinstance(value, float):
    print(f"  {key}: {value:.4f}")
  else:
    print(f"  {key}: {value}")




Dev set evaluation results:
  eval_loss: 0.7017
  eval_model_preparation_time: 0.0009
  eval_accuracy: 0.4200
  eval_precision: 0.4200
  eval_recall: 1.0000
  eval_f1: 0.5915
  eval_runtime: 3.2796
  eval_samples_per_second: 15.2460
  eval_steps_per_second: 1.2200


In [144]:
# Evaluate on the test set
test_trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    eval_dataset=tokenized_test.select(range(50))
)

test_results = test_trainer.evaluate()
print("\nTest set evaluation results:")
for key, value in test_results.items():
  if isinstance(value, float):
    print(f"  {key}: {value:.4f}")
  else:
    print(f"  {key}: {value}")




Test set evaluation results:
  eval_loss: 0.6580
  eval_model_preparation_time: 0.0006
  eval_accuracy: 0.6200
  eval_precision: 0.6200
  eval_recall: 1.0000
  eval_f1: 0.7654
  eval_runtime: 3.3016
  eval_samples_per_second: 15.1440
  eval_steps_per_second: 1.2120


In [145]:
# Evaluate the base model on test dataset
base_test_trainer = Trainer(
    model=base_model,
    args=training_args,
    compute_metrics=compute_metrics,
    eval_dataset=tokenized_test.select(range(50))
)

base_test_results = base_test_trainer.evaluate()
print("\nBase (non-fine-tuned) model test results:")
for key, value in base_test_results.items():
  if isinstance(value, float):
    print(f"  {key}: {value:.4f}")
  else:
    print(f"  {key}: {value}")




Base (non-fine-tuned) model test results:
  eval_loss: 0.6950
  eval_model_preparation_time: 0.0006
  eval_accuracy: 0.4200
  eval_precision: 0.6667
  eval_recall: 0.1290
  eval_f1: 0.2162
  eval_runtime: 3.7541
  eval_samples_per_second: 13.3190
  eval_steps_per_second: 1.0650


In [146]:

# Show comparison with fine-tuned model
print("\nPerformance comparison (Test Set):")
print(f"{'Metric':<30} {'Fine-tuned':<15} {'Base model':<15} {'Difference':<15}")
print("-" * 65)
for key in test_results:
    if isinstance(test_results[key], float):
        improvement = test_results[key] - base_test_results[key]
        print(f"{key:<30} {test_results[key]:.4f}{'':<8} {base_test_results[key]:.4f}{'':<8} {improvement:.4f}")


Performance comparison (Test Set):
Metric                         Fine-tuned      Base model      Difference     
-----------------------------------------------------------------
eval_loss                      0.6580         0.6950         -0.0370
eval_model_preparation_time    0.0006         0.0006         0.0000
eval_accuracy                  0.6200         0.4200         0.2000
eval_precision                 0.6200         0.6667         -0.0467
eval_recall                    1.0000         0.1290         0.8710
eval_f1                        0.7654         0.2162         0.5492
eval_runtime                   3.3016         3.7541         -0.4525
eval_samples_per_second        15.1440         13.3190         1.8250
eval_steps_per_second          1.2120         1.0650         0.1470


In [147]:
# Save the model for future use
model.save_pretrained('./model')
tokenizer.save_pretrained('./model')

('./model/tokenizer_config.json',
 './model/special_tokens_map.json',
 './model/vocab.txt',
 './model/added_tokens.json')

In [148]:
import torch

# Function to determine the appropriate device
def get_device():
    """Determine whether to use MPS or CPU based on availability."""
    if torch.backends.mps.is_available() and torch.backends.mps.is_built():
        device = torch.device("mps")
    else:
        device = torch.device("cpu")
    return device

# Function to perform sentiment prediction using the fine-tuned model
def predict_sentiment(text):
    # Try using the preferred device first
    device = get_device()
    
    try:
        # Create inputs and move to appropriate device
        inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
        inputs = {k: v.to(device) for k, v in inputs.items()}
        
        # Perform prediction
        return perform_prediction(inputs, device)
        
    except Exception as e:
        # If there's an error with the preferred device, fall back to CPU
        if device.type != "cpu":
            print(f"Error with {device.type}: {e}. Falling back to CPU.")
            device = torch.device("cpu")
            inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
            inputs = {k: v.to(device) for k, v in inputs.items()}
            return perform_prediction(inputs, device)
        else:
            # If we're already on CPU and still getting an error, raise it
            raise e

# Helper function to perform the actual prediction
def perform_prediction(inputs, device):
    """Perform sentiment prediction with the model on the specified device."""
    # Move model to device for inference
    model_on_device = model.to(device)
    
    # Get predictions
    with torch.no_grad():
        outputs = model_on_device(**inputs)
    
    predictions = torch.argmax(outputs.logits, dim=-1).item()

    if predictions == 1:
        return "Positive"
    else:
        return "Negative"

In [149]:
# Function to perform sentiment prediction using the base (non-fine-tuned) model
def predict_sentiment_base(text):
    device = get_device()
    
    try:
        inputs = base_tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
        inputs = {k: v.to(device) for k, v in inputs.items()}
        
        # Move base model to device for inference
        base_model_on_device = base_model.to(device)
        
        # Get predictions
        with torch.no_grad():
            outputs = base_model_on_device(**inputs)
        
        predictions = torch.argmax(outputs.logits, dim=-1).item()

        if predictions == 1:
            return "Positive"
        else:
            return "Negative"
            
    except Exception as e:
        if device.type != "cpu":
            print(f"Error with {device.type}: {e}. Falling back to CPU.")
            device = torch.device("cpu")
            inputs = base_tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
            inputs = {k: v.to(device) for k, v in inputs.items()}
            
            # Retry with CPU
            base_model_on_device = base_model.to(device)
            with torch.no_grad():
                outputs = base_model_on_device(**inputs)
            
            predictions = torch.argmax(outputs.logits, dim=-1).item()
            return "Positive" if predictions == 1 else "Negative"
        else:
            raise e

In [150]:
# Compare predictions between fine-tuned and base models
test_examples = [
    ("Bad", "Negative"),
    ("Good", "Positive"),
    ("I hate this", "Negative"),
    ("I love this", "Positive"),
    ("This movie was OK", "Positive"),
    ("This movie was fantastic", "Positive"),
    ("This movie was terrible", "Negative"),
    ("This is not bad", "Positive"),
    ("Good movie but bad acting", "Positive"),
    ("Despite the poor beginning, the ending was great", "Positive"),
    ("The plot was intricate and the characters were well developed", "Positive"),
    ("A masterpiece of modern cinema with stunning visuals", "Positive"),    
    ("The director failed to engage the audience", "Negative"),
    ("Not the best film I've seen, but still enjoyable", "Positive"),
    ("I wouldn't recommend this to anyone", "Negative"),
    ("It wasn't as bad as the critics suggested", "Positive"),
    ("Absolutely brilliant performances by the entire cast", "Positive"),
    ("A complete waste of time and money", "Negative"),
    ("The special effects couldn't save the weak storyline", "Negative"),
    ("Despite its flaws, the film manages to be entertaining", "Positive"),
    ("It's so bad it's actually good", "Positive"),
    ("The film offers nothing new to the genre", "Negative"),
    ("While not perfect, it exceeded my expectations", "Positive"),
    ("The soundtrack was the only redeeming quality", "Negative")
]

print("\nComparing predictions between fine-tuned and base models:")
print(f"{'Text':<60} {'Ground Truth':<15} {'Fine-tuned':<15} {'Base model':<15} {'Match?':<10}")
print("-" * 115)

matches = 0
fine_tuned_correct = 0
base_correct = 0
for example, ground_truth in test_examples:
    fine_tuned_pred = predict_sentiment(example)
    base_pred = predict_sentiment_base(example)
    match = "✓" if fine_tuned_pred == base_pred else "✗"
    if fine_tuned_pred == base_pred:
        matches += 1
    
    # Count correct predictions (excluding Mixed/Neutral cases)
    if ground_truth in ["Positive", "Negative"]:
        if fine_tuned_pred == ground_truth:
            fine_tuned_correct += 1
        if base_pred == ground_truth:
            base_correct += 1
    
    # Truncate long examples for display
    display_text = example[:60] + "..." if len(example) > 60 else example
    print(f"{display_text:<60} {ground_truth:<15} {fine_tuned_pred:<15} {base_pred:<15} {match:<10}")

# Calculate metrics
agreement_pct = (matches / len(test_examples)) * 100
# Count binary sentiment examples (not Mixed or Neutral)
binary_examples = sum(1 for _, label in test_examples if label in ["Positive", "Negative"])
fine_tuned_acc = (fine_tuned_correct / binary_examples) * 100 if binary_examples > 0 else 0
base_acc = (base_correct / binary_examples) * 100 if binary_examples > 0 else 0

# Determine which model is better
better_model = "Fine-tuned" if fine_tuned_acc > base_acc else "Base" if base_acc > fine_tuned_acc else "Both equal"

print(f"\nFine-tuned model accuracy: {fine_tuned_correct}/{binary_examples} ({fine_tuned_acc:.1f}%)")
print(f"Base model accuracy: {base_correct}/{binary_examples} ({base_acc:.1f}%)")
print(f"Higher accuracy: {better_model}")
print(f"\nAgreement between models: {matches}/{len(test_examples)} ({agreement_pct:.1f}%)")


Comparing predictions between fine-tuned and base models:
Text                                                         Ground Truth    Fine-tuned      Base model      Match?    
-------------------------------------------------------------------------------------------------------------------
Bad                                                          Negative        Positive        Positive        ✓         
Good                                                         Positive        Positive        Positive        ✓         
I hate this                                                  Negative        Positive        Positive        ✓         
I love this                                                  Positive        Positive        Negative        ✗         
This movie was OK                                            Positive        Positive        Negative        ✗         
This movie was fantastic                                     Positive        Positive        Negative    