In [1]:
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification

# Load the DistilBERT model and tokenizer
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=2)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [2]:
from datasets import load_dataset

# Load the Yelp Polarity dataset
dataset = load_dataset('yelp_polarity')

In [3]:
# Explore dataset
# Check dataset structure
print("Dataset structure:", dataset.keys())

# Check size of train and test sets
print(f"Train set size: {len(dataset['train'])}")
print(f"Test set size: {len(dataset['test'])}")

# Examine data format
print("\nSample example from train set:")
print(dataset['train'][0])

# Check label distribution
train_labels = [example['label'] for example in dataset['train']]
test_labels = [example['label'] for example in dataset['test']]

print("\nLabel distribution in train set:")
print(f"Label 0 (negative): {train_labels.count(0)}")
print(f"Label 1 (positive): {train_labels.count(1)}")

print("\nLabel distribution in test set:")
print(f"Label 0 (negative): {test_labels.count(0)}")
print(f"Label 1 (positive): {test_labels.count(1)}")

# Display some examples
print("\nNegative review example:")
neg_idx = train_labels.index(0)
print(dataset['train'][neg_idx]['text'][:200] + "...")

print("\nPositive review example:")
pos_idx = train_labels.index(1)
print(dataset['train'][pos_idx]['text'][:200] + "...")

Dataset structure: dict_keys(['train', 'test'])
Train set size: 560000
Test set size: 38000

Sample example from train set:
{'text': "Unfortunately, the frustration of being Dr. Goldberg's patient is a repeat of the experience I've had with so many other doctors in NYC -- good doctor, terrible staff.  It seems that his staff simply never answers the phone.  It usually takes 2 hours of repeated calling to get an answer.  Who has time for that or wants to deal with it?  I have run into this problem with many other doctors and I just don't get it.  You have office workers, you have patients with medical needs, why isn't anyone answering the phone?  It's incomprehensible and not work the aggravation.  It's with regret that I feel that I have to give Dr. Goldberg 2 stars.", 'label': 0}

Label distribution in train set:
Label 0 (negative): 280000
Label 1 (positive): 280000

Label distribution in test set:
Label 0 (negative): 19000
Label 1 (positive): 19000

Negative review example:
Unfortuna

In [4]:
# Tokenize the dataset
def tokenize_function(examples):
    return tokenizer(examples['text'], padding='max_length', truncation=True)

tokenized_train = dataset['train'].map(tokenize_function, batched=True, num_proc=4)
tokenized_test = dataset['test'].map(tokenize_function, batched=True, num_proc=4)

In [None]:
from transformers import Trainer, TrainingArguments

# Set training arguments
training_args = TrainingArguments(
    output_dir='./results',
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=1,
    weight_decay=0.01,
    eval_strategy="epoch",
    use_mps_device=True
)



In [None]:
# Initialize the trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train.select(range(1000)),  # Use a subset for faster training
    eval_dataset=tokenized_test.select(range(500)),     # Use a subset for evaluation
)

In [15]:
# Fine-tune the model
trainer.train()



Epoch,Training Loss,Validation Loss
1,No log,0.253004




TrainOutput(global_step=63, training_loss=0.26669405377100386, metrics={'train_runtime': 207.7138, 'train_samples_per_second': 4.814, 'train_steps_per_second': 0.303, 'total_flos': 132467398656000.0, 'train_loss': 0.26669405377100386, 'epoch': 1.0})

In [16]:
# Evaluate the model
trainer.evaluate()



{'eval_loss': 0.25300395488739014,
 'eval_runtime': 29.5554,
 'eval_samples_per_second': 16.917,
 'eval_steps_per_second': 1.083,
 'epoch': 1.0}

In [None]:
# Save the model for future use
model.save_pretrained('./model')
tokenizer.save_pretrained('./model')

('./fine_tuned_yelp_model/tokenizer_config.json',
 './fine_tuned_yelp_model/special_tokens_map.json',
 './fine_tuned_yelp_model/vocab.txt',
 './fine_tuned_yelp_model/added_tokens.json')

In [24]:
import torch

# Function to determine the appropriate device
def get_device():
    """Determine whether to use MPS or CPU based on availability."""
    if torch.backends.mps.is_available() and torch.backends.mps.is_built():
        device = torch.device("mps")
        print("Using MPS acceleration")
    else:
        device = torch.device("cpu")
        print("MPS not available, using CPU")
    return device

# Function to perform sentiment prediction using the fine-tuned model
def predict_sentiment(text):
    # Try using the preferred device first
    device = get_device()
    
    try:
        # Create inputs and move to appropriate device
        inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
        inputs = {k: v.to(device) for k, v in inputs.items()}
        
        # Perform prediction
        return perform_prediction(inputs, device)
        
    except Exception as e:
        # If there's an error with the preferred device, fall back to CPU
        if device.type != "cpu":
            print(f"Error with {device.type}: {e}. Falling back to CPU.")
            device = torch.device("cpu")
            inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
            inputs = {k: v.to(device) for k, v in inputs.items()}
            return perform_prediction(inputs, device)
        else:
            # If we're already on CPU and still getting an error, raise it
            raise e

# Helper function to perform the actual prediction
def perform_prediction(inputs, device):
    """Perform sentiment prediction with the model on the specified device."""
    # Move model to device for inference
    model_on_device = model.to(device)
    
    # Get predictions
    with torch.no_grad():
        outputs = model_on_device(**inputs)
    
    predictions = torch.argmax(outputs.logits, dim=-1).item()

    if predictions == 1:
        return "Positive"
    else:
        return "Negative"

In [25]:
predict_sentiment(text="I hate this")

Using MPS acceleration


'Negative'