In [1]:
import gzip
import shutil
import time
import pandas as pd
import requests
import torch
import torch.nn.functional as F
import torchtext
import transformers
from transformers import DistilBertTokenizerFast
from transformers import DistilBertForSequenceClassification

In [2]:
# Reproducibility: request deterministic cuDNN behavior and seed PyTorch's RNG.
torch.backends.cudnn.deterministic = True
RANDOM_SEED = 123
torch.manual_seed(RANDOM_SEED)
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Number of epochs run by the hand-written training loop further down.
NUM_EPOCHS = 2

# Location of the gzipped movie-review CSV.
url = (
    "https://github.com/rasbt/"
    "machine-learning-book/raw/"
    "main/ch08/movie_data.csv.gz"
)
filename = url.split("/")[-1]

# Fetch first and validate the response *before* opening the output file, so
# a failed or hung request cannot leave an empty/garbage archive on disk.
r = requests.get(url, timeout=60)
r.raise_for_status()

with open(filename, "wb") as f:
    f.write(r.content)

# Decompress the archive that was just written (same name as `filename`).
with gzip.open(filename, 'rb') as f_in:
    with open('movie_data.csv', 'wb') as f_out:
        shutil.copyfileobj(f_in, f_out)

In [3]:
# Load the decompressed CSV and preview its first three rows.
df = pd.read_csv('movie_data.csv')
df.head(3)

Unnamed: 0,review,sentiment
0,"In 1974, the teenager Martha Moxley (Maggie Gr...",1
1,OK... so... I really like Kris Kristofferson a...,0
2,"***SPOILER*** Do not read this, if you think a...",0


In [4]:
# Very small subset, because training on the full dataset takes too long.
# The three splits are disjoint row ranges:
#   rows 0-34 -> train, rows 35-39 -> validation, rows 40-49 -> test.
train_texts = df.iloc[:35]['review'].values
train_labels = df.iloc[:35]['sentiment'].values

# NOTE: the slice must be `35:40` (start:stop). `:35:40` would mean
# "rows 0..34 with step 40", i.e. only row 0 -- a training row.
valid_texts = df.iloc[35:40]['review'].values
valid_labels = df.iloc[35:40]['sentiment'].values

test_texts = df.iloc[40:50]['review'].values
test_labels = df.iloc[40:50]['sentiment'].values

In [5]:
# Tokenizer loaded from the 'distilbert-base-uncased' checkpoint.
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

# Tokenize every split separately, with truncation and padding enabled.
train_encodings = tokenizer(list(train_texts), truncation=True, padding=True)
valid_encodings = tokenizer(list(valid_texts), truncation=True, padding=True)
test_encodings = tokenizer(list(test_texts), truncation=True, padding=True)

In [6]:
class IMDbDataset(torch.utils.data.Dataset):
    """Dataset that pairs tokenizer encodings with per-example labels.

    ``encodings`` is a mapping whose values are indexable per example;
    ``labels`` is an indexable sequence with one entry per example.
    """

    def __init__(self, encodings, labels):
        """Keep references to the encodings mapping and the labels."""
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        """Return the number of examples, taken from the labels."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors.

        The dict has one tensor per encoding field plus a ``'labels'``
        tensor holding the example's label.
        """
        item = {}
        for key, val in self.encodings.items():
            item[key] = torch.tensor(val[idx])
        item['labels'] = torch.tensor(self.labels[idx])
        return item


# Wrap each split's encodings and labels in a dataset object.
train_dataset = IMDbDataset(train_encodings, train_labels)
valid_dataset = IMDbDataset(valid_encodings, valid_labels)
test_dataset = IMDbDataset(test_encodings, test_labels)

# Batches of 16 for every split; only the training loader shuffles.
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=16, shuffle=True)
valid_loader = torch.utils.data.DataLoader(valid_dataset, batch_size=16, shuffle=False)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=16, shuffle=False)

In [7]:
# Load DistilBERT for sequence classification from the pretrained checkpoint,
# move it to the selected device, and put it in training mode.
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased')
model.to(DEVICE)
model.train()

# Adam over all model parameters with a learning rate of 5e-5.
optim = torch.optim.Adam(params=model.parameters(), lr=5e-5)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
def compute_accuracy(model, data_loader, device):
    """Return the accuracy of ``model`` on ``data_loader`` as a percentage.

    Args:
        model: A ``torch.nn.Module`` called as
            ``model(input_ids, attention_mask=...)`` whose output exposes
            the class scores under the ``'logits'`` key.
        data_loader: Iterable of batches; each batch is a mapping with
            ``'input_ids'``, ``'attention_mask'`` and ``'labels'`` tensors.
        device: Device the batch tensors are moved to before the forward pass.

    Returns:
        A 0-dim float tensor on ``device`` holding the accuracy in percent,
        or NaN when ``data_loader`` yields no examples (accuracy over zero
        examples is undefined).
    """
    # Evaluate in eval mode (no dropout), then restore whatever mode the
    # caller had set so this function has no lasting effect on the model.
    was_training = model.training
    model.eval()

    # Start from a tensor so the final `.float()` works even if no batch ran.
    correct_pred = torch.zeros((), dtype=torch.long, device=device)
    num_examples = 0
    try:
        with torch.no_grad():
            for batch in data_loader:
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                labels = batch['labels'].to(device)

                outputs = model(input_ids, attention_mask=attention_mask)
                logits = outputs['logits']
                # Predicted class = index of the largest logit per example.
                predicted_labels = torch.argmax(logits, 1)
                num_examples += labels.size(0)
                correct_pred += (predicted_labels == labels).sum()
    finally:
        model.train(was_training)

    if num_examples == 0:
        return torch.tensor(float('nan'), device=device)
    return correct_pred.float() / num_examples * 100

In [9]:
# Hand-written fine-tuning loop: train for NUM_EPOCHS epochs, printing the
# loss for every batch and the train/validation accuracy after every epoch.
start_time = time.time()

for epoch in range(NUM_EPOCHS):
    # Switch back to training mode; the model is put in eval mode at the end
    # of each epoch for the accuracy computation below.
    model.train()
    
    for batch_idx, batch in enumerate(train_loader):
        # Move the batch tensors to the device the model lives on.
        input_ids = batch['input_ids'].to(DEVICE)
        attention_mask = batch['attention_mask'].to(DEVICE)
        labels = batch['labels'].to(DEVICE)
        
        # Labels are passed to the model; the loss is read from its output.
        outputs = model(
            input_ids,
            attention_mask=attention_mask,
            labels=labels,
        )
        loss, logits = outputs['loss'], outputs['logits']
        
        # Clear old gradients, backpropagate, and apply one optimizer step.
        optim.zero_grad()
        loss.backward()
        optim.step()
        
        print(
            f'Epoch: {epoch+1:04d}/{NUM_EPOCHS:04d}'
            f' | Batch'
            f'{batch_idx:04d}/'
            f'{len(train_loader):04d} | '
            f'Loss: {loss:.4f}'
        )
    
    model.eval()
    
    # Gradients are not needed for the accuracy computation.
    with torch.set_grad_enabled(False):
        print(
            f'Training accuracy: '
            f'{compute_accuracy(model, train_loader, DEVICE):.2f}%'
            f'\nValid accuracy: '
            f'{compute_accuracy(model, valid_loader, DEVICE):.2f}%'
        )
    print(f'Time elapsed: {(time.time() - start_time)/60:.2f} min')

# Final report: total wall-clock time and accuracy on the held-out test split.
print(f'Total Training Time: {(time.time() - start_time)/60:.2f} min')
print(f'Test accuracy: {compute_accuracy(model, test_loader, DEVICE):.2f}%')

Epoch: 0001/0002 | Batch0000/0003 | Loss: 0.6989
Epoch: 0001/0002 | Batch0001/0003 | Loss: 0.6750
Epoch: 0001/0002 | Batch0002/0003 | Loss: 0.6892
Training accuracy: 91.43%
Valid accuracy: 100.00%
Time elapsed: 0.97 min
Epoch: 0002/0002 | Batch0000/0003 | Loss: 0.6583
Epoch: 0002/0002 | Batch0001/0003 | Loss: 0.6355
Epoch: 0002/0002 | Batch0002/0003 | Loss: 0.6838
Training accuracy: 97.14%
Valid accuracy: 100.00%
Time elapsed: 1.85 min
Total Training Time: 1.85 min
Test accuracy: 50.00%


In [9]:
# Trainer API: fine-tune a freshly loaded DistilBERT with Hugging Face's
# `Trainer` instead of the hand-written loop above.
from transformers import Trainer, TrainingArguments

# Reload the pretrained checkpoint so this run does not continue from the
# weights already updated by the manual loop.
model = DistilBertForSequenceClassification.from_pretrained(
    'distilbert-base-uncased'
)
model.to(DEVICE)
model.train()

optim = torch.optim.Adam(model.parameters(), lr=5e-5)

# `output_dir` is the first positional parameter of TrainingArguments, so it
# must be supplied exactly once; passing a positional string *and*
# `output_dir=` raises "got multiple values for argument 'output_dir'".
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy='epoch',
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    logging_dir='./logs',
    logging_steps=10
)

# evaluation_strategy='epoch' needs something to evaluate on, so the
# validation split is supplied as the eval dataset.
# The custom Adam optimizer is used; `None` leaves the LR scheduler to Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    optimizers=(optim, None)
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [13]:
import numpy as np


def compute_metrics(eval_pred):
    """Compute classification accuracy for ``Trainer`` evaluation.

    Accuracy is computed directly with numpy rather than through
    ``datasets.load_metric``, which is deprecated and needs to fetch a
    metric script at call time.

    Args:
        eval_pred: A ``(logits, labels)`` pair; ``logits`` holds per-class
            scores along the last axis and ``labels`` the true class ids.

    Returns:
        ``{'accuracy': <fraction of correct predictions as a float>}``.
    """
    logits, labels = eval_pred
    # Predicted class = index of the largest score along the last axis.
    predictions = np.argmax(logits, axis=-1)
    accuracy = np.mean(predictions == np.asarray(labels))
    return {'accuracy': float(accuracy)}

# Re-create the trainer, now with an evaluation dataset and the accuracy
# callback defined above.
# NOTE(review): the *test* split is passed as `eval_dataset`, so every
# evaluation run by this trainer is on test data -- confirm this is intended
# rather than `valid_dataset`.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
    optimizers=(optim, None)
)

In [14]:
# Time the Trainer-driven fine-tuning run, then print the evaluation metrics.
start_time = time.time()
trainer.train()

print(f'Total Training Time: {(time.time() - start_time)/60:.2f} min')
print(trainer.evaluate())

Step,Training Loss


Total Training Time: 2.20 min


{'eval_loss': 0.7146758437156677, 'eval_accuracy': 0.5, 'eval_runtime': 3.5424, 'eval_samples_per_second': 2.823, 'eval_steps_per_second': 0.282, 'epoch': 3.0}


In [15]:
# Run the trainer's evaluation once more and print the returned metrics.
print(trainer.evaluate())

{'eval_loss': 0.7146758437156677, 'eval_accuracy': 0.5, 'eval_runtime': 3.9683, 'eval_samples_per_second': 2.52, 'eval_steps_per_second': 0.252, 'epoch': 3.0}


In [16]:
# Cross-check the Trainer-tuned model with the notebook's own accuracy helper.
model.eval()
model.to(DEVICE)

print(
    f'Test accuracy: {compute_accuracy(model, test_loader, DEVICE):.2f}%'
)

Test accuracy: 50.00%
