In [1]:
import pandas as pd
import numpy as np
import torch
import evaluate

from accelerate import Accelerator, notebook_launcher
from accelerate.utils import set_seed
from torch.utils.data import DataLoader
from torch.optim import AdamW
from tqdm.auto import tqdm
from sklearn.model_selection import train_test_split
from datasets import Dataset, DatasetDict
from transformers import AutoTokenizer, AutoModelForSequenceClassification, \
    TrainingArguments, Trainer, get_scheduler

np.random.seed(0) 
torch.manual_seed(0)

# device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

<torch._C.Generator at 0x7fbabe8cdfb0>

In [2]:
train = pd.read_csv("data/goodreads_train.csv")
train.head()

Unnamed: 0,user_id,book_id,review_id,rating,review_text,date_added,date_updated,read_at,started_at,n_votes,n_comments
0,8842281e1d1347389f2ab93d60773d4d,18245960,dfdbb7b0eb5a7e4c26d59a937e2e5feb,5,This is a special book. It started slow for ab...,Sun Jul 30 07:44:10 -0700 2017,Wed Aug 30 00:00:26 -0700 2017,Sat Aug 26 12:05:52 -0700 2017,Tue Aug 15 13:23:18 -0700 2017,28,1
1,8842281e1d1347389f2ab93d60773d4d,16981,a5d2c3628987712d0e05c4f90798eb67,3,Recommended by Don Katz. Avail for free in Dec...,Mon Dec 05 10:46:44 -0800 2016,Wed Mar 22 11:37:04 -0700 2017,,,1,0
2,8842281e1d1347389f2ab93d60773d4d,28684704,2ede853b14dc4583f96cf5d120af636f,3,"A fun, fast paced science fiction thriller. I ...",Tue Nov 15 11:29:22 -0800 2016,Mon Mar 20 23:40:27 -0700 2017,Sat Mar 18 23:22:42 -0700 2017,Fri Mar 17 23:45:40 -0700 2017,22,0
3,8842281e1d1347389f2ab93d60773d4d,27161156,ced5675e55cd9d38a524743f5c40996e,0,Recommended reading to understand what is goin...,Wed Nov 09 17:37:04 -0800 2016,Wed Nov 09 17:38:20 -0800 2016,,,5,1
4,8842281e1d1347389f2ab93d60773d4d,25884323,332732725863131279a8e345b63ac33e,4,"I really enjoyed this book, and there is a lot...",Mon Apr 25 09:31:23 -0700 2016,Mon Apr 25 09:31:23 -0700 2016,Sun Jun 26 00:00:00 -0700 2016,Sat May 28 00:00:00 -0700 2016,9,1


In [3]:
data = train.loc[:, ["review_text", "rating"]]

In [4]:
# create a validation set
train_df, valid_df = train_test_split(data, test_size=0.1, random_state=0)
print(train_df.shape, valid_df.shape)

(810000, 2) (90000, 2)


In [5]:
model_name = "distilbert-base-uncased"

def get_dataloaders(batch_size):
    train_ds = Dataset.from_pandas(train_df)
    valid_ds = Dataset.from_pandas(valid_df)
    
    dataset = DatasetDict()
    dataset["train"] = train_ds
    dataset["valid"] = valid_ds
    
    dataset = dataset.rename_columns({"rating": "label"})
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    tokenizer.save_pretrained("models/tokenizer/")
    
    def tokenize_func(dataset, col="review_text"):
        return tokenizer(dataset[col], padding="max_length", truncation=True)
    
    tokenized_datasets = dataset.map(tokenize_func, batched=True)
    
    tokenized_datasets = tokenized_datasets.remove_columns(
        ["review_text", "__index_level_0__"])
    tokenized_datasets = tokenized_datasets.rename_column("label", "labels")
    tokenized_datasets.set_format("torch")
    
    train_dl = DataLoader(tokenized_datasets["train"],
                      shuffle=True,
                      batch_size=batch_size)
    valid_dl = DataLoader(tokenized_datasets["valid"],
                          batch_size=batch_size)
    return train_dl, valid_dl

In [6]:
def get_model():
    return AutoModelForSequenceClassification.from_pretrained(
        model_name, num_labels=6)

# metric = evaluate.load("accuracy")
# model.to(device)

In [7]:
def training_loop(batch_size=32, lr=5e-5, num_epochs=3,
                  mixed_precision="fp16", seed=0):
    set_seed(seed)
    accelerator = Accelerator(mixed_precision=mixed_precision)
    model = get_model()
    train_dl, valid_dl = get_dataloaders(batch_size)
    
    num_training_steps = num_epochs * len(train_dl)
    optimizer = AdamW(model.parameters(), lr=lr)
    lr_scheduler = get_scheduler(
        name="linear",
        optimizer=optimizer,
        num_warmup_steps=0,
        num_training_steps=num_training_steps)
    
    model, optimizer, train_dl, valid_dl, lr_scheduler = accelerator.prepare(
        model, optimizer, train_dl, valid_dl, lr_scheduler)
    
    progress_bar = tqdm(range(num_training_steps))
    for epoch in range(num_epochs):
        model.train()
        for batch in train_dl:
            # batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)
            loss = outputs.loss
            # loss.backward()
            accelerator.backward(loss)

            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()
            progress_bar.update(1)

        model.eval()
        accurate = 0
        num_elems = 0
        for batch in valid_dl:
            #batch = {k: v.to(device) for k, v in batch.items()}
            with torch.no_grad():
                outputs = model(**batch)

            logits = outputs.logits
            predictions = torch.argmax(logits, dim=-1)
            accurate_preds = accelerator.gather(predictions) == accelerator.gather(batch["labels"])
            num_elems += accurate_preds.shape[0]
            accurate += accurate_preds.long().sum()
            
        accuracy = accurate.item() / num_elems
        accelerator.print(f"Epoch {epoch+1} accuracy: {100*accuracy:.2f}%")
        
    # save model
    accelerator.wait_for_everyone()
    
    model = accelerator.unwrap_model(model)
    model.save_pretrained(f"models/transformer_v2/")

In [8]:
num_gpus = torch.cuda.device_count()
num_gpus

8

In [9]:
args = (32, 5e-5, 3, "fp16", 0)
notebook_launcher(training_loop, args, num_processes=num_gpus)

Launching training on 8 GPUs.


Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_projector.weight', 'vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_transform.bias', 'vocab_projector.bias', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'pre_classifier.bias', 'pre_classi

  0%|          | 0/810 [00:00<?, ?ba/s]

  0%|          | 0/810 [00:00<?, ?ba/s]

  0%|          | 0/810 [00:00<?, ?ba/s]

  0%|          | 0/810 [00:00<?, ?ba/s]

  0%|          | 0/810 [00:00<?, ?ba/s]

  0%|          | 0/810 [00:00<?, ?ba/s]

  0%|          | 0/810 [00:00<?, ?ba/s]

  0%|          | 0/810 [00:00<?, ?ba/s]

  0%|          | 0/90 [00:00<?, ?ba/s]

  0%|          | 0/90 [00:00<?, ?ba/s]

  0%|          | 0/90 [00:00<?, ?ba/s]

  0%|          | 0/90 [00:00<?, ?ba/s]

  0%|          | 0/90 [00:00<?, ?ba/s]

  0%|          | 0/90 [00:00<?, ?ba/s]

  0%|          | 0/90 [00:00<?, ?ba/s]

  0%|          | 0/90 [00:00<?, ?ba/s]

  0%|          | 0/75939 [00:00<?, ?it/s]

  0%|          | 0/75939 [00:00<?, ?it/s]

  0%|          | 0/75939 [00:00<?, ?it/s]

  0%|          | 0/75939 [00:00<?, ?it/s]

  0%|          | 0/75939 [00:00<?, ?it/s]

  0%|          | 0/75939 [00:00<?, ?it/s]

  0%|          | 0/75939 [00:00<?, ?it/s]

  0%|          | 0/75939 [00:00<?, ?it/s]

Epoch 1 accuracy: 64.95%
Epoch 2 accuracy: 65.88%
Epoch 3 accuracy: 66.04%
