In [1]:
import pandas as pd
import numpy as np
import torch
import evaluate

from tqdm.auto import tqdm
from torch import nn
from torch.optim import AdamW
from torch.utils.data import Dataset, DataLoader
from torch.nn.functional import cross_entropy
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import train_test_split
from accelerate import Accelerator, notebook_launcher
from accelerate.utils import set_seed
from datasets import Dataset, DatasetDict
from transformers import AutoTokenizer, AutoModel, AutoModelForSequenceClassification, \
    TrainingArguments, Trainer, get_scheduler, PretrainedConfig, PreTrainedModel

# Pin the global NumPy and PyTorch random number generators to a fixed seed.
np.random.seed(0)
torch.manual_seed(0)

# Use the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [2]:
# Load the Goodreads training reviews and keep a 1,000-row random working sample.
# random_state pins the draw: without it the sample depends on whatever state the
# global NumPy RNG is in, so re-running this cell on its own picks different rows.
train = pd.read_csv("data/goodreads_train.csv").sample(1000, random_state=0)
train.head()

Unnamed: 0,user_id,book_id,review_id,rating,review_text,date_added,date_updated,read_at,started_at,n_votes,n_comments
772215,e21627c07b1c16a64b1d55afb0801cd3,119324,003879c133a50fcf89372f653996629a,5,11/11/17 - 5/5 \n This next installment wasn't...,Thu May 28 06:25:19 -0700 2015,Mon Sep 11 03:06:56 -0700 2017,Mon Sep 11 03:06:56 -0700 2017,Mon Sep 11 00:00:00 -0700 2017,0,0
292046,6db54aa53cc7fe06339e1f381eb0b067,23126227,7187b3590d12c35d5d7b44c55ad67f1b,5,Totally enjoyed reading this. Owen and Nathan ...,Tue Jun 09 20:59:44 -0700 2015,Tue Jun 09 21:01:36 -0700 2015,,,1,0
814437,1df102d3aa4d7e724ed2af9cab0e86c2,18190208,53dd1cf2d2bb0e64a08fdc2be8643bea,3,"THE WITCH HUNTER was a quick, entertaining rea...",Sat Sep 27 11:35:31 -0700 2014,Sun Nov 08 08:37:58 -0800 2015,Wed Oct 07 00:00:00 -0700 2015,,7,2
381608,1558bfbf21cdfee8a8064a5abd3c9ae8,4671,730990b75ff5c3f76b81d58583b7cc88,5,I listened to The Great Gatsby on audio. I had...,Thu Aug 18 14:08:20 -0700 2011,Wed Oct 02 09:19:37 -0700 2013,Mon Aug 01 00:00:00 -0700 2011,,0,0
172519,2cba560652c041e9a14b7e25a92d6278,44538,af0143db98f00e3da41073c3c93ea2aa,5,Read if you like Jane Austen with more sex and...,Tue Apr 15 18:04:26 -0700 2008,Wed Mar 25 07:28:59 -0700 2015,Tue Apr 22 00:00:00 -0700 2008,,2,1


In [3]:
# The vote and comment counts are not reliable; look at the sampled rows for one book_id.
train[train["book_id"] == 18245960].head()

Unnamed: 0,user_id,book_id,review_id,rating,review_text,date_added,date_updated,read_at,started_at,n_votes,n_comments


In [4]:
# Keep only what the model consumes: the two ids, the review text and the rating label.
data = train.loc[
    :,
    ["user_id", "book_id", "review_text", "rating"],
]
data.head()

Unnamed: 0,user_id,book_id,review_text,rating
772215,e21627c07b1c16a64b1d55afb0801cd3,119324,11/11/17 - 5/5 \n This next installment wasn't...,5
292046,6db54aa53cc7fe06339e1f381eb0b067,23126227,Totally enjoyed reading this. Owen and Nathan ...,5
814437,1df102d3aa4d7e724ed2af9cab0e86c2,18190208,"THE WITCH HUNTER was a quick, entertaining rea...",3
381608,1558bfbf21cdfee8a8064a5abd3c9ae8,4671,I listened to The Great Gatsby on audio. I had...,5
172519,2cba560652c041e9a14b7e25a92d6278,44538,Read if you like Jane Austen with more sex and...,5


In [5]:
# Turn the raw user and book identifiers into ordinal codes. The encoder is
# configured to emit -1 for any identifier it was not fitted on.
enc = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)

# Shift every code up by one so the "unknown" code (-1) lands on 0 and the
# fitted identifiers start at 1.
encodings = enc.fit_transform(data[["user_id", "book_id"]]) + 1

encodings[:5]

array([[787., 123.],
       [370., 768.],
       [ 99., 634.],
       [ 73.,  18.],
       [162.,  77.]])

In [6]:
# Replace the raw id columns with their integer codes
# (encodings column 0 holds the user codes, column 1 the book codes).
data = data.assign(
    user_id=encodings[:, 0].astype(int),
    book_id=encodings[:, 1].astype(int),
)
data.head()

Unnamed: 0,user_id,book_id,review_text,rating
772215,787,123,11/11/17 - 5/5 \n This next installment wasn't...,5
292046,370,768,Totally enjoyed reading this. Owen and Nathan ...,5
814437,99,634,"THE WITCH HUNTER was a quick, entertaining rea...",3
381608,73,18,I listened to The Great Gatsby on audio. I had...,5
172519,162,77,Read if you like Jane Austen with more sex and...,5


In [7]:
# Embedding table sizes: codes run from 0 up to the largest code, hence max + 1 rows.
n_users = data["user_id"].max() + 1
n_books = data["book_id"].max() + 1
print(n_users, n_books)

902 927


In [8]:
# Hold out 10% of the rows for validation; the fixed random_state keeps the split repeatable.
train_df, valid_df = train_test_split(
    data,
    test_size=0.1,
    random_state=0,
)
print(train_df.shape, valid_df.shape)

(900, 4) (100, 4)


In [9]:
# Hugging Face checkpoint name; used to load both the tokenizer and the text model.
transformer_model_name = "distilbert-base-cased"

In [10]:
def get_dataloaders(batch_size):
    """Tokenize the train/validation frames and wrap them in PyTorch DataLoaders.

    Reads the notebook-level ``train_df`` / ``valid_df`` frames and the
    ``transformer_model_name`` checkpoint name. As a side effect, the tokenizer
    is written to ``models/tokenizer/``.

    Args:
        batch_size: Number of examples per batch for both loaders.

    Returns:
        A ``(train_dl, valid_dl)`` tuple. Batches are dicts of tensors holding
        the remaining frame columns (``user_id``, ``book_id``, and ``rating``
        renamed to ``labels``) plus the tokenizer outputs. Only the training
        loader shuffles.
    """
    # NOTE: ``Dataset`` must be ``datasets.Dataset`` here; the notebook's import
    # cell imports it after ``torch.utils.data.Dataset``, so that one wins.
    #
    # preserve_index=False keeps the pandas index out of the dataset. That way
    # no "__index_level_0__" column has to be stripped afterwards, and the
    # function also works for frames that carry a default RangeIndex (for which
    # that column is never created and removing it would fail).
    dataset = DatasetDict({
        "train": Dataset.from_pandas(train_df, preserve_index=False),
        "valid": Dataset.from_pandas(valid_df, preserve_index=False),
    })

    tokenizer = AutoTokenizer.from_pretrained(transformer_model_name)
    tokenizer.save_pretrained("models/tokenizer/")

    def tokenize_func(examples, col="review_text"):
        # Pad every review to a fixed maximum length and truncate longer ones,
        # so all rows end up with equally sized token tensors.
        return tokenizer(examples[col], padding="max_length", truncation=True)

    tokenized_datasets = dataset.map(tokenize_func, batched=True)

    # Drop the raw text (it cannot be turned into a tensor) and expose the
    # rating under the "labels" key that the model's forward() expects.
    tokenized_datasets = tokenized_datasets.remove_columns(["review_text"])
    tokenized_datasets = tokenized_datasets.rename_column("rating", "labels")
    tokenized_datasets.set_format("torch")

    train_dl = DataLoader(tokenized_datasets["train"],
                          shuffle=True,
                          batch_size=batch_size)
    valid_dl = DataLoader(tokenized_datasets["valid"],
                          batch_size=batch_size)
    return train_dl, valid_dl

In [11]:
# Build notebook-level loaders with 8 examples per batch for the inspection cells below.
train_dl, valid_dl = get_dataloaders(batch_size=8)

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [12]:
# Pull a single batch from the training loader to inspect its keys and tensors.
batch = next(iter(train_dl))
batch

{'user_id': tensor([ 76, 822, 304, 451, 458, 161, 716, 196]),
 'book_id': tensor([731, 119, 271, 273, 809, 686, 233, 383]),
 'labels': tensor([5, 4, 5, 3, 5, 4, 4, 5]),
 'input_ids': tensor([[  101, 13432,  1158,  ...,     0,     0,     0],
         [  101,  6355,  1330,  ...,     0,     0,     0],
         [  101,   125,   119,  ...,     0,     0,     0],
         ...,
         [  101,  3274,  5016,  ...,  2147,  1114,   102],
         [  101,   146,  8050,  ...,     0,     0,     0],
         [  101,  2131,  1105,  ...,     0,     0,     0]]),
 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         ...,
         [1, 1, 1,  ..., 1, 1, 1],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0]])}

In [13]:
class CombinedConfig(PretrainedConfig):
    """Configuration object passed to ``CombinedModel``.

    Defines no fields of its own: every keyword argument is forwarded
    unchanged to ``PretrainedConfig``.
    """
    def __init__(self, **kwargs):
        super().__init__(**kwargs)

In [14]:
class CombinedModel(PreTrainedModel):
    """Rating classifier combining user/book embeddings with a text transformer.

    The user embedding, the book embedding and the transformer's output vector
    for the review text are concatenated and fed through a small feed-forward
    head that produces ``n_classes`` logits.

    NOTE(review): ``n_users``, ``n_books`` and ``transformer_model_name`` are
    not stored on the config, so they presumably still have to be supplied
    again when reloading a checkpoint with ``from_pretrained`` — confirm.
    """

    # ``from_pretrained`` uses this attribute to load the saved config when the
    # caller does not pass one. PreTrainedModel leaves it unset, which makes a
    # checkpoint written by ``save_pretrained`` impossible to reload that way.
    config_class = CombinedConfig

    def __init__(self, config, n_users, n_books, transformer_model_name,
                 n_factors=50, n_classes=6):
        """Build the embedding tables, the text transformer and the head.

        Args:
            config: Configuration object handed to ``PreTrainedModel``.
            n_users: Number of rows in the user embedding table.
            n_books: Number of rows in the book embedding table.
            transformer_model_name: Checkpoint name/path for the text model.
            n_factors: Width of each user/book embedding vector.
            n_classes: Number of output logits (defaults to 6 — presumably
                ratings 0 to 5; confirm against the data).
        """
        super().__init__(config)
        text_features = 512  # width of the text model's output vector
        hidden_units = 256   # width of the hidden layer in the head

        self.user_embs = nn.Embedding(n_users, n_factors)
        self.book_embs = nn.Embedding(n_books, n_factors)
        # The sequence-classification model is used as a feature extractor:
        # asking for ``text_features`` labels makes its "logits" a feature vector.
        self.text_transformer = AutoModelForSequenceClassification.from_pretrained(
            transformer_model_name, num_labels=text_features)
        self.linear_layers = nn.Sequential(
            nn.Linear(n_factors * 2 + text_features, hidden_units, bias=False),
            nn.LeakyReLU(),
            nn.BatchNorm1d(hidden_units),
            nn.Dropout(),
            nn.Linear(hidden_units, n_classes)
        )

    def forward(self, user_id=None, book_id=None,
                input_ids=None, attention_mask=None, labels=None):
        """Compute rating logits and, when labels are given, the loss.

        Args:
            user_id: Integer tensor of user codes (indices into ``user_embs``).
            book_id: Integer tensor of book codes (indices into ``book_embs``).
            input_ids: Token ids of the review text.
            attention_mask: Attention mask matching ``input_ids``.
            labels: Optional tensor of target class indices.

        Returns:
            ``{"logits": ...}``, plus a ``"loss"`` entry (cross-entropy between
            the logits and ``labels``) when ``labels`` is provided.
        """
        # Users and books arrive as separate id tensors, one lookup table each.
        x_users = self.user_embs(user_id)
        x_books = self.book_embs(book_id)
        x_transformer = self.text_transformer(input_ids=input_ids,
                                              attention_mask=attention_mask).logits
        # Join the three feature vectors along the feature axis.
        x = torch.cat([x_users, x_books, x_transformer], dim=-1)
        logits = self.linear_layers(x)

        if labels is not None:
            loss = cross_entropy(logits, labels)
            return {"loss": loss, "logits": logits}

        return {"logits": logits}

In [15]:
def get_model():
    """Return a new CombinedModel built from the notebook-level settings.

    Uses a default ``CombinedConfig`` together with the global ``n_users``,
    ``n_books`` and ``transformer_model_name`` values.
    """
    return CombinedModel(
        CombinedConfig(),
        n_users,
        n_books,
        transformer_model_name,
    )

model = get_model()
model

Some weights of the model checkpoint at distilbert-base-cased were not used when initializing DistilBertForSequenceClassification: ['vocab_layer_norm.bias', 'vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_transform.bias', 'vocab_projector.weight', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-cased and are newly initialized: ['pre_classifier.weight', 'classifier.bias', 'pre_classifier

CombinedModel(
  (user_embs): Embedding(902, 50)
  (book_embs): Embedding(927, 50)
  (text_transformer): DistilBertForSequenceClassification(
    (distilbert): DistilBertModel(
      (embeddings): Embeddings(
        (word_embeddings): Embedding(28996, 768, padding_idx=0)
        (position_embeddings): Embedding(512, 768)
        (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (transformer): Transformer(
        (layer): ModuleList(
          (0): TransformerBlock(
            (attention): MultiHeadSelfAttention(
              (dropout): Dropout(p=0.1, inplace=False)
              (q_lin): Linear(in_features=768, out_features=768, bias=True)
              (k_lin): Linear(in_features=768, out_features=768, bias=True)
              (v_lin): Linear(in_features=768, out_features=768, bias=True)
              (out_lin): Linear(in_features=768, out_features=768, bias=True)
            )
            (sa_layer_no

In [16]:
# Smoke-test the forward pass on the sample batch; because the batch carries
# "labels", the returned dict contains the loss as well as the logits.
model(**batch)

{'loss': tensor(2.2753, grad_fn=<NllLossBackward0>),
 'logits': tensor([[ 0.7709, -1.1219,  0.0567,  0.1525,  1.1508, -0.4862],
         [-0.1105, -0.0502,  1.0535,  1.4727,  0.4887,  0.3666],
         [ 0.1450,  1.2135,  0.6036,  0.1206,  0.6247,  1.4563],
         [ 1.0750,  0.2399, -0.5787,  0.1669,  0.3621,  1.1503],
         [-0.4572,  0.9289,  1.2784,  0.5159, -0.4233, -0.1340],
         [ 0.1771,  0.0514,  0.3022, -0.6492, -0.5052, -1.5077],
         [ 0.6737, -0.1828, -0.2362,  0.7395, -0.5491,  0.8389],
         [ 0.2444, -0.5964,  0.6709, -0.6026, -1.2856, -1.4115]],
        grad_fn=<AddmmBackward0>)}

In [17]:
def training_loop(batch_size=8, lr=5e-5, num_epochs=3,
                  mixed_precision="fp16", seed=0):
    """Train a fresh CombinedModel and print validation accuracy after each epoch.

    Builds the model with ``get_model()`` and the loaders with
    ``get_dataloaders()``, optimizes with AdamW under a linear learning-rate
    decay without warmup, and finally saves the model to
    ``models/combined_v1/``.

    Args:
        batch_size: Examples per batch for both loaders.
        lr: Initial learning rate for AdamW.
        num_epochs: Number of passes over the training loader.
        mixed_precision: Currently unused. The Accelerate integration this
            argument was meant for is disabled, so training runs on the single
            notebook-level ``device``. Kept so existing calls keep working.
        seed: Seed applied before the model is built.

    Returns:
        None. Accuracy is reported via ``print``; the model is saved to disk.
    """
    # Apply the seed before creating the model so the run honours the argument.
    set_seed(seed)

    model = get_model()
    model.to(device)
    train_dl, valid_dl = get_dataloaders(batch_size)

    num_training_steps = num_epochs * len(train_dl)
    optimizer = AdamW(model.parameters(), lr=lr)
    lr_scheduler = get_scheduler(
        name="linear",
        optimizer=optimizer,
        num_warmup_steps=0,
        num_training_steps=num_training_steps)

    # The context manager closes the progress bar even if training is interrupted.
    with tqdm(range(num_training_steps)) as progress_bar:
        for epoch in range(num_epochs):
            model.train()
            for batch in train_dl:
                batch = {k: v.to(device) for k, v in batch.items()}
                loss = model(**batch)["loss"]
                loss.backward()

                optimizer.step()
                lr_scheduler.step()
                optimizer.zero_grad()
                progress_bar.update(1)

            model.eval()
            # Plain Python counters: they stay valid even if the validation
            # loader yields no batches at all.
            accurate = 0
            num_elems = 0
            with torch.no_grad():
                for batch in valid_dl:
                    batch = {k: v.to(device) for k, v in batch.items()}
                    logits = model(**batch)["logits"]
                    predictions = torch.argmax(logits, dim=-1)
                    accurate_preds = predictions == batch["labels"]
                    num_elems += accurate_preds.shape[0]
                    accurate += accurate_preds.sum().item()

            # Report NaN instead of dividing by zero on an empty validation set.
            accuracy = accurate / num_elems if num_elems else float("nan")
            print(f"Epoch {epoch+1} accuracy: {100*accuracy:.2f}%")

    # save model
    model.save_pretrained("models/combined_v1/")

In [18]:
# NOTE(review): the saved output below reports 10 epochs (1130 steps), which does
# not match num_epochs=3 — it appears to come from an earlier run; re-run to refresh.
training_loop(batch_size=8, num_epochs=3)

Some weights of the model checkpoint at distilbert-base-cased were not used when initializing DistilBertForSequenceClassification: ['vocab_layer_norm.bias', 'vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_transform.bias', 'vocab_projector.weight', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-cased and are newly initialized: ['pre_classifier.weight', 'classifier.bias', 'pre_classifier

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1130 [00:00<?, ?it/s]

Epoch 1 accuracy: 24.00%
Epoch 2 accuracy: 22.00%
Epoch 3 accuracy: 22.00%
Epoch 4 accuracy: 21.00%
Epoch 5 accuracy: 20.00%
Epoch 6 accuracy: 22.00%
Epoch 7 accuracy: 21.00%
Epoch 8 accuracy: 21.00%
Epoch 9 accuracy: 21.00%
Epoch 10 accuracy: 24.00%


In [19]:
# Disabled multi-GPU setup: count the visible CUDA devices (used as num_processes below).
# num_gpus = torch.cuda.device_count()
# num_gpus

In [20]:
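# Disabled multi-GPU launch via Accelerate. The tuple is positional for training_loop:
# (batch_size, lr, num_epochs, mixed_precision, seed). training_loop does not currently
# create an Accelerator, so Accelerate support must be restored before launching this way.
# args = (32, 5e-5, 3, "fp16", 0)
# notebook_launcher(training_loop, args, num_processes=num_gpus)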