In [1]:
import pandas as pd
import numpy as np
import torch
import evaluate

from tqdm.auto import tqdm
from torch import nn
from torch.optim import AdamW
from torch.utils.data import Dataset, DataLoader
from torch.nn.functional import cross_entropy
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import train_test_split
from accelerate import Accelerator, notebook_launcher
from accelerate.utils import set_seed
from datasets import Dataset, DatasetDict
from transformers import AutoTokenizer, AutoModel, AutoModelForSequenceClassification, \
    TrainingArguments, Trainer, get_scheduler, PretrainedConfig, PreTrainedModel

np.random.seed(0) 
torch.manual_seed(0)

#device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

<torch._C.Generator at 0x7f0e40c45fb0>

In [2]:
train = pd.read_csv("data/goodreads_train.csv")
#train = train.sample(1000)
train.head()

Unnamed: 0,user_id,book_id,review_id,rating,review_text,date_added,date_updated,read_at,started_at,n_votes,n_comments
0,8842281e1d1347389f2ab93d60773d4d,18245960,dfdbb7b0eb5a7e4c26d59a937e2e5feb,5,This is a special book. It started slow for ab...,Sun Jul 30 07:44:10 -0700 2017,Wed Aug 30 00:00:26 -0700 2017,Sat Aug 26 12:05:52 -0700 2017,Tue Aug 15 13:23:18 -0700 2017,28,1
1,8842281e1d1347389f2ab93d60773d4d,16981,a5d2c3628987712d0e05c4f90798eb67,3,Recommended by Don Katz. Avail for free in Dec...,Mon Dec 05 10:46:44 -0800 2016,Wed Mar 22 11:37:04 -0700 2017,,,1,0
2,8842281e1d1347389f2ab93d60773d4d,28684704,2ede853b14dc4583f96cf5d120af636f,3,"A fun, fast paced science fiction thriller. I ...",Tue Nov 15 11:29:22 -0800 2016,Mon Mar 20 23:40:27 -0700 2017,Sat Mar 18 23:22:42 -0700 2017,Fri Mar 17 23:45:40 -0700 2017,22,0
3,8842281e1d1347389f2ab93d60773d4d,27161156,ced5675e55cd9d38a524743f5c40996e,0,Recommended reading to understand what is goin...,Wed Nov 09 17:37:04 -0800 2016,Wed Nov 09 17:38:20 -0800 2016,,,5,1
4,8842281e1d1347389f2ab93d60773d4d,25884323,332732725863131279a8e345b63ac33e,4,"I really enjoyed this book, and there is a lot...",Mon Apr 25 09:31:23 -0700 2016,Mon Apr 25 09:31:23 -0700 2016,Sun Jun 26 00:00:00 -0700 2016,Sat May 28 00:00:00 -0700 2016,9,1


In [3]:
# the votes and comments are not reliable 
train.loc[train["book_id"]==18245960, ].head()

Unnamed: 0,user_id,book_id,review_id,rating,review_text,date_added,date_updated,read_at,started_at,n_votes,n_comments
0,8842281e1d1347389f2ab93d60773d4d,18245960,dfdbb7b0eb5a7e4c26d59a937e2e5feb,5,This is a special book. It started slow for ab...,Sun Jul 30 07:44:10 -0700 2017,Wed Aug 30 00:00:26 -0700 2017,Sat Aug 26 12:05:52 -0700 2017,Tue Aug 15 13:23:18 -0700 2017,28,1
52556,1a2398eca437fed5d9add310a0c09611,18245960,b88eb6519a046159a31afcc21a448b6f,3,Average between the 4 star concepts (overall p...,Tue Oct 13 17:04:44 -0700 2015,Wed Oct 21 09:41:37 -0700 2015,Wed Oct 21 09:41:37 -0700 2015,Tue Oct 13 00:00:00 -0700 2015,0,0
69854,ab19fd8bcc1021d1c0f42dfa8d47f1e3,18245960,5ea837edf3b1386c613a123f46e1cccd,5,A really refreshing take on science fiction an...,Thu Aug 27 00:37:34 -0700 2015,Mon May 22 10:23:46 -0700 2017,Fri Jan 13 00:00:00 -0800 2017,Wed Jan 04 00:00:00 -0800 2017,8,0
71434,e956ca90ba23174bbbccf1161ab19150,18245960,cefa3a787e1a845fb48acad37ff528de,1,"I'm really waffling between whether to rate ""d...",Tue Sep 23 10:40:04 -0700 2014,Fri Jul 10 13:42:47 -0700 2015,Fri Jul 10 16:04:46 -0700 2015,Mon Jul 06 00:00:00 -0700 2015,28,4
104146,da7a0c5ee0c89973224d8853445be68e,18245960,86a7c0571fd59e729cf9e2340dda1224,5,Every time I read a good science fiction novel...,Sun Jul 24 22:51:11 -0700 2016,Sat Jul 30 22:12:15 -0700 2016,Fri Jul 29 00:00:00 -0700 2016,Sun Jul 24 00:00:00 -0700 2016,1,0


In [4]:
data = train.loc[:, ["user_id", "book_id", "review_text", "rating"]]
data.head()

Unnamed: 0,user_id,book_id,review_text,rating
0,8842281e1d1347389f2ab93d60773d4d,18245960,This is a special book. It started slow for ab...,5
1,8842281e1d1347389f2ab93d60773d4d,16981,Recommended by Don Katz. Avail for free in Dec...,3
2,8842281e1d1347389f2ab93d60773d4d,28684704,"A fun, fast paced science fiction thriller. I ...",3
3,8842281e1d1347389f2ab93d60773d4d,27161156,Recommended reading to understand what is goin...,0
4,8842281e1d1347389f2ab93d60773d4d,25884323,"I really enjoyed this book, and there is a lot...",4


In [5]:
# encode user and book ids
enc = OrdinalEncoder(handle_unknown="use_encoded_value",
                                   unknown_value=-1)
encodings = enc.fit_transform(data[["user_id", "book_id"]])

# add 1 to X_train, so unknowns would be 0
encodings = encodings + 1

encodings[:5, :]

array([[ 6442., 15889.],
       [ 6442.,   712.],
       [ 6442., 23331.],
       [ 6442., 22616.],
       [ 6442., 21992.]])

In [6]:
data["user_id"] = encodings[:, 0].astype(int)
data["book_id"] = encodings[:, 1].astype(int)
data.head()

Unnamed: 0,user_id,book_id,review_text,rating
0,6442,15889,This is a special book. It started slow for ab...,5
1,6442,712,Recommended by Don Katz. Avail for free in Dec...,3
2,6442,23331,"A fun, fast paced science fiction thriller. I ...",3
3,6442,22616,Recommended reading to understand what is goin...,0
4,6442,21992,"I really enjoyed this book, and there is a lot...",4


In [7]:
n_users, n_books = data["user_id"].max() + 1, data["book_id"].max() + 1
print(n_users, n_books)

12189 25475


In [8]:
# create a validation set
train_df, valid_df = train_test_split(data, test_size=0.1, random_state=0)
print(train_df.shape, valid_df.shape)

(810000, 4) (90000, 4)


In [9]:
transformer_model_name = "distilbert-base-uncased"

In [10]:
def get_dataloaders(batch_size):
    train_ds = Dataset.from_pandas(train_df)
    valid_ds = Dataset.from_pandas(valid_df)
    
    dataset = DatasetDict()
    dataset["train"] = train_ds
    dataset["valid"] = valid_ds
    
    tokenizer = AutoTokenizer.from_pretrained(transformer_model_name)
    tokenizer.save_pretrained("models/tokenizer/")
    
    def tokenize_func(dataset, col="review_text"):
        return tokenizer(dataset[col], padding="max_length", truncation=True)
    
    tokenized_datasets = dataset.map(tokenize_func, batched=True)
    
    tokenized_datasets = tokenized_datasets.remove_columns(
        ["review_text", "__index_level_0__"])
    tokenized_datasets = tokenized_datasets.rename_column("rating", "labels")
    tokenized_datasets.set_format("torch")
    
    train_dl = DataLoader(tokenized_datasets["train"],
                      shuffle=True,
                      batch_size=batch_size)
    valid_dl = DataLoader(tokenized_datasets["valid"],
                          batch_size=batch_size)
    return train_dl, valid_dl

In [11]:
class CombinedConfig(PretrainedConfig):
    def __init__(self, **kwargs):
        super().__init__(**kwargs)

In [12]:
class CombinedModel(PreTrainedModel):
    def __init__(self, config, n_users, n_books, transformer_model_name,
                 n_factors=100, n_classes=6):
        super().__init__(config)
        self.user_embs = nn.Embedding(n_users, n_factors)
        self.book_embs = nn.Embedding(n_books, n_factors)
        self.text_transformer = AutoModelForSequenceClassification.from_pretrained(
            transformer_model_name, num_labels=512)
        self.linear_layers = nn.Sequential(
            nn.Linear(n_factors*2+512, 256, bias=True),
            nn.LeakyReLU(),
            nn.Linear(256, n_classes)
        )
        
    def forward(self, user_id=None, book_id=None,
                input_ids=None, attention_mask=None, labels=None):
        # first col: users, second col: books
        x_users = self.user_embs(user_id)
        x_books = self.book_embs(book_id)
        x_transformer = self.text_transformer(input_ids=input_ids,
                                              attention_mask=attention_mask).logits
        x = torch.cat([x_users, x_books, x_transformer], dim=-1)
        logits =  self.linear_layers(x)
        
        if labels is not None:
            loss = cross_entropy(logits, labels)
            return {"loss": loss, "logits": logits}
        
        return {"logits": logits}

In [13]:
def get_model():
    config = CombinedConfig()
    model = CombinedModel(config, n_users, n_books, transformer_model_name)
    return model

In [14]:
def training_loop(batch_size=32, lr=5e-5, num_epochs=3,
                  mixed_precision="fp16", seed=0):
    set_seed(seed)
    accelerator = Accelerator(mixed_precision=mixed_precision)
    model = get_model()
    #model.to(device)
    train_dl, valid_dl = get_dataloaders(batch_size)
    
    num_training_steps = num_epochs * len(train_dl)
    optimizer = AdamW(model.parameters(), lr=lr)
    lr_scheduler = get_scheduler(
        name="linear",
        optimizer=optimizer,
        num_warmup_steps=0,
        num_training_steps=num_training_steps)
    
    model, optimizer, train_dl, valid_dl, lr_scheduler = accelerator.prepare(
        model, optimizer, train_dl, valid_dl, lr_scheduler)
    
    progress_bar = tqdm(range(num_training_steps))
    for epoch in range(num_epochs):
        model.train()
        for batch in train_dl:
            #batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)
            loss = outputs["loss"]
            #loss.backward()
            accelerator.backward(loss)

            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()
            progress_bar.update(1)

        model.eval()
        accurate = 0
        num_elems = 0
        for batch in valid_dl:
            #batch = {k: v.to(device) for k, v in batch.items()}
            with torch.no_grad():
                outputs = model(**batch)

            logits = outputs["logits"]
            predictions = torch.argmax(logits, dim=-1)
            accurate_preds = accelerator.gather(predictions) == accelerator.gather(batch["labels"])
            #accurate_preds = predictions == batch["labels"]
            num_elems += accurate_preds.shape[0]
            accurate += accurate_preds.long().sum()
            
        accuracy = accurate.item() / num_elems
        accelerator.print(f"Epoch {epoch+1} accuracy: {100*accuracy:.2f}%")
        #print(f"Epoch {epoch+1} accuracy: {100*accuracy:.2f}%")
        
    # save model
    accelerator.wait_for_everyone()
    model = accelerator.unwrap_model(model)
    model.save_pretrained(f"models/combined_v2/")

In [15]:
#training_loop(batch_size=8, num_epochs=3)

In [16]:
num_gpus = torch.cuda.device_count()
num_gpus

8

In [17]:
args = (32, 5e-5, 5, "fp16", 0)
notebook_launcher(training_loop, args, num_processes=num_gpus)

Launching training on 8 GPUs.


Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_projector.weight', 'vocab_transform.weight', 'vocab_layer_norm.weight', 'vocab_transform.bias', 'vocab_layer_norm.bias', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_projector.weight', 'vocab_transform.weight', 'vocab_layer_norm.weigh

  0%|          | 0/810 [00:00<?, ?ba/s]

  0%|          | 0/810 [00:00<?, ?ba/s]

  0%|          | 0/810 [00:00<?, ?ba/s]

  0%|          | 0/810 [00:00<?, ?ba/s]

  0%|          | 0/810 [00:00<?, ?ba/s]

  0%|          | 0/810 [00:00<?, ?ba/s]

  0%|          | 0/810 [00:00<?, ?ba/s]

  0%|          | 0/810 [00:00<?, ?ba/s]

  0%|          | 0/90 [00:00<?, ?ba/s]

  0%|          | 0/90 [00:00<?, ?ba/s]

  0%|          | 0/90 [00:00<?, ?ba/s]

  0%|          | 0/90 [00:00<?, ?ba/s]

  0%|          | 0/90 [00:00<?, ?ba/s]

  0%|          | 0/90 [00:00<?, ?ba/s]

  0%|          | 0/90 [00:00<?, ?ba/s]

  0%|          | 0/90 [00:00<?, ?ba/s]

  0%|          | 0/126565 [00:00<?, ?it/s]

  0%|          | 0/126565 [00:00<?, ?it/s]

  0%|          | 0/126565 [00:00<?, ?it/s]

  0%|          | 0/126565 [00:00<?, ?it/s]

  0%|          | 0/126565 [00:00<?, ?it/s]

  0%|          | 0/126565 [00:00<?, ?it/s]

  0%|          | 0/126565 [00:00<?, ?it/s]

  0%|          | 0/126565 [00:00<?, ?it/s]

Epoch 1 accuracy: 64.50%
Epoch 2 accuracy: 65.54%
Epoch 3 accuracy: 65.86%
Epoch 4 accuracy: 65.71%
Epoch 5 accuracy: 65.55%
