In [None]:
import pandas as pd
import numpy as np
import torch
import evaluate

from tqdm.auto import tqdm
from torch import nn
from torch.optim import AdamW
from torch.utils.data import Dataset, DataLoader
from torch.nn.functional import cross_entropy
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import train_test_split
from accelerate import Accelerator, notebook_launcher
from accelerate.utils import set_seed
from datasets import Dataset, DatasetDict
from transformers import AutoTokenizer, AutoModel, AutoModelForSequenceClassification, \
    TrainingArguments, Trainer, get_scheduler, PretrainedConfig, PreTrainedModel

# Seed the global NumPy and PyTorch RNGs with one shared value so runs repeat.
# No global `device` is defined: the manual device code in training_loop is
# commented out in favour of Accelerator.
SEED = 0
np.random.seed(SEED)
torch.manual_seed(SEED)

In [None]:
# Load the Goodreads reviews and keep a 1,000-row subsample so the notebook
# iterates quickly. Reading and sampling in a single statement with a fixed
# random_state keeps this cell idempotent: re-running it yields the same rows
# instead of re-sampling an already-sampled frame from a moving global RNG.
train = pd.read_csv("data/goodreads_train.csv").sample(1000, random_state=0)
train.head()

In [None]:
# The votes and comments are not reliable; look at the rows of one book.
book_mask = train["book_id"] == 18245960
train[book_mask].head()

In [None]:
# Keep only the two identifiers, the review text, and the rating.
MODEL_COLUMNS = ["user_id", "book_id", "review_text", "rating"]
data = train.loc[:, MODEL_COLUMNS]
data.head()

In [None]:
# Map raw user and book ids to integer codes.
ID_COLUMNS = ["user_id", "book_id"]
enc = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)
enc.fit(data[ID_COLUMNS])

# Shift every code up by one: an id unseen at fit time (encoded as -1) lands
# on 0, and known ids start at 1.
encodings = enc.transform(data[ID_COLUMNS]) + 1

encodings[:5, :]

In [None]:
# Swap the raw ids for their integer codes (column 0: users, column 1: books).
user_codes, book_codes = encodings[:, 0], encodings[:, 1]
data = data.assign(user_id=user_codes.astype(int),
                   book_id=book_codes.astype(int))
data.head()

In [None]:
# Embedding-table sizes: codes are used as row indices, so each table needs
# (largest code + 1) rows.
n_users = data["user_id"].max() + 1
n_books = data["book_id"].max() + 1
print(n_users, n_books)

In [None]:
# Hold out 10% of the rows as a validation set; the fixed random_state makes
# the split repeatable.
train_df, valid_df = train_test_split(data, random_state=0, test_size=0.1)
print(train_df.shape, valid_df.shape)

In [None]:
# Hugging Face checkpoint name; loaded by the tokenizer in get_dataloaders and
# passed to CombinedModel for its text encoder.
transformer_model_name = "distilbert-base-cased"

In [None]:
def get_dataloaders(batch_size):
    """Tokenize the train/validation frames and wrap them in DataLoaders.

    Reads the module-level ``train_df``, ``valid_df`` and
    ``transformer_model_name``. As a side effect the tokenizer is saved to
    ``models/tokenizer/``.

    Args:
        batch_size: Number of examples per batch for both loaders.

    Returns:
        ``(train_dl, valid_dl)``. Only the training loader shuffles. Each
        batch is a dict of torch tensors holding ``user_id``, ``book_id``,
        ``labels`` (the renamed ``rating`` column) and the tokenizer outputs.
    """
    # preserve_index=False keeps the pandas index out of the dataset, so no
    # "__index_level_0__" column has to be stripped afterwards. Removing that
    # column by name fails when the frame carries a default RangeIndex,
    # because the column is then never created.
    dataset = DatasetDict({
        "train": Dataset.from_pandas(train_df, preserve_index=False),
        "valid": Dataset.from_pandas(valid_df, preserve_index=False),
    })

    tokenizer = AutoTokenizer.from_pretrained(transformer_model_name)
    tokenizer.save_pretrained("models/tokenizer/")

    def tokenize_func(dataset, col="review_text"):
        # Pad every review to the tokenizer's maximum length and truncate
        # longer ones, so all examples share one sequence length.
        return tokenizer(dataset[col], padding="max_length", truncation=True)

    tokenized_datasets = dataset.map(tokenize_func, batched=True)

    # Drop the raw text and rename the target to the name the model's
    # forward() expects.
    tokenized_datasets = tokenized_datasets.remove_columns(["review_text"])
    tokenized_datasets = tokenized_datasets.rename_column("rating", "labels")
    tokenized_datasets.set_format("torch")

    train_dl = DataLoader(tokenized_datasets["train"],
                          shuffle=True,
                          batch_size=batch_size)
    valid_dl = DataLoader(tokenized_datasets["valid"],
                          batch_size=batch_size)
    return train_dl, valid_dl

In [None]:
# Small loaders for interactive inspection.
train_dl, valid_dl = get_dataloaders(batch_size=8)

In [None]:
# Pull one batch from the training loader and display it as a sanity check.
batch = next(iter(train_dl))
batch

In [None]:
class CombinedConfig(PretrainedConfig):
    """Configuration object for ``CombinedModel``.

    Adds no fields of its own; construction is inherited unchanged from
    ``PretrainedConfig``, so any keyword arguments are handled by the parent.
    """

In [None]:
class CombinedModel(PreTrainedModel):
    """Rating classifier combining user/book embeddings with a text transformer.

    User and book ids are looked up in two embedding tables, the review text
    is passed through a sequence-classification transformer whose output width
    is set to 512, and the three vectors are concatenated and fed to a small
    MLP head that produces ``n_classes`` logits.

    Args:
        config: A ``PretrainedConfig`` instance (``CombinedConfig`` here).
        n_users: Number of rows in the user embedding table.
        n_books: Number of rows in the book embedding table.
        transformer_model_name: Checkpoint name or path for the text model.
        n_factors: Width of each id embedding.
        n_classes: Number of output classes.
    """

    # Tells ``from_pretrained`` which config class to load when the caller
    # does not pass ``config=`` explicitly.
    config_class = CombinedConfig

    def __init__(self, config, n_users, n_books, transformer_model_name,
                 n_factors=50, n_classes=6):
        super().__init__(config)
        # Width of the text representation; it has to match the first linear
        # layer's input size below, so it is defined once.
        text_features = 512
        self.user_embs = nn.Embedding(n_users, n_factors)
        self.book_embs = nn.Embedding(n_books, n_factors)
        # The classification head of the text model is reused as a projection
        # to ``text_features`` dimensions rather than as a classifier.
        self.text_transformer = AutoModelForSequenceClassification.from_pretrained(
            transformer_model_name, num_labels=text_features)
        self.linear_layers = nn.Sequential(
            nn.Linear(n_factors*2+text_features, 256, bias=False),
            nn.LeakyReLU(),
            nn.BatchNorm1d(256),
            nn.Dropout(),
            nn.Linear(256, n_classes)
        )

    def forward(self, user_id=None, book_id=None,
                input_ids=None, attention_mask=None, labels=None):
        """Compute class logits and, when labels are given, the loss.

        Args:
            user_id: Integer codes indexing the user embedding table.
            book_id: Integer codes indexing the book embedding table.
            input_ids: Token ids for the text transformer.
            attention_mask: Attention mask for the text transformer.
            labels: Optional target class indices.

        Returns:
            ``{"logits": ...}``, plus a ``"loss"`` entry (cross-entropy) when
            ``labels`` is not ``None``.
        """
        x_users = self.user_embs(user_id)
        x_books = self.book_embs(book_id)
        x_transformer = self.text_transformer(input_ids=input_ids,
                                              attention_mask=attention_mask).logits
        # Join the three feature vectors along the last dimension.
        x = torch.cat([x_users, x_books, x_transformer], dim=-1)
        logits = self.linear_layers(x)

        if labels is not None:
            loss = cross_entropy(logits, labels)
            return {"loss": loss, "logits": logits}

        return {"logits": logits}

In [None]:
def get_model():
    """Build a fresh ``CombinedModel`` from the notebook-level settings.

    Uses the module-level ``n_users``, ``n_books`` and
    ``transformer_model_name`` together with a default ``CombinedConfig``.
    """
    return CombinedModel(CombinedConfig(), n_users, n_books,
                         transformer_model_name)

In [None]:
def training_loop(batch_size=32, lr=5e-5, num_epochs=3,
                  mixed_precision="fp16", seed=0):
    """Train ``CombinedModel`` with Accelerate and save the result.

    After every epoch the validation accuracy is printed once (via
    ``accelerator.print``). When training finishes the unwrapped model is
    written to ``models/combined_v2/`` by the main process only.

    Args:
        batch_size: Batch size passed to ``get_dataloaders``.
        lr: Learning rate for AdamW.
        num_epochs: Number of passes over the training loader.
        mixed_precision: Mixed-precision mode handed to ``Accelerator``.
        seed: Seed passed to ``accelerate.utils.set_seed``.
    """
    set_seed(seed)
    accelerator = Accelerator(mixed_precision=mixed_precision)
    model = get_model()
    train_dl, valid_dl = get_dataloaders(batch_size)

    # The schedule length is taken from the loader before prepare(), exactly
    # as before. NOTE(review): Accelerate is expected to advance the prepared
    # scheduler once per process on each step() call - confirm for the
    # installed version.
    num_training_steps = num_epochs * len(train_dl)
    optimizer = AdamW(model.parameters(), lr=lr)
    lr_scheduler = get_scheduler(
        name="linear",
        optimizer=optimizer,
        num_warmup_steps=0,
        num_training_steps=num_training_steps)

    model, optimizer, train_dl, valid_dl, lr_scheduler = accelerator.prepare(
        model, optimizer, train_dl, valid_dl, lr_scheduler)

    # After prepare() a process may see only its shard of the batches, so the
    # bar is sized from the prepared loader and drawn by one process per node.
    progress_bar = tqdm(range(num_epochs * len(train_dl)),
                        disable=not accelerator.is_local_main_process)
    for epoch in range(num_epochs):
        model.train()
        for batch in train_dl:
            outputs = model(**batch)
            # accelerator.backward replaces loss.backward() so that gradient
            # scaling for mixed precision is applied.
            accelerator.backward(outputs["loss"])

            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()
            progress_bar.update(1)

        model.eval()
        accurate = 0
        num_elems = 0
        for batch in valid_dl:
            with torch.no_grad():
                outputs = model(**batch)

            predictions = torch.argmax(outputs["logits"], dim=-1)
            # gather_for_metrics collects results from all processes and drops
            # the samples duplicated to even out the final batch, which a
            # plain gather() would count twice.
            predictions, references = accelerator.gather_for_metrics(
                (predictions, batch["labels"]))
            num_elems += references.shape[0]
            accurate += (predictions == references).sum().item()

        # Plain Python counters keep this safe when the validation loader is
        # empty (no tensor to call .item() on, no division by zero).
        accuracy = accurate / num_elems if num_elems else float("nan")
        accelerator.print(f"Epoch {epoch+1} accuracy: {100*accuracy:.2f}%")

    # Save once: wait for every process, unwrap the prepared model, and let
    # only the main process write so workers do not race on the same files.
    accelerator.wait_for_everyone()
    model = accelerator.unwrap_model(model)
    if accelerator.is_main_process:
        model.save_pretrained("models/combined_v2/")

In [None]:
# Uncomment to run the loop directly in this process instead of through notebook_launcher:
#training_loop(batch_size=8, num_epochs=3)

In [None]:
# Number of CUDA devices reported by PyTorch; it determines how many
# processes notebook_launcher starts below.
num_gpus = torch.cuda.device_count()
num_gpus

In [None]:
# Positional arguments for training_loop:
# (batch_size, lr, num_epochs, mixed_precision, seed).
# fp16 mixed precision requires a GPU, so on a CPU-only machine fall back to
# full precision and a single process instead of failing inside Accelerator.
mixed_precision = "fp16" if num_gpus > 0 else "no"
args = (32, 5e-5, 10, mixed_precision, 0)
notebook_launcher(training_loop, args, num_processes=max(num_gpus, 1))