Use this notebook in [Google Colab](https://drive.google.com/file/d/1BsMFZBG7QhGyXBTO-8BWQRkX4gqcJ7Cb/view?usp=sharing)

#### Resources

[HuggingFace BERT2BERT Tutorial](https://colab.research.google.com/drive/1Ekd5pUeCX7VOrMx94_czTkwNtLN32Uyu?usp=sharing)

[Yelp Open Dataset Documentation](https://www.yelp.com/dataset/documentation/main)

#### Next two cells are only needed when using Google Colab.

In [None]:
# Mount Google Drive inside the Colab runtime at /content/drive so the
# project files stored there can be read and written like local files.
from google.colab import drive
drive.mount("/content/drive")

In [None]:
# change this to your project directory
# The path is relative: it assumes the current working directory is /content,
# i.e. the parent of the /content/drive mount point.
%cd "drive/MyDrive/CodingProjects/yelp_review_generator"

In [None]:
%%capture
# Install the exact library versions this notebook was written against.
# (%%capture must stay on the first line of the cell; it hides pip's output.)
!pip install datasets==1.5.0
!pip install transformers==4.5.1

In [None]:
import json
import random
from typing import Dict, List

from datasets import Dataset
from transformers import (
    AutoTokenizer, 
    EncoderDecoderConfig, 
    EncoderDecoderModel, 
    Seq2SeqTrainer, 
    Seq2SeqTrainingArguments
)

Before continuing, download the [Yelp Open Dataset](https://www.yelp.com/dataset). After decompressing the archive, make sure the extracted folder is named `yelp_dataset` and is placed in the root directory of this project; the file paths in the next cell are relative to that directory.

In [None]:
# Yelp Open Dataset files, relative to the project root. Each file is read
# line by line, with one JSON object parsed per line.
FILEPATH_BUSINESS = "yelp_dataset/yelp_academic_dataset_business.json"
FILEPATH_REVIEW = "yelp_dataset/yelp_academic_dataset_review.json"
FILEPATH_USER = "yelp_dataset/yelp_academic_dataset_user.json"

# Checkpoint name used for the tokenizer and for both halves of the model.
PRETRAINED_MODEL_NAME = "bert-base-uncased"
# Passed to the training arguments as `output_dir`.
TRAINED_MODEL_OUTPUT_DIR = "model"

# Per-device batch size, used for both training and evaluation.
BATCH_SIZE = 16
# Token budget for the encoder input (the "stars ...; funny ...; ..." prompt).
ENCODER_MAX_LEN = 32
# Token budget for the decoder (the review text). Also reused as the
# whitespace-word cap when trimming reviews and as the generation max_length.
DECODER_MAX_LEN = 128

## Tokenizer

In [None]:
# Load the tokenizer that belongs to the pretrained checkpoint.
tokenizer = AutoTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)
# Reuse the tokenizer's cls_token / sep_token as BOS / EOS. The resulting
# bos_token_id and eos_token_id are copied into the model config later on
# (decoder_start_token_id and eos_token_id).
tokenizer.bos_token = tokenizer.cls_token
tokenizer.eos_token = tokenizer.sep_token

## Data

In [None]:
# Index the businesses whose reviews will be used, keyed by business_id.
# A business is kept only if it lists categories and has more than 5 and
# fewer than 40 reviews.
businesses = {}
# One JSON object per line. Decode explicitly as UTF-8 so parsing does not
# depend on the platform's default encoding.
with open(FILEPATH_BUSINESS, "r", encoding="utf-8") as f:
    for line in f:
        business = json.loads(line)
        if business["categories"] and 5 < business["review_count"] < 40:
            businesses[business["business_id"]] = {
                "name": business["name"],
                "city": business["city"],
                # categories is stored as a comma separated str. Convert to a list.
                "categories": business["categories"].split(", "),
            }

print(f"num businesses: {len(businesses)}")

num businesses: 102635


In [None]:
# Map every user_id to the user features used in the prompt
# (currently only "elite_level").
users = {}
# One JSON object per line. Decode explicitly as UTF-8 so parsing does not
# depend on the platform's default encoding.
with open(FILEPATH_USER, "r", encoding="utf-8") as f:
    for line in f:
        user = json.loads(line)
        # "elite" is a str composed of years separated by commas. e.g. "2004,2005"
        # "elite_level" is the number of comma-separated entries in that str,
        # or 0 when the field is empty.
        elite_level = len(user["elite"].split(",")) if user["elite"] else 0
        users[user["user_id"]] = {"elite_level": elite_level}

In [None]:
# Build two parallel columns: "input_text" holds the attribute prompt and
# "output_text" holds the (trimmed) review the model should learn to produce.
reviews = {"input_text": [], "output_text": []}
# One JSON object per line. Decode explicitly as UTF-8 so review text does not
# depend on the platform's default encoding.
with open(FILEPATH_REVIEW, "r", encoding="utf-8") as f:
    for line in f:
        review = json.loads(line)
        business = businesses.get(review["business_id"])
        if business is None:
            # the review belongs to a business that was filtered out earlier
            continue
        # Direct lookup: raises KeyError if the author is missing from `users`.
        user = users[review["user_id"]]

        # shuffle categories each time to prevent model from memorizing order.
        # random.sample returns a new list, so the list stored in `businesses`
        # is not mutated as a side effect of building the prompt.
        categories = business["categories"]
        categories_str = ", ".join(random.sample(categories, k=len(categories)))

        input_text = (
            f"stars {int(review['stars'])}"
            f"; funny {review['funny']}"
            f"; elite level {user['elite_level']}"
            f"; name {business['name']}"
            f"; city {business['city']}"
            f"; categories {categories_str}"
        )
        reviews["input_text"].append(input_text)

        # trim off excess tokens to reduce memory. This caps the review at
        # DECODER_MAX_LEN whitespace-separated words (not tokenizer tokens) and
        # rejoins them with single spaces.
        output_tokens = review["text"].split()[:DECODER_MAX_LEN]
        reviews["output_text"].append(" ".join(output_tokens))

In [None]:
# Wrap the two text columns collected in `reviews` in a Dataset.
ds = Dataset.from_dict(reviews)
# Keep 95% of the rows for training; the remainder lands in the "test" split,
# which this notebook uses as its validation set.
# NOTE(review): no seed is passed, so the split presumably differs between
# runs - confirm whether reproducibility matters here.
ds = ds.train_test_split(train_size=0.95)
train_ds, val_ds = ds["train"], ds["test"]
# Bare expression so the notebook displays the resulting splits.
ds

DatasetDict({
    train: Dataset({
        features: ['input_text', 'output_text'],
        num_rows: 1565192
    })
    test: Dataset({
        features: ['input_text', 'output_text'],
        num_rows: 82379
    })
})

In [None]:
def process_data_to_model_inputs(batch):
    """Tokenize a batch of prompt/review pairs into encoder-decoder features.

    Args:
        batch: mapping with the columns "input_text" (prompts) and
            "output_text" (reviews), each a list of strings.

    Returns:
        The same ``batch`` object with "input_ids", "attention_mask",
        "decoder_input_ids", "decoder_attention_mask" and "labels" added.
        Prompts are padded/truncated to ENCODER_MAX_LEN tokens and reviews
        to DECODER_MAX_LEN tokens.
    """
    pad_id = tokenizer.pad_token_id

    encoded_prompts = tokenizer(
        batch["input_text"],
        max_length=ENCODER_MAX_LEN,
        truncation=True,
        padding="max_length",
    )
    encoded_reviews = tokenizer(
        batch["output_text"],
        max_length=DECODER_MAX_LEN,
        truncation=True,
        padding="max_length",
    )

    batch["input_ids"] = encoded_prompts.input_ids
    batch["attention_mask"] = encoded_prompts.attention_mask
    batch["decoder_input_ids"] = encoded_reviews.input_ids
    batch["decoder_attention_mask"] = encoded_reviews.attention_mask

    # because BERT automatically shifts the labels, the labels correspond
    # exactly to `decoder_input_ids`. Padding positions are replaced by -100
    # so that the PAD token is ignored.
    masked_labels = []
    for sequence in encoded_reviews.input_ids:
        masked_labels.append(
            [token if token != pad_id else -100 for token in sequence]
        )
    batch["labels"] = masked_labels

    return batch

# Columns exposed to the trainer once a split has been tokenized.
_MODEL_COLUMNS = (
    "input_ids",
    "attention_mask",
    "decoder_input_ids",
    "decoder_attention_mask",
    "labels",
)


def _tokenize_split(split):
    """Tokenize one split in batches of 64, drop the raw text columns and
    switch the remaining feature columns to the "torch" format."""
    tokenized = split.map(
        process_data_to_model_inputs,
        batched=True,
        batch_size=64,
        remove_columns=["input_text", "output_text"],
    )
    tokenized.set_format(type="torch", columns=list(_MODEL_COLUMNS))
    return tokenized


# Same order as before: the training split first, then the validation split.
train_ds = _tokenize_split(train_ds)
val_ds = _tokenize_split(val_ds)

## Model

In [None]:
# Build the encoder-decoder model. The same pretrained checkpoint name is
# passed for the encoder and for the decoder.
enc_dec_model = EncoderDecoderModel.from_encoder_decoder_pretrained(
    PRETRAINED_MODEL_NAME, PRETRAINED_MODEL_NAME
)

In [None]:
# set special tokens (ids come from the tokenizer, whose bos/eos tokens were
# set to its cls/sep tokens)
enc_dec_model.config.decoder_start_token_id = tokenizer.bos_token_id
enc_dec_model.config.eos_token_id = tokenizer.eos_token_id
enc_dec_model.config.pad_token_id = tokenizer.pad_token_id

# The top-level config takes its vocab size from the decoder's config.
# The remaining lines are default text-generation settings stored on the config.
# NOTE(review): early_stopping and length_penalty are beam-search options, but
# num_beams is not set in this cell while the sampling options (top_p,
# do_sample) are - confirm which decoding strategy is intended.
enc_dec_model.config.vocab_size = enc_dec_model.config.decoder.vocab_size
enc_dec_model.config.max_length = DECODER_MAX_LEN
enc_dec_model.config.no_repeat_ngram_size = 3
enc_dec_model.config.early_stopping = True
enc_dec_model.config.length_penalty = 2.0
enc_dec_model.config.top_p = 0.95
enc_dec_model.config.do_sample = True

## Train

In [None]:
# set training arguments - these params are not really tuned, feel free to change
training_args = Seq2SeqTrainingArguments(
    # checkpoints go to TRAINED_MODEL_OUTPUT_DIR, one every 5000 steps;
    # save_total_limit caps how many are kept
    output_dir=TRAINED_MODEL_OUTPUT_DIR,
    save_total_limit=2,
    overwrite_output_dir=True,
    save_steps=5000,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    # NOTE(review): eval_steps is not given; per the TrainingArguments docs it
    # presumably falls back to logging_steps (5000) - confirm
    evaluation_strategy="steps",
    logging_steps=5000,
    logging_first_step=True,
    warmup_ratio=0.05,
    # a single pass over the training split
    num_train_epochs=1,
    # mixed-precision training; presumably needs a GPU runtime - confirm
    fp16=True
)

# instantiate trainer (the "test" split produced by train_test_split is used
# as the evaluation set)
trainer = Seq2SeqTrainer(
    model=enc_dec_model,
    tokenizer=tokenizer,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=val_ds
)

In [None]:
# Start fine-tuning with the trainer configured in the previous cell.
trainer.train()