# GPT-2 Review Generation Demo

Notes:
- This notebook is self-contained, but it also demonstrates sampling prompts from the original Amazon dataset, so please upload the file at 'amazon_reviews/test/amazon_reviews.txt' to Colab alongside the notebook.
- Our best model (the category-conditioned one) is provided in the training folder; please upload it to Colab alongside this notebook so that it can be loaded.
- We have marked the points below where the marker can enter their own inputs into the model to test it.

In [2]:
!pip install transformers
!pip install datasets

'wget' is not recognized as an internal or external command,
operable program or batch file.
'id' is not recognized as an internal or external command,
operable program or batch file.


In [24]:
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoConfig
import torch
import numpy as np
import pandas as pd

In [80]:
# Special tokens that are registered with the tokenizer; the generation cells
# build prompts from the BOS and SEP entries.
SPECIAL_TOKENS = dict(
    bos_token="<|BOS|>",
    eos_token="<|EOS|>",
    unk_token="<|UNK|>",
    pad_token="<|PAD|>",
    sep_token="<|SEP|>",
)

# Name of the pretrained checkpoint used for the tokenizer, config and base weights.
MODEL = 'distilgpt2'

In [None]:
class GPT2:
    """Wrapper bundling a causal language model with its tokenizer for text generation.

    The tokenizer, config and base weights come from the checkpoint named by the
    module-level ``MODEL`` constant. Fine-tuned weights can optionally be loaded
    on top, either as a state dict or as a complete saved model.

    Attributes:
        device: ``cuda:0`` when CUDA is available, otherwise the CPU.
        tokenizer: tokenizer returned by :meth:`get_tokenizer`.
        model: model returned by :meth:`get_model`, already moved to ``device``.
    """

    def __init__(self, model_path=None, full_model=False, special_tokens=None) -> None:
        """Build the tokenizer and the model.

        Args:
            model_path: optional path to fine-tuned weights (see :meth:`get_model`).
            full_model: if True, ``model_path`` is passed to ``from_pretrained``
                as a complete saved model instead of being read as a state dict.
            special_tokens: optional dict passed to ``tokenizer.add_special_tokens``.
        """
        # Fall back to the CPU so the demo also runs on machines without a GPU.
        self.device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
        self.tokenizer = self.get_tokenizer(special_tokens)
        self.model = self.get_model(self.tokenizer, special_tokens=special_tokens, load_model_path=model_path, full_model=full_model)

    def get_tokenizer(self, special_tokens=None):
        """Load the ``MODEL`` tokenizer and register ``special_tokens`` if given."""
        tokenizer = AutoTokenizer.from_pretrained(MODEL, use_fast=True)

        if special_tokens:
            tokenizer.add_special_tokens(special_tokens)
        return tokenizer

    def get_model(self, tokenizer, special_tokens=None, load_model_path=None, full_model=False):
        """Create the language model and move it to ``self.device``.

        Args:
            tokenizer: tokenizer whose special-token ids are copied into the config.
            special_tokens: if truthy, the bos/eos/sep/pad ids are taken from the
                tokenizer and the embedding matrix is resized to ``len(tokenizer)``.
            load_model_path: optional path to fine-tuned weights.
            full_model: if True, load ``load_model_path`` with ``from_pretrained``
                and skip the config / state-dict steps entirely.

        Returns:
            The model, placed on ``self.device``.

        Raises:
            ValueError: if ``full_model`` is True but no ``load_model_path`` is given.
        """
        if full_model:
            if not load_model_path:
                raise ValueError('full_model=True requires load_model_path to point at a saved model')
            model = AutoModelForCausalLM.from_pretrained(load_model_path)
            return model.to(self.device)

        if special_tokens:
            config = AutoConfig.from_pretrained(MODEL,
                                                bos_token_id=tokenizer.bos_token_id,
                                                eos_token_id=tokenizer.eos_token_id,
                                                sep_token_id=tokenizer.sep_token_id,
                                                pad_token_id=tokenizer.pad_token_id,
                                                output_hidden_states=False)
        else:
            # Without custom special tokens the EOS token doubles as padding.
            config = AutoConfig.from_pretrained(MODEL,
                                                pad_token_id=tokenizer.eos_token_id,
                                                output_hidden_states=False)

        model = AutoModelForCausalLM.from_pretrained(MODEL, config=config)

        if special_tokens:
            # The added special tokens enlarge the vocabulary, so the embedding
            # matrix has to grow before matching fine-tuned weights can be loaded.
            model.resize_token_embeddings(len(tokenizer))

        if load_model_path:
            # map_location lets a checkpoint saved on a GPU load on a CPU-only machine.
            # NOTE: torch.load unpickles the file; only load checkpoints you trust.
            state_dict = torch.load(load_model_path, map_location=self.device)
            model.load_state_dict(state_dict)

        return model.to(self.device)

    def generate_text(self, prompt, category, print_output=True, **kwargs):
        """Generate continuations of ``prompt`` and trim each to whole sentences.

        Args:
            prompt: full prompt string that is tokenized and fed to the model.
            category: category string; ``len(category)`` leading characters are
                removed from every decoded output. This assumes the decoded text
                starts with the category, as it does for the prompts built in
                this notebook (BOS + category + SEP + text).
            print_output: if True, print each result followed by a blank line.
            **kwargs: forwarded unchanged to ``self.model.generate``.

        Returns:
            A list with one string per generated sequence. Each string is cut
            after its last '.'; a string containing no '.' is returned untrimmed
            rather than being reduced to a lone '.'.
        """
        generated_outputs = []

        # Tokenize the prompt and place it on the same device as the model.
        tokenized_prompt = self.tokenizer.encode(prompt, return_tensors='pt').to(self.device)

        # Language modelling
        output = self.model.generate(tokenized_prompt, **kwargs)

        for sequence in output:
            gen_txt = self.tokenizer.decode(sequence, skip_special_tokens=True)
            gen_txt = gen_txt[len(category):]

            # Drop the trailing unfinished sentence, keeping text up to the last '.'.
            head, dot, _ = gen_txt.rpartition('.')
            truncated_txt = head + dot if dot else gen_txt
            generated_outputs.append(truncated_txt)

            if print_output:
                print(truncated_txt + '\n')

        return generated_outputs

In [81]:
def sample_start_amazon(df, length=5):
    """Pick one random review from ``df`` and return its opening words as a prompt.

    Args:
        df: DataFrame with 'REVIEW_TITLE', 'PRODUCT_CATEGORY' and 'REVIEW_TEXT' columns.
        length: number of leading space-separated words of the review text to keep.

    Returns:
        A tuple ``(prompt, title, category, text)`` where ``prompt`` is the first
        ``length`` words of the review text joined by single spaces, and the
        other three items are the sampled row's raw column values.
    """
    row = df.sample(n=1)
    title, category, text = (
        row[column].tolist()[0]
        for column in ('REVIEW_TITLE', 'PRODUCT_CATEGORY', 'REVIEW_TEXT')
    )
    # str() guards against non-string review text before splitting into words.
    words = str(text).split(' ')
    prompt = ' '.join(words[:length])
    return prompt, title, category, text

**Please ensure that the amazon_reviews.txt file mentioned at the top of this notebook is in the working directory.**

In [25]:
# Read the test data that categories and prompts are sampled from.
data = pd.read_csv('amazon_reviews.txt')

# Recode the string labels in place: '__label2__' becomes 0 and '__label1__' becomes 1.
data.loc[data['LABEL'].eq('__label2__'), 'LABEL'] = 0
data.loc[data['LABEL'].eq('__label1__'), 'LABEL'] = 1

# Keep only the rows labelled 0 (presumably the human-written reviews -- confirm against the dataset docs).
data_amazon = data[data['LABEL'].eq(0)]

  return func(*args, **kwargs)


**Make sure the pytorch_model.bin file is in the working directory so that it can be loaded.**

In [83]:
# Build the category model: the base checkpoint with the special tokens added,
# then the fine-tuned weights loaded from the state-dict file below.
model_path = 'pytorch_model.bin'
model = GPT2(
    special_tokens=SPECIAL_TOKENS,
    full_model=False,
    model_path=model_path,
)

In [84]:
# Product categories that can be used to condition generation.
categories = [
    'Apparel', 'Automotive', 'Baby', 'Beauty', 'Books', 'Camera',
    'Electronics', 'Furniture', 'Grocery', 'Health & Personal Care',
    'Home', 'Home Entertainment', 'Home Improvement', 'Jewelry', 'Kitchen',
    'Lawn and Garden', 'Luggage', 'Musical Instruments', 'Office Products',
    'Outdoors', 'PC', 'Pet Products', 'Shoes', 'Sports', 'Tools', 'Toys',
    'Video DVD', 'Video Games', 'Watches', 'Wireless',
]

# Candidate opening words for randomly generated prompts (an unordered set).
start_words = {
    'A', 'After', 'All', 'Any', 'Apart', 'Arrived', 'As', 'At', 'Attended',
    'Avoid', 'Awesome', 'Be', 'Beautiful', 'Before', 'Booked', 'Check',
    "Didn't", 'Despite', 'Do', "Don't", 'Elegant', 'Even', 'Excellent',
    'First', 'Firstly', 'For', 'From', 'Generally', 'Going', 'Good', 'Got',
    'Great', 'Guys', 'Had', 'Have', 'Having', 'Here', 'How', 'I', "I'd",
    "I'll", "I'm", "I've", 'If', 'In', 'It', "It's", 'Just', 'Let', 'Me',
    'My', 'Nice', 'No', 'Not', 'Often', 'Ok,', 'On', 'Other', 'Our',
    'Overall', 'Recently', 'Rude,', 'Seriously', 'Simply', 'Sometimes',
    'The', 'They', 'This', 'Used', 'Very', 'Was', 'We', "We've", 'Well',
    'Went', 'What', "What's", 'When', 'While',
}

Random sampling

In [None]:
# np.random.choice needs a 1-D sequence (or an int); start_words is a set, which
# NumPy cannot index and rejects with a ValueError. Convert it to a list first;
# sorting gives a stable order so a seeded run is reproducible.
prompt = np.random.choice(sorted(start_words))
cat = np.random.choice(categories)

# Prompt layout: BOS token, category, SEP token, then the opening word.
prompt = SPECIAL_TOKENS['bos_token'] + cat + SPECIAL_TOKENS['sep_token'] + prompt
outputs = model.generate_text(
    prompt,
    cat,
    print_output=True,
    do_sample=True,
    max_length=200,
    num_beams=5,
    repetition_penalty=5.0,
    early_stopping=True,
    num_return_sequences=3,
)

Sampling from Amazon human set

In [None]:
# Take the first 4-7 words of a randomly chosen review as the prompt text.
prompt, title, cat, original = sample_start_amazon(
    data_amazon,
    length=np.random.randint(4, 8),
)

# Prompt layout: BOS token, category, SEP token, then the review opening.
prompt = ''.join((
    SPECIAL_TOKENS['bos_token'],
    cat,
    SPECIAL_TOKENS['sep_token'],
    prompt,
))
outputs = model.generate_text(
    prompt,
    cat,
    print_output=True,
    do_sample=True,
    max_length=70,
    num_beams=5,
    repetition_penalty=5.0,
    early_stopping=True,
    num_return_sequences=3,
)

Free input section

In [None]:
# Edit these two values: the opening text of the review and its category.
prompt = 'PUT YOUR PROMPT HERE'
cat = 'PLEASE SELECT ONLY CATEGORIES AVAILABLE IN THE CATEGORIES LIST ABOVE'

# Prompt layout: BOS token, category, SEP token, then the user text.
prompt = ''.join((
    SPECIAL_TOKENS['bos_token'],
    cat,
    SPECIAL_TOKENS['sep_token'],
    prompt,
))
outputs = model.generate_text(
    prompt,
    cat,
    print_output=True,
    do_sample=True,
    max_length=70,
    num_beams=5,
    repetition_penalty=5.0,
    early_stopping=True,
    num_return_sequences=3,
)