# GPT-2 Review Generation Demo

Notes:
- This is a self-contained notebook but we also demo the sampling of prompts from the original Amazon dataset so please upload the file from path 'data/amazon_reviews/test/amazon_reviews.txt' with the notebook to colab
- Our best model is provided in the 'training/distilgpt-topic-ft/pytorch_model.bin', please upload this .bin model file to colab alongside this notebook to load it (place in working directory).
- We have marked below points where the marker can enter their own inputs into the model to test it

In [2]:
!pip install transformers
!pip install datasets

'wget' is not recognized as an internal or external command,
operable program or batch file.
'id' is not recognized as an internal or external command,
operable program or batch file.


In [1]:
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoConfig
import torch
import numpy as np
import pandas as pd

In [80]:
SPECIAL_TOKENS  = { "bos_token": "<|BOS|>",
                    "eos_token": "<|EOS|>",
                    "unk_token": "<|UNK|>",                    
                    "pad_token": "<|PAD|>",
                    "sep_token": "<|SEP|>"}
MODEL = 'distilgpt2'

In [None]:
class GPT2:
    def __init__(self, model_path=None, full_model=False, special_tokens=None) -> None:
        self.tokenizer = self.get_tokenizer(special_tokens)
        self.model = self.get_model(self.tokenizer, special_tokens=special_tokens, load_model_path=model_path, full_model=full_model)
        
    def get_tokenizer(self, special_tokens=None):
        tokenizer = AutoTokenizer.from_pretrained(MODEL, use_fast=True)

        if special_tokens:
            tokenizer.add_special_tokens(special_tokens)
        return tokenizer
    
    def get_model(self, tokenizer, special_tokens=None, load_model_path=None, full_model=False):
        if full_model:
            model = AutoModelForCausalLM.from_pretrained(load_model_path)
            model.cuda()
            return model 
        
        if special_tokens:
            config = AutoConfig.from_pretrained(MODEL, 
                                                bos_token_id=tokenizer.bos_token_id,
                                                eos_token_id=tokenizer.eos_token_id,
                                                sep_token_id=tokenizer.sep_token_id,
                                                pad_token_id=tokenizer.pad_token_id,
                                                output_hidden_states=False)
        else: 
            config = AutoConfig.from_pretrained(MODEL,                                     
                                                pad_token_id=tokenizer.eos_token_id,
                                                output_hidden_states=False)    

        model = AutoModelForCausalLM.from_pretrained(MODEL, config=config)

        if special_tokens:
            model.resize_token_embeddings(len(tokenizer))

        if load_model_path:
            model.load_state_dict(torch.load(load_model_path))

        model.to(torch.device('cuda:0'))
        return model
    
    def generate_text(self, prompt, category, print_output=True, **kwargs):
        generated_outputs = []
        
        # Tokenize prompt
        tokenized_prompt = self.tokenizer.encode(prompt, return_tensors='pt').to('cuda:0')
        
        # Language modelling
        output = self.model.generate(tokenized_prompt, **kwargs)
        
        for i, o in enumerate(output):
            gen_txt = self.tokenizer.decode(o, skip_special_tokens=True)
            gen_txt = gen_txt[len(category):]
            truncated_txt = gen_txt.split('.')
            truncated_txt = '.'.join(truncated_txt[:-1]) + '.'
            generated_outputs.append(truncated_txt)
            
            if print_output:
                print(truncated_txt + '\n')
                
        return generated_outputs

In [81]:
def sample_start_amazon(df, length=5):
    sample = df.sample(n=1)
    title, category, text = list(sample['REVIEW_TITLE'])[0], list(sample['PRODUCT_CATEGORY'])[0], list(sample['REVIEW_TEXT'])[0]
    sample = str(text).split(' ')
    return ' '.join(sample[:length]), title, category, text

**Please ensure the amazon_reviews.txt file stated at the top of this notebook is in the working dir**

In [None]:
# Load our test-data that we will be sampling categories and prompts from
data = pd.read_csv('amazon_reviews.txt')
data.loc[data['LABEL'] == '__label2__', 'LABEL'] = 0
data.loc[data['LABEL'] == '__label1__', 'LABEL'] = 1
data_amazon = data.get(data['LABEL'] == 0)

**Make sure the pytorch_model.bin file is in the working directory to be loaded**

In [83]:
# Load the category model
model_path = 'pytorch_model.bin'
model = GPT2(model_path=model_path, full_model=False, special_tokens=SPECIAL_TOKENS)

In [84]:
# These are the available categories
categories = ['Apparel', 'Automotive', 'Baby', 'Beauty', 'Books', 'Camera', 'Electronics', 'Furniture', 'Grocery', 'Health & Personal Care', 'Home', 'Home Entertainment', 'Home Improvement', 'Jewelry', 'Kitchen', 'Lawn and Garden', 'Luggage', 'Musical Instruments', 'Office Products', 'Outdoors', 'PC', 'Pet Products', 'Shoes', 'Sports', 'Tools', 'Toys', 'Video DVD', 'Video Games', 'Watches', 'Wireless']
start_words = ['A', 'After', 'All', 'Any', 'Apart', 'Arrived', 'As', 'At', 'Attended', 'Avoid', 'Awesome', 'Be', 'Beautiful', 'Before', 'Booked', 'Check', "Didn't", 'Despite', 'Do', "Don't", 'Elegant', 'Even', 'Excellent', 'First', 'Firstly', 'For', 'From', 'Generally', 'Going', 'Good', 'Got', 'Great', 'Guys', 'Had', 'Have', 'Having', 'Here', 'How', 'I', "I'd", "I'll", "I'm", "I've", 'If', 'In', 'It', "It's", 'Just', 'Let', 'Me', 'My', 'Nice', 'No', 'Not', 'Often', 'Ok,', 'On', 'Other', 'Our', 'Overall', 'Recently', 'Rude,', 'Seriously', 'Simply', 'Sometimes', 'The', 'They', 'This', 'Used', 'Very', 'Was', 'We', "We've", 'Well', 'Went', 'What', "What's", 'When', 'While']

Random sampling

In [None]:
prompt = np.random.choice(start_words)
cat = np.random.choice(categories)
prompt = SPECIAL_TOKENS['bos_token'] + cat + SPECIAL_TOKENS['sep_token'] + prompt
outputs = model.generate_text(prompt, cat, print_output=True, do_sample=True, max_length=200, num_beams=5, repetition_penalty=5.0, early_stopping=True, num_return_sequences=3)

Sampling from Amazon human set

In [None]:
prompt, title, cat, original = sample_start_amazon(data_amazon, length=np.random.randint(4, 8))
prompt = SPECIAL_TOKENS['bos_token'] + cat + SPECIAL_TOKENS['sep_token'] + prompt
outputs = model.generate_text(prompt, cat, print_output=True, do_sample=True, max_length=70, num_beams=5, repetition_penalty=5.0, early_stopping=True, num_return_sequences=3)

Free input section

In [None]:
prompt = 'PUT YOUR PROMPT HERE'
cat = 'PLEASE SELECT ONLY CATEGORIES AVAILABLE IN THE CATEGORIES LIST ABOVE'
prompt = SPECIAL_TOKENS['bos_token'] + cat + SPECIAL_TOKENS['sep_token'] + prompt
outputs = model.generate_text(prompt, cat, print_output=True, do_sample=True, max_length=70, num_beams=5, repetition_penalty=5.0, early_stopping=True, num_return_sequences=3)

# Review Classification demo

Notes:
- This part shows the top classifier models we built. 
- The classifiers are proven functional by Classifier(Gold dataset) 
- The real-world-data performance are evaluated by Classifier(Amazon dataset) 
- The GPT-2 generated dataset is proven to be able to deceive the classifier by Classifier(Amazon real + GPT-2 generated fake)

In [5]:
import numpy as np
import pandas as pd
import os
from pathlib import Path
from random import shuffle
import copy

from sklearn.pipeline import Pipeline

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import RidgeClassifier, SGDClassifier

In [6]:
def shuffle_dict(data):
    text = list(data['text'])
    labels = list(data['labels'])

    l =list(zip(text, labels))

    shuffle(l)
    text, labels = zip(*l)
    dict = {'text': text,
            'labels': labels} 

    return dict


def finalize_amazon_test(panda_data):
    text = []
    labels = []
    
    np_data = panda_data.to_numpy()

    for i in range(len(np_data)):
        # the fake data
        if np_data[i][1] == '__label1__':
            text.append(np_data[i][8])
            labels.append('__label1__')

        # the real data
        else:
            text.append(np_data[i][8])
            labels.append('__label2__')


    dict_test = {'text': text,
                'labels': labels} 

    return dict_test


def finalize_amazon(panda_data):
    text_fake = []
    labels_fake = []

    text_real = []
    labels_real = []
    
    np_data = panda_data.to_numpy()

    for i in range(len(np_data)):
        # the fake data
        if np_data[i][1] == '__label1__':
            text_fake.append(np_data[i][8])
            labels_fake.append('__label1__')

        # the real data
        else:
            text_real.append(np_data[i][8])
            labels_real.append('__label2__')


    dict_fake = {'text': text_fake,
                'labels': labels_fake} 

    dict_real = {'text': text_real,
                'labels': labels_real} 

    return dict_fake, dict_real


def finalize_gpt(panda_data):
    text_Guide = []
    labels_Guide = []

    text_nonGuide = []
    labels_nonGuide = []

    np_data = panda_data.to_numpy()

    for i in range(len(np_data)):
        # the guided data
        if np_data[i][2] == 'GUIDED':
            text_Guide.append(np_data[i][1])
            labels_Guide.append('__label1__')

        # the non-guided data
        if np_data[i][2] == 'NON-GUIDED':
            text_nonGuide.append(np_data[i][1])
            labels_nonGuide.append('__label1__')

    dict_Guide = {'text': text_Guide,
                'labels': labels_Guide} 

    dict_nonGuide = {'text': text_nonGuide,
                'labels': labels_nonGuide} 

    return dict_Guide, dict_nonGuide


def finalize_gold(panda_data):
    text_fake = []
    labels_fake = []

    text_real = []
    labels_real = []
    
    np_data = panda_data.to_numpy()

    for i in range(len(np_data)):
        # the fake data
        if np_data[i][0] == '__label1__':
            text_fake.append(np_data[i][-1])
            labels_fake.append('__label1__')

        # the real data
        else:
            text_real.append(np_data[i][-1])
            labels_real.append('__label2__')


    dict_fake = {'text': text_fake,
                'labels': labels_fake} 

    dict_real = {'text': text_real,
                'labels': labels_real} 

    return dict_fake, dict_real


def merge_data(d1, d2):
    merged_dic = {}
    for key in d1.keys():
        merged_dic[key] = d1[key] + d2[key]
    
    return merged_dic


def merge_shuffle_data(data1, data2):
    data = merge_data(data1, data2)
    data = shuffle_dict(data)
    print('Datasets merged and shuffled, total data: ', len(data['text']))

    return data

def run_classifier(n_range, info, train_data, test_data):
    print(info)
    best_acc = 0
    for clf, name in (
            (LinearSVC(), "LinearSVC"),
            (MultinomialNB(), "Multi  NB"),
            (RidgeClassifier(), "Ridge cls"),
            (SGDClassifier(), "SGD   cls"),
            ):

        text_clf = Pipeline([
                            ('vect', CountVectorizer(ngram_range=n_range)),
                            ('tfidf', TfidfTransformer()),
                            ('clf', clf),
                            ])

        text_clf.fit(train_data['text'], train_data['labels'])

        predicted = text_clf.predict(test_data['text'])
        acc = np.mean(predicted == test_data['labels'])

        print(name, acc)

        if acc > best_acc:
            best_acc = acc
            best_result = (name, acc)
    
    print('Best result in this run: ', best_result)

Load the data and seperate them into real/fake reviews

In [8]:
# Load the 3 main datasets: gold, amazon, gpt
gpt_data = pd.read_csv('gpt_generated_data.csv', sep=',')
gold_data = pd.read_csv('opspam.txt', sep='   ')
train_amazon = pd.read_csv('train/amazon_reviews.txt', sep=',')
test_amazon = pd.read_csv('test/amazon_reviews.txt', sep=',')

# process the data into dictionary to get ready to throw into the classifier 
amazon_test = finalize_amazon_test(test_amazon)
print("amazon testing set", len(amazon_test['text']))

amazon_fake, amazon_real = finalize_amazon(train_amazon)
print("amazon fake/real", len(amazon_fake['text']), len(amazon_real['text']))

gold_data_num  = len(gold_data)
gold_fake, gold_real = finalize_gold(gold_data.head(int(gold_data_num * 0.8)))
print("gold fake/real", len(gold_fake['text']), len(gold_real['text']))
gold_fake_test, gold_real_test = finalize_gold(gold_data.tail(int(gold_data_num * 0.2)))
print("gold test fake/real", len(gold_fake_test['text']), len(gold_real_test['text']))

gpt_Guide, gpt_nonGuide = finalize_gpt(gpt_data)
print("gpt Guide/nonGuide", len(gpt_Guide['text']), len(gpt_nonGuide['text']))

  gold_data = pd.read_csv('opspam.txt', sep='   ')


amazon testing set 5250
amazon fake/real 7834 7916
gold fake/real 651 629
gold test fake/real 149 171
gpt Guide/nonGuide 4954 4954


Classifier(Gold dataset) with cross-domain classification

In [9]:
gold_test = merge_shuffle_data(gold_fake_test, gold_real_test)
gold_mix = merge_shuffle_data(gold_fake, gold_real)

# Classifier trained with gold dataset, tested on gold dataset
run_classifier((1,2), 'ngram: 1 - 2', gold_mix, gold_test)

# Classifier trained with gold dataset, tested on amazon dataset (this is the cross-domain classification)
run_classifier((1,2), 'ngram: 1 - 2', gold_mix, amazon_test)

Datasets merged and shuffled, total data:  320
Datasets merged and shuffled, total data:  1280
ngram: 1 - 2
LinearSVC 0.909375
Multi  NB 0.73125
Ridge cls 0.9
SGD   cls 0.896875
Best result in this run:  ('LinearSVC', 0.909375)
ngram: 1 - 2
LinearSVC 0.5118095238095238
Multi  NB 0.5499047619047619
Ridge cls 0.5110476190476191
SGD   cls 0.5135238095238095
Best result in this run:  ('Multi  NB', 0.5499047619047619)


Classifier(Amazon dataset) with cross-domain classification

In [10]:
amazon_mix = merge_shuffle_data(amazon_fake, amazon_real )

# Classifier trained with amazon dataset, tested on amazon dataset
run_classifier((1,2), 'ngram: 1 - 2', amazon_mix, amazon_test)

# Classifier trained with ammazon dataset, tested on gold dataset (this is the cross-domain classification)
run_classifier((1,2), 'ngram: 1 - 2', amazon_mix, gold_test)

Datasets merged and shuffled, total data:  15750
ngram: 1 - 2
LinearSVC 0.6651428571428571
Multi  NB 0.6622857142857143
Ridge cls 0.6693333333333333
SGD   cls 0.6660952380952381
Best result in this run:  ('Ridge cls', 0.6693333333333333)
ngram: 1 - 2
LinearSVC 0.575
Multi  NB 0.6125
Ridge cls 0.58125
SGD   cls 0.59375
Best result in this run:  ('Multi  NB', 0.6125)


Classifier(Amazon real + GPT-2 generated fake) 

GPT-2 generated fake data includes guided and non-guided data

In [11]:
amazonReal_gptGuide = merge_shuffle_data(amazon_real, gpt_Guide)
amazonReal_gptNonGuide = merge_shuffle_data(amazon_real, gpt_nonGuide)

# Classifier trained with (amazon real + guided GPT-2 generated fake), tested on amazon dataset
run_classifier((1,2), 'ngram: 1 - 2', amazonReal_gptGuide, amazon_test)

# Classifier trained with (amazon real + no-guided GPT-2 generated fake), tested on amazon dataset
run_classifier((1,2), 'ngram: 1 - 2', amazonReal_gptNonGuide, amazon_test)

Datasets merged and shuffled, total data:  12870
Datasets merged and shuffled, total data:  12870
ngram: 1 - 2
LinearSVC 0.5001904761904762
Multi  NB 0.4942857142857143
Ridge cls 0.49676190476190474
SGD   cls 0.49866666666666665
Best result in this run:  ('LinearSVC', 0.5001904761904762)
ngram: 1 - 2
LinearSVC 0.4956190476190476
Multi  NB 0.4942857142857143
Ridge cls 0.4956190476190476
SGD   cls 0.49523809523809526
Best result in this run:  ('LinearSVC', 0.4956190476190476)
