In [79]:
from utils.loader import DataLoader
from models.gpt import GPT2
import numpy as np
import pandas as pd

In [80]:
# Special-token strings: passed to the GPT2 wrapper when the model is loaded and
# used to assemble prompts as <bos> + category + <sep> + start-of-review text.
SPECIAL_TOKENS = {
    "bos_token": "<|BOS|>",
    "eos_token": "<|EOS|>",
    "unk_token": "<|UNK|>",
    "pad_token": "<|PAD|>",
    "sep_token": "<|SEP|>",
}

In [81]:
def sample_start_amazon(df, length=5):
    """Pick one random row of ``df`` and build a prompt from the start of its review.

    Args:
        df: DataFrame providing the columns 'REVIEW_TITLE', 'PRODUCT_CATEGORY'
            and 'REVIEW_TEXT'.
        length: Maximum number of space-separated tokens kept for the prompt.

    Returns:
        A tuple ``(prompt, title, category, text)`` where ``prompt`` is the first
        ``length`` tokens of the review joined by single spaces, and the other
        three items are the raw cell values of the sampled row.
    """
    row = df.sample(n=1)
    title = next(iter(row['REVIEW_TITLE']))
    category = next(iter(row['PRODUCT_CATEGORY']))
    text = next(iter(row['REVIEW_TEXT']))
    # str() lets non-string cells be split as well; the split is on single spaces,
    # so consecutive spaces produce empty tokens that still count towards `length`.
    leading_tokens = str(text).split(' ')[:length]
    return ' '.join(leading_tokens), title, category, text

def sample_start_gold(df, length=5):
    """Pick one random row of ``df`` and build a prompt from the start of its review.

    Args:
        df: DataFrame providing a 'REVIEW_TEXT' column.
        length: Maximum number of space-separated tokens kept for the prompt.

    Returns:
        A tuple ``(prompt, text)`` where ``prompt`` is the first ``length`` tokens
        of the review joined by single spaces and ``text`` is the raw cell value.
    """
    row = df.sample(n=1)
    text = next(iter(row['REVIEW_TEXT']))
    # str() lets non-string cells be split as well; the split is on single spaces.
    leading_tokens = str(text).split(' ')[:length]
    return ' '.join(leading_tokens), text

In [82]:
# Load our test-data that we will be sampling categories and prompts from
# DataLoader comes from the project's utils.loader module; what the load_* calls
# return is not visible in this notebook. The sampling helpers call .sample(n=1)
# on these objects and read 'REVIEW_TITLE' / 'PRODUCT_CATEGORY' / 'REVIEW_TEXT'
# (Amazon) and 'REVIEW_TEXT' (gold), so both are presumably pandas DataFrames
# with those columns -- TODO confirm against utils.loader.
data_loader = DataLoader()
data_amazon = data_loader.load_amazon(test_mode=True, deceptive=False)
data_gold = data_loader.load_gold_txt()

  return func(*args, **kwargs)


In [83]:
# Load our model
# The checkpoint path is relative, so it resolves against the notebook's working
# directory. GPT2 is the project wrapper from models.gpt; the meaning of
# full_model=False is defined there and is not visible here.
model_path = 'training/distilgpt-topic2/pytorch_model.bin'
model = GPT2(model_path=model_path, full_model=False, special_tokens=SPECIAL_TOKENS)

In [84]:
# Product categories that a generated review can be conditioned on.
categories = [
    'Apparel',
    'Automotive',
    'Baby',
    'Beauty',
    'Books',
    'Camera',
    'Electronics',
    'Furniture',
    'Grocery',
    'Health & Personal Care',
    'Home',
    'Home Entertainment',
    'Home Improvement',
    'Jewelry',
    'Kitchen',
    'Lawn and Garden',
    'Luggage',
    'Musical Instruments',
    'Office Products',
    'Outdoors',
    'PC',
    'Pet Products',
    'Shoes',
    'Sports',
    'Tools',
    'Toys',
    'Video DVD',
    'Video Games',
    'Watches',
    'Wireless',
]
# Candidate opening words for a review.
# NOTE(review): no visible cell reads this list -- confirm whether it is still needed.
start_words = ['A', 'The', 'We', 'I', 'This', 'I love', 'I hate', '']

In [86]:
# Begin generating samples.
# 25k will be sampled with a random category and a random start word from OPSpam, as it has better grammar.
# 25k will be sampled from the Amazon dataset, using each review's own category and its first 2-5 words as the prompt for GPT to finish.

In [None]:
# Here we sample a random category, and a random start word.
all_reviews = []
while len(all_reviews) < 5000:
    print(f'{len(all_reviews)}/25000')
    prompt, original = sample_start_gold(data_gold, length=1)
    cat = np.random.choice(categories)
    prompt = SPECIAL_TOKENS['bos_token'] + cat + SPECIAL_TOKENS['sep_token'] + prompt
    outputs = model.generate_text(prompt, cat, print_output=False, do_sample=True, max_length=200, num_beams=5, repetition_penalty=5.0, early_stopping=True, num_return_sequences=3)
    for review in outputs:
        if len(review) > 10: # Ensure text generated is text
            all_reviews.append([cat, review])

In [None]:
# Here we sample a random review and use its category and a random length start of sentence prompt.
all_reviews = []
while len(all_reviews) < 5000:
    print(f'{len(all_reviews)}/25000')
    # Sample a random prompt and corresponding category from the dataset and gemerate
    prompt, title, cat, original = sample_start_amazon(data_amazon, length=np.random.randint(2, 5))
    prompt = SPECIAL_TOKENS['bos_token'] + cat + SPECIAL_TOKENS['sep_token'] + prompt
    outputs = model.generate_text(prompt, cat, print_output=False, do_sample=True, max_length=70, num_beams=5, repetition_penalty=5.0, early_stopping=True, num_return_sequences=3)
    for review in outputs:
        if len(review) > 10: # Ensure text generated is text
            all_reviews.append([cat, review])

In [88]:
fake_reviews = pd.DataFrame(all_reviews, columns=['PRODUCT_CATEGORY', 'REVIEW_TEXT'])
fake_reviews = fake_reviews.drop_duplicates('REVIEW_TEXT')
fake_reviews = fake_reviews.reset_index()

In [98]:
fake_reviews_second = pd.DataFrame(all_reviews, columns=['PRODUCT_CATEGORY', 'REVIEW_TEXT'])
fake_reviews_second = fake_reviews_second.drop_duplicates('REVIEW_TEXT')
fake_reviews_second = fake_reviews_second.reset_index()

In [101]:
# Tag every row with the sampling strategy that produced it, so the two sets
# remain distinguishable once they are combined.
fake_reviews['SAMPLE_TYPE'] = 'NON-GUIDED'
fake_reviews_second['SAMPLE_TYPE'] = 'GUIDED'

In [111]:
# Stack both sample sets and discard the 'index' column that reset_index() added
# to each of them.
final_data = pd.concat([fake_reviews, fake_reviews_second]).drop(columns='index')

In [115]:
# Cast every review to str so the text column holds strings only.
final_data['REVIEW_TEXT'] = final_data['REVIEW_TEXT'].astype(str)

In [113]:
# Write the combined dataset to the working directory; index=False leaves the
# DataFrame's row index out of the file.
final_data.to_csv('gpt_generated_data.csv', index=False)