In [1]:
from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM, AutoConfig
from utils.loader import DataLoader
import numpy as np
import pandas as pd
import torch

In [2]:
# Checkpoint name used by get_tokenier() and get_model() below.
MODEL = 'distilgpt2'

# Special tokens that can be registered on the tokenizer; the category-conditioned
# prompts further down are built from the bos/sep entries.
SPECIAL_TOKENS = dict(
    bos_token="<|BOS|>",
    eos_token="<|EOS|>",
    unk_token="<|UNK|>",
    pad_token="<|PAD|>",
    sep_token="<|SEP|>",
)

# Amazon review frame used as the prompt source in the generation cells.
# NOTE(review): presumably the non-deceptive reviews in a reduced "test mode" subset;
# confirm against utils.loader.DataLoader.load_amazon.
data_loader = DataLoader()
truth_data = data_loader.load_amazon(test_mode=True, deceptive=False)

In [3]:
def get_tokenier(special_tokens=None):
    """Load the fast tokenizer for ``MODEL`` and optionally register special tokens.

    Args:
        special_tokens: Optional mapping handed to ``tokenizer.add_special_tokens``.
            When falsy, the tokenizer is returned exactly as loaded.

    Returns:
        The tokenizer loaded via ``AutoTokenizer.from_pretrained``.
    """
    tokenizer = AutoTokenizer.from_pretrained(MODEL, use_fast=True)

    # Nothing to register: hand back the stock tokenizer.
    if not special_tokens:
        return tokenizer

    tokenizer.add_special_tokens(special_tokens)
    print("Special tokens added")
    return tokenizer

def get_model(tokenizer, special_tokens=None, load_model_path=None):
    """Build a causal LM for ``MODEL`` whose config matches the given tokenizer.

    Args:
        tokenizer: Tokenizer whose token ids are copied into the model config.
        special_tokens: When truthy, the bos/eos/sep/pad ids are taken from the
            tokenizer and the embedding matrix is resized to ``len(tokenizer)``.
            When falsy, only ``pad_token_id`` is set, to the tokenizer's EOS id.
        load_model_path: Optional path to a state-dict file that is loaded into
            the model after construction.

    Returns:
        The model. No device transfer is performed here.
    """
    config_kwargs = {'output_hidden_states': False}
    if special_tokens:
        config_kwargs.update(
            bos_token_id=tokenizer.bos_token_id,
            eos_token_id=tokenizer.eos_token_id,
            sep_token_id=tokenizer.sep_token_id,
            pad_token_id=tokenizer.pad_token_id,
        )
    else:
        # No dedicated pad token in this mode: reuse EOS as padding.
        config_kwargs['pad_token_id'] = tokenizer.eos_token_id
    config = AutoConfig.from_pretrained(MODEL, **config_kwargs)

    model = AutoModelForCausalLM.from_pretrained(MODEL, config=config)

    if special_tokens:
        # The vocabulary grew, so the embedding matrix must grow with it.
        model.resize_token_embeddings(len(tokenizer))

    if load_model_path:
        # NOTE(review): torch.load may unpickle arbitrary objects depending on the
        # torch version; only point this at trusted checkpoints.
        state_dict = torch.load(load_model_path, map_location=torch.device('cpu'))
        model.load_state_dict(state_dict)

    return model

In [7]:
# Gold model: stock tokenizer (no extra special tokens) plus fine-tuned weights.
tokenizer = get_tokenier()
# The checkpoint path has to be passed by keyword. get_model's second positional
# parameter is `special_tokens`, so a positional path is treated as a truthy
# special-token spec and `load_model_path` stays None, meaning the fine-tuned
# weights are never loaded.
model = get_model(tokenizer, load_model_path='training/gpt-gold/pytorch_model.bin')

Gold

In [None]:
# Load the "gold" review frame; the cells below sample prompts from its REVIEW_TEXT column.
gold_data = data_loader.load_gold_txt()

In [25]:
def sample_start(df, length=5):
    """Draw one random row and return the opening words of its review plus the full text.

    Args:
        df: DataFrame with a ``REVIEW_TEXT`` column.
        length: Number of space-separated words to keep for the opening.

    Returns:
        Tuple ``(opening, text)`` where ``opening`` is the first ``length`` words
        joined by single spaces and ``text`` is the untouched cell value.
    """
    picked_row = df.sample(n=1)
    text = list(picked_row['REVIEW_TEXT'])[0]
    # str() guards against non-string cells before splitting on spaces.
    words = str(text).split(' ')
    opening = ' '.join(words[:length])
    return opening, text

In [None]:
# Seed generation with the first 5-7 words of a randomly drawn gold review.
prompt, original = sample_start(gold_data, length=np.random.randint(5, 8))

tokenized_prompt = tokenizer.encode('Here is my review: ' + prompt, return_tensors='pt')
# `repetition_penalty` was previously misspelled, so the penalty was never applied
# (and recent transformers releases reject unknown generate() kwargs).
out = model.generate(
    tokenized_prompt,
    max_length=50,
    top_k=50,
    top_p=0.7,
    num_return_sequences=3,
    do_sample=True,
    early_stopping=True,
    repetition_penalty=1.5,
)
print(f'Text: {original[:300]}...\n')
for o in out:
    gen_txt = tokenizer.decode(o, skip_special_tokens=True)
    print(gen_txt)

Amazon

In [20]:
# Amazon model: tokenizer extended with SPECIAL_TOKENS, embeddings resized to match,
# then the fine-tuned state dict is loaded on top.
tokenizer = get_tokenier(special_tokens=SPECIAL_TOKENS)
model = get_model(
    tokenizer,
    special_tokens=SPECIAL_TOKENS,
    load_model_path='training/distilgpt-topic2/pytorch_model.bin',
)
# Alternative kept for reference: load a saved checkpoint directory directly with
# AutoModelForCausalLM.from_pretrained('training/distilgpt-8').

Special tokens added


In [1]:
import torch

In [5]:
def sample_start(df, length=5):
    """Draw one random row and return its review opening, title, category and full text.

    Note: this notebook defines another ``sample_start`` earlier that returns only
    two values; whichever cell ran last wins.

    Args:
        df: DataFrame with ``REVIEW_TITLE``, ``PRODUCT_CATEGORY`` and ``REVIEW_TEXT`` columns.
        length: Number of space-separated words to keep for the opening.

    Returns:
        Tuple ``(opening, title, category, text)``; ``opening`` is the first
        ``length`` words of the review joined by single spaces.
    """
    picked_row = df.sample(n=1)
    title = list(picked_row['REVIEW_TITLE'])[0]
    category = list(picked_row['PRODUCT_CATEGORY'])[0]
    text = list(picked_row['REVIEW_TEXT'])[0]
    # str() guards against non-string cells before splitting on spaces.
    opening = ' '.join(str(text).split(' ')[:length])
    return opening, title, category, text

In [18]:
# Generation without a category prefix.
# The sampled opening, title and category are discarded: the prompt is overwritten
# with a fixed string and only `original` from the draw is printed.
prompt, title, cat, original = sample_start(truth_data, length=np.random.randint(5, 8))
prompt = 'I love this.'

generation_kwargs = dict(
    do_sample=True,
    max_length=70,
    num_beams=5,
    repetition_penalty=5.0,
    early_stopping=True,
    num_return_sequences=3,
)
tokenized_prompt = tokenizer.encode(prompt, return_tensors='pt')
out = model.generate(tokenized_prompt, **generation_kwargs)
print(f'Text: {original[:200]} \nPrompt: {prompt}...\n')
for o in out:
    gen_txt = tokenizer.decode(o, skip_special_tokens=True)
    # Keep everything up to the last full stop so the sample ends on a whole sentence.
    print(gen_txt.rpartition('.')[0] + '.\n')

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Text: This is a handy holder/dispenser for wipes. The opening is large enough that it's easy to insert the wipes. Although the handle would be useful for a stroller, I like to put mine in the glove compartm 
Prompt: I love this....

I love this.  I have been using it for over a year now and am very pleased with the quality of the product.

I love this.  I have been using it for a couple of years now and am very happy with it.

I love this.  It is a little small and easy to put together.  I am very happy with it.



In [22]:
# Generation conditioned on a product category.
# Everything returned by the draw is either overwritten or unused below; the category
# and prompt are fixed by hand.
prompt, title, cat, original = sample_start(truth_data, length=np.random.randint(5, 8))
cat = 'Books'
prompt = 'I love this.'
# Prompt layout: <bos> category <sep> text
prompt = ''.join([SPECIAL_TOKENS['bos_token'], cat, SPECIAL_TOKENS['sep_token'], prompt])
# Number of leading characters to strip from the decoded text (the category name).
pos = len(cat)

generation_kwargs = dict(
    do_sample=True,
    max_length=70,
    num_beams=5,
    repetition_penalty=5.0,
    early_stopping=True,
    num_return_sequences=3,
)
tokenized_prompt = tokenizer.encode(prompt, return_tensors='pt')
out = model.generate(tokenized_prompt, **generation_kwargs)
print(f'Category: {cat}  \nPrompt: {prompt}...\n')
for o in out:
    gen_txt = tokenizer.decode(o, skip_special_tokens=True)
    # Drop the category prefix, then keep everything up to the last full stop.
    print(gen_txt[pos:].rpartition('.')[0] + '.\n')

Category: Books  
Prompt: <|BOS|>Books<|SEP|>I love this....

I love this.  It's the best book I've ever read.  The author is a true storyteller and has done everything he can to make my life so much easier.

I love this.  I read a lot of books on how to make your life so much easier.<br /><br />This book is not for the faint of heart.

I love this.



In [None]:
import os
os.environ["WANDB_DISABLED"] = "true"

from transformers import AutoTokenizer, DataCollatorForLanguageModeling, AutoModelForCausalLM, TrainingArguments, \
	Trainer, DataCollatorWithPadding
import datasets
import torch
from utils.loader import DataLoader


# Preprocessing settings: fraction of rows held out for validation, and the fixed
# token length every example is padded/truncated to.
val_ratio = 0.2
block_size = 1024

# Fast tokenizer for the distilgpt2 checkpoint, used by tokenize_data below.
tokenizer = AutoTokenizer.from_pretrained('distilgpt2', use_fast=True)


def df_to_dataset_obj(dataframe, columns):
	"""Convert selected DataFrame columns into a ``datasets.Dataset``.

	The ``LABEL`` column is renamed to ``labels`` and ``REVIEW_TEXT`` to ``text``,
	so ``columns`` must contain both names.

	Args:
		dataframe: Source pandas DataFrame.
		columns: List of column names to keep.

	Returns:
		A ``datasets.Dataset`` with the renamed columns and no index column.
	"""
	# preserve_index=False keeps the pandas index out of the dataset entirely.
	# Removing '__index_level_0__' afterwards would raise for frames with a default
	# RangeIndex, because no such column is created for them.
	dataset = datasets.Dataset.from_pandas(dataframe[columns], preserve_index=False)
	dataset = dataset.rename_column('LABEL', 'labels')
	dataset = dataset.rename_column('REVIEW_TEXT', 'text')

	return dataset


def tokenize_data(inputs):
	"""Tokenize the ``text`` field and attach causal-LM labels.

	Every sequence is padded or truncated to ``block_size`` tokens, and ``labels``
	is set to a copy of ``input_ids``.

	NOTE(review): the labels therefore include the padded positions; confirm that
	the collator or loss used downstream masks them if that is not intended.

	Args:
		inputs: Mapping with a ``text`` entry, as passed by ``Dataset.map``.

	Returns:
		The tokenizer output with an added ``labels`` entry.
	"""
	encoded = tokenizer(
		inputs['text'],
		max_length=block_size,
		truncation=True,
		padding='max_length',
	)
	encoded['labels'] = encoded['input_ids'].copy()
	return encoded


def group_texts(examples, chunk_size=None):
    """Concatenate every column of a tokenized batch and re-split it into fixed-size chunks.

    Args:
        examples: Mapping from column name to a list of token lists. It must contain
            ``input_ids``. All columns are cut using the concatenated length of the
            first column, so they are expected to have matching lengths.
        chunk_size: Length of each output chunk. Defaults to the module-level
            ``block_size`` when omitted, which keeps single-argument callers
            (such as ``Dataset.map``) working unchanged.

    Returns:
        Dict with the same columns, each a list of ``chunk_size``-long chunks, plus
        ``labels`` as a copy of the chunked ``input_ids``. Tokens beyond the last
        full chunk are dropped.
    """
    from itertools import chain

    if chunk_size is None:
        chunk_size = block_size

    # chain.from_iterable flattens in linear time; sum(lists, []) re-copies the
    # growing accumulator on every step and is quadratic in the batch size.
    concatenated_examples = {k: list(chain.from_iterable(examples[k])) for k in examples.keys()}
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # Drop the small remainder so that every chunk is exactly chunk_size long.
    total_length = (total_length // chunk_size) * chunk_size
    # Split into chunks of chunk_size.
    result = {
        k: [t[i : i + chunk_size] for i in range(0, total_length, chunk_size)]
        for k, t in concatenated_examples.items()
    }
    result["labels"] = result["input_ids"].copy()
    return result

# Load the Amazon reviews (deceptive=False) and shuffle them with a fixed seed so
# the train/validation split is the same on every run.
SPLIT_SEED = 42
data_loader = DataLoader()
truth_data = data_loader.load_amazon(deceptive=False)
truth_data = truth_data.sample(frac=1, random_state=SPLIT_SEED)

# The first `val_ratio` share of the shuffled rows is held out for validation.
n_val = int(val_ratio * len(truth_data))
truth_data_val = truth_data.iloc[:n_val]
truth_data_train = truth_data.iloc[n_val:]

# Convert to Dataset objects
dataset_truth_val = df_to_dataset_obj(truth_data_val, ['LABEL', 'REVIEW_TEXT'])
dataset_truth_train = df_to_dataset_obj(truth_data_train, ['LABEL', 'REVIEW_TEXT'])

# tokenize_data pads to max_length, which needs a pad token; EOS is reused for that.
tokenizer.pad_token = tokenizer.eos_token
tokenized_val = dataset_truth_val.map(tokenize_data, batched=True, remove_columns=['text'])
tokenized_train = dataset_truth_train.map(tokenize_data, batched=True, remove_columns=['text'])

# The tokenized sets are used directly as the LM datasets (no regrouping step).
lm_train = tokenized_train
lm_val = tokenized_val


  0%|          | 0/2 [00:00<?, ?ba/s]

  0%|          | 0/7 [00:00<?, ?ba/s]

In [None]:
# Display the training dataset summary (feature names and row count).
lm_train

Dataset({
    features: ['labels', 'input_ids', 'attention_mask'],
    num_rows: 6333
})

In [None]:
# Decode one tokenized training example back to text to eyeball the tokenization and padding.
tokenizer.decode(lm_train[11]["input_ids"])