In [1]:
from utils.loader import DataLoader
import pandas as pd

In [4]:
# Row count of the frame returned with test_mode=True (printed) ...
n_rows_test_mode = len(DataLoader().load_amazon(test_mode=True, all=True))
print(n_rows_test_mode)
# ... followed by the row count of the default (test_mode unset) frame, shown as the cell result.
len(DataLoader().load_amazon(all=True))

5250


15750

In [6]:
# Shuffle the raw review table and cut it into a test part (first `test_ratio`
# share of the shuffled rows) and a train part (the remainder).
test_ratio = 0.25
split_seed = 42  # fixed seed so re-running the cell reproduces the same split
# NOTE(review): `loader` is not used in this cell; kept in case other cells rely on it.
loader = DataLoader()
truth_data = pd.read_table('data/amazon_reviews/amazon_reviews.txt')
truth_data = truth_data.sample(frac=1, random_state=split_seed)  # shuffle rows
# Compute the boundary once so both slices are guaranteed to use the same cut point.
n_test_rows = int(test_ratio * len(truth_data))
truth_data_test = truth_data.iloc[:n_test_rows]
truth_data_train = truth_data.iloc[n_test_rows:]

In [18]:
# Write both splits to the working directory, without the pandas index column.
# NOTE(review): to_csv defaults to a comma delimiter, whereas the source file was
# read with pd.read_table (tab-delimited by default) — confirm the consumer expects commas.
for split_frame, out_path in (
    (truth_data_train, 'amazon_reviews.txt'),
    (truth_data_test, 'amazon_reviews2.txt'),
):
    split_frame.to_csv(out_path, index=False)

In [2]:
from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM

In [13]:
from models.gpt import GPT2

# Restore the fine-tuned causal LM from a local checkpoint directory and wrap it
# in the project's GPT2 helper.
checkpoint_dir = 'checkpoints/distilgpt2/checkpoint-6336'
model = AutoModelForCausalLM.from_pretrained(checkpoint_dir)
gpt = GPT2(model)

In [18]:
# Generate continuations of a short prompt.
# presumably the second argument is the number of samples (three texts come back) — confirm in models.gpt
prompt = 'My favourite'
n_samples = 3
gpt.generate_text(prompt, n_samples)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


['My favourite game of the year in the history of this game.  It feels as though the player is constantly working to improve the game and that was true of this game.  The game also feels less action and more character-driven as the player progresses.  You have enough time to be in the mood to really appreciate the game and the quality of each piece of dialogue being cut in an action.',
 'My favourite, soft drink. This is a perfect blend for coffee and tea.  I really enjoy it.',
 'My favourite chocolate cake I have tried but it was better.  It seems to be made from wood and has quite a bit of room, so it was nice to have it sitting in an area while making it.  We really enjoyed the texture and the simplicity of the cake.']

In [7]:
import os
# Set before `transformers` is imported below.
# NOTE(review): presumably this stops the Trainer from reporting to Weights & Biases — confirm.
os.environ["WANDB_DISABLED"] = "true"

from transformers import AutoTokenizer, DataCollatorForLanguageModeling, AutoModelForCausalLM, TrainingArguments, \
	Trainer, DataCollatorWithPadding
import datasets
import torch
from utils.loader import DataLoader


# Settings shared by the cells below:
#   block_size - max_length passed to the tokenizer and chunk length in group_texts
#   val_ratio  - share of the shuffled rows held out for validation
block_size = 1024
val_ratio = 0.2

# Fast tokenizer for the distilgpt2 vocabulary
tokenizer = AutoTokenizer.from_pretrained('distilgpt2', use_fast=True)


def df_to_dataset_obj(dataframe, columns):
	"""Convert selected DataFrame columns into a `datasets.Dataset`.

	The pandas index is not carried into the dataset, and the columns
	'LABEL' and 'REVIEW_TEXT' are renamed to 'labels' and 'text'.

	Args:
		dataframe: pandas DataFrame holding at least the requested columns.
		columns: list of column names to keep; it must contain 'LABEL' and
			'REVIEW_TEXT', since both are renamed below.

	Returns:
		A `datasets.Dataset` with the renamed columns.
	"""
	# preserve_index=False keeps the index out of the dataset for every kind of
	# index. Dropping '__index_level_0__' afterwards only works when that column
	# exists; for a frame with a default RangeIndex it is never created and the
	# removal raises.
	dataset = datasets.Dataset.from_pandas(dataframe[columns], preserve_index=False)
	dataset = dataset.rename_column('LABEL', 'labels')
	dataset = dataset.rename_column('REVIEW_TEXT', 'text')

	return dataset


def tokenize_data(inputs):
	"""Tokenize review text and attach causal-LM labels.

	Uses the module-level `tokenizer` and `block_size`: every text is truncated
	or padded to exactly `block_size` tokens.

	Args:
		inputs: mapping with a 'text' entry, either a single string or a list
			of strings (the batched form used with `Dataset.map(batched=True)`).

	Returns:
		The tokenizer output with an added 'labels' entry. Labels equal
		`input_ids` on real tokens and -100 on padding positions, so padding
		does not contribute to the language-modelling loss.
	"""
	tokens = tokenizer(inputs['text'], padding='max_length', truncation=True, max_length=block_size)

	def mask_padding(ids, mask):
		# -100 is the label value ignored by the loss; keep the token id wherever
		# the attention mask marks a real token.
		return [token if keep else -100 for token, keep in zip(ids, mask)]

	input_ids = tokens['input_ids']
	attention_mask = tokens['attention_mask']
	if input_ids and isinstance(input_ids[0], (list, tuple)):
		# Batched call: one id list per example.
		tokens['labels'] = [mask_padding(ids, mask) for ids, mask in zip(input_ids, attention_mask)]
	else:
		# Single example: a flat list of ids.
		tokens['labels'] = mask_padding(input_ids, attention_mask)
	return tokens


def group_texts(examples):
    """Concatenate a batch of tokenized examples and re-split it into fixed-size blocks.

    Uses the module-level `block_size` as the chunk length.

    Args:
        examples: mapping from feature name (e.g. 'input_ids') to a list of
            per-example lists; all features are expected to have matching lengths.

    Returns:
        A dict with the same keys, each holding a list of `block_size`-long
        chunks, plus 'labels' as a copy of the chunked 'input_ids'. Tokens
        past the last full block are dropped.
    """
    from itertools import chain

    # Flatten every feature in linear time (sum(list_of_lists, []) is quadratic).
    concatenated_examples = {k: list(chain.from_iterable(examples[k])) for k in examples.keys()}
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # Drop the small remainder so every chunk is exactly block_size long; padding
    # the tail would be an alternative if the model supported it.
    total_length = (total_length // block_size) * block_size
    # Split into consecutive chunks of block_size.
    result = {
        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    }
    result["labels"] = result["input_ids"].copy()
    return result

# Load the reviews and shuffle them with a fixed seed so the train/validation
# split is the same on every run.
data_loader = DataLoader()
truth_data = data_loader.load_amazon(deceptive=False)
truth_data = truth_data.sample(frac=1, random_state=42)
# Compute the boundary once so both slices share the same cut point.
n_val_rows = int(val_ratio * len(truth_data))
truth_data_val = truth_data.iloc[:n_val_rows]
truth_data_train = truth_data.iloc[n_val_rows:]

# Keep only the label and text columns and convert to Dataset objects
dataset_truth_val = df_to_dataset_obj(truth_data_val, ['LABEL', 'REVIEW_TEXT'])
dataset_truth_train = df_to_dataset_obj(truth_data_train, ['LABEL', 'REVIEW_TEXT'])

# The distilgpt2 tokenizer defines no pad token by default; reuse EOS so that
# padding='max_length' in tokenize_data works.
tokenizer.pad_token = tokenizer.eos_token
tokenized_val = dataset_truth_val.map(tokenize_data, batched=True, remove_columns=['text'])
tokenized_train = dataset_truth_train.map(tokenize_data, batched=True, remove_columns=['text'])

# Aliases for the tokenized splits
lm_train = tokenized_train
lm_val = tokenized_val


  0%|          | 0/2 [00:00<?, ?ba/s]

  0%|          | 0/7 [00:00<?, ?ba/s]

In [10]:
# Display the training Dataset summary (feature names and row count).
lm_train

Dataset({
    features: ['labels', 'input_ids', 'attention_mask'],
    num_rows: 6333
})

In [None]:
# Sanity check: turn the token ids of one training example back into text.
sample_input_ids = lm_train[11]["input_ids"]
tokenizer.decode(sample_input_ids)