In [20]:
from transformers import AutoTokenizer, DataCollatorForLanguageModeling, AutoModelForCausalLM, TrainingArguments, Trainer
#from models.gpt import GPT2
from utils.loader import DataLoader
import datasets
import torch

In [4]:
def df_to_dataset_obj(dataframe, columns):
    """Convert selected DataFrame columns into a Hugging Face ``Dataset``.

    The pandas index is not carried over, and the review columns are renamed
    to the names used by the rest of the notebook
    (``LABEL`` -> ``labels``, ``REVIEW_TEXT`` -> ``text``).

    Args:
        dataframe: pandas DataFrame holding the review data.
        columns: list of column names to keep. It must include ``'LABEL'``
            and ``'REVIEW_TEXT'``, because both are renamed below.

    Returns:
        A ``datasets.Dataset`` built from ``dataframe[columns]`` with the two
        renamed columns and no index column.
    """
    # Dropping the index at conversion time (instead of removing an
    # '__index_level_0__' column afterwards) also works for frames with a
    # default RangeIndex or a named index, where that column does not exist
    # and removing it would raise.
    dataset = datasets.Dataset.from_pandas(dataframe[columns], preserve_index=False)
    dataset = dataset.rename_column('LABEL', 'labels')
    dataset = dataset.rename_column('REVIEW_TEXT', 'text')

    return dataset

In [5]:
def tokenize_data(inputs, max_length=None):
    """Tokenize a batch of review texts into fixed-length sequences.

    Relies on the notebook-level ``tokenizer``, which must already exist and
    have a pad token assigned before this function runs.

    Args:
        inputs: batch mapping with a ``'text'`` entry, as supplied by
            ``Dataset.map(..., batched=True)``.
        max_length: length every example is padded/truncated to. ``None``
            (the default) leaves the choice to the tokenizer, exactly as
            before this parameter existed. Pass a smaller value (e.g. through
            ``Dataset.map(..., fn_kwargs={'max_length': 256})``) to avoid
            padding short reviews all the way to the model maximum.

    Returns:
        The tokenizer output for the batch.
    """
    return tokenizer(
        inputs['text'],
        padding='max_length',
        truncation=True,
        max_length=max_length,
    )

In [7]:
val_split = 0.2  # fraction of the rows held out for validation
seed = 42        # fixed so the shuffle, and therefore the split, is reproducible

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained('distilgpt2')

# Load datasets
loader = DataLoader()
#dec_data = loader.load_amazon(deceptive=True)
truth_data = loader.load_amazon()

# Shuffle all rows before splitting; random_state makes re-runs produce the
# same train/validation partition.
truth_data = truth_data.sample(frac=1, random_state=seed)

# Compute the boundary once from val_split so both slices always agree.
n_val = int(val_split * len(truth_data))
truth_data_val = truth_data.iloc[:n_val]
truth_data_train = truth_data.iloc[n_val:]

In [8]:
# Turn both pandas splits into Dataset objects, keeping only the label and
# review-text columns. (The deceptive-review data is not converted here.)
dataset_truth_val, dataset_truth_train = [
    df_to_dataset_obj(split_frame, ['LABEL', 'REVIEW_TEXT'])
    for split_frame in (truth_data_val, truth_data_train)
]

In [14]:
# tokenize_data pads every example, so the tokenizer needs a pad token;
# reuse the end-of-sequence token for that purpose.
tokenizer.pad_token = tokenizer.eos_token

# Tokenize validation first, then train, in batches. The raw 'labels' and
# 'text' columns are dropped so only the tokenizer's output columns remain.
tokenized_val, tokenized_train = (
    split_dataset.map(
        tokenize_data,
        batched=True,
        remove_columns=['labels', 'text'],
    )
    for split_dataset in (dataset_truth_val, dataset_truth_train)
)

100%|██████████| 11/11 [00:02<00:00,  3.84ba/s]


In [23]:
# Drop the untokenized Dataset objects and the pandas frames; the training
# cells below only reference the tokenized splits.
del dataset_truth_val, dataset_truth_train
del truth_data_val, truth_data_train, truth_data

In [24]:
# Use the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token

# mlm=False turns off masked-language-model masking, so the collator prepares
# batches for causal language modelling: labels are a copy of the input ids
# with padding positions ignored. The one-token shift between inputs and
# targets is applied inside the model, not by the collator.
# NOTE(review): because pad == eos here, real end-of-sequence tokens are
# presumably ignored in the loss as well - confirm this is acceptable.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
)

In [25]:
# Pretrained distilgpt2 checkpoint, loaded as a causal language model.
model = AutoModelForCausalLM.from_pretrained("distilgpt2")

# Training configuration; every option not listed keeps its library default.
training_args = TrainingArguments(
    output_dir="sample_data",
    per_device_train_batch_size=4,
    learning_rate=2e-5,
    weight_decay=0.01,
    evaluation_strategy="epoch",
)

# Wire the model, configuration, tokenized splits and collator together.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
)

Downloading: 100%|██████████| 336M/336M [00:34<00:00, 10.1MB/s] 


In [26]:
# Start training with the Trainer built in the previous cell. The call is the
# last expression of the cell, so its return value is what the cell displays.
trainer.train()

***** Running training *****
  Num examples = 10500
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 3939
  0%|          | 1/3939 [01:03<69:05:48, 63.17s/it]