In [None]:
import os
import time
import datetime

import pandas as pd
import numpy as np
import torch

from datasets import load_dataset
from torch.utils.data import Dataset, DataLoader, random_split
from transformers import GPT2LMHeadModel,  GPT2Tokenizer, GPT2Config, DataCollatorForLanguageModeling

In [None]:
# Output directory handed to TrainingArguments further down; the commented-out
# "reload model" cell also reads the fine-tuned weights back from this path.
model_dir = "./generate_replies_model"

In [None]:
# Pretrained GPT-2 tokenizer, with custom begin / end / padding markers
# registered as its special tokens.
tokenizer = GPT2Tokenizer.from_pretrained(
    'gpt2',
    bos_token='<|start|>',
    eos_token='<|end|>',
    pad_token='<|pad|>',
)

In [None]:
def tokenize_and_split(examples):
    """Tokenize one dataset row after wrapping its text in start/end markers.

    Reads ``examples["op_with_reply_text"]``, surrounds it with the literal
    ``<|start|>`` and ``<|end|>`` strings and passes the result to the
    module-level ``tokenizer`` with truncation enabled.

    Args:
        examples: Mapping that holds the row's text under
            ``"op_with_reply_text"``.

    Returns:
        Whatever the ``tokenizer`` call returns for the wrapped text.
    """
    # Sizing rationale kept from the original: tweet max = 280, so 2 tweets
    # + "{REPLY}" + the start and end markers.
    # NOTE(review): max_length is presumably counted in tokens, not
    # characters -- confirm the budget is the intended one.
    length_cap = 583
    wrapped_text = '<|start|>' + examples["op_with_reply_text"] + '<|end|>'
    return tokenizer(wrapped_text, truncation=True, max_length=length_cap)

In [None]:
# CSV file names for the "train" and "test" splits.
train_path = 'final_liked_gpt.csv'
test_path = 'final_liked_gpt_test.csv'

In [None]:
# Map each split name to its CSV, reusing the path variables defined in the
# previous cell so every file name lives in exactly one place.
data_files = {"train": train_path, "test": test_path}

# Load both CSVs, then tokenize row by row; the two id columns are dropped
# from the result.
dataset_base = load_dataset("csv", data_files=data_files).map(
    tokenize_and_split,
    remove_columns=['op_id', 'reply_id'],
)

In [None]:
# Carve a validation set out of the training CSV: 80% stays in "train", the
# rest is held out. The fixed seed keeps the split repeatable.
dataset = dataset_base["train"].train_test_split(train_size=0.8, seed=42)
dataset["validation"] = dataset.pop("test") # Rename the held-out split that train_test_split calls "test" to "validation"
dataset["test"] = dataset_base["test"] # Attach the "test" split that was loaded from the separate test CSV
dataset

In [None]:
# mlm=False turns off masked-language-model masking, which is what a causal
# model such as GPT-2 needs.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

In [None]:
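# RELOAD MODEL: uncomment the two lines below to load a previously fine-tuned
# model from `model_dir` instead of starting from the base 'gpt2' weights
# (and skip the next cell, which would overwrite `model`).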

#configuration = GPT2Config.from_pretrained(model_dir, output_hidden_states=False)
#model = GPT2LMHeadModel.from_pretrained(model_dir, config=configuration)

In [None]:
# Start from the pretrained GPT-2 config, but record the ids of the custom
# <|start|> / <|end|> / <|pad|> tokens in it. Otherwise the config keeps the
# stock GPT-2 bos/eos ids and has no pad id, so generation would not treat
# <|end|> as the stop token and the saved model would carry the wrong ids.
configuration = GPT2Config.from_pretrained(
    'gpt2',
    bos_token_id=tokenizer.bos_token_id,
    eos_token_id=tokenizer.eos_token_id,
    pad_token_id=tokenizer.pad_token_id,
    output_hidden_states=False,
)
model = GPT2LMHeadModel.from_pretrained("gpt2", config=configuration)

# The tokenizer gained extra special tokens, so grow the embedding matrix to
# match its vocabulary size.
model.resize_token_embeddings(len(tokenizer))

In [None]:
# Run on the GPU when one is available; otherwise fall back to the CPU so the
# notebook still executes. `device` is kept as a name so later cells can move
# their input tensors to the same place as the model.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

In [None]:
from transformers import Trainer, TrainingArguments,AutoModelWithLMHead

training_args = TrainingArguments(
    # Where checkpoints go; existing contents may be overwritten.
    output_dir=model_dir,
    overwrite_output_dir=True,
    # Length of the run and learning-rate warmup.
    num_train_epochs=5,
    warmup_steps=500,
    # Per-device batch sizes.
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # Evaluate and checkpoint on the same 1500-step cadence.
    evaluation_strategy="steps",
    eval_steps=1500,
    save_steps=1500,
)

# Trains on the "train" split and evaluates on the "validation" split.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["validation"],
    data_collator=data_collator,
)

In [None]:
# Run fine-tuning with the arguments configured above; the call's return value
# is shown as the cell output.
trainer.train()

In [None]:
# Persist the trained model. No path is passed here, so it presumably goes to
# the Trainer's configured output_dir (model_dir) -- confirm.
trainer.save_model()

In [None]:
# GENERATE TEXT

In [None]:
# Put the model in evaluation mode before generating text.
model.eval()

In [None]:
def generate_outputs(input_text, nb_seq):
    """Sample ``nb_seq`` replies to ``input_text`` from the module-level model.

    The prompt is ``<|start|>`` + tweet + ``{REPLY}``: the same start marker
    that is prepended at tokenization time, followed by the reply marker, so
    the model continues with the reply part.

    Args:
        input_text: Text of the tweet to reply to.
        nb_seq: Number of sequences to sample.

    Returns:
        A list with one string per sampled sequence: the text that follows the
        first ``{REPLY}`` marker (up to the next marker, if the model emits
        another one), or ``""`` if the decoded text contains no marker.
    """
    reply_marker = "{REPLY}"
    text_to_generate = "<|start|>" + input_text + reply_marker

    # Encode the full prompt (not just the raw tweet) and put it on whatever
    # device the model currently lives on.
    prompt_ids = torch.tensor(tokenizer.encode(text_to_generate)).unsqueeze(0).to(model.device)

    outputs = model.generate(
        prompt_ids,
        do_sample=True,
        top_k=50,
        top_p=0.95,
        max_length=567,
        num_return_sequences=nb_seq,
        # Stop at the custom <|end|> token and pad finished sequences with the
        # custom <|pad|> token rather than the stock GPT-2 ids.
        eos_token_id=tokenizer.eos_token_id,
        pad_token_id=tokenizer.pad_token_id,
    )

    replies = []
    for sequence in outputs:
        decoded = tokenizer.decode(sequence, skip_special_tokens=True)
        pieces = decoded.split(reply_marker)
        # Guard against a missing marker instead of raising IndexError.
        replies.append(pieces[1] if len(pieces) > 1 else "")
    return replies

In [None]:
# Put a tweet to reply to here.
tweet_to_reply_to = "What are you going to do for the holidays?"

# Sample three candidate replies for that tweet.
decoded_outputs = generate_outputs(tweet_to_reply_to, 3)

# Print every candidate with its index, skipping those of at most one character.
for idx, reply in enumerate(decoded_outputs):
    if len(reply) <= 1:
        continue
    print(f"{idx}: {reply}\n\n")

In [None]:
#from huggingface_hub import notebook_login

#notebook_login()

In [None]:
#model.push_to_hub("twitter_reply_generator")