In [1]:
import os
import time
import datetime

import pandas as pd
import numpy as np
import torch
import math

from datasets import load_dataset
from torch.utils.data import Dataset, DataLoader, random_split
from transformers import GPT2LMHeadModel,  GPT2Tokenizer, GPT2Config, DataCollatorForLanguageModeling

In [2]:
model_dir = "./generate_replies_model_new"

In [3]:
# Load the GPT tokenizer.
tokenizer = GPT2Tokenizer.from_pretrained('gpt2-large', bos_token='<|start|>', eos_token='<|end|>', pad_token='<|pad|>')

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [4]:
max_token_length = 150 # Tweet max = 280, 2 tweets * 280 / about 4 when converted to tokens

def tokenize_and_split(examples):
    return tokenizer(
        '<|start|>'+ examples["op_text"] + "{REPLY}" + examples["reply_text"] + '<|end|>',
        truncation=True,
        max_length= max_token_length
    )

In [6]:
train_path = 'final_liked_gpt.csv'
test_path = 'final_liked_gpt_test.csv'

In [7]:
data_files = {"train": "final_liked_gpt.csv", "test": "final_liked_gpt_test.csv"}
dataset_base = load_dataset("csv", data_files=data_files)
dataset_base = dataset_base.map(tokenize_and_split) 

Using custom data configuration default-50f0f9100ec1c76d
Found cached dataset csv (/home/mark/.cache/huggingface/datasets/csv/default-50f0f9100ec1c76d/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317)


  0%|          | 0/2 [00:00<?, ?it/s]

Loading cached processed dataset at /home/mark/.cache/huggingface/datasets/csv/default-50f0f9100ec1c76d/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317/cache-7382ebffc8f037de.arrow
Loading cached processed dataset at /home/mark/.cache/huggingface/datasets/csv/default-50f0f9100ec1c76d/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317/cache-c91bb755908009b9.arrow


In [8]:
dataset = dataset_base["train"].train_test_split(train_size=0.9, seed=42)
dataset["validation"] = dataset.pop("test") # Renames the default feature "test" split to "validation"
dataset["test"] = dataset_base["test"] # Add the "test" feature
dataset

Loading cached split indices for dataset at /home/mark/.cache/huggingface/datasets/csv/default-50f0f9100ec1c76d/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317/cache-7661f3d757345892.arrow and /home/mark/.cache/huggingface/datasets/csv/default-50f0f9100ec1c76d/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317/cache-f75c62a2190596a8.arrow


DatasetDict({
    train: Dataset({
        features: ['op_id', 'reply_id', 'op_text', 'reply_text', 'input_ids', 'attention_mask'],
        num_rows: 87428
    })
    validation: Dataset({
        features: ['op_id', 'reply_id', 'op_text', 'reply_text', 'input_ids', 'attention_mask'],
        num_rows: 9715
    })
    test: Dataset({
        features: ['op_id', 'reply_id', 'op_text', 'reply_text', 'input_ids', 'attention_mask'],
        num_rows: 1008
    })
})

In [9]:
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=False
)

In [10]:
#RELOAD MODEL

#configuration = GPT2Config.from_pretrained(model_dir, output_hidden_states=False)
#model = GPT2LMHeadModel.from_pretrained(model_dir, config=configuration)
#model.resize_token_embeddings(len(tokenizer))

In [11]:
configuration = GPT2Config.from_pretrained('gpt2-large', output_hidden_states=False)
model = GPT2LMHeadModel.from_pretrained("gpt2-large", config=configuration)
model.resize_token_embeddings(len(tokenizer))

Embedding(50260, 1280)

In [14]:
from transformers import Trainer, TrainingArguments,AutoModelWithLMHead

training_args = TrainingArguments(
    output_dir=model_dir,
    overwrite_output_dir=True, 
    num_train_epochs=3, 
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    warmup_steps=5000,
    save_strategy="epoch",
    evaluation_strategy="epoch"
    )

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset["train"],
    eval_dataset=dataset["validation"]
)

In [15]:
trainer.train()

The following columns in the training set don't have a corresponding argument in `GPT2LMHeadModel.forward` and have been ignored: op_text, op_id, reply_id, reply_text. If op_text, op_id, reply_id, reply_text are not expected by `GPT2LMHeadModel.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 87428
  Num Epochs = 3
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 4
  Gradient Accumulation steps = 1
  Total optimization steps = 65571
  Number of trainable parameters = 774033920


Epoch,Training Loss,Validation Loss
1,2.9553,2.886704
2,2.2848,2.726014
3,1.7363,2.779712


The following columns in the evaluation set don't have a corresponding argument in `GPT2LMHeadModel.forward` and have been ignored: op_text, op_id, reply_id, reply_text. If op_text, op_id, reply_id, reply_text are not expected by `GPT2LMHeadModel.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 9715
  Batch size = 4
Saving model checkpoint to ./generate_replies_model_new/checkpoint-21857
Configuration saved in ./generate_replies_model_new/checkpoint-21857/config.json
Model weights saved in ./generate_replies_model_new/checkpoint-21857/pytorch_model.bin
The following columns in the evaluation set don't have a corresponding argument in `GPT2LMHeadModel.forward` and have been ignored: op_text, op_id, reply_id, reply_text. If op_text, op_id, reply_id, reply_text are not expected by `GPT2LMHeadModel.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 9715
  Batch size = 4
Saving model checkpoint to ./ge

TrainOutput(global_step=65571, training_loss=2.427599144793497, metrics={'train_runtime': 19534.5851, 'train_samples_per_second': 13.427, 'train_steps_per_second': 3.357, 'total_flos': 1.318448221836288e+17, 'train_loss': 2.427599144793497, 'epoch': 3.0})

In [16]:
trainer.save_model()

Saving model checkpoint to ./generate_replies_model_new
Configuration saved in ./generate_replies_model_new/config.json
Model weights saved in ./generate_replies_model_new/pytorch_model.bin


In [17]:
# GENERATE TEXT

In [18]:
model.eval()

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50260, 1280)
    (wpe): Embedding(1024, 1280)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0): GPT2Block(
        (ln_1): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
      (1): GPT2Block(
        (ln_1): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout)

In [40]:
device = torch.device("cuda")

def generate_outputs(input_text, nb_seq):
    text_to_generate = input_text + "{REPLY}"
    
    generated_output = torch.tensor(tokenizer.encode(text_to_generate)).unsqueeze(0).to(device)
    
    outputs = model.generate(
            generated_output, 
            do_sample=True,   
            top_k=50, 
            max_length = max_token_length,
            top_p=0.95, 
            num_return_sequences=nb_seq
        )
    return [tokenizer.decode(o, skip_special_tokens=True).split('{REPLY}')[1]  for o in outputs]

In [49]:
tweet_to_reply_to = "Christmas is coming soon! What are you going to do for the holidays?" # Put a tweet to reply to here

decoded_outputs = generate_outputs(tweet_to_reply_to, 1)

for i, output in enumerate(decoded_outputs):
    if len(output) > 1:
        print("{}: {}\n\n".format(i, output))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


0: Hendery has a huge gift for you and I know Christmas isn’t far away s coming 😃😃😃😃

🎄🎄🎄🎄🎄🎄🎄🎄🎄🎄🎄  got some big plans for you and that’s for me and for you!                        .                 


