In [1]:
import os
import time
import datetime

import pandas as pd
import numpy as np
import torch
import math

from datasets import load_dataset
from torch.utils.data import Dataset, DataLoader, random_split
from transformers import GPT2LMHeadModel,  GPT2Tokenizer, GPT2Config, DataCollatorForLanguageModeling

In [2]:
model_dir = "./generate_replies_model_new"
finetuned_model_name = "gpt2-large"

In [3]:
# Load the GPT tokenizer.
tokenizer = GPT2Tokenizer.from_pretrained(finetuned_model_name, bos_token='<|start|>', eos_token='<|end|>', pad_token='<|pad|>')

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [4]:
max_token_length = 260 # Tweet max = 280, 2 tweets + "{REPLY}" divided by about 2 when converted to tokens

def tokenize_and_split(examples):
    return tokenizer(
        '<|start|>'+ examples["op_text"] + "{REPLY}" + examples["reply_text"] + '<|end|>',
        truncation=True,
        max_length= max_token_length
    )

In [6]:
train_path = 'final_liked_gpt.csv'
test_path = 'final_liked_gpt_test.csv'

In [7]:
data_files = {"train": "final_liked_gpt.csv", "test": "final_liked_gpt_test.csv"}
dataset_base = load_dataset("csv", data_files=data_files)
dataset_base = dataset_base.map(tokenize_and_split) #, remove_columns=['op_id','reply_id'])

Using custom data configuration default-421c9a74140117f8


Downloading and preparing dataset csv/default to /home/mark/.cache/huggingface/datasets/csv/default-421c9a74140117f8/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317...


Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

0 tables [00:00, ? tables/s]

  return pd.read_csv(xopen(filepath_or_buffer, "rb", use_auth_token=use_auth_token), **kwargs)


0 tables [00:00, ? tables/s]

Dataset csv downloaded and prepared to /home/mark/.cache/huggingface/datasets/csv/default-421c9a74140117f8/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317. Subsequent calls will reuse this data.


  return pd.read_csv(xopen(filepath_or_buffer, "rb", use_auth_token=use_auth_token), **kwargs)


  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/180391 [00:00<?, ?ex/s]

  0%|          | 0/2169 [00:00<?, ?ex/s]

In [8]:
dataset = dataset_base["train"].train_test_split(train_size=0.9, seed=42)
dataset["validation"] = dataset.pop("test") # Renames the default feature "test" split to "validation"
dataset["test"] = dataset_base["test"] # Add the "test" feature
dataset

DatasetDict({
    train: Dataset({
        features: ['op_id', 'reply_id', 'op_text', 'reply_text', 'input_ids', 'attention_mask'],
        num_rows: 162351
    })
    validation: Dataset({
        features: ['op_id', 'reply_id', 'op_text', 'reply_text', 'input_ids', 'attention_mask'],
        num_rows: 18040
    })
    test: Dataset({
        features: ['op_id', 'reply_id', 'op_text', 'reply_text', 'input_ids', 'attention_mask'],
        num_rows: 2169
    })
})

In [9]:
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=False
)

In [10]:
#RELOAD MODEL

#configuration = GPT2Config.from_pretrained(model_dir, output_hidden_states=False)
#model = GPT2LMHeadModel.from_pretrained(model_dir, config=configuration)
#model.resize_token_embeddings(len(tokenizer))

In [11]:
configuration = GPT2Config.from_pretrained(finetuned_model_name, output_hidden_states=False)
model = GPT2LMHeadModel.from_pretrained(finetuned_model_name, config=configuration)
model.resize_token_embeddings(len(tokenizer))

Embedding(50260, 1280)

In [14]:
from transformers import Trainer, TrainingArguments,AutoModelWithLMHead

training_args = TrainingArguments(
    output_dir=model_dir,
    overwrite_output_dir=True, 
    num_train_epochs=3, 
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    eval_steps = 15000, 
    save_steps=15000,
    warmup_steps=5000,
    save_strategy="steps",
    evaluation_strategy="steps"
    )

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset["train"],
    eval_dataset=dataset["validation"]
)

In [15]:
trainer.train()

The following columns in the training set don't have a corresponding argument in `GPT2LMHeadModel.forward` and have been ignored: op_text, reply_text, op_id, reply_id. If op_text, reply_text, op_id, reply_id are not expected by `GPT2LMHeadModel.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 162351
  Num Epochs = 3
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 4
  Gradient Accumulation steps = 1
  Total optimization steps = 121764
  Number of trainable parameters = 774033920


Step,Training Loss,Validation Loss
15000,3.1817,3.155565
30000,3.0486,2.998014
45000,2.4678,2.92298
60000,2.4051,2.852515
75000,2.4017,2.7942
90000,1.9168,2.859834
105000,1.8387,2.834723
120000,1.919,2.808198


The following columns in the evaluation set don't have a corresponding argument in `GPT2LMHeadModel.forward` and have been ignored: op_text, reply_text, op_id, reply_id. If op_text, reply_text, op_id, reply_id are not expected by `GPT2LMHeadModel.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 18040
  Batch size = 4
Saving model checkpoint to ./generate_replies_model_new/checkpoint-15000
Configuration saved in ./generate_replies_model_new/checkpoint-15000/config.json
Model weights saved in ./generate_replies_model_new/checkpoint-15000/pytorch_model.bin
The following columns in the evaluation set don't have a corresponding argument in `GPT2LMHeadModel.forward` and have been ignored: op_text, reply_text, op_id, reply_id. If op_text, reply_text, op_id, reply_id are not expected by `GPT2LMHeadModel.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 18040
  Batch size = 4
Saving model checkpoint to ./

TrainOutput(global_step=121764, training_loss=2.508804728561035, metrics={'train_runtime': 38866.4321, 'train_samples_per_second': 12.531, 'train_steps_per_second': 3.133, 'total_flos': 2.476099275139584e+17, 'train_loss': 2.508804728561035, 'epoch': 3.0})

In [16]:
trainer.save_model()

Saving model checkpoint to ./generate_replies_model_new
Configuration saved in ./generate_replies_model_new/config.json
Model weights saved in ./generate_replies_model_new/pytorch_model.bin


In [17]:
# GENERATE TEXT

In [18]:
model.eval()

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50260, 1280)
    (wpe): Embedding(1024, 1280)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0): GPT2Block(
        (ln_1): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
      (1): GPT2Block(
        (ln_1): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout)

In [19]:
device = torch.device("cuda")

def generate_outputs(input_text, nb_seq):
    text_to_generate = input_text + "{REPLY}"
    
    encoded_input = tokenizer.encode(text_to_generate)
    generated_output = torch.tensor(encoded_input).unsqueeze(0).to(device)
    
    new_max_length = (max_token_length / 2) + len(encoded_input) # Limit the generated tweet to about 280 characters max
    
    outputs = model.generate(
            generated_output, 
            do_sample=True,   
            top_k=50, 
            max_length = new_max_length,
            top_p=0.95, 
            num_return_sequences=nb_seq
        )
    return [tokenizer.decode(o, skip_special_tokens=True).split('{REPLY}')[1]  for o in outputs] #

In [22]:
tweet_to_reply_to = "Christmas is coming soon. What are you going to do for the holidays?" # Put a tweet to reply to here

decoded_outputs = generate_outputs(tweet_to_reply_to, 1)

for i, output in enumerate(decoded_outputs):
    if len(output) > 1:
        print("{}: {}\n\n".format(i, output))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


0: I'm celebrating the Christmas season with my family and friends. My favorite holiday drink is orange sherbet. #ChristmasIsComing 🎄 🎄
#happylife #LoveIsComing #HappyThanksgiving  
🧠🤟🧠🤟🧠 🤟🤟🧠  
#HAPPYTHANKSGIVING #ChristmasIsComing #HAPPYGULFDAY  
🍂🎁🎄👸🏼🎄👸🏼🎄


