In [49]:
# INIT

In [50]:
import pandas as pd
import numpy as np
import torch
import pyarrow as pa
from datasets import Dataset, load_dataset
from datasets.dataset_dict import DatasetDict

from torch.utils.data import DataLoader, random_split
from transformers import GPT2LMHeadModel,  GPT2Tokenizer, GPT2Config, DataCollatorForLanguageModeling
import modules.tweet_helper as tweet_helper

In [51]:
model_dir = "./generate_replies_model_new_2"
finetuned_model_name = "gpt2-large"

In [52]:
# CREATE AND FILTER THE DATAFRAME

In [53]:
op_df = pd.read_csv("op.csv")
op_df = op_df.drop_duplicates(subset="op_id", keep="last")

In [54]:
replies_df = pd.read_csv("replies.csv")
replies_df = replies_df.drop_duplicates(subset="reply_id", keep="last")

In [55]:
df = replies_df.merge(op_df, how='inner', on="op_id")
df

Unnamed: 0,op_id,reply_id,created_at_x,author_id_x,like_count_x,reply_count_x,quote_count_x,text_x,created_at_y,author_id_y,like_count_y,reply_count_y,quote_count_y,text_y
0,1597824424505651203,1597824964648120320,2022-11-30T05:28:52.000Z,606987031,0,1,0,Representing BCDA in the turnover of facilitie...,2022-11-30T05:26:43.000Z,606987031,1,1,0,Leading the event is Acting Chief of Staff MGe...
1,1597774607259312129,1597825062547050496,2022-11-30T05:29:15.000Z,35877816,0,0,0,"@WalshFreedom Jack The Ripper: ""I believe in a...",2022-11-30T02:08:46.000Z,236487888,3114,74,14,Bullshit. The leader of your party is a crimin...
2,1597774607259312129,1597776539957231616,2022-11-30T02:16:27.000Z,1419231540609884162,1,0,0,@WalshFreedom DemocRat Joey is wrong again 👉🤡 ...,2022-11-30T02:08:46.000Z,236487888,3114,74,14,Bullshit. The leader of your party is a crimin...
3,1597825638630854656,1597825646310588416,2022-11-30T05:31:34.000Z,766540464,0,0,0,Thanks for reading this week's #WallaWarriors ...,2022-11-30T05:31:33.000Z,766540464,0,1,0,"Great job, Monica! Nice that you've got your ..."
4,1597571148501458944,1597824240497364992,2022-11-30T05:25:59.000Z,1593332431276412929,0,0,0,@D__Barbie @THE_FREE_COIN @Freecoin_global @Ma...,2022-11-29T12:40:17.000Z,1289211227030401030,6,3,0,$FREE 💎 up over 30% in the day. Love to see it...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
597061,1602043102935633922,1602058505359327232,2022-12-11T21:51:27.000Z,1596442024819015680,0,1,0,"That's a long time to keep refinancing your ""e...",2022-12-11T20:50:14.000Z,1596442024819015680,0,1,0,People deserve the justice of knowing what som...
597062,1602057915904708608,1602058547415875584,2022-12-11T21:51:37.000Z,224777045,9,2,0,"If you click on the image in the story, you wi...",2022-12-11T21:49:06.000Z,224777045,33,1,0,Tucked inside this story is an image I made fr...
597063,1601795232407834624,1602058485759643648,2022-12-11T21:51:22.000Z,1569808143831691265,0,0,0,@gryphon_katsu Noming a naughty chimken! https...,2022-12-11T04:25:17.000Z,2787059839,77,54,0,show me your maws 👀
597064,1602057634567290880,1602058450737020932,2022-12-11T21:51:14.000Z,1456442895154786309,1,1,0,the hyudoro got compressed to shit so heres a ...,2022-12-11T21:47:59.000Z,1456442895154786309,1,1,0,had to add one more https://t.co/bg8WtcaxNZ


In [56]:
df = df[df["like_count_x"] >= 10]

In [57]:
df = df.copy() # Otherwise will throw SettingWithCopyWarning 
df["text_x"] = df["text_x"].map(lambda x: tweet_helper.fix_tweet_text(x))
df["text_y"] = df["text_y"].map(lambda y: tweet_helper.fix_tweet_text(y))

In [58]:
def filter_row(row):
    return not (row["text_x"] == "" or row["text_y"] == "" or tweet_helper.filter_tweet(row["text_x"]) or tweet_helper.filter_tweet(row["text_y"]))

In [59]:
df = df[df.apply(filter_row, axis=1)]

In [60]:
# Refactor the dataframe to only keep the important columns
df.rename(columns = {'text_x':'reply_text', 'text_y':'op_text'}, inplace = True)
df = df[["op_id", "reply_id", "reply_text", "op_text"]]

In [61]:
# Move reply_text at the end
column_to_move = df.pop("reply_text")
df.insert(len(df.columns), "reply_text", column_to_move)
df

Unnamed: 0,op_id,reply_id,op_text,reply_text
13,1597416255207587841,1597623287164547073,I would love to have a sit down conversation w...,"I plan to stay out of gunshot range of you, an..."
14,1597416255207587841,1597602320379936770,I would love to have a sit down conversation w...,Start with the families of the two people you ...
15,1597416255207587841,1597587154255089664,I would love to have a sit down conversation w...,"Cry me a river.\nRittenhouse wasn't found """"in..."
16,1597416255207587841,1597520308927594496,I would love to have a sit down conversation w...,You brought an AR15 to a town and killed peopl...
17,1597416255207587841,1597500162032930818,I would love to have a sit down conversation w...,You are a heroe
...,...,...,...,...
597047,1601970615191224323,1602058884734140416,bro's mad about fortnite skins not having any ...,Y’know the drill homes.
597048,1602058829738418180,1602058837401505792,(4) being close & playful together - feeling s...,(5) Gat returns home after work - Wa smells pe...
597051,1601999533243863044,1602058543460384770,"Okok, I’ll try to watch 5 minutes 🤮","SCAM, Dont watch at all - 1 second is a hit"
597059,1602057744902590464,1602058540281221121,An air fryer?,why does it have a mouth


In [62]:
filtered_ds = Dataset(pa.Table.from_pandas(df))
filtered_ds

Dataset({
    features: ['op_id', 'reply_id', 'op_text', 'reply_text', '__index_level_0__'],
    num_rows: 322073
})

In [63]:
filtered_ds = filtered_ds.remove_columns("__index_level_0__")

In [64]:
# CREATE THE FINAL DATASET

In [65]:
dataset_base = DatasetDict({'train': filtered_ds})
dataset_base

DatasetDict({
    train: Dataset({
        features: ['op_id', 'reply_id', 'op_text', 'reply_text'],
        num_rows: 322073
    })
})

In [66]:
dataset = dataset_base["train"].train_test_split(train_size=0.8) # Train = 80%, test+validation = 20%
dataset

DatasetDict({
    train: Dataset({
        features: ['op_id', 'reply_id', 'op_text', 'reply_text'],
        num_rows: 257658
    })
    test: Dataset({
        features: ['op_id', 'reply_id', 'op_text', 'reply_text'],
        num_rows: 64415
    })
})

In [67]:
dataset_validation_and_test = dataset.pop("test").train_test_split(train_size=0.8) # validation=16%, test=4%
dataset["validation"] = dataset_validation_and_test.pop("train")
dataset["test"] = dataset_validation_and_test.pop("test")
dataset

DatasetDict({
    train: Dataset({
        features: ['op_id', 'reply_id', 'op_text', 'reply_text'],
        num_rows: 257658
    })
    validation: Dataset({
        features: ['op_id', 'reply_id', 'op_text', 'reply_text'],
        num_rows: 51532
    })
    test: Dataset({
        features: ['op_id', 'reply_id', 'op_text', 'reply_text'],
        num_rows: 12883
    })
})

In [68]:
# TOKENIZE THE DATASET

In [69]:
# Load the GPT tokenizer.
tokenizer = GPT2Tokenizer.from_pretrained(finetuned_model_name, bos_token='<|start|>', eos_token='<|end|>', pad_token='<|pad|>')

loading file vocab.json from cache at /home/mark/.cache/huggingface/hub/models--gpt2-large/snapshots/e5ab12c7d42b9e60a6025476a688aab2c5695189/vocab.json
loading file merges.txt from cache at /home/mark/.cache/huggingface/hub/models--gpt2-large/snapshots/e5ab12c7d42b9e60a6025476a688aab2c5695189/merges.txt
loading file added_tokens.json from cache at None
loading file special_tokens_map.json from cache at None
loading file tokenizer_config.json from cache at None
loading configuration file config.json from cache at /home/mark/.cache/huggingface/hub/models--gpt2-large/snapshots/e5ab12c7d42b9e60a6025476a688aab2c5695189/config.json
Model config GPT2Config {
  "_name_or_path": "gpt2-large",
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 1280,
  "n_head"

In [70]:
def tokenize_and_split(examples):
    return tokenizer(
        '<|start|>'+ examples["op_text"] + "{REPLY}" + examples["reply_text"] + '<|end|>',
        truncation=True,
        max_length= 250 # Tweet max = 280, 2 tweets + "{REPLY}" divided by about 2 when converted to tokens
    )

In [71]:
dataset = dataset.map(tokenize_and_split) 

  0%|          | 0/257658 [00:00<?, ?ex/s]

  0%|          | 0/51532 [00:00<?, ?ex/s]

  0%|          | 0/12883 [00:00<?, ?ex/s]

In [72]:
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=False
)

In [73]:
#RELOAD MODEL

#configuration = GPT2Config.from_pretrained(model_dir, output_hidden_states=False)
#model = GPT2LMHeadModel.from_pretrained(model_dir, config=configuration)
#model.resize_token_embeddings(len(tokenizer))
#model.cuda()

In [74]:
# TRAIN THE MODEL

In [11]:
configuration = GPT2Config.from_pretrained(finetuned_model_name, output_hidden_states=False)
model = GPT2LMHeadModel.from_pretrained(finetuned_model_name, config=configuration)
model.resize_token_embeddings(len(tokenizer))

Embedding(50260, 1280)

In [14]:
from transformers import Trainer, TrainingArguments,AutoModelWithLMHead

training_args = TrainingArguments(
    output_dir=model_dir,
    overwrite_output_dir=True, 
    num_train_epochs=3, 
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    eval_steps = 15000, 
    save_steps=15000,
    warmup_steps=5000,
    save_strategy="steps",
    evaluation_strategy="steps"
    )

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset["train"],
    eval_dataset=dataset["validation"]
)

In [15]:
trainer.train()

The following columns in the training set don't have a corresponding argument in `GPT2LMHeadModel.forward` and have been ignored: op_text, reply_text, op_id, reply_id. If op_text, reply_text, op_id, reply_id are not expected by `GPT2LMHeadModel.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 162351
  Num Epochs = 3
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 4
  Gradient Accumulation steps = 1
  Total optimization steps = 121764
  Number of trainable parameters = 774033920


Step,Training Loss,Validation Loss
15000,3.1817,3.155565
30000,3.0486,2.998014
45000,2.4678,2.92298
60000,2.4051,2.852515
75000,2.4017,2.7942
90000,1.9168,2.859834
105000,1.8387,2.834723
120000,1.919,2.808198


The following columns in the evaluation set don't have a corresponding argument in `GPT2LMHeadModel.forward` and have been ignored: op_text, reply_text, op_id, reply_id. If op_text, reply_text, op_id, reply_id are not expected by `GPT2LMHeadModel.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 18040
  Batch size = 4
Saving model checkpoint to ./generate_replies_model_new/checkpoint-15000
Configuration saved in ./generate_replies_model_new/checkpoint-15000/config.json
Model weights saved in ./generate_replies_model_new/checkpoint-15000/pytorch_model.bin
The following columns in the evaluation set don't have a corresponding argument in `GPT2LMHeadModel.forward` and have been ignored: op_text, reply_text, op_id, reply_id. If op_text, reply_text, op_id, reply_id are not expected by `GPT2LMHeadModel.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 18040
  Batch size = 4
Saving model checkpoint to ./

TrainOutput(global_step=121764, training_loss=2.508804728561035, metrics={'train_runtime': 38866.4321, 'train_samples_per_second': 12.531, 'train_steps_per_second': 3.133, 'total_flos': 2.476099275139584e+17, 'train_loss': 2.508804728561035, 'epoch': 3.0})

In [16]:
trainer.save_model()

Saving model checkpoint to ./generate_replies_model_new
Configuration saved in ./generate_replies_model_new/config.json
Model weights saved in ./generate_replies_model_new/pytorch_model.bin


In [17]:
# GENERATE TEXT

In [18]:
model.eval()

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50260, 1280)
    (wpe): Embedding(1024, 1280)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0): GPT2Block(
        (ln_1): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
      (1): GPT2Block(
        (ln_1): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout)

In [19]:
device = torch.device("cuda")

def generate_outputs(input_text, nb_seq):
    text_to_generate = input_text + "{REPLY}"
    
    encoded_input = tokenizer.encode(text_to_generate)
    generated_output = torch.tensor(encoded_input).unsqueeze(0).to(device)
    
    new_max_length = (max_token_length / 2) + len(encoded_input) # Limit the generated tweet to about 280 characters max
    
    outputs = model.generate(
            generated_output, 
            do_sample=True,   
            top_k=50, 
            max_length = new_max_length,
            top_p=0.95, 
            num_return_sequences=nb_seq
        )
    return [tokenizer.decode(o, skip_special_tokens=True).split('{REPLY}')[1]  for o in outputs] #

In [22]:
tweet_to_reply_to = "Christmas is coming soon. What are you going to do for the holidays?" # Put a tweet to reply to here

decoded_outputs = generate_outputs(tweet_to_reply_to, 1)

for i, output in enumerate(decoded_outputs):
    if len(output) > 1:
        print("{}: {}\n\n".format(i, output))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


0: I'm celebrating the Christmas season with my family and friends. My favorite holiday drink is orange sherbet. #ChristmasIsComing 🎄 🎄
#happylife #LoveIsComing #HappyThanksgiving  
🧠🤟🧠🤟🧠 🤟🤟🧠  
#HAPPYTHANKSGIVING #ChristmasIsComing #HAPPYGULFDAY  
🍂🎁🎄👸🏼🎄👸🏼🎄


