# Evaluating sarcasm
This notebook tests chat-style inference with autoregressive models and collects their responses so they can be evaluated for sarcasm.

In [1]:
# Imports
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, GenerationConfig, pipeline
import torch
from tqdm import tqdm
import pandas as pd

In [2]:
# Globals

# Dataset repository and the split used for this evaluation
dataset_path = "marcbishara/sarcasm-on-reddit"
split_name = "holdout"

# Checkpoints being compared (the PPO models are pinned to a specific revision)
base_model_name = "gpt2"
sft_model_name = "Zoe3324/gpt2-sft-full-v2"
ppo_model_name = "marcbishara/GenerallySarcasticTransformer"
ppo_revision = "gpt2-sft-full_2Ep_512b_64mb_1-41e-05lr_20Kdsz_32tkn_0.9tmp_0.9tp_0tk_scl5-0.5-0.5_allR"
ppo2_model_name = "tmrcnl/GST-all4"
ppo2_revision = "gpt2-sft-full_2Eps_512bs_64mbs_1-41e-05lr_20Kdsz_32tkn_0.9tmp_0.9t-p_0t-k_all4TOC"

# Number of queries per DataLoader batch
batch_size = 256

In [3]:
def build_sarcasm_dataset(
    tokenizer,
    dataset_name,
    split_name,
    min_text_length=10,
    num_of_rows=None
):
    """Load one split of the sarcasm dataset and add a templated, tokenized query to each row.

    Args:
        tokenizer: tokenizer used to encode the prompts. As a side effect its
            ``pad_token`` is set to its ``eos_token``.
        dataset_name: dataset identifier passed to ``load_dataset``.
        split_name: name of the split to load.
        min_text_length: rows whose ``parent_comment`` is missing or has a
            ``len()`` below this value are dropped.
        num_of_rows: optional cap on the number of rows kept, applied after
            filtering. A cap larger than the number of remaining rows keeps
            all of them instead of raising.

    Returns:
        The filtered dataset with ``input_ids``, ``attention_mask`` and
        ``query`` columns added, formatted as torch tensors.
    """
    tokenizer.pad_token = tokenizer.eos_token

    ds = load_dataset(dataset_name, split=split_name)

    def has_long_enough_parent(sample):
        # A null parent comment has no length; treat it as too short.
        parent_comment = sample["parent_comment"]
        return parent_comment is not None and len(parent_comment) >= min_text_length

    # Filter out short comments
    ds = ds.filter(has_long_enough_parent)

    # Limit by number of rows if provided, never asking for more rows than exist
    if num_of_rows is not None:
        ds = ds.select(range(min(num_of_rows, len(ds))))

    def tokenize(sample):
        # Wrap the parent comment in the prompt template and leave the response open
        templated_query = f"<PARENT> {sample['parent_comment']} </PARENT>\n<RESPONSE> "

        enc = tokenizer(
            templated_query,
            truncation=True,
            max_length=128,
            return_attention_mask=True
        )

        sample["input_ids"] = enc["input_ids"]
        sample["attention_mask"] = enc["attention_mask"]
        # Decode the (possibly truncated) ids so the stored query matches what was encoded
        sample["query"] = tokenizer.decode(enc["input_ids"])
        return sample

    # Apply tokenization
    ds = ds.map(tokenize, batched=False)

    # Convert to torch tensors
    ds.set_format(type="torch")

    return ds

In [4]:
# Load the tokenizer of the base model; this one instance is shared by the dataset
# builder and by every text-generation pipeline in this notebook.
tokenizer = AutoTokenizer.from_pretrained(base_model_name)

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [5]:
# Load and tokenize the evaluation split.
# num_of_rows caps the run at 1000 examples; pass None to process the full split.
dataset = build_sarcasm_dataset(
    tokenizer=tokenizer,
    dataset_name=dataset_path,
    split_name=split_name,
    num_of_rows=1000,
)

README.md: 0.00B [00:00, ?B/s]

data/holdout-00000-of-00001.parquet:   0%|          | 0.00/18.2M [00:00<?, ?B/s]

data/sft_train-00000-of-00001.parquet:   0%|          | 0.00/49.1M [00:00<?, ?B/s]

data/sft_validation-00000-of-00001.parqu(…):   0%|          | 0.00/5.44M [00:00<?, ?B/s]

data/reward_train-00000-of-00001.parquet:   0%|          | 0.00/49.3M [00:00<?, ?B/s]

data/reward_validation-00000-of-00001.pa(…):   0%|          | 0.00/5.53M [00:00<?, ?B/s]

data/ppo_train-00000-of-00001.parquet:   0%|          | 0.00/49.4M [00:00<?, ?B/s]

data/ppo_validation-00000-of-00001.parqu(…):   0%|          | 0.00/5.51M [00:00<?, ?B/s]

Generating holdout split:   0%|          | 0/101083 [00:00<?, ? examples/s]

Generating sft_train split:   0%|          | 0/272922 [00:00<?, ? examples/s]

Generating sft_validation split:   0%|          | 0/30325 [00:00<?, ? examples/s]

Generating reward_train split:   0%|          | 0/272922 [00:00<?, ? examples/s]

Generating reward_validation split:   0%|          | 0/30325 [00:00<?, ? examples/s]

Generating ppo_train split:   0%|          | 0/272924 [00:00<?, ? examples/s]

Generating ppo_validation split:   0%|          | 0/30325 [00:00<?, ? examples/s]

Filter:   0%|          | 0/101083 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [6]:
# Inspect the first example to check the prompt template and the added tokenization columns
dataset[0]

{'label': tensor(0),
 'comment': 'I would stay away from it.',
 'author': 'Nurmes',
 'subreddit': 'Warthunder',
 'score': tensor(5),
 'ups': tensor(5),
 'downs': tensor(0),
 'date': '2015-10',
 'created_utc': '2015-10-30 17:58:54',
 'parent_comment': 'Is This Legit? Golden Eagle Discounts?',
 'input_ids': tensor([   27, 27082,  3525,    29,  1148,   770,  3564,   270,    30,  8407,
         18456, 43474,    82,    30,  7359, 27082,  3525,    29,   198,    27,
         19535,    47,  1340,  5188,    29,   220]),
 'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1]),
 'query': '<PARENT> Is This Legit? Golden Eagle Discounts? </PARENT>\n<RESPONSE> '}

In [38]:
# Build a DataLoader over the text columns only

# Keep 'query' (the templated prompt) and 'comment' (the original Reddit reply); drop the rest
columns_to_drop = [name for name in dataset.column_names if name not in ['query', 'comment']]
dataset_fltrd = dataset.remove_columns(columns_to_drop)

# Fixed order so generated replies line up with the dataset rows
dataloader = torch.utils.data.DataLoader(
    dataset_fltrd,
    batch_size=batch_size,
    shuffle=False,
)

In [40]:
# Pull the first batch and preview one query, its Reddit reply, and the batch size
i, batch = next(enumerate(dataloader))
first_query, first_comment = batch['query'][0], batch['comment'][0]
print(first_query, first_comment, len(batch['query']), sep="\n")

<PARENT> Is This Legit? Golden Eagle Discounts? </PARENT>
<RESPONSE> 
I would stay away from it.
256


In [9]:
# Sampling settings shared by all models, plus the text-generation pipeline for the SFT checkpoint
gen_cfg = GenerationConfig(
    # sampling strategy
    do_sample=True,
    temperature=0.9,
    top_p=0.9,
    top_k=0,
    # bounds on how many tokens are generated per reply
    min_new_tokens=3,
    max_new_tokens=32,
    # most decoder models don't have a padding token - use EOS token instead
    pad_token_id=tokenizer.eos_token_id,
)

text_gen_sft = pipeline(
    task="text-generation",
    model=sft_model_name,
    tokenizer=tokenizer,
    device="cuda",
)

# Single-query usage:
#   text_gen_sft(query, generation_config=gen_cfg, return_full_text=False,
#                clean_up_tokenization_spaces=True)[0]["generated_text"].strip()

config.json:   0%|          | 0.00/874 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/498M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/129 [00:00<?, ?B/s]

Device set to use cuda


In [10]:
# Text-generation pipeline for the first PPO checkpoint, pinned to a specific revision
text_gen_ppo = pipeline(
    task="text-generation",
    model=ppo_model_name,
    revision=ppo_revision,
    tokenizer=tokenizer,
    device="cuda",
)

# Single-query usage:
#   text_gen_ppo(query, generation_config=gen_cfg, return_full_text=False,
#                clean_up_tokenization_spaces=True)[0]["generated_text"].strip()

config.json:   0%|          | 0.00/874 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/498M [00:00<?, ?B/s]

Some weights of the model checkpoint at marcbishara/GenerallySarcasticTransformer were not used when initializing GPT2LMHeadModel: ['v_head.summary.bias', 'v_head.summary.weight']
- This IS expected if you are initializing GPT2LMHeadModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing GPT2LMHeadModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


generation_config.json:   0%|          | 0.00/129 [00:00<?, ?B/s]

Device set to use cuda


In [11]:
# Text-generation pipeline for the second PPO checkpoint, pinned to a specific revision
text_gen_ppo2 = pipeline(
    task="text-generation",
    model=ppo2_model_name,
    revision=ppo2_revision,
    tokenizer=tokenizer,
    device="cuda",
)

# Single-query usage:
#   text_gen_ppo2(query, generation_config=gen_cfg, return_full_text=False,
#                 clean_up_tokenization_spaces=True)[0]["generated_text"].strip()

config.json:   0%|          | 0.00/874 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/498M [00:00<?, ?B/s]

Some weights of the model checkpoint at tmrcnl/GST-all4 were not used when initializing GPT2LMHeadModel: ['v_head.summary.bias', 'v_head.summary.weight']
- This IS expected if you are initializing GPT2LMHeadModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing GPT2LMHeadModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


generation_config.json:   0%|          | 0.00/129 [00:00<?, ?B/s]

Device set to use cuda


In [12]:
# Text-generation pipeline for the base model, used as the baseline
text_gen_base = pipeline(
    task="text-generation",
    model=base_model_name,
    tokenizer=tokenizer,
    device="cuda",
)

# Single-query usage:
#   text_gen_base(query, generation_config=gen_cfg, return_full_text=False,
#                 clean_up_tokenization_spaces=True)[0]["generated_text"].strip()

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Device set to use cuda


In [14]:
all_results = []


def _generate_replies(text_gen, queries):
    """Run one text-generation pipeline over ``queries`` and return the stripped completions."""
    replies = text_gen(
        queries,
        generation_config=gen_cfg,
        return_full_text=False,
        clean_up_tokenization_spaces=True,
    )
    # For a list of prompts the pipeline returns one list of candidates per prompt; keep the first
    return [reply[0]["generated_text"].strip() for reply in replies]


for i, batch in tqdm(enumerate(dataloader), total=len(dataloader)):
    print(f"Processing batch {i+1}/{len(dataloader)}")

    # Prompts and the original Reddit replies for this batch
    queries = batch['query']
    reddit_replies = batch['comment']

    # Generate with every model (same call order as before: base, ppo, ppo2, sft)
    generated_texts_base = _generate_replies(text_gen_base, queries)
    generated_texts_ppo = _generate_replies(text_gen_ppo, queries)
    generated_texts_ppo2 = _generate_replies(text_gen_ppo2, queries)
    generated_texts_sft = _generate_replies(text_gen_sft, queries)

    # Add all results to list. The Reddit replies must be part of the zip: without them
    # the six loop variables are unpacked from five-element tuples and the loop raises.
    for parent_comment, comment, base_reply, ppo_reply, ppo2_reply, sft_reply in zip(
        queries,
        reddit_replies,
        generated_texts_base,
        generated_texts_ppo,
        generated_texts_ppo2,
        generated_texts_sft,
    ):
        all_results.append({
            "parent_comment": parent_comment,
            "reddit_reply": comment,
            "gpt2_reply": base_reply,
            "sft_reply": sft_reply,
            "ppo_reply": ppo_reply,
            "ppo2_reply": ppo2_reply
        })

# Create dataframe from all results at once
all_responses_df = pd.DataFrame(all_results)

  0%|          | 0/4 [00:00<?, ?it/s]

Processing batch 1/4


 25%|██▌       | 1/4 [03:40<11:00, 220.15s/it]

Processing batch 2/4


 50%|█████     | 2/4 [07:20<07:20, 220.33s/it]

Processing batch 3/4


 75%|███████▌  | 3/4 [10:59<03:39, 219.68s/it]

Processing batch 4/4


100%|██████████| 4/4 [14:17<00:00, 214.35s/it]


In [28]:
# Add the model names to the df as the top row, so each reply column records its checkpoint
model_names_df = pd.DataFrame({
    "parent_comment": ["Model Names"],
    "gpt2_reply": [base_model_name],
    "sft_reply": [sft_model_name],
    "ppo_reply": [f"{ppo_model_name}/{ppo_revision}"],
    "ppo2_reply": [f"{ppo2_model_name}/{ppo2_revision}"],
})

# This cell rebinds all_responses_df from itself, so only prepend the row when it is not
# already there; otherwise every re-run would stack another "Model Names" row on top.
names_row_present = (
    len(all_responses_df) > 0
    and all_responses_df.iloc[0]["parent_comment"] == "Model Names"
)
if not names_row_present:
    all_responses_df = pd.concat([model_names_df, all_responses_df]).reset_index(drop=True)

In [44]:
# Collect the original Reddit replies in dataloader order
all_reddit_responses = []
for i, batch in tqdm(enumerate(dataloader), total=len(dataloader)):
    all_reddit_responses.extend(batch['comment'])

100%|██████████| 4/4 [00:00<00:00, 253.51it/s]


In [58]:
# Display the combined table of prompts, Reddit replies and model generations
all_responses_df

Unnamed: 0,gpt2_reply,sft_reply,ppo_reply,ppo2_reply,parent_comment,reddit_reply
1, <DIV><DIV><DIV> <DIV><DIV><DIV> <DIV><DIV><D...,~~should've added a link to this post on reddi...,~~sigh~~ Golden Eagle is the only possible way...,~~TOMLHHOOOOOM~~! </RESPONSE>,<PARENT> Is This Legit? Golden Eagle Discounts...,I would stay away from it.
2,!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!,~~How much easier that would be~~ </RESPONSE>,~~No need - the only one who is doing this is ...,ive never been so alone </RESPONSE>,<PARENT> Are you implying that people with bro...,I sincerely hope you forgot a
3,【INSTALLATION】 Deploy.gif </RESPONSE>\n<STYLE ...,~~hey~~ I can't stand how thick he is~~ </RESP...,ive seen it this way countless times Captain A...,~~SOBERING~~ </RESPONSE>,<PARENT> CaptainAmerica.gif </PARENT>\n<RESPON...,dead link
4,0000000000000000000000000000000000000000000000...,~~stirring your little asshole~~ </RESPONSE>,ive never seen this with any other keeper </RE...,~~BEST TEAM THREAT~~ </RESPONSE>,<PARENT> Mitras | top of the line.. Always.. <...,"Damn, I didn't realize that XTR was downgraded..."
5,!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n<RESPONSE...,~~An additional twenty dollars for you!~~ </RE...,~~you better not run~~ </RESPONSE>,~~this is the future~~ </RESPONSE>,<PARENT> I'm the opposite. USPS has lost three...,It must be regional.
...,...,...,...,...,...,...
996,~~~~~~~~~~~~~~~~~~~~~~~ <PREF> <RESPONSE> ~~~~...,"~~ Nous tee, yup, sombre, si Quebecois!~~ </RE...","~~nonsense bro, Azteca is the best team in the...",ive never seen this crowd downvoted before! </...,"<PARENT> Man, the reaction of the crowd felt l...","Wait, were not at the Azteca?"
997,~~~~~~~~~~ Im a little curious to hear the res...,~~Caveman~~ </RESPONSE>,ive never been to a new city and never heard o...,~~Sou~~mooosh she ~~stole the money </RESPONSE>,<PARENT> im a little curious to hear the rest ...,Who?
998,***************************** <TR> <TR> I am a...,~~shes fat~~ </RESPONSE>,~~The joke originates here~~ </RESPONSE>,~~m. </RESPONSE>,<PARENT> You made this thread to post that jok...,You know it!
999,_____ An aerial photograph of the position of ...,~~ 100% finnished~~ </RESPONSE>,"~~STFU~~ no, you forgot </RESPONSE>",~~Trump~~! </RESPONSE>,<PARENT> Thought you Singaporeans may like thi...,"yeah we are fine guys, no help needed."


In [59]:
# Save to CSV (without the DataFrame index column)
# NOTE(review): "/content" is presumably the Colab working directory - adjust the path when running elsewhere
all_responses_df.to_csv("/content/queries_and_4_model_responses.csv", index=False)

## Save dataset to huggingface

In [60]:
# Hugging Face login
# The token is fetched through google.colab.userdata under the key 'HF_TOKEN', so it is never hard-coded here.
from huggingface_hub import login
from google.colab import userdata

login(token=userdata.get('HF_TOKEN'))


In [63]:
# Push the CSV to the Hugging Face dataset repository
from huggingface_hub import upload_file
upload_file(
    path_or_fileobj="/content/queries_and_4_model_responses.csv",
    # plain string: the name has no placeholders, so no f-string is needed
    path_in_repo="queries_and_4_model_responses.csv",
    repo_id="marcbishara/gst_collection_of_responses",
    repo_type="dataset",
    token=True,
)

CommitInfo(commit_url='https://huggingface.co/datasets/marcbishara/gst_collection_of_responses/commit/23f1c6defca93dde8038aa3f48cc7b79bf0b4314', commit_message='Upload queries_and_4_model_responses.csv with huggingface_hub', commit_description='', oid='23f1c6defca93dde8038aa3f48cc7b79bf0b4314', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/marcbishara/gst_collection_of_responses', endpoint='https://huggingface.co', repo_type='dataset', repo_id='marcbishara/gst_collection_of_responses'), pr_revision=None, pr_num=None)

## Measure diversity of responses

In [None]:
!pip install diversity
!pip install lexicalrichness

Collecting diversity
  Downloading diversity-0.3.0-py3-none-any.whl.metadata (10 kB)
Collecting evaluate<0.5.0,>=0.4.1 (from diversity)
  Downloading evaluate-0.4.6-py3-none-any.whl.metadata (9.5 kB)
Collecting rouge-score<0.2.0,>=0.1.2 (from diversity)
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Downloading diversity-0.3.0-py3-none-any.whl (30 kB)
Downloading evaluate-0.4.6-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m7.0 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: rouge-score
  Building wheel for rouge-score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge-score: filename=rouge_score-0.1.2-py3-none-any.whl size=24934 sha256=1c7f5a539f7d15fd63d4d2ff38ea793ed5c8f5cd9209a7944a4a678df95356ee
  Stored in directory: /root/.cache/pip/wheels/85/9d/af/01feefbe7d55ef5468796f0c68225b6788e85d9d0a281e7a70
Successfully built rouge-score
In

In [None]:
from diversity import (
    compression_ratio,
    homogenization_score,
    ngram_diversity_score,
)
from lexicalrichness import LexicalRichness

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


In [None]:
# Collect the generated replies to score. These lists were previously never defined in
# this notebook, so the cell failed on a fresh kernel. The "Model Names" bookkeeping row
# (if it has been prepended) is excluded so that only real generations are measured.
generated_rows = all_responses_df[all_responses_df["parent_comment"] != "Model Names"]
all_responses = generated_rows["sft_reply"].astype(str).tolist()
ppo_all_responses = generated_rows["ppo_reply"].astype(str).tolist()

# Get a diversity score for the responses
sft_ngd = ngram_diversity_score(all_responses, num_n=4)
ppo_ngd = ngram_diversity_score(ppo_all_responses, num_n=4)


In [None]:
# Report the n-gram diversity score of each model
for model_label, ngd_score in (("SFT", sft_ngd), ("PPO", ppo_ngd)):
    print(f"{model_label} NGD: {ngd_score}")

SFT NGD: 3.182
PPO NGD: 3.048


In [None]:
# get lexical richness

def safe_scores(text: str):
    """Compute ``(MTLD, HDD)`` lexical-richness scores for ``text`` without raising.

    Empty or whitespace-only text scores ``(0.0, 0.0)``. A score whose
    computation fails (``ZeroDivisionError`` for MTLD, ``ValueError`` for HDD)
    is reported as ``0.0``, as is HDD for texts of at most one word.
    """
    cleaned = text.strip()
    if cleaned == "":
        # nothing to measure
        return 0.0, 0.0

    richness = LexicalRichness(cleaned)

    try:
        mtld_value = richness.mtld()
    except ZeroDivisionError:
        mtld_value = 0.0

    # HDD needs fewer draws than there are words, so cap draws at word count - 1 (at most 42)
    n_words = richness.words
    hdd_value = 0.0
    if n_words > 1:
        try:
            hdd_value = richness.hdd(draws=min(42, n_words - 1))
        except ValueError:
            hdd_value = 0.0

    return mtld_value, hdd_value

# Score every SFT and PPO response; each entry is an (mtld, hdd) pair
sft_scores = list(map(safe_scores, all_responses))
ppo_scores = list(map(safe_scores, ppo_all_responses))

# Split the pairs into one list per metric
sft_mtld = [pair[0] for pair in sft_scores]
sft_hdd = [pair[1] for pair in sft_scores]

ppo_mtld = [pair[0] for pair in ppo_scores]
ppo_hdd = [pair[1] for pair in ppo_scores]

# Average each metric, falling back to 0 when there are no responses
sft_avg_mtld = 0 if not sft_mtld else sum(sft_mtld) / len(sft_mtld)
sft_avg_hdd = 0 if not sft_hdd else sum(sft_hdd) / len(sft_hdd)

ppo_avg_mtld = 0 if not ppo_mtld else sum(ppo_mtld) / len(ppo_mtld)
ppo_avg_hdd = 0 if not ppo_hdd else sum(ppo_hdd) / len(ppo_hdd)

# Report the averages
for summary_label, summary_value in (
    ("SFT average MTLD:", sft_avg_mtld),
    ("SFT average HDD:", sft_avg_hdd),
    ("PPO average MTLD:", ppo_avg_mtld),
    ("PPO average HDD:", ppo_avg_hdd),
):
    print(summary_label, summary_value)


SFT average MTLD: 14.671140978165589
SFT average HDD: 0.8191019130211115
PPO average MTLD: 13.861014666666668
PPO average HDD: 0.9513056033981614
