# Evaluating sarcasm
This notebook is to test a chat-style inference with an autoregressive model for evaluating for saracasm

In [25]:
# Imports
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, GenerationConfig, pipeline
import torch
from tqdm import tqdm
import pandas as pd

In [30]:
# Globals
dataset_path = "marcbishara/sarcasm-on-reddit"
model_name = "Zoe3324/gpt2-sft-full-v2"
split_name="holdout"
batch_size = 256

In [3]:
def build_sarcasm_dataset(
    tokenizer,   
    dataset_name,
    split_name,
    min_text_length=10,
    num_of_rows=None
):

    tokenizer.pad_token = tokenizer.eos_token

    ds = load_dataset(dataset_name, split=split_name)

    # Filter out short comments
    ds = ds.filter(lambda x: len(x["parent_comment"]) >= min_text_length)

    # Limit by number of rows if provided
    if num_of_rows is not None:
        ds = ds.select(range(num_of_rows))

 
    def tokenize(sample):
      templated_query = f"<PARENT> {sample['parent_comment']}</PARENT>\n<RESPONSE>"

      enc = tokenizer(
          templated_query,
          # padding="max_length",
          truncation=True,
          max_length=128,
          return_attention_mask=True
      )

      sample["input_ids"] = enc["input_ids"]
      sample["attention_mask"] = enc["attention_mask"]
      sample["query"] = tokenizer.decode(enc["input_ids"])
      return sample

    # Apply tokenization
    ds = ds.map(tokenize, batched=False)

    # Convert to torch tensors
    ds.set_format(type="torch")

    return ds

In [4]:
tokenizer = AutoTokenizer.from_pretrained(model_name)

Error while fetching `HF_TOKEN` secret value from your vault: 'Requesting secret HF_TOKEN timed out. Secrets can only be fetched when running from the Colab UI.'.
You are not authenticated with the Hugging Face Hub in this notebook.
If the error persists, please let us know by opening an issue on GitHub (https://github.com/huggingface/huggingface_hub/issues/new).


tokenizer_config.json:   0%|          | 0.00/475 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

In [6]:
# load dataset
dataset = build_sarcasm_dataset(tokenizer=tokenizer,dataset_name=dataset_path, split_name=split_name, num_of_rows=1000) #If you don't want to run the full dataset, limit the number of rows

README.md: 0.00B [00:00, ?B/s]

data/holdout-00000-of-00001.parquet:   0%|          | 0.00/18.2M [00:00<?, ?B/s]

data/sft_train-00000-of-00001.parquet:   0%|          | 0.00/49.1M [00:00<?, ?B/s]

data/sft_validation-00000-of-00001.parqu(…):   0%|          | 0.00/5.44M [00:00<?, ?B/s]

data/reward_train-00000-of-00001.parquet:   0%|          | 0.00/49.3M [00:00<?, ?B/s]

data/reward_validation-00000-of-00001.pa(…):   0%|          | 0.00/5.53M [00:00<?, ?B/s]

data/ppo_train-00000-of-00001.parquet:   0%|          | 0.00/49.4M [00:00<?, ?B/s]

data/ppo_validation-00000-of-00001.parqu(…):   0%|          | 0.00/5.51M [00:00<?, ?B/s]

Generating holdout split:   0%|          | 0/101083 [00:00<?, ? examples/s]

Generating sft_train split:   0%|          | 0/272922 [00:00<?, ? examples/s]

Generating sft_validation split:   0%|          | 0/30325 [00:00<?, ? examples/s]

Generating reward_train split:   0%|          | 0/272922 [00:00<?, ? examples/s]

Generating reward_validation split:   0%|          | 0/30325 [00:00<?, ? examples/s]

Generating ppo_train split:   0%|          | 0/272924 [00:00<?, ? examples/s]

Generating ppo_validation split:   0%|          | 0/30325 [00:00<?, ? examples/s]

Filter:   0%|          | 0/101083 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [7]:
dataset[0]

{'label': tensor(0),
 'comment': 'I would stay away from it.',
 'author': 'Nurmes',
 'subreddit': 'Warthunder',
 'score': tensor(5),
 'ups': tensor(5),
 'downs': tensor(0),
 'date': '2015-10',
 'created_utc': '2015-10-30 17:58:54',
 'parent_comment': 'Is This Legit? Golden Eagle Discounts?',
 'input_ids': tensor([   27, 27082,  3525,    29,  1148,   770,  3564,   270,    30,  8407,
         18456, 43474,    82,    30,  3556, 27082,  3525,    29,   198,    27,
         19535,    47,  1340,  5188,    29]),
 'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1]),
 'query': '<PARENT> Is This Legit? Golden Eagle Discounts?</PARENT>\n<RESPONSE>'}

In [31]:
# Make a dataloader from dataset

# drop all the columns except 'query'
dataset_fltrd = dataset.remove_columns([col for col in dataset.column_names if col != 'query'])

dataloader = torch.utils.data.DataLoader(dataset_fltrd, batch_size=batch_size, shuffle=False)

In [20]:
i, batch = next(enumerate(dataloader))
print(batch['query'][0])
print(len(batch['query']))

<PARENT> Is This Legit? Golden Eagle Discounts?</PARENT>
<RESPONSE>
64


In [32]:

# build generation config and a small pipeline wrapper for easier inference
gen_cfg = GenerationConfig(
    max_new_tokens=80,
    do_sample=True,
    top_p=0.9,
    temperature=0.7,
    pad_token_id=tokenizer.pad_token_id
)

text_gen = pipeline(
    "text-generation",
    model=model_name,
    tokenizer=tokenizer,
    device='cuda',
)

def generate_reply(parent_comment: str) -> str:
    out = text_gen(parent_comment, generation_config=gen_cfg, return_full_text=False, clean_up_tokenization_spaces=True)
    # pipeline returns list of dicts; take first
    return out[0]["generated_text"].strip()

Device set to use cuda


In [33]:
all_results = []

for i, batch in tqdm(enumerate(dataloader), total=len(dataloader)):
    print(f"Processing batch {i+1}/{len(dataloader)}")
    
    # Get all queries in the batch
    queries = batch['query']
    
    # Process entire batch through pipeline at once
    replies = text_gen(queries, generation_config=gen_cfg, return_full_text=False, clean_up_tokenization_spaces=True)
    
    # Extract generated text from replies (handle list structure)
    generated_texts = [reply[0]["generated_text"].strip() for reply in replies]
    
    # Add all results to list
    for parent_comment, reply_text in zip(queries, generated_texts):
        all_results.append({
            "parent_comment": parent_comment,
            "gst_reply": reply_text
        })
    
    # Print a sample from the batch for sanity check
    print(f"Sample Parent Comment: {queries[0]}")
    print(f"Generated Reply: {generated_texts[0]}")

# Create dataframe from all results at once
parent_response = pd.DataFrame(all_results)

# Save to CSV
parent_response.to_csv("/content/generated_sarcastic_replies.csv", index=False)

  0%|          | 0/4 [00:00<?, ?it/s]

Processing batch 1/4


 25%|██▌       | 1/4 [01:00<03:02, 60.83s/it]

Sample Parent Comment: <PARENT> Is This Legit? Golden Eagle Discounts?</PARENT>
<RESPONSE>
Generated Reply: No, I'm just taking a moment to explain why I'm upset. </RESPONSE>
Processing batch 2/4


 50%|█████     | 2/4 [02:01<02:01, 60.51s/it]

Sample Parent Comment: <PARENT> How to Tell Black People Apart by David Alan Grier</PARENT>
<RESPONSE>
Generated Reply: I'm sure that's a good thing! </RESPONSE>
Processing batch 3/4


 75%|███████▌  | 3/4 [03:00<00:59, 59.79s/it]

Sample Parent Comment: <PARENT> Yes, it is a bit of a regressive tax.</PARENT>
<RESPONSE>
Generated Reply: Yeah, because it's a bit of a regressive tax. </RESPONSE>
Processing batch 4/4


100%|██████████| 4/4 [03:53<00:00, 58.42s/it]

Sample Parent Comment: <PARENT> Hey that kangaroo stole my ball</PARENT>
<RESPONSE>
Generated Reply: He should have put it in the ball and gotten a free shot </RESPONSE>





In [34]:
# hugging face login
from huggingface_hub import login
login()


VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [37]:
# push scv to huggingface
from huggingface_hub import upload_file
upload_file(
    path_or_fileobj="/content/generated_sarcastic_replies.csv",
    path_in_repo=f"generated_sarcastic_replies_{model_name}_{len(dataset)}_{split_name}.csv",
    repo_id="marcbishara/gst_collection_of_responses",
    repo_type="dataset",
    token=True,
)

CommitInfo(commit_url='https://huggingface.co/datasets/marcbishara/gst_collection_of_responses/commit/6708694b044f672864948f734072df8a25079a7b', commit_message='Upload generated_sarcastic_replies_Zoe3324/gpt2-sft-full-v2_1000_holdout.csv with huggingface_hub', commit_description='', oid='6708694b044f672864948f734072df8a25079a7b', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/marcbishara/gst_collection_of_responses', endpoint='https://huggingface.co', repo_type='dataset', repo_id='marcbishara/gst_collection_of_responses'), pr_revision=None, pr_num=None)