In [1]:
!pip install wandb
!wandb login

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit: 
[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33myuchenzoe-xu[0m ([33mtmrcnl-university-of-toronto[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [2]:
import torch
from datasets import load_dataset
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoModelForSequenceClassification
import transformers
import re
import wandb


In [3]:
REWARD_MODEL = "tmrcnl/SarcasmRewardModel"
DATASET_PATH = "marcbishara/sarcasm-on-reddit"
PPO_MODEL = "marcbishara/GenerallySarcasticTransformer"
PPO_REVISION = "gpt2-sft-full_2Ep_512b_64mb_1-41e-05lr_20Kdsz_32tkn_0.9tmp_0.9tp_0tk_scl5-0.5-0.5_allR"
SFT_MODEL = "Zoe3324/gpt2-sft-full-v2"
GPT2_MODEL = "gpt2"
MAX_LENGTH = 128
BATCH_SIZE = 32
SAMPLE_SIZE = 1000
device = "cuda" if torch.cuda.is_available() else "cpu"

In [4]:
wandb.init(
    entity="zoe_123",# change it to yours
    project="gst_sarcasm_rm_eval",
    name="sft_vs_gpt2_avg_reward_all",
    config={
        "batch_size": BATCH_SIZE,
        "sample_size": SAMPLE_SIZE,
        "reward_model": REWARD_MODEL,
        "sft_model": SFT_MODEL
    },
    resume=False
)

  | |_| | '_ \/ _` / _` |  _/ -_)
[34m[1mwandb[0m: Currently logged in as: [33myuchenzoe-xu[0m ([33mtmrcnl-university-of-toronto[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [5]:
# Load dataset
dataset = load_dataset("marcbishara/sarcasm-on-reddit", split="holdout")
data = (dataset.shuffle(seed=42).select(range(SAMPLE_SIZE)))
parent_comments = data["parent_comment"]
print(f"Loaded {SAMPLE_SIZE} test samples")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

data/holdout-00000-of-00001.parquet:   0%|          | 0.00/18.2M [00:00<?, ?B/s]

data/sft_train-00000-of-00001.parquet:   0%|          | 0.00/49.1M [00:00<?, ?B/s]

data/sft_validation-00000-of-00001.parqu(…):   0%|          | 0.00/5.44M [00:00<?, ?B/s]

data/reward_train-00000-of-00001.parquet:   0%|          | 0.00/49.3M [00:00<?, ?B/s]

data/reward_validation-00000-of-00001.pa(…):   0%|          | 0.00/5.53M [00:00<?, ?B/s]

data/ppo_train-00000-of-00001.parquet:   0%|          | 0.00/49.4M [00:00<?, ?B/s]

data/ppo_validation-00000-of-00001.parqu(…):   0%|          | 0.00/5.51M [00:00<?, ?B/s]

Generating holdout split:   0%|          | 0/101083 [00:00<?, ? examples/s]

Generating sft_train split:   0%|          | 0/272922 [00:00<?, ? examples/s]

Generating sft_validation split:   0%|          | 0/30325 [00:00<?, ? examples/s]

Generating reward_train split:   0%|          | 0/272922 [00:00<?, ? examples/s]

Generating reward_validation split:   0%|          | 0/30325 [00:00<?, ? examples/s]

Generating ppo_train split:   0%|          | 0/272924 [00:00<?, ? examples/s]

Generating ppo_validation split:   0%|          | 0/30325 [00:00<?, ? examples/s]

Loaded 1000 test samples


In [6]:
# Load RM tokenizer/model
rm_tokenizer = AutoTokenizer.from_pretrained(REWARD_MODEL)
if rm_tokenizer.pad_token is None:
    rm_tokenizer.pad_token = rm_tokenizer.eos_token
rm_model = AutoModelForSequenceClassification.from_pretrained(REWARD_MODEL).to(device)

# Load GPT2&SFT tokenizers/models
gpt2_tokenizer = AutoTokenizer.from_pretrained(GPT2_MODEL)
gpt2_tokenizer.pad_token = gpt2_tokenizer.eos_token
gpt2_model = AutoModelForCausalLM.from_pretrained(GPT2_MODEL).to(device)
gpt2_model.eval()

sft_tokenizer = AutoTokenizer.from_pretrained(SFT_MODEL)
sft_tokenizer.pad_token = sft_tokenizer.eos_token
sft_model = AutoModelForCausalLM.from_pretrained(SFT_MODEL).to(device)
sft_model.eval()

# Load PPO tokenizers/models
ppo_tokenizer = AutoTokenizer.from_pretrained(
    PPO_MODEL,
    revision=PPO_REVISION
)
ppo_tokenizer.pad_token = ppo_tokenizer.eos_token
ppo_model = AutoModelForCausalLM.from_pretrained(
    PPO_MODEL,
    revision=PPO_REVISION
).to(device)
ppo_model.eval()

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/663 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/475 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/874 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/498M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/129 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/507 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/470 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/874 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/498M [00:00<?, ?B/s]

Some weights of the model checkpoint at marcbishara/GenerallySarcasticTransformer were not used when initializing GPT2LMHeadModel: ['v_head.summary.bias', 'v_head.summary.weight']
- This IS expected if you are initializing GPT2LMHeadModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing GPT2LMHeadModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


generation_config.json:   0%|          | 0.00/129 [00:00<?, ?B/s]

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D(nf=2304, nx=768)
          (c_proj): Conv1D(nf=768, nx=768)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D(nf=3072, nx=768)
          (c_proj): Conv1D(nf=768, nx=3072)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

In [10]:
 # Add tags to prompt
def build_prompt(parent_text: str) -> str:
    return f"<PARENT>{parent_text.strip()}</PARENT>\n<RESPONSE>"

# Remove output tags
def extract_clean_response(full_output: str, prompt: str) -> str:
    # Remove parent comment and parent tag
    full_output = re.sub(r"<PARENT>.*?</PARENT>", "", full_output, flags=re.DOTALL)
    # Fetch text in between response tag
    m = re.search(r"<RESPONSE>(.*?)</RESPONSE>", full_output, flags=re.DOTALL)
    if m:
        return m.group(1).strip()
    # fallback for output without </RESPONSE>
    if full_output.startswith(prompt):
        return full_output[len(prompt):].strip()

    # fallback for plain text
    return full_output.strip()

In [8]:
# Generate a model response given a parent comment, return cleaned reponses
def generate_responses(model, tokenizer, parent_comments):
    responses = []
    # Loop through each parent comment
    for text in tqdm(parent_comments, desc="Generating", unit="sample"):
        prompt = build_prompt(text)
        inputs = tokenizer(prompt, return_tensors="pt").to(device)

        with torch.no_grad():
            output = model.generate(
                **inputs,
                max_new_tokens=80,
                temperature=0.7,
                top_p=0.9,
                do_sample=True,
                pad_token_id=tokenizer.eos_token_id,
            )

        full_output = tokenizer.decode(output[0], skip_special_tokens=True)
        clean_output = extract_clean_response(full_output, prompt)
        responses.append(clean_output)
    return responses

# Compute average rewards for responses using reward model
def calculate_avg_reward(prompts, responses, rm_tokenizer, rm_model, device, model_label):
    all_scores = []         # all individual reward scores
    batch_avg_rewards = []  # per-batch average reward scores

    for local_step, i in enumerate(range(0, len(prompts), BATCH_SIZE)):
        batch_prompts = prompts[i:i + BATCH_SIZE]
        batch_responses = responses[i:i + BATCH_SIZE]

        # Tokenize (prompt, response) pairs for RM
        rm_inputs = rm_tokenizer(
            batch_prompts,
            batch_responses,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=MAX_LENGTH
        ).to(device)

        with torch.no_grad():
            rm_outputs = rm_model(**rm_inputs)

        # Get sarcasm score(probability) for label = 1(sarcasm)
        sarcasm_scores = torch.softmax(rm_outputs.logits, dim=-1)[:, 1].cpu().tolist()
        # Compute batch average
        batch_avg = sum(sarcasm_scores) / len(sarcasm_scores)
        batch_avg_rewards.append(batch_avg)
        all_scores.extend(sarcasm_scores)
        wandb.log({
            f"batch_avg_reward/{model_label}": batch_avg,
            "global_step": local_step
        })
    # Compute overall average score
    overall_avg = sum(all_scores) / len(all_scores)
    wandb.log({f"overall_avg_reward/{model_label}": overall_avg})
    return overall_avg, batch_avg_rewards

In [9]:
print("\nEvaluating PPO model")
ppo_outputs = generate_responses(ppo_model, ppo_tokenizer, parent_comments)
ppo_avg, ppo_batch_rewards = calculate_avg_reward(
    parent_comments, ppo_outputs,
    rm_tokenizer, rm_model, device,
    model_label="PPO"
)

print("\nEvaluating SFT model")
sft_outputs = generate_responses(sft_model, sft_tokenizer, parent_comments)
sft_avg, sft_batch_rewards = calculate_avg_reward(
    parent_comments, sft_outputs,
    rm_tokenizer, rm_model, device,
    model_label="SFT"
)

print("\nEvaluating GPT-2 model")
gpt2_outputs = generate_responses(gpt2_model, gpt2_tokenizer, parent_comments)
gpt2_avg, gpt2_batch_rewards = calculate_avg_reward(
    parent_comments, gpt2_outputs,
    rm_tokenizer, rm_model, device,
    model_label="GPT2"
)
print("\n")
print(f"PPO model avg reward: {ppo_avg:.4f}")
print(f"SFT model avg reward:   {sft_avg:.4f}")
print(f"GPT-2 model avg reward: {gpt2_avg:.4f}")


Evaluating PPO model


Generating: 100%|██████████| 1000/1000 [02:40<00:00,  6.23sample/s]



Evaluating SFT model


Generating: 100%|██████████| 1000/1000 [03:46<00:00,  4.42sample/s]



Evaluating GPT-2 model


Generating: 100%|██████████| 1000/1000 [14:11<00:00,  1.17sample/s]




PPO model avg reward: 0.9323
SFT model avg reward:   0.7362
GPT-2 model avg reward: 0.2233


In [11]:
wandb.finish()

0,1
batch_avg_reward/GPT2,▄█▇▆▆▇▂▃▆▄▅▄▃▃▄▅▆▄▃▇▂▆▅▆▄▄█▆▅█▅▁
batch_avg_reward/PPO,▆▆▃▄▄▄▆▅▇▇▃▆█▂▇▇█▇▁▅▄▃██▄▆▄▂▅▆▇▆
batch_avg_reward/SFT,▅▅▅▄▁▄▃▄▆▄▅▄▅▄▄▄▇▅▁▃▃▃▃▅▃▅▄▅▃█▄▄
global_step,▁▁▂▂▃▃▄▄▅▅▆▇▇█▁▂▂▂▃▃▄▄▅▅▅▆▆▇▇█▁▁▂▃▃▄▅▅▅█
overall_avg_reward/GPT2,▁
overall_avg_reward/PPO,▁
overall_avg_reward/SFT,▁

0,1
batch_avg_reward/GPT2,0.1525
batch_avg_reward/PPO,0.94603
batch_avg_reward/SFT,0.71858
global_step,31.0
overall_avg_reward/GPT2,0.22334
overall_avg_reward/PPO,0.93229
overall_avg_reward/SFT,0.7362
