In [7]:
# Step 1: Install Necessary Libraries
# !pip install lime transformers

# Loading the complete dataset
import json
from tqdm import tqdm

# Step 2: Load GPT-2 Model
from transformers import GPT2Tokenizer, GPT2LMHeadModel

# Load GPT-2 model and tokenizer
fine_tuned_model_path = "/mounts/Users/cisintern/greifensteinn/.cache/huggingface/hub/models--nairdanus--gpt2-rlhf-finetuned-hate/snapshots/ddaa95816a9b5471e0123dc443568096991929cd/pytorch_model.bin"

tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
base_model = GPT2LMHeadModel.from_pretrained("gpt2")
ft_model = GPT2LMHeadModel.from_pretrained("nairdanus/gpt2-rlhf-finetuned-hate")

# Step 3: Define a Function to Get Model Predictions
import torch


# Step 4: Implement LIME Explainer
from lime.lime_text import LimeTextExplainer
import numpy as np




Some weights of the model checkpoint at nairdanus/gpt2-rlhf-finetuned-hate were not used when initializing GPT2LMHeadModel: ['base_model.lm_head.weight', 'base_model.transformer.h.0.attn.c_attn.bias', 'base_model.transformer.h.0.attn.c_attn.weight', 'base_model.transformer.h.0.attn.c_proj.bias', 'base_model.transformer.h.0.attn.c_proj.weight', 'base_model.transformer.h.0.ln_1.bias', 'base_model.transformer.h.0.ln_1.weight', 'base_model.transformer.h.0.ln_2.bias', 'base_model.transformer.h.0.ln_2.weight', 'base_model.transformer.h.0.mlp.c_fc.bias', 'base_model.transformer.h.0.mlp.c_fc.weight', 'base_model.transformer.h.0.mlp.c_proj.bias', 'base_model.transformer.h.0.mlp.c_proj.weight', 'base_model.transformer.h.1.attn.c_attn.bias', 'base_model.transformer.h.1.attn.c_attn.weight', 'base_model.transformer.h.1.attn.c_proj.bias', 'base_model.transformer.h.1.attn.c_proj.weight', 'base_model.transformer.h.1.ln_1.bias', 'base_model.transformer.h.1.ln_1.weight', 'base_model.transformer.h.1.ln_2

In [23]:

with open("data.json", "r") as f:
    data = json.load(f)

data = [
    (int(i["gold"]), i["data"]["prompt"][20:].replace(", is the word ", ", is ").replace(" considered hate speech and give an explanation why.", " hateful, yes or no?[END]"))
    for i in data["annotations"]]



def get_distribution(text: str, current_model: GPT2LMHeadModel):

    input_ids = tokenizer.encode(text, return_tensors="pt")

    with torch.no_grad():
        outputs = current_model(input_ids)

    logits = outputs.logits[:, -1, :]

    probs = torch.softmax(logits, dim=-1)

    yes_token_id = tokenizer.encode("yes", add_special_tokens=False)[0]
    yesg_token_id = tokenizer.encode("Ġyes", add_special_tokens=False)[0]
    Yes_token_id = tokenizer.encode("Yes", add_special_tokens=False)[0]
    Yesg_token_id = tokenizer.encode("ĠYes", add_special_tokens=False)[0]

    no_token_id = tokenizer.encode("no", add_special_tokens=False)[0] 
    nog_token_id = tokenizer.encode("Ġno", add_special_tokens=False)[0] 
    No_token_id = tokenizer.encode("No", add_special_tokens=False)[0] 
    Nog_token_id = tokenizer.encode("ĠNo", add_special_tokens=False)[0] 

    yes_probs = probs[0,yes_token_id].item() + probs[0,Yes_token_id].item() + probs[0,yesg_token_id].item() + probs[0,Yesg_token_id].item()
    no_probs = probs[0,no_token_id].item() + probs[0,No_token_id].item() + probs[0,nog_token_id].item() + probs[0,Nog_token_id].item()

    return {
        1: yes_probs/(yes_probs+no_probs),
        0: no_probs/(yes_probs+no_probs)
        }

    
def get_metrics():
    base_TP = 0
    base_FP = 0
    base_TN = 0
    base_FN = 0
    
    ft_TP = 0
    ft_FP = 0
    ft_TN = 0
    ft_FN = 0

    for i, d in enumerate(tqdm(data)):
        gold = d[0]
        text = d[1]

        ft_dist = get_distribution(text, ft_model).items()
        base_dist = get_distribution(text, base_model).items()


        ft_res = max(ft_dist, key=lambda x: x[1])[0]
        base_res = max(base_dist, key=lambda x: x[1])[0]

        match ft_res:
            case 0:
                if gold == 0:
                    ft_TN += 1
                else:
                    ft_FN += 1
            case 1:
                if gold == 1:
                    ft_TP += 1
                else:
                    ft_FP += 1

        match base_res:
            case 0:
                if gold == 0:
                    base_TN += 1
                else:
                    base_FN += 1
            case 1:
                if gold == 1:
                    base_TP += 1
                else:
                    base_FP += 1

    return {
            "base_TP": base_TP,
            "base_FP": base_FP,
            "base_TN": base_TN,
            "base_FN": base_FN,
            
            "ft_TP": ft_TP,
            "ft_FP": ft_FP,
            "ft_TN": ft_TN,
            "ft_FN": ft_FN,

            "total": len(data)
        }


In [4]:
import numpy as np
import matplotlib.pyplot as plt
from PIL import Image
from io import BytesIO
import os

# Initialize LIME explainer
explainer = LimeTextExplainer(class_names=['No Hate', 'Hate'])

def get_explanations(samples, current_model: GPT2LMHeadModel, model_name):

    explanations = []

    def predict_sentiment(texts):
        current_model.eval()
        probs = []
        
        for text in texts:
            prob = get_distribution(text, current_model=current_model)
            probs.append([prob[0], prob[1]])

        return np.array(probs)
    
    pbar = tqdm(samples)
    pbar.set_description("Samples for model " + model_name)
    
    for sample in pbar:
        tgt = sample["tgt"]
        text_to_explain = sample["context"] + f", is the word '{ tgt }' hateful, yes or no?[END]"

        explanation = explainer.explain_instance(text_to_explain, predict_sentiment, num_features=10)

        explanations.append(explanation)

        path = os.path.join("output_html", tgt + "_" + model_name + ".html")
        
        while os.path.exists(path):
            path = path.replace(".html", "_new.html")

        explanation.save_to_file(path)

    return explanations


In [26]:
{
    'base_TP': 30,
    'base_FP': 33,
    'base_TN': 1997,
    'base_FN': 1785,
    
    'ft_TP': 1804,
    'ft_FP': 2013,
    'ft_TN': 17,
    'ft_FN': 11,

    'total': 3845
 }
 

get_metrics()

100%|██████████| 3845/3845 [06:52<00:00,  9.33it/s]


{'base_TP': 30,
 'base_FP': 33,
 'base_TN': 1997,
 'base_FN': 1785,
 'ft_TP': 1804,
 'ft_FP': 2013,
 'ft_TN': 17,
 'ft_FN': 11,
 'total': 3845}

In [6]:
# Explain a samples

samples = [
    {"context": "Do you think a little thing like a scratch would bother me? I'm no punk.", "tgt": 'punk', "gold": 0},
    {"context": "She's a pure Oreo. You know, like the cookie, black outside and white inside.", "tgt": 'oreo', "gold": 1},
    {"context": "Shut it, ya nonce!", "tgt": 'nonce' , "gold": 1},
    {"context": "Chewing-gum is a pig to get out of your hair.", "tgt": 'pig', "gold": 0},
    {"context": "He was a pig and his apartment a pigpen; take-away containers and pizza boxes in a long, moldy stream lined his counter tops.", "tgt": 'pig', "gold": 1}
    ]

base_exps = get_explanations(samples, base_model, "base")
ft_exps = get_explanations(samples, ft_model, "ft")

Samples for model base:  20%|██        | 1/5 [04:43<18:52, 283.14s/it]


KeyboardInterrupt: 

In [18]:

# Visualize it
ft_exps[0].show_in_notebook()


In [5]:
import os
base_dir = "/mounts/Users/cisintern/greifensteinn/github/llm_sose24_6"
os.chdir(os.path.join(base_dir, 'trlx/examples/summarize_rlhf/'))
from reward_model.reward_model import GPTRewardModel
os.chdir(base_dir)

from transformers import AutoTokenizer

REWARD_CHECKPOINT_PATH = "/mounts/Users/cisintern/greifensteinn/.cache/huggingface/hub/models--nairdanus--appraising_hate_speech/snapshots/7f1c82454e4d15827501fc8162946c3fac0de27e/pytorch_model.bin"
SFT_MODEL_PATH = "gpt2"  # "/mounts/data/corp/huggingface/meta-llama/Meta-Llama-3-8B-Instruct/"


# Load the pre-trained reward model
rw_tokenizer = AutoTokenizer.from_pretrained("gpt2")
rw_tokenizer.pad_token = rw_tokenizer.eos_token
rw_model = GPTRewardModel(SFT_MODEL_PATH)
rw_model.load_state_dict(torch.load(REWARD_CHECKPOINT_PATH))
rw_model.half()
rw_model.eval()
rw_device = torch.device("cuda")  # set reward model device
rw_model.to(rw_device)

  rw_model.load_state_dict(torch.load(REWARD_CHECKPOINT_PATH))


GPTRewardModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2SdpaAttention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (v_head): Linear(in_features=768, out_features=1, bias=False)
)

In [53]:
from transformers import pipeline
from typing import List
from datasets import load_dataset

model_name = "gpt2"
# model_name = "nairdanus/gpt2-rlhf-finetuned-hate"

def generate(text: str, model_name: str):

    generator = pipeline('text-generation', model=model_name, tokenizer="gpt2", device_map="auto")

    return generator(text, max_new_tokens=50, pad_token_id=generator.tokenizer.eos_token_id)

def get_scores(samples: List[str]):
    scores_list = []
    batch_size = 2
    for i in range(0, len(samples), batch_size):
        sub_samples = samples[i : i + batch_size]
        sub_samples = ["<|startoftext|>" + chosen + "<|endoftext|>" for chosen in sub_samples]
        encodings_dict = rw_tokenizer(
            sub_samples,
            truncation=True,
            max_length=1000,
            padding="max_length",
            return_tensors="pt",
        )
        input_ids = encodings_dict["input_ids"].to(rw_device)
        attn_masks = encodings_dict["attention_mask"].to(rw_device)
        input_ids = input_ids.repeat(2, 1)
        attn_masks = attn_masks.repeat(2, 1)
        with torch.no_grad():
            sub_scores = rw_model(input_ids=input_ids, attention_mask=attn_masks)
        scores_list.append(sub_scores["chosen_end_scores"])
    scores = torch.cat(scores_list, dim=0)
    return scores

def reward_fn(samples: List[str], **kwargs):
    original_samples = [sample.split("[END]")[0] + "[END]" for sample in samples]
    original_samples = [post_summary_dict[original_samples[0]][0]["generated_text"]]
    original_scores = get_scores(original_samples)
    scores = get_scores(samples)
    norms_scores = scores - original_scores
    return norms_scores


def get_prompt_dataset(prompts, max_length):
    """
    Get the prompt after T5 decoding to make sure dictionary
    of prompts and summaries is consistent decode prompt from trlX pipeline
    """
    formatted_prompts = []
    for i in tqdm(range(len(prompts))):
        tmp = tokenizer.decode(
            tokenizer(
                prompts[i].split("[END]")[0],
                truncation=True,
                max_length=max_length,  # to make sure "TL;DR" doesnt get truncated
                add_special_tokens=False,
            )["input_ids"],
            skip_special_tokens=True,
        ).strip()
        tmp = tmp + "[END]"
        tmp = tokenizer.decode(
            tokenizer(tmp, truncation=True, max_length=max_length, add_special_tokens=False)["input_ids"],
            skip_special_tokens=True,
        ).strip()
        formatted_prompts.append(tmp)
    return formatted_prompts


In [32]:
post_summary_dict

{"Given this context: 'No wonder that Chauvelin's spies had failed to detect, in the apparently brainless nincompoop, the man whose reckless daring and resourceful ingenuity had baffled the keenest French spies, both in France and in England.', is the word 'nincompoop' considered hate speech and give an explanation why. Start your answer with 'Yes' or 'No'. Keep your answer concise and brief.[END]": [{'generated_text': "Given this context: 'No wonder that Chauvelin's spies had failed to detect, in the apparently brainless nincompoop, the man whose reckless daring and resourceful ingenuity had baffled the keenest French spies, both in France and in England.', is the word 'nincompoop' considered hate speech and give an explanation why. Start your answer with 'Yes' or 'No'. Keep your answer concise and brief. [END]\n\nI would only note that it gets very difficult to learn the facts about the 'Nincoops' as they would not want to know the names of the 'Vickers Brothers'.\n\nC. The second an

In [56]:
reward_fn([k.replace(" [END]", "[END]")]).item()

1.7431640625

In [27]:

tokenizer = AutoTokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "left"
max_length_input = 1000 - 50


#dataset = load_dataset("CarperAI/openai_summarize_tldr")
dataset = load_dataset('csv', data_files='dataset.csv', delimiter=";", split="train")
dataset = dataset.shuffle()
dataset = dataset.train_test_split(test_size=0.2)

PROMPT_END = " Start your answer with 'Yes' or 'No'. Keep your answer concise and brief. [END]"

# train_set = [(sample["prompt"] + PROMPT_END, generate(sample["prompt"] + PROMPT_END, model_name)) for sample in tqdm(dataset["train"])]
val_set = [(sample["prompt"] + PROMPT_END, generate(sample["prompt"] + PROMPT_END, model_name)) for sample in tqdm(dataset["test"])]

# Split contents into summaries and labels
# train_posts, train_summaries = zip(*train_set)
val_posts, val_summaries = zip(*val_set)

# Get the OpenAI summaries
post_summary_dict = {}
# train_prompts = get_prompt_dataset(train_posts, max_length_input)
# for i in range(len(train_prompts)):
#     post_summary_dict[train_prompts[i]] = train_summaries[i]
val_prompts = get_prompt_dataset(val_posts, max_length_input)
for i in range(len(val_prompts)):
    post_summary_dict[val_prompts[i]] = val_summaries[i]


  0%|          | 3/769 [00:04<20:49,  1.63s/it]

KeyboardInterrupt: 

In [58]:
from collections import defaultdict
import numpy as np

reward_dict = defaultdict(lambda: defaultdict(dict))

rewards = []

print("Starting")
for k, v in tqdm(post_summary_dict.items()):
    
    reward = reward_fn([k.replace(" [END]", "[END]")]).item()

    reward_dict[model_name][k] = {
        "output": v,
        "reward": reward
    }

    rewards.append(reward)

reward_dict[model_name]["metrics"] = {
    "avg": sum(rewards) / len(rewards),
    "std": np.std(np.array(rewards)),
    }


Starting


