In [115]:
# script code
import os
import platform
import random
import sys
import argparse
from typing import List, Dict, Union

import wandb
import plotly
import torch
import numpy as np
import transformers  # type: ignore
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    AutoModelForSequenceClassification,
    TrainingArguments,
)
from matplotlib import pyplot as plt  # type: ignore
from superhf.data import get_superhf_prompts  # type: ignore
from superhf.finetuning import SinglePassBestOfNTrainer  # type: ignore

# finetunning code
from accelerate import Accelerator, find_executable_batch_size
from pynvml import nvmlInit, nvmlDeviceGetHandleByIndex, nvmlDeviceGetMemoryInfo

# data code
from torch.utils.data import Dataset, DataLoader
from datasets import load_dataset

In [2]:
!pwd

/sailhome/pchatain/projects/superhf/experiments/superhf/shf_single_pass_v1


In [134]:
# args
LANGUAGE_MODEL_NAME = "eleutherai/gpt-neo-1.3B"
REWARD_MODEL_NAME = "OpenAssistant/reward-model-deberta-v3-large"
DATASET_NAME = "openai/webgpt_comparisons"
NUM_TRAIN_EXAMPLES = 8000
NUM_TEST_EXAMPLES = 100
RANDOM_SEED = 66
SHUTDOWN_AFTER_RUN = True
MAX_EXAMPLE_LENGTH = 36
# TODO ask what cache dir the user wants to use. default is ~/.cache/huggingface/

In [135]:
# code
def print_gpu_utilization() -> None:
    """
    Print the GPU memory occupied using nvidia-smi.
    """
    nvmlInit()
    handle = nvmlDeviceGetHandleByIndex(0)
    info = nvmlDeviceGetMemoryInfo(handle)
    print(f"GPU memory occupied: {info.used//1024**2} MB.")
print_gpu_utilization()
! nvidia-smi

GPU memory occupied: 43323 MB.
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Mon Feb 20 20:50:35 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 515.43.04    Driver Version: 515.43.04    CUDA Version: 11.7     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA RTX A6000    On   | 00000000:25:00.0 Off |                  Off |
| 30%   31C    P8    22W / 300W |  42868MiB / 49140MiB |      0%      Default |
|             

In [3]:
# code
def check_node() -> str:
    """
    Check if we are on the sail compute cluster. If so, return a scratch directory to
    write checkpoints to and logs for wandb. If not, return None.
    """
    if not os.path.exists("/sailhome"):
        print("Not on sail compute cluster.")
        return ""
    # print machine name
    machine_name = platform.node().split(".")[0]
    print("We are running on node: ", machine_name)

    # print available scratch directories
    print(" ".join(os.listdir(f"/{machine_name}")))

    # get a random scratch directory
    scratch_dir = "/self/scr0"
    if not os.path.exists(scratch_dir):
      scratch_dir = "/self/" + os.listdir(f"/{machine_name}")[0]
    print("Using scratch directory: ", scratch_dir)
    return scratch_dir
print(f"Check if returned dir exists: {os.path.exists(check_node())}")

We are running on node:  jagupard34
scr-sync
Using scratch directory:  /self/scr-sync
Check if returned dir exists: True


In [4]:
# main code
scratch_dir = check_node()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

wandb.login()

We are running on node:  jagupard34
scr-sync
Using scratch directory:  /self/scr-sync


[34m[1mwandb[0m: Currently logged in as: [33mpchatain[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [9]:
# main code
# Initialize random seeds for everything
random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)
torch.cuda.manual_seed(RANDOM_SEED)
transformers.enable_full_determinism(RANDOM_SEED)

In [136]:
# maind code
print_gpu_utilization()
language_model = AutoModelForCausalLM.from_pretrained(LANGUAGE_MODEL_NAME).to(
        device
    )
print_gpu_utilization()
reward_model = AutoModelForSequenceClassification.from_pretrained(
    REWARD_MODEL_NAME
).to(device)
print_gpu_utilization()
language_tokenizer = AutoTokenizer.from_pretrained(
    LANGUAGE_MODEL_NAME, padding_side="left"
)
# Hacky commands in order to get the reward model to work: 
language_tokenizer.pad_token = language_tokenizer.eos_token
language_tokenizer.padding_side = "right"
reward_tokenizer = AutoTokenizer.from_pretrained(REWARD_MODEL_NAME)
print_gpu_utilization()
! nvidia-smi

GPU memory occupied: 43323 MB.
GPU memory occupied: 43323 MB.


Downloading (…)lve/main/config.json:   0%|          | 0.00/991 [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/1.74G [00:00<?, ?B/s]

GPU memory occupied: 43323 MB.


Downloading (…)okenizer_config.json:   0%|          | 0.00/455 [00:00<?, ?B/s]

Downloading (…)"spm.model";:   0%|          | 0.00/2.46M [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/8.66M [00:00<?, ?B/s]

Downloading (…)in/added_tokens.json:   0%|          | 0.00/23.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/173 [00:00<?, ?B/s]

GPU memory occupied: 43323 MB.


In [137]:
# main code
models = {"language_model": language_model, "reward_model": reward_model}
tokenizers = {
    "language_tokenizer": language_tokenizer,
    "reward_tokenizer": reward_tokenizer,
}
print_gpu_utilization()
!nvidia-smi # why is there a slight disagreement?

GPU memory occupied: 43323 MB.
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Mon Feb 20 20:52:45 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 515.43.04    Driver Version: 515.43.04    CUDA Version: 11.7     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA RTX A6000    On   | 00000000:25:00.0 Off |                  Off |
| 30%   34C    P2    69W / 300W |  42868MiB / 49140MiB |      0%      Default |
|             

In [12]:
# data code
class SummarizeFromFeedbackDataset(Dataset):
    """
    A webGPT dataset implementation for generating completions completions as well as finetuning a model on those generated completions.
    Loaded from hugging face.s
    """
    def __init__(self, split: str) -> None:
        """
        Return either the test or the train split for this dataset.
        # TODO: Add both the train and valid split?
        """
        super().__init__()
        self.data = load_dataset("openai/summarize_from_feedback", "comparisons")[split]
    
    def __len__(self) -> int:
        return len(self.data)
    
    def __getitem__(self, idx: int) -> Dict[str, str]:
        """
        Returns a tuple of the prompt, the completion, and the reward. The completions are re-ordered so that it is always
        chosen followed by rejected. 
        """
        assert self.data[idx]["choice"] in [0, 1], f"The choice for entry {idx} is not 0 or 1, it was {self.data[idx]['choice']}."
        prompt = self.data[idx]["info"]["post"]
        completion_chosen = self.data[idx]["summaries"][self.data[idx]["choice"]]["text"]
        completion_rejected = self.data[idx]["summaries"][1 - self.data[idx]["choice"]]["text"]
        return {
            "prompt": prompt,
            "completions": [completion_chosen, completion_rejected]
        }

class WebgptComparisons(Dataset):
    """
    A webGPT dataset implementation for generating completions completions as well as finetuning a model on those generated completions.
    Loaded from hugging face.s
    """
    def __init__(self, split: str) -> None:
        """
        Return either the test or the train split for this dataset.
        # TODO: Add both the train and valid split?
        """
        super().__init__()
        self.data = load_dataset("openai/webgpt_comparisons")[split]
    
    def __len__(self) -> int:
        return len(self.data)
    
    def __getitem__(self, idx: int) -> Dict[str, str]:
        """
        Returns a tuple of the prompt, the completion, and the reward. The completions are re-ordered so that it is always
        chosen followed by rejected. 
        """
        prompt = self.data[idx]["question"]["full_text"]
        score_0, score_1 = self.data[idx]["score_0"], self.data[idx]["score_1"]
        # completion_chosen_idx = "answer_0" if score_0 >= 0 else "answer_1"
        # completion_rejected_idx = "answer_1" if score_0 >= 0 else "answer_0"
        # completion_chosen = self.data[idx][completion_chosen_idx]
        # completion_rejected = self.data[idx][completion_rejected_idx]
        completion_chosen = self.data[idx]["answer_0"]
        completion_rejected = self.data[idx]["answer_1"]
        return {
            "prompt": prompt,
            "completions": [completion_chosen, completion_rejected],
            "scores": [score_0, score_1]
        }

# class FinetuneDataset(Dataset):
#     """
#     A dataset containing only the completions we plan to train on.
#     """
#     def __init__(self, dataset: Dataset, unique_prompt_indices=[]) -> None:
#         super().__init__()
#         self.data = dataset
#         self.unique_prompt_indices = unique_prompt_indices
#     def __len__(self) -> int:
#         return len(self.data)
    
#     def __getitem__(self, index) -> str:
#         """
#         Returns a single completion.
#         """
#         answer_completion_pair = self.data[index]["prompt"] + "[SEP]" + self.data[index]["completions"][0]
#         return answer_completion_pair

class PromptDataset(Dataset):
    """
    Get only the prompts from a particular dataset.
    TODO: implement fully
    """
    def __init__(self, dataset: Dataset) -> None:
        super().__init__()
        self.data = dataset.data
    
    def __len__(self) -> int:
        return len(self.data)
    
    def __getitem__(self, idx: int) -> str:
        """
        Dataset must have a 'prompt' key containing single prompt at each example.
        """
        return self.data[idx]["prompt"]


In [110]:
# main generating code
generating_dataset = WebgptComparisons("train")
print(f"Number of examples in generating dataset: {len(generating_dataset.data)}")

Found cached dataset webgpt_comparisons (/sailhome/pchatain/.cache/huggingface/datasets/openai___webgpt_comparisons/default/0.0.0/8b5d5879cdc98c4c0099af6053dffe8d504588d43d3b11f1b1ec223ab1e8db0a)


  0%|          | 0/1 [00:00<?, ?it/s]

Number of examples in generating dataset: 19578


In [12]:
print(generating_dataset.data[2]["question"]["full_text"])

Heterophobia is the irrational fear of what


#### Figuring out prefixing
 Voiced by Harry Shearer, what Simpsons character was modeled after Ted Koppel?◼[1] Kent Brockman (en.wikipedia.org)

Kent Brockman is a fictional character in the animated television series The Simpsons. He is voiced by Harry Shearer and first appeared in the episode "Krusty Gets Busted". He is a grumpy, self-centered local Springfield news anchor.◼[2] Krusty the Clown (en.wikipedia.org)

Krusty was created by cartoonist Matt Groening and partially inspired by Rusty Nails, a television clown from Groening's hometown of Portland, Oregon.◼

In [13]:
# print(generating_dataset.data[0]["tokens_0"]["prefix"])
print(f"The untokenized version is\n {language_tokenizer.decode(generating_dataset.data[0]['tokens_0']['prefix'])}")
input_str_example1 = generating_dataset.data[0]["quotes_0"]['extract']
print(generating_dataset.data[0]["question"])
print(generating_dataset.data[0]["quotes_0"])
# from matching the above, it looks like the forumla is full_text<SEP>Title[0]\n\nExtract[0]<SEP>title[1]\n\nExtract[1]
print("----")
print(generating_dataset.data[0]["question"]["full_text"] + "[SEP]" + generating_dataset.data[0]["quotes_0"]["title"][0] + "\n\n" + generating_dataset.data[0]["quotes_0"]["extract"][0] + "[SEP]" + generating_dataset.data[0]["quotes_0"]["title"][1] + "\n\n" + generating_dataset.data[0]["quotes_0"]["extract"][1])
print("----")
print(input_str_example1)
print(language_tokenizer(generating_dataset.data[0]["question"]["full_text"] + input_str_example1[0] + input_str_example1[1]).input_ids)
print()
print(generating_dataset.data[0]['tokens_0']['prefix'])
print(generating_dataset.data[0]["score_1"])
print(generating_dataset.data[0]["answer_0"])
print("---")
print(generating_dataset.data[0]["answer_1"])

The untokenized version is
 Voiced by Harry Shearer, what Simpsons character was modeled after Ted Koppel?◼[1] Kent Brockman (en.wikipedia.org)

Kent Brockman is a fictional character in the animated television series The Simpsons. He is voiced by Harry Shearer and first appeared in the episode "Krusty Gets Busted". He is a grumpy, self-centered local Springfield news anchor.◼[2] Krusty the Clown (en.wikipedia.org)

Krusty was created by cartoonist Matt Groening and partially inspired by Rusty Nails, a television clown from Groening's hometown of Portland, Oregon.◼
{'dataset': 'triviaqa', 'id': '18c654a169eb80287f4353d33e701b1c', 'full_text': 'Voiced by Harry Shearer, what Simpsons character was modeled after Ted Koppel?'}
{'title': ['Kent Brockman (en.wikipedia.org)', 'Krusty the Clown (en.wikipedia.org)'], 'extract': ['Kent Brockman is a fictional character in the animated television series The Simpsons. He is voiced by Harry Shearer and first appeared in the episode "Krusty Gets Bus

In [138]:

t_z = generating_dataset.data[0]
language_tokenizer.pad_token = language_tokenizer.eos_token
language_tokenizer.padding_side = "right"
input_test = [t_z["question"]["full_text"] + "[SEP] " + t_z["answer_0"] , t_z["question"]["full_text"] + "[SEP] " + t_z["answer_1"]]
for i in range(8, 32):
  t_z = generating_dataset.data[i+1]
  input_test.append(t_z["question"]["full_text"] + t_z["answer_0"])
  input_test.append(t_z["question"]["full_text"] + t_z["answer_1"])
# print(t_z)
# full_text<SEP>Title[0]\n\nExtract[0]<SEP>title[1]\n\nExtract[1]
# test_input = t_z["question"]["full_text"]
# test_input += "◼[1] " + t_z["quotes_0"]["title"][0] + "\n\n"
# test_input += t_z["quotes_0"]["extract"][0] + "◼[2] " + t_z["quotes_0"]["title"][1] + "\n\n"
# test_input += t_z["quotes_0"]["extract"][1] + "◼ " + t_z["answer_0"]
# print(test_input)
# input_tests = language_tokenizer(test_input, return_tensors="pt").to(device)

with torch.no_grad():
  # print(reward_model(**input_tests))
  # print(reward_model(**language_tokenizer(, return_tensors="pt").to(device)))
  # print(input_test)
  # change the tokenizer to padd at the end
  dict_input = {**reward_tokenizer(input_test, padding=True, truncation=True, return_tensors="pt").to(device)}
  # print(language_tokenizer(input_test, padding=True, truncation=True, return_tensors="pt").attention_mask)
  # attn_mask = language_tokenizer(input_test, padding=True, truncation=True, return_tensors="pt").attention_mask
  # Make the entire attention mask 1
  # attn_mask = torch.ones(dict_input["attention_mask"].shape).to(device)
  # attn_mask[0, 0] = 0
  # dict_input["attention_mask"] = attn_mask
  print(dict_input.keys())
  # print(dict_input["token_type_ids"])
  print(reward_model(**dict_input))

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])
SequenceClassifierOutput(loss=None, logits=tensor([[ 0.1831],
        [-0.2761],
        [-0.3608],
        [-0.8146],
        [-0.2934],
        [-0.1504],
        [ 0.1949],
        [-0.2499],
        [-0.5041],
        [-0.7924],
        [-1.2534],
        [-0.6537],
        [-0.8017],
        [-0.7752],
        [ 1.0995],
        [ 0.8629],
        [-0.5990],
        [-0.9155],
        [ 0.5163],
        [-0.6813],
        [ 1.3456],
        [ 0.9816],
        [-1.3254],
        [ 0.3311],
        [ 0.3018],
        [ 0.7698],
        [-0.1265],
        [ 1.2438],
        [-0.6673],
        [-0.6716],
        [-1.7653],
        [ 0.0703],
        [ 0.4887],
        [ 0.4995],
        [-0.6711],
        [ 0.3633],
        [ 0.4317],
        [-1.2660],
        [-1.2369],
        [-1.9949],
        [-0.1997],
        [ 0.2107],
        [-0.5659],
        [-0.4063],
        [-1.1667],
        [-0.3571],
        [-2.1465],
   

In [13]:
torch.cuda.current_device()

0

#### Code again

In [139]:
# finetuning code
def process_batch_inputs(model, tokenizer=None, examples=None, max_length=512):
    """
    Process a batch of examples and return the logits of the model

    Args:
      model: the model to use
      tokenizer: the tokenizer to use
      examples: a list of examples to process
      max_length: the max length of the examples

    Returns:
      the logits of the model: a tensor of shape (batch_size, max_length)
    """
    assert examples != None, "examples cannot be None"
    if tokenizer != None:
      examples = tokenizer(examples, padding=True, truncation=True, max_length=max_length, return_tensors='pt').to(device)
    with torch.no_grad():
      score = model(**examples).logits.detach().flatten()
    print("processed one batch")
    return score
reward_model.eval()
@find_executable_batch_size(starting_batch_size=128)
def score_completions(batch_size):
    """
    Process a batch of examples and return the logits of the model.
    Automatically finds the batch size that fits on the GPU.

    Args:
      model: the model to use
      tokenizer: the tokenizer to use
      examples: a list of examples to process
      max_length: the max length of the examples

    Returns:
      the logits of the model: a tensor of shape (batch_size, max_length)
    """
    generating_dataset = DataLoader(WebgptComparisons("train"), shuffle=False, batch_size=batch_size)
    print(f"generating with batch_size {batch_size}...")
    def collate_fn(batch):
      """ Collates examples into a list """
      examples = []
      for i in range(len(batch["prompt"])):
          examples.append(batch["prompt"][i] + "[SEP] " + batch["completions"][0][i])
          examples.append(batch["prompt"][i] + "[SEP] " + batch["completions"][1][i])
      return examples

    scores = {"model": [], "label": []}
    count = 0
    for batch in generating_dataset:
        if len(batch["completions"][0]) != batch_size:
          continue
        # print(batch["completions"][0][0])
        scores["label"].append(batch["scores"])
        examples = collate_fn(batch)
        # for ex in examples:
        #   print(ex)
        #   print("--")
        processed = process_batch_inputs(reward_model, tokenizer=reward_tokenizer, examples=examples, max_length=512).to("cpu")
        processed = processed.reshape(batch_size, 2)
        scores["model"].append(processed)
        # if count == 2:
        #   break
        # count += 1
    return scores
print_gpu_utilization()

GPU memory occupied: 43323 MB.


In [130]:
print_gpu_utilization()

GPU memory occupied: 44759 MB.


In [131]:
scored_completions3 = score_completions()

Found cached dataset webgpt_comparisons (/sailhome/pchatain/.cache/huggingface/datasets/openai___webgpt_comparisons/default/0.0.0/8b5d5879cdc98c4c0099af6053dffe8d504588d43d3b11f1b1ec223ab1e8db0a)


  0%|          | 0/1 [00:00<?, ?it/s]

generating with batch_size 128...
processed one batch
processed one batch
processed one batch
processed one batch
processed one batch
processed one batch
processed one batch
processed one batch
processed one batch
processed one batch
processed one batch
processed one batch
processed one batch
processed one batch
processed one batch
processed one batch
processed one batch
processed one batch
processed one batch
processed one batch
processed one batch
processed one batch
processed one batch
processed one batch
processed one batch
processed one batch
processed one batch
processed one batch
processed one batch
processed one batch
processed one batch
processed one batch
processed one batch
processed one batch
processed one batch
processed one batch
processed one batch
processed one batch
processed one batch
processed one batch
processed one batch
processed one batch
processed one batch
processed one batch
processed one batch
processed one batch
processed one batch
processed one batch
proces

In [104]:
scored_completions2 = score_completions()

Found cached dataset webgpt_comparisons (/sailhome/pchatain/.cache/huggingface/datasets/openai___webgpt_comparisons/default/0.0.0/8b5d5879cdc98c4c0099af6053dffe8d504588d43d3b11f1b1ec223ab1e8db0a)


  0%|          | 0/1 [00:00<?, ?it/s]

generating with batch_size 128...
processed one batch
processed one batch
processed one batch
processed one batch
processed one batch
processed one batch
processed one batch
processed one batch
processed one batch
processed one batch
processed one batch
processed one batch
processed one batch
processed one batch
processed one batch
processed one batch
processed one batch
processed one batch
processed one batch
processed one batch
processed one batch
processed one batch
processed one batch
processed one batch
processed one batch
processed one batch
processed one batch
processed one batch
processed one batch
processed one batch
processed one batch
processed one batch
processed one batch
processed one batch
processed one batch
processed one batch
processed one batch
processed one batch
processed one batch
processed one batch
processed one batch
processed one batch
processed one batch
processed one batch
processed one batch
processed one batch
processed one batch
processed one batch
proces

In [99]:
scored_completions = score_completions()

Found cached dataset webgpt_comparisons (/sailhome/pchatain/.cache/huggingface/datasets/openai___webgpt_comparisons/default/0.0.0/8b5d5879cdc98c4c0099af6053dffe8d504588d43d3b11f1b1ec223ab1e8db0a)


  0%|          | 0/1 [00:00<?, ?it/s]

generating with batch_size 256...


Found cached dataset webgpt_comparisons (/sailhome/pchatain/.cache/huggingface/datasets/openai___webgpt_comparisons/default/0.0.0/8b5d5879cdc98c4c0099af6053dffe8d504588d43d3b11f1b1ec223ab1e8db0a)


  0%|          | 0/1 [00:00<?, ?it/s]

generating with batch_size 128...
processed one batch
processed one batch
processed one batch
processed one batch
processed one batch
processed one batch
processed one batch
processed one batch
processed one batch
processed one batch
processed one batch
processed one batch
processed one batch
processed one batch
processed one batch
processed one batch
processed one batch
processed one batch
processed one batch
processed one batch
processed one batch
processed one batch
processed one batch
processed one batch
processed one batch
processed one batch
processed one batch
processed one batch
processed one batch
processed one batch
processed one batch
processed one batch
processed one batch
processed one batch
processed one batch
processed one batch
processed one batch
processed one batch
processed one batch
processed one batch
processed one batch
processed one batch
processed one batch
processed one batch
processed one batch
processed one batch
processed one batch
processed one batch
proces

In [122]:
def sort_scores(scored_completions):
    obtained_labels = torch.cat([torch.cat((scored_completions["label"][i][0].unsqueeze(dim=0), scored_completions["label"][i][1].unsqueeze(dim=0)), dim=0).transpose(1,0) for i in range(len(scored_completions["model"]))])
    obtained_model_scores = torch.cat(scored_completions["model"])

    model_selections = [(1.0 * (score[0] > score[1])).item() for score in obtained_model_scores]
    true_selections = [(1.0 * (score[0] > score[1])).item() for score in obtained_labels]
    equal_selections = [1.0 * (score[0] == score[1]).item() for score in obtained_labels]

    print(f"The sume of the true selections is {sum(true_selections)}")
    print(f"The sume of the model selections is {sum(model_selections)}")
    print(f"The sume of the equal selections is {sum(equal_selections)}")
    print(f"The length of alll selections is {len(true_selections)}")

    nCorrect = 0
    nWrong = 0
    nEqual = 0
    nWrongNotEqual = 0
    chosen_scores = []
    rejected_scores = []
    equal_scores = []
    # go over the model completions and make a list of the selected ones based on score
    for entry in range(len(model_selections)):
        if equal_selections[entry] == 1:
            equal_scores.append(obtained_model_scores[entry][0])
            equal_scores.append(obtained_model_scores[entry][1])
            if np.abs(obtained_model_scores[entry][0] - obtained_model_scores[entry][1]) < 0.2:
                nEqual += 1
            else:
                nWrongNotEqual += 1
        elif true_selections[entry] == 1:
            chosen_scores.append(obtained_model_scores[entry][0])
            rejected_scores.append(obtained_model_scores[entry][1])
            if obtained_model_scores[entry][0] > obtained_model_scores[entry][1]:
                nCorrect += 1
            else:
                nWrong += 1
        elif true_selections[entry] == 0:
            chosen_scores.append(obtained_model_scores[entry][1])
            rejected_scores.append(obtained_model_scores[entry][0])
            if obtained_model_scores[entry][0] < obtained_model_scores[entry][1]:
                nCorrect += 1
            else:
                nWrong += 1
        else:
            print("error")

    print(f"Num correct: {nCorrect}, Num wrong: {nWrong}, Num equal: {nEqual}, Num wrong not equal: {nWrongNotEqual}")
    return chosen_scores, rejected_scores, equal_scores

    

In [118]:
# make a plotly graph of the scored_completions
print(scored_completions["model"][0].shape)
print(scored_completions["label"][0][0].shape)
obtained_labels = torch.cat([torch.cat((scored_completions["label"][i][0].unsqueeze(dim=0), scored_completions["label"][i][1].unsqueeze(dim=0)), dim=0).transpose(1,0) for i in range(len(scored_completions["model"]))])
print(obtained_labels.shape)
obtained_model_scores = torch.cat(scored_completions["model"])
print(obtained_model_scores.shape)
# print(scored_completions["model"][0])

chosen_scores = []
rejected_scores = []
equal_scores = []

# go over the model completions and make a list of the selected ones based on score
model_selections = [(1.0 * (score[0] > score[1])).item() for score in obtained_model_scores]
# a 1 means the model chose the first completion
# print(model_selections)
true_selections = [(1.0 * (score[0] > score[1])).item() for score in obtained_labels]
# a 1 means the model chose the second completion
# print(true_selections)
equal_selections = [1.0 * (score[0] == score[1]).item() for score in obtained_labels]
# print(equal_selections)

print(f"The sume of the true selections is {sum(true_selections)}")
print(f"The sume of the model selections is {sum(model_selections)}")
print(f"The sume of the equal selections is {sum(equal_selections)}")
print(f"The length of alll selections is {len(true_selections)}")

nCorrect = 0
nWrong = 0
nEqual = 0
nWrongNotEqual = 0
# go over the model completions and make a list of the selected ones based on score
for entry in range(len(model_selections)):
  if equal_selections[entry] == 1:
    equal_scores.append(obtained_model_scores[entry][0])
    equal_scores.append(obtained_model_scores[entry][1])
    if np.abs(obtained_model_scores[entry][0] - obtained_model_scores[entry][1]) < 0.2:
      nEqual += 1
    else:
      nWrongNotEqual += 1
  elif true_selections[entry] == 1:
    chosen_scores.append(obtained_model_scores[entry][0])
    rejected_scores.append(obtained_model_scores[entry][1])
    if obtained_model_scores[entry][0] > obtained_model_scores[entry][1]:
      nCorrect += 1
    else:
      nWrong += 1
  elif true_selections[entry] == 0:
    chosen_scores.append(obtained_model_scores[entry][1])
    rejected_scores.append(obtained_model_scores[entry][0])
    if obtained_model_scores[entry][0] < obtained_model_scores[entry][1]:
      nCorrect += 1
    else:
      nWrong += 1
  else:
    print("error")

print(f"Num correct: {nCorrect}, Num wrong: {nWrong}, Num equal: {nEqual}")

torch.Size([128, 2])
torch.Size([128])
torch.Size([19456, 2])
torch.Size([19456, 2])
The sume of the true selections is 7206.0
The sume of the model selections is 9526.0
The sume of the equal selections is 5203.0
The length of alll selections is 19456
Num correct: 7093, Num wrong: 7160, Num equal: 1639


In [126]:
# Create the histograms of chosen_scores, rejected_scores, equal_scores
chosen_scores, rejected_scores, equal_scores = sort_scores(scored_completions2)
fig = plotly.graph_objects.Figure()

fig.add_trace(plotly.graph_objects.Histogram(x=chosen_scores, name='Choosen Scores', opacity=0.7))
fig.add_trace(plotly.graph_objects.Histogram(x=rejected_scores, name='Rejected Scores', opacity=0.7))
fig.add_trace(plotly.graph_objects.Histogram(x=equal_scores, name='Equal Scores', opacity=0.25))

# Adjust the layout
fig.update_layout(title='Comparison of Model and Label Scores',
                  xaxis_title='Score',
                  yaxis_title='Count',
                  barmode='overlay')

# Show the figure
fig.show()

The sume of the true selections is 7206.0
The sume of the model selections is 9526.0
The sume of the equal selections is 5203.0
The length of alll selections is 19456
Num correct: 7093, Num wrong: 7160, Num equal: 1639, Num wrong not equal: 3564


In [119]:
# make a plotly graph of the scored_completions2
print(scored_completions2["model"][0].shape)
print(scored_completions2["label"][0][0].shape)
obtained_labels = torch.cat([torch.cat((scored_completions2["label"][i][0].unsqueeze(dim=0), scored_completions2["label"][i][1].unsqueeze(dim=0)), dim=0).transpose(1,0) for i in range(len(scored_completions2["model"]))])
print(obtained_labels.shape)
obtained_model_scores = torch.cat(scored_completions2["model"])
print(obtained_model_scores.shape)
# print(scored_completions2["model"][0])

chosen_scores = []
rejected_scores = []
equal_scores = []

# go over the model completions and make a list of the selected ones based on score
model_selections = [(1.0 * (score[0] > score[1])).item() for score in obtained_model_scores]
# a 1 means the model chose the first completion
# print(model_selections)
true_selections = [(1.0 * (score[0] > score[1])).item() for score in obtained_labels]
# a 1 means the model chose the second completion
# print(true_selections)
equal_selections = [1.0 * (score[0] == score[1]).item() for score in obtained_labels]
# print(equal_selections)

print(f"The sume of the true selections is {sum(true_selections)}")
print(f"The sume of the model selections is {sum(model_selections)}")
print(f"The sume of the equal selections is {sum(equal_selections)}")
print(f"The length of alll selections is {len(true_selections)}")

# go over the model completions and make a list of the selected ones based on score
for entry in range(len(model_selections)):
  if equal_selections[entry] == 1:
    equal_scores.append(obtained_model_scores[entry][0])
    equal_scores.append(obtained_model_scores[entry][1])
  elif true_selections[entry] == 1:
    chosen_scores.append(obtained_model_scores[entry][0])
    rejected_scores.append(obtained_model_scores[entry][1])
  elif true_selections[entry] == 0:
    chosen_scores.append(obtained_model_scores[entry][1])
    rejected_scores.append(obtained_model_scores[entry][0])
  else:
    print("error")


torch.Size([128, 2])
torch.Size([128])
torch.Size([19456, 2])
torch.Size([19456, 2])
The sume of the true selections is 7206.0
The sume of the model selections is 9526.0
The sume of the equal selections is 5203.0
The length of alll selections is 19456


In [133]:
# Create the histograms of chosen_scores, rejected_scores, equal_scores
chosen_scores, rejected_scores, equal_scores = sort_scores(scored_completions)
fig = plotly.graph_objects.Figure()

fig.add_trace(plotly.graph_objects.Histogram(x=chosen_scores, name='Choosen Scores', opacity=0.7))
fig.add_trace(plotly.graph_objects.Histogram(x=rejected_scores, name='Rejected Scores', opacity=0.7))
fig.add_trace(plotly.graph_objects.Histogram(x=equal_scores, name='Equal Scores', opacity=0.25))

# Adjust the layout
fig.update_layout(title='Comparison of Model and Label Scores',
                  xaxis_title='Score',
                  yaxis_title='Count',
                  barmode='overlay')

# Show the figure
fig.show()

The sume of the true selections is 7206.0
The sume of the model selections is 9526.0
The sume of the equal selections is 5203.0
The length of alll selections is 19456
Num correct: 7093, Num wrong: 7160, Num equal: 1639, Num wrong not equal: 3564


In [132]:
# Create the histograms of chosen_scores, rejected_scores, equal_scores
chosen_scores, rejected_scores, equal_scores = sort_scores(scored_completions3)
fig = plotly.graph_objects.Figure()

fig.add_trace(plotly.graph_objects.Histogram(x=chosen_scores, name='Choosen Scores', opacity=0.7))
fig.add_trace(plotly.graph_objects.Histogram(x=rejected_scores, name='Rejected Scores', opacity=0.7))
fig.add_trace(plotly.graph_objects.Histogram(x=equal_scores, name='Equal Scores', opacity=0.25))

# Adjust the layout
fig.update_layout(title='Comparison of Model and Label Scores',
                  xaxis_title='Score',
                  yaxis_title='Count',
                  barmode='overlay')

# Show the figure
fig.show()

The sume of the true selections is 7206.0
The sume of the model selections is 9399.0
The sume of the equal selections is 5203.0
The length of alll selections is 19456
Num correct: 9581, Num wrong: 4672, Num equal: 1845, Num wrong not equal: 3358
