# Test Notebook

### Prefix

In [1]:
import sys
sys.path.insert(0,'..')

### Imports

The language-modeling example at the end of this notebook follows the Hugging Face guide: https://huggingface.co/docs/transformers/tasks/language_modeling

In [4]:
from superhf import skeleton
from datasets import load_dataset
from transformers import AutoModelForSequenceClassification, AutoTokenizer


# Sample Usage of the OpenAssistant Reward Model (DeBERTa-v3 base, ~700 MB)

## Load the Model

In [5]:
# Identifier passed to both `from_pretrained` calls below, so the model and
# tokenizer are loaded from the same checkpoint name.
reward_name = "OpenAssistant/reward-model-deberta-v3-base"
rank_model = AutoModelForSequenceClassification.from_pretrained(reward_name)
tokenizer = AutoTokenizer.from_pretrained(reward_name)

Downloading (…)lve/main/config.json: 100%|██████████| 988/988 [00:00<00:00, 267kB/s]
Downloading (…)"pytorch_model.bin";: 100%|██████████| 738M/738M [00:47<00:00, 15.5MB/s] 
Downloading (…)okenizer_config.json: 100%|██████████| 454/454 [00:00<00:00, 185kB/s]
Downloading (…)"spm.model";: 100%|██████████| 2.46M/2.46M [00:00<00:00, 5.25MB/s]
Downloading (…)/main/tokenizer.json: 100%|██████████| 8.66M/8.66M [00:01<00:00, 7.85MB/s]
Downloading (…)in/added_tokens.json: 100%|██████████| 23.0/23.0 [00:00<00:00, 8.64kB/s]
Downloading (…)cial_tokens_map.json: 100%|██████████| 173/173 [00:00<00:00, 82.3kB/s]


## Score sample question/answer pairs

In [13]:

# Score one question/answer pair with the reward model loaded above.
question = "Explain nuclear fusion like I am five"
answer = (
    "Nuclear fusion is the process by which two or more protons and neutrons combine to form a single nucleus. "
    "It is a very important process in the universe, as it is the source of energy for stars and galaxies. "
    "Nuclear fusion is also a key process in the production of energy for nuclear power plants."
)
# The tokenizer receives the question and the answer as a text pair.
inputs = tokenizer(question, answer, return_tensors='pt')
# Detach the logits from the autograd graph before printing them.
score = rank_model(**inputs).logits.detach()
print(score)

tensor([[0.5816]])


In [18]:
# Same question as the previous cell, scored against an answer that omits the
# sentence about stars and galaxies.
question = "Explain nuclear fusion like I am five"
answer = (
    "Nuclear fusion is the process by which two or more protons and neutrons combine to form a single nucleus. "
    "Nuclear fusion is also a key process in the production of energy for nuclear power plants."
)
# The tokenizer receives the question and the answer as a text pair.
inputs = tokenizer(question, answer, return_tensors='pt')
# Detach the logits from the autograd graph before printing them.
score = rank_model(**inputs).logits.detach()
print(score)

tensor([[0.8107]])


In [22]:
# Same question again, scored against an answer with an extra closing sentence
# about the sun.
question = "Explain nuclear fusion like I am five"
answer = (
    "Nuclear fusion is the process by which two or more protons and neutrons combine to form a single nucleus. "
    "Nuclear fusion is also a key process in the production of energy for nuclear power plants. "
    "Nuclear fusion is what powers the sun."
)
# The tokenizer receives the question and the answer as a text pair.
inputs = tokenizer(question, answer, return_tensors='pt')
# Detach the logits from the autograd graph before printing them.
score = rank_model(**inputs).logits.detach()
print(score)

tensor([[0.2474]])


# Example Code

In [5]:
# eli5 = load_dataset("eli5", split="train_asks[:5000]")
# eli5 = eli5.train_test_split(test_size=0.2)

In [6]:
# eli5["train"][0]

{'q_id': 'itd2e',
 'title': 'What could be accomplished if we were able to adjust the strength of each fundamental force?',
 'selftext': 'If the force of gravity is only weak because we are separated from its concentration by one or more extra dimensions, then a similar separation from the areas of concentration of the other forces should also be possible, which would result in the adjustments mentioned in the title. So what would the ramifications of such adjustments be?',
 'document': '',
 'subreddit': 'askscience',
 'answers': {'a_id': ['c26hboo', 'c26jbsf'],
  'text': ["> If the force of gravity is only weak because we are separated from its concentration by one or more extra dimensions, then a similar separation from the areas of concentration of the other forces should also be possible,\n\nNo. This... really doesn't mean much of anything, I'm afraid. I can answer the last part of your question, but I have to point out that this whole premise part does not make sense.\n\nIf you co

In [8]:
# tokenizer = AutoTokenizer.from_pretrained("distilgpt2")

Downloading (…)lve/main/config.json: 100%|██████████| 762/762 [00:00<00:00, 322kB/s]
Downloading (…)olve/main/vocab.json: 100%|██████████| 1.04M/1.04M [00:00<00:00, 2.12MB/s]
Downloading (…)olve/main/merges.txt: 100%|██████████| 456k/456k [00:00<00:00, 1.25MB/s]
Downloading (…)/main/tokenizer.json: 100%|██████████| 1.36M/1.36M [00:00<00:00, 2.43MB/s]


In [9]:
# eli5 = eli5.flatten()
# eli5["train"][0]

{'q_id': 'itd2e',
 'title': 'What could be accomplished if we were able to adjust the strength of each fundamental force?',
 'selftext': 'If the force of gravity is only weak because we are separated from its concentration by one or more extra dimensions, then a similar separation from the areas of concentration of the other forces should also be possible, which would result in the adjustments mentioned in the title. So what would the ramifications of such adjustments be?',
 'document': '',
 'subreddit': 'askscience',
 'answers.a_id': ['c26hboo', 'c26jbsf'],
 'answers.text': ["> If the force of gravity is only weak because we are separated from its concentration by one or more extra dimensions, then a similar separation from the areas of concentration of the other forces should also be possible,\n\nNo. This... really doesn't mean much of anything, I'm afraid. I can answer the last part of your question, but I have to point out that this whole premise part does not make sense.\n\nIf you

In [10]:
def preprocess_function(examples):
    """Join each example's answer texts with spaces and tokenize the batch.

    Args:
        examples: Mapping with an ``"answers.text"`` entry whose items are
            iterables of strings (one iterable per example).

    Returns:
        Whatever the notebook-global ``tokenizer`` returns when called on the
        list of joined strings with ``truncation=True``.

    NOTE(review): ``tokenizer`` is looked up in the notebook's global scope at
    call time. The cell that would rebind it to "distilgpt2" is commented out,
    so this presumably resolves to the reward model's tokenizer — confirm that
    is intended.
    """
    joined_answers = []
    for answer_texts in examples["answers.text"]:
        joined_answers.append(" ".join(answer_texts))
    return tokenizer(joined_answers, truncation=True)

In [11]:
# tokenized_eli5 = eli5.map(
#     preprocess_function,
#     batched=True,
#     num_proc=4,
#     remove_columns=eli5["train"].column_names,
# )

#0:   0%|          | 0/1 [00:00<?, ?ba/s]
[A

[A[A
#0: 100%|██████████| 1/1 [00:01<00:00,  1.53s/ba]

#2: 100%|██████████| 1/1 [00:01<00:00,  1.53s/ba]
#0: 100%|██████████| 1/1 [00:01<00:00,  1.55s/ba]
#3: 100%|██████████| 1/1 [00:01<00:00,  1.53s/ba]
#1: 100%|██████████| 1/1 [00:01<00:00,  1.55s/ba]
#0:   0%|          | 0/1 [00:00<?, ?ba/s]
[A

[A[A

#3: 100%|██████████| 1/1 [00:00<00:00,  2.94ba/s]

#1: 100%|██████████| 1/1 [00:00<00:00,  2.68ba/s]
#2: 100%|██████████| 1/1 [00:00<00:00,  2.71ba/s]
#0: 100%|██████████| 1/1 [00:00<00:00,  2.44ba/s]


In [12]:
# Length of every chunk produced by `group_texts`.
block_size = 128


def group_texts(examples):
    """Concatenate every column of a batch and re-split it into fixed-size blocks.

    Args:
        examples: Mapping from column name to a list of per-example sequences.
            It must contain an ``"input_ids"`` column.

    Returns:
        A dict with the same keys as ``examples``, where each value is a list
        of chunks of exactly ``block_size`` items, plus a ``"labels"`` entry
        that is a shallow copy of the ``"input_ids"`` chunk list. Trailing
        items that do not fill a whole block are dropped.

    Raises:
        IndexError: If ``examples`` has no columns.
        KeyError: If ``examples`` has no ``"input_ids"`` column.
    """
    # Flatten each column in one linear pass. The previous `sum(lists, [])`
    # built a new list on every addition, which is quadratic in batch size.
    concatenated_examples = {
        k: [item for sequence in examples[k] for item in sequence]
        for k in examples.keys()
    }
    # The flattened length of the first column decides how many blocks are cut
    # from every column.
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # Round down to a multiple of block_size so every chunk is full.
    total_length = (total_length // block_size) * block_size
    result = {
        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    }
    # Labels start out as a copy of the input ids (outer list copied only).
    result["labels"] = result["input_ids"].copy()
    return result

In [13]:
# Apply `group_texts` to the tokenized data in batched mode with 4 processes.
# NOTE(review): `tokenized_eli5` is only assigned in a cell that is currently
# commented out, so this line raises NameError on a fresh kernel — confirm
# whether the upstream cells should be re-enabled.
lm_dataset = tokenized_eli5.map(
    group_texts,
    batched=True,
    num_proc=4,
)

#0:   0%|          | 0/1 [00:00<?, ?ba/s]
[A

[A[A
#2: 100%|██████████| 1/1 [00:05<00:00,  5.11s/ba]
#0: 100%|██████████| 1/1 [00:05<00:00,  5.19s/ba]


#3: 100%|██████████| 1/1 [00:05<00:00,  5.25s/ba]
#1: 100%|██████████| 1/1 [00:05<00:00,  5.44s/ba]
#0:   0%|          | 0/1 [00:00<?, ?ba/s]
[A

#0: 100%|██████████| 1/1 [00:00<00:00,  3.47ba/s]

[A

#2: 100%|██████████| 1/1 [00:00<00:00,  3.50ba/s]
#3: 100%|██████████| 1/1 [00:00<00:00,  3.63ba/s]
#1: 100%|██████████| 1/1 [00:00<00:00,  3.30ba/s]
