In [1]:
import gc

import torch

from scripts.reward_learning import reward_learning
from scripts.reward_evaluation import get_accuracy
from scripts.ppo_model_training import get_dataset_iterator, prepare_model_for_PPO, prepare_reward_model_for_PPO, prepare_PPO_config, prepare_PPO_trainer, train_model
from scripts.model_evaluation import load_true_reward_model, load_model_and_tokenizer,  generate_response_from_model, evaluate_responses

  from .autonotebook import tqdm as notebook_tqdm


## Anthropic + Gemma

### Anthropic + Gemma : Small Epochs

In [2]:
# Shared settings for the Anthropic + Gemma "small epochs" experiment.
model_name, dataset_name = "Gemma", "Anthropic"
output_folder = "./output"  # output location handed to the helper calls
small = True  # forwarded as the `small` argument of the helper calls
info = "le"  # identifier for small epoch

In [3]:
# Reward-learning settings for the run identified as 'apo'.
samples_per_epoch, epochs = 100, 20
id_apo = "apo"

In [4]:
# Run reward learning for the `id_apo` run with the settings defined in the cells above.
reward_learning(dataset_name, model_name, samples_per_epoch, epochs, output_folder, id_apo, small, info)

# Separator line between the training progress bar and the accuracy report.
print('-'*50)
# Report accuracy for the same run (the recorded output prints "apo accuracy: ...").
get_accuracy(model_name, dataset_name, output_folder, small, id_apo, info)


100%|██████████| 20/20 [00:02<00:00,  7.62it/s]


--------------------------------------------------
apo accuracy: 94.55%


In [4]:
# Reward-learning settings for the run identified as 'random'
# (same sample/epoch values as the 'apo' run).
samples_per_epoch, epochs = 100, 20
id_rand = "random"

In [6]:
# Run reward learning for the `id_rand` run with the settings defined in the cells above.
reward_learning(dataset_name, model_name, samples_per_epoch, epochs, output_folder, id_rand, small, info)

# Separator line between the training progress bar and the accuracy report.
print('-'*50)
# Report accuracy for the same run (the recorded output prints "random accuracy: ...").
get_accuracy(model_name, dataset_name, output_folder, small, id_rand, info)

100%|██████████| 20/20 [00:01<00:00, 19.44it/s]


--------------------------------------------------
random accuracy: 83.0%


In [5]:
# PPO training for the `id_apo` run: build the dataset iterator, the policy model,
# the reward model, the PPO config and trainer, then train for 20 iterations.
ds = get_dataset_iterator(dataset_name, model_name)
model_apo, tokenizer = prepare_model_for_PPO(model_name)
reward_model_apo, rm_tokenizer_apo = prepare_reward_model_for_PPO(model_name, dataset_name, output_folder, id_apo, info)
ppo_config = prepare_PPO_config(model_name)
ppo_trainer_apo, gen_kwargs = prepare_PPO_trainer(ppo_config, model_apo, tokenizer, ds)
train_model(ppo_trainer_apo, gen_kwargs, tokenizer, reward_model_apo, rm_tokenizer_apo, output_folder, model_name, dataset_name, id_apo, max_iter=20, small=small, info=info)


# Free up GPU

del model_apo
del tokenizer
del reward_model_apo
del rm_tokenizer_apo
del ppo_config
del ppo_trainer_apo
del gen_kwargs

# `del` only removes the names. Collect objects kept alive by reference cycles,
# then hand the cached CUDA blocks back to the driver so later cells can use them.
gc.collect()
torch.cuda.empty_cache()

Map: 100%|██████████| 8000/8000 [00:02<00:00, 3963.97 examples/s]
Gemma's activation function should be approximate GeLU and not exact GeLU.
Changing the activation function to `gelu_pytorch_tanh`.if you want to use the legacy `gelu`, edit the `model.config` to set `hidden_activation=gelu`   instead of `hidden_act`. See https://github.com/huggingface/transformers/pull/29402 for more details.
Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.63it/s]
Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.67it/s]
Some weights of GemmaForSequenceClassification were not initialized from the model checkpoint at google/gemma-2b-it and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
20it [1:08:59, 206.95s/it]


Done!!!


In [6]:
# PPO training for the `id_rand` run: build the dataset iterator, the policy model,
# the reward model, the PPO config and trainer, then train for 20 iterations.
ds = get_dataset_iterator(dataset_name, model_name)
model_rand, tokenizer = prepare_model_for_PPO(model_name)
reward_model_rand, rm_tokenizer_rand = prepare_reward_model_for_PPO(model_name, dataset_name, output_folder, id_rand, info)
ppo_config = prepare_PPO_config(model_name)
ppo_trainer_rand, gen_kwargs = prepare_PPO_trainer(ppo_config, model_rand, tokenizer, ds)
train_model(ppo_trainer_rand, gen_kwargs, tokenizer, reward_model_rand, rm_tokenizer_rand, output_folder, model_name, dataset_name, id_rand, max_iter=20, small=small, info=info)

# Free up GPU
del model_rand
del tokenizer
del reward_model_rand
del rm_tokenizer_rand
del ppo_config
del ppo_trainer_rand
del gen_kwargs

# `del` only removes the names. Collect objects kept alive by reference cycles,
# then hand the cached CUDA blocks back to the driver so later cells can use them.
gc.collect()
torch.cuda.empty_cache()

Map: 100%|██████████| 8000/8000 [00:01<00:00, 4052.22 examples/s]
Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.25s/it]
Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.70it/s]
Some weights of GemmaForSequenceClassification were not initialized from the model checkpoint at google/gemma-2b-it and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
20it [1:05:08, 195.45s/it]


Done!!!


In [None]:
# Load the model and tokenizer for id 'apo' and generate responses with them.
model, tokenizer = load_model_and_tokenizer(model_name, dataset_name, output_path=output_folder, id='apo', small=small, info=info)

# NOTE(review): the literal 500 is presumably a sample count or length limit —
# confirm against generate_response_from_model's signature.
generate_response_from_model(model, tokenizer, dataset_name, model_name, 'apo', output_folder, small, 500, info)

# Drop the references once generation has finished.
del model
del tokenizer

In [None]:

# Load the "true" reward model and use it to evaluate the responses of the 'apo' run.
reward_processor = load_true_reward_model(model_name, dataset_name)
evaluate_responses(reward_processor, dataset_name, model_name, 'apo', output_folder, small, info)

In [None]:
# Load the model and tokenizer for id 'random' and generate responses with them.
model, tokenizer = load_model_and_tokenizer(model_name, dataset_name, output_path=output_folder, id='random', small=small, info=info)

# NOTE(review): the literal 500 is presumably a sample count or length limit —
# confirm against generate_response_from_model's signature.
generate_response_from_model(model, tokenizer, dataset_name, model_name, 'random', output_folder, small, 500, info)

# Drop the references once generation has finished.
del model
del tokenizer


In [None]:

# Load the "true" reward model and use it to evaluate the responses of the 'random' run.
reward_processor = load_true_reward_model(model_name, dataset_name)
evaluate_responses(reward_processor, dataset_name, model_name, 'random', output_folder, small, info)

### Extra Evaluation Code

In [32]:
import pandas as pd
import torch
from transformers import AutoTokenizer, BitsAndBytesConfig
from trl import AutoModelForCausalLMWithValueHead
# Read the gzip-compressed CSV and tokenize its `query` column with the Gemma tokenizer.
df = pd.read_csv('./datasets/anthropic_hard_8k_test.csv.gz', compression='gzip')
# NOTE(review): add_bos_token=False presumably stops a BOS token being prepended — confirm.
tokenizer = AutoTokenizer.from_pretrained('google/gemma-2b-it', add_bos_token=False)
query_tensors = tokenizer(list(df['query']))



In [33]:
# Quantization settings: 4-bit NF4 with double quantization, computing in bfloat16.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

In [34]:
# Load the saved checkpoint as a causal LM with a value head, using the
# quantization config and automatic device placement.
model = AutoModelForCausalLMWithValueHead.from_pretrained(
    './output/Anthropic_small_Gemma_final_model_apo_le',
    use_safetensors=True,
    quantization_config=bnb_config,
    device_map="auto",
    low_cpu_mem_usage=True,
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.08s/it]


In [35]:
# Generation settings: sampling enabled, top_k=0.0 and top_p=1.0, EOS used as
# the pad token, total length capped at 500.
gen_kwargs = dict(
    min_length=-1,
    top_k=0.0,
    top_p=1.0,
    do_sample=True,
    pad_token_id=tokenizer.eos_token_id,
    max_length=500,
)

In [14]:
import os
from tqdm import tqdm
import pandas as pd
def _split_gemma_chat(text):
    """Turn one Gemma chat-formatted string into a [user, assistant] message list.

    The user content is the text between the first '<start_of_turn>user\\n' and the
    first '<end_of_turn>\\n'; the assistant content is the text between the first
    '<start_of_turn>model\\n' and the last '<end_of_turn>\\n'.

    Missing markers are not detected: `str.find` returns -1 and the slices are
    taken as-is, exactly as the inline parsing did before.
    """
    user_tag = '<start_of_turn>user\n'
    model_tag = '<start_of_turn>model\n'
    end_tag = '<end_of_turn>\n'

    user_start = text.find(user_tag) + len(user_tag)
    user_end = text.find(end_tag)
    model_start = text.find(model_tag) + len(model_tag)
    model_end = text.rfind(end_tag)

    return [{'role' : 'user', 'content' : text[user_start : user_end]},
            {'role' : 'assistant', 'content' : text[model_start : model_end]}]


def evaluate_chosen_responses(reward_processor, dataset_name, model_name, model_id, output_path, small=True, info='', start=500, stop=1000):
    """Score the dataset's chosen responses with `reward_processor` and save them as CSV.

    Args:
        reward_processor: callable invoked as
            ``reward_processor(prompts, responses, sent_kwargs)``. For 'GPT2' its
            result is stored as the ``reward`` column; for 'Gemma' it must return a
            ``(rewards, chat_format)`` pair.
        dataset_name: used in the input and output file names.
        model_name: 'GPT2' or 'Gemma'; selects the scoring branch.
        model_id: used only in the input file name when ``small`` is false.
        output_path: directory of the non-small input file and of the output CSV.
        small: when true, read './datasets/anthropic_hard_8k_test.csv.gz' and put
            '_small_' in the output file name.
        info: run identifier embedded in the file names.
        start, stop: positional bounds of the ``chosen`` rows scored in the
            'Gemma' branch (defaults keep the previous 500:1000 window).

    Returns:
        None. The result is written to
        ``{dataset_name}[_small]_{model_name}_chosen_{info}_reward_2.csv``
        under ``output_path``.

    Raises:
        ValueError: if ``model_name`` is neither 'GPT2' nor 'Gemma'.
    """
    # Fail early and clearly instead of hitting an undefined `df_res` at save time.
    if model_name not in ('GPT2', 'Gemma'):
        raise ValueError(f"Unsupported model_name {model_name!r}; expected 'GPT2' or 'Gemma'")

    if small:
        df = pd.read_csv('./datasets/anthropic_hard_8k_test.csv.gz', compression='gzip')
    else:
        df = pd.read_csv(os.path.join(output_path, f'{dataset_name}_{model_name}_{model_id}_{info}_result.csv'))

    if model_name == 'GPT2':
        sent_kwargs = {"return_all_scores": True, "function_to_apply": "none", "batch_size": 16}
        # Reward for each prompt/response pair.
        df["reward"] = reward_processor(df["prompt"], df["response"], sent_kwargs)
        # Previously this branch never defined `df_res`, so saving raised NameError.
        df_res = df
    else:
        conversations = [_split_gemma_chat(res) for res in tqdm(df['chosen'].iloc[start:stop])]
        sent_kwargs = {"padding": 'max_length', "truncation": True, "return_tensors": "pt", 'max_length': 1500}
        # NOTE(review): the full `query` column is passed while `conversations`
        # covers only rows start:stop — confirm how reward_processor uses its first argument.
        rewards, chat_format = reward_processor(df["query"], conversations, sent_kwargs)
        df_res = pd.DataFrame({'reward' : rewards, 'chat_format' : chat_format})

    # Store the results.
    if small:
        out_name = f'{dataset_name}_small_{model_name}_chosen_{info}_reward_2.csv'
    else:
        out_name = f'{dataset_name}_{model_name}_chosen_{info}_reward_2.csv'
    df_res.to_csv(os.path.join(output_path, out_name), index=False)



In [15]:
# Load the "true" reward model and score the dataset's chosen responses with it;
# evaluate_chosen_responses writes the result CSV under './output'.
reward_processor = load_true_reward_model('Gemma', 'Anthropic')
evaluate_chosen_responses(reward_processor, 'Anthropic', 'Gemma', 'chosen', './output', True, 'le')



Loading checkpoint shards: 100%|██████████| 3/3 [00:03<00:00,  1.13s/it]
100%|██████████| 500/500 [00:00<00:00, 414702.79it/s]


Processing coversations...


100%|██████████| 500/500 [00:00<00:00, 40166.09it/s]


Done!


100%|██████████| 8/8 [01:14<00:00,  9.31s/it]


In [2]:
# Load the "true" reward model for the evaluation calls in the following cells.
reward_processor = load_true_reward_model('Gemma', 'Anthropic')

You set `add_prefix_space`. The tokenizer needs to be converted from the slow tokenizers
Loading checkpoint shards: 100%|██████████| 3/3 [00:02<00:00,  1.15it/s]


In [3]:
# Evaluate the responses of the 'apo' run with the loaded reward model.
evaluate_responses(reward_processor, 'Anthropic', 'Gemma', 'apo', './output', True, 'le')

100%|██████████| 500/500 [00:00<00:00, 547416.34it/s]


Processing coversations...


100%|██████████| 500/500 [00:00<00:00, 16483.16it/s]


Done!


100%|██████████| 8/8 [01:14<00:00,  9.28s/it]


In [4]:
# Evaluate the responses of the 'random' run with the loaded reward model.
evaluate_responses(reward_processor, 'Anthropic', 'Gemma', 'random', './output', True, 'le')

100%|██████████| 500/500 [00:00<00:00, 430538.29it/s]


Processing coversations...


100%|██████████| 500/500 [00:00<00:00, 52197.82it/s]


Done!


100%|██████████| 8/8 [01:15<00:00,  9.48s/it]


In [5]:
# Report accuracy for the 'apo' run again (the recorded output prints "apo accuracy: ...").
get_accuracy('Gemma', 'Anthropic', './output', small=True, id='apo', info='le')

apo accuracy: 94.55%


In [6]:
# Report accuracy for the 'random' run again (the recorded output prints "random accuracy: ...").
get_accuracy('Gemma', 'Anthropic', './output', small=True, id='random', info='le')

random accuracy: 83.0%
