# Reinforcement Learning from Human Feedback

Resource: https://www.deeplearning.ai/short-courses/reinforcement-learning-from-human-feedback/

## Datasets for RL training

### Preference dataset

In [None]:
preference_dataset_path = "../data/sample_preference.jsonl"

In [None]:
import json

# Parse the JSON Lines file into a list holding one parsed object per line.
# UTF-8 is requested explicitly so decoding does not depend on the platform
# default encoding, and whitespace-only lines are skipped because
# json.loads() rejects them.
with open(preference_dataset_path, encoding="utf-8") as f:
    preference_data = [json.loads(line) for line in f if line.strip()]

In [None]:
sample_1 = preference_data[0]
print(type(sample_1))

In [None]:
print(sample_1.keys())

In [None]:
sample_1["input_text"]

In [None]:
preference_data[2]["input_text"][-50:]

In [None]:
print(f"candidate_0:\n{sample_1.get('candidate_0')}\n")
print(f"candidate_1:\n{sample_1.get('candidate_1')}\n")

In [None]:
print(f"choice: {sample_1.get('choice')}")

### Prompt dataset

In [None]:
prompt_dataset_path = "../data/sample_prompt.jsonl"

In [None]:
# Parse the JSON Lines file into a list holding one parsed object per line.
# UTF-8 is requested explicitly so decoding does not depend on the platform
# default encoding, and whitespace-only lines are skipped because
# json.loads() rejects them.
with open(prompt_dataset_path, encoding="utf-8") as f:
    prompt_data = [json.loads(line) for line in f if line.strip()]

In [None]:
len(prompt_data)

In [None]:
# Helper that prints a prompt-dataset record in a more readable layout
def print_d(d):
    """Print every key/value pair of ``d``, each followed by a blank line."""
    for entry in d.items():
        key, val = entry
        print(f"key:{key}\nval:{val}\n")

In [None]:
print_d(prompt_data[0])

In [None]:
print_d(prompt_data[1])

## Tune an LLM with RLHF

### Create pipeline

In [None]:
# Import (RLHF is currently in preview)
from google_cloud_pipeline_components.preview.llm import rlhf_pipeline

# Import from KubeFlow pipelines
from kfp import compiler

In [None]:
# Define a path to the yaml file
RLHF_PIPELINE_PKG_PATH = "../data/rlhf_pipeline.yaml"

In [None]:
# Execute the compile function
compiler.Compiler().compile(
    pipeline_func=rlhf_pipeline,
    package_path=RLHF_PIPELINE_PKG_PATH
)

In [None]:
# Print the first lines of the YAML file
!head rlhf_pipeline.yaml

### Define pipeline job

```python
parameter_values={
        "preference_dataset": \
    "gs://vertex-ai/generative-ai/rlhf/text_small/summarize_from_feedback_tfds/comparisons/train/*.jsonl",
        "prompt_dataset": \
    "gs://vertex-ai/generative-ai/rlhf/text_small/reddit_tfds/train/*.jsonl",
        "eval_dataset": \
    "gs://vertex-ai/generative-ai/rlhf/text_small/reddit_tfds/val/*.jsonl",
    ...
}
```

### Choose model to be tuned

```python
parameter_values={
        "large_model_reference": "llama-2-7b",
        ...
}
```

### Calculate number of reward model training steps

$$ stepsPerEpoch = \left\lceil \frac{datasetSize}{batchSize} \right\rceil$$
$$ trainSteps = stepsPerEpoch \times numEpochs$$

In [None]:
# Preference dataset size
PREF_DATASET_SIZE = 3000

# Batch size is fixed at 64
BATCH_SIZE = 64

In [None]:
import math

In [None]:
REWARD_STEPS_PER_EPOCH = math.ceil(PREF_DATASET_SIZE / BATCH_SIZE)
print(REWARD_STEPS_PER_EPOCH)

In [None]:
REWARD_NUM_EPOCHS = 30

In [None]:
# Calculate number of steps in the reward model training
reward_model_train_steps = REWARD_STEPS_PER_EPOCH * REWARD_NUM_EPOCHS

In [None]:
print(reward_model_train_steps)

### Calculate number of reinforcement learning training steps

In [None]:
# Prompt dataset size
PROMPT_DATASET_SIZE = 2000

# Batch size is fixed at 64
BATCH_SIZE = 64

In [None]:
import math

In [None]:
RL_STEPS_PER_EPOCH = math.ceil(PROMPT_DATASET_SIZE / BATCH_SIZE)
print(RL_STEPS_PER_EPOCH)

In [None]:
RL_NUM_EPOCHS = 10

In [None]:
# Calculate the number of steps in the RL training
reinforcement_learning_train_steps = RL_STEPS_PER_EPOCH * RL_NUM_EPOCHS

In [None]:
print(reinforcement_learning_train_steps)

### Define the instruction

In [None]:
# Completed values for the dictionary
parameter_values = {
    # Dataset locations (small variants)
    "preference_dataset": (
        "gs://vertex-ai/generative-ai/rlhf/text_small/summarize_from_feedback_tfds/comparisons/train/*.jsonl"
    ),
    "prompt_dataset": (
        "gs://vertex-ai/generative-ai/rlhf/text_small/reddit_tfds/train/*.jsonl"
    ),
    "eval_dataset": (
        "gs://vertex-ai/generative-ai/rlhf/text_small/reddit_tfds/val/*.jsonl"
    ),
    # Model to tune
    "large_model_reference": "llama-2-7b",
    # Step counts: results from the calculations above
    "reward_model_train_steps": 1410,
    "reinforcement_learning_train_steps": 320,
    "reward_model_learning_rate_multiplier": 1.0,
    "reinforcement_learning_rate_multiplier": 1.0,
    # increased to reduce reward hacking
    "kl_coeff": 0.1,
    "instruction": "Summarize in less than 50 words",
}

### Train with full dataset

```python
parameter_values={
        "preference_dataset": \
    "gs://vertex-ai/generative-ai/rlhf/text/summarize_from_feedback_tfds/comparisons/train/*.jsonl",
        "prompt_dataset": \
    "gs://vertex-ai/generative-ai/rlhf/text/reddit_tfds/train/*.jsonl",
        "eval_dataset": \
    "gs://vertex-ai/generative-ai/rlhf/text/reddit_tfds/val/*.jsonl",
        "large_model_reference": "llama-2-7b",
        "reward_model_train_steps": 10000,
        "reinforcement_learning_train_steps": 10000, 
        "reward_model_learning_rate_multiplier": 1.0,
        "reinforcement_learning_rate_multiplier": 0.2,
        "kl_coeff": 0.1,
        "instruction":\
    "Summarize in less than 50 words"
}
```

### Set up Google Cloud

In [None]:
import os
from dotenv import load_dotenv
import json
import base64
from google.auth.transport.requests import Request
from google.oauth2.service_account import Credentials

def authenticate():
    """Return the credentials, project ID and staging bucket for the notebook.

    As written, the function calls ``load_dotenv()`` and then immediately
    returns three hard-coded placeholder strings, so every statement after
    that first ``return`` is unreachable. The unreachable part shows the
    service-account flow: read a base64-encoded key from the
    ``SERVICE_ACCOUNT_KEY`` environment variable, decode and parse it as JSON,
    build ``Credentials`` from it, and read ``PROJECT_ID`` and
    ``STAGING_BUCKET`` from the environment.

    Returns:
        A 3-tuple ``(credentials, project_id, staging_bucket)``. With the
        early return in place these are the literal strings
        ``"DLAI_CREDENTIALS"``, ``"DLAI_PROJECT"`` and ``"gs://gcp-sc2-rlhf"``.
    """
    # Load .env
    load_dotenv()
    # DLAI custom key: placeholder values returned unconditionally.
    # NOTE(review): this early return turns everything below into dead code;
    # presumably intentional for the hosted course environment -- confirm
    # before removing it to use a real service account.
    return "DLAI_CREDENTIALS", "DLAI_PROJECT", "gs://gcp-sc2-rlhf"

    # --- Unreachable while the return above is present ---
    # Decode the base64-encoded service-account key and parse it as JSON
    SERVICE_ACCOUNT_KEY_STRING_B64 = os.getenv('SERVICE_ACCOUNT_KEY')
    SERVICE_ACCOUNT_KEY_BYTES_B64 = SERVICE_ACCOUNT_KEY_STRING_B64.encode("ascii")
    SERVICE_ACCOUNT_KEY_STRING_BYTES = base64.b64decode(SERVICE_ACCOUNT_KEY_BYTES_B64)
    SERVICE_ACCOUNT_KEY_STRING = SERVICE_ACCOUNT_KEY_STRING_BYTES.decode("ascii")

    SERVICE_ACCOUNT_KEY = json.loads(SERVICE_ACCOUNT_KEY_STRING)


    # Create credentials based on key from service account
    # Make sure your account has the roles listed in the Google Cloud Setup section
    credentials = Credentials.from_service_account_info(
        SERVICE_ACCOUNT_KEY,
        scopes=['https://www.googleapis.com/auth/cloud-platform'])

    # NOTE(review): confirm that `expired` is the right condition for
    # credentials that were just created and have never been refreshed.
    if credentials.expired:
        credentials.refresh(Request())

    # Set project ID and staging bucket according to environment variables
    PROJECT_ID = os.getenv('PROJECT_ID')
    STAGING_BUCKET = os.getenv('STAGING_BUCKET')# e.g. 'gs://gcp-sc2-rlhf-staging'

    return credentials, PROJECT_ID, STAGING_BUCKET

In [None]:
# Authenticate with the helper defined in the previous cell
credentials, PROJECT_ID, STAGING_BUCKET = authenticate()

# RLHF pipeline is available in this region
REGION = "europe-west4"

### Run pipeline

In [None]:
import google.cloud.aiplatform as aiplatform

In [None]:
aiplatform.init(
    project = PROJECT_ID,
    location = REGION,
    credentials = credentials
)

In [None]:
# Look at the path for the YAML file
RLHF_PIPELINE_PKG_PATH

### Create and run the pipeline job

```Python
job = aiplatform.PipelineJob(
    display_name="tutorial-rlhf-tuning",
    pipeline_root=STAGING_BUCKET,
    template_path=RLHF_PIPELINE_PKG_PATH,
    parameter_values=parameter_values)
```
- To run the pipeline job:

```Python
job.run()
```

## Evaluate tuned model

### Check Tensorboard

In [None]:
%load_ext tensorboard

In [None]:
port = %env PORT1
%tensorboard --logdir reward-logs --port $port --bind_all 

In [None]:
# Look at what this directory has
%ls reward-logs

In [None]:
port = %env PORT2
%tensorboard --logdir reinforcer-logs --port $port --bind_all

In [None]:
port = %env PORT3
%tensorboard --logdir reinforcer-fulldata-logs --port $port --bind_all

In [None]:
# Same parameter values that were used for the small-dataset tuning job
parameter_values = {
    "preference_dataset": "gs://vertex-ai/generative-ai/rlhf/text_small/summarize_from_feedback_tfds/comparisons/train/*.jsonl",
    "prompt_dataset": "gs://vertex-ai/generative-ai/rlhf/text_small/reddit_tfds/train/*.jsonl",
    "eval_dataset": "gs://vertex-ai/generative-ai/rlhf/text_small/reddit_tfds/val/*.jsonl",
    "large_model_reference": "llama-2-7b",
    "reward_model_train_steps": 1410,
    "reinforcement_learning_train_steps": 320,
    "reward_model_learning_rate_multiplier": 1.0,
    "reinforcement_learning_rate_multiplier": 1.0,
    "kl_coeff": 0.1,
    "instruction": "Summarize in less than 50 words",
}

### Evaluate tuned and untuned model

In [None]:
import json

In [None]:
eval_tuned_path = "../data/eval_results_tuned.jsonl"

# Parse the JSON Lines file into a list holding one parsed object per line.
# UTF-8 is requested explicitly so decoding does not depend on the platform
# default encoding, and whitespace-only lines are skipped because
# json.loads() rejects them.
with open(eval_tuned_path, encoding="utf-8") as f:
    eval_data_tuned = [json.loads(line) for line in f if line.strip()]

In [None]:
def print_d(d, indent=0):
    """Print the contents of ``d``, recursing into nested dictionaries.

    Each entry is preceded by a 50-dash rule. Nested dictionaries are printed
    one level deeper, and every level adds two spaces of leading indentation.

    Args:
        d: Dictionary to print.
        indent: Current nesting depth; callers normally leave it at 0.
    """
    # The prefix and the rule only depend on the depth, so build them once.
    pad = "  " * indent
    rule = pad + "-" * 50

    for key, val in d.items():
        print(rule)
        print(f"{pad}key:{key}\n")
        if not isinstance(val, dict):
            print(f"{pad}val:{val}")
            continue
        # Nested dictionary: print a bare "val" label, then its entries.
        print(f"{pad}val")
        print_d(val, indent=indent + 1)

In [None]:
# Look at the result produced by the tuned model
print_d(eval_data_tuned[0])

In [None]:
eval_untuned_path = "../data/eval_results_untuned.jsonl"

# Parse the JSON Lines file into a list holding one parsed object per line.
# UTF-8 is requested explicitly so decoding does not depend on the platform
# default encoding, and whitespace-only lines are skipped because
# json.loads() rejects them.
with open(eval_untuned_path, encoding="utf-8") as f:
    eval_data_untuned = [json.loads(line) for line in f if line.strip()]

In [None]:
# Look at the result produced by the untuned model
print_d(eval_data_untuned[0])

### Explore results

In [None]:
# Extract all the prompts
prompts = [sample['inputs']['inputs_pretokenized']
           for sample in eval_data_tuned]

In [None]:
# Completions from the untuned model
untuned_completions = [sample['prediction']
                       for sample in eval_data_untuned]

In [None]:
# Completions from the tuned model
tuned_completions = [sample['prediction']
                     for sample in eval_data_tuned]

In [None]:
import pandas as pd

In [None]:
results = pd.DataFrame(
    data={'prompt': prompts,
          'base_model':untuned_completions,
          'tuned_model': tuned_completions})

In [None]:
pd.set_option('display.max_colwidth', None)

In [None]:
# Print the results
results