In [1]:
!pip install transformers evaluate accelerate
!pip install -U datasets
!pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.5-py3-none-any.whl.metadata (9.5 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch>=2.0.0->a

In [2]:
from transformers import AutoTokenizer, AutoModelForCausalLM
from datasets import load_dataset
import torch
import torch.nn as nn
import torch.optim as optim
import torch.distributions as distributions
import evaluate
import random

In [3]:
SMALL_LLM_NAME = "gpt2" # A small model
LARGE_LLM_NAME = "gpt2-large" # A larger model from the same family (for reward model)
DATASET_NAME = "super_glue"
TASK_NAME = "boolq" # Example SuperGLUE task
REWARD_MODEL_BATCH_SIZE = 8
MAX_SEQUENCE_LENGTH = 128 # Adjust based on your task and model

In [4]:
tokenizer = AutoTokenizer.from_pretrained(SMALL_LLM_NAME)
# Add padding token if not present
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Policy Model (Small LLM)
policy_model = AutoModelForCausalLM.from_pretrained(SMALL_LLM_NAME)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [5]:
reward_model = AutoModelForCausalLM.from_pretrained(LARGE_LLM_NAME)

config.json:   0%|          | 0.00/666 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/3.25G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [6]:
dataset = load_dataset(DATASET_NAME, TASK_NAME)

README.md: 0.00B [00:00, ?B/s]

boolq/train-00000-of-00001.parquet:   0%|          | 0.00/3.85M [00:00<?, ?B/s]

boolq/validation-00000-of-00001.parquet:   0%|          | 0.00/1.31M [00:00<?, ?B/s]

boolq/test-00000-of-00001.parquet:   0%|          | 0.00/1.31M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/9427 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3270 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/3245 [00:00<?, ? examples/s]

In [7]:
train_dataset = dataset["train"]
# For simplicity, take a small subset for training
train_dataset = train_dataset.select(range(100))

In [8]:
def tokenize_function(examples):
    # Adjust based on the specific SuperGLUE task structure
    if TASK_NAME == "boolq":
        # Example: Combine passage and question
        inputs = [f"Passage: {p} Question: {q}" for p, q in zip(examples["passage"], examples["question"])]
        # The labels (true/false) will be used to guide the reward
    else:
         # Default to using the text column if available
        if "text" in examples:
            inputs = examples["text"]
        elif "sentence" in examples:
             inputs = examples["sentence"]
        else:
             # Handle other task structures as needed
            inputs = [str(x) for x in examples.values()][0] # Fallback to the first column as string

    # Tokenize the inputs and targets (if applicable)
    model_inputs = tokenizer(inputs, max_length=MAX_SEQUENCE_LENGTH, truncation=True, padding="max_length", return_tensors="pt")

    # In RL, the "label" is not directly used for supervised training of the policy,
    # but can be used in the reward calculation (e.g., rewarding generations that lead to the correct answer)
    if "label" in examples:
        model_inputs["labels"] = examples["label"]

    return model_inputs

tokenized_dataset = train_dataset.map(tokenize_function, batched=True, remove_columns=train_dataset.column_names)
tokenized_dataset.set_format(type="torch")


Map:   0%|          | 0/100 [00:00<?, ? examples/s]

In [9]:
optimizer = optim.Adam(policy_model.parameters(), lr=0.1)

num_episodes = 50 # Reduce for faster testing
gamma = 0.99 # Discount factor (less relevant for short sequences, but good practice)

# Evaluation metric (example: accuracy for BoolQ, might need custom eval for generation tasks)
metric = evaluate.load("accuracy")



Downloading builder script: 0.00B [00:00, ?B/s]

In [36]:
for episode in range(num_episodes):
    # Sample a data point from the dataset
    batch = tokenized_dataset[random.randint(0, len(tokenized_dataset) - 1)]
    input_ids = batch["input_ids"].unsqueeze(0)
    attention_mask = batch["attention_mask"].unsqueeze(0)
    labels = batch["labels"].unsqueeze(0) if "labels" in batch else None

    # Get policy model output (generates next token distribution)
    # We will generate a sequence of tokens for a response
    generation_output = policy_model.generate(
        input_ids=input_ids,
        attention_mask=attention_mask,
        max_length=MAX_SEQUENCE_LENGTH + 20, # Generate slightly longer sequence
        num_return_sequences=1,
        output_scores=True, # Needed to calculate log probs manually if not using model's logit output directly
        return_dict_in_generate=True,
        pad_token_id=tokenizer.pad_token_id
    )
    generated_sequence = generation_output.sequences[0]
    transition_scores = generation_output.scores
    generation_start_index = input_ids.shape[1]
    if generation_start_index >= generated_sequence.shape[0]:
         # Handle cases where the sequence is not longer than the input
        print("Warning: Generated sequence not longer than input. Skipping episode.")
        continue

    # Get the predicted token ID at the first generation step
    predicted_token_id = generated_sequence[generation_start_index]

    # Get the log probabilities for the *first* generated token from the scores
    # transition_scores is a tuple of tensors, one for each generated step
    first_step_scores = transition_scores[0] # Scores for the first generated token
    first_step_probs = torch.softmax(first_step_scores, dim=-1)
    m = distributions.Categorical(first_step_probs)
    log_prob_first_token = m.log_prob(predicted_token_id)
    generated_text = tokenizer.decode(generated_sequence[generation_start_index:], skip_special_tokens=True)

    reward = 0 # Default reward
    if TASK_NAME == "boolq":
        # A very simple heuristic for BoolQ: does the generated text contain "yes" or "no"?
        # And is it consistent with the ground truth label?
        if labels is not None:
            true_label = labels.item() # 0 for False, 1 for True
            if ("yes" in generated_text.lower() and true_label == 1) or ("no" in generated_text.lower() and true_label == 0):
                 reward = 1.0
            elif ("yes" in generated_text.lower() and true_label == 0) or ("no" in generated_text.lower() and true_label == 1):
                 reward = -1.0 # Penalize incorrect answer


In [44]:
    G = torch.tensor(reward)

    # Calculate the loss
    # We need to compute the log probabilities of the *generated* sequence
    # This requires feeding the generated sequence back through the policy model
    # and getting the log likelihoods. This is more complex than sampling and using scores directly.

    # Let's revert to a simpler REINFORCE logic using the policy's ability to score tokens.
    # Instead of `generate`, let's get the policy's likelihood for a *target* sequence or the generated one.
    # This requires restructuring the loop to get log_probs of the *sampled* tokens.

    # --- Revised REINFORCE Loop (Generating token by token and tracking log_probs) ---
    log_probs_episode = []
    generated_ids = input_ids.clone() # Start with the input prompt

    # Decide how many tokens to generate per episode
    num_generation_steps = 10 # Example: Generate 10 new tokens
    for step in range(num_generation_steps):
        outputs = policy_model(generated_ids)
        # Get logits for the last token in the sequence
        last_token_logits = outputs.logits[:, -1, :]
        probs = torch.softmax(last_token_logits, dim=-1)
        m = distributions.Categorical(probs)

        # Sample an action (next token)
        action_token = m.sample()

        # Store the log probability of the sampled action
        log_probs_episode.append(m.log_prob(action_token))

        # Append the sampled token to the generated sequence
        generated_ids = torch.cat([generated_ids, action_token.unsqueeze(-1)], dim=-1)

        # Optional: Check for end-of-sequence token and stop early
        if action_token.item() == tokenizer.eos_token_id:
            break

    # --- RLAIF Reward Calculation (using the full generated sequence) ---
    generated_text = tokenizer.decode(generated_ids[0][input_ids.shape[1]:], skip_special_tokens=True)

    reward = 0.0 # Default reward
    if TASK_NAME == "boolq" and labels is not None:
        true_label = labels.item()
        # A more robust reward would involve the large model.
        # Example (concept): Ask the large model if the generated text answers the question correctly.
        prompt_for_rm = f"Passage: {tokenizer.decode(input_ids[0], skip_special_tokens=True)} Generated Answer: {generated_text} Is the generated answer correct? Yes or No."

        # Tokenize the prompt for the reward model
        rm_input = tokenizer(prompt_for_rm, return_tensors="pt")
        # Move to device if using GPU
        if torch.cuda.is_available():
             rm_input = {k: v.to(policy_model.device) for k, v in rm_input.items()}


        # Generate response from the reward model (GPT2-Large)
        # Limit generation length to get a concise "Yes" or "No" answer
        rm_response = reward_model.generate(
            **rm_input,
            max_new_tokens=5, # Generate a few tokens to get "Yes" or "No"
            num_return_sequences=1,
            pad_token_id=tokenizer.pad_token_id, # Ensure padding token is handled
            eos_token_id=tokenizer.eos_token_id # Ensure EOS token is handled
        )

        rm_text = tokenizer.decode(rm_response[0][rm_input['input_ids'].shape[1]:], skip_special_tokens=True).strip().lower()

        # Evaluate reward based on the reward model's response
        if ("yes" in rm_text and true_label == 1) or ("no" in rm_text and true_label == 0):
            reward = 1.0
        elif ("yes" in rm_text and true_label == 0) or ("no" in rm_text and true_label == 1):
            reward = -1.0
        else:
            reward = -0.5 # Penalty for unclear or wrong format from RM
            # print(f"RM gave unclear response: '{rm_text}' for prompt: '{prompt_for_rm}'")


    # Calculate discounted rewards (G_t) - In REINFORCE, G_t is the cumulative reward from time t.
    # For a single episode with a single reward at the end, G_t is just the total reward for all steps.
    # If we had step-wise rewards, we would use discounting. With an episode-end reward,
    # the effective G_t for each step is the final episode reward.
    # G = [reward] * len(log_probs_episode) # All steps get the same reward

    # Calculate the loss for the episode
    # Loss = - sum(log(pi(a_t|s_t)) * G_t)
    # Since G_t is the same for all t in this case, Loss = - sum(log(pi(a_t|s_t))) * Reward
    loss = -torch.stack(log_probs_episode).sum() * torch.tensor(reward).to(policy_model.device)

    # Backpropagate and update the policy network
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    if episode % 10 == 0:
        print(f"Episode {episode}, Loss: {loss.item():.4f}, Reward: {reward:.2f}")
        print(f"Generated Text: {generated_text}")

print("Training finished.")



Training finished.


In [45]:
print(f"Episode {episode}, Loss: {loss.item():.4f}, Reward: {reward:.2f}")
print("Training finished.")

Episode 49, Loss: -0.0574, Reward: -0.50
Training finished.
