In [None]:
!pip install transformers==4.36.2
!pip install datasets==2.15.0
!pip install peft==0.7.1
!pip install bitsandbytes==0.41.3
!pip install accelerate==0.25.0
!pip install trl==0.7.7
!pip install tqdm==4.66.1
!pip install https://github.com/Dao-AILab/flash-attention/releases/download/v2.5.8/flash_attn-2.5.8+cu122torch2.3cxx11abiFALSE-cp310-cp310-linux_x86_64.whl
!pip install wandb

In [None]:
# All third-party imports used by the notebook's setup cells.
import torch
import wandb
from datasets import load_dataset
from huggingface_hub import login
from transformers import AutoTokenizer
# The original import list contained an empty item (", ,"), which is a SyntaxError.
from trl import (
    AutoModelForSeq2SeqLMWithValueHead,
    PPOConfig,
    PPOTrainer,
    create_reference_model,
)

# The original cell built this device object and discarded it; keep it in a
# name so later cells can use it.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
from kaggle_secrets import UserSecretsClient

# Fetch both API tokens through the Kaggle secrets client so that no
# credential is written into the notebook itself.
user_secrets = UserSecretsClient()
HF_TOKEN, WANDB_TOKEN = (
    user_secrets.get_secret(secret_name)
    for secret_name in ("HF_TOKEN", "WANDB_TOKEN")
)

# Authenticate against the Hugging Face Hub, then against Weights & Biases.
login(token=HF_TOKEN)
wandb.login(key=WANDB_TOKEN)

# Hyperparameter Configuration

In [None]:
# Checkpoints: the policy that PPO fine-tunes and the classifier used as reward model.
model_name = "google-t5/t5-small"
reward_model_name = "roberta-large-mnli"

# Values forwarded to PPOConfig.
batch_size = 64                      # samples per PPO optimisation step
learning_rate = 5e-5                 # optimizer learning rate
remove_unused_columns = False        # keep every dataset column in the batches
# This notebook installs wandb and logs into it, and never installs or
# configures mlflow, so route the trainer's logs to wandb as well.
log_with = "wandb"
gradient_accumulation_steps = 4      # mini-batches accumulated before an optimizer update

range_num = 30000                    # number of leading training rows to keep
N_EPOCHS = 1                         # passes over the PPO dataloader

In [None]:
# Keyword arguments forwarded to `ppo_trainer.generate` for every query.
generation_kwargs = dict(
    min_length=40,             # lower bound on the generated length, in tokens
    num_beams=5,               # number of beams tracked during the search
    no_repeat_ngram_size=5,    # forbid any 5-gram from appearing twice
    do_sample=True,            # sample from the distribution instead of taking the arg-max
    max_length=512,            # upper bound on the generated length, in tokens
)

# Name fragments derived from the two checkpoints.
# (`extarct_reward_model_name` keeps its original spelling so that any
# existing reference to it still resolves.)
extract_model_name = model_name.rsplit("/", 1)[-1]
extarct_reward_model_name = "-".join(reward_model_name.split("/"))

# Repository name used when the trained model is uploaded.
new_model_name = "obtained_model_name"

# Load the base model

In [None]:
# Tokenizer of the policy checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Tell generation which token ids to use for padding and for end-of-sequence.
generation_kwargs["pad_token_id"] = tokenizer.pad_token_id
# The original line ended with a trailing comma, which stored the one-element
# tuple `(eos_id,)` instead of the integer id.
generation_kwargs["eos_token_id"] = tokenizer.eos_token_id

# Policy to fine-tune: the seq2seq checkpoint wrapped with a value head for PPO.
model = AutoModelForSeq2SeqLMWithValueHead.from_pretrained(model_name)

# Reference model derived from the policy via `create_reference_model`.
ref_model = create_reference_model(model)

# Data Preparation

In [None]:
# `process_func` was called below but never defined anywhere in the notebook,
# so a fresh kernel failed with NameError. The training loop reads
# batch["query"] (text) and batch["input_ids"] (token ids), so those are the
# two fields it has to produce.
# NOTE(review): the source column name is an assumption - confirm it against
# the dataset schema; the check below fails loudly if it is wrong.
QUERY_COLUMN = "prompt"
# NOTE(review): assumed encoder input limit for the policy tokenizer - confirm.
MAX_QUERY_TOKENS = 512


def process_func(example):
    """Add the `query` text and its `input_ids` to one dataset row.

    The text in `QUERY_COLUMN` is prefixed with "summarize: " and tokenized
    with the policy tokenizer, truncated to `MAX_QUERY_TOKENS` tokens.

    NOTE(review): presumes the raw text does not already carry the
    "summarize: " prefix - verify on a few rows.
    """
    query = "summarize: " + example[QUERY_COLUMN]
    example["query"] = query
    example["input_ids"] = tokenizer.encode(
        query, truncation=True, max_length=MAX_QUERY_TOKENS
    )
    return example


dataset = load_dataset("Muadil/all_cleaned_openai_summarize_comparisons_train_val")["train"]

# Keep the first `range_num` rows, clamped so a small split cannot raise IndexError.
dataset = dataset.select(range(min(range_num, len(dataset))))

if QUERY_COLUMN not in dataset.column_names:
    raise KeyError(
        f"Column {QUERY_COLUMN!r} not found; available columns: {dataset.column_names}"
    )

dataset = dataset.map(process_func, batched=False)
dataset.set_format("torch")

# PPO Configuration

In [None]:
def collator(data):
    """Merge a list of per-sample dicts into one dict of per-key lists.

    The key set (and its order) is taken from the first sample; every sample
    must provide each of those keys. An empty `data` raises IndexError.
    """
    batch = {}
    for key in data[0]:
        batch[key] = [sample[key] for sample in data]
    return batch

# PPO configuration, assembled from the hyperparameters defined earlier in the notebook.
ppo_settings = {
    "model_name": model_name,                                    # name of the policy checkpoint
    "batch_size": batch_size,                                    # samples per PPO step
    "learning_rate": learning_rate,                              # optimizer learning rate
    "remove_unused_columns": remove_unused_columns,              # whether to drop unused dataset columns
    "log_with": log_with,                                        # experiment tracker to log to
    "gradient_accumulation_steps": gradient_accumulation_steps,  # accumulation steps per update
}
config = PPOConfig(**ppo_settings)


# Build the PPO trainer from the config, the policy, its reference copy,
# the tokenizer, the prepared dataset and the batch collator.
ppo_trainer = PPOTrainer(
    config=config,
    model=model,
    ref_model=ref_model,
    tokenizer=tokenizer,
    dataset=dataset,
    data_collator=collator,
)

# Reward Model

In [None]:
# Use a pipeline as a high-level helper
from transformers import pipeline

# Text-classification pipeline around the reward checkpoint. Without an
# explicit `device` the pipeline stays on CPU; use the first GPU when one is
# available (-1 selects the CPU).
pipe = pipeline(
    "text-classification",
    model=reward_model_name,
    device=0 if torch.cuda.is_available() else -1,
)

# Only fall back to the EOS token when the tokenizer has no pad token of its
# own; overwriting an existing pad token is unnecessary.
if pipe.tokenizer.pad_token is None:
    pipe.tokenizer.pad_token = pipe.tokenizer.eos_token

# PPO Training

In [1]:
from tqdm import tqdm

# Label of the reward classifier whose raw logit is used as the PPO reward.
# NOTE(review): "ENTAILMENT" is assumed to be the intended label for this MNLI
# checkpoint - confirm. The query and response are still scored as one
# concatenated string; a premise/hypothesis pair may suit an NLI model better.
REWARD_LABEL = "ENTAILMENT"

# Pipeline call options: return the raw logit of every label, classify 16
# texts at a time, and truncate inputs that exceed the reward model's limit.
sent_kwargs = {
    "top_k": None,
    "function_to_apply": "none",
    "batch_size": 16,
    "truncation": True,
}

# Entries of the dict returned by `ppo_trainer.step` that are printed after
# every batch. TRL prefixes its training statistics with "ppo/"; the
# un-prefixed names used before were never present in `stats`.
meaningful_keys = [
    "ppo/mean_scores",
    "objective/kl",
    "ppo/loss/policy",
    "ppo/loss/value",
]


def _label_score(label_scores, label):
    """Return the score stored for `label` in one pipeline result.

    `label_scores` is a list of {"label": ..., "score": ...} dicts. With
    `top_k=None` the pipeline sorts that list by score, so position 0 is just
    whichever label scored highest; look the label up by name instead.
    Raises KeyError when the label is absent.
    """
    for entry in label_scores:
        if entry["label"] == label:
            return entry["score"]
    raise KeyError(f"Label {label!r} not found in reward model output: {label_scores}")


resultss = []  # raw reward-model outputs, one list per batch
for epoch in tqdm(range(N_EPOCHS), desc="Epochs"):
    for batch in tqdm(ppo_trainer.dataloader, desc="Batches"):
        # One token-id tensor per query.
        input_tensors = [ids.squeeze() for ids in batch["input_ids"]]

        # Generate one response per query with the current policy.
        response_tensors = [
            ppo_trainer.generate(query, **generation_kwargs).squeeze()
            for query in input_tensors
        ]

        # Decode without special tokens so they do not leak into the text
        # that the reward model scores.
        batch["response"] = [
            tokenizer.decode(response, skip_special_tokens=True)
            for response in response_tensors
        ]

        #### Compute the reward for every query/response text
        texts = [q + r for q, r in zip(batch["query"], batch["response"])]
        pipe_outputs = pipe(texts, **sent_kwargs)
        resultss.append(pipe_outputs)
        rewards = [
            torch.tensor(_label_score(output, REWARD_LABEL))
            for output in pipe_outputs
        ]

        #### Run PPO step
        stats = ppo_trainer.step(input_tensors, response_tensors, rewards)
        if not stats:
            raise ValueError("PPO training step failed to return valid statistics.")

        # Forward the statistics to the tracker configured through `log_with`;
        # without this call nothing is ever logged.
        ppo_trainer.log_stats(stats, batch, rewards)

        print(f"Epoch {epoch} Stats:")
        print("\n--- PPO Stats (Human-Readable) ---")
        for key in meaningful_keys:
            if key in stats:
                # Print each metric with 4 decimal places.
                print(f"{key}: {float(stats[key]):.4f}")
        print("----------------------------------\n")



# Push to Hugging Face

In [None]:
# Target repository on the Hugging Face Hub.
# NOTE(review): "username" looks like a placeholder - replace it with the real account or organisation.
repo_name = f"username/{new_model_name}"

# Upload the trained model and its tokenizer to that repository.
model.push_to_hub(repo_name)
tokenizer.push_to_hub(repo_name)

# The Hub is served from huggingface.co; the original message printed a huggingface.com link.
print(f"Model successfully uploaded to Hugging Face Hub: https://huggingface.co/{repo_name}")