In [None]:
!pip install datasets torch_optimizer lion_pytorch --break-system-packages

In [5]:
ref_model = None
model = None
old_model = None

In [1]:
import os
import copy
import json
import torch
from torch.utils.data import DataLoader
import torch.nn.functional as F
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    DataCollatorForLanguageModeling,
)
from transformers.optimization import Adafactor
from datasets import Dataset

# ------------------------------------------------
# Load Q&A from JSON file (manual_data_set/QA.json)
# and create a list of {"content": "..."}
# ------------------------------------------------

def load_qa_dataset(json_file):
    with open(json_file, "r", encoding="utf-8") as f:
        data = json.load(f)

    train_examples = []
    for item in data:
        q = item.get("Q", "")
        a = item.get("A", "")
        # Combine Q and A into a single text
        content = f"Q: {q}\nA: {a}"  
        train_examples.append({"content": content})

    return train_examples


# Provide the path to your Q&A JSON file
qa_json_path = "manual_data_set/QA.json"

# Use the load_qa_dataset function
train_data = load_qa_dataset(qa_json_path)

# Create a Hugging Face Dataset from the list
train_dataset = Dataset.from_list(train_data)

# -------------------------------
# Model & Tokenizer Setup
# -------------------------------
model_id = "bigcode/starcoder2-3b"
tokenizer = AutoTokenizer.from_pretrained(model_id)
# Set pad token if not present (using EOS token as pad token)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# -------------------------------
# Tokenization Function
# -------------------------------
def tokenize_function(examples):
    return tokenizer(
        examples["content"],
        truncation=True,
        max_length=512,
        padding="max_length",  # pad all examples to max_length
    )

tokenized_dataset = train_dataset.map(tokenize_function, batched=True)
tokenized_dataset = tokenized_dataset.remove_columns(["content"])
tokenized_dataset.set_format("torch")

# -------------------------------
# Data Collator and DataLoader
# -------------------------------
# DataCollatorForLanguageModeling automatically creates a "labels" field equal to input_ids
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
batch_size = 1  # Increased batch size
dataloader = DataLoader(
    tokenized_dataset, batch_size=batch_size, shuffle=True, collate_fn=data_collator
)

# -------------------------------
# Load Model & Set Training Mode
# -------------------------------
model = AutoModelForCausalLM.from_pretrained(model_id)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.train()

# -------------------------------
# Optimizer Setup with Adafactor
# -------------------------------
optimizer = Adafactor(
    model.parameters(), 
    lr=1e-4,               # Learning rate can be tuned
    relative_step=False,   # Set to True to use relative step sizes
    scale_parameter=False  # Adjust scaling based on model size
)

epsilon = 0.01

# Setup AMP GradScaler for 16-bit training (float16)
scaler = torch.cuda.amp.GradScaler()

# Create a reference model (deep copy) and set it to eval mode.
ref_model = copy.deepcopy(model)
ref_model = ref_model.half()
ref_model.eval()
for param in ref_model.parameters():
    param.requires_grad = False

kl_lambda = 0.2  # Weight for the KL divergence term

# -------------------------------
# Manual Training Loop with AMP (float16)
# -------------------------------
num_epochs = 1000
num_grpo = 100
for epoch in range(num_epochs):
    running_loss = 0.0

    old_model = None
        
    old_model = copy.deepcopy(model)
    old_model = old_model.half()
    old_model.eval()
    for param in old_model.parameters():
        param.requires_grad = False

    for grpo_idx in range(num_grpo):
        for step, batch in enumerate(dataloader):
            # Move batch tensors to device
            batch = {k: v.to(device).repeat_interleave(3, dim=0) for k, v in batch.items()}
            optimizer.zero_grad()
            
            # Forward pass with AMP autocast (float16)
            with torch.cuda.amp.autocast():
                outputs = model(**batch)  # Data collator already provides "labels"
                loss = outputs.loss
    
                # just test
                advantages = loss # (ri - mean(R))/std(R) to do
    
                old_outputs = old_model(**batch)
    
                # Get logits from both models
                model_logits = outputs.logits  # shape: (batch, seq_len, vocab_size)
                old_model_logits = outputs.logits
                ref_outputs = ref_model(**batch)
                ref_logits = ref_outputs.logits  # same shape as model_logits
                
                probability_ratio = model_logits / old_model_logits
    
                # Calculate the unclipped objective
                unclipped_objective = probability_ratio * advantages
                
                # Calculate the clipped objective
                clipped_ratio = torch.clamp(probability_ratio, 1 - epsilon, 1 + epsilon)
                clipped_objective = clipped_ratio * advantages
                
                # Take the minimum of the unclipped and clipped objectives
                ppo_loss = -torch.min(unclipped_objective, clipped_objective).mean()
            
                # Compute log-probabilities and probabilities
                model_log_probs = F.log_softmax(model_logits, dim=-1)
                ref_log_probs = F.softmax(ref_logits, dim=-1)
            
                # Compute KL divergence (using batchmean reduction)
                kl_div = F.kl_div(model_log_probs, ref_log_probs, reduction='batchmean')
    
                # Combine the primary loss and the KL divergence loss
                combined_loss = ppo_loss + kl_lambda * kl_div
    
            
            # Backward pass with scaled loss
            scaler.scale(combined_loss).backward()
            scaler.step(optimizer)
            scaler.update()
            
            running_loss += combined_loss.item()

            if step%10==0:
                print(f"Epoch {epoch+1} Grpo {grpo_idx} Step {step+1}/{len(dataloader)} Loss: {loss.item():.4f}")
    
    avg_loss = running_loss / len(dataloader)
    print(f"Epoch {epoch+1} completed. Average Loss: {avg_loss:.4f}")

# -------------------------------
# Save the Fine-Tuned Model
# -------------------------------
output_dir = "./starcoder2-3b-finetuned_adafactor_fp16"
os.makedirs(output_dir, exist_ok=True)
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)
print(f"Model saved to {output_dir}")




Map:   0%|          | 0/36 [00:00<?, ? examples/s]

  scaler = torch.cuda.amp.GradScaler()
  with torch.cuda.amp.autocast():


Epoch 1 Step 1/36 Loss: 4.7925
Epoch 1 Step 2/36 Loss: 5.2342
Epoch 1 Step 3/36 Loss: 5.9867
Epoch 1 Step 4/36 Loss: 5.8720
Epoch 1 Step 5/36 Loss: 5.3160
Epoch 1 Step 6/36 Loss: 6.8686
Epoch 1 Step 7/36 Loss: 5.4841
Epoch 1 Step 8/36 Loss: 5.3843
Epoch 1 Step 9/36 Loss: 5.1209
Epoch 1 Step 10/36 Loss: 6.4761
Epoch 1 Step 11/36 Loss: 5.7799
Epoch 1 Step 12/36 Loss: 6.2239
Epoch 1 Step 13/36 Loss: 6.1394
Epoch 1 Step 14/36 Loss: 4.2185
Epoch 1 Step 15/36 Loss: 4.8698
Epoch 1 Step 16/36 Loss: 4.5611
Epoch 1 Step 17/36 Loss: 5.9608
Epoch 1 Step 18/36 Loss: 5.5476
Epoch 1 Step 19/36 Loss: 5.0294
Epoch 1 Step 20/36 Loss: 4.4260
Epoch 1 Step 21/36 Loss: 6.1924
Epoch 1 Step 22/36 Loss: 5.3547
Epoch 1 Step 23/36 Loss: 4.9466
Epoch 1 Step 24/36 Loss: 5.5774
Epoch 1 Step 25/36 Loss: 6.0174
Epoch 1 Step 26/36 Loss: 5.1029
Epoch 1 Step 27/36 Loss: 5.8129
Epoch 1 Step 28/36 Loss: 5.2120
Epoch 1 Step 29/36 Loss: 5.6221
Epoch 1 Step 30/36 Loss: 6.3168
Epoch 1 Step 31/36 Loss: 4.5538
Epoch 1 Step 32/3

KeyboardInterrupt: 

In [1]:
import os
import copy
import json
import torch
import torch.nn.functional as F
from torch.utils.data import DataLoader
from torch.utils.tensorboard import SummaryWriter  # <-- (1) Import TensorBoard
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    DataCollatorForLanguageModeling,
)
from transformers.optimization import Adafactor
from datasets import Dataset

# ------------------------------------------------
# Load Q&A from JSON file (manual_data_set/QA.json)
# and create a list of {"content": "..."}
# ------------------------------------------------

def load_qa_dataset(json_file):
    with open(json_file, "r", encoding="utf-8") as f:
        data = json.load(f)

    train_examples = []
    for item in data:
        q = item.get("Q", "")
        a = item.get("A", "")
        # Combine Q and A into a single text
        content = f"Q: {q}\nA: {a}"
        train_examples.append({"content": content})

    return train_examples


# Provide the path to your Q&A JSON file
qa_json_path = "manual_data_set/QA.json"

# Use the load_qa_dataset function
train_data = load_qa_dataset(qa_json_path)

# Create a Hugging Face Dataset from the list
train_dataset = Dataset.from_list(train_data)

# -------------------------------
# Model & Tokenizer Setup
# -------------------------------
model_id = "bigcode/starcoder2-3b"
tokenizer = AutoTokenizer.from_pretrained(model_id)
# Set pad token if not present (using EOS token as pad token)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# -------------------------------
# Tokenization Function
# -------------------------------
def tokenize_function(examples):
    return tokenizer(
        examples["content"],
        truncation=True,
        max_length=512,
        padding="max_length",  # pad all examples to max_length
    )

tokenized_dataset = train_dataset.map(tokenize_function, batched=True)
tokenized_dataset = tokenized_dataset.remove_columns(["content"])
tokenized_dataset.set_format("torch")

# -------------------------------
# Data Collator and DataLoader
# -------------------------------
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
batch_size = 1  # You can adjust this
dataloader = DataLoader(
    tokenized_dataset, batch_size=batch_size, shuffle=True, collate_fn=data_collator
)

# -------------------------------
# Load Model & Set Training Mode
# -------------------------------
model = AutoModelForCausalLM.from_pretrained(model_id)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.train()

# -------------------------------
# Optimizer Setup with Adafactor
# -------------------------------
optimizer = Adafactor(
    model.parameters(),
    lr=1e-4,               # Learning rate can be tuned
    relative_step=False,   # Set to True to use relative step sizes
    scale_parameter=False  # Adjust scaling based on model size
)

epsilon = 0.01

# Setup AMP GradScaler for 16-bit training (float16)
scaler = torch.cuda.amp.GradScaler()

# Create a reference model (deep copy) and set it to eval mode.
ref_model = copy.deepcopy(model).half().eval()
for param in ref_model.parameters():
    param.requires_grad = False

kl_lambda = 0.2  # Weight for the KL divergence term

# -------------------------------
# Initialize TensorBoard Writer
# -------------------------------
writer = SummaryWriter(log_dir="runs/starcoder2_experiment")  # (2) Initialize

# -------------------------------
# Manual Training Loop with AMP
# -------------------------------
num_epochs = 1000
num_grpo = 100

global_step = 0  # We'll track this across epochs for TensorBoard

for epoch in range(num_epochs):
    running_loss = 0.0

    # Keep a copy of the model for PPO-like ratio calculation
    old_model = copy.deepcopy(model).half().eval()
    for param in old_model.parameters():
        param.requires_grad = False

    for grpo_idx in range(num_grpo):
        for step, batch in enumerate(dataloader):
            # Move batch tensors to device
            batch = {k: v.to(device).repeat_interleave(3, dim=0) for k, v in batch.items()}
            optimizer.zero_grad()

            with torch.cuda.amp.autocast():
                # Forward pass
                outputs = model(**batch)
                loss = outputs.loss

                # Just an example advantage placeholder
                advantages = loss  # If you have a real advantage calculation, replace it here

                old_outputs = old_model(**batch)

                # Get logits from both models
                model_logits = outputs.logits          # shape: (batch, seq_len, vocab_size)
                old_model_logits = old_outputs.logits  # shape: (batch, seq_len, vocab_size)

                ref_outputs = ref_model(**batch)
                ref_logits = ref_outputs.logits

                # Probability ratio
                # In a real PPO scenario, you would convert logits -> log_probs -> exp(log_probs)
                # for ratio calculations. This snippet is just a placeholder, illustrating usage.
                probability_ratio = model_logits / (old_model_logits + 1e-8)

                # Unclipped objective
                unclipped_objective = probability_ratio * advantages

                # Clipped objective
                clipped_ratio = torch.clamp(probability_ratio, 1 - epsilon, 1 + epsilon)
                clipped_objective = clipped_ratio * advantages

                # PPO loss
                _ppo_loss = clipped_objective # torch.min(unclipped_objective, clipped_objective)
                ppo_loss= -_ppo_loss.mean()

                # Calculate KL divergence
                model_log_probs = F.log_softmax(model_logits, dim=-1)
                ref_log_probs   = F.softmax(ref_logits, dim=-1)
                kl_div = F.kl_div(model_log_probs, ref_log_probs, reduction='batchmean')

                # Combine the primary loss (ppo_loss) and the KL divergence loss
                combined_loss = ppo_loss + kl_lambda * kl_div

            # Backward pass with AMP
            scaler.scale(combined_loss).backward()
            scaler.step(optimizer)
            scaler.update()

            running_loss += combined_loss.item()

            # -------------------------------
            # TensorBoard logging
            # -------------------------------
            writer.add_scalar("Loss/combined_loss", combined_loss.item(), global_step)
            writer.add_scalar("Loss/ppo_loss", ppo_loss.item(), global_step)
            writer.add_scalar("Loss/kl_div", kl_div.item(), global_step)
            # You can also log 'loss' from outputs if you want:
            writer.add_scalar("Loss/original_loss", loss.item(), global_step)

            global_step += 1

            if step % 10 == 0:
                print(f"Epoch {epoch+1} Grpo {grpo_idx} Step {step+1}/{len(dataloader)} "
                      f"Loss: {loss.item():.4f}  PPO_Loss: {ppo_loss.item():.4f}  KL: {kl_div.item():.4f}")

    avg_loss = running_loss / len(dataloader)
    print(f"Epoch {epoch+1} completed. Average Loss: {avg_loss:.4f}")

# Close the TensorBoard writer at the end of training
writer.close()

# -------------------------------
# Save the Fine-Tuned Model
# -------------------------------
output_dir = "./starcoder2-3b-finetuned_adafactor_fp16"
os.makedirs(output_dir, exist_ok=True)
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)
print(f"Model saved to {output_dir}")




Map:   0%|          | 0/36 [00:00<?, ? examples/s]

  scaler = torch.cuda.amp.GradScaler()
  with torch.cuda.amp.autocast():


Epoch 1 Grpo 0 Step 1/36 Loss: 6.5010  PPO_Loss: -6.4648  KL: 1707.8895
Epoch 1 Grpo 0 Step 11/36 Loss: 5.7144  PPO_Loss: -5.7070  KL: 2159.4663
Epoch 1 Grpo 0 Step 21/36 Loss: 23.6110  PPO_Loss: -23.3906  KL: 5228.3813
Epoch 1 Grpo 0 Step 31/36 Loss: 7.9457  PPO_Loss: -7.9219  KL: 1789.4861
Epoch 1 Grpo 1 Step 1/36 Loss: 7.3138  PPO_Loss: -7.2461  KL: 910.1036
Epoch 1 Grpo 1 Step 11/36 Loss: 8.2675  PPO_Loss: -8.2500  KL: 1505.7478
Epoch 1 Grpo 1 Step 21/36 Loss: 6.8513  PPO_Loss: -6.7969  KL: 1011.8757
Epoch 1 Grpo 1 Step 31/36 Loss: 8.4621  PPO_Loss: -8.3906  KL: 1135.5599
Epoch 1 Grpo 2 Step 1/36 Loss: 8.3984  PPO_Loss: -8.3281  KL: 1114.5754
Epoch 1 Grpo 2 Step 11/36 Loss: 7.6475  PPO_Loss: -7.6250  KL: 1377.2168
Epoch 1 Grpo 2 Step 21/36 Loss: 7.4992  PPO_Loss: -7.4453  KL: 1041.7146
Epoch 1 Grpo 2 Step 31/36 Loss: 6.6669  PPO_Loss: -6.6055  KL: 1267.8304
Epoch 1 Grpo 3 Step 1/36 Loss: 7.3089  PPO_Loss: -7.2734  KL: 1055.5793
Epoch 1 Grpo 3 Step 11/36 Loss: 7.0689  PPO_Loss: -7.0

KeyboardInterrupt: 

In [1]:
!rm -rf "runs/starcoder2_optuna_experiment"

In [1]:
import os
import copy
import json
import torch
import torch.nn.functional as F
from torch.utils.data import DataLoader
from torch.utils.tensorboard import SummaryWriter
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    DataCollatorForLanguageModeling,
)
from transformers.optimization import Adafactor
from datasets import Dataset

# For hyperparameter optimization
import optuna


# ------------------------------------------------
# Load Q&A from JSON file (manual_data_set/QA.json)
# and create a list of {"content": "..."}
# ------------------------------------------------
def load_qa_dataset(json_file):
    with open(json_file, "r", encoding="utf-8") as f:
        data = json.load(f)

    train_examples = []
    for item in data:
        q = item.get("Q", "")
        a = item.get("A", "")
        content = f"Q: {q}\nA: {a}"
        train_examples.append({"content": content})
    return train_examples


# Provide the path to your Q&A JSON file
qa_json_path = "manual_data_set/QA.json"
train_data = load_qa_dataset(qa_json_path)

# Create a Hugging Face Dataset from the list
train_dataset = Dataset.from_list(train_data)

# ------------------------------------------------
# Define Tokenization
# ------------------------------------------------
model_id = "bigcode/starcoder2-3b"
tokenizer = AutoTokenizer.from_pretrained(model_id)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

def tokenize_function(examples):
    return tokenizer(
        examples["content"],
        truncation=True,
        max_length=512,
        padding="max_length"
    )

tokenized_dataset = train_dataset.map(tokenize_function, batched=True)
tokenized_dataset = tokenized_dataset.remove_columns(["content"])
tokenized_dataset.set_format("torch")

data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)


# ------------------------------------------------
# Define Training Function
# ------------------------------------------------
def train_and_evaluate(
    model,
    ref_model,
    dataloader,
    optimizer,
    device,
    num_epochs,
    num_grpo,
    epsilon,
    kl_lambda,
    scaler
):
    """
    Train the model for `num_epochs` with `num_grpo` PPO groups each epoch,
    and return a metric (e.g., final average loss).
    """
    # Initialize TensorBoard writer (optional)
    writer = SummaryWriter(log_dir="runs/starcoder2_optuna_experiment")

    global_step = 0
    for epoch in range(num_epochs):
        running_loss = 0.0

        old_model = None
        old_model = copy.deepcopy(model)
        old_model = old_model.half()
        old_model.eval()
        for param in old_model.parameters():
            param.requires_grad = False

        for grpo_idx in range(num_grpo):
            for step, batch in enumerate(dataloader):
                batch = {k: v.to(device).repeat_interleave(3, dim=0) for k, v in batch.items()}
                optimizer.zero_grad()

                with torch.cuda.amp.autocast():
                    outputs = model(**batch)
                    loss = outputs.loss

                    # A placeholder for advantage (you'd replace this with real advantage if doing PPO)
                    advantages = -loss

                    # old model forward
                    with torch.no_grad():
                        old_outputs = old_model(**batch)

                    model_logits     = outputs.logits
                    old_model_logits = old_outputs.logits

                    # reference model forward
                    with torch.no_grad():
                        ref_outputs = ref_model(**batch)
                    ref_logits = ref_outputs.logits

                    # Probability ratio
                    # In real PPO, you'd convert logits -> log_probs, then ratio = exp(new_log_prob - old_log_prob)
                    probability_ratio = model_logits / (old_model_logits + 1e-8)

                    # Unclipped objective
                    unclipped_objective = probability_ratio * advantages

                    # Clipped objective
                    clipped_ratio = torch.clamp(probability_ratio, 1 - epsilon, 1 + epsilon)
                    clipped_objective = clipped_ratio * advantages

                    _ppo_loss = clipped_objective # torch.min(unclipped_objective, clipped_objective)
                    ppo_loss = -_ppo_loss.mean()

                    # KL
                    model_log_probs = F.log_softmax(model_logits, dim=-1)
                    ref_log_probs   = F.softmax(ref_logits, dim=-1)
                    kl_div = F.kl_div(model_log_probs, ref_log_probs, reduction='batchmean')

                    combined_loss = ppo_loss + kl_lambda * kl_div

                scaler.scale(combined_loss).backward()
                scaler.step(optimizer)
                scaler.update()

                running_loss += combined_loss.item()

                # TensorBoard logging
                writer.add_scalar("Loss/combined_loss", combined_loss.item(), global_step)
                writer.add_scalar("Loss/ppo_loss", ppo_loss.item(), global_step)
                writer.add_scalar("Loss/kl_div", kl_div.item(), global_step)
                writer.add_scalar("Loss/original_loss", loss.item(), global_step)

                global_step += 1

        avg_loss = running_loss / len(dataloader)
        print(f"Epoch {epoch+1} completed. Average Loss: {avg_loss:.4f}")

    writer.close()
    
    # Return final average loss as the metric to minimize
    return avg_loss


# ------------------------------------------------
# Optuna Objective Function
# ------------------------------------------------
def objective(trial):
    """
    Defines how Optuna will run each trial:
    - sample hyperparameters
    - set up the model & optimizer with those
    - run a short training loop
    - return a metric (the final avg loss) to minimize
    """
    # Shortened training for demonstration:
    num_epochs = 2   # or 2–3, to save time during hyperparameter search
    
    # Sample hyperparameters
    lr = trial.suggest_float("lr", 1e-5, 1e-3, log=True)
    kl_lambda = trial.suggest_float("kl_lambda", 0.0, 1.0)
    epsilon = trial.suggest_float("epsilon", 0.01, 0.2)
    num_grpo = trial.suggest_int("num_grpo", 1, 3, step=1)

    print(f"[Optuna] Trial hyperparameters -> lr: {lr}, kl_lambda: {kl_lambda}, epsilon: {epsilon}, num_grpo: {num_grpo}")

    # Model & device
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = AutoModelForCausalLM.from_pretrained(model_id).to(device)
    model.train()

    # Reference model (for KL)
    old_model = None
    ref_model = copy.deepcopy(model).half().eval()
    for param in ref_model.parameters():
        param.requires_grad = False

    # DataLoader
    batch_size = 1
    dataloader = DataLoader(
        tokenized_dataset,
        batch_size=batch_size,
        shuffle=True,
        collate_fn=data_collator
    )

    # Optimizer
    optimizer = Adafactor(
        model.parameters(),
        lr=lr,
        relative_step=False,
        scale_parameter=False
    )

    # AMP GradScaler
    scaler = torch.cuda.amp.GradScaler()

    # Train & get final metric
    final_avg_loss = train_and_evaluate(
        model=model,
        ref_model=ref_model,
        dataloader=dataloader,
        optimizer=optimizer,
        device=device,
        num_epochs=num_epochs,
        num_grpo=num_grpo,
        epsilon=epsilon,
        kl_lambda=kl_lambda,
        scaler=scaler
    )

    # Return the final average loss to Optuna
    return final_avg_loss


# ------------------------------------------------
# Run Optuna Study
# ------------------------------------------------
if __name__ == "__main__":
    # Create study to minimize final loss
    study = optuna.create_study(direction="minimize")
    study.optimize(objective, n_trials=5)  # You can increase n_trials

    print("Study completed!")
    print("Best trial:")
    best_trial = study.best_trial
    print(f"  Value: {best_trial.value}")
    print("  Params: ")
    for key, value in best_trial.params.items():
        print(f"    {key}: {value}")

    # Optionally, after the best params are found, you can do a full training
    # run with the best hyperparameters, e.g.:
    # best_params = best_trial.params
    # ...
    # re-initialize a new model & train fully using best_params




Map:   0%|          | 0/36 [00:00<?, ? examples/s]

[I 2025-03-18 01:56:08,190] A new study created in memory with name: no-name-34e90352-3486-4718-8417-2c9c3d18c81a


[Optuna] Trial hyperparameters -> lr: 6.43875785595471e-05, kl_lambda: 0.09934036423050208, epsilon: 0.07765466935288792, num_grpo: 2


  scaler = torch.cuda.amp.GradScaler()
  with torch.cuda.amp.autocast():


Epoch 1 completed. Average Loss: 304.8525


[W 2025-03-18 01:58:07,700] Trial 0 failed with parameters: {'lr': 6.43875785595471e-05, 'kl_lambda': 0.09934036423050208, 'epsilon': 0.07765466935288792, 'num_grpo': 2} because of the following error: KeyboardInterrupt().
Traceback (most recent call last):
  File "/usr/local/lib/python3.12/dist-packages/optuna/study/_optimize.py", line 197, in _run_trial
    value_or_values = func(trial)
                      ^^^^^^^^^^^
  File "/tmp/ipykernel_16666/3952006775.py", line 222, in objective
    final_avg_loss = train_and_evaluate(
                     ^^^^^^^^^^^^^^^^^^^
  File "/tmp/ipykernel_16666/3952006775.py", line 146, in train_and_evaluate
    scaler.step(optimizer)
  File "/usr/local/lib/python3.12/dist-packages/torch/amp/grad_scaler.py", line 457, in step
    retval = self._maybe_opt_step(optimizer, optimizer_state, *args, **kwargs)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/torch/amp/grad_scaler

KeyboardInterrupt: 