## Imports of dependencies

In [2]:
from transformers import AutoTokenizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics import pairwise_distances_argmin_min
import pandas as pd
import pandas as pd
from datasets import Dataset, load_dataset, load_from_disk
import optuna
from transformers import AutoModelForCausalLM, TrainingArguments, Trainer
import torch
from transformers import AutoTokenizer
import pandas as pd
from datasets import Dataset, load_dataset, load_from_disk
from transformers import AutoTokenizer
import wandb

[2025-06-18 22:52:56,832] [INFO] [real_accelerator.py:239:get_accelerator] Setting ds_accelerator to cuda (auto detect)


W0618 22:52:58.902000 9888 site-packages\torch\distributed\elastic\multiprocessing\redirects.py:29] NOTE: Redirects are currently not supported in Windows or MacOs.


## Loading dataset and prepare Dataframe to easy following process.

In [3]:
dataset = load_dataset("FreedomIntelligence/medical-o1-reasoning-SFT", 'en')
dataset = dataset['train'].train_test_split(test_size=0.1)
df = pd.concat([dataset['train'].to_pandas(), dataset['test'].to_pandas()], ignore_index=True)
df = df.drop_duplicates(subset=['Question'])
print(df.head())
print(len(df))


                                            Question  \
0  What is the ideal management for a 40-year-old...   
1  A 54-year-old woman with a history of chronic ...   
2  A 27-year-old woman with mild persistent asthm...   
3  A 40-year-old man undergoing an insurance asse...   
4  A 43-year-old man presents with nasal congesti...   

                                         Complex_CoT  \
0  Alright, let's think about this case. We've go...   
1  Alright, so we have a 54-year-old woman who's ...   
2  Alright, so this woman with asthma is having t...   
3  Alright, we've got a 40-year-old guy who's goi...   
4  Alright, let's think about what's going on her...   

                                            Response  
0  The management of a 40-year-old female patient...  
1  The clinical presentation described—a chronic ...  
2  Based on the information provided, it is most ...  
3  The most likely diagnosis for the abnormal fin...  
4  Based on the symptoms and findings you've desc..

## Creating clusters to obtain significant subset of dataset

In [None]:
texts = df["Question"].astype(str).tolist()

vectorizer = TfidfVectorizer(max_features=1024)
X = vectorizer.fit_transform(texts)

n_clusters = 1968
kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
kmeans.fit(X)

closest, _ = pairwise_distances_argmin_min(kmeans.cluster_centers_, X)
centroid_questions = [texts[i] for i in closest]

unique_centroid_questions = list(dict.fromkeys(centroid_questions))

centroids_df = pd.DataFrame({'centroid_question': unique_centroid_questions})
print(centroids_df.head())
print(f"Number of unique centroid questions: {len(centroids_df)}")

                                   centroid_question
0  During a laparotomy for an ovarian tumor, if a...
1  本例检查结果:Hh105g/L,WBC5.9×109/L,尿蛋白(+++),尿素氮20mmo...
2  A 9-month-old boy is brought to the physician ...
3  A 68-year-old man comes to the physician becau...
4  A female presents with symptoms and signs sugg...
Number of unique centroid questions: 1284


## Obtain the answers for the centroid questions

In [None]:
centroids_with_responses = pd.merge(
    centroids_df,
    df[['Question', 'Response']],
    left_on='centroid_question',
    right_on='Question',
    how='left'
)[['Question', 'Response']]

centroids_with_responses = centroids_with_responses.drop_duplicates(subset=['Question'])

print(centroids_with_responses.head())
print(centroids_with_responses.shape)

                                            Question  \
0  A 78-year-old woman with a history of essentia...   
1  A 39-year-old woman presents with 5 days of pa...   
2  A child presents with an infective skin lesion...   
3  本例检查结果:Hh105g/L,WBC5.9×109/L,尿蛋白(+++),尿素氮20mmo...   
4  A 68-year-old woman comes to the physician for...   

                                            Response  
0  For managing a provoked deep venous thrombosis...  
1  Based on the information provided, the most li...  
2  To confirm the identity of the organism likely...  
3  The clinical staging of this patient aligns wi...  
4  The most likely underlying cause of this patie...  
(1250, 2)


## Save meaningful centroids dataset

In [None]:
centroids_dataset = Dataset.from_pandas(centroids_with_responses, preserve_index=False)

centroids_dataset.save_to_disk("centroids_dataset")

Saving the dataset (0/1 shards):   0%|          | 0/1250 [00:00<?, ? examples/s]

## Load centroids dataset into memory

In [None]:
loaded_dataset = load_from_disk("centroids_dataset")
print(loaded_dataset.to_pandas().head())
print(loaded_dataset.to_pandas().shape)
print(loaded_dataset)

                                            Question  \
0  A 78-year-old woman with a history of essentia...   
1  A 39-year-old woman presents with 5 days of pa...   
2  A child presents with an infective skin lesion...   
3  本例检查结果:Hh105g/L,WBC5.9×109/L,尿蛋白(+++),尿素氮20mmo...   
4  A 68-year-old woman comes to the physician for...   

                                            Response  
0  For managing a provoked deep venous thrombosis...  
1  Based on the information provided, the most li...  
2  To confirm the identity of the organism likely...  
3  The clinical staging of this patient aligns wi...  
4  The most likely underlying cause of this patie...  
(1250, 2)
Dataset({
    features: ['Question', 'Response'],
    num_rows: 1250
})


## Split centroids dataset into train and test subset

In [None]:
loaded_dataset = loaded_dataset.train_test_split(test_size=0.1)
print(len(loaded_dataset["train"]))
print(len(loaded_dataset["test"]))
print(loaded_dataset["test"].to_pandas().head())

1125
125
                                            Question  \
0  In a 95% confidence interval for the prevalenc...   
1  According to the Factory Act of 1948, what is ...   
2  In a situation where a 32-year-old man with a ...   
3  CAD predisposing factors-a)  Homocysteinemiab)...   
4  A 4 year old child presents with acute watery ...   

                                            Response  
0  In a 95% confidence interval ranging from 56% ...  
1  According to the Factory Act of 1948, the maxi...  
2  Based on the symptoms described—coma, pinpoint...  
3  The risk factors for coronary artery disease (...  
4       The etiological agent is B. Giardia lamblia.  


## Select Hugging Face model and its tokenizer

In [None]:
model_name = "nvidia/Nemotron-Research-Reasoning-Qwen-1.5B"
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

## Function to tokenize and format the data

In [None]:
def preprocess_function(examples):
    prompts = [f"[INST] {question} [/INST]" for question in examples['Question']]
    responses = examples['Response']
    
    texts = []
    for prompt, response in zip(prompts, responses):
        texts.append(f"{prompt} {response}")
    
    print(texts[:1])
    tokenized = tokenizer(
        texts,
        truncation=True,
        max_length=512,
        padding="max_length",
        return_tensors="pt"
    )
    
    tokenized["labels"] = tokenized["input_ids"].clone()
    
    return tokenized

## All dataset is tokenized

In [None]:
tokenized_dataset = loaded_dataset.map(
    preprocess_function,
    batched=True,
    remove_columns=loaded_dataset["train"].column_names
)

Map:   0%|          | 0/1125 [00:00<?, ? examples/s]

['[INST] A 7-month-old boy with an unremarkable past medical history is experiencing symptoms such as fever, chills, cough, runny nose, and watery eyes along with his elder brother. He has been diagnosed with an influenza virus infection, and this is his first exposure to the virus. What immune mechanism is most likely responsible for combating the influenza virus infection in this scenario? [/INST] In this scenario, since the 7-month-old boy is experiencing his first influenza virus infection, the innate immune response plays a pivotal role in combating the virus initially. The key elements of this response include the production of Type I interferons (such as IFN-alpha and IFN-beta) by virus-infected cells. These interferons act as signaling molecules, alerting neighboring cells to bolster their defenses against the viral infection and activating natural killer (NK) cells. NK cells then play an essential role in identifying and destroying infected cells, helping to control the spread

Map:   0%|          | 0/125 [00:00<?, ? examples/s]

['[INST] In a 95% confidence interval for the prevalence of cancer among smokers aged over 65 years, ranging from 56% to 76%, what is the probability that the true prevalence is actually less than 56%? [/INST] In a 95% confidence interval ranging from 56% to 76% for the prevalence of cancer among smokers aged over 65, the probability that the true prevalence is actually less than 56% is 2.5%. This accounts for the lower tail of the distribution outside the confidence interval, with the remaining 2.5% being the probability that the true prevalence is more than 76%.']


## Create model shell from specified Hugging Face model's name, and using available GPU.

In [None]:
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.bfloat16,
    device_map="auto"
)

## Grid Search of set of hyperparameters to find the best combination.

In [None]:
def optuna_objective(trial):

    wandb.init(
        project="model-qwen-finetuned-medical-reasoning",
        name=f"trial_{trial.number}",
        reinit=True
    )
    learning_rate = trial.suggest_float("learning_rate", 1e-5, 5e-5, log=True)
    per_device_train_batch_size = trial.suggest_categorical("per_device_train_batch_size", [2, 4, 8, 16])
    gradient_accumulation_steps = trial.suggest_categorical("gradient_accumulation_steps", [4, 8, 16])

    training_args = TrainingArguments(
        output_dir="./qwen-finetuned",
        run_name=f"trial_{trial.number}",
        overwrite_output_dir=True,
        num_train_epochs=5,
        per_device_train_batch_size=per_device_train_batch_size,
        per_device_eval_batch_size=per_device_train_batch_size,
        gradient_accumulation_steps=gradient_accumulation_steps,
        save_steps=1000,
        save_total_limit=1,
        eval_steps=500,
        logging_dir="./logs",
        logging_steps=50,
        learning_rate=learning_rate,
        weight_decay=0.01,
        warmup_steps=200,
        lr_scheduler_type="cosine",
        report_to="wandb",
        fp16=False,
        bf16=True,
        gradient_checkpointing=True,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_dataset["train"],
        eval_dataset=tokenized_dataset["test"],
    )

    trainer.train()
    eval_metrics = trainer.evaluate()
    wandb.finish()
    return eval_metrics["eval_loss"]


## Run Grid Search to find the best hypermarameter's combination

In [None]:
# Run Optuna study
study = optuna.create_study(direction="minimize")  # minimize eval_loss
study.optimize(optuna_objective, n_trials=20)

print("Best trial:")
print(study.best_trial)
print("Best params:")
print(study.best_params)

[I 2025-06-09 09:17:00,709] A new study created in memory with name: no-name-9bda1e00-b368-4064-b036-3b0ada42bfc2
[34m[1mwandb[0m: Currently logged in as: [33mluis-orellana777[0m ([33mluis-orellana777-sngular[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


Step,Training Loss


[34m[1mwandb[0m: [32m[41mERROR[0m The nbformat package was not found. It is required to save notebook history.


0,1
eval/loss,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
train/epoch,▁▁
train/global_step,▁▁

0,1
eval/loss,0.93118
eval/runtime,2.3854
eval/samples_per_second,52.402
eval/steps_per_second,3.354
total_flos,2.579805945987072e+16
train/epoch,4.90141
train/global_step,40.0
train_loss,5.61875
train_runtime,376.2862
train_samples_per_second,14.949


[I 2025-06-09 09:23:21,582] Trial 0 finished with value: 0.9311835169792175 and parameters: {'learning_rate': 4.673650602358979e-05, 'per_device_train_batch_size': 16, 'gradient_accumulation_steps': 8}. Best is trial 0 with value: 0.9311835169792175.


Step,Training Loss
50,0.9403
100,0.7806
150,0.6931


[34m[1mwandb[0m: [32m[41mERROR[0m The nbformat package was not found. It is required to save notebook history.


0,1
eval/loss,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
train/epoch,▁▄▇██
train/global_step,▁▄▇██
train/grad_norm,█▄▁
train/learning_rate,▁▅█
train/loss,█▃▁

0,1
eval/loss,0.66609
eval/runtime,2.0408
eval/samples_per_second,61.251
eval/steps_per_second,30.871
total_flos,2.6044658557648896e+16
train/epoch,4.88099
train/global_step,175.0
train/grad_norm,0.71484
train/learning_rate,2e-05
train/loss,0.6931


[I 2025-06-09 09:29:35,049] Trial 1 finished with value: 0.6660937666893005 and parameters: {'learning_rate': 3.0711570833265244e-05, 'per_device_train_batch_size': 2, 'gradient_accumulation_steps': 16}. Best is trial 1 with value: 0.6660937666893005.


Step,Training Loss
50,0.6434


[34m[1mwandb[0m: [32m[41mERROR[0m The nbformat package was not found. It is required to save notebook history.


0,1
eval/loss,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
train/epoch,▁██
train/global_step,▁██
train/grad_norm,▁
train/learning_rate,▁
train/loss,▁

0,1
eval/loss,0.6648
eval/runtime,1.7112
eval/samples_per_second,73.047
eval/steps_per_second,9.35
total_flos,2.5285892102946816e+16
train/epoch,4.73759
train/global_step,85.0
train/grad_norm,0.50391
train/learning_rate,1e-05
train/loss,0.6434


[I 2025-06-09 09:34:04,399] Trial 2 finished with value: 0.6648001670837402 and parameters: {'learning_rate': 2.7333299657081294e-05, 'per_device_train_batch_size': 8, 'gradient_accumulation_steps': 8}. Best is trial 2 with value: 0.6648001670837402.


Step,Training Loss
50,0.6608


[34m[1mwandb[0m: [32m[41mERROR[0m The nbformat package was not found. It is required to save notebook history.


0,1
eval/loss,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
train/epoch,▁██
train/global_step,▁██
train/grad_norm,▁
train/learning_rate,▁
train/loss,▁

0,1
eval/loss,0.66375
eval/runtime,2.4868
eval/samples_per_second,50.266
eval/steps_per_second,3.217
total_flos,2.610156604175155e+16
train/epoch,4.95775
train/global_step,85.0
train/grad_norm,0.55469
train/learning_rate,1e-05
train/loss,0.6608


[I 2025-06-09 09:40:19,492] Trial 3 finished with value: 0.6637539863586426 and parameters: {'learning_rate': 2.116154911629595e-05, 'per_device_train_batch_size': 16, 'gradient_accumulation_steps': 4}. Best is trial 3 with value: 0.6637539863586426.


Step,Training Loss
50,0.6249
100,0.6204
150,0.6538
200,0.6254
250,0.63
300,0.6285
350,0.6317
400,0.6049
450,0.6259
500,0.6005


[34m[1mwandb[0m: [32m[41mERROR[0m The nbformat package was not found. It is required to save notebook history.


0,1
eval/loss,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
train/epoch,▁▂▂▃▃▄▄▅▅▆▆▇▇███
train/global_step,▁▂▂▃▃▄▄▅▅▆▆▇▇███
train/grad_norm,▅▃▃█▄▃▃▂▁▄▅▅▆▃
train/learning_rate,▃▄▆██▇▇▆▅▃▂▂▁▁
train/loss,▄▄█▄▅▅▅▂▄▁▃▂▂▂

0,1
eval/loss,0.66021
eval/runtime,2.0363
eval/samples_per_second,61.386
eval/steps_per_second,30.939
total_flos,2.6594764237307904e+16
train/epoch,4.99467
train/global_step,700.0
train/grad_norm,1.46094
train/learning_rate,0.0
train/loss,0.605


[I 2025-06-09 09:47:07,041] Trial 4 finished with value: 0.6602098941802979 and parameters: {'learning_rate': 1.3260885303477737e-05, 'per_device_train_batch_size': 2, 'gradient_accumulation_steps': 4}. Best is trial 4 with value: 0.6602098941802979.


Step,Training Loss


[34m[1mwandb[0m: [32m[41mERROR[0m The nbformat package was not found. It is required to save notebook history.


0,1
eval/loss,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
train/epoch,▁▁
train/global_step,▁▁

0,1
eval/loss,0.66026
eval/runtime,1.6923
eval/samples_per_second,73.864
eval/steps_per_second,9.455
total_flos,2.503929300516864e+16
train/epoch,4.9078
train/global_step,40.0
train_loss,0.66263
train_runtime,261.0498
train_samples_per_second,21.548


[I 2025-06-09 09:51:31,238] Trial 5 finished with value: 0.6602570414543152 and parameters: {'learning_rate': 1.1669305797363407e-05, 'per_device_train_batch_size': 8, 'gradient_accumulation_steps': 16}. Best is trial 4 with value: 0.6602098941802979.


Step,Training Loss
50,0.6294


[34m[1mwandb[0m: [32m[41mERROR[0m The nbformat package was not found. It is required to save notebook history.


0,1
eval/loss,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
train/epoch,▁██
train/global_step,▁██
train/grad_norm,▁
train/learning_rate,▁
train/loss,▁

0,1
eval/loss,0.6603
eval/runtime,2.3114
eval/samples_per_second,54.081
eval/steps_per_second,3.461
total_flos,2.610156604175155e+16
train/epoch,4.95775
train/global_step,85.0
train/grad_norm,0.53906
train/learning_rate,0.0
train/loss,0.6294


[I 2025-06-09 09:57:46,115] Trial 6 finished with value: 0.6603027582168579 and parameters: {'learning_rate': 1.805249450562672e-05, 'per_device_train_batch_size': 16, 'gradient_accumulation_steps': 4}. Best is trial 4 with value: 0.6602098941802979.


Step,Training Loss
50,0.6268


[34m[1mwandb[0m: [32m[41mERROR[0m The nbformat package was not found. It is required to save notebook history.


0,1
eval/loss,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
train/epoch,▁██
train/global_step,▁██
train/grad_norm,▁
train/learning_rate,▁
train/loss,▁

0,1
eval/loss,0.66037
eval/runtime,2.1331
eval/samples_per_second,58.599
eval/steps_per_second,3.75
total_flos,2.610156604175155e+16
train/epoch,4.95775
train/global_step,85.0
train/grad_norm,0.54297
train/learning_rate,0.0
train/loss,0.6268


[I 2025-06-09 10:04:12,537] Trial 7 finished with value: 0.6603732705116272 and parameters: {'learning_rate': 2.0124183751489064e-05, 'per_device_train_batch_size': 16, 'gradient_accumulation_steps': 4}. Best is trial 4 with value: 0.6602098941802979.


Step,Training Loss
50,0.6019
100,0.598
150,0.5815


[34m[1mwandb[0m: [32m[41mERROR[0m The nbformat package was not found. It is required to save notebook history.


0,1
eval/loss,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
train/epoch,▁▄▇██
train/global_step,▁▄▇██
train/grad_norm,▁▃█
train/learning_rate,▁▅█
train/loss,█▇▁

0,1
eval/loss,0.65984
eval/runtime,1.6837
eval/samples_per_second,74.24
eval/steps_per_second,9.503
total_flos,2.6044658557648896e+16
train/epoch,4.87943
train/global_step,175.0
train/grad_norm,0.69922
train/learning_rate,2e-05
train/loss,0.5815


[I 2025-06-09 10:08:54,450] Trial 8 finished with value: 0.6598436832427979 and parameters: {'learning_rate': 2.6563467269626478e-05, 'per_device_train_batch_size': 8, 'gradient_accumulation_steps': 4}. Best is trial 8 with value: 0.6598436832427979.


Step,Training Loss
50,0.5687
100,0.563
150,0.5368


[34m[1mwandb[0m: [32m[41mERROR[0m The nbformat package was not found. It is required to save notebook history.


0,1
eval/loss,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
train/epoch,▁▄▇██
train/global_step,▁▄▇██
train/grad_norm,▄▁█
train/learning_rate,▁▄█
train/loss,█▇▁

0,1
eval/loss,0.67133
eval/runtime,2.0773
eval/samples_per_second,60.175
eval/steps_per_second,30.328
total_flos,2.6044658557648896e+16
train/epoch,4.88099
train/global_step,175.0
train/grad_norm,0.74219
train/learning_rate,3e-05
train/loss,0.5368


[I 2025-06-09 10:15:06,821] Trial 9 finished with value: 0.6713265180587769 and parameters: {'learning_rate': 3.909964103244932e-05, 'per_device_train_batch_size': 2, 'gradient_accumulation_steps': 16}. Best is trial 8 with value: 0.6598436832427979.


Step,Training Loss
50,0.4862
100,0.5072
150,0.4938
200,0.4876
250,0.4842
300,0.4695
350,0.4743


[34m[1mwandb[0m: [32m[41mERROR[0m The nbformat package was not found. It is required to save notebook history.


0,1
eval/loss,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
train/epoch,▁▂▃▄▆▇███
train/global_step,▁▂▃▅▆▇███
train/grad_norm,▁▃▁▁▃▅█
train/learning_rate,▃▄▆█▆▃▁
train/loss,▄█▆▄▄▁▂

0,1
eval/loss,0.69724
eval/runtime,1.8311
eval/samples_per_second,68.266
eval/steps_per_second,17.476
total_flos,2.663270256004301e+16
train/epoch,4.99291
train/global_step,350.0
train/grad_norm,1.39844
train/learning_rate,0.0
train/loss,0.4743


[I 2025-06-09 10:20:42,869] Trial 10 finished with value: 0.6972418427467346 and parameters: {'learning_rate': 1.5652358124671786e-05, 'per_device_train_batch_size': 4, 'gradient_accumulation_steps': 4}. Best is trial 8 with value: 0.6598436832427979.


Step,Training Loss
50,0.4669
100,0.4589
150,0.4827
200,0.467
250,0.4682
300,0.4685
350,0.4767
400,0.451
450,0.4681
500,0.4526


[34m[1mwandb[0m: [32m[41mERROR[0m The nbformat package was not found. It is required to save notebook history.


0,1
eval/loss,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
train/epoch,▁▂▂▃▃▄▄▅▅▆▆▇▇███
train/global_step,▁▂▂▃▃▄▄▅▅▆▆▇▇███
train/grad_norm,▄▁▃▇▃▃▃▂▂▅▄▅▆█
train/learning_rate,▃▄▆██▇▇▆▅▃▂▂▁▁
train/loss,▅▄█▅▆▆▇▃▅▃▅▁▂▇

0,1
eval/loss,0.70186
eval/runtime,2.1494
eval/samples_per_second,58.156
eval/steps_per_second,29.31
total_flos,2.6594764237307904e+16
train/epoch,4.99467
train/global_step,700.0
train/grad_norm,2.07812
train/learning_rate,0.0
train/loss,0.478


[I 2025-06-09 10:27:29,361] Trial 11 finished with value: 0.701859176158905 and parameters: {'learning_rate': 1.0084606360688313e-05, 'per_device_train_batch_size': 2, 'gradient_accumulation_steps': 4}. Best is trial 8 with value: 0.6598436832427979.


Step,Training Loss
50,0.4586
100,0.4556
150,0.4425


[34m[1mwandb[0m: [32m[41mERROR[0m The nbformat package was not found. It is required to save notebook history.


0,1
eval/loss,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
train/epoch,▁▄▇██
train/global_step,▁▄▇██
train/grad_norm,▁▂█
train/learning_rate,▁▅█
train/loss,█▇▁

0,1
eval/loss,0.69872
eval/runtime,1.6772
eval/samples_per_second,74.529
eval/steps_per_second,9.54
total_flos,2.6044658557648896e+16
train/epoch,4.87943
train/global_step,175.0
train/grad_norm,0.99219
train/learning_rate,1e-05
train/loss,0.4425


[I 2025-06-09 10:32:09,760] Trial 12 finished with value: 0.6987183690071106 and parameters: {'learning_rate': 1.4380827936722391e-05, 'per_device_train_batch_size': 8, 'gradient_accumulation_steps': 4}. Best is trial 8 with value: 0.6598436832427979.


Step,Training Loss
50,0.4451
100,0.4653
150,0.4513
200,0.4405
250,0.4162
300,0.3951
350,0.3856


[34m[1mwandb[0m: [32m[41mERROR[0m The nbformat package was not found. It is required to save notebook history.


0,1
eval/loss,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
train/epoch,▁▂▃▄▆▇███
train/global_step,▁▂▃▅▆▇███
train/grad_norm,▂█▄▁▆▆▅
train/learning_rate,▃▄▆█▆▃▁
train/loss,▆█▇▆▄▂▁

0,1
eval/loss,0.75119
eval/runtime,1.9382
eval/samples_per_second,64.492
eval/steps_per_second,16.51
total_flos,2.663270256004301e+16
train/epoch,4.99291
train/global_step,350.0
train/grad_norm,1.39062
train/learning_rate,0.0
train/loss,0.3856


[I 2025-06-09 10:37:46,236] Trial 13 finished with value: 0.7511882185935974 and parameters: {'learning_rate': 2.6992782370856335e-05, 'per_device_train_batch_size': 4, 'gradient_accumulation_steps': 4}. Best is trial 8 with value: 0.6598436832427979.


Step,Training Loss
50,0.3816
100,0.372
150,0.3901
200,0.3728
250,0.3741
300,0.3507
350,0.3204
400,0.2939
450,0.2767
500,0.2546


[34m[1mwandb[0m: [32m[41mERROR[0m The nbformat package was not found. It is required to save notebook history.


0,1
eval/loss,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
train/epoch,▁▂▂▃▃▄▄▅▅▆▆▇▇███
train/global_step,▁▂▂▃▃▄▄▅▅▆▆▇▇███
train/grad_norm,▁▁▂▆▂█▆▃█▆▅▃▂▃
train/learning_rate,▃▄▆██▇▇▆▅▃▂▂▁▁
train/loss,█▇█▇▇▆▅▄▃▂▃▁▁▂

0,1
eval/loss,0.87318
eval/runtime,2.0688
eval/samples_per_second,60.422
eval/steps_per_second,30.453
total_flos,2.6594764237307904e+16
train/epoch,4.99467
train/global_step,700.0
train/grad_norm,2.46875
train/learning_rate,0.0
train/loss,0.2581


[I 2025-06-09 10:44:32,868] Trial 14 finished with value: 0.8731797337532043 and parameters: {'learning_rate': 3.36945998843922e-05, 'per_device_train_batch_size': 2, 'gradient_accumulation_steps': 4}. Best is trial 8 with value: 0.6598436832427979.


Step,Training Loss
50,0.2435


[34m[1mwandb[0m: [32m[41mERROR[0m The nbformat package was not found. It is required to save notebook history.


0,1
eval/loss,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
train/epoch,▁██
train/global_step,▁██
train/grad_norm,▁
train/learning_rate,▁
train/loss,▁

0,1
eval/loss,0.87411
eval/runtime,1.7342
eval/samples_per_second,72.079
eval/steps_per_second,9.226
total_flos,2.5285892102946816e+16
train/epoch,4.73759
train/global_step,85.0
train/grad_norm,0.71094
train/learning_rate,0.0
train/loss,0.2435


[I 2025-06-09 10:49:01,239] Trial 15 finished with value: 0.8741080164909363 and parameters: {'learning_rate': 1.3384029845125636e-05, 'per_device_train_batch_size': 8, 'gradient_accumulation_steps': 8}. Best is trial 8 with value: 0.6598436832427979.


Step,Training Loss
50,0.2535
100,0.236
150,0.2438
200,0.2414
250,0.2338
300,0.2182
350,0.2079
400,0.1957
450,0.1844
500,0.1693


[34m[1mwandb[0m: [32m[41mERROR[0m The nbformat package was not found. It is required to save notebook history.


0,1
eval/loss,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
train/epoch,▁▂▂▃▃▄▄▅▅▆▆▇▇███
train/global_step,▁▂▂▃▃▄▄▅▅▆▆▇▇███
train/grad_norm,▂▁▂█▃▂▆▂▄▄▅▂▁▃
train/learning_rate,▃▄▆██▇▇▆▅▃▂▂▁▁
train/loss,█▇▇▇▆▅▄▃▃▁▃▁▁▂

0,1
eval/loss,1.00135
eval/runtime,2.0794
eval/samples_per_second,60.115
eval/steps_per_second,30.298
total_flos,2.6594764237307904e+16
train/epoch,4.99467
train/global_step,700.0
train/grad_norm,2.46875
train/learning_rate,0.0
train/loss,0.1814


[I 2025-06-09 10:55:47,734] Trial 16 finished with value: 1.0013456344604492 and parameters: {'learning_rate': 2.3993824751261608e-05, 'per_device_train_batch_size': 2, 'gradient_accumulation_steps': 4}. Best is trial 8 with value: 0.6598436832427979.


Step,Training Loss
50,0.1734
100,0.1647
150,0.1612


[34m[1mwandb[0m: [32m[41mERROR[0m The nbformat package was not found. It is required to save notebook history.


0,1
eval/loss,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
train/epoch,▁▄▇██
train/global_step,▁▄▇██
train/grad_norm,▁▇█
train/learning_rate,▁▅█
train/loss,█▃▁

0,1
eval/loss,1.02647
eval/runtime,1.6535
eval/samples_per_second,75.597
eval/steps_per_second,9.676
total_flos,2.6044658557648896e+16
train/epoch,4.87943
train/global_step,175.0
train/grad_norm,0.99609
train/learning_rate,1e-05
train/loss,0.1612


[I 2025-06-09 11:00:28,531] Trial 17 finished with value: 1.0264668464660645 and parameters: {'learning_rate': 1.710104295980148e-05, 'per_device_train_batch_size': 8, 'gradient_accumulation_steps': 4}. Best is trial 8 with value: 0.6598436832427979.


Step,Training Loss
50,0.1589
100,0.1513
150,0.1488


[34m[1mwandb[0m: [32m[41mERROR[0m The nbformat package was not found. It is required to save notebook history.


0,1
eval/loss,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
train/epoch,▁▄▇██
train/global_step,▁▄▇██
train/grad_norm,█▅▁
train/learning_rate,▁▄█
train/loss,█▃▁

0,1
eval/loss,1.05497
eval/runtime,1.8509
eval/samples_per_second,67.535
eval/steps_per_second,17.289
total_flos,2.6044658557648896e+16
train/epoch,4.87943
train/global_step,175.0
train/grad_norm,0.94141
train/learning_rate,1e-05
train/loss,0.1488


[I 2025-06-09 11:05:49,594] Trial 18 finished with value: 1.0549684762954712 and parameters: {'learning_rate': 1.2145389528972164e-05, 'per_device_train_batch_size': 4, 'gradient_accumulation_steps': 8}. Best is trial 8 with value: 0.6598436832427979.


Step,Training Loss
50,0.1521
100,0.1444
150,0.1385


[34m[1mwandb[0m: [32m[41mERROR[0m The nbformat package was not found. It is required to save notebook history.


0,1
eval/loss,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
train/epoch,▁▄▇██
train/global_step,▁▄▇██
train/grad_norm,▁██
train/learning_rate,▁▄█
train/loss,█▄▁

0,1
eval/loss,1.09559
eval/runtime,2.0668
eval/samples_per_second,60.481
eval/steps_per_second,30.482
total_flos,2.6044658557648896e+16
train/epoch,4.88099
train/global_step,175.0
train/grad_norm,1.0
train/learning_rate,2e-05
train/loss,0.1385


[I 2025-06-09 11:12:00,913] Trial 19 finished with value: 1.0955885648727417 and parameters: {'learning_rate': 2.3904033239190128e-05, 'per_device_train_batch_size': 2, 'gradient_accumulation_steps': 16}. Best is trial 8 with value: 0.6598436832427979.


Best trial:
FrozenTrial(number=8, state=TrialState.COMPLETE, values=[0.6598436832427979], datetime_start=datetime.datetime(2025, 6, 9, 10, 4, 12, 537942), datetime_complete=datetime.datetime(2025, 6, 9, 10, 8, 54, 450388), params={'learning_rate': 2.6563467269626478e-05, 'per_device_train_batch_size': 8, 'gradient_accumulation_steps': 4}, user_attrs={}, system_attrs={}, intermediate_values={}, distributions={'learning_rate': FloatDistribution(high=5e-05, log=True, low=1e-05, step=None), 'per_device_train_batch_size': CategoricalDistribution(choices=(2, 4, 8, 16)), 'gradient_accumulation_steps': CategoricalDistribution(choices=(4, 8, 16))}, trial_id=8, value=None)
Best params:
{'learning_rate': 2.6563467269626478e-05, 'per_device_train_batch_size': 8, 'gradient_accumulation_steps': 4}
