In [None]:
from IPython.display import HTML, display

def set_css(*_event_args, **_event_kwargs):
  """Emit a ``<style>`` block that makes ``<pre>`` output wrap long lines.

  The function is registered as an IPython ``pre_run_cell`` callback. Depending
  on the IPython version the event is fired either with no arguments or with an
  execution-info object, so any positional/keyword arguments are accepted and
  ignored. Calling ``set_css()`` directly with no arguments still works.
  """
  display(HTML('''
  <style>
    pre {
        white-space: pre-wrap;
    }
  </style>
  '''))
# Hook set_css into the 'pre_run_cell' event so it runs before each cell executes.
shell_events = get_ipython().events
shell_events.register('pre_run_cell', set_css)

In [None]:
!pip install rouge_score
!pip install nltk
!pip install bert_score
!pip install peft
!pip install --upgrade bitsandbytes
!pip install optuna
!pip install -U datasets

Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24935 sha256=13ae8fecd9f2514e103a8e0537ff37193823b26117edaade57b5b43a27160256
  Stored in directory: /root/.cache/pip/wheels/5f/dd/89/461065a73be61a532ff8599a28e9beef17985c9e9c31e541b4
Successfully built rouge_score
Installing collected packages: rouge_score
Successfully installed rouge_score-0.1.2
Collecting bert_score
  Downloading bert_score-0.3.13-py3-none-any.whl.metadata (15 kB)
Downloading bert_score-0.3.13-py3-none-any.whl (61 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.1/61.1 kB[0m [31m831.3 kB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: bert_score
Successfully installed bert_score-0.3.13
Collecting peft
  Download

In [None]:
import optuna
import numpy as np

from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline, TrainingArguments, Trainer, DataCollatorForSeq2Seq, BitsAndBytesConfig, EarlyStoppingCallback
from sklearn.metrics import accuracy_score, f1_score
from peft import get_peft_model, LoraConfig
import pandas as pd
import transformers
import torch
import random
from datasets import Dataset


from rouge_score import rouge_scorer
from nltk.translate.bleu_score import sentence_bleu
from bert_score import score as bert_score

# Load Model

In [None]:
model_id = "tiiuae/falcon-7b"

tokenizer = AutoTokenizer.from_pretrained(model_id)

# Load the model with 4-bit quantization using bitsandbytes.
# The compute dtype is bfloat16 so the de-quantized matmuls run in the same
# precision as training, which is configured with `bf16=True` in the
# TrainingArguments of the tuning objective further down.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,                      # Enable 4-bit quantization
    bnb_4bit_compute_dtype=torch.bfloat16,  # Match the bf16 training precision
    bnb_4bit_use_double_quant=True,         # Double quantization for more memory savings
    bnb_4bit_quant_type="nf4",              # NormalFloat4 (nf4) quantization
)

model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    quantization_config=bnb_config,
)

tokenizer_config.json:   0%|          | 0.00/287 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.73M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/281 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/1.05k [00:00<?, ?B/s]

pytorch_model.bin.index.json:   0%|          | 0.00/16.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

pytorch_model-00001-of-00002.bin:   0%|          | 0.00/9.95G [00:00<?, ?B/s]

pytorch_model-00002-of-00002.bin:   0%|          | 0.00/4.48G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/117 [00:00<?, ?B/s]

# Load & Prepare *Data*

In [None]:
# Read the summarized MedQuAD CSV; the resulting frame is the source of both the
# training and the validation split created later in the notebook.
MEDQUAD_CSV = '/content/medquad_summarized.csv'
df = pd.read_csv(MEDQUAD_CSV)

In [None]:
# Expose the question/summary columns under the generic names that the
# tokenization step reads.
for alias, source_column in (('input_text', 'question'), ('output_text', 'summary')):
    df[alias] = df[source_column]

# Convert just the two aliased columns to Hugging Face Dataset format
text_columns = ['input_text', 'output_text']
dataset = Dataset.from_pandas(df[text_columns])

# Configure Tokenizer & Tokenize Data

In [None]:
# Ensure the tokenizer has a pad token: fall back to EOS when none is defined.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token


# Grow the model's embedding matrix only if the tokenizer has more entries than
# the model has embedding rows. Resizing unconditionally would also *shrink* a
# pretrained embedding matrix whenever the tokenizer happens to be smaller.
embedding_rows = model.get_input_embeddings().weight.shape[0]
if len(tokenizer) > embedding_rows:
    model.resize_token_embeddings(len(tokenizer))



def tokenize_function(examples, max_length=256):
    """Build causal-LM training features from question/summary pairs.

    A causal language model computes its loss between the logits of
    ``input_ids`` and ``labels`` at the same positions, so both must describe
    the same token sequence. Each example is therefore encoded as
    ``question tokens + summary tokens + EOS``; the labels repeat that sequence
    with the question and padding positions set to ``-100`` so that only the
    summary contributes to the loss.

    Args:
        examples: Mapping with ``'input_text'`` and ``'output_text'``. Each value
            is either a list of strings (batched ``Dataset.map``) or a single
            string (non-batched call).
        max_length: Fixed length every sequence is padded/truncated to.
            Adjust as per your VRAM or reduce if needed.

    Returns:
        dict with ``'input_ids'``, ``'attention_mask'`` and ``'labels'``. For
        batched input every value is a list of ``max_length``-long lists; for a
        single example every value is one ``max_length``-long list.
    """
    is_single = isinstance(examples['input_text'], str)
    questions = [examples['input_text']] if is_single else list(examples['input_text'])
    summaries = [examples['output_text']] if is_single else list(examples['output_text'])

    eos_id = tokenizer.eos_token_id
    pad_id = tokenizer.pad_token_id
    if pad_id is None:
        # Padding positions are masked out, so the concrete id is irrelevant.
        pad_id = eos_id if eos_id is not None else 0

    # Tokenize without automatic special tokens; EOS is appended explicitly below.
    prompt_batch = tokenizer(questions, add_special_tokens=False)['input_ids']
    answer_batch = tokenizer(summaries, add_special_tokens=False)['input_ids']

    features = {'input_ids': [], 'attention_mask': [], 'labels': []}
    for prompt_ids, answer_ids in zip(prompt_batch, answer_batch):
        answer_ids = list(answer_ids)
        if eos_id is not None:
            answer_ids.append(eos_id)  # teach the model where the summary ends

        # The question may use whatever room the answer leaves, but is never cut
        # below half the window; the answer is then truncated to what remains.
        prompt_budget = max(max_length - len(answer_ids), max_length // 2)
        prompt_ids = list(prompt_ids)[:prompt_budget]
        answer_ids = answer_ids[:max_length - len(prompt_ids)]

        token_ids = prompt_ids + answer_ids
        pad_len = max_length - len(token_ids)
        features['input_ids'].append(token_ids + [pad_id] * pad_len)
        features['attention_mask'].append([1] * len(token_ids) + [0] * pad_len)
        features['labels'].append([-100] * len(prompt_ids) + answer_ids + [-100] * pad_len)

    if is_single:
        return {key: rows[0] for key, rows in features.items()}
    return features

# Run tokenize_function over the whole dataset, one batch of rows at a time.
tokenized_dataset = dataset.map(function=tokenize_function, batched=True)

Map:   0%|          | 0/14979 [00:00<?, ? examples/s]

In [None]:
# Split the dataset into training (80%) and validation (20%) sets.
# The fixed seed keeps the split identical across runs.
train_test_split = tokenized_dataset.train_test_split(test_size=0.2, seed=42)
train_dataset = train_test_split['train']
eval_dataset = train_test_split['test']

# Use only a small subset of the data for hyperparameter tuning.
# The requested sizes are clamped to the actual split sizes so that a smaller
# dataset yields smaller subsets instead of an out-of-range index error.
TUNING_TRAIN_SIZE = 4000
TUNING_EVAL_SIZE = 2000
train_subset = train_dataset.select(range(min(TUNING_TRAIN_SIZE, len(train_dataset))))
eval_subset = eval_dataset.select(range(min(TUNING_EVAL_SIZE, len(eval_dataset))))

# Hyperparameter Tuning

In [None]:
def objective(trial):
    """Optuna objective: fine-tune a LoRA adapter and return its validation loss.

    One trial samples a LoRA configuration (rank, alpha, dropout) and training
    hyperparameters (learning rate, number of epochs), trains an adapter on
    ``train_subset`` while evaluating on ``eval_subset`` every 200 steps, and
    finally scores the best checkpoint on the full ``eval_dataset``.

    The function relies on the notebook globals ``model``, ``tokenizer``,
    ``train_subset``, ``eval_subset`` and ``eval_dataset``.

    Args:
        trial: The ``optuna`` trial used to sample hyperparameters.

    Returns:
        The ``eval_loss`` reported by ``trainer.evaluate`` on ``eval_dataset``
        (lower is better).
    """
    # Suggest hyperparameters for LoraConfig
    lora_rank = trial.suggest_int('lora_r', 16, 26)  # Adjust the range as necessary
    lora_alpha = trial.suggest_int('lora_alpha', 16, 24)
    lora_dropout = trial.suggest_float('lora_dropout', 0.1, 0.15)

    # Define LoRA configuration
    lora_config = LoraConfig(
        r=lora_rank,                         # LoRA rank
        lora_alpha=lora_alpha,               # Scaling factor
        lora_dropout=lora_dropout,           # Dropout to prevent overfitting
        target_modules=["query_key_value", "dense"],  # Adjust based on your model architecture
        bias="none",                         # Keep as "none"
        task_type="CAUSAL_LM",              # Task type
    )

    model.enable_input_require_grads()
    # Apply LoRA to the model. This injects adapter layers into the shared
    # global `model` in place, so they are removed again in the `finally` below.
    model_with_lora = get_peft_model(model, lora_config)

    try:
        model_with_lora.print_trainable_parameters()

        # Suggest hyperparameters for training
        learning_rate = trial.suggest_float('learning_rate', 5e-6, 1e-5, log=True)
        num_train_epochs = trial.suggest_int('num_train_epochs', 3, 5)
        per_device_train_batch_size = 4

        # Define training arguments
        training_args = TrainingArguments(
            output_dir="./results",
            per_device_train_batch_size=per_device_train_batch_size,
            # Keep the effective batch size at 8 examples per optimizer step
            gradient_accumulation_steps=max(1, 8 // per_device_train_batch_size),
            learning_rate=learning_rate,
            num_train_epochs=num_train_epochs,
            eval_strategy="steps",             # Evaluate every `eval_steps` steps
            eval_steps=200,
            save_steps=400,                    # Kept a multiple of eval_steps
            logging_strategy="steps",
            logging_steps=200,
            load_best_model_at_end=True,       # Restore the checkpoint with the lowest eval_loss
            metric_for_best_model="eval_loss",
            greater_is_better=False,
            bf16=True,
        )

        # Adjust model configuration for VRAM efficiency
        model_with_lora.config.use_cache = False  # Disable caching to reduce memory usage
        model_with_lora.gradient_checkpointing_enable()  # Enable gradient checkpointing

        # Initialize the data collator
        data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model_with_lora, padding=True)

        # Initialize Trainer
        trainer = Trainer(
            model=model_with_lora,
            args=training_args,
            train_dataset=train_subset,
            eval_dataset=eval_subset,
            data_collator=data_collator,
        )

        # Set the trainer to explicitly return the loss
        trainer.can_return_loss = True

        # Train and evaluate.
        # NOTE(review): in-training evaluation uses `eval_subset`, while the value
        # returned to Optuna is computed on the full `eval_dataset` - confirm
        # this difference is intended.
        trainer.train()
        eval_results = trainer.evaluate(eval_dataset=eval_dataset)

        return eval_results['eval_loss']
    finally:
        # Strip the injected LoRA layers so the next trial starts from the plain
        # base model instead of layering a new adapter onto the previous one.
        model_with_lora.unload()


In [None]:
# Create the study and optimize.
# The sampler is seeded so the sequence of suggested hyperparameters can be
# reproduced across runs.
# NOTE: `objective` never reports intermediate values to the trial, so the
# pruner configured here does not stop any trial early.
sampler = optuna.samplers.TPESampler(seed=42)
study = optuna.create_study(
    direction="minimize",
    sampler=sampler,
    pruner=optuna.pruners.MedianPruner(),
)
study.optimize(objective, n_trials=20)  # Adjust `n_trials` to the number of hyperparameter combinations you want to try

# Display the best trial
print("Best trial:")
trial = study.best_trial
print(f"  Value: {trial.value}")
print("  Params: ")
for key, value in trial.params.items():
    print(f"    {key}: {value}")


[I 2024-10-01 11:42:59,420] A new study created in memory with name: no-name-dc751884-f1e5-4579-a6ca-6323abc3117b


trainable params: 14,057,472 || all params: 6,935,778,176 || trainable%: 0.2027


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Step,Training Loss,Validation Loss
200,8.7171,6.375253
400,5.2441,4.835126
600,4.7242,4.607574
800,4.5116,4.412804
1000,4.3303,4.275606
1200,4.2337,4.19127
1400,4.1537,4.145138
1600,4.1292,4.121818
1800,4.1011,4.10985
2000,4.1017,4.106264


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


[I 2024-10-01 12:40:45,378] Trial 0 finished with value: 4.100922584533691 and parameters: {'lora_r': 24, 'lora_alpha': 18, 'lora_dropout': 0.1092261171911793, 'learning_rate': 6.01388377465831e-06, 'num_train_epochs': 4}. Best is trial 0 with value: 4.100922584533691.


trainable params: 11,128,832 || all params: 6,932,849,536 || trainable%: 0.1605


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Step,Training Loss,Validation Loss
200,8.5273,6.000691
400,5.1056,4.77125
600,4.6583,4.524605
800,4.4101,4.305715
1000,4.2233,4.174953
1200,4.1386,4.107405
1400,4.0714,4.068825
1600,4.05,4.044814
1800,4.0215,4.027166
2000,4.0123,4.016768


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


[I 2024-10-01 13:52:16,735] Trial 1 finished with value: 3.999864101409912 and parameters: {'lora_r': 19, 'lora_alpha': 20, 'lora_dropout': 0.13933894935125327, 'learning_rate': 6.248528296293344e-06, 'num_train_epochs': 5}. Best is trial 1 with value: 3.999864101409912.


trainable params: 14,057,472 || all params: 6,935,778,176 || trainable%: 0.2027


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Step,Training Loss,Validation Loss
200,7.927,5.241584
400,4.8521,4.641437
600,4.5194,4.374122
800,4.2836,4.209305
1000,4.1532,4.132428
1200,4.1136,4.098261
1400,4.0742,4.084217


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


[I 2024-10-01 14:35:32,768] Trial 2 finished with value: 4.093052387237549 and parameters: {'lora_r': 24, 'lora_alpha': 24, 'lora_dropout': 0.11390029096384303, 'learning_rate': 7.51572835290396e-06, 'num_train_epochs': 3}. Best is trial 1 with value: 3.999864101409912.


trainable params: 10,543,104 || all params: 6,932,263,808 || trainable%: 0.1521


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Step,Training Loss,Validation Loss
200,7.4977,5.013784
400,4.7089,4.502496
600,4.3541,4.216542
800,4.145,4.103022
1000,4.0624,4.055431
1200,4.0397,4.029626
1400,4.0037,4.017776


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


[I 2024-10-01 15:18:48,642] Trial 3 finished with value: 4.023904323577881 and parameters: {'lora_r': 18, 'lora_alpha': 22, 'lora_dropout': 0.1419266734934595, 'learning_rate': 9.690358139838673e-06, 'num_train_epochs': 3}. Best is trial 1 with value: 3.999864101409912.


trainable params: 12,300,288 || all params: 6,934,020,992 || trainable%: 0.1774


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Step,Training Loss,Validation Loss
200,8.6065,6.144243
400,5.1632,4.803484
600,4.6965,4.578068
800,4.4742,4.37297
1000,4.2915,4.239297
1200,4.2004,4.162747
1400,4.1266,4.122756
1600,4.1075,4.101933
1800,4.0814,4.091025
2000,4.082,4.087608


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


[I 2024-10-01 16:16:48,193] Trial 4 finished with value: 4.082090854644775 and parameters: {'lora_r': 21, 'lora_alpha': 18, 'lora_dropout': 0.10681459059203782, 'learning_rate': 6.339446310216802e-06, 'num_train_epochs': 4}. Best is trial 1 with value: 3.999864101409912.


trainable params: 12,300,288 || all params: 6,934,020,992 || trainable%: 0.1774


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Step,Training Loss,Validation Loss
200,8.7726,6.535549
400,5.2928,4.846016
600,4.727,4.604708
800,4.5006,4.396673
1000,4.3117,4.253659
1200,4.209,4.165048
1400,4.1245,4.115504
1600,4.0959,4.086206
1800,4.0618,4.066142
2000,4.052,4.053374


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


[I 2024-10-01 17:28:04,738] Trial 5 finished with value: 4.035161972045898 and parameters: {'lora_r': 21, 'lora_alpha': 20, 'lora_dropout': 0.14352257868762208, 'learning_rate': 5.550374183731688e-06, 'num_train_epochs': 5}. Best is trial 1 with value: 3.999864101409912.


trainable params: 13,471,744 || all params: 6,935,192,448 || trainable%: 0.1943


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Step,Training Loss,Validation Loss
200,8.0591,5.3139
400,4.867,4.637781
600,4.5011,4.343798
800,4.2402,4.160013
1000,4.103,4.081409
1200,4.0562,4.03638
1400,4.0019,4.006648
1600,3.989,3.988833
1800,3.9669,3.973543
2000,3.9576,3.965801


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


[I 2024-10-01 18:39:22,560] Trial 6 finished with value: 3.9500231742858887 and parameters: {'lora_r': 23, 'lora_alpha': 17, 'lora_dropout': 0.10254130137343953, 'learning_rate': 8.225845529985416e-06, 'num_train_epochs': 5}. Best is trial 6 with value: 3.9500231742858887.


trainable params: 15,228,928 || all params: 6,936,949,632 || trainable%: 0.2195


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Step,Training Loss,Validation Loss
200,8.8872,6.950725


Step,Training Loss,Validation Loss
200,8.8872,6.950725
400,5.4306,4.906379
600,4.7826,4.665444
800,4.5824,4.494304
1000,4.4083,4.348882
1200,4.3049,4.254319
1400,4.2141,4.197674
1600,4.1806,4.166629
1800,4.1447,4.15106
2000,4.1443,4.146475


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


[I 2024-10-01 19:37:11,017] Trial 7 finished with value: 4.14143180847168 and parameters: {'lora_r': 26, 'lora_alpha': 18, 'lora_dropout': 0.1370007793350684, 'learning_rate': 5.510902944611987e-06, 'num_train_epochs': 4}. Best is trial 6 with value: 3.9500231742858887.


trainable params: 14,643,200 || all params: 6,936,363,904 || trainable%: 0.2111


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Step,Training Loss,Validation Loss
200,8.239,5.523374
400,4.9474,4.691679
600,4.5715,4.415991
800,4.3057,4.211518
1000,4.1443,4.11439
1200,4.0861,4.06284
1400,4.027,4.029088
1600,4.0111,4.008968
1800,3.9868,3.993093
2000,3.9774,3.984052


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


[I 2024-10-01 20:48:28,411] Trial 8 finished with value: 3.9679465293884277 and parameters: {'lora_r': 25, 'lora_alpha': 22, 'lora_dropout': 0.1473522840190758, 'learning_rate': 6.697710087224079e-06, 'num_train_epochs': 5}. Best is trial 6 with value: 3.9500231742858887.


trainable params: 9,957,376 || all params: 6,931,678,080 || trainable%: 0.1437


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Step,Training Loss,Validation Loss
200,8.7343,6.474539
400,5.2782,4.846046
600,4.7332,4.614523
800,4.5172,4.418076
1000,4.3374,4.282356
1200,4.2405,4.196044
1400,4.1583,4.147749
1600,4.132,4.1226
1800,4.1022,4.11008
2000,4.1024,4.106324


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


[I 2024-10-01 21:46:23,691] Trial 9 finished with value: 4.1012067794799805 and parameters: {'lora_r': 17, 'lora_alpha': 23, 'lora_dropout': 0.11856156560169466, 'learning_rate': 5.408439157838638e-06, 'num_train_epochs': 4}. Best is trial 6 with value: 3.9500231742858887.


trainable params: 12,886,016 || all params: 6,934,606,720 || trainable%: 0.1858


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Step,Training Loss,Validation Loss
200,8.1704,5.424845
400,4.9116,4.670504
600,4.5444,4.386296
800,4.2771,4.188788
1000,4.1264,4.101174
1200,4.0745,4.052963
1400,4.0181,4.020724
1600,4.0027,4.001634
1800,3.9792,3.985621
2000,3.9698,3.977197


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


[I 2024-10-01 22:57:32,498] Trial 10 finished with value: 3.961221694946289 and parameters: {'lora_r': 22, 'lora_alpha': 16, 'lora_dropout': 0.10024488158074826, 'learning_rate': 8.069041588696481e-06, 'num_train_epochs': 5}. Best is trial 6 with value: 3.9500231742858887.


trainable params: 12,886,016 || all params: 6,934,606,720 || trainable%: 0.1858


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Step,Training Loss,Validation Loss
200,8.1433,5.400565
400,4.9051,4.663884
600,4.5366,4.377058
800,4.2684,4.181446
1000,4.1199,4.095621
1200,4.0697,4.0494
1400,4.0144,4.018268
1600,4.0011,4.000277
1800,3.9781,3.98504
2000,3.9691,3.977032


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


[I 2024-10-02 00:08:39,720] Trial 11 finished with value: 3.9611761569976807 and parameters: {'lora_r': 22, 'lora_alpha': 16, 'lora_dropout': 0.10370108844232619, 'learning_rate': 8.135207055658295e-06, 'num_train_epochs': 5}. Best is trial 6 with value: 3.9500231742858887.


trainable params: 12,886,016 || all params: 6,934,606,720 || trainable%: 0.1858


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Step,Training Loss,Validation Loss
200,7.9881,5.257979
400,4.8459,4.624231
600,4.4835,4.320992
800,4.2178,4.143198
1000,4.0898,4.071149
1200,4.0472,4.028965
1400,3.9944,4.000538
1600,3.9837,3.983983
1800,3.9619,3.969083
2000,3.9526,3.961395


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


[I 2024-10-02 01:19:48,642] Trial 12 finished with value: 3.946066379547119 and parameters: {'lora_r': 22, 'lora_alpha': 16, 'lora_dropout': 0.1278687996079111, 'learning_rate': 8.693336643714757e-06, 'num_train_epochs': 5}. Best is trial 12 with value: 3.946066379547119.


trainable params: 13,471,744 || all params: 6,935,192,448 || trainable%: 0.1943


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Step,Training Loss,Validation Loss
200,7.7329,5.100988
400,4.7549,4.543376
600,4.3853,4.232192
800,4.1468,4.093946
1000,4.0476,4.033495
1200,4.0118,3.996113
1400,3.9617,3.970062
1600,3.9529,3.954183
1800,3.9327,3.939751
2000,3.9225,3.932453


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


[I 2024-10-02 02:31:08,289] Trial 13 finished with value: 3.917358636856079 and parameters: {'lora_r': 23, 'lora_alpha': 17, 'lora_dropout': 0.12895854279760852, 'learning_rate': 9.513292383735116e-06, 'num_train_epochs': 5}. Best is trial 13 with value: 3.917358636856079.


trainable params: 11,714,560 || all params: 6,933,435,264 || trainable%: 0.1690


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Step,Training Loss,Validation Loss
200,7.7998,5.144486
400,4.7838,4.572514
600,4.4158,4.257144
800,4.1662,4.10693
1000,4.0587,4.043487
1200,4.0211,4.005069
1400,3.9708,3.978495
1600,3.9617,3.962602
1800,3.9411,3.948194
2000,3.931,3.940724


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


[I 2024-10-02 03:42:16,169] Trial 14 finished with value: 3.925461769104004 and parameters: {'lora_r': 20, 'lora_alpha': 16, 'lora_dropout': 0.12981387705974679, 'learning_rate': 9.490913868787127e-06, 'num_train_epochs': 5}. Best is trial 13 with value: 3.917358636856079.


trainable params: 11,714,560 || all params: 6,933,435,264 || trainable%: 0.1690


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Step,Training Loss,Validation Loss
200,7.5545,5.027428
400,4.7098,4.495947
600,4.3397,4.196522
800,4.1224,4.07923
1000,4.037,4.027489
1200,4.0081,3.995329
1400,3.9632,3.97424
1600,3.9595,3.962806
1800,3.9441,3.954709
2000,3.9402,3.952501


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


[I 2024-10-02 04:40:06,144] Trial 15 finished with value: 3.947164535522461 and parameters: {'lora_r': 20, 'lora_alpha': 19, 'lora_dropout': 0.1286844082049059, 'learning_rate': 9.905496063754352e-06, 'num_train_epochs': 4}. Best is trial 13 with value: 3.917358636856079.


trainable params: 9,371,648 || all params: 6,931,092,352 || trainable%: 0.1352


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Step,Training Loss,Validation Loss
200,7.9008,5.207325
400,4.8173,4.59627
600,4.4467,4.290591
800,4.1938,4.125876
1000,4.0751,4.05718
1200,4.0335,4.015543
1400,3.9812,3.987456
1600,3.9705,3.970959
1800,3.9492,3.956235
2000,3.9392,3.948687


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


[I 2024-10-02 05:51:05,341] Trial 16 finished with value: 3.93308424949646 and parameters: {'lora_r': 16, 'lora_alpha': 17, 'lora_dropout': 0.1328692833114602, 'learning_rate': 8.944786021325906e-06, 'num_train_epochs': 5}. Best is trial 13 with value: 3.917358636856079.


trainable params: 11,128,832 || all params: 6,932,849,536 || trainable%: 0.1605


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Step,Training Loss,Validation Loss
200,7.7747,5.136573
400,4.7858,4.579954
600,4.4316,4.276978
800,4.186,4.123492
1000,4.0757,4.060972
1200,4.0397,4.023688
1400,3.9921,3.999957
1600,3.9855,3.987055
1800,3.9689,3.979064
2000,3.9661,3.976739


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


[I 2024-10-02 06:49:06,950] Trial 17 finished with value: 3.971296787261963 and parameters: {'lora_r': 19, 'lora_alpha': 17, 'lora_dropout': 0.12194378124290389, 'learning_rate': 9.387198572771965e-06, 'num_train_epochs': 4}. Best is trial 13 with value: 3.917358636856079.


trainable params: 11,714,560 || all params: 6,933,435,264 || trainable%: 0.1690


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Step,Training Loss,Validation Loss
200,8.2611,5.580895
400,4.9791,4.719907
600,4.6186,4.490304
800,4.3924,4.308138
1000,4.2418,4.208814
1200,4.185,4.161903
1400,4.1367,4.143408


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


[I 2024-10-02 07:32:21,006] Trial 18 finished with value: 4.156976699829102 and parameters: {'lora_r': 20, 'lora_alpha': 19, 'lora_dropout': 0.13357294297461705, 'learning_rate': 7.354672030340594e-06, 'num_train_epochs': 3}. Best is trial 13 with value: 3.917358636856079.


trainable params: 14,057,472 || all params: 6,935,778,176 || trainable%: 0.2027


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Step,Training Loss,Validation Loss
200,7.6985,5.088428
400,4.7489,4.537871
600,4.3785,4.225451
800,4.1415,4.090102
1000,4.0441,4.030007
1200,4.0086,3.992758
1400,3.9582,3.967
1600,3.9501,3.951042
1800,3.9296,3.936923
2000,3.9192,3.929631


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


[I 2024-10-02 08:43:25,722] Trial 19 finished with value: 3.914379119873047 and parameters: {'lora_r': 24, 'lora_alpha': 20, 'lora_dropout': 0.12310057100368679, 'learning_rate': 8.893630366717144e-06, 'num_train_epochs': 5}. Best is trial 19 with value: 3.914379119873047.


Best trial:
  Value: 3.914379119873047
  Params: 
    lora_r: 24
    lora_alpha: 20
    lora_dropout: 0.12310057100368679
    learning_rate: 8.893630366717144e-06
    num_train_epochs: 5
