# Environment Setup, Model Loading and Dataset Loading

In [1]:
# One-time environment setup. Every line is intentionally commented out so a
# normal "Run All" does not reinstall packages; uncomment and run once on a
# fresh environment. The unsloth extra on the last line is the cu124-torch250
# build, i.e. it is tied to a specific CUDA / torch combination.
# %pip install -U transformers
# %pip install -U datasets
# %pip install -U pandas
# %pip install -U trl
# %pip install -U accelerate
# %pip install -U xformers
# %pip install torch torchvision torchaudio
# %pip install "unsloth[cu124-torch250] @ git+https://github.com/unslothai/unsloth.git"

In [3]:
# Unsloth is imported ahead of torch, exactly as in the original cell order.
from unsloth import FastLanguageModel
import torch

# Shared loading/training settings reused by the cells below.
# 4-bit quantisation lowers memory usage; it can be switched off with False.
load_in_4bit = True
# Explicit bfloat16 (suited to Ampere or newer GPUs). Use torch.float16 on
# Tesla T4 / V100, or None to let the library auto-detect.
dtype = torch.bfloat16
# Context length used for loading and for packing; any value may be chosen
# because RoPE scaling is handled internally by the library.
max_seq_length = 2048

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!


In [4]:
import os

# SECURITY: read the Hugging Face access token from the environment instead of
# embedding it in the notebook. A real token was previously hard-coded in this
# cell; treat it as leaked and revoke it in the Hugging Face account settings.
# An unset or empty HF_TOKEN becomes None, which should be enough for public
# (non-gated) checkpoints such as the one selected below.
hf_token = os.environ.get("HF_TOKEN") or None

# Load the base model together with its tokenizer, using the sequence length,
# dtype and 4-bit setting defined in the configuration cell.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/Qwen2.5-0.5B",
    # model_name = "unsloth/Llama-3.2-1B", # or choose "unsloth/Llama-3.2-1B-Instruct"
    # model_name = "unsloth/Qwen2.5-1.5B-Instruct",
    # model_name = "unsloth/gemma-2-2b-it",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
    token = hf_token, # only required for gated models like meta-llama/Llama-2-7b-hf
)

==((====))==  Unsloth 2024.12.8: Fast Qwen2 patching. Transformers: 4.46.3.
   \\   /|    GPU: NVIDIA GeForce RTX 3050 Ti Laptop GPU. Max memory: 3.598 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.0+cu124. CUDA: 8.6. CUDA Toolkit: 12.4. Triton: 3.1.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.28.post2. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


In [5]:
# Wrap the loaded base model with LoRA adapters. Besides the attention and MLP
# projections, the embedding matrix and the LM head are also listed as targets.
model = FastLanguageModel.get_peft_model(
    model,
    r=32,  # LoRA rank; any value > 0 works, typical choices: 8, 16, 32, 64, 128
    target_modules=[
        # attention projections
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        # MLP projections
        "gate_proj",
        "up_proj",
        "down_proj",
        # input embeddings and output head
        "embed_tokens",
        "lm_head",
    ],
    lora_alpha=16,
    lora_dropout=0,  # any value is accepted, 0 is the optimised path
    bias="none",  # any value is accepted, "none" is the optimised path
    # "unsloth" checkpointing: about 30% less VRAM and room for 2x larger
    # batches; True or "unsloth" is intended for very long context.
    use_gradient_checkpointing="unsloth",
    random_state=3407,
    use_rslora=True,  # rank-stabilised LoRA
    loftq_config=None,  # LoftQ is available but not used here
)

Unsloth: Offloading input_embeddings to disk to save VRAM


  offloaded_W = torch.load(filename, map_location = "cpu", mmap = True)


Unsloth: Offloading output_embeddings to disk to save VRAM


Unsloth 2024.12.8 patched 24 layers with 24 QKV layers, 24 O layers and 24 MLP layers.


Unsloth: Training embed_tokens in mixed precision to save VRAM
Unsloth: Training lm_head in mixed precision to save VRAM


In [6]:
from datasets import load_from_disk

# Load the pre-built text corpus that was saved to local disk. The path is
# specific to the original author's machine.
dataset_formatted = load_from_disk(
    "/media/muhammad-osama/Local Disk/Pretrain Arabic & English Text/combined_data_2"
)

In [7]:
# Display the dataset summary (feature names and row count) as the cell output.
dataset_formatted

Dataset({
    features: ['text'],
    num_rows: 501070
})

## UNSLOTH Training

In [8]:
from trl import SFTTrainer
from transformers import TrainingArguments, DataCollatorForSeq2Seq
from unsloth import is_bfloat16_supported
from unsloth import UnslothTrainer, UnslothTrainingArguments

In [9]:
# Build the Unsloth trainer for continued pretraining on the "text" column.
# Dataset and packing options are given to the trainer itself; every option
# that is a TrainingArguments field (checkpointing, logging, optimisation) is
# set on `args`, so the configuration does not depend on the trainer
# forwarding unknown keyword arguments.
trainer = UnslothTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset_formatted,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    dataset_num_proc = 1,
    packing = True, # Can make training 5x faster for short sequences.
    dataset_kwargs={
        "add_special_tokens": False,  # We template with special tokens
        "append_concat_token": True,  # Append a separator token after each sample when packing
    },
    args = UnslothTrainingArguments(
        # 10 sequences x 12 accumulation steps = 120 sequences per optimizer
        # step (per device).
        per_device_train_batch_size = 10,
        gradient_accumulation_steps = 12,
        # warmup_steps = 5,
        num_train_epochs = 1, # Set this for 1 full training run.
        # max_steps = 60,
        learning_rate = 2e-5,
        # Separate, 10x smaller learning rate for the embedding / LM-head weights.
        embedding_learning_rate = 2e-6,
        fp16 = not is_bfloat16_supported(),
        bf16 = is_bfloat16_supported(),
        logging_steps = 1,
        optim = "adamw_8bit",
        max_grad_norm = 0.3,
        # weight_decay = 0.01,
        lr_scheduler_type = "cosine",
        seed = 3407,
        # Write a checkpoint every 50 optimizer steps so the long run can be
        # resumed after an interruption.
        save_strategy = "steps",
        save_steps = 50,
        logging_dir = "/home/muhammad-osama/Documents/Qwen2.5-0.5B-Pretraining/tensorboard_logging/v1",
        output_dir = "/home/muhammad-osama/Documents/Qwen2.5-0.5B-Pretraining/run/v1",
        # output_dir = "outputs",
        report_to = "tensorboard", # Use this for WandB etc
    ),
)

In [10]:
# Sanity check: turn the token ids of one processed training example (index
# 20000, an arbitrary pick) back into text and show it as the cell output.
tokenizer.decode(
    trainer.train_dataset[20000]["input_ids"]
)

'ููุฑุง ูุนุฌุฒุงุ ูุดูุฏูู ุนูุฏ ูุง ุชูุฒู ุงูุณูุฑุฉ ุงููุฑุขููุฉ ุนูู ุงูุฑุณูู ุตูู ุงููู ุนููู ูุณูู ููู ุญุงุถุฑูู ูู ูุฌูุณู ูุชููู: ููุฅูุฐุง ูุง ุฃูููุฒูููุชู ุณููุฑูุฉู ุฃู ุขูุงุช ูููุงุ ุนูู ุงูุฑุณูู ุตูู ุงููู ุนููู ูุณูู ููู ููุฌูุฏูู ูู ูุฌูุณู ููุธูุฑู ุจูุนูุถููููู ุฅููู ุจูุนูุถู ูู ุฑูุจุฉ ูููุฑุ ูุชุบุงูุฒูุง ุจุนููููู ูุฌูุงุฑุญูู ูู ูุคู ูุฎุณุฉ ุซู ุชุณุงุกููุง: ูููู ููุฑุงูููู ูููู ุฃูุญูุฏู ุฃู:ูู ูุฑุงูู ูู ุฃุญุฏ ูู ุงููุณูููู ุฅุฐุง ูุง ููุชู ูู ูุฐุง ุงููุฌูุณุ ูุจู ุฃู ูุชูู ุงูุฑุณูู ุตูู ุงููู ุนููู ูุณูู ูุฐู ุงูุณูุฑุฉ ุฃู ุงูุขูุงุช ุงูุชู ูุฏ ุชูุถุญูู ูุชูุดู ุนูุง ุฃุณุฑุฑุชููู ูููุง ุจูููู.ุซูููู ุงููุตูุฑููููุง ูู ูุฌูุณ ุงูุฑุณูู ุตูู ุงููู ุนููู ูุณูู ูุชุณูููู ูู ุญุฐุฑ ุญุชู 

In [11]:
# Record the GPU's total capacity and the peak memory reserved so far, both
# converted from bytes to GiB (2**30 bytes) and rounded to three decimals.
gpu_stats = torch.cuda.get_device_properties(0)
max_memory = round(gpu_stats.total_memory / 2**30, 3)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 2**30, 3)

print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

GPU = NVIDIA GeForce RTX 3050 Ti Laptop GPU. Max memory = 3.598 GB.
0.953 GB of memory reserved.


In [None]:
# Continue the run from the saved step-450 checkpoint rather than starting
# from scratch; the object returned by train() is kept in `trainer_stats`.
trainer_stats = trainer.train(
    resume_from_checkpoint="/home/muhammad-osama/Documents/Qwen2.5-0.5B-Pretraining/run/v1/checkpoint-450",
)

 27%|โโโ       | 817/3057 [22:25:33<145:55:00, 234.51s/it]

{'loss': 2.2977, 'grad_norm': 1.1400295495986938, 'learning_rate': 1.667754786181811e-05, 'epoch': 0.27}


 27%|โโโ       | 818/3057 [22:29:51<150:14:15, 241.56s/it]

{'loss': 2.3331, 'grad_norm': 1.1838876008987427, 'learning_rate': 1.6669894540267444e-05, 'epoch': 0.27}


 27%|โโโ       | 819/3057 [22:34:20<155:10:16, 249.61s/it]

{'loss': 2.2475, 'grad_norm': 1.2422094345092773, 'learning_rate': 1.666223417457973e-05, 'epoch': 0.27}


 27%|โโโ       | 820/3057 [22:39:16<163:47:32, 263.59s/it]

{'loss': 2.2131, 'grad_norm': 1.2490252256393433, 'learning_rate': 1.6654566772845148e-05, 'epoch': 0.27}


 27%|โโโ       | 821/3057 [22:43:01<156:37:17, 252.16s/it]

{'loss': 2.2863, 'grad_norm': 1.522324562072754, 'learning_rate': 1.664689234316131e-05, 'epoch': 0.27}


 27%|โโโ       | 822/3057 [22:46:17<146:05:53, 235.33s/it]

{'loss': 2.3185, 'grad_norm': 1.172848105430603, 'learning_rate': 1.6639210893633257e-05, 'epoch': 0.27}


 27%|โโโ       | 823/3057 [22:49:49<141:36:54, 228.21s/it]

{'loss': 2.2589, 'grad_norm': 1.3632211685180664, 'learning_rate': 1.6631522432373438e-05, 'epoch': 0.27}


 27%|โโโ       | 824/3057 [22:53:41<142:11:31, 229.24s/it]

{'loss': 2.3229, 'grad_norm': 1.3031443357467651, 'learning_rate': 1.6623826967501703e-05, 'epoch': 0.27}


 27%|โโโ       | 825/3057 [22:56:54<135:22:36, 218.35s/it]

{'loss': 2.2415, 'grad_norm': 1.515383243560791, 'learning_rate': 1.661612450714531e-05, 'epoch': 0.27}


 27%|โโโ       | 826/3057 [23:00:08<130:53:40, 211.21s/it]

{'loss': 2.3792, 'grad_norm': 1.3765002489089966, 'learning_rate': 1.660841505943889e-05, 'epoch': 0.27}


 27%|โโโ       | 827/3057 [23:03:45<131:57:44, 213.03s/it]

{'loss': 2.2321, 'grad_norm': 1.352211594581604, 'learning_rate': 1.6600698632524475e-05, 'epoch': 0.27}


 27%|โโโ       | 828/3057 [23:07:37<135:20:48, 218.60s/it]

{'loss': 2.2545, 'grad_norm': 1.5776795148849487, 'learning_rate': 1.659297523455144e-05, 'epoch': 0.27}


 27%|โโโ       | 829/3057 [23:11:33<138:34:48, 223.92s/it]

{'loss': 2.3377, 'grad_norm': 1.2037254571914673, 'learning_rate': 1.6585244873676544e-05, 'epoch': 0.27}


 27%|โโโ       | 830/3057 [23:15:36<141:57:22, 229.48s/it]

{'loss': 2.2678, 'grad_norm': 1.4483375549316406, 'learning_rate': 1.657750755806389e-05, 'epoch': 0.27}


 27%|โโโ       | 831/3057 [23:19:32<143:12:23, 231.60s/it]

{'loss': 2.3241, 'grad_norm': 1.3126881122589111, 'learning_rate': 1.656976329588493e-05, 'epoch': 0.27}


 27%|โโโ       | 832/3057 [23:23:07<140:04:17, 226.63s/it]

{'loss': 2.3139, 'grad_norm': 1.224544882774353, 'learning_rate': 1.6562012095318452e-05, 'epoch': 0.27}


 27%|โโโ       | 833/3057 [23:26:37<136:55:55, 221.65s/it]

{'loss': 2.2517, 'grad_norm': 1.2443780899047852, 'learning_rate': 1.6554253964550566e-05, 'epoch': 0.27}


 27%|โโโ       | 834/3057 [23:30:12<135:32:51, 219.51s/it]

{'loss': 2.2335, 'grad_norm': 1.1614062786102295, 'learning_rate': 1.654648891177471e-05, 'epoch': 0.27}


 27%|โโโ       | 835/3057 [23:33:46<134:32:26, 217.98s/it]

{'loss': 2.2956, 'grad_norm': 1.0119155645370483, 'learning_rate': 1.6538716945191623e-05, 'epoch': 0.27}


 27%|โโโ       | 836/3057 [23:37:26<134:42:17, 218.34s/it]

{'loss': 2.3522, 'grad_norm': 1.1753283739089966, 'learning_rate': 1.6530938073009353e-05, 'epoch': 0.27}


 27%|โโโ       | 837/3057 [23:41:33<139:57:28, 226.96s/it]

{'loss': 2.2775, 'grad_norm': 1.0982470512390137, 'learning_rate': 1.6523152303443238e-05, 'epoch': 0.27}


 27%|โโโ       | 838/3057 [23:45:22<140:24:04, 227.78s/it]

{'loss': 2.26, 'grad_norm': 1.109259009361267, 'learning_rate': 1.6515359644715905e-05, 'epoch': 0.27}


 27%|โโโ       | 839/3057 [23:49:03<139:01:53, 225.66s/it]

{'loss': 2.246, 'grad_norm': 1.1955512762069702, 'learning_rate': 1.6507560105057247e-05, 'epoch': 0.27}


 27%|โโโ       | 840/3057 [23:53:07<142:18:58, 231.10s/it]

{'loss': 2.219, 'grad_norm': 1.4539880752563477, 'learning_rate': 1.649975369270443e-05, 'epoch': 0.27}


 28%|โโโ       | 841/3057 [23:56:54<141:35:59, 230.04s/it]

{'loss': 2.2644, 'grad_norm': 1.3443676233291626, 'learning_rate': 1.6491940415901884e-05, 'epoch': 0.28}


 28%|โโโ       | 842/3057 [24:02:55<165:36:15, 269.15s/it]

{'loss': 2.2739, 'grad_norm': 1.3128657341003418, 'learning_rate': 1.6484120282901278e-05, 'epoch': 0.28}


 28%|โโโ       | 843/3057 [24:09:44<191:21:16, 311.15s/it]

{'loss': 2.253, 'grad_norm': 1.2765346765518188, 'learning_rate': 1.647629330196152e-05, 'epoch': 0.28}


 28%|โโโ       | 844/3057 [24:16:26<208:00:52, 338.39s/it]

{'loss': 2.2046, 'grad_norm': 1.1848506927490234, 'learning_rate': 1.6468459481348767e-05, 'epoch': 0.28}


 28%|โโโ       | 845/3057 [24:22:35<213:30:51, 347.49s/it]

{'loss': 2.2797, 'grad_norm': 1.2564936876296997, 'learning_rate': 1.6460618829336388e-05, 'epoch': 0.28}


 28%|โโโ       | 846/3057 [24:29:25<225:04:35, 366.47s/it]

{'loss': 2.2872, 'grad_norm': 1.165953278541565, 'learning_rate': 1.6452771354204964e-05, 'epoch': 0.28}


 28%|โโโ       | 847/3057 [24:34:30<213:38:31, 348.01s/it]

{'loss': 2.4303, 'grad_norm': 1.2789160013198853, 'learning_rate': 1.6444917064242284e-05, 'epoch': 0.28}


 28%|โโโ       | 848/3057 [24:40:07<211:33:05, 344.76s/it]

{'loss': 2.246, 'grad_norm': 1.2599693536758423, 'learning_rate': 1.6437055967743344e-05, 'epoch': 0.28}


 28%|โโโ       | 849/3057 [24:46:55<222:55:23, 363.46s/it]

{'loss': 2.3007, 'grad_norm': 1.037638545036316, 'learning_rate': 1.6429188073010312e-05, 'epoch': 0.28}


 28%|โโโ       | 850/3057 [24:52:30<217:38:23, 355.01s/it]

{'loss': 2.308, 'grad_norm': 1.0832046270370483, 'learning_rate': 1.6421313388352552e-05, 'epoch': 0.28}


 28%|โโโ       | 851/3057 [24:59:14<226:31:27, 369.67s/it]

{'loss': 2.2566, 'grad_norm': 1.3034331798553467, 'learning_rate': 1.641343192208658e-05, 'epoch': 0.28}


 28%|โโโ       | 852/3057 [25:07:01<244:18:23, 398.87s/it]

{'loss': 2.3293, 'grad_norm': 1.029085397720337, 'learning_rate': 1.6405543682536095e-05, 'epoch': 0.28}


 28%|โโโ       | 853/3057 [25:13:47<245:39:07, 401.25s/it]

{'loss': 2.2512, 'grad_norm': 1.2874318361282349, 'learning_rate': 1.6397648678031936e-05, 'epoch': 0.28}


 28%|โโโ       | 854/3057 [25:18:32<224:12:01, 366.37s/it]

{'loss': 2.2303, 'grad_norm': 1.1576147079467773, 'learning_rate': 1.638974691691209e-05, 'epoch': 0.28}


 28%|โโโ       | 855/3057 [25:22:27<199:58:45, 326.94s/it]

{'loss': 2.3752, 'grad_norm': 1.3753023147583008, 'learning_rate': 1.6381838407521683e-05, 'epoch': 0.28}


 28%|โโโ       | 856/3057 [25:26:12<181:03:37, 296.15s/it]

{'loss': 2.3551, 'grad_norm': 1.1361920833587646, 'learning_rate': 1.6373923158212956e-05, 'epoch': 0.28}


 28%|โโโ       | 857/3057 [25:30:17<171:34:00, 280.75s/it]

{'loss': 2.3013, 'grad_norm': 1.2007356882095337, 'learning_rate': 1.6366001177345282e-05, 'epoch': 0.28}


 28%|โโโ       | 858/3057 [25:34:09<162:43:09, 266.39s/it]

{'loss': 2.3126, 'grad_norm': 1.2205294370651245, 'learning_rate': 1.635807247328514e-05, 'epoch': 0.28}


 28%|โโโ       | 859/3057 [25:38:05<156:59:40, 257.13s/it]

{'loss': 2.3847, 'grad_norm': 1.1147361993789673, 'learning_rate': 1.6350137054406104e-05, 'epoch': 0.28}


 28%|โโโ       | 860/3057 [25:41:46<150:17:23, 246.26s/it]

{'loss': 2.2741, 'grad_norm': 1.1633360385894775, 'learning_rate': 1.6342194929088845e-05, 'epoch': 0.28}


 28%|โโโ       | 861/3057 [25:45:26<145:31:39, 238.57s/it]

{'loss': 2.3206, 'grad_norm': 1.196089744567871, 'learning_rate': 1.6334246105721116e-05, 'epoch': 0.28}


 28%|โโโ       | 862/3057 [25:49:04<141:39:18, 232.33s/it]

{'loss': 2.3027, 'grad_norm': 1.2972925901412964, 'learning_rate': 1.6326290592697738e-05, 'epoch': 0.28}


 28%|โโโ       | 863/3057 [25:53:00<142:12:18, 233.34s/it]

{'loss': 2.2612, 'grad_norm': 1.3234935998916626, 'learning_rate': 1.6318328398420608e-05, 'epoch': 0.28}


 28%|โโโ       | 864/3057 [25:57:10<145:14:22, 238.42s/it]

{'loss': 2.2721, 'grad_norm': 1.2990202903747559, 'learning_rate': 1.631035953129866e-05, 'epoch': 0.28}


 28%|โโโ       | 865/3057 [26:01:19<147:01:17, 241.46s/it]

{'loss': 2.3546, 'grad_norm': 1.2978490591049194, 'learning_rate': 1.6302383999747907e-05, 'epoch': 0.28}


 28%|โโโ       | 866/3057 [26:04:58<142:54:53, 234.82s/it]

{'loss': 2.2984, 'grad_norm': 1.3411705493927002, 'learning_rate': 1.6294401812191366e-05, 'epoch': 0.28}


 28%|โโโ       | 867/3057 [26:08:24<137:33:46, 226.13s/it]

{'loss': 2.303, 'grad_norm': 1.1666674613952637, 'learning_rate': 1.62864129770591e-05, 'epoch': 0.28}


 28%|โโโ       | 868/3057 [26:12:11<137:38:12, 226.36s/it]

{'loss': 2.2772, 'grad_norm': 1.3840633630752563, 'learning_rate': 1.6278417502788203e-05, 'epoch': 0.28}


 28%|โโโ       | 869/3057 [26:15:43<134:58:51, 222.09s/it]

{'loss': 2.2962, 'grad_norm': 1.609632134437561, 'learning_rate': 1.6270415397822756e-05, 'epoch': 0.28}


 28%|โโโ       | 870/3057 [26:19:00<130:25:07, 214.68s/it]

{'loss': 2.1985, 'grad_norm': 1.3693615198135376, 'learning_rate': 1.6262406670613865e-05, 'epoch': 0.28}


 28%|โโโ       | 871/3057 [26:22:23<128:05:56, 210.96s/it]

{'loss': 2.2972, 'grad_norm': 1.5666720867156982, 'learning_rate': 1.6254391329619616e-05, 'epoch': 0.28}


 29%|โโโ       | 872/3057 [26:25:47<126:53:01, 209.05s/it]

{'loss': 2.3136, 'grad_norm': 1.2757450342178345, 'learning_rate': 1.624636938330508e-05, 'epoch': 0.29}


 29%|โโโ       | 873/3057 [26:29:30<129:23:44, 213.29s/it]

{'loss': 2.3371, 'grad_norm': 1.1570703983306885, 'learning_rate': 1.6238340840142317e-05, 'epoch': 0.29}


 29%|โโโ       | 874/3057 [26:32:48<126:27:56, 208.56s/it]

{'loss': 2.2909, 'grad_norm': 1.1867469549179077, 'learning_rate': 1.623030570861034e-05, 'epoch': 0.29}


 29%|โโโ       | 875/3057 [26:36:52<132:51:18, 219.19s/it]

{'loss': 2.2244, 'grad_norm': 1.3670240640640259, 'learning_rate': 1.6222263997195132e-05, 'epoch': 0.29}


 29%|โโโ       | 876/3057 [26:40:53<136:51:12, 225.89s/it]

{'loss': 2.3559, 'grad_norm': 1.1419306993484497, 'learning_rate': 1.621421571438961e-05, 'epoch': 0.29}


 29%|โโโ       | 877/3057 [26:45:39<147:35:42, 243.74s/it]

{'loss': 2.2908, 'grad_norm': 1.2679295539855957, 'learning_rate': 1.6206160868693644e-05, 'epoch': 0.29}


 29%|โโโ       | 878/3057 [26:49:00<139:45:12, 230.89s/it]

{'loss': 2.3339, 'grad_norm': 1.040433645248413, 'learning_rate': 1.619809946861403e-05, 'epoch': 0.29}


 29%|โโโ       | 879/3057 [26:53:51<150:39:10, 249.01s/it]

{'loss': 2.2861, 'grad_norm': 1.241455078125, 'learning_rate': 1.6190031522664483e-05, 'epoch': 0.29}


 29%|โโโ       | 880/3057 [26:58:33<156:31:48, 258.85s/it]

{'loss': 2.3191, 'grad_norm': 1.0854967832565308, 'learning_rate': 1.618195703936564e-05, 'epoch': 0.29}


 29%|โโโ       | 881/3057 [27:02:58<157:37:46, 260.78s/it]

{'loss': 2.3169, 'grad_norm': 1.3550642728805542, 'learning_rate': 1.617387602724504e-05, 'epoch': 0.29}


 29%|โโโ       | 882/3057 [27:07:43<161:55:15, 268.01s/it]

{'loss': 2.3515, 'grad_norm': 1.2364572286605835, 'learning_rate': 1.6165788494837106e-05, 'epoch': 0.29}


 29%|โโโ       | 883/3057 [27:11:36<155:28:34, 257.46s/it]

{'loss': 2.2115, 'grad_norm': 1.175830364227295, 'learning_rate': 1.6157694450683165e-05, 'epoch': 0.29}


 29%|โโโ       | 884/3057 [27:15:54<155:32:39, 257.69s/it]

{'loss': 2.2109, 'grad_norm': 1.426518201828003, 'learning_rate': 1.61495939033314e-05, 'epoch': 0.29}


 29%|โโโ       | 885/3057 [27:20:02<153:47:17, 254.90s/it]

{'loss': 2.2204, 'grad_norm': 1.48033607006073, 'learning_rate': 1.6141486861336887e-05, 'epoch': 0.29}


 29%|โโโ       | 886/3057 [27:23:54<149:31:18, 247.94s/it]

{'loss': 2.2804, 'grad_norm': 1.0434428453445435, 'learning_rate': 1.6133373333261535e-05, 'epoch': 0.29}


 29%|โโโ       | 887/3057 [27:28:54<158:53:58, 263.61s/it]

{'loss': 2.2886, 'grad_norm': 1.1698846817016602, 'learning_rate': 1.6125253327674124e-05, 'epoch': 0.29}


 29%|โโโ       | 888/3057 [27:34:34<172:36:21, 286.48s/it]

{'loss': 2.2381, 'grad_norm': 1.1811519861221313, 'learning_rate': 1.6117126853150263e-05, 'epoch': 0.29}


 29%|โโโ       | 889/3057 [27:39:43<176:28:58, 293.05s/it]

{'loss': 2.316, 'grad_norm': 1.1515486240386963, 'learning_rate': 1.6108993918272398e-05, 'epoch': 0.29}


 29%|โโโ       | 890/3057 [27:42:51<157:31:58, 261.71s/it]

{'loss': 2.3121, 'grad_norm': 1.2261444330215454, 'learning_rate': 1.61008545316298e-05, 'epoch': 0.29}


 29%|โโโ       | 891/3057 [27:45:51<142:40:01, 237.12s/it]

{'loss': 2.2921, 'grad_norm': 1.1233245134353638, 'learning_rate': 1.6092708701818538e-05, 'epoch': 0.29}


 29%|โโโ       | 892/3057 [27:48:51<132:16:48, 219.96s/it]

{'loss': 2.2885, 'grad_norm': 1.1593787670135498, 'learning_rate': 1.608455643744151e-05, 'epoch': 0.29}


 29%|โโโ       | 893/3057 [27:51:47<124:24:09, 206.95s/it]

{'loss': 2.2621, 'grad_norm': 1.240460991859436, 'learning_rate': 1.6076397747108395e-05, 'epoch': 0.29}


 29%|โโโ       | 894/3057 [27:54:43<118:44:24, 197.63s/it]

{'loss': 2.314, 'grad_norm': 1.399755835533142, 'learning_rate': 1.6068232639435662e-05, 'epoch': 0.29}


 29%|โโโ       | 895/3057 [27:57:39<114:39:34, 190.92s/it]

{'loss': 2.2982, 'grad_norm': 1.1963272094726562, 'learning_rate': 1.6060061123046547e-05, 'epoch': 0.29}


 29%|โโโ       | 896/3057 [28:00:30<111:09:58, 185.19s/it]

{'loss': 2.3158, 'grad_norm': 1.4450081586837769, 'learning_rate': 1.6051883206571076e-05, 'epoch': 0.29}


 29%|โโโ       | 897/3057 [28:03:11<106:44:02, 177.89s/it]

{'loss': 2.3288, 'grad_norm': 1.1632658243179321, 'learning_rate': 1.6043698898646017e-05, 'epoch': 0.29}


 29%|โโโ       | 898/3057 [28:05:54<103:56:49, 173.33s/it]

{'loss': 2.3245, 'grad_norm': 1.516890287399292, 'learning_rate': 1.603550820791489e-05, 'epoch': 0.29}


 29%|โโโ       | 899/3057 [28:08:43<103:12:40, 172.18s/it]

{'loss': 2.3795, 'grad_norm': 1.2939660549163818, 'learning_rate': 1.6027311143027962e-05, 'epoch': 0.29}


 29%|โโโ       | 900/3057 [28:11:28<101:51:50, 170.01s/it]

{'loss': 2.2214, 'grad_norm': 1.1672853231430054, 'learning_rate': 1.601910771264223e-05, 'epoch': 0.29}


 29%|โโโ       | 901/3057 [28:14:20<102:09:07, 170.57s/it]

{'loss': 2.3082, 'grad_norm': 1.2542991638183594, 'learning_rate': 1.601089792542141e-05, 'epoch': 0.29}


 30%|โโโ       | 902/3057 [28:17:01<100:24:35, 167.74s/it]

{'loss': 2.2575, 'grad_norm': 1.1367487907409668, 'learning_rate': 1.6002681790035934e-05, 'epoch': 0.3}


 30%|โโโ       | 903/3057 [28:19:46<99:44:10, 166.69s/it] 

{'loss': 2.3027, 'grad_norm': 1.2956911325454712, 'learning_rate': 1.5994459315162944e-05, 'epoch': 0.3}


 30%|โโโ       | 904/3057 [28:22:26<98:35:00, 164.84s/it]

{'loss': 2.2727, 'grad_norm': 1.3005502223968506, 'learning_rate': 1.5986230509486257e-05, 'epoch': 0.3}


 30%|โโโ       | 905/3057 [28:25:15<99:15:23, 166.04s/it]

{'loss': 2.2167, 'grad_norm': 1.3733620643615723, 'learning_rate': 1.5977995381696402e-05, 'epoch': 0.3}


 30%|โโโ       | 906/3057 [28:28:03<99:36:27, 166.71s/it]

{'loss': 2.2489, 'grad_norm': 1.0421972274780273, 'learning_rate': 1.5969753940490577e-05, 'epoch': 0.3}


 30%|โโโ       | 907/3057 [28:30:34<96:43:12, 161.95s/it]

{'loss': 2.2122, 'grad_norm': 1.576580286026001, 'learning_rate': 1.5961506194572632e-05, 'epoch': 0.3}


 30%|โโโ       | 908/3057 [28:33:03<94:17:23, 157.95s/it]

{'loss': 2.3167, 'grad_norm': 1.2011051177978516, 'learning_rate': 1.5953252152653095e-05, 'epoch': 0.3}


 30%|โโโ       | 909/3057 [28:35:40<94:03:18, 157.63s/it]

{'loss': 2.2805, 'grad_norm': 1.3477839231491089, 'learning_rate': 1.5944991823449134e-05, 'epoch': 0.3}


 30%|โโโ       | 910/3057 [28:38:12<93:02:22, 156.00s/it]

{'loss': 2.3136, 'grad_norm': 1.2480417490005493, 'learning_rate': 1.5936725215684564e-05, 'epoch': 0.3}


 30%|โโโ       | 911/3057 [28:40:45<92:27:27, 155.10s/it]

{'loss': 2.2949, 'grad_norm': 1.1880544424057007, 'learning_rate': 1.5928452338089813e-05, 'epoch': 0.3}


 30%|โโโ       | 912/3057 [28:43:18<92:00:33, 154.42s/it]

{'loss': 2.314, 'grad_norm': 1.3824326992034912, 'learning_rate': 1.592017319940195e-05, 'epoch': 0.3}


 30%|โโโ       | 913/3057 [28:45:56<92:40:05, 155.60s/it]

{'loss': 2.3154, 'grad_norm': 1.2732218503952026, 'learning_rate': 1.5911887808364657e-05, 'epoch': 0.3}


 30%|โโโ       | 914/3057 [28:48:36<93:25:03, 156.93s/it]

{'loss': 2.1761, 'grad_norm': 1.1737369298934937, 'learning_rate': 1.5903596173728203e-05, 'epoch': 0.3}


 30%|โโโ       | 915/3057 [28:51:11<93:02:41, 156.38s/it]

{'loss': 2.2739, 'grad_norm': 1.3883146047592163, 'learning_rate': 1.589529830424946e-05, 'epoch': 0.3}


 30%|โโโ       | 916/3057 [28:54:04<95:54:09, 161.26s/it]

{'loss': 2.3289, 'grad_norm': 1.1673725843429565, 'learning_rate': 1.5886994208691885e-05, 'epoch': 0.3}


 30%|โโโ       | 917/3057 [28:56:53<97:11:44, 163.51s/it]

{'loss': 2.3594, 'grad_norm': 1.292984962463379, 'learning_rate': 1.5878683895825512e-05, 'epoch': 0.3}


 30%|โโโ       | 918/3057 [28:59:19<94:09:02, 158.46s/it]

{'loss': 2.2784, 'grad_norm': 1.1710822582244873, 'learning_rate': 1.5870367374426936e-05, 'epoch': 0.3}


 30%|โโโ       | 919/3057 [29:01:48<92:18:02, 155.42s/it]

{'loss': 2.3154, 'grad_norm': 1.3031991720199585, 'learning_rate': 1.586204465327932e-05, 'epoch': 0.3}


 30%|โโโ       | 920/3057 [29:04:19<91:30:22, 154.15s/it]

{'loss': 2.2753, 'grad_norm': 1.104161024093628, 'learning_rate': 1.5853715741172353e-05, 'epoch': 0.3}


 30%|โโโ       | 921/3057 [29:06:49<90:43:43, 152.91s/it]

{'loss': 2.3145, 'grad_norm': 1.1684390306472778, 'learning_rate': 1.5845380646902285e-05, 'epoch': 0.3}


 30%|โโโ       | 922/3057 [29:09:28<91:46:42, 154.76s/it]

{'loss': 2.2087, 'grad_norm': 1.2754955291748047, 'learning_rate': 1.5837039379271888e-05, 'epoch': 0.3}


 30%|โโโ       | 923/3057 [29:12:11<93:16:14, 157.34s/it]

{'loss': 2.3087, 'grad_norm': 1.146803855895996, 'learning_rate': 1.582869194709044e-05, 'epoch': 0.3}


 30%|โโโ       | 924/3057 [29:15:01<95:23:13, 160.99s/it]

{'loss': 2.3625, 'grad_norm': 1.1291133165359497, 'learning_rate': 1.5820338359173753e-05, 'epoch': 0.3}


 30%|โโโ       | 925/3057 [29:17:31<93:30:26, 157.89s/it]

{'loss': 2.2123, 'grad_norm': 1.4455448389053345, 'learning_rate': 1.581197862434412e-05, 'epoch': 0.3}


 30%|โโโ       | 926/3057 [29:19:56<91:07:18, 153.94s/it]

{'loss': 2.1778, 'grad_norm': 1.140886664390564, 'learning_rate': 1.5803612751430343e-05, 'epoch': 0.3}


 30%|โโโ       | 927/3057 [29:22:21<89:27:13, 151.19s/it]

{'loss': 2.3744, 'grad_norm': 1.10056471824646, 'learning_rate': 1.5795240749267686e-05, 'epoch': 0.3}


 30%|โโโ       | 928/3057 [29:24:51<89:14:41, 150.91s/it]

{'loss': 2.2568, 'grad_norm': 1.2996509075164795, 'learning_rate': 1.5786862626697908e-05, 'epoch': 0.3}


 30%|โโโ       | 929/3057 [29:27:21<89:02:27, 150.63s/it]

{'loss': 2.2866, 'grad_norm': 1.0029478073120117, 'learning_rate': 1.5778478392569215e-05, 'epoch': 0.3}


 30%|โโโ       | 930/3057 [29:30:04<91:15:24, 154.45s/it]

{'loss': 2.3093, 'grad_norm': 1.3860013484954834, 'learning_rate': 1.5770088055736273e-05, 'epoch': 0.3}


 30%|โโโ       | 931/3057 [29:32:42<91:47:15, 155.43s/it]

{'loss': 2.3216, 'grad_norm': 1.4194334745407104, 'learning_rate': 1.5761691625060202e-05, 'epoch': 0.3}


 30%|โโโ       | 932/3057 [29:35:19<92:00:54, 155.88s/it]

{'loss': 2.2528, 'grad_norm': 1.31056547164917, 'learning_rate': 1.575328910940854e-05, 'epoch': 0.3}


 31%|โโโ       | 933/3057 [29:38:07<94:07:11, 159.53s/it]

{'loss': 2.327, 'grad_norm': 1.1280721426010132, 'learning_rate': 1.5744880517655266e-05, 'epoch': 0.31}


 31%|โโโ       | 934/3057 [29:41:04<97:12:35, 164.84s/it]

{'loss': 2.3079, 'grad_norm': 1.1581534147262573, 'learning_rate': 1.573646585868077e-05, 'epoch': 0.31}


 31%|โโโ       | 935/3057 [29:43:48<96:54:32, 164.41s/it]

{'loss': 2.2279, 'grad_norm': 1.1817958354949951, 'learning_rate': 1.5728045141371854e-05, 'epoch': 0.31}


 31%|โโโ       | 936/3057 [29:46:31<96:34:25, 163.92s/it]

{'loss': 2.2796, 'grad_norm': 1.068175196647644, 'learning_rate': 1.5719618374621714e-05, 'epoch': 0.31}


 31%|โโโ       | 937/3057 [29:48:59<93:52:55, 159.42s/it]

{'loss': 2.2793, 'grad_norm': 1.069070816040039, 'learning_rate': 1.571118556732993e-05, 'epoch': 0.31}


 31%|โโโ       | 938/3057 [29:51:26<91:33:08, 155.54s/it]

{'loss': 2.3222, 'grad_norm': 1.1732866764068604, 'learning_rate': 1.5702746728402477e-05, 'epoch': 0.31}


 31%|โโโ       | 939/3057 [29:53:56<90:29:58, 153.82s/it]

{'loss': 2.2079, 'grad_norm': 1.2850394248962402, 'learning_rate': 1.5694301866751684e-05, 'epoch': 0.31}


 31%|โโโ       | 940/3057 [29:56:22<89:07:05, 151.55s/it]

{'loss': 2.2184, 'grad_norm': 1.2027407884597778, 'learning_rate': 1.5685850991296246e-05, 'epoch': 0.31}


 31%|โโโ       | 941/3057 [29:59:00<90:17:25, 153.61s/it]

{'loss': 2.2833, 'grad_norm': 1.4076652526855469, 'learning_rate': 1.567739411096121e-05, 'epoch': 0.31}


 31%|โโโ       | 942/3057 [30:01:44<92:02:43, 156.67s/it]

{'loss': 2.2976, 'grad_norm': 1.278489351272583, 'learning_rate': 1.566893123467798e-05, 'epoch': 0.31}


 31%|โโโ       | 943/3057 [30:04:24<92:28:54, 157.49s/it]

{'loss': 2.2743, 'grad_norm': 1.210137128829956, 'learning_rate': 1.566046237138426e-05, 'epoch': 0.31}


 31%|โโโ       | 944/3057 [30:07:05<93:06:55, 158.64s/it]

{'loss': 2.2472, 'grad_norm': 1.233697772026062, 'learning_rate': 1.5651987530024097e-05, 'epoch': 0.31}


 31%|โโโ       | 945/3057 [30:09:42<92:50:31, 158.25s/it]

{'loss': 2.2288, 'grad_norm': 1.3732101917266846, 'learning_rate': 1.5643506719547855e-05, 'epoch': 0.31}


 31%|โโโ       | 946/3057 [30:12:21<92:53:10, 158.40s/it]

{'loss': 2.2354, 'grad_norm': 1.5204020738601685, 'learning_rate': 1.5635019948912194e-05, 'epoch': 0.31}


 31%|โโโ       | 947/3057 [30:15:02<93:14:15, 159.08s/it]

{'loss': 2.2841, 'grad_norm': 1.4973982572555542, 'learning_rate': 1.5626527227080067e-05, 'epoch': 0.31}


 31%|โโโ       | 948/3057 [30:17:34<91:56:21, 156.94s/it]

{'loss': 2.2423, 'grad_norm': 1.7843220233917236, 'learning_rate': 1.5618028563020725e-05, 'epoch': 0.31}


 31%|โโโ       | 949/3057 [30:20:11<91:58:09, 157.06s/it]

{'loss': 2.3278, 'grad_norm': 1.207082748413086, 'learning_rate': 1.5609523965709676e-05, 'epoch': 0.31}


 31%|โโโ       | 950/3057 [30:22:43<90:58:41, 155.44s/it]

{'loss': 2.2894, 'grad_norm': 1.4123576879501343, 'learning_rate': 1.5601013444128706e-05, 'epoch': 0.31}


 31%|โโโ       | 951/3057 [30:25:23<91:42:16, 156.76s/it]

{'loss': 2.2386, 'grad_norm': 1.2938553094863892, 'learning_rate': 1.5592497007265857e-05, 'epoch': 0.31}


 31%|โโโ       | 952/3057 [30:28:13<94:02:02, 160.82s/it]

{'loss': 2.2648, 'grad_norm': 1.3906092643737793, 'learning_rate': 1.5583974664115417e-05, 'epoch': 0.31}


 31%|โโโ       | 953/3057 [30:30:43<92:02:54, 157.50s/it]

{'loss': 2.2725, 'grad_norm': 1.4832972288131714, 'learning_rate': 1.5575446423677907e-05, 'epoch': 0.31}


 31%|โโโ       | 954/3057 [30:33:20<92:00:08, 157.49s/it]

{'loss': 2.3354, 'grad_norm': 1.3499159812927246, 'learning_rate': 1.5566912294960082e-05, 'epoch': 0.31}


 31%|โโโ       | 955/3057 [30:36:00<92:21:54, 158.19s/it]

{'loss': 2.2788, 'grad_norm': 1.0762877464294434, 'learning_rate': 1.5558372286974915e-05, 'epoch': 0.31}


 31%|โโโโ      | 956/3057 [30:38:41<92:54:01, 159.18s/it]

{'loss': 2.2974, 'grad_norm': 1.4105377197265625, 'learning_rate': 1.554982640874159e-05, 'epoch': 0.31}


 31%|โโโโ      | 957/3057 [30:41:06<90:21:08, 154.89s/it]

{'loss': 2.2469, 'grad_norm': 1.492699146270752, 'learning_rate': 1.554127466928548e-05, 'epoch': 0.31}


 31%|โโโโ      | 958/3057 [30:43:32<88:42:57, 152.16s/it]

{'loss': 2.3251, 'grad_norm': 1.2380436658859253, 'learning_rate': 1.5532717077638167e-05, 'epoch': 0.31}


 31%|โโโโ      | 959/3057 [30:45:58<87:34:44, 150.28s/it]

{'loss': 2.2564, 'grad_norm': 1.3360600471496582, 'learning_rate': 1.552415364283739e-05, 'epoch': 0.31}


 31%|โโโโ      | 960/3057 [30:48:32<88:10:04, 151.36s/it]

{'loss': 2.2991, 'grad_norm': 1.1517915725708008, 'learning_rate': 1.551558437392708e-05, 'epoch': 0.31}


 31%|โโโโ      | 961/3057 [30:51:00<87:36:23, 150.47s/it]

{'loss': 2.2562, 'grad_norm': 1.5080410242080688, 'learning_rate': 1.5507009279957315e-05, 'epoch': 0.31}


 31%|โโโโ      | 962/3057 [30:53:36<88:33:07, 152.17s/it]

{'loss': 2.2411, 'grad_norm': 1.2286193370819092, 'learning_rate': 1.5498428369984334e-05, 'epoch': 0.31}


 32%|โโโโ      | 963/3057 [30:56:22<90:56:07, 156.34s/it]

{'loss': 2.3355, 'grad_norm': 1.1336722373962402, 'learning_rate': 1.5489841653070518e-05, 'epoch': 0.31}


 32%|โโโโ      | 964/3057 [30:59:12<93:12:55, 160.33s/it]

{'loss': 2.2654, 'grad_norm': 1.348728060722351, 'learning_rate': 1.548124913828437e-05, 'epoch': 0.32}


 32%|โโโโ      | 965/3057 [31:02:03<95:03:16, 163.57s/it]

{'loss': 2.2888, 'grad_norm': 1.2302781343460083, 'learning_rate': 1.5472650834700532e-05, 'epoch': 0.32}


 32%|โโโโ      | 966/3057 [31:04:51<95:41:02, 164.74s/it]

{'loss': 2.2624, 'grad_norm': 1.196995735168457, 'learning_rate': 1.5464046751399745e-05, 'epoch': 0.32}


 32%|โโโโ      | 967/3057 [31:07:22<93:21:38, 160.81s/it]

{'loss': 2.3118, 'grad_norm': 1.0793112516403198, 'learning_rate': 1.5455436897468866e-05, 'epoch': 0.32}


 32%|โโโโ      | 968/3057 [31:09:55<91:56:45, 158.45s/it]

{'loss': 2.1975, 'grad_norm': 1.304789662361145, 'learning_rate': 1.5446821282000836e-05, 'epoch': 0.32}


 32%|โโโโ      | 969/3057 [31:12:30<91:17:38, 157.40s/it]

{'loss': 2.2603, 'grad_norm': 1.3168481588363647, 'learning_rate': 1.5438199914094688e-05, 'epoch': 0.32}


 32%|โโโโ      | 970/3057 [31:14:56<89:19:02, 154.07s/it]

{'loss': 2.2532, 'grad_norm': 1.19931161403656, 'learning_rate': 1.542957280285553e-05, 'epoch': 0.32}


 32%|โโโโ      | 971/3057 [31:17:25<88:16:26, 152.34s/it]

{'loss': 2.2413, 'grad_norm': 1.1044683456420898, 'learning_rate': 1.5420939957394537e-05, 'epoch': 0.32}


 32%|โโโโ      | 972/3057 [31:19:54<87:40:43, 151.39s/it]

{'loss': 2.175, 'grad_norm': 1.5190699100494385, 'learning_rate': 1.5412301386828926e-05, 'epoch': 0.32}


 32%|โโโโ      | 973/3057 [31:22:45<91:01:49, 157.25s/it]

{'loss': 2.3006, 'grad_norm': 1.2162531614303589, 'learning_rate': 1.5403657100281978e-05, 'epoch': 0.32}


 32%|โโโโ      | 974/3057 [31:25:39<93:51:39, 162.22s/it]

{'loss': 2.3133, 'grad_norm': 1.4975422620773315, 'learning_rate': 1.5395007106883e-05, 'epoch': 0.32}


 32%|โโโโ      | 975/3057 [31:28:18<93:17:32, 161.31s/it]

{'loss': 2.275, 'grad_norm': 1.2142139673233032, 'learning_rate': 1.5386351415767333e-05, 'epoch': 0.32}


 32%|โโโโ      | 976/3057 [31:30:49<91:29:15, 158.27s/it]

{'loss': 2.3389, 'grad_norm': 1.289975881576538, 'learning_rate': 1.5377690036076332e-05, 'epoch': 0.32}


 32%|โโโโ      | 977/3057 [31:33:18<89:46:34, 155.38s/it]

{'loss': 2.2878, 'grad_norm': 1.499453067779541, 'learning_rate': 1.536902297695736e-05, 'epoch': 0.32}


 32%|โโโโ      | 978/3057 [31:35:45<88:22:23, 153.03s/it]

{'loss': 2.2608, 'grad_norm': 1.3157625198364258, 'learning_rate': 1.5360350247563776e-05, 'epoch': 0.32}


 32%|โโโโ      | 979/3057 [31:38:14<87:32:20, 151.66s/it]

{'loss': 2.2801, 'grad_norm': 1.6307183504104614, 'learning_rate': 1.535167185705493e-05, 'epoch': 0.32}


 32%|โโโโ      | 980/3057 [31:40:52<88:35:03, 153.54s/it]

{'loss': 2.3419, 'grad_norm': 1.1631110906600952, 'learning_rate': 1.5342987814596152e-05, 'epoch': 0.32}


 32%|โโโโ      | 981/3057 [31:43:18<87:15:00, 151.30s/it]

{'loss': 2.2886, 'grad_norm': 1.1886208057403564, 'learning_rate': 1.5334298129358734e-05, 'epoch': 0.32}


 32%|โโโโ      | 982/3057 [31:45:44<86:24:37, 149.92s/it]

{'loss': 2.2997, 'grad_norm': 1.2589415311813354, 'learning_rate': 1.532560281051994e-05, 'epoch': 0.32}


 32%|โโโโ      | 983/3057 [31:48:12<86:01:37, 149.32s/it]

{'loss': 2.2607, 'grad_norm': 1.0766695737838745, 'learning_rate': 1.531690186726297e-05, 'epoch': 0.32}


 32%|โโโโ      | 984/3057 [31:50:39<85:36:36, 148.67s/it]

{'loss': 2.3599, 'grad_norm': 1.6178128719329834, 'learning_rate': 1.530819530877697e-05, 'epoch': 0.32}


 32%|โโโโ      | 985/3057 [31:53:25<88:29:35, 153.75s/it]

{'loss': 2.295, 'grad_norm': 1.2626657485961914, 'learning_rate': 1.5299483144257024e-05, 'epoch': 0.32}


 32%|โโโโ      | 986/3057 [31:56:08<89:59:22, 156.43s/it]

{'loss': 2.2339, 'grad_norm': 1.5568251609802246, 'learning_rate': 1.5290765382904127e-05, 'epoch': 0.32}


 32%|โโโโ      | 987/3057 [31:58:50<91:01:25, 158.30s/it]

{'loss': 2.2509, 'grad_norm': 1.2393014430999756, 'learning_rate': 1.5282042033925183e-05, 'epoch': 0.32}


 32%|โโโโ      | 988/3057 [32:01:32<91:32:00, 159.27s/it]

{'loss': 2.223, 'grad_norm': 1.059856653213501, 'learning_rate': 1.5273313106532998e-05, 'epoch': 0.32}


 32%|โโโโ      | 989/3057 [32:04:12<91:36:04, 159.46s/it]

{'loss': 2.2553, 'grad_norm': 1.3524599075317383, 'learning_rate': 1.526457860994628e-05, 'epoch': 0.32}


 32%|โโโโ      | 990/3057 [32:06:46<90:34:07, 157.74s/it]

{'loss': 2.2605, 'grad_norm': 1.488606572151184, 'learning_rate': 1.5255838553389612e-05, 'epoch': 0.32}


 32%|โโโโ      | 991/3057 [32:09:12<88:34:28, 154.34s/it]

{'loss': 2.2504, 'grad_norm': 1.2633754014968872, 'learning_rate': 1.5247092946093442e-05, 'epoch': 0.32}


 32%|โโโโ      | 992/3057 [32:11:45<88:18:51, 153.96s/it]

{'loss': 2.3418, 'grad_norm': 1.1924654245376587, 'learning_rate': 1.523834179729409e-05, 'epoch': 0.32}


 32%|โโโโ      | 993/3057 [32:14:37<91:21:30, 159.35s/it]

{'loss': 2.2435, 'grad_norm': 1.323471188545227, 'learning_rate': 1.5229585116233725e-05, 'epoch': 0.32}


 33%|โโโโ      | 994/3057 [32:17:16<91:13:41, 159.20s/it]

{'loss': 2.2521, 'grad_norm': 1.167954921722412, 'learning_rate': 1.5220822912160355e-05, 'epoch': 0.33}


 33%|โโโโ      | 995/3057 [32:20:05<92:50:58, 162.10s/it]

{'loss': 2.2268, 'grad_norm': 1.3446094989776611, 'learning_rate': 1.5212055194327828e-05, 'epoch': 0.33}


 33%|โโโโ      | 996/3057 [32:22:38<91:21:36, 159.58s/it]

{'loss': 2.2638, 'grad_norm': 1.1806795597076416, 'learning_rate': 1.5203281971995813e-05, 'epoch': 0.33}


 33%|โโโโ      | 997/3057 [32:25:10<89:59:50, 157.28s/it]

{'loss': 2.2866, 'grad_norm': 1.2887327671051025, 'learning_rate': 1.5194503254429785e-05, 'epoch': 0.33}


 33%|โโโโ      | 998/3057 [32:27:50<90:22:54, 158.03s/it]

{'loss': 2.2705, 'grad_norm': 1.5536741018295288, 'learning_rate': 1.5185719050901035e-05, 'epoch': 0.33}


 33%|โโโโ      | 999/3057 [32:30:25<89:52:11, 157.21s/it]

{'loss': 2.2421, 'grad_norm': 1.313948631286621, 'learning_rate': 1.5176929370686637e-05, 'epoch': 0.33}


 33%|โโโโ      | 1000/3057 [32:32:52<87:58:45, 153.97s/it]

{'loss': 2.2413, 'grad_norm': 1.389561414718628, 'learning_rate': 1.5168134223069457e-05, 'epoch': 0.33}


 33%|โโโโ      | 1001/3057 [32:35:24<87:38:59, 153.47s/it]

{'loss': 2.2778, 'grad_norm': 1.4211535453796387, 'learning_rate': 1.5159333617338126e-05, 'epoch': 0.33}


 33%|โโโโ      | 1002/3057 [32:37:51<86:25:06, 151.39s/it]

{'loss': 2.277, 'grad_norm': 1.26664400100708, 'learning_rate': 1.5150527562787048e-05, 'epoch': 0.33}


 33%|โโโโ      | 1003/3057 [32:40:18<85:44:44, 150.28s/it]

{'loss': 2.352, 'grad_norm': 1.4680646657943726, 'learning_rate': 1.5141716068716381e-05, 'epoch': 0.33}


 33%|โโโโ      | 1004/3057 [32:43:05<88:33:17, 155.28s/it]

{'loss': 2.2924, 'grad_norm': 1.4090347290039062, 'learning_rate': 1.5132899144432015e-05, 'epoch': 0.33}


 33%|โโโโ      | 1005/3057 [32:45:31<86:49:01, 152.31s/it]

{'loss': 2.2371, 'grad_norm': 1.2801343202590942, 'learning_rate': 1.5124076799245599e-05, 'epoch': 0.33}


 33%|โโโโ      | 1006/3057 [32:48:05<87:10:49, 153.02s/it]

{'loss': 2.2479, 'grad_norm': 1.4794948101043701, 'learning_rate': 1.5115249042474485e-05, 'epoch': 0.33}


 33%|โโโโ      | 1007/3057 [32:50:39<87:19:39, 153.36s/it]

{'loss': 2.2544, 'grad_norm': 1.1630439758300781, 'learning_rate': 1.5106415883441748e-05, 'epoch': 0.33}


 33%|โโโโ      | 1008/3057 [32:53:24<89:10:31, 156.68s/it]

{'loss': 2.223, 'grad_norm': 1.6337319612503052, 'learning_rate': 1.5097577331476166e-05, 'epoch': 0.33}


 33%|โโโโ      | 1009/3057 [32:56:10<90:40:02, 159.38s/it]

{'loss': 2.2952, 'grad_norm': 1.487809419631958, 'learning_rate': 1.5088733395912226e-05, 'epoch': 0.33}


 33%|โโโโ      | 1010/3057 [32:58:55<91:36:06, 161.10s/it]

{'loss': 2.251, 'grad_norm': 1.3404276371002197, 'learning_rate': 1.5079884086090078e-05, 'epoch': 0.33}


 33%|โโโโ      | 1011/3057 [33:01:38<91:54:23, 161.71s/it]

{'loss': 2.264, 'grad_norm': 1.259911298751831, 'learning_rate': 1.5071029411355565e-05, 'epoch': 0.33}


 33%|โโโโ      | 1012/3057 [33:04:03<89:00:48, 156.70s/it]

{'loss': 2.2078, 'grad_norm': 1.3500641584396362, 'learning_rate': 1.5062169381060187e-05, 'epoch': 0.33}


 33%|โโโโ      | 1013/3057 [33:06:43<89:29:07, 157.61s/it]

{'loss': 2.3239, 'grad_norm': 1.186132550239563, 'learning_rate': 1.5053304004561113e-05, 'epoch': 0.33}


 33%|โโโโ      | 1014/3057 [33:09:22<89:47:29, 158.22s/it]

{'loss': 2.3083, 'grad_norm': 1.1708967685699463, 'learning_rate': 1.5044433291221137e-05, 'epoch': 0.33}


 33%|โโโโ      | 1015/3057 [33:12:07<90:47:57, 160.08s/it]

{'loss': 2.2652, 'grad_norm': 1.2999874353408813, 'learning_rate': 1.5035557250408707e-05, 'epoch': 0.33}


 33%|โโโโ      | 1016/3057 [33:14:47<90:48:55, 160.18s/it]

{'loss': 2.214, 'grad_norm': 1.1898179054260254, 'learning_rate': 1.5026675891497892e-05, 'epoch': 0.33}


 33%|โโโโ      | 1017/3057 [33:17:47<94:07:38, 166.11s/it]

{'loss': 2.3061, 'grad_norm': 1.2379482984542847, 'learning_rate': 1.5017789223868377e-05, 'epoch': 0.33}


 33%|โโโโ      | 1018/3057 [33:21:23<102:36:23, 181.16s/it]

{'loss': 2.2711, 'grad_norm': 1.1265980005264282, 'learning_rate': 1.5008897256905448e-05, 'epoch': 0.33}


 33%|โโโโ      | 1019/3057 [33:24:02<98:46:43, 174.49s/it] 

{'loss': 2.2754, 'grad_norm': 1.5302391052246094, 'learning_rate': 1.5000000000000002e-05, 'epoch': 0.33}


 33%|โโโโ      | 1020/3057 [33:26:50<97:39:01, 172.58s/it]

{'loss': 2.2946, 'grad_norm': 1.2066689729690552, 'learning_rate': 1.4991097462548507e-05, 'epoch': 0.33}


 33%|โโโโ      | 1021/3057 [33:29:56<99:46:07, 176.41s/it]

{'loss': 2.2445, 'grad_norm': 1.7555292844772339, 'learning_rate': 1.4982189653953017e-05, 'epoch': 0.33}


 33%|โโโโ      | 1022/3057 [33:32:57<100:30:07, 177.79s/it]

{'loss': 2.2269, 'grad_norm': 1.450284719467163, 'learning_rate': 1.4973276583621151e-05, 'epoch': 0.33}


 33%|โโโโ      | 1023/3057 [33:35:50<99:45:05, 176.55s/it] 

{'loss': 2.3148, 'grad_norm': 1.0831515789031982, 'learning_rate': 1.4964358260966086e-05, 'epoch': 0.33}


 33%|โโโโ      | 1024/3057 [33:38:31<96:58:42, 171.73s/it]

{'loss': 2.2522, 'grad_norm': 1.490062952041626, 'learning_rate': 1.495543469540654e-05, 'epoch': 0.33}


 34%|โโโโ      | 1025/3057 [33:41:32<98:29:34, 174.50s/it]

{'loss': 2.3219, 'grad_norm': 1.1890230178833008, 'learning_rate': 1.4946505896366779e-05, 'epoch': 0.34}


 34%|โโโโ      | 1026/3057 [33:44:09<95:32:10, 169.34s/it]

{'loss': 2.27, 'grad_norm': 1.454338550567627, 'learning_rate': 1.4937571873276584e-05, 'epoch': 0.34}


 34%|โโโโ      | 1027/3057 [33:46:59<95:34:56, 169.51s/it]

{'loss': 2.2394, 'grad_norm': 1.6242173910140991, 'learning_rate': 1.4928632635571262e-05, 'epoch': 0.34}


 34%|โโโโ      | 1028/3057 [33:49:40<94:03:28, 166.88s/it]

{'loss': 2.2501, 'grad_norm': 1.101784110069275, 'learning_rate': 1.4919688192691624e-05, 'epoch': 0.34}


 34%|โโโโ      | 1029/3057 [33:52:33<95:02:21, 168.71s/it]

{'loss': 2.2353, 'grad_norm': 1.1301451921463013, 'learning_rate': 1.4910738554083982e-05, 'epoch': 0.34}


 34%|โโโโ      | 1030/3057 [33:55:11<93:17:29, 165.69s/it]

{'loss': 2.2387, 'grad_norm': 1.3284721374511719, 'learning_rate': 1.4901783729200126e-05, 'epoch': 0.34}


 34%|โโโโ      | 1031/3057 [33:57:56<93:05:24, 165.41s/it]

{'loss': 2.3313, 'grad_norm': 1.0240199565887451, 'learning_rate': 1.4892823727497332e-05, 'epoch': 0.34}


 34%|โโโโ      | 1032/3057 [34:00:38<92:31:11, 164.48s/it]

{'loss': 2.2101, 'grad_norm': 1.5241190195083618, 'learning_rate': 1.4883858558438338e-05, 'epoch': 0.34}


 34%|โโโโ      | 1033/3057 [34:03:29<93:30:38, 166.32s/it]

{'loss': 2.2835, 'grad_norm': 1.1712775230407715, 'learning_rate': 1.4874888231491348e-05, 'epoch': 0.34}


 34%|โโโโ      | 1034/3057 [34:06:17<93:46:37, 166.88s/it]

{'loss': 2.2766, 'grad_norm': 1.2905625104904175, 'learning_rate': 1.4865912756129997e-05, 'epoch': 0.34}


 34%|โโโโ      | 1035/3057 [34:09:08<94:20:59, 167.98s/it]

{'loss': 2.1995, 'grad_norm': 1.439602017402649, 'learning_rate': 1.4856932141833374e-05, 'epoch': 0.34}


 34%|โโโโ      | 1036/3057 [34:12:13<97:10:34, 173.10s/it]

{'loss': 2.2124, 'grad_norm': 1.0598971843719482, 'learning_rate': 1.4847946398085986e-05, 'epoch': 0.34}


 34%|โโโโ      | 1037/3057 [34:15:17<98:54:52, 176.28s/it]

{'loss': 2.2505, 'grad_norm': 1.3240766525268555, 'learning_rate': 1.4838955534377762e-05, 'epoch': 0.34}


 34%|โโโโ      | 1038/3057 [34:18:18<99:39:40, 177.70s/it]

{'loss': 2.3213, 'grad_norm': 1.2460027933120728, 'learning_rate': 1.4829959560204035e-05, 'epoch': 0.34}


 34%|โโโโ      | 1039/3057 [34:21:13<99:16:25, 177.10s/it]

{'loss': 2.2617, 'grad_norm': 1.152348279953003, 'learning_rate': 1.4820958485065533e-05, 'epoch': 0.34}


 34%|โโโโ      | 1040/3057 [34:24:08<98:46:52, 176.31s/it]

{'loss': 2.2696, 'grad_norm': 1.3542758226394653, 'learning_rate': 1.4811952318468381e-05, 'epoch': 0.34}


 34%|โโโโ      | 1041/3057 [34:26:55<97:17:49, 173.74s/it]

{'loss': 2.2633, 'grad_norm': 1.4053186178207397, 'learning_rate': 1.4802941069924068e-05, 'epoch': 0.34}


 34%|โโโโ      | 1042/3057 [34:30:06<100:05:45, 178.83s/it]

{'loss': 2.2628, 'grad_norm': 1.3969449996948242, 'learning_rate': 1.4793924748949459e-05, 'epoch': 0.34}


 34%|โโโโ      | 1043/3057 [34:33:04<99:53:03, 178.54s/it] 

{'loss': 2.295, 'grad_norm': 1.415513515472412, 'learning_rate': 1.4784903365066777e-05, 'epoch': 0.34}


 34%|โโโโ      | 1044/3057 [34:36:01<99:36:36, 178.14s/it]

{'loss': 2.2588, 'grad_norm': 1.5023492574691772, 'learning_rate': 1.4775876927803581e-05, 'epoch': 0.34}


 34%|โโโโ      | 1045/3057 [34:39:17<102:35:46, 183.57s/it]

{'loss': 2.3229, 'grad_norm': 1.1900734901428223, 'learning_rate': 1.4766845446692781e-05, 'epoch': 0.34}


 34%|โโโโ      | 1046/3057 [34:42:17<101:56:27, 182.49s/it]

{'loss': 2.3072, 'grad_norm': 1.340106725692749, 'learning_rate': 1.4757808931272606e-05, 'epoch': 0.34}


 34%|โโโโ      | 1047/3057 [34:44:58<98:16:38, 176.02s/it] 

{'loss': 2.2473, 'grad_norm': 1.3335916996002197, 'learning_rate': 1.47487673910866e-05, 'epoch': 0.34}


 34%|โโโโ      | 1048/3057 [34:47:51<97:39:18, 174.99s/it]

{'loss': 2.2455, 'grad_norm': 1.3117812871932983, 'learning_rate': 1.4739720835683619e-05, 'epoch': 0.34}


 34%|โโโโ      | 1049/3057 [34:50:38<96:15:39, 172.58s/it]

{'loss': 2.2679, 'grad_norm': 1.3805127143859863, 'learning_rate': 1.4730669274617817e-05, 'epoch': 0.34}


 34%|โโโโ      | 1050/3057 [34:53:20<94:26:58, 169.42s/it]

{'loss': 2.2437, 'grad_norm': 1.181418776512146, 'learning_rate': 1.4721612717448625e-05, 'epoch': 0.34}


 34%|โโโโ      | 1051/3057 [34:56:26<97:14:01, 174.50s/it]

{'loss': 2.2529, 'grad_norm': 1.4499517679214478, 'learning_rate': 1.4712551173740764e-05, 'epoch': 0.34}


 34%|โโโโ      | 1052/3057 [34:59:29<98:33:36, 176.97s/it]

{'loss': 2.2889, 'grad_norm': 1.199668526649475, 'learning_rate': 1.4703484653064202e-05, 'epoch': 0.34}


 34%|โโโโ      | 1053/3057 [35:02:14<96:27:42, 173.28s/it]

{'loss': 2.2526, 'grad_norm': 1.2420663833618164, 'learning_rate': 1.4694413164994186e-05, 'epoch': 0.34}


 34%|โโโโ      | 1054/3057 [35:04:53<94:01:55, 169.00s/it]

{'loss': 2.2273, 'grad_norm': 1.3932193517684937, 'learning_rate': 1.4685336719111196e-05, 'epoch': 0.34}


 35%|โโโโ      | 1055/3057 [35:07:25<91:09:47, 163.93s/it]

{'loss': 2.3092, 'grad_norm': 1.3193947076797485, 'learning_rate': 1.4676255325000944e-05, 'epoch': 0.35}


 35%|โโโโ      | 1056/3057 [35:10:01<89:51:51, 161.67s/it]

{'loss': 2.2777, 'grad_norm': 1.130484938621521, 'learning_rate': 1.4667168992254378e-05, 'epoch': 0.35}


 35%|โโโโ      | 1057/3057 [35:12:47<90:34:48, 163.04s/it]

{'loss': 2.3674, 'grad_norm': 1.2529970407485962, 'learning_rate': 1.465807773046766e-05, 'epoch': 0.35}


 35%|โโโโ      | 1058/3057 [35:15:25<89:33:45, 161.29s/it]

{'loss': 2.3436, 'grad_norm': 1.283518671989441, 'learning_rate': 1.4648981549242151e-05, 'epoch': 0.35}


 35%|โโโโ      | 1059/3057 [35:18:03<89:00:04, 160.36s/it]

{'loss': 2.265, 'grad_norm': 1.1688164472579956, 'learning_rate': 1.4639880458184415e-05, 'epoch': 0.35}


 35%|โโโโ      | 1060/3057 [35:20:55<90:58:49, 164.01s/it]

{'loss': 2.2334, 'grad_norm': 1.2540873289108276, 'learning_rate': 1.4630774466906195e-05, 'epoch': 0.35}


 35%|โโโโ      | 1061/3057 [35:23:52<93:02:52, 167.82s/it]

{'loss': 2.3136, 'grad_norm': 1.0617879629135132, 'learning_rate': 1.462166358502442e-05, 'epoch': 0.35}


 35%|โโโโ      | 1062/3057 [35:26:38<92:45:10, 167.37s/it]

{'loss': 2.2394, 'grad_norm': 1.256881594657898, 'learning_rate': 1.4612547822161163e-05, 'epoch': 0.35}


 35%|โโโโ      | 1063/3057 [35:29:19<91:31:32, 165.24s/it]

{'loss': 2.2368, 'grad_norm': 1.146464467048645, 'learning_rate': 1.4603427187943675e-05, 'epoch': 0.35}


 35%|โโโโ      | 1064/3057 [35:32:09<92:18:46, 166.75s/it]

{'loss': 2.3401, 'grad_norm': 1.2777312994003296, 'learning_rate': 1.4594301692004342e-05, 'epoch': 0.35}


 35%|โโโโ      | 1065/3057 [35:35:19<96:07:42, 173.73s/it]

{'loss': 2.2486, 'grad_norm': 1.2560722827911377, 'learning_rate': 1.4585171343980677e-05, 'epoch': 0.35}


 35%|โโโโ      | 1066/3057 [35:38:16<96:39:40, 174.78s/it]

{'loss': 2.3311, 'grad_norm': 1.145797848701477, 'learning_rate': 1.4576036153515332e-05, 'epoch': 0.35}


 35%|โโโโ      | 1067/3057 [35:42:27<109:12:06, 197.55s/it]

{'loss': 2.2603, 'grad_norm': 1.3672503232955933, 'learning_rate': 1.4566896130256064e-05, 'epoch': 0.35}


 35%|โโโโ      | 1068/3057 [35:46:36<117:44:45, 213.12s/it]

{'loss': 2.3268, 'grad_norm': 1.1310889720916748, 'learning_rate': 1.4557751283855734e-05, 'epoch': 0.35}


 35%|โโโโ      | 1069/3057 [35:50:11<117:52:52, 213.47s/it]

{'loss': 2.3627, 'grad_norm': 1.4134234189987183, 'learning_rate': 1.4548601623972299e-05, 'epoch': 0.35}


 35%|โโโโ      | 1070/3057 [35:52:54<109:28:30, 198.34s/it]

{'loss': 2.2205, 'grad_norm': 1.1366780996322632, 'learning_rate': 1.45394471602688e-05, 'epoch': 0.35}


 35%|โโโโ      | 1071/3057 [35:56:56<116:46:55, 211.69s/it]

{'loss': 2.2463, 'grad_norm': 1.6365485191345215, 'learning_rate': 1.4530287902413349e-05, 'epoch': 0.35}


 35%|โโโโ      | 1072/3057 [36:00:38<118:20:45, 214.63s/it]

{'loss': 2.2889, 'grad_norm': 1.2438737154006958, 'learning_rate': 1.452112386007912e-05, 'epoch': 0.35}


 35%|โโโโ      | 1073/3057 [36:04:21<119:42:06, 217.20s/it]

{'loss': 2.2294, 'grad_norm': 1.1670339107513428, 'learning_rate': 1.451195504294435e-05, 'epoch': 0.35}


 35%|โโโโ      | 1074/3057 [36:08:24<123:52:08, 224.88s/it]

{'loss': 2.1522, 'grad_norm': 1.438173770904541, 'learning_rate': 1.4502781460692308e-05, 'epoch': 0.35}


 35%|โโโโ      | 1075/3057 [36:12:24<126:18:20, 229.41s/it]

{'loss': 2.3245, 'grad_norm': 1.1499971151351929, 'learning_rate': 1.4493603123011294e-05, 'epoch': 0.35}


In [13]:
# Disabled cell: when enabled, saves `trainer.model` with `save_pretrained`
# into the `run/v2/final` directory given below.
# NOTE(review): this path lives under "/media/.../Local Disk", whereas the load
# cell further down reads `run/v2/final` from "/home/.../Documents" - confirm
# which location actually holds the saved model before re-enabling either cell.
# trainer.model.save_pretrained("/media/muhammad-osama/Local Disk/Llama-3.2-1B-Instruct-training-artifacts/run/v2/final")

In [14]:
# from peft import AutoPeftModelForCausalLM
# import torch

In [16]:
# Disabled cell: when enabled, reloads the saved PEFT model from the
# `run/v2/final` directory with bfloat16 weights and binds it to `peft_model`
# (the `low_cpu_mem_usage` option is left switched off).
# NOTE(review): relies on `AutoPeftModelForCausalLM` and `torch` from the import
# cell directly above, which is also commented out - re-enable both together.
# peft_model = AutoPeftModelForCausalLM.from_pretrained(
#     "/home/muhammad-osama/Documents/Llama-3.2-1B-Instruct-training-artifacts/run/v2/final",
#     # low_cpu_mem_usage=True,
#     torch_dtype=torch.bfloat16
# )

In [17]:
# Disabled cell: a bare `peft_model` expression that displays the model's repr.
# The output retained below comes from an earlier execution and shows a
# LlamaForCausalLM wrapped in a LoraModel whose LoRA A/B projections have rank 32.
# NOTE(review): the model-loading cell at the top of this notebook selects a
# Qwen2.5 checkpoint, so this retained output presumably belongs to a different
# (Llama-3.2-1B-Instruct) run - confirm before relying on it.
# peft_model

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): LlamaForCausalLM(
      (model): LlamaModel(
        (embed_tokens): Embedding(128256, 2048)
        (layers): ModuleList(
          (0-15): 16 x LlamaDecoderLayer(
            (self_attn): LlamaSdpaAttention(
              (q_proj): lora.Linear(
                (base_layer): Linear(in_features=2048, out_features=2048, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Identity()
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=2048, out_features=32, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=32, out_features=2048, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
              (k_proj): lora.Linear(
              

In [18]:
# Merge the LoRA weights into the base model. This cell only performs the merge
# and binds the result to `merged_model`; writing it to disk happens in the
# following cell.
# Disabled: requires `peft_model` from the (also disabled) load cell above.
# merged_model = peft_model.merge_and_unload()

In [19]:
# Disabled cell: when enabled, writes `merged_model` to the `run/v2/lora_merged`
# directory via `save_pretrained`.
# NOTE(review): the destination is an absolute, user-specific path - consider
# routing it through a single configurable output directory.
# merged_model.save_pretrained("/home/muhammad-osama/Documents/Llama-3.2-1B-Instruct-training-artifacts/run/v2/lora_merged")

In [20]:
# Disabled cell: when enabled, loads the tokenizer published under
# "unsloth/Llama-3.2-1B-Instruct" and saves it into the same `lora_merged`
# directory as the merged weights. The output retained below lists the three
# tokenizer files that an earlier execution wrote.
# NOTE(review): running this rebinds the notebook-level `tokenizer` created by
# the model-loading cell at the top of the notebook.
# NOTE(review): the tokenizer is taken from the hub by name rather than from the
# training run - confirm the two are identical (no added tokens or template
# changes) before shipping the merged directory.
# from transformers import AutoModelForCausalLM, AutoTokenizer

# model_name = "unsloth/Llama-3.2-1B-Instruct"
# tokenizer = AutoTokenizer.from_pretrained(model_name)
# tokenizer.save_pretrained("/home/muhammad-osama/Documents/Llama-3.2-1B-Instruct-training-artifacts/run/v2/lora_merged")

('/home/muhammad-osama/Documents/Llama-3.2-1B-Instruct-training-artifacts/run/v2/lora_merged/tokenizer_config.json',
 '/home/muhammad-osama/Documents/Llama-3.2-1B-Instruct-training-artifacts/run/v2/lora_merged/special_tokens_map.json',
 '/home/muhammad-osama/Documents/Llama-3.2-1B-Instruct-training-artifacts/run/v2/lora_merged/tokenizer.json')