In [10]:
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import LongformerTokenizer, LongformerForSequenceClassification, Trainer, TrainingArguments, EarlyStoppingCallback
import torch
import numpy as np
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.utils.class_weight import compute_class_weight

In [11]:
# Load the dataset
df = pd.read_csv("combined_output.csv", on_bad_lines='skip')

In [12]:
df.head()

Unnamed: 0,original_id,source_name,title,url,body,date_published,language,date_modified,author_list,images,description,sentiment,emotions,entities,quotations,prValues,clipping,label,category
0,'5404c9d2fd24852afa122f2cc01cb3acba3c5d05b682d...,'kabargayo.com,'Kredit sepeda motor bisa terbayar jika Anda m...,'https://www.kabargayo.com/2024/09/19/kredit-s...,"'Jakarta, VIVA – Pembayaran kredit sepeda moto...",19/09/2024 22.32,'id,19/09/2024 22.32,'Aldi Hadad,'https://i1.wp.com/thumb.viva.co.id/media/fron...,,'positive,,"'Hari Pembayaran Berbayar atau Harcilnas 2024,...",,5250000,Adira,,
1,'e1e3f8d68b58568e8217b7562d48de634fceb0d837135...,'viva.co.id,'Kredit Motor Bisa Lunas Jika Bayar Cicilan Te...,'https://www.viva.co.id/otomotif/tips/1753596-...,"'Jakarta, VIVA – Cicilan kredit motor yang ser...",19/09/2024 22.30,'id,19/09/2024 22.30,'Krisna Wicaksono,'https://thumb.viva.co.id/media/frontend/thumb...,,'positive,,"'Harinya Cicilan Lunas,2024,PT Adira Dinamika ...",,5250000,Adira,,
2,'dca74b8fa4eabf60cebfa7b811ecb385872a0fd301eaf...,'kabarmegapolitan.pikiran-rakyat.com,'Adira Finance Umumkan Pemenang HARCILNAS 2024...,'https://kabarmegapolitan.pikiran-rakyat.com/b...,'KABARMEGAPOLITAN.com - PT Adira Dinamika Mult...,19/09/2024 21.45,'id,19/09/2024 21.45,'Yuliansyah,'https://assets.pikiran-rakyat.com/www/network...,,'positive,'HARCILNAS merupakan wujud apresiasi kami kepa...,"'PT Adira Dinamika,Cicilan Lunas HARCILNAS,12 ...","(Person :Tania Endah Budhi ,Quote : HARCILNAS ...",5250000,Adira,,
3,'56c73e2a6d254a17a5cc21dee7ed0b4660c3af70c093b...,'banggairaya.id,"'Dapatkan Promo Menarik, Yamaha Prima Motor Ra...",'https://banggairaya.id/dapatkan-promo-menarik...,'BANGGAI RAYA- Yamaha Prima Motor ramaikan pam...,19/09/2024 19.45,'id,19/09/2024 19.45,'Chikal Connect,'https://i0.wp.com/banggairaya.id/wp-content/u...,,'neutral,,"'RAYA- Yamaha Prima Motor,Banggai Goverment Ex...",,5250000,Adira,,
4,'1cd6c6db60224b6ee5f49cd5d6c62cd2850d9f3255721...,'jakarta.tribunnews.com,"'Sindikat Penipuan Leasing, Satu Bulan Ajukan ...",'https://jakarta.tribunnews.com/2024/09/19/sin...,'Laporan wartawan TribunJakarta.com Yusuf Bach...,19/09/2024 18.43,'id,19/09/2024 18.43,'Yusuf Bachtiar,'https://asset-2.tstatic.net/jakarta/foto/bank...,,'neutral,'Pelaku ini melakukan pembiayaan pembelian ken...,"'Yusuf Bachtiar TRIBUNJAKARTACOM,MEDAN,SATRIA,...","(Person :Dedi ,Quote : Pelaku ini melakukan pe...",5250000,Adira,,


In [13]:
# Clean extra characters from all columns in the DataFrame
for column in df.columns:
    if df[column].dtype == 'object':  # Check if the column is of string type
        df[column] = df[column].str.strip("'")  # Remove extra characters

# Filter relevant columns
df_filtered = df[['body', 'sentiment']].dropna()


In [5]:
df.head()

Unnamed: 0,original_id,source_name,title,url,body,date_published,language,date_modified,author_list,images,description,sentiment,emotions,entities,quotations,prValues,clipping,label,category
0,5404c9d2fd24852afa122f2cc01cb3acba3c5d05b682d4...,kabargayo.com,Kredit sepeda motor bisa terbayar jika Anda me...,https://www.kabargayo.com/2024/09/19/kredit-se...,"Jakarta, VIVA – Pembayaran kredit sepeda motor...",19/09/2024 22.32,id,19/09/2024 22.32,Aldi Hadad,https://i1.wp.com/thumb.viva.co.id/media/front...,,positive,,"Hari Pembayaran Berbayar atau Harcilnas 2024,P...",,5250000,Adira,,
1,e1e3f8d68b58568e8217b7562d48de634fceb0d8371356...,viva.co.id,Kredit Motor Bisa Lunas Jika Bayar Cicilan Tep...,https://www.viva.co.id/otomotif/tips/1753596-k...,"Jakarta, VIVA – Cicilan kredit motor yang seri...",19/09/2024 22.30,id,19/09/2024 22.30,Krisna Wicaksono,https://thumb.viva.co.id/media/frontend/thumbs...,,positive,,"Harinya Cicilan Lunas,2024,PT Adira Dinamika M...",,5250000,Adira,,
2,dca74b8fa4eabf60cebfa7b811ecb385872a0fd301eaf5...,kabarmegapolitan.pikiran-rakyat.com,Adira Finance Umumkan Pemenang HARCILNAS 2024:...,https://kabarmegapolitan.pikiran-rakyat.com/bi...,KABARMEGAPOLITAN.com - PT Adira Dinamika Multi...,19/09/2024 21.45,id,19/09/2024 21.45,Yuliansyah,https://assets.pikiran-rakyat.com/www/network/...,,positive,HARCILNAS merupakan wujud apresiasi kami kepad...,"PT Adira Dinamika,Cicilan Lunas HARCILNAS,12 p...","(Person :Tania Endah Budhi ,Quote : HARCILNAS ...",5250000,Adira,,
3,56c73e2a6d254a17a5cc21dee7ed0b4660c3af70c093b2...,banggairaya.id,"Dapatkan Promo Menarik, Yamaha Prima Motor Ram...",https://banggairaya.id/dapatkan-promo-menarik-...,BANGGAI RAYA- Yamaha Prima Motor ramaikan pame...,19/09/2024 19.45,id,19/09/2024 19.45,Chikal Connect,https://i0.wp.com/banggairaya.id/wp-content/up...,,neutral,,"RAYA- Yamaha Prima Motor,Banggai Goverment Exp...",,5250000,Adira,,
4,1cd6c6db60224b6ee5f49cd5d6c62cd2850d9f3255721d...,jakarta.tribunnews.com,"Sindikat Penipuan Leasing, Satu Bulan Ajukan K...",https://jakarta.tribunnews.com/2024/09/19/sind...,Laporan wartawan TribunJakarta.com Yusuf Bacht...,19/09/2024 18.43,id,19/09/2024 18.43,Yusuf Bachtiar,https://asset-2.tstatic.net/jakarta/foto/bank/...,,neutral,Pelaku ini melakukan pembiayaan pembelian kend...,"Yusuf Bachtiar TRIBUNJAKARTACOM,MEDAN,SATRIA,T...","(Person :Dedi ,Quote : Pelaku ini melakukan pe...",5250000,Adira,,


In [14]:
# Balance dataset classes
class_counts = df_filtered['sentiment'].value_counts()
min_class = class_counts.min()
df_balanced = df_filtered.groupby('sentiment').apply(lambda x: x.sample(min_class)).reset_index(drop=True)

In [15]:
# Split the data into training and validation sets
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df_balanced['body'].tolist(), df_balanced['sentiment'].tolist(), test_size=0.2, random_state=42
)

In [8]:
# Load the Longformer tokenizer
tokenizer = LongformerTokenizer.from_pretrained("allenai/longformer-base-4096")

# Tokenize the training and validation texts
# Increase sequence length
train_encodings = tokenizer(train_texts, truncation=True, padding="max_length", max_length=1024)
val_encodings = tokenizer(val_texts, truncation=True, padding="max_length", max_length=1024)


In [16]:
# Map sentiment labels to integers
label_mapping = {'positive': 2, 'neutral': 1, 'negative': 0}
train_labels = [label_mapping[label] for label in train_labels]
val_labels = [label_mapping[label] for label in val_labels]

In [10]:
class SentimentDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)


In [11]:
# Create datasets
train_dataset = SentimentDataset(train_encodings, train_labels)
val_dataset = SentimentDataset(val_encodings, val_labels)

In [12]:
# Check if GPU is available
# device = "cuda" if torch.cuda.is_available() else "cpu"
# print(f"Using device: {device}")

In [40]:
class LongformerWithWeightedLoss(LongformerForSequenceClassification):
    def __init__(self, config, class_weights):
        super().__init__(config)
        # Ensure class weights are in the correct format
        self.class_weights = class_weights.float()
        self.loss_fn = torch.nn.CrossEntropyLoss(weight=self.class_weights)

    def forward(self, input_ids=None, attention_mask=None, labels=None, **kwargs):
        outputs = super().forward(input_ids=input_ids, attention_mask=attention_mask, labels=labels, **kwargs)
        logits = outputs.logits
        loss = None
        if labels is not None:
            # Convert labels to float if necessary
            labels = labels.to(dtype=torch.long)  # Ensure labels are of the correct type
            loss = self.loss_fn(logits, labels)
        return {"loss": loss, "logits": logits}


In [41]:
# Compute class weights
class_weights = compute_class_weight(
    class_weight='balanced',
    classes=np.unique(train_labels),
    y=train_labels
)
class_weights = torch.tensor(class_weights, dtype=torch.float).to('cuda' if torch.cuda.is_available() else 'cpu')

In [42]:
# Load Longformer model for sequence classification
model = LongformerForSequenceClassification.from_pretrained("allenai/longformer-base-4096", num_labels=3)
model = LongformerWithWeightedLoss.from_pretrained("allenai/longformer-base-4096", config=model.config, class_weights=class_weights)

Some weights of LongformerForSequenceClassification were not initialized from the model checkpoint at allenai/longformer-base-4096 and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of LongformerWithWeightedLoss were not initialized from the model checkpoint at allenai/longformer-base-4096 and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight', 'loss_fn.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# # Modify model loss to include class weights
# def compute_loss_with_weights(labels, logits):
#     loss_fn = torch.nn.CrossEntropyLoss(weight=class_weights)
#     return loss_fn(logits, labels)

# model.config.loss_fn = compute_loss_with_weights 

In [43]:
# Training arguments
training_args = TrainingArguments(
    output_dir='./results',
    eval_strategy="epoch",              # Evaluate at the end of each epoch
    save_strategy="epoch",              # Save at the end of each epoch
    per_device_train_batch_size=2,      # Adjust based on GPU capacity
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=4,      # Simulate larger batch size
    num_train_epochs=5,
    learning_rate=3e-5,                 # Adjust as needed
    weight_decay=0.01,                  # Regularization
    fp16=True,                          # Mixed precision training
    logging_dir='./logs',
    logging_steps=10,
    save_total_limit=2,                 # Limit the number of saved models
    load_best_model_at_end=True,        # Load the best model
    metric_for_best_model='eval_loss',
    greater_is_better=False,
    lr_scheduler_type="cosine_with_restarts",
)

In [44]:
# Define Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=10)],  # Early stopping
)

In [45]:
import torch
torch.cuda.empty_cache()


In [47]:
# Train the model
trainer.train()

  0%|          | 0/2530 [03:26<?, ?it/s]
  0%|          | 3/2530 [01:08<15:55:19, 22.68s/it]
  0%|          | 10/2530 [01:04<4:31:09,  6.46s/it]

{'loss': 1.1538, 'grad_norm': 4.174160957336426, 'learning_rate': 2.999884358321664e-05, 'epoch': 0.02}


  1%|          | 20/2530 [02:09<4:29:53,  6.45s/it]

{'loss': 1.1433, 'grad_norm': 2.6724045276641846, 'learning_rate': 2.9995374511173196e-05, 'epoch': 0.04}


  1%|          | 30/2530 [03:13<4:29:41,  6.47s/it]

{'loss': 1.1241, 'grad_norm': 3.5480241775512695, 'learning_rate': 2.998959331876208e-05, 'epoch': 0.06}


  2%|▏         | 40/2530 [04:18<4:24:29,  6.37s/it]

{'loss': 1.1011, 'grad_norm': 4.565907955169678, 'learning_rate': 2.9981500897379025e-05, 'epoch': 0.08}


  2%|▏         | 50/2530 [05:21<4:26:19,  6.44s/it]

{'loss': 1.1035, 'grad_norm': 2.304727077484131, 'learning_rate': 2.9971098494785612e-05, 'epoch': 0.1}


  2%|▏         | 60/2530 [06:26<4:26:53,  6.48s/it]

{'loss': 1.1291, 'grad_norm': 3.1884634494781494, 'learning_rate': 2.9958387714916904e-05, 'epoch': 0.12}


  3%|▎         | 70/2530 [07:30<4:24:48,  6.46s/it]

{'loss': 1.125, 'grad_norm': 2.7557833194732666, 'learning_rate': 2.994337051763412e-05, 'epoch': 0.14}


  3%|▎         | 80/2530 [08:35<4:25:20,  6.50s/it]

{'loss': 1.1148, 'grad_norm': 4.8969879150390625, 'learning_rate': 2.9926049218422463e-05, 'epoch': 0.16}


  4%|▎         | 90/2530 [09:40<4:23:49,  6.49s/it]

{'loss': 1.1091, 'grad_norm': 2.8232221603393555, 'learning_rate': 2.9906426488034075e-05, 'epoch': 0.18}


  4%|▍         | 100/2530 [10:45<4:22:16,  6.48s/it]

{'loss': 1.1106, 'grad_norm': 3.2310383319854736, 'learning_rate': 2.9884505352076267e-05, 'epoch': 0.2}


  4%|▍         | 110/2530 [11:50<4:21:11,  6.48s/it]

{'loss': 1.105, 'grad_norm': 1.727150321006775, 'learning_rate': 2.986028919054496e-05, 'epoch': 0.22}


  5%|▍         | 120/2530 [12:55<4:20:57,  6.50s/it]

{'loss': 1.1367, 'grad_norm': 2.493455648422241, 'learning_rate': 2.983378173730359e-05, 'epoch': 0.24}


  5%|▌         | 130/2530 [14:00<4:19:54,  6.50s/it]

{'loss': 1.1247, 'grad_norm': 5.106982707977295, 'learning_rate': 2.9804987079507315e-05, 'epoch': 0.26}


  6%|▌         | 140/2530 [15:05<4:18:53,  6.50s/it]

{'loss': 1.0919, 'grad_norm': 2.936387777328491, 'learning_rate': 2.9773909656972887e-05, 'epoch': 0.28}


  6%|▌         | 150/2530 [16:09<4:17:15,  6.49s/it]

{'loss': 1.1167, 'grad_norm': 4.181922912597656, 'learning_rate': 2.974055426149403e-05, 'epoch': 0.3}


  6%|▋         | 160/2530 [17:14<4:12:48,  6.40s/it]

{'loss': 1.1213, 'grad_norm': 1.7450653314590454, 'learning_rate': 2.970492603610264e-05, 'epoch': 0.32}


  7%|▋         | 170/2530 [18:18<4:14:36,  6.47s/it]

{'loss': 1.0915, 'grad_norm': 2.061669111251831, 'learning_rate': 2.9667030474275747e-05, 'epoch': 0.34}


  7%|▋         | 180/2530 [19:23<4:13:59,  6.48s/it]

{'loss': 1.0965, 'grad_norm': 2.184063196182251, 'learning_rate': 2.9626873419088523e-05, 'epoch': 0.36}


  8%|▊         | 190/2530 [20:28<4:12:32,  6.48s/it]

{'loss': 1.1097, 'grad_norm': 4.798665523529053, 'learning_rate': 2.9584461062313305e-05, 'epoch': 0.38}


  8%|▊         | 200/2530 [21:32<4:04:47,  6.30s/it]

{'loss': 1.1115, 'grad_norm': 3.4666879177093506, 'learning_rate': 2.9539799943464923e-05, 'epoch': 0.39}


  8%|▊         | 210/2530 [22:36<4:09:33,  6.45s/it]

{'loss': 1.113, 'grad_norm': 1.7301105260849, 'learning_rate': 2.9492896948792364e-05, 'epoch': 0.41}


  9%|▊         | 220/2530 [23:41<4:09:21,  6.48s/it]

{'loss': 1.1144, 'grad_norm': 3.3344008922576904, 'learning_rate': 2.944375931021699e-05, 'epoch': 0.43}


  9%|▉         | 230/2530 [24:46<4:08:56,  6.49s/it]

{'loss': 1.1121, 'grad_norm': 1.0439478158950806, 'learning_rate': 2.939239460421746e-05, 'epoch': 0.45}


  9%|▉         | 240/2530 [25:51<4:07:49,  6.49s/it]

{'loss': 1.1114, 'grad_norm': 4.002546310424805, 'learning_rate': 2.9338810750661525e-05, 'epoch': 0.47}


 10%|▉         | 250/2530 [26:55<4:01:05,  6.34s/it]

{'loss': 1.1069, 'grad_norm': 2.5739941596984863, 'learning_rate': 2.928301601158485e-05, 'epoch': 0.49}


 10%|█         | 260/2530 [27:58<4:04:27,  6.46s/it]

{'loss': 1.1026, 'grad_norm': 3.239820957183838, 'learning_rate': 2.9225018989917138e-05, 'epoch': 0.51}


 11%|█         | 270/2530 [29:03<4:04:42,  6.50s/it]

{'loss': 1.1254, 'grad_norm': 5.537233352661133, 'learning_rate': 2.9164828628155618e-05, 'epoch': 0.53}


 11%|█         | 280/2530 [30:08<4:03:28,  6.49s/it]

{'loss': 1.1261, 'grad_norm': 2.435612201690674, 'learning_rate': 2.910245420698622e-05, 'epoch': 0.55}


 11%|█▏        | 290/2530 [31:13<4:02:12,  6.49s/it]

{'loss': 1.0987, 'grad_norm': 6.53463888168335, 'learning_rate': 2.9037905343852624e-05, 'epoch': 0.57}


 12%|█▏        | 300/2530 [32:18<4:01:45,  6.50s/it]

{'loss': 1.1198, 'grad_norm': 3.535907506942749, 'learning_rate': 2.8971191991473312e-05, 'epoch': 0.59}


 12%|█▏        | 310/2530 [33:23<4:00:19,  6.50s/it]

{'loss': 1.1086, 'grad_norm': 2.3132007122039795, 'learning_rate': 2.8902324436306995e-05, 'epoch': 0.61}


 13%|█▎        | 320/2530 [34:28<3:59:25,  6.50s/it]

{'loss': 1.0979, 'grad_norm': 3.5858161449432373, 'learning_rate': 2.8831313296966565e-05, 'epoch': 0.63}


 13%|█▎        | 330/2530 [35:33<3:58:05,  6.49s/it]

{'loss': 1.103, 'grad_norm': 1.6011021137237549, 'learning_rate': 2.8758169522581795e-05, 'epoch': 0.65}


 13%|█▎        | 340/2530 [36:38<3:57:30,  6.51s/it]

{'loss': 1.099, 'grad_norm': 2.3125827312469482, 'learning_rate': 2.8682904391111128e-05, 'epoch': 0.67}


 14%|█▍        | 350/2530 [37:40<3:43:00,  6.14s/it]

{'loss': 1.094, 'grad_norm': 1.853219985961914, 'learning_rate': 2.8605529507602727e-05, 'epoch': 0.69}


 14%|█▍        | 360/2530 [38:44<3:53:28,  6.46s/it]

{'loss': 1.1264, 'grad_norm': 1.8638333082199097, 'learning_rate': 2.852605680240511e-05, 'epoch': 0.71}


 15%|█▍        | 370/2530 [39:49<3:53:42,  6.49s/it]

{'loss': 1.0975, 'grad_norm': 1.0354242324829102, 'learning_rate': 2.8444498529327633e-05, 'epoch': 0.73}


 15%|█▌        | 380/2530 [40:54<3:52:39,  6.49s/it]

{'loss': 1.1127, 'grad_norm': 2.518148183822632, 'learning_rate': 2.8360867263751055e-05, 'epoch': 0.75}


 15%|█▌        | 390/2530 [41:59<3:50:48,  6.47s/it]

{'loss': 1.1147, 'grad_norm': 1.3903168439865112, 'learning_rate': 2.8275175900688607e-05, 'epoch': 0.77}


 16%|█▌        | 400/2530 [43:03<3:50:03,  6.48s/it]

{'loss': 1.0785, 'grad_norm': 3.5134799480438232, 'learning_rate': 2.8187437652797676e-05, 'epoch': 0.79}


 16%|█▌        | 410/2530 [44:08<3:49:11,  6.49s/it]

{'loss': 1.0855, 'grad_norm': 3.373004674911499, 'learning_rate': 2.8097666048342583e-05, 'epoch': 0.81}


 17%|█▋        | 420/2530 [45:13<3:48:24,  6.49s/it]

{'loss': 1.1042, 'grad_norm': 2.6540136337280273, 'learning_rate': 2.8005874929108662e-05, 'epoch': 0.83}


 17%|█▋        | 430/2530 [46:18<3:47:15,  6.49s/it]

{'loss': 1.1114, 'grad_norm': 3.1557300090789795, 'learning_rate': 2.791207844826804e-05, 'epoch': 0.85}


 17%|█▋        | 440/2530 [47:22<3:41:21,  6.35s/it]

{'loss': 1.1057, 'grad_norm': 3.3654706478118896, 'learning_rate': 2.781629106819733e-05, 'epoch': 0.87}


 18%|█▊        | 450/2530 [48:26<3:44:46,  6.48s/it]

{'loss': 1.1044, 'grad_norm': 2.0659000873565674, 'learning_rate': 2.7718527558247722e-05, 'epoch': 0.89}


 18%|█▊        | 460/2530 [49:31<3:43:55,  6.49s/it]

{'loss': 1.1005, 'grad_norm': 3.6929931640625, 'learning_rate': 2.7618802992467718e-05, 'epoch': 0.91}


 19%|█▊        | 470/2530 [50:36<3:42:50,  6.49s/it]

{'loss': 1.087, 'grad_norm': 2.417505979537964, 'learning_rate': 2.751713274727886e-05, 'epoch': 0.93}


 19%|█▉        | 480/2530 [51:41<3:41:59,  6.50s/it]

{'loss': 1.0927, 'grad_norm': 1.7178573608398438, 'learning_rate': 2.7413532499104862e-05, 'epoch': 0.95}


 19%|█▉        | 490/2530 [52:46<3:36:45,  6.38s/it]

{'loss': 1.0926, 'grad_norm': 1.1143302917480469, 'learning_rate': 2.73080182219545e-05, 'epoch': 0.97}


 20%|█▉        | 500/2530 [53:49<3:38:10,  6.45s/it]

{'loss': 1.1137, 'grad_norm': 2.998089075088501, 'learning_rate': 2.7200606184958567e-05, 'epoch': 0.99}


                                                    
 20%|██        | 506/2530 [1:08:37<3:38:53,  6.49s/it]

{'eval_loss': 1.0992237329483032, 'eval_runtime': 845.3488, 'eval_samples_per_second': 1.198, 'eval_steps_per_second': 0.6, 'epoch': 1.0}


 20%|██        | 510/2530 [1:09:05<52:46:11, 94.05s/it]  

{'loss': 1.105, 'grad_norm': 1.2614396810531616, 'learning_rate': 2.7091312949861367e-05, 'epoch': 1.01}


 21%|██        | 520/2530 [1:10:10<5:00:50,  8.98s/it] 

{'loss': 1.1106, 'grad_norm': 1.6377100944519043, 'learning_rate': 2.698015536846709e-05, 'epoch': 1.03}


 21%|██        | 530/2530 [1:11:15<3:38:18,  6.55s/it]

{'loss': 1.119, 'grad_norm': 2.331671714782715, 'learning_rate': 2.6867150580041436e-05, 'epoch': 1.05}


 21%|██▏       | 540/2530 [1:12:19<3:34:28,  6.47s/it]

{'loss': 1.1073, 'grad_norm': 1.3423969745635986, 'learning_rate': 2.6752316008668916e-05, 'epoch': 1.07}


 22%|██▏       | 550/2530 [1:13:24<3:34:40,  6.51s/it]

{'loss': 1.1001, 'grad_norm': 1.9027093648910522, 'learning_rate': 2.6635669360566298e-05, 'epoch': 1.09}


 22%|██▏       | 560/2530 [1:14:29<3:32:28,  6.47s/it]

{'loss': 1.1068, 'grad_norm': 4.592230319976807, 'learning_rate': 2.651722862135245e-05, 'epoch': 1.11}


 23%|██▎       | 570/2530 [1:15:34<3:32:24,  6.50s/it]

{'loss': 1.1195, 'grad_norm': 2.05647873878479, 'learning_rate': 2.63970120532752e-05, 'epoch': 1.13}


 23%|██▎       | 580/2530 [1:16:39<3:30:53,  6.49s/it]

{'loss': 1.1076, 'grad_norm': 2.176149845123291, 'learning_rate': 2.627503819239547e-05, 'epoch': 1.15}


 23%|██▎       | 590/2530 [1:17:44<3:29:51,  6.49s/it]

{'loss': 1.103, 'grad_norm': 1.6473824977874756, 'learning_rate': 2.6151325845729245e-05, 'epoch': 1.16}


 24%|██▎       | 600/2530 [1:18:49<3:28:18,  6.48s/it]

{'loss': 1.0992, 'grad_norm': 2.0042366981506348, 'learning_rate': 2.6025894088347723e-05, 'epoch': 1.18}


 24%|██▍       | 610/2530 [1:19:54<3:27:46,  6.49s/it]

{'loss': 1.1053, 'grad_norm': 1.2904692888259888, 'learning_rate': 2.5898762260436156e-05, 'epoch': 1.2}


 25%|██▍       | 620/2530 [1:20:59<3:26:31,  6.49s/it]

{'loss': 1.0754, 'grad_norm': 1.4390493631362915, 'learning_rate': 2.5769949964311814e-05, 'epoch': 1.22}


 25%|██▍       | 630/2530 [1:22:04<3:25:53,  6.50s/it]

{'loss': 1.0792, 'grad_norm': 3.210905075073242, 'learning_rate': 2.563947706140151e-05, 'epoch': 1.24}


 25%|██▌       | 640/2530 [1:23:09<3:24:33,  6.49s/it]

{'loss': 1.1154, 'grad_norm': 1.9668328762054443, 'learning_rate': 2.5507363669179215e-05, 'epoch': 1.26}


 26%|██▌       | 650/2530 [1:24:13<3:23:19,  6.49s/it]

{'loss': 1.1151, 'grad_norm': 1.618813157081604, 'learning_rate': 2.537363015806412e-05, 'epoch': 1.28}


 26%|██▌       | 660/2530 [1:25:18<3:22:34,  6.50s/it]

{'loss': 1.1253, 'grad_norm': 2.421704053878784, 'learning_rate': 2.5238297148279813e-05, 'epoch': 1.3}


 26%|██▋       | 670/2530 [1:26:23<3:21:16,  6.49s/it]

{'loss': 1.1023, 'grad_norm': 1.2199931144714355, 'learning_rate': 2.5101385506674793e-05, 'epoch': 1.32}


 27%|██▋       | 680/2530 [1:27:28<3:20:09,  6.49s/it]

{'loss': 1.0935, 'grad_norm': 2.8846426010131836, 'learning_rate': 2.4962916343505093e-05, 'epoch': 1.34}


 27%|██▋       | 690/2530 [1:28:33<3:18:52,  6.48s/it]

{'loss': 1.1044, 'grad_norm': 1.9602367877960205, 'learning_rate': 2.482291100917928e-05, 'epoch': 1.36}


 28%|██▊       | 700/2530 [1:29:38<3:17:42,  6.48s/it]

{'loss': 1.0978, 'grad_norm': 1.319777011871338, 'learning_rate': 2.4681391090966466e-05, 'epoch': 1.38}


 28%|██▊       | 710/2530 [1:30:43<3:17:12,  6.50s/it]

{'loss': 1.0987, 'grad_norm': 2.498842477798462, 'learning_rate': 2.4538378409667802e-05, 'epoch': 1.4}


 28%|██▊       | 720/2530 [1:31:48<3:15:44,  6.49s/it]

{'loss': 1.0982, 'grad_norm': 1.5828238725662231, 'learning_rate': 2.4393895016251943e-05, 'epoch': 1.42}


 29%|██▉       | 730/2530 [1:32:53<3:14:44,  6.49s/it]

{'loss': 1.1026, 'grad_norm': 2.539292573928833, 'learning_rate': 2.4247963188455027e-05, 'epoch': 1.44}


 29%|██▉       | 740/2530 [1:33:58<3:13:44,  6.49s/it]

{'loss': 1.1156, 'grad_norm': 2.326648712158203, 'learning_rate': 2.4100605427345704e-05, 'epoch': 1.46}


 30%|██▉       | 750/2530 [1:35:03<3:12:27,  6.49s/it]

{'loss': 1.0978, 'grad_norm': 1.187072992324829, 'learning_rate': 2.3951844453855727e-05, 'epoch': 1.48}


 30%|███       | 760/2530 [1:36:08<3:11:49,  6.50s/it]

{'loss': 1.1033, 'grad_norm': 1.7058727741241455, 'learning_rate': 2.3801703205276616e-05, 'epoch': 1.5}


 30%|███       | 770/2530 [1:37:12<3:10:42,  6.50s/it]

{'loss': 1.0913, 'grad_norm': 1.5051029920578003, 'learning_rate': 2.3650204831723007e-05, 'epoch': 1.52}


 31%|███       | 780/2530 [1:38:17<3:09:30,  6.50s/it]

{'loss': 1.1058, 'grad_norm': 3.853031873703003, 'learning_rate': 2.3497372692563143e-05, 'epoch': 1.54}


 31%|███       | 790/2530 [1:39:22<3:08:11,  6.49s/it]

{'loss': 1.103, 'grad_norm': 0.7603291869163513, 'learning_rate': 2.334323035281713e-05, 'epoch': 1.56}


 32%|███▏      | 800/2530 [1:40:27<3:06:55,  6.48s/it]

{'loss': 1.082, 'grad_norm': 2.471839666366577, 'learning_rate': 2.318780157952345e-05, 'epoch': 1.58}


 32%|███▏      | 810/2530 [1:41:32<3:06:25,  6.50s/it]

{'loss': 1.1316, 'grad_norm': 5.8995256423950195, 'learning_rate': 2.3031110338074388e-05, 'epoch': 1.6}


 32%|███▏      | 820/2530 [1:42:37<3:05:28,  6.51s/it]

{'loss': 1.0998, 'grad_norm': 2.591655731201172, 'learning_rate': 2.2873180788520794e-05, 'epoch': 1.62}


 33%|███▎      | 830/2530 [1:43:42<3:03:58,  6.49s/it]

{'loss': 1.1034, 'grad_norm': 4.002465724945068, 'learning_rate': 2.27140372818469e-05, 'epoch': 1.64}


 33%|███▎      | 840/2530 [1:44:47<3:02:53,  6.49s/it]

{'loss': 1.0994, 'grad_norm': 2.21405029296875, 'learning_rate': 2.2553704356215642e-05, 'epoch': 1.66}


 34%|███▎      | 850/2530 [1:45:52<3:02:10,  6.51s/it]

{'loss': 1.0949, 'grad_norm': 1.9501183032989502, 'learning_rate': 2.2392206733185175e-05, 'epoch': 1.68}


 34%|███▍      | 860/2530 [1:46:57<3:00:38,  6.49s/it]

{'loss': 1.1109, 'grad_norm': 1.5677443742752075, 'learning_rate': 2.2229569313897068e-05, 'epoch': 1.7}


 34%|███▍      | 870/2530 [1:48:02<2:59:37,  6.49s/it]

{'loss': 1.0873, 'grad_norm': 4.851618766784668, 'learning_rate': 2.2065817175236813e-05, 'epoch': 1.72}


 35%|███▍      | 880/2530 [1:49:07<2:58:13,  6.48s/it]

{'loss': 1.099, 'grad_norm': 1.392723560333252, 'learning_rate': 2.1900975565967284e-05, 'epoch': 1.74}


 35%|███▌      | 890/2530 [1:50:12<2:57:22,  6.49s/it]

{'loss': 1.0947, 'grad_norm': 2.178339719772339, 'learning_rate': 2.173506990283561e-05, 'epoch': 1.76}


 36%|███▌      | 900/2530 [1:51:17<2:56:31,  6.50s/it]

{'loss': 1.1047, 'grad_norm': 1.8291800022125244, 'learning_rate': 2.1568125766654236e-05, 'epoch': 1.78}


 36%|███▌      | 910/2530 [1:52:22<2:55:18,  6.49s/it]

{'loss': 1.0834, 'grad_norm': 3.165698289871216, 'learning_rate': 2.140016889835663e-05, 'epoch': 1.8}


 36%|███▋      | 920/2530 [1:53:27<2:54:02,  6.49s/it]

{'loss': 1.0918, 'grad_norm': 2.366227626800537, 'learning_rate': 2.12312251950283e-05, 'epoch': 1.82}


 37%|███▋      | 930/2530 [1:54:32<2:53:11,  6.49s/it]

{'loss': 1.1075, 'grad_norm': 2.4914615154266357, 'learning_rate': 2.1061320705913778e-05, 'epoch': 1.84}


 37%|███▋      | 940/2530 [1:55:36<2:51:40,  6.48s/it]

{'loss': 1.1081, 'grad_norm': 1.8227754831314087, 'learning_rate': 2.08904816284001e-05, 'epoch': 1.86}


 38%|███▊      | 950/2530 [1:56:41<2:50:58,  6.49s/it]

{'loss': 1.1104, 'grad_norm': 2.9958066940307617, 'learning_rate': 2.071873430397747e-05, 'epoch': 1.88}


 38%|███▊      | 960/2530 [1:57:46<2:49:47,  6.49s/it]

{'loss': 1.0915, 'grad_norm': 2.7777597904205322, 'learning_rate': 2.0546105214177678e-05, 'epoch': 1.9}


 38%|███▊      | 970/2530 [1:58:51<2:48:39,  6.49s/it]

{'loss': 1.1084, 'grad_norm': 1.5286182165145874, 'learning_rate': 2.037262097649096e-05, 'epoch': 1.92}


 39%|███▊      | 980/2530 [1:59:56<2:47:44,  6.49s/it]

{'loss': 1.0941, 'grad_norm': 1.1438041925430298, 'learning_rate': 2.019830834026186e-05, 'epoch': 1.93}


 39%|███▉      | 990/2530 [2:01:01<2:46:24,  6.48s/it]

{'loss': 1.0954, 'grad_norm': 2.214442014694214, 'learning_rate': 2.002319418256479e-05, 'epoch': 1.95}


 40%|███▉      | 1000/2530 [2:02:06<2:45:28,  6.49s/it]

{'loss': 1.1015, 'grad_norm': 2.129039764404297, 'learning_rate': 1.984730550405989e-05, 'epoch': 1.97}


 40%|███▉      | 1010/2530 [2:03:11<2:44:40,  6.50s/it]

{'loss': 1.0889, 'grad_norm': 1.8456953763961792, 'learning_rate': 1.967066942482978e-05, 'epoch': 1.99}


                                                       
 40%|████      | 1013/2530 [2:20:06<2:40:11,  6.34s/it]

{'eval_loss': 1.1049301624298096, 'eval_runtime': 996.601, 'eval_samples_per_second': 1.016, 'eval_steps_per_second': 0.509, 'epoch': 2.0}


 40%|████      | 1020/2530 [2:20:54<17:30:46, 41.75s/it]  

{'loss': 1.1097, 'grad_norm': 2.3184454441070557, 'learning_rate': 1.9493313180198024e-05, 'epoch': 2.01}


 41%|████      | 1030/2530 [2:21:59<3:07:06,  7.48s/it] 

{'loss': 1.1093, 'grad_norm': 2.3042070865631104, 'learning_rate': 1.931526411652967e-05, 'epoch': 2.03}


 41%|████      | 1040/2530 [2:23:04<2:41:56,  6.52s/it]

{'loss': 1.0992, 'grad_norm': 2.155867576599121, 'learning_rate': 1.913654968701478e-05, 'epoch': 2.05}


 42%|████▏     | 1050/2530 [2:24:09<2:39:40,  6.47s/it]

{'loss': 1.1026, 'grad_norm': 1.4660122394561768, 'learning_rate': 1.8957197447435458e-05, 'epoch': 2.07}


 42%|████▏     | 1060/2530 [2:25:14<2:39:01,  6.49s/it]

{'loss': 1.1097, 'grad_norm': 2.9505527019500732, 'learning_rate': 1.8777235051917027e-05, 'epoch': 2.09}


 42%|████▏     | 1070/2530 [2:26:19<2:37:58,  6.49s/it]

{'loss': 1.095, 'grad_norm': 3.6491384506225586, 'learning_rate': 1.8596690248664103e-05, 'epoch': 2.11}


 43%|████▎     | 1080/2530 [2:27:24<2:37:00,  6.50s/it]

{'loss': 1.1013, 'grad_norm': 1.9663735628128052, 'learning_rate': 1.8415590875682098e-05, 'epoch': 2.13}


 43%|████▎     | 1090/2530 [2:28:29<2:35:59,  6.50s/it]

{'loss': 1.1087, 'grad_norm': 1.1590845584869385, 'learning_rate': 1.8233964856484924e-05, 'epoch': 2.15}


 43%|████▎     | 1100/2530 [2:29:34<2:35:01,  6.50s/it]

{'loss': 1.1086, 'grad_norm': 2.402301073074341, 'learning_rate': 1.805184019578951e-05, 'epoch': 2.17}


 44%|████▍     | 1110/2530 [2:30:38<2:33:48,  6.50s/it]

{'loss': 1.1019, 'grad_norm': 2.675626039505005, 'learning_rate': 1.7869244975197752e-05, 'epoch': 2.19}


 44%|████▍     | 1120/2530 [2:31:43<2:32:43,  6.50s/it]

{'loss': 1.0874, 'grad_norm': 3.5328590869903564, 'learning_rate': 1.7686207348866677e-05, 'epoch': 2.21}


 45%|████▍     | 1130/2530 [2:32:48<2:31:35,  6.50s/it]

{'loss': 1.0949, 'grad_norm': 1.7119258642196655, 'learning_rate': 1.750275553916736e-05, 'epoch': 2.23}


 45%|████▌     | 1140/2530 [2:33:53<2:30:27,  6.49s/it]

{'loss': 1.0986, 'grad_norm': 1.3919862508773804, 'learning_rate': 1.7318917832333356e-05, 'epoch': 2.25}


 45%|████▌     | 1150/2530 [2:34:58<2:29:06,  6.48s/it]

{'loss': 1.1036, 'grad_norm': 2.5913338661193848, 'learning_rate': 1.713472257409928e-05, 'epoch': 2.27}


 46%|████▌     | 1160/2530 [2:36:03<2:28:27,  6.50s/it]

{'loss': 1.0968, 'grad_norm': 2.9442477226257324, 'learning_rate': 1.69501981653302e-05, 'epoch': 2.29}


 46%|████▌     | 1170/2530 [2:37:08<2:27:19,  6.50s/it]

{'loss': 1.1127, 'grad_norm': 4.0179972648620605, 'learning_rate': 1.6765373057642554e-05, 'epoch': 2.31}


 47%|████▋     | 1180/2530 [2:38:13<2:26:12,  6.50s/it]

{'loss': 1.101, 'grad_norm': 1.51995849609375, 'learning_rate': 1.6580275749017208e-05, 'epoch': 2.33}


 47%|████▋     | 1190/2530 [2:39:18<2:24:43,  6.48s/it]

{'loss': 1.1031, 'grad_norm': 2.8017945289611816, 'learning_rate': 1.639493477940539e-05, 'epoch': 2.35}


 47%|████▋     | 1200/2530 [2:40:23<2:23:54,  6.49s/it]

{'loss': 1.1111, 'grad_norm': 2.223665952682495, 'learning_rate': 1.6209378726328168e-05, 'epoch': 2.37}


 48%|████▊     | 1210/2530 [2:41:27<2:22:54,  6.50s/it]

{'loss': 1.1041, 'grad_norm': 2.1170573234558105, 'learning_rate': 1.6023636200470068e-05, 'epoch': 2.39}


 48%|████▊     | 1220/2530 [2:42:32<2:21:29,  6.48s/it]

{'loss': 1.1079, 'grad_norm': 2.9590256214141846, 'learning_rate': 1.583773584126766e-05, 'epoch': 2.41}


 49%|████▊     | 1230/2530 [2:43:37<2:20:25,  6.48s/it]

{'loss': 1.0945, 'grad_norm': 3.0380055904388428, 'learning_rate': 1.565170631249367e-05, 'epoch': 2.43}


 49%|████▉     | 1240/2530 [2:44:41<2:19:03,  6.47s/it]

{'loss': 1.0966, 'grad_norm': 2.144970417022705, 'learning_rate': 1.5465576297837337e-05, 'epoch': 2.45}


 49%|████▉     | 1250/2530 [2:45:46<2:18:17,  6.48s/it]

{'loss': 1.0951, 'grad_norm': 1.168903112411499, 'learning_rate': 1.5279374496481708e-05, 'epoch': 2.47}


 50%|████▉     | 1260/2530 [2:46:51<2:17:12,  6.48s/it]

{'loss': 1.1014, 'grad_norm': 2.0965237617492676, 'learning_rate': 1.5093129618678527e-05, 'epoch': 2.49}


 50%|█████     | 1270/2530 [2:47:55<2:16:03,  6.48s/it]

{'loss': 1.1024, 'grad_norm': 2.8617501258850098, 'learning_rate': 1.4906870381321476e-05, 'epoch': 2.51}


 51%|█████     | 1280/2530 [2:49:00<2:15:10,  6.49s/it]

{'loss': 1.0875, 'grad_norm': 2.158494472503662, 'learning_rate': 1.47206255035183e-05, 'epoch': 2.53}


 51%|█████     | 1290/2530 [2:50:05<2:13:50,  6.48s/it]

{'loss': 1.1096, 'grad_norm': 1.5240864753723145, 'learning_rate': 1.4534423702162662e-05, 'epoch': 2.55}


 51%|█████▏    | 1300/2530 [2:51:10<2:12:56,  6.48s/it]

{'loss': 1.1022, 'grad_norm': 2.2761449813842773, 'learning_rate': 1.434829368750633e-05, 'epoch': 2.57}


 52%|█████▏    | 1310/2530 [2:52:14<2:11:58,  6.49s/it]

{'loss': 1.1042, 'grad_norm': 2.215200662612915, 'learning_rate': 1.4162264158732343e-05, 'epoch': 2.59}


 52%|█████▏    | 1320/2530 [2:53:19<2:10:57,  6.49s/it]

{'loss': 1.1079, 'grad_norm': 1.698764443397522, 'learning_rate': 1.3976363799529938e-05, 'epoch': 2.61}


 53%|█████▎    | 1330/2530 [2:54:24<2:09:52,  6.49s/it]

{'loss': 1.1105, 'grad_norm': 4.84207010269165, 'learning_rate': 1.3790621273671834e-05, 'epoch': 2.63}


 53%|█████▎    | 1340/2530 [2:55:29<2:08:39,  6.49s/it]

{'loss': 1.1051, 'grad_norm': 3.454545259475708, 'learning_rate': 1.3605065220594605e-05, 'epoch': 2.65}


 53%|█████▎    | 1350/2530 [2:56:33<2:07:39,  6.49s/it]

{'loss': 1.1001, 'grad_norm': 4.542194843292236, 'learning_rate': 1.3419724250982795e-05, 'epoch': 2.67}


 54%|█████▍    | 1360/2530 [2:57:38<2:06:40,  6.50s/it]

{'loss': 1.0858, 'grad_norm': 2.3242154121398926, 'learning_rate': 1.3234626942357447e-05, 'epoch': 2.69}


 54%|█████▍    | 1370/2530 [2:58:43<2:05:08,  6.47s/it]

{'loss': 1.1243, 'grad_norm': 1.715355634689331, 'learning_rate': 1.3049801834669801e-05, 'epoch': 2.7}


 55%|█████▍    | 1380/2530 [2:59:47<2:04:10,  6.48s/it]

{'loss': 1.0955, 'grad_norm': 3.349112033843994, 'learning_rate': 1.2865277425900725e-05, 'epoch': 2.72}


 55%|█████▍    | 1390/2530 [3:00:52<2:03:03,  6.48s/it]

{'loss': 1.1006, 'grad_norm': 1.8213557004928589, 'learning_rate': 1.2681082167666647e-05, 'epoch': 2.74}


 55%|█████▌    | 1400/2530 [3:01:57<2:02:11,  6.49s/it]

{'loss': 1.1062, 'grad_norm': 2.8365318775177, 'learning_rate': 1.2497244460832644e-05, 'epoch': 2.76}


 56%|█████▌    | 1410/2530 [3:03:02<2:01:15,  6.50s/it]

{'loss': 1.0991, 'grad_norm': 2.0795340538024902, 'learning_rate': 1.2313792651133326e-05, 'epoch': 2.78}


 56%|█████▌    | 1420/2530 [3:04:06<1:59:54,  6.48s/it]

{'loss': 1.1026, 'grad_norm': 4.228910446166992, 'learning_rate': 1.2130755024802252e-05, 'epoch': 2.8}


 57%|█████▋    | 1430/2530 [3:05:11<1:59:00,  6.49s/it]

{'loss': 1.1009, 'grad_norm': 1.884877324104309, 'learning_rate': 1.1948159804210497e-05, 'epoch': 2.82}


 57%|█████▋    | 1440/2530 [3:06:16<1:57:44,  6.48s/it]

{'loss': 1.1118, 'grad_norm': 1.7092621326446533, 'learning_rate': 1.1766035143515077e-05, 'epoch': 2.84}


 57%|█████▋    | 1450/2530 [3:07:20<1:56:41,  6.48s/it]

{'loss': 1.0973, 'grad_norm': 1.0999305248260498, 'learning_rate': 1.1584409124317906e-05, 'epoch': 2.86}


 58%|█████▊    | 1460/2530 [3:08:25<1:55:40,  6.49s/it]

{'loss': 1.1015, 'grad_norm': 3.0418310165405273, 'learning_rate': 1.14033097513359e-05, 'epoch': 2.88}


 58%|█████▊    | 1470/2530 [3:09:30<1:54:23,  6.47s/it]

{'loss': 1.1063, 'grad_norm': 2.4686412811279297, 'learning_rate': 1.1222764948082973e-05, 'epoch': 2.9}


 58%|█████▊    | 1480/2530 [3:10:35<1:53:29,  6.49s/it]

{'loss': 1.0997, 'grad_norm': 2.3565738201141357, 'learning_rate': 1.1042802552564544e-05, 'epoch': 2.92}


 59%|█████▉    | 1490/2530 [3:11:39<1:52:17,  6.48s/it]

{'loss': 1.112, 'grad_norm': 3.2330892086029053, 'learning_rate': 1.086345031298522e-05, 'epoch': 2.94}


 59%|█████▉    | 1500/2530 [3:12:44<1:51:19,  6.49s/it]

{'loss': 1.0973, 'grad_norm': 4.444709777832031, 'learning_rate': 1.0684735883470333e-05, 'epoch': 2.96}


 60%|█████▉    | 1510/2530 [3:13:49<1:50:16,  6.49s/it]

{'loss': 1.0907, 'grad_norm': 2.9566493034362793, 'learning_rate': 1.0506686819801979e-05, 'epoch': 2.98}


                                                       
 60%|██████    | 1519/2530 [3:28:57<1:49:11,  6.48s/it]

{'eval_loss': 1.1332805156707764, 'eval_runtime': 847.3466, 'eval_samples_per_second': 1.195, 'eval_steps_per_second': 0.598, 'epoch': 3.0}


 60%|██████    | 1520/2530 [3:29:03<73:19:39, 261.37s/it]

{'loss': 1.0711, 'grad_norm': 3.0342366695404053, 'learning_rate': 1.0329330575170221e-05, 'epoch': 3.0}


 60%|██████    | 1530/2530 [3:30:08<3:48:05, 13.69s/it]  

{'loss': 1.0748, 'grad_norm': 4.88809061050415, 'learning_rate': 1.0152694495940119e-05, 'epoch': 3.02}


 61%|██████    | 1540/2530 [3:31:13<1:50:20,  6.69s/it]

{'loss': 1.0931, 'grad_norm': 1.9179832935333252, 'learning_rate': 9.976805817435208e-06, 'epoch': 3.04}


 61%|██████▏   | 1550/2530 [3:32:17<1:45:47,  6.48s/it]

{'loss': 0.9972, 'grad_norm': 7.412996292114258, 'learning_rate': 9.80169165973814e-06, 'epoch': 3.06}


 62%|██████▏   | 1560/2530 [3:33:22<1:44:42,  6.48s/it]

{'loss': 1.0546, 'grad_norm': 17.79094123840332, 'learning_rate': 9.662176137124742e-06, 'epoch': 3.08}


 62%|██████▏   | 1570/2530 [3:34:27<1:43:56,  6.50s/it]

{'loss': 1.0704, 'grad_norm': 4.092397212982178, 'learning_rate': 9.48852407563511e-06, 'epoch': 3.1}


 62%|██████▏   | 1580/2530 [3:35:31<1:42:48,  6.49s/it]

{'loss': 0.9834, 'grad_norm': 5.293342590332031, 'learning_rate': 9.315721822580148e-06, 'epoch': 3.12}


 63%|██████▎   | 1590/2530 [3:36:36<1:41:48,  6.50s/it]

{'loss': 1.0667, 'grad_norm': 4.342841148376465, 'learning_rate': 9.143796022149936e-06, 'epoch': 3.14}


 63%|██████▎   | 1600/2530 [3:37:41<1:40:43,  6.50s/it]

{'loss': 1.0018, 'grad_norm': 5.999962329864502, 'learning_rate': 8.972773183395299e-06, 'epoch': 3.16}


 64%|██████▎   | 1610/2530 [3:38:46<1:39:44,  6.50s/it]

{'loss': 0.9285, 'grad_norm': 8.573980331420898, 'learning_rate': 8.802679676140373e-06, 'epoch': 3.18}


 64%|██████▍   | 1620/2530 [3:39:51<1:38:26,  6.49s/it]

{'loss': 0.9346, 'grad_norm': 5.249029159545898, 'learning_rate': 8.633541726916698e-06, 'epoch': 3.2}


 64%|██████▍   | 1630/2530 [3:40:56<1:37:14,  6.48s/it]

{'loss': 0.9654, 'grad_norm': 4.864335060119629, 'learning_rate': 8.465385414919363e-06, 'epoch': 3.22}


 65%|██████▍   | 1640/2530 [3:42:00<1:36:13,  6.49s/it]

{'loss': 0.9219, 'grad_norm': 5.9684648513793945, 'learning_rate': 8.298236667985894e-06, 'epoch': 3.24}


 65%|██████▌   | 1650/2530 [3:43:05<1:35:01,  6.48s/it]

{'loss': 0.9375, 'grad_norm': 4.694262981414795, 'learning_rate': 8.13212125859846e-06, 'epoch': 3.26}


 66%|██████▌   | 1660/2530 [3:44:10<1:34:06,  6.49s/it]

{'loss': 0.9394, 'grad_norm': 3.5855469703674316, 'learning_rate': 7.967064799910043e-06, 'epoch': 3.28}


 66%|██████▌   | 1670/2530 [3:45:15<1:33:01,  6.49s/it]

{'loss': 1.0385, 'grad_norm': 6.922179698944092, 'learning_rate': 7.803092741795183e-06, 'epoch': 3.3}


 66%|██████▋   | 1680/2530 [3:46:19<1:31:54,  6.49s/it]

{'loss': 0.9327, 'grad_norm': 3.674916982650757, 'learning_rate': 7.640230366925881e-06, 'epoch': 3.32}


 67%|██████▋   | 1690/2530 [3:47:24<1:30:44,  6.48s/it]

{'loss': 0.9604, 'grad_norm': 5.4354448318481445, 'learning_rate': 7.478502786873288e-06, 'epoch': 3.34}


 67%|██████▋   | 1700/2530 [3:48:29<1:29:36,  6.48s/it]

{'loss': 0.95, 'grad_norm': 7.069309711456299, 'learning_rate': 7.317934938235802e-06, 'epoch': 3.36}


 68%|██████▊   | 1710/2530 [3:49:34<1:28:42,  6.49s/it]

{'loss': 0.9483, 'grad_norm': 5.618178844451904, 'learning_rate': 7.158551578794089e-06, 'epoch': 3.38}


 68%|██████▊   | 1720/2530 [3:50:38<1:27:41,  6.50s/it]

{'loss': 0.9814, 'grad_norm': 4.6456193923950195, 'learning_rate': 7.000377283693728e-06, 'epoch': 3.4}


 68%|██████▊   | 1730/2530 [3:51:43<1:26:37,  6.50s/it]

{'loss': 1.0093, 'grad_norm': 5.172972679138184, 'learning_rate': 6.843436441655988e-06, 'epoch': 3.42}


 69%|██████▉   | 1740/2530 [3:52:48<1:25:30,  6.49s/it]

{'loss': 0.9464, 'grad_norm': 11.002119064331055, 'learning_rate': 6.68775325121738e-06, 'epoch': 3.44}


 69%|██████▉   | 1750/2530 [3:53:53<1:24:15,  6.48s/it]

{'loss': 0.9161, 'grad_norm': 10.811800956726074, 'learning_rate': 6.533351716998466e-06, 'epoch': 3.46}


 70%|██████▉   | 1760/2530 [3:54:57<1:23:13,  6.48s/it]

{'loss': 0.8014, 'grad_norm': 8.041816711425781, 'learning_rate': 6.380255646002654e-06, 'epoch': 3.47}


 70%|██████▉   | 1770/2530 [3:56:02<1:22:10,  6.49s/it]

{'loss': 0.957, 'grad_norm': 2.8815455436706543, 'learning_rate': 6.228488643945408e-06, 'epoch': 3.49}


 70%|███████   | 1780/2530 [3:57:07<1:20:44,  6.46s/it]

{'loss': 0.9551, 'grad_norm': 10.968076705932617, 'learning_rate': 6.078074111614502e-06, 'epoch': 3.51}


 71%|███████   | 1790/2530 [3:58:11<1:20:03,  6.49s/it]

{'loss': 0.87, 'grad_norm': 4.634321689605713, 'learning_rate': 5.929035241261899e-06, 'epoch': 3.53}


 71%|███████   | 1800/2530 [3:59:16<1:18:50,  6.48s/it]

{'loss': 0.9068, 'grad_norm': 6.077610015869141, 'learning_rate': 5.781395013027732e-06, 'epoch': 3.55}


 72%|███████▏  | 1810/2530 [4:00:21<1:17:52,  6.49s/it]

{'loss': 0.8498, 'grad_norm': 7.696475028991699, 'learning_rate': 5.635176191397048e-06, 'epoch': 3.57}


 72%|███████▏  | 1820/2530 [4:01:25<1:16:38,  6.48s/it]

{'loss': 0.812, 'grad_norm': 5.2518630027771, 'learning_rate': 5.490401321689762e-06, 'epoch': 3.59}


 72%|███████▏  | 1830/2530 [4:02:30<1:15:39,  6.48s/it]

{'loss': 0.9106, 'grad_norm': 11.879291534423828, 'learning_rate': 5.34709272658442e-06, 'epoch': 3.61}


 73%|███████▎  | 1840/2530 [4:03:35<1:14:35,  6.49s/it]

{'loss': 0.923, 'grad_norm': 7.267664432525635, 'learning_rate': 5.205272502676317e-06, 'epoch': 3.63}


 73%|███████▎  | 1850/2530 [4:04:40<1:13:24,  6.48s/it]

{'loss': 0.8918, 'grad_norm': 9.875836372375488, 'learning_rate': 5.0649625170703885e-06, 'epoch': 3.65}


 74%|███████▎  | 1860/2530 [4:05:44<1:12:28,  6.49s/it]

{'loss': 0.8569, 'grad_norm': 5.547942638397217, 'learning_rate': 4.926184404009605e-06, 'epoch': 3.67}


 74%|███████▍  | 1870/2530 [4:06:49<1:11:28,  6.50s/it]

{'loss': 0.7308, 'grad_norm': 9.12745189666748, 'learning_rate': 4.7889595615391775e-06, 'epoch': 3.69}


 74%|███████▍  | 1880/2530 [4:07:54<1:10:16,  6.49s/it]

{'loss': 0.734, 'grad_norm': 7.3858137130737305, 'learning_rate': 4.65330914820723e-06, 'epoch': 3.71}


 75%|███████▍  | 1890/2530 [4:08:59<1:09:07,  6.48s/it]

{'loss': 0.9383, 'grad_norm': 12.040438652038574, 'learning_rate': 4.519254079802386e-06, 'epoch': 3.73}


 75%|███████▌  | 1900/2530 [4:10:03<1:08:00,  6.48s/it]

{'loss': 0.8572, 'grad_norm': 6.978471755981445, 'learning_rate': 4.386815026128778e-06, 'epoch': 3.75}


 75%|███████▌  | 1910/2530 [4:11:08<1:07:01,  6.49s/it]

{'loss': 0.9254, 'grad_norm': 11.284137725830078, 'learning_rate': 4.256012407818996e-06, 'epoch': 3.77}


 76%|███████▌  | 1920/2530 [4:12:13<1:06:02,  6.50s/it]

{'loss': 0.8874, 'grad_norm': 17.712158203125, 'learning_rate': 4.1268663931854585e-06, 'epoch': 3.79}


 76%|███████▋  | 1930/2530 [4:13:18<1:04:53,  6.49s/it]

{'loss': 0.8042, 'grad_norm': 5.439753532409668, 'learning_rate': 3.999396895110668e-06, 'epoch': 3.81}


 77%|███████▋  | 1940/2530 [4:14:22<1:03:54,  6.50s/it]

{'loss': 0.9138, 'grad_norm': 9.912501335144043, 'learning_rate': 3.8736235679768985e-06, 'epoch': 3.83}


 77%|███████▋  | 1950/2530 [4:15:27<1:02:44,  6.49s/it]

{'loss': 0.8068, 'grad_norm': 9.117073059082031, 'learning_rate': 3.7495658046356556e-06, 'epoch': 3.85}


 77%|███████▋  | 1960/2530 [4:16:32<1:01:40,  6.49s/it]

{'loss': 0.8139, 'grad_norm': 6.98477840423584, 'learning_rate': 3.6272427334175628e-06, 'epoch': 3.87}


 78%|███████▊  | 1970/2530 [4:17:37<1:00:32,  6.49s/it]

{'loss': 0.8403, 'grad_norm': 10.16403579711914, 'learning_rate': 3.506673215182957e-06, 'epoch': 3.89}


 78%|███████▊  | 1980/2530 [4:18:42<59:26,  6.49s/it]  

{'loss': 0.7291, 'grad_norm': 6.269925117492676, 'learning_rate': 3.3878758404137627e-06, 'epoch': 3.91}


 79%|███████▊  | 1990/2530 [4:19:46<58:26,  6.49s/it]

{'loss': 0.9462, 'grad_norm': 11.131715774536133, 'learning_rate': 3.2708689263470565e-06, 'epoch': 3.93}


 79%|███████▉  | 2000/2530 [4:20:51<57:22,  6.49s/it]

{'loss': 0.9172, 'grad_norm': 13.193572998046875, 'learning_rate': 3.1556705141507245e-06, 'epoch': 3.95}


 79%|███████▉  | 2010/2530 [4:21:56<56:06,  6.47s/it]

{'loss': 0.7556, 'grad_norm': 11.998347282409668, 'learning_rate': 3.042298366141738e-06, 'epoch': 3.97}


 80%|███████▉  | 2020/2530 [4:23:00<55:04,  6.48s/it]

{'loss': 0.7583, 'grad_norm': 8.6522855758667, 'learning_rate': 2.930769963047394e-06, 'epoch': 3.99}


                                                     
 80%|████████  | 2026/2530 [4:40:10<53:08,  6.33s/it]

{'eval_loss': 0.8006295561790466, 'eval_runtime': 991.9253, 'eval_samples_per_second': 1.021, 'eval_steps_per_second': 0.511, 'epoch': 4.0}


 80%|████████  | 2030/2530 [4:40:39<15:06:41, 108.80s/it]

{'loss': 0.7541, 'grad_norm': 11.304121017456055, 'learning_rate': 2.8211025013099806e-06, 'epoch': 4.01}


 81%|████████  | 2040/2530 [4:41:44<1:16:37,  9.38s/it]  

{'loss': 0.8002, 'grad_norm': 8.463007926940918, 'learning_rate': 2.713312890435277e-06, 'epoch': 4.03}


 81%|████████  | 2050/2530 [4:42:49<52:32,  6.57s/it]  

{'loss': 0.8808, 'grad_norm': 6.70296049118042, 'learning_rate': 2.6074177503852937e-06, 'epoch': 4.05}


 81%|████████▏ | 2060/2530 [4:43:54<50:50,  6.49s/it]

{'loss': 0.9161, 'grad_norm': 13.414787292480469, 'learning_rate': 2.5034334090156608e-06, 'epoch': 4.07}


 82%|████████▏ | 2070/2530 [4:44:58<49:46,  6.49s/it]

{'loss': 0.7668, 'grad_norm': 8.39021110534668, 'learning_rate': 2.4013758995580528e-06, 'epoch': 4.09}


 82%|████████▏ | 2080/2530 [4:46:03<48:42,  6.49s/it]

{'loss': 0.8573, 'grad_norm': 21.08426284790039, 'learning_rate': 2.3012609581480486e-06, 'epoch': 4.11}


 83%|████████▎ | 2090/2530 [4:47:08<47:34,  6.49s/it]

{'loss': 0.7019, 'grad_norm': 8.095500946044922, 'learning_rate': 2.2031040213987804e-06, 'epoch': 4.13}


 83%|████████▎ | 2100/2530 [4:48:13<46:30,  6.49s/it]

{'loss': 0.8291, 'grad_norm': 9.17888355255127, 'learning_rate': 2.10692022402079e-06, 'epoch': 4.15}


 83%|████████▎ | 2110/2530 [4:49:18<45:26,  6.49s/it]

{'loss': 0.7647, 'grad_norm': 9.185944557189941, 'learning_rate': 2.012724396488419e-06, 'epoch': 4.17}


 84%|████████▍ | 2120/2530 [4:50:22<43:54,  6.43s/it]

{'loss': 0.7499, 'grad_norm': 10.850695610046387, 'learning_rate': 1.920531062753118e-06, 'epoch': 4.19}


 84%|████████▍ | 2130/2530 [4:51:26<43:09,  6.47s/it]

{'loss': 0.6765, 'grad_norm': 15.76744270324707, 'learning_rate': 1.83035443800402e-06, 'epoch': 4.21}


 85%|████████▍ | 2140/2530 [4:52:31<42:04,  6.47s/it]

{'loss': 0.7398, 'grad_norm': 6.349808692932129, 'learning_rate': 1.742208426476093e-06, 'epoch': 4.23}


 85%|████████▍ | 2150/2530 [4:53:36<41:03,  6.48s/it]

{'loss': 0.718, 'grad_norm': 18.586870193481445, 'learning_rate': 1.6561066193062912e-06, 'epoch': 4.24}


 85%|████████▌ | 2160/2530 [4:54:40<39:57,  6.48s/it]

{'loss': 0.8302, 'grad_norm': 5.394116401672363, 'learning_rate': 1.5720622924379295e-06, 'epoch': 4.26}


 86%|████████▌ | 2170/2530 [4:55:45<38:55,  6.49s/it]

{'loss': 0.6876, 'grad_norm': 9.859572410583496, 'learning_rate': 1.4900884045736922e-06, 'epoch': 4.28}


 86%|████████▌ | 2180/2530 [4:56:50<37:51,  6.49s/it]

{'loss': 0.8178, 'grad_norm': 11.324897766113281, 'learning_rate': 1.4101975951775482e-06, 'epoch': 4.3}


 87%|████████▋ | 2190/2530 [4:57:55<36:44,  6.48s/it]

{'loss': 0.8623, 'grad_norm': 15.308244705200195, 'learning_rate': 1.3324021825258692e-06, 'epoch': 4.32}


 87%|████████▋ | 2200/2530 [4:58:59<35:39,  6.48s/it]

{'loss': 0.745, 'grad_norm': 7.48248291015625, 'learning_rate': 1.2567141618081056e-06, 'epoch': 4.34}


 87%|████████▋ | 2210/2530 [5:00:04<34:35,  6.48s/it]

{'loss': 0.7175, 'grad_norm': 14.235809326171875, 'learning_rate': 1.1831452032772499e-06, 'epoch': 4.36}


 88%|████████▊ | 2220/2530 [5:01:09<33:26,  6.47s/it]

{'loss': 0.745, 'grad_norm': 14.712079048156738, 'learning_rate': 1.1117066504504158e-06, 'epoch': 4.38}


 88%|████████▊ | 2230/2530 [5:02:13<32:24,  6.48s/it]

{'loss': 0.7993, 'grad_norm': 17.00355339050293, 'learning_rate': 1.0424095183598098e-06, 'epoch': 4.4}


 89%|████████▊ | 2240/2530 [5:03:18<31:20,  6.48s/it]

{'loss': 0.5189, 'grad_norm': 7.863236904144287, 'learning_rate': 9.752644918543008e-07, 'epoch': 4.42}


 89%|████████▉ | 2250/2530 [5:04:23<30:12,  6.47s/it]

{'loss': 0.8039, 'grad_norm': 18.050479888916016, 'learning_rate': 9.102819239519683e-07, 'epoch': 4.44}


 89%|████████▉ | 2260/2530 [5:05:27<29:05,  6.46s/it]

{'loss': 0.7518, 'grad_norm': 12.14304256439209, 'learning_rate': 8.474718342437643e-07, 'epoch': 4.46}


 90%|████████▉ | 2270/2530 [5:06:32<28:04,  6.48s/it]

{'loss': 0.7931, 'grad_norm': 12.990486145019531, 'learning_rate': 7.868439073486073e-07, 'epoch': 4.48}


 90%|█████████ | 2280/2530 [5:07:37<27:02,  6.49s/it]

{'loss': 0.7615, 'grad_norm': 11.650672912597656, 'learning_rate': 7.2840749142013e-07, 'epoch': 4.5}


 91%|█████████ | 2290/2530 [5:08:42<25:57,  6.49s/it]

{'loss': 0.6934, 'grad_norm': 7.159228324890137, 'learning_rate': 6.721715967052794e-07, 'epoch': 4.52}


 91%|█████████ | 2300/2530 [5:09:46<24:54,  6.50s/it]

{'loss': 0.9263, 'grad_norm': 10.02138900756836, 'learning_rate': 6.181448941550543e-07, 'epoch': 4.54}


 91%|█████████▏| 2310/2530 [5:10:51<23:44,  6.48s/it]

{'loss': 0.761, 'grad_norm': 11.731865882873535, 'learning_rate': 5.663357140875303e-07, 'epoch': 4.56}


 92%|█████████▏| 2320/2530 [5:11:56<22:37,  6.47s/it]

{'loss': 0.6946, 'grad_norm': 10.577459335327148, 'learning_rate': 5.16752044903423e-07, 'epoch': 4.58}


 92%|█████████▏| 2330/2530 [5:13:00<21:26,  6.43s/it]

{'loss': 0.8942, 'grad_norm': 9.19836139678955, 'learning_rate': 4.69401531854366e-07, 'epoch': 4.6}


 92%|█████████▏| 2340/2530 [5:14:05<20:34,  6.50s/it]

{'loss': 0.6942, 'grad_norm': 11.419611930847168, 'learning_rate': 4.242914758640848e-07, 'epoch': 4.62}


 93%|█████████▎| 2350/2530 [5:15:10<19:30,  6.50s/it]

{'loss': 0.7775, 'grad_norm': 11.451339721679688, 'learning_rate': 3.8142883240269767e-07, 'epoch': 4.64}


 93%|█████████▎| 2360/2530 [5:16:14<18:21,  6.48s/it]

{'loss': 0.7302, 'grad_norm': 14.956415176391602, 'learning_rate': 3.4082021041423826e-07, 'epoch': 4.66}


 94%|█████████▎| 2370/2530 [5:17:19<17:16,  6.48s/it]

{'loss': 0.804, 'grad_norm': 12.54621696472168, 'learning_rate': 3.0247187129764266e-07, 'epoch': 4.68}


 94%|█████████▍| 2380/2530 [5:18:24<16:12,  6.48s/it]

{'loss': 0.7354, 'grad_norm': 7.35681676864624, 'learning_rate': 2.66389727941308e-07, 'epoch': 4.7}


 94%|█████████▍| 2390/2530 [5:19:28<15:08,  6.49s/it]

{'loss': 0.7277, 'grad_norm': 14.896217346191406, 'learning_rate': 2.3257934381138714e-07, 'epoch': 4.72}


 95%|█████████▍| 2400/2530 [5:20:33<14:04,  6.49s/it]

{'loss': 0.7112, 'grad_norm': 6.409152507781982, 'learning_rate': 2.0104593209397116e-07, 'epoch': 4.74}


 95%|█████████▌| 2410/2530 [5:21:38<12:58,  6.49s/it]

{'loss': 0.5852, 'grad_norm': 7.409276008605957, 'learning_rate': 1.717943548912626e-07, 'epoch': 4.76}


 96%|█████████▌| 2420/2530 [5:22:43<11:53,  6.49s/it]

{'loss': 0.8388, 'grad_norm': 10.340348243713379, 'learning_rate': 1.4482912247190506e-07, 'epoch': 4.78}


 96%|█████████▌| 2430/2530 [5:23:47<10:42,  6.43s/it]

{'loss': 0.7769, 'grad_norm': 17.462360382080078, 'learning_rate': 1.2015439257554274e-07, 'epoch': 4.8}


 96%|█████████▋| 2440/2530 [5:24:51<09:41,  6.46s/it]

{'loss': 0.8206, 'grad_norm': 7.803554058074951, 'learning_rate': 9.777396977174668e-08, 'epoch': 4.82}


 97%|█████████▋| 2450/2530 [5:25:56<08:37,  6.47s/it]

{'loss': 0.8422, 'grad_norm': 9.731578826904297, 'learning_rate': 7.769130487339071e-08, 'epoch': 4.84}


 97%|█████████▋| 2460/2530 [5:27:00<07:33,  6.47s/it]

{'loss': 0.8213, 'grad_norm': 18.144376754760742, 'learning_rate': 5.99094944045736e-08, 'epoch': 4.86}


 98%|█████████▊| 2470/2530 [5:28:05<06:29,  6.50s/it]

{'loss': 0.6953, 'grad_norm': 8.089004516601562, 'learning_rate': 4.4431280123169436e-08, 'epoch': 4.88}


 98%|█████████▊| 2480/2530 [5:29:10<05:24,  6.49s/it]

{'loss': 0.661, 'grad_norm': 8.362761497497559, 'learning_rate': 3.125904859808237e-08, 'epoch': 4.9}


 98%|█████████▊| 2490/2530 [5:30:15<04:18,  6.47s/it]

{'loss': 0.8195, 'grad_norm': 14.074579238891602, 'learning_rate': 2.0394830841259373e-08, 'epoch': 4.92}


 99%|█████████▉| 2500/2530 [5:31:19<03:11,  6.37s/it]

{'loss': 0.729, 'grad_norm': 13.121773719787598, 'learning_rate': 1.1840301994535674e-08, 'epoch': 4.94}


 99%|█████████▉| 2510/2530 [5:32:24<02:09,  6.48s/it]

{'loss': 0.8098, 'grad_norm': 10.476983070373535, 'learning_rate': 5.596781071339763e-09, 'epoch': 4.96}


100%|█████████▉| 2520/2530 [5:33:29<01:04,  6.47s/it]

{'loss': 0.7323, 'grad_norm': 9.920268058776855, 'learning_rate': 1.6652307533260303e-09, 'epoch': 4.98}


100%|██████████| 2530/2530 [5:34:33<00:00,  6.46s/it]

{'loss': 0.6974, 'grad_norm': 13.007787704467773, 'learning_rate': 4.625724192686143e-11, 'epoch': 5.0}


                                                     
100%|██████████| 2530/2530 [5:51:10<00:00,  6.46s/it]

{'eval_loss': 0.7807053327560425, 'eval_runtime': 995.0304, 'eval_samples_per_second': 1.018, 'eval_steps_per_second': 0.51, 'epoch': 5.0}


100%|██████████| 2530/2530 [5:51:12<00:00,  8.33s/it]

{'train_runtime': 21072.8878, 'train_samples_per_second': 0.961, 'train_steps_per_second': 0.12, 'train_loss': 0.9983279559923255, 'epoch': 5.0}





TrainOutput(global_step=2530, training_loss=0.9983279559923255, metrics={'train_runtime': 21072.8878, 'train_samples_per_second': 0.961, 'train_steps_per_second': 0.12, 'total_flos': 1.3309223087812608e+16, 'train_loss': 0.9983279559923255, 'epoch': 4.995064165844028})

In [48]:
print(f"Number of training examples: {len(train_dataset)}")
print(f"Batch size: {training_args.per_device_train_batch_size}")


Number of training examples: 4051
Batch size: 2


In [49]:
# Save the trained model and tokenizer
model.save_pretrained('./longformer_model3')
tokenizer.save_pretrained('./longformer_model3')

('./longformer_model3\\tokenizer_config.json',
 './longformer_model3\\special_tokens_map.json',
 './longformer_model3\\vocab.json',
 './longformer_model3\\merges.txt',
 './longformer_model3\\added_tokens.json')

In [50]:
# Evaluate the model
results = trainer.evaluate()

100%|██████████| 507/507 [16:35<00:00,  1.96s/it]


In [51]:
print(results)

{'eval_loss': 0.7807053327560425, 'eval_runtime': 997.624, 'eval_samples_per_second': 1.015, 'eval_steps_per_second': 0.508, 'epoch': 4.995064165844028}


In [52]:
# After training, get predictions on the validation dataset
predictions = trainer.predict(val_dataset)

# Get the predicted logits
logits = predictions.predictions

# Convert logits to predicted class labels
predicted_labels = np.argmax(logits, axis=1)

# Get the true labels
true_labels = val_labels

#  Calculate metrics
accuracy = accuracy_score(true_labels, predicted_labels)
f1 = f1_score(true_labels, predicted_labels, average='weighted')
precision = precision_score(true_labels, predicted_labels, average='weighted')
recall = recall_score(true_labels, predicted_labels, average='weighted')

print(f"Accuracy: {accuracy:.4f}")
print(f"F1 Score: {f1:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")

100%|██████████| 507/507 [16:40<00:00,  1.97s/it]

Accuracy: 0.6614
F1 Score: 0.6564
Precision: 0.6601
Recall: 0.6614





In [None]:
import random
import torch
import time
from transformers import LongformerForSequenceClassification, LongformerTokenizer

torch.cuda.empty_cache()


# Set random seed for reproducibility
random.seed(42)

# Load the saved model and tokenizer
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model_path = './longformer_model3'  # Path to your saved model
model = LongformerForSequenceClassification.from_pretrained(model_path).to(device)
tokenizer = LongformerTokenizer.from_pretrained(model_path)

print("Model and tokenizer successfully loaded!")

# Limit to 100 articles from the validation set
val_texts_subset = val_texts[:100]
val_labels_subset = val_labels[:100]

# Tokenize the subset of the validation set
start_tokenization_time = time.time()
inputs = tokenizer(val_texts_subset, return_tensors="pt", truncation=True, padding=True, max_length=512).to(device)
end_tokenization_time = time.time()

# Perform inference
model.eval()
start_inference_time = time.time()
with torch.no_grad():
    outputs = model(**inputs)
end_inference_time = time.time()

# Get the predicted class for each input
logits = outputs.logits
predicted_classes = torch.argmax(logits, dim=1).tolist()

# Print results for the validation set subset
label_mapping = {0: "Positive", 1: "Neutral", 2: "Negative"}
correct_predictions = 0

for i, (text, true_label, predicted_label) in enumerate(zip(val_texts_subset, val_labels_subset, predicted_classes)):
    print(f"Text {i+1}: {text}")
    print(f"True Label: {label_mapping[true_label]}, Predicted Label: {label_mapping[predicted_label]}\n")
    if true_label == predicted_label:
        correct_predictions += 1

# Calculate overall accuracy
accuracy = correct_predictions / len(val_labels_subset)
print(f"Accuracy on the validation set: {accuracy:.4f}")

# Calculate timing metrics
tokenization_time = end_tokenization_time - start_tokenization_time
inference_time = end_inference_time - start_inference_time
print(f"Time taken for tokenization: {tokenization_time:.4f} seconds")
print(f"Time taken for inference: {inference_time:.4f} seconds")
print(f"Total time taken: {tokenization_time + inference_time:.4f} seconds")

Model and tokenizer successfully loaded!
Text 1: TatarMedia.ID - Jakarta, Komitmen untuk terus tumbuh dan memberikan layanan terbaik kepada para nasabahnya terus ditunjukkan oleh PT Bank Rakyat Indonesia (Pesero) Tbk atau BRI.  Hal ini dibuktikan melalui penghargaan Total Service Quality Satisfaction based on Customer Perception Survey SQIndex 2024 dengan Predikat Diamond di ajang Service Quality Awards 2024.  SQ Award merupakan penghargaan bergengsi bertaraf nasional yang telah diadakan sejak tahun 2007 oleh Majalah Marketing dan Customer Action Recognition and Reward Experience (CARRE). Kali ini, penghargaan diberikan pada 7 Agustus 2024 di Sky Ayana Jimbaran, Bali.  Penghargaan ini bertujuan untuk menganugerahkan perusahaan-perusahaan yang berhasil memberikan layanan terbaik pada industrinya masing-masing dengan meraih nilai tertinggi dan di atas standar industri pada Service Quality Index (SQIndex).  Adapun indikator utama yang dinilai yakni kualitas layanan, meliputi Service Acces

: 

In [53]:
# Check if GPU is available
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

Using device: cuda


In [None]:
# Load the saved model and tokenizer
model = LongformerForSequenceClassification.from_pretrained('./longformer_model').to(device)
tokenizer = LongformerTokenizer.from_pretrained('./longformer_model')

In [None]:
# Define label mapping
label_mapping = {0: "Negative", 1: "Neutral", 2: "Positive"}

In [None]:
# Testing new articles
def classify_article(article):
    # Tokenize the input article and move the tensors to the correct device
    inputs = tokenizer(article, return_tensors="pt", truncation=True, padding=True, max_length=512)
    
    # Move tokenized inputs to the correct device (GPU or CPU)
    inputs = {key: value.to(device) for key, value in inputs.items()}
    
    with torch.no_grad():
        # Perform inference using the model
        outputs = model(**inputs)
    
    logits = outputs.logits
    # Get the predicted label (index of the highest logit)
    predicted_label = np.argmax(logits.cpu().numpy(), axis=1)[0]
    
    # Calculate sentiment scores (probabilities)
    sentiment_score = torch.softmax(logits, dim=1).cpu().numpy()[0]
    
    # Map predicted label to sentiment text
    sentiment_result = label_mapping[predicted_label]
    
    return sentiment_result, sentiment_score

In [None]:
# User input for a new article
new_article = input("Your article:")
sentiment_result, sentiment_score = classify_article(new_article)
print(f"The sentiment of the article is: {sentiment_result}")
print(f"Sentiment scores: {sentiment_score}")


The sentiment of the article is: Positive
Sentiment scores: [0.00133929 0.01447496 0.98418576]


In [None]:
from transformers import LongformerTokenizer, EncoderDecoderModel

# Load summarization model and tokenizer
summarization_model = EncoderDecoderModel.from_pretrained("patrickvonplaten/longformer2roberta-cnn_dailymail-fp16")
summarization_tokenizer = LongformerTokenizer.from_pretrained("allenai/longformer-base-4096")

You are using a model of type encoder_decoder to instantiate a model of type encoder-decoder. This is not supported for all configurations of models and can yield errors.
Some weights of the model checkpoint at patrickvonplaten/longformer2roberta-cnn_dailymail-fp16 were not used when initializing EncoderDecoderModel: ['decoder.roberta.pooler.dense.bias', 'decoder.roberta.pooler.dense.weight', 'encoder.embeddings.position_ids']
- This IS expected if you are initializing EncoderDecoderModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing EncoderDecoderModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
# Function to summarize text
def summarize_text(text):
    input_ids = summarization_tokenizer(text, return_tensors="pt", max_length=4096, truncation=True).input_ids
    output_ids = summarization_model.generate(input_ids, max_length=150, min_length=30, length_penalty=2.0, num_beams=4, early_stopping=True)
    return summarization_tokenizer.decode(output_ids[0], skip_special_tokens=True)


In [None]:
# User input for text summarization
article_to_summarize = input("Your article:")

In [None]:
# Ensure model is moved to the device
summarization_model = summarization_model.to(device)

# Tokenize and summarize
input_ids = summarization_tokenizer(article_to_summarize, return_tensors="pt").input_ids.to(device)
output_ids = summarization_model.generate(input_ids)

# Get the summary from the output tokens
summary = summarization_tokenizer.decode(output_ids[0], skip_special_tokens=True)

# Print summary
print("Summary of the article:")
print(summary)

Summary of the article:
The program Harcilnas 2024 is based in Jakarta, Indonesia .
The program is aimed at helping the program Harinya Cicilan Lunasan .
Program Harcilas is a program ini e-bike, e-bikes, ebike, hinga, e bike, e/bike .


In [None]:
from transformers import BartForConditionalGeneration, BartTokenizer

summarization_model = BartForConditionalGeneration.from_pretrained("facebook/bart-large-cnn").to(device)
tokenizer = BartTokenizer.from_pretrained("facebook/bart-large-cnn")

# Article text to summarize
article_to_summarize = """
Jakarta, VIVA – Pembayaran kredit sepeda motor yang seringkali menjadi beban bagi sebagian besar masyarakat, kini 
dapat diatasi melalui program Hari Pembayaran Berbayar atau Harcilnas 2024 yang diselenggarakan oleh PT Adira Dinamika 
Multi Finance Tbk (Adira Finance). Program ini memungkinkan nasabah melunasi kredit sepeda motor atau kredit lainnya secara 
penuh hanya dengan menukarkan poin yang terkumpul. Pada Harcilnas periode pertama tahun ini, ada 12 nasabah yang berhasil 
melunasi seluruh tagihannya.
"""

# Use "summarize:" prefix
input_text = f"summarize: {article_to_summarize}"

# Tokenize the input text
input_ids = tokenizer(input_text, return_tensors="pt", truncation=True, padding=True, max_length=512).input_ids.to(device)

# Generate the summary with BART's optimized settings
output_ids = summarization_model.generate(
    input_ids,
    max_length=150,            # Shorter summary length
    num_beams=4,               # Beam search for better quality
    early_stopping=True,
)

# Decode the generated summary output
summary = tokenizer.decode(output_ids[0], skip_special_tokens=True, clean_up_tokenization_spaces=True)
print("Summary of the article:")
print(summary)


To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


Summary of the article:
Pembayaran kredit sepeda motor yang seringkali menjadi beban bagi sebagian besar masyarakat. Program ini memungkinkan nasabah melunasi kredit atau kredit lainnya secara penuh.


In [None]:
# Export results to CSV
import csv

def export_results(sentiment_results, summaries):
    with open('results.csv', mode='w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        writer.writerow(['Article', 'Sentiment', 'Sentiment Scores', 'Summary'])
        for article, sentiment, scores, summary in zip([new_article, article_to_summarize], 
                                                        [sentiment_result, 'N/A'], 
                                                        [sentiment_score, 'N/A'], 
                                                        [summary, 'N/A']):
            writer.writerow([article, sentiment, scores.tolist() if isinstance(scores, np.ndarray) else scores, summary])

In [None]:
# Call the export function
export_results(sentiment_results=[(new_article, sentiment_result, sentiment_score)], 
               summaries=[summary])
print("Results exported to 'results.csv'.")

UnboundLocalError: cannot access local variable 'summary' where it is not associated with a value