In [2]:
from dotenv import load_dotenv

import pandas as pd
import matplotlib.pyplot as plt

# Read key/value pairs from a local .env file into the process environment
load_dotenv()

# Confirmation message, kept in a variable so the print call stays short
setup_message = "Setup complete. Environment variables loaded and libraries imported."
print(setup_message)

Setup complete. Environment variables loaded and libraries imported.


In [3]:
# Read the master dataset (all statistics) from disk and report how many rows it has
master_csv_path = '../data/new/master_translated.csv'
df_original = pd.read_csv(master_csv_path)

loaded_row_count = len(df_original)
print(f"Loaded master dataset with {loaded_row_count:,} rows.")

Loaded master dataset with 302,442 rows.


In [4]:
# Keep only the first row for each answer_id so every answer is counted once.
# The row count is reported before and after to show how many rows were removed.
rows_before_dedup = len(df_original)
print(f"Before dropping duplicates, dataset has {rows_before_dedup:,} rows.")

df_original = df_original.drop_duplicates(subset=['answer_id'])

rows_after_dedup = len(df_original)
print(f"After dropping duplicates, dataset has {rows_after_dedup:,} rows.")



Before dropping duplicates, dataset has 302,442 rows.
After dropping duplicates, dataset has 269,424 rows.


In [5]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split

from datasets import Dataset, DatasetDict
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments,
)
from transformers.trainer_utils import get_last_checkpoint
import numpy as np
from sklearn.metrics import accuracy_score, f1_score


In [6]:
# Count answers per user; value_counts() orders users from most to fewest answers
user_answer_counts = df_original['user_id'].value_counts()

# Take the 50 most active users, then flip the order so the most active comes last
top_50_users = user_answer_counts.iloc[:50].iloc[::-1]

print("Top 50 users by number of answers:")
for user_id, count in top_50_users.items():
    summary_line = f"User {user_id}: {count:,} answers"
    print(summary_line)


Top 50 users by number of answers:
User 22606: 501 answers
User 32329: 509 answers
User 22853: 510 answers
User 51978: 522 answers
User 16297: 525 answers
User 52981: 533 answers
User 52584: 535 answers
User 22601: 536 answers
User 50149: 538 answers
User 53005: 540 answers
User 52977: 544 answers
User 51618: 546 answers
User 52913: 546 answers
User 16289: 552 answers
User 52995: 554 answers
User 51184: 570 answers
User 52979: 577 answers
User 52171: 579 answers
User 53024: 596 answers
User 52989: 597 answers
User 52912: 602 answers
User 52991: 616 answers
User 30844: 635 answers
User 52174: 646 answers
User 52295: 647 answers
User 52423: 650 answers
User 52937: 653 answers
User 52311: 680 answers
User 23076: 685 answers
User 52167: 695 answers
User 52914: 698 answers
User 52645: 703 answers
User 52209: 723 answers
User 52173: 743 answers
User 52407: 751 answers
User 51986: 761 answers
User 53007: 807 answers
User 52170: 817 answers
User 52933: 897 answers
User 50061: 899 answers
User 

In [7]:
def _cell_text(value) -> str:
    """Render one DataFrame cell as prompt text; missing values become ''."""
    # str(float("nan")) is the literal "nan", which would otherwise leak into the
    # prompt for every absent field (e.g. a question that has no option E).
    if pd.api.types.is_scalar(value) and pd.isna(value):
        return ""
    return str(value)


def _build_prompt_text(row) -> str:
    """Combine topic metadata, the question title and options A-E into one string."""
    def field(column: str) -> str:
        # row.get() only falls back to "" when the column itself is absent;
        # _cell_text additionally blanks out None/NaN cells.
        return _cell_text(row.get(column, ""))

    return (
        f"Topic: {field('topic_name')}\n"
        f"Subject: {field('subject_name')}\n"
        f"Axis: {field('axis_name')}\n\n"
        f"Question: {field('question_title')}\n"
        f"A) {field('option_a')}\n"
        f"B) {field('option_b')}\n"
        f"C) {field('option_c')}\n"
        f"D) {field('option_d')}\n"
        f"E) {field('option_e')}"
    )


def _classification_metrics(eval_pred):
    """Return accuracy and weighted F1 for a (logits, labels) evaluation pair."""
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    return {
        "accuracy": accuracy_score(labels, preds),
        "f1": f1_score(labels, preds, average="weighted"),
    }


def train_user_model(
    df: pd.DataFrame,
    user_id: int,
    model_name: str = "answerdotai/ModernBERT-base",
    output_dir_base: str = "user_models",
    push_to_hub: bool = False,
    hub_repo_org: str = None,
    hub_token: str = None,
    num_train_epochs: int = 5,
    batch_size: int = 4
) -> None:
    """
    Trains a ModernBERT model to predict correctness for a single user.

    The user's rows are turned into text prompts (topic / subject / axis /
    question / options A-E), split 80/20 into train and test sets, and used to
    fine-tune a 2-label sequence classifier. Checkpoints are written to
    ``<output_dir_base>/user_<user_id>``; if that directory already holds a
    checkpoint, training resumes from the most recent one.

    Users are skipped (with a printed message, no exception) when they have no
    rows, or fewer than two rows with a non-missing ``is_correct`` value.

    Args:
        df (pd.DataFrame): Original DataFrame with user data. Must contain
            ``user_id`` and ``is_correct``; the text columns are optional.
        user_id (int): Which user's data to train on.
        model_name (str): Hugging Face model ID for ModernBERT.
        output_dir_base (str): Base directory where user-specific checkpoints are stored.
        push_to_hub (bool): Whether to push final model to Hugging Face Hub.
        hub_repo_org (str): Org or username for HF Hub repositories.
        hub_token (str): Personal access token if required for private repos.
        num_train_epochs (int): Number of training epochs.
        batch_size (int): Per-device batch size used for training and evaluation.

    Returns:
        None. Results are reported via printed messages and saved model files.
    """

    # --------------------------
    # 1) Filter user data
    # --------------------------
    user_rows = df[df["user_id"] == user_id]
    if user_rows.empty:
        print(f"[User {user_id}] No data. Skipping.")
        return

    # Rows without a label cannot be used for supervised training, and
    # astype(int) raises on NaN, so drop them before building the label.
    df_user = user_rows.dropna(subset=["is_correct"]).copy()

    # train_test_split raises ValueError when a split would be empty, so the
    # size check has to happen *before* splitting for the skip to take effect.
    if len(df_user) < 2:
        print(f"[User {user_id}] Not enough data to split. Skipping.")
        return

    # Convert is_correct to an integer label
    df_user["label"] = df_user["is_correct"].astype(int)

    # --------------------------
    # 2) Combine text: question + choices + topics
    # --------------------------
    df_user["text"] = df_user.apply(_build_prompt_text, axis=1)

    # --------------------------
    # 3) Train/Test Split
    # --------------------------
    # Fixed seed keeps the split identical across runs, which matters when a
    # later call resumes from a checkpoint trained on the same train set.
    train_df, test_df = train_test_split(
        df_user,
        test_size=0.2,
        shuffle=True,
        random_state=42
    )

    # Build HF Datasets
    train_dataset_hf = Dataset.from_pandas(train_df[["text", "label"]].reset_index(drop=True))
    test_dataset_hf = Dataset.from_pandas(test_df[["text", "label"]].reset_index(drop=True))

    ds_dict = DatasetDict({
        "train": train_dataset_hf,
        "test": test_dataset_hf
    })

    # --------------------------
    # 4) Tokenize
    # --------------------------
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    def tokenize_fn(batch):
        # Every example is padded/truncated to exactly 512 tokens.
        return tokenizer(
            batch["text"],
            padding="max_length",
            truncation=True,
            max_length=512
        )

    ds_dict = ds_dict.map(tokenize_fn, batched=True)

    # --------------------------
    # 5) Load Model
    # --------------------------
    model = AutoModelForSequenceClassification.from_pretrained(
        model_name,
        num_labels=2
    )
    model.config.problem_type = "single_label_classification"

    # --------------------------
    # 6) Training Arguments
    # --------------------------
    user_output_dir = os.path.join(output_dir_base, f"user_{user_id}")
    os.makedirs(user_output_dir, exist_ok=True)

    # Optional: define the Hub repo ID if pushing. Without an org the model is
    # pushed to the personal account of the authenticated user.
    hub_repo_id = None
    if push_to_hub:
        repo_name = f"modernbert-user-{user_id}"
        hub_repo_id = f"{hub_repo_org}/{repo_name}" if hub_repo_org else repo_name

    # `eval_strategy` is the current name of the deprecated `evaluation_strategy`.
    # The deprecated `use_mps_device=True` flag is intentionally not passed:
    # an available MPS device is selected automatically, and the flag raised an
    # error on machines without MPS.
    training_args = TrainingArguments(
        output_dir=user_output_dir,
        eval_strategy="epoch",
        save_strategy="epoch",
        learning_rate=5e-5,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        num_train_epochs=num_train_epochs,
        bf16=True,                        # optional, can cause issues on some M2 setups
        optim="adamw_torch_fused",        # fused optimizer for speed
        logging_steps=50,
        load_best_model_at_end=True,
        metric_for_best_model="f1",       # or "accuracy"
        save_total_limit=2,               # keep only last 2 checkpoints
        push_to_hub=push_to_hub,          # to decide if we push
        hub_model_id=hub_repo_id,         # if we push
        hub_token=hub_token,              # if needed
        hub_strategy="end",               # push only at end or each save
    )

    # `processing_class` replaces the deprecated `tokenizer` keyword of Trainer.
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=ds_dict["train"],
        eval_dataset=ds_dict["test"],
        processing_class=tokenizer,
        compute_metrics=_classification_metrics,
    )

    # --------------------------
    # 7) Resume Check
    # --------------------------
    # The directory is guaranteed to exist (created above); get_last_checkpoint
    # returns None when it contains no checkpoint yet.
    last_checkpoint = get_last_checkpoint(user_output_dir)
    if last_checkpoint is not None:
        print(f"[User {user_id}] Resuming from {last_checkpoint}")
    else:
        print(f"[User {user_id}] Starting from scratch...")

    # --------------------------
    # 8) Train
    # --------------------------
    trainer.train(resume_from_checkpoint=last_checkpoint)

    # The best checkpoint is already loaded at end (load_best_model_at_end=True)
    # Evaluate final
    eval_metrics = trainer.evaluate()
    print(f"[User {user_id}] Final eval metrics:", eval_metrics)

    # --------------------------
    # 9) Save final model
    # --------------------------
    # This ensures we save *just* the best
    trainer.save_model(user_output_dir)

    # If pushing to hub, trainer will push after the final save (hub_strategy="end").
    # Alternatively, you can manually call:
    # if push_to_hub:
    #     trainer.push_to_hub()

    print(f"[User {user_id}] Done! Best model saved at: {user_output_dir}")


In [None]:
# Base folder where the user-specific models are stored locally
OUTPUT_DIR_BASE = "user_models"

# Settings shared by every per-user training run
shared_train_settings = {
    "model_name": "answerdotai/ModernBERT-base",
    "output_dir_base": OUTPUT_DIR_BASE,
    "push_to_hub": True,       # Set to False if you don't want to push
    "num_train_epochs": 10,    # Adjust as needed
    "batch_size": 4,           # Adjust for your M2 memory constraints
}

# Train one model per user, in the order stored in top_50_users
for user_id, count in top_50_users.items():
    print(f"\n=== Training model for User {user_id} (Answers={count}) ===")
    train_user_model(
        df=df_original,
        user_id=user_id,
        # The Hub token is read from the environment rather than hard-coded
        hub_token=os.getenv("HUGGINGFACE_ACCESS_TOKEN"),
        **shared_train_settings
    )



=== Training model for User 22606 (Answers=501) ===


Map:   0%|          | 0/400 [00:00<?, ? examples/s]

Map:   0%|          | 0/101 [00:00<?, ? examples/s]

Some weights of ModernBertForSequenceClassification were not initialized from the model checkpoint at answerdotai/ModernBERT-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


[User 22606] Resuming from user_models/user_22606/checkpoint-1000


  0%|          | 0/1000 [00:00<?, ?it/s]

{'train_runtime': 0.536, 'train_samples_per_second': 7462.799, 'train_steps_per_second': 1865.7, 'train_loss': 0.0, 'epoch': 10.0}


  0%|          | 0/26 [00:00<?, ?it/s]

[User 22606] Final eval metrics: {'eval_loss': 0.6155869364738464, 'eval_accuracy': 0.6831683168316832, 'eval_f1': 0.6725498636820204, 'eval_runtime': 4.2999, 'eval_samples_per_second': 23.489, 'eval_steps_per_second': 6.047, 'epoch': 10.0}


training_args.bin:   0%|          | 0.00/5.43k [00:00<?, ?B/s]

events.out.tfevents.1740086126.DN0a1e6b6f.SUNet.13394.31:   0%|          | 0.00/6.12k [00:00<?, ?B/s]

events.out.tfevents.1740086131.DN0a1e6b6f.SUNet.13394.32:   0%|          | 0.00/457 [00:00<?, ?B/s]

Upload 3 LFS files:   0%|          | 0/3 [00:00<?, ?it/s]

[User 22606] Done! Best model saved at: user_models/user_22606

=== Training model for User 32329 (Answers=509) ===


Map:   0%|          | 0/407 [00:00<?, ? examples/s]

Map:   0%|          | 0/102 [00:00<?, ? examples/s]

Some weights of ModernBertForSequenceClassification were not initialized from the model checkpoint at answerdotai/ModernBERT-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


[User 32329] Resuming from user_models/user_32329/checkpoint-1020


  0%|          | 0/1020 [00:00<?, ?it/s]

{'train_runtime': 0.4802, 'train_samples_per_second': 8475.209, 'train_steps_per_second': 2124.008, 'train_loss': 0.0, 'epoch': 10.0}


  0%|          | 0/26 [00:00<?, ?it/s]

[User 32329] Final eval metrics: {'eval_loss': 0.5452166795730591, 'eval_accuracy': 0.7549019607843137, 'eval_f1': 0.763047667169946, 'eval_runtime': 4.4332, 'eval_samples_per_second': 23.008, 'eval_steps_per_second': 5.865, 'epoch': 10.0}


events.out.tfevents.1740086136.DN0a1e6b6f.SUNet.13394.33:   0%|          | 0.00/6.12k [00:00<?, ?B/s]

Upload 3 LFS files:   0%|          | 0/3 [00:00<?, ?it/s]

training_args.bin:   0%|          | 0.00/5.43k [00:00<?, ?B/s]

events.out.tfevents.1740086141.DN0a1e6b6f.SUNet.13394.34:   0%|          | 0.00/457 [00:00<?, ?B/s]

[User 32329] Done! Best model saved at: user_models/user_32329

=== Training model for User 22853 (Answers=510) ===


Map:   0%|          | 0/408 [00:00<?, ? examples/s]

Map:   0%|          | 0/102 [00:00<?, ? examples/s]

Some weights of ModernBertForSequenceClassification were not initialized from the model checkpoint at answerdotai/ModernBERT-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


[User 22853] Resuming from user_models/user_22853/checkpoint-1020


  0%|          | 0/1020 [00:00<?, ?it/s]

{'train_runtime': 0.5211, 'train_samples_per_second': 7829.125, 'train_steps_per_second': 1957.281, 'train_loss': 0.0, 'epoch': 10.0}


  0%|          | 0/26 [00:00<?, ?it/s]

[User 22853] Final eval metrics: {'eval_loss': 0.6393024325370789, 'eval_accuracy': 0.7647058823529411, 'eval_f1': 0.7617931505290705, 'eval_runtime': 4.3709, 'eval_samples_per_second': 23.336, 'eval_steps_per_second': 5.948, 'epoch': 10.0}


events.out.tfevents.1740086152.DN0a1e6b6f.SUNet.13394.36:   0%|          | 0.00/457 [00:00<?, ?B/s]

Upload 3 LFS files:   0%|          | 0/3 [00:00<?, ?it/s]

training_args.bin:   0%|          | 0.00/5.43k [00:00<?, ?B/s]

events.out.tfevents.1740086147.DN0a1e6b6f.SUNet.13394.35:   0%|          | 0.00/6.12k [00:00<?, ?B/s]

[User 22853] Done! Best model saved at: user_models/user_22853

=== Training model for User 51978 (Answers=522) ===


Map:   0%|          | 0/417 [00:00<?, ? examples/s]

Map:   0%|          | 0/105 [00:00<?, ? examples/s]

Some weights of ModernBertForSequenceClassification were not initialized from the model checkpoint at answerdotai/ModernBERT-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


[User 51978] Resuming from user_models/user_51978/checkpoint-1050


  0%|          | 0/1050 [00:00<?, ?it/s]

{'train_runtime': 0.4667, 'train_samples_per_second': 8935.243, 'train_steps_per_second': 2249.881, 'train_loss': 0.0, 'epoch': 10.0}


  0%|          | 0/27 [00:00<?, ?it/s]

[User 51978] Final eval metrics: {'eval_loss': 0.6610203385353088, 'eval_accuracy': 0.9047619047619048, 'eval_f1': 0.877454590083456, 'eval_runtime': 4.4321, 'eval_samples_per_second': 23.691, 'eval_steps_per_second': 6.092, 'epoch': 10.0}


events.out.tfevents.1740086157.DN0a1e6b6f.SUNet.13394.37:   0%|          | 0.00/6.12k [00:00<?, ?B/s]

Upload 3 LFS files:   0%|          | 0/3 [00:00<?, ?it/s]

events.out.tfevents.1740086162.DN0a1e6b6f.SUNet.13394.38:   0%|          | 0.00/457 [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/5.43k [00:00<?, ?B/s]

[User 51978] Done! Best model saved at: user_models/user_51978

=== Training model for User 16297 (Answers=525) ===


Map:   0%|          | 0/420 [00:00<?, ? examples/s]

Map:   0%|          | 0/105 [00:00<?, ? examples/s]

Some weights of ModernBertForSequenceClassification were not initialized from the model checkpoint at answerdotai/ModernBERT-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


[User 16297] Resuming from user_models/user_16297/checkpoint-1050


  0%|          | 0/1050 [00:00<?, ?it/s]

{'train_runtime': 0.454, 'train_samples_per_second': 9251.038, 'train_steps_per_second': 2312.76, 'train_loss': 0.0, 'epoch': 10.0}


  0%|          | 0/27 [00:00<?, ?it/s]

[User 16297] Final eval metrics: {'eval_loss': 0.4734443724155426, 'eval_accuracy': 0.8571428571428571, 'eval_f1': 0.8413565790614972, 'eval_runtime': 4.4172, 'eval_samples_per_second': 23.771, 'eval_steps_per_second': 6.112, 'epoch': 10.0}


training_args.bin:   0%|          | 0.00/5.43k [00:00<?, ?B/s]

Upload 3 LFS files:   0%|          | 0/3 [00:00<?, ?it/s]

events.out.tfevents.1740086167.DN0a1e6b6f.SUNet.13394.39:   0%|          | 0.00/6.12k [00:00<?, ?B/s]

events.out.tfevents.1740086172.DN0a1e6b6f.SUNet.13394.40:   0%|          | 0.00/457 [00:00<?, ?B/s]

[User 16297] Done! Best model saved at: user_models/user_16297

=== Training model for User 52981 (Answers=533) ===


Map:   0%|          | 0/426 [00:00<?, ? examples/s]

Map:   0%|          | 0/107 [00:00<?, ? examples/s]

Some weights of ModernBertForSequenceClassification were not initialized from the model checkpoint at answerdotai/ModernBERT-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


[User 52981] Resuming from user_models/user_52981/checkpoint-1070


  0%|          | 0/1070 [00:00<?, ?it/s]

{'train_runtime': 0.4551, 'train_samples_per_second': 9359.716, 'train_steps_per_second': 2350.915, 'train_loss': 0.0, 'epoch': 10.0}


  0%|          | 0/27 [00:00<?, ?it/s]

[User 52981] Final eval metrics: {'eval_loss': 0.826038122177124, 'eval_accuracy': 0.8691588785046729, 'eval_f1': 0.8398207919838135, 'eval_runtime': 4.4814, 'eval_samples_per_second': 23.877, 'eval_steps_per_second': 6.025, 'epoch': 10.0}


events.out.tfevents.1740086177.DN0a1e6b6f.SUNet.13394.41:   0%|          | 0.00/6.12k [00:00<?, ?B/s]

Upload 3 LFS files:   0%|          | 0/3 [00:00<?, ?it/s]

events.out.tfevents.1740086182.DN0a1e6b6f.SUNet.13394.42:   0%|          | 0.00/457 [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/5.43k [00:00<?, ?B/s]

[User 52981] Done! Best model saved at: user_models/user_52981

=== Training model for User 52584 (Answers=535) ===


Map:   0%|          | 0/428 [00:00<?, ? examples/s]

Map:   0%|          | 0/107 [00:00<?, ? examples/s]

Some weights of ModernBertForSequenceClassification were not initialized from the model checkpoint at answerdotai/ModernBERT-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


[User 52584] Resuming from user_models/user_52584/checkpoint-1070


  0%|          | 0/1070 [00:00<?, ?it/s]

{'train_runtime': 0.4548, 'train_samples_per_second': 9409.775, 'train_steps_per_second': 2352.444, 'train_loss': 0.0, 'epoch': 10.0}


  0%|          | 0/27 [00:00<?, ?it/s]

[User 52584] Final eval metrics: {'eval_loss': 0.3813420534133911, 'eval_accuracy': 0.9252336448598131, 'eval_f1': 0.9039032435404069, 'eval_runtime': 4.5292, 'eval_samples_per_second': 23.624, 'eval_steps_per_second': 5.961, 'epoch': 10.0}


Upload 3 LFS files:   0%|          | 0/3 [00:00<?, ?it/s]

events.out.tfevents.1740086188.DN0a1e6b6f.SUNet.13394.43:   0%|          | 0.00/6.12k [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/5.43k [00:00<?, ?B/s]

events.out.tfevents.1740086193.DN0a1e6b6f.SUNet.13394.44:   0%|          | 0.00/457 [00:00<?, ?B/s]

[User 52584] Done! Best model saved at: user_models/user_52584

=== Training model for User 22601 (Answers=536) ===


Map:   0%|          | 0/428 [00:00<?, ? examples/s]

Map:   0%|          | 0/108 [00:00<?, ? examples/s]

Some weights of ModernBertForSequenceClassification were not initialized from the model checkpoint at answerdotai/ModernBERT-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


[User 22601] Resuming from user_models/user_22601/checkpoint-1070


  0%|          | 0/1070 [00:00<?, ?it/s]

{'train_runtime': 0.4559, 'train_samples_per_second': 9387.98, 'train_steps_per_second': 2346.995, 'train_loss': 0.0, 'epoch': 10.0}


  0%|          | 0/27 [00:00<?, ?it/s]

[User 22601] Final eval metrics: {'eval_loss': 0.7486973404884338, 'eval_accuracy': 0.6388888888888888, 'eval_f1': 0.6358538047912551, 'eval_runtime': 4.5943, 'eval_samples_per_second': 23.507, 'eval_steps_per_second': 5.877, 'epoch': 10.0}


training_args.bin:   0%|          | 0.00/5.43k [00:00<?, ?B/s]

events.out.tfevents.1740086204.DN0a1e6b6f.SUNet.13394.46:   0%|          | 0.00/457 [00:00<?, ?B/s]

Upload 3 LFS files:   0%|          | 0/3 [00:00<?, ?it/s]

events.out.tfevents.1740086199.DN0a1e6b6f.SUNet.13394.45:   0%|          | 0.00/6.12k [00:00<?, ?B/s]

[User 22601] Done! Best model saved at: user_models/user_22601

=== Training model for User 50149 (Answers=538) ===


Map:   0%|          | 0/430 [00:00<?, ? examples/s]

Map:   0%|          | 0/108 [00:00<?, ? examples/s]

Some weights of ModernBertForSequenceClassification were not initialized from the model checkpoint at answerdotai/ModernBERT-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


[User 50149] Resuming from user_models/user_50149/checkpoint-1080


  0%|          | 0/1080 [00:00<?, ?it/s]

{'train_runtime': 0.4802, 'train_samples_per_second': 8953.841, 'train_steps_per_second': 2248.872, 'train_loss': 0.0, 'epoch': 10.0}


  0%|          | 0/27 [00:00<?, ?it/s]

[User 50149] Final eval metrics: {'eval_loss': 0.6018584966659546, 'eval_accuracy': 0.7777777777777778, 'eval_f1': 0.7768582375478927, 'eval_runtime': 4.565, 'eval_samples_per_second': 23.658, 'eval_steps_per_second': 5.915, 'epoch': 10.0}


events.out.tfevents.1740086209.DN0a1e6b6f.SUNet.13394.47:   0%|          | 0.00/6.12k [00:00<?, ?B/s]

Upload 3 LFS files:   0%|          | 0/3 [00:00<?, ?it/s]

events.out.tfevents.1740086214.DN0a1e6b6f.SUNet.13394.48:   0%|          | 0.00/457 [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/5.43k [00:00<?, ?B/s]

[User 50149] Done! Best model saved at: user_models/user_50149

=== Training model for User 53005 (Answers=540) ===


Map:   0%|          | 0/432 [00:00<?, ? examples/s]

Map:   0%|          | 0/108 [00:00<?, ? examples/s]

Some weights of ModernBertForSequenceClassification were not initialized from the model checkpoint at answerdotai/ModernBERT-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


[User 53005] Resuming from user_models/user_53005/checkpoint-1080


  0%|          | 0/1080 [00:00<?, ?it/s]

{'train_runtime': 0.4678, 'train_samples_per_second': 9234.463, 'train_steps_per_second': 2308.616, 'train_loss': 0.0, 'epoch': 10.0}


  0%|          | 0/27 [00:00<?, ?it/s]

[User 53005] Final eval metrics: {'eval_loss': 1.0922365188598633, 'eval_accuracy': 0.8518518518518519, 'eval_f1': 0.8203309692671396, 'eval_runtime': 4.5322, 'eval_samples_per_second': 23.83, 'eval_steps_per_second': 5.957, 'epoch': 10.0}


Upload 3 LFS files:   0%|          | 0/3 [00:00<?, ?it/s]

training_args.bin:   0%|          | 0.00/5.43k [00:00<?, ?B/s]

events.out.tfevents.1740086225.DN0a1e6b6f.SUNet.13394.50:   0%|          | 0.00/457 [00:00<?, ?B/s]

events.out.tfevents.1740086220.DN0a1e6b6f.SUNet.13394.49:   0%|          | 0.00/6.12k [00:00<?, ?B/s]

[User 53005] Done! Best model saved at: user_models/user_53005

=== Training model for User 52977 (Answers=544) ===


Map:   0%|          | 0/435 [00:00<?, ? examples/s]

Map:   0%|          | 0/109 [00:00<?, ? examples/s]

Some weights of ModernBertForSequenceClassification were not initialized from the model checkpoint at answerdotai/ModernBERT-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


[User 52977] Resuming from user_models/user_52977/checkpoint-1090


  0%|          | 0/1090 [00:00<?, ?it/s]

{'train_runtime': 0.4684, 'train_samples_per_second': 9286.324, 'train_steps_per_second': 2326.918, 'train_loss': 0.0, 'epoch': 10.0}


  0%|          | 0/28 [00:00<?, ?it/s]

[User 52977] Final eval metrics: {'eval_loss': 0.6718893051147461, 'eval_accuracy': 0.8623853211009175, 'eval_f1': 0.7986622678175983, 'eval_runtime': 4.6933, 'eval_samples_per_second': 23.224, 'eval_steps_per_second': 5.966, 'epoch': 10.0}


Upload 3 LFS files:   0%|          | 0/3 [00:00<?, ?it/s]

training_args.bin:   0%|          | 0.00/5.43k [00:00<?, ?B/s]

events.out.tfevents.1740086231.DN0a1e6b6f.SUNet.13394.51:   0%|          | 0.00/6.12k [00:00<?, ?B/s]

events.out.tfevents.1740086236.DN0a1e6b6f.SUNet.13394.52:   0%|          | 0.00/457 [00:00<?, ?B/s]

[User 52977] Done! Best model saved at: user_models/user_52977

=== Training model for User 51618 (Answers=546) ===


Map:   0%|          | 0/436 [00:00<?, ? examples/s]

Map:   0%|          | 0/110 [00:00<?, ? examples/s]

Some weights of ModernBertForSequenceClassification were not initialized from the model checkpoint at answerdotai/ModernBERT-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


[User 51618] Resuming from user_models/user_51618/checkpoint-1090


  0%|          | 0/1090 [00:00<?, ?it/s]

{'train_runtime': 0.4594, 'train_samples_per_second': 9491.346, 'train_steps_per_second': 2372.836, 'train_loss': 0.0, 'epoch': 10.0}


  0%|          | 0/28 [00:00<?, ?it/s]

[User 51618] Final eval metrics: {'eval_loss': 0.5607085824012756, 'eval_accuracy': 0.7090909090909091, 'eval_f1': 0.7104150751209575, 'eval_runtime': 4.6714, 'eval_samples_per_second': 23.547, 'eval_steps_per_second': 5.994, 'epoch': 10.0}


Upload 3 LFS files:   0%|          | 0/3 [00:00<?, ?it/s]

training_args.bin:   0%|          | 0.00/5.43k [00:00<?, ?B/s]

events.out.tfevents.1740086244.DN0a1e6b6f.SUNet.13394.53:   0%|          | 0.00/6.12k [00:00<?, ?B/s]

events.out.tfevents.1740086249.DN0a1e6b6f.SUNet.13394.54:   0%|          | 0.00/457 [00:00<?, ?B/s]

[User 51618] Done! Best model saved at: user_models/user_51618

=== Training model for User 52913 (Answers=546) ===


Map:   0%|          | 0/436 [00:00<?, ? examples/s]

Map:   0%|          | 0/110 [00:00<?, ? examples/s]

Some weights of ModernBertForSequenceClassification were not initialized from the model checkpoint at answerdotai/ModernBERT-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


[User 52913] Resuming from user_models/user_52913/checkpoint-1090


  0%|          | 0/1090 [00:00<?, ?it/s]

{'train_runtime': 0.4494, 'train_samples_per_second': 9701.177, 'train_steps_per_second': 2425.294, 'train_loss': 0.0, 'epoch': 10.0}


  0%|          | 0/28 [00:00<?, ?it/s]

[User 52913] Final eval metrics: {'eval_loss': 0.6106128096580505, 'eval_accuracy': 0.8090909090909091, 'eval_f1': 0.7727978466739976, 'eval_runtime': 4.6963, 'eval_samples_per_second': 23.423, 'eval_steps_per_second': 5.962, 'epoch': 10.0}


events.out.tfevents.1740086260.DN0a1e6b6f.SUNet.13394.56:   0%|          | 0.00/457 [00:00<?, ?B/s]

events.out.tfevents.1740086255.DN0a1e6b6f.SUNet.13394.55:   0%|          | 0.00/6.12k [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/5.43k [00:00<?, ?B/s]

Upload 3 LFS files:   0%|          | 0/3 [00:00<?, ?it/s]

[User 52913] Done! Best model saved at: user_models/user_52913

=== Training model for User 16289 (Answers=552) ===


Map:   0%|          | 0/441 [00:00<?, ? examples/s]

Map:   0%|          | 0/111 [00:00<?, ? examples/s]

Some weights of ModernBertForSequenceClassification were not initialized from the model checkpoint at answerdotai/ModernBERT-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


[User 16289] Resuming from user_models/user_16289/checkpoint-1110


  0%|          | 0/1110 [00:00<?, ?it/s]

{'train_runtime': 0.4552, 'train_samples_per_second': 9687.259, 'train_steps_per_second': 2438.29, 'train_loss': 0.0, 'epoch': 10.0}


  0%|          | 0/28 [00:00<?, ?it/s]

[User 16289] Final eval metrics: {'eval_loss': 0.6319292187690735, 'eval_accuracy': 0.7567567567567568, 'eval_f1': 0.7224919835089326, 'eval_runtime': 4.6872, 'eval_samples_per_second': 23.682, 'eval_steps_per_second': 5.974, 'epoch': 10.0}


events.out.tfevents.1740086265.DN0a1e6b6f.SUNet.13394.57:   0%|          | 0.00/6.12k [00:00<?, ?B/s]

Upload 3 LFS files:   0%|          | 0/3 [00:00<?, ?it/s]

training_args.bin:   0%|          | 0.00/5.43k [00:00<?, ?B/s]

events.out.tfevents.1740086270.DN0a1e6b6f.SUNet.13394.58:   0%|          | 0.00/457 [00:00<?, ?B/s]

[User 16289] Done! Best model saved at: user_models/user_16289

=== Training model for User 52995 (Answers=554) ===


Map:   0%|          | 0/443 [00:00<?, ? examples/s]

Map:   0%|          | 0/111 [00:00<?, ? examples/s]

Some weights of ModernBertForSequenceClassification were not initialized from the model checkpoint at answerdotai/ModernBERT-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


[User 52995] Resuming from user_models/user_52995/checkpoint-1110


  0%|          | 0/1110 [00:00<?, ?it/s]

{'train_runtime': 0.4817, 'train_samples_per_second': 9196.901, 'train_steps_per_second': 2304.415, 'train_loss': 0.0, 'epoch': 10.0}


  0%|          | 0/28 [00:00<?, ?it/s]

[User 52995] Final eval metrics: {'eval_loss': 0.6443758606910706, 'eval_accuracy': 0.8738738738738738, 'eval_f1': 0.8150554400554402, 'eval_runtime': 4.6675, 'eval_samples_per_second': 23.782, 'eval_steps_per_second': 5.999, 'epoch': 10.0}


Upload 3 LFS files:   0%|          | 0/3 [00:00<?, ?it/s]

events.out.tfevents.1740086276.DN0a1e6b6f.SUNet.13394.59:   0%|          | 0.00/6.12k [00:00<?, ?B/s]

events.out.tfevents.1740086281.DN0a1e6b6f.SUNet.13394.60:   0%|          | 0.00/457 [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/5.43k [00:00<?, ?B/s]

[User 52995] Done! Best model saved at: user_models/user_52995

=== Training model for User 51184 (Answers=570) ===


Map:   0%|          | 0/456 [00:00<?, ? examples/s]

Map:   0%|          | 0/114 [00:00<?, ? examples/s]

Some weights of ModernBertForSequenceClassification were not initialized from the model checkpoint at answerdotai/ModernBERT-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


[User 51184] Resuming from user_models/user_51184/checkpoint-912


  0%|          | 0/1140 [00:00<?, ?it/s]

{'loss': 0.1635, 'grad_norm': 0.9840195178985596, 'learning_rate': 8.333333333333334e-06, 'epoch': 8.33}
{'loss': 0.1338, 'grad_norm': 0.004239968489855528, 'learning_rate': 6.140350877192982e-06, 'epoch': 8.77}


  0%|          | 0/29 [00:00<?, ?it/s]

{'eval_loss': 0.6967489123344421, 'eval_accuracy': 0.9298245614035088, 'eval_f1': 0.9044657097288676, 'eval_runtime': 4.8201, 'eval_samples_per_second': 23.651, 'eval_steps_per_second': 6.016, 'epoch': 9.0}
{'loss': 0.1355, 'grad_norm': 0.517867922782898, 'learning_rate': 3.9473684210526315e-06, 'epoch': 9.21}
{'loss': 0.1357, 'grad_norm': 0.7190349102020264, 'learning_rate': 1.7543859649122807e-06, 'epoch': 9.65}


  0%|          | 0/29 [00:00<?, ?it/s]

{'eval_loss': 0.7403576374053955, 'eval_accuracy': 0.9298245614035088, 'eval_f1': 0.9044657097288676, 'eval_runtime': 4.8617, 'eval_samples_per_second': 23.449, 'eval_steps_per_second': 5.965, 'epoch': 10.0}
{'train_runtime': 149.2293, 'train_samples_per_second': 30.557, 'train_steps_per_second': 7.639, 'train_loss': 0.02584436525378311, 'epoch': 10.0}


  0%|          | 0/29 [00:00<?, ?it/s]

[User 51184] Final eval metrics: {'eval_loss': 0.2327311635017395, 'eval_accuracy': 0.9385964912280702, 'eval_f1': 0.9088671906009367, 'eval_runtime': 4.8189, 'eval_samples_per_second': 23.657, 'eval_steps_per_second': 6.018, 'epoch': 10.0}


model.safetensors:   0%|          | 0.00/598M [00:00<?, ?B/s]

events.out.tfevents.1740086286.DN0a1e6b6f.SUNet.13394.61:   0%|          | 0.00/7.70k [00:00<?, ?B/s]

events.out.tfevents.1740085462.DN0a1e6b6f.SUNet.13394.30:   0%|          | 0.00/12.9k [00:00<?, ?B/s]

Upload 5 LFS files:   0%|          | 0/5 [00:00<?, ?it/s]

events.out.tfevents.1740086440.DN0a1e6b6f.SUNet.13394.62:   0%|          | 0.00/457 [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/5.43k [00:00<?, ?B/s]

[User 51184] Done! Best model saved at: user_models/user_51184

=== Training model for User 52979 (Answers=577) ===


Map:   0%|          | 0/461 [00:00<?, ? examples/s]

Map:   0%|          | 0/116 [00:00<?, ? examples/s]

Some weights of ModernBertForSequenceClassification were not initialized from the model checkpoint at answerdotai/ModernBERT-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


[User 52979] Starting from scratch...


  0%|          | 0/1160 [00:00<?, ?it/s]

{'loss': 0.7187, 'grad_norm': 7.754086017608643, 'learning_rate': 4.78448275862069e-05, 'epoch': 0.43}
{'loss': 0.6112, 'grad_norm': 1.5520775318145752, 'learning_rate': 4.5689655172413794e-05, 'epoch': 0.86}


  0%|          | 0/29 [00:00<?, ?it/s]

{'eval_loss': 0.6812432408332825, 'eval_accuracy': 0.6982758620689655, 'eval_f1': 0.5742166987572204, 'eval_runtime': 4.9073, 'eval_samples_per_second': 23.638, 'eval_steps_per_second': 5.91, 'epoch': 1.0}
{'loss': 0.5581, 'grad_norm': 3.7784132957458496, 'learning_rate': 4.353448275862069e-05, 'epoch': 1.29}
{'loss': 0.585, 'grad_norm': 1.4393309354782104, 'learning_rate': 4.1379310344827587e-05, 'epoch': 1.72}


  0%|          | 0/29 [00:00<?, ?it/s]

{'eval_loss': 0.5826264023780823, 'eval_accuracy': 0.6982758620689655, 'eval_f1': 0.5742166987572204, 'eval_runtime': 4.8941, 'eval_samples_per_second': 23.702, 'eval_steps_per_second': 5.925, 'epoch': 2.0}
{'loss': 0.6328, 'grad_norm': 7.173276424407959, 'learning_rate': 3.922413793103448e-05, 'epoch': 2.16}
{'loss': 0.5624, 'grad_norm': 2.4226584434509277, 'learning_rate': 3.7068965517241385e-05, 'epoch': 2.59}


  0%|          | 0/29 [00:00<?, ?it/s]

{'eval_loss': 0.5530815720558167, 'eval_accuracy': 0.6982758620689655, 'eval_f1': 0.5742166987572204, 'eval_runtime': 4.8833, 'eval_samples_per_second': 23.754, 'eval_steps_per_second': 5.939, 'epoch': 3.0}
{'loss': 0.7096, 'grad_norm': 1.4513814449310303, 'learning_rate': 3.4913793103448275e-05, 'epoch': 3.02}
{'loss': 0.5088, 'grad_norm': 6.544799327850342, 'learning_rate': 3.275862068965517e-05, 'epoch': 3.45}
{'loss': 0.5303, 'grad_norm': 5.145565032958984, 'learning_rate': 3.060344827586207e-05, 'epoch': 3.88}


  0%|          | 0/29 [00:00<?, ?it/s]

{'eval_loss': 0.5112521648406982, 'eval_accuracy': 0.7327586206896551, 'eval_f1': 0.6970048449938177, 'eval_runtime': 4.8828, 'eval_samples_per_second': 23.757, 'eval_steps_per_second': 5.939, 'epoch': 4.0}
{'loss': 0.4377, 'grad_norm': 4.151411533355713, 'learning_rate': 2.844827586206897e-05, 'epoch': 4.31}
{'loss': 0.4561, 'grad_norm': 14.526500701904297, 'learning_rate': 2.6293103448275862e-05, 'epoch': 4.74}


  0%|          | 0/29 [00:00<?, ?it/s]

{'eval_loss': 0.6625968813896179, 'eval_accuracy': 0.6896551724137931, 'eval_f1': 0.6988430127041743, 'eval_runtime': 4.8869, 'eval_samples_per_second': 23.737, 'eval_steps_per_second': 5.934, 'epoch': 5.0}
{'loss': 0.4223, 'grad_norm': 5.1327738761901855, 'learning_rate': 2.413793103448276e-05, 'epoch': 5.17}
{'loss': 0.4441, 'grad_norm': 7.062114715576172, 'learning_rate': 2.1982758620689654e-05, 'epoch': 5.6}


  0%|          | 0/29 [00:00<?, ?it/s]

{'eval_loss': 0.8968421816825867, 'eval_accuracy': 0.7327586206896551, 'eval_f1': 0.6763886327780851, 'eval_runtime': 4.9119, 'eval_samples_per_second': 23.616, 'eval_steps_per_second': 5.904, 'epoch': 6.0}
{'loss': 0.4002, 'grad_norm': 0.7256134748458862, 'learning_rate': 1.9827586206896554e-05, 'epoch': 6.03}
{'loss': 0.382, 'grad_norm': 4.336787700653076, 'learning_rate': 1.767241379310345e-05, 'epoch': 6.47}
{'loss': 0.3491, 'grad_norm': 1.2170711755752563, 'learning_rate': 1.5517241379310346e-05, 'epoch': 6.9}


  0%|          | 0/29 [00:00<?, ?it/s]

{'eval_loss': 3.184399127960205, 'eval_accuracy': 0.6810344827586207, 'eval_f1': 0.5798825378064256, 'eval_runtime': 4.9004, 'eval_samples_per_second': 23.671, 'eval_steps_per_second': 5.918, 'epoch': 7.0}
{'loss': 0.3689, 'grad_norm': 1.1690820455551147, 'learning_rate': 1.336206896551724e-05, 'epoch': 7.33}
{'loss': 0.375, 'grad_norm': 34.43851089477539, 'learning_rate': 1.1206896551724138e-05, 'epoch': 7.76}


  0%|          | 0/29 [00:00<?, ?it/s]

{'eval_loss': 1.9807525873184204, 'eval_accuracy': 0.7068965517241379, 'eval_f1': 0.6708222811671087, 'eval_runtime': 4.9003, 'eval_samples_per_second': 23.672, 'eval_steps_per_second': 5.918, 'epoch': 8.0}
{'loss': 0.2613, 'grad_norm': 6.4222941398620605, 'learning_rate': 9.051724137931036e-06, 'epoch': 8.19}
{'loss': 0.3126, 'grad_norm': 2.971757411956787, 'learning_rate': 6.896551724137932e-06, 'epoch': 8.62}


  0%|          | 0/29 [00:00<?, ?it/s]

{'eval_loss': 2.0510575771331787, 'eval_accuracy': 0.6982758620689655, 'eval_f1': 0.669916582540779, 'eval_runtime': 4.8839, 'eval_samples_per_second': 23.752, 'eval_steps_per_second': 5.938, 'epoch': 9.0}
{'loss': 0.2889, 'grad_norm': 3.1501693725585938, 'learning_rate': 4.741379310344828e-06, 'epoch': 9.05}
{'loss': 0.2823, 'grad_norm': 0.9688918590545654, 'learning_rate': 2.586206896551724e-06, 'epoch': 9.48}
{'loss': 0.2865, 'grad_norm': 2.2677109241485596, 'learning_rate': 4.3103448275862073e-07, 'epoch': 9.91}


  0%|          | 0/29 [00:00<?, ?it/s]

{'eval_loss': 2.442965507507324, 'eval_accuracy': 0.6637931034482759, 'eval_f1': 0.61103932811112, 'eval_runtime': 4.916, 'eval_samples_per_second': 23.596, 'eval_steps_per_second': 5.899, 'epoch': 10.0}
{'train_runtime': 749.089, 'train_samples_per_second': 6.154, 'train_steps_per_second': 1.549, 'train_loss': 0.45369186709667075, 'epoch': 10.0}


  0%|          | 0/29 [00:00<?, ?it/s]

[User 52979] Final eval metrics: {'eval_loss': 0.6625968813896179, 'eval_accuracy': 0.6896551724137931, 'eval_f1': 0.6988430127041743, 'eval_runtime': 4.9147, 'eval_samples_per_second': 23.603, 'eval_steps_per_second': 5.901, 'epoch': 10.0}


events.out.tfevents.1740087228.DN0a1e6b6f.SUNet.13394.64:   0%|          | 0.00/457 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/598M [00:00<?, ?B/s]

Upload 4 LFS files:   0%|          | 0/4 [00:00<?, ?it/s]

training_args.bin:   0%|          | 0.00/5.43k [00:00<?, ?B/s]

events.out.tfevents.1740086474.DN0a1e6b6f.SUNet.13394.63:   0%|          | 0.00/14.6k [00:00<?, ?B/s]

[User 52979] Done! Best model saved at: user_models/user_52979

=== Training model for User 52171 (Answers=579) ===


Map:   0%|          | 0/463 [00:00<?, ? examples/s]

Map:   0%|          | 0/116 [00:00<?, ? examples/s]

Some weights of ModernBertForSequenceClassification were not initialized from the model checkpoint at answerdotai/ModernBERT-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


[User 52171] Starting from scratch...


  0%|          | 0/1160 [00:00<?, ?it/s]

{'loss': 0.5661, 'grad_norm': 0.21722523868083954, 'learning_rate': 4.78448275862069e-05, 'epoch': 0.43}
{'loss': 0.6017, 'grad_norm': 3.620434522628784, 'learning_rate': 4.5689655172413794e-05, 'epoch': 0.86}


  0%|          | 0/29 [00:00<?, ?it/s]

{'eval_loss': 0.6944404244422913, 'eval_accuracy': 0.853448275862069, 'eval_f1': 0.7859663191659984, 'eval_runtime': 4.9493, 'eval_samples_per_second': 23.438, 'eval_steps_per_second': 5.859, 'epoch': 1.0}
{'loss': 0.3906, 'grad_norm': 0.6139665246009827, 'learning_rate': 4.353448275862069e-05, 'epoch': 1.29}
{'loss': 0.4647, 'grad_norm': 0.35983359813690186, 'learning_rate': 4.1379310344827587e-05, 'epoch': 1.72}


  0%|          | 0/29 [00:00<?, ?it/s]

{'eval_loss': 0.42003947496414185, 'eval_accuracy': 0.853448275862069, 'eval_f1': 0.7859663191659984, 'eval_runtime': 4.8524, 'eval_samples_per_second': 23.906, 'eval_steps_per_second': 5.976, 'epoch': 2.0}
{'loss': 0.514, 'grad_norm': 2.7646923065185547, 'learning_rate': 3.922413793103448e-05, 'epoch': 2.16}
{'loss': 0.3449, 'grad_norm': 3.4657626152038574, 'learning_rate': 3.7068965517241385e-05, 'epoch': 2.59}


  0%|          | 0/29 [00:00<?, ?it/s]

{'eval_loss': 0.4265924096107483, 'eval_accuracy': 0.853448275862069, 'eval_f1': 0.7859663191659984, 'eval_runtime': 4.8605, 'eval_samples_per_second': 23.866, 'eval_steps_per_second': 5.966, 'epoch': 3.0}
{'loss': 0.4643, 'grad_norm': 2.6680757999420166, 'learning_rate': 3.4913793103448275e-05, 'epoch': 3.02}
{'loss': 0.3628, 'grad_norm': 0.4472881555557251, 'learning_rate': 3.275862068965517e-05, 'epoch': 3.45}
{'loss': 0.4472, 'grad_norm': 1.568207859992981, 'learning_rate': 3.060344827586207e-05, 'epoch': 3.88}


  0%|          | 0/29 [00:00<?, ?it/s]

{'eval_loss': 0.7192618250846863, 'eval_accuracy': 0.853448275862069, 'eval_f1': 0.7859663191659984, 'eval_runtime': 4.8771, 'eval_samples_per_second': 23.784, 'eval_steps_per_second': 5.946, 'epoch': 4.0}
{'loss': 0.4525, 'grad_norm': 0.7211773991584778, 'learning_rate': 2.844827586206897e-05, 'epoch': 4.31}
{'loss': 0.3653, 'grad_norm': 0.30269891023635864, 'learning_rate': 2.6293103448275862e-05, 'epoch': 4.74}


  0%|          | 0/29 [00:00<?, ?it/s]

{'eval_loss': 0.5816306471824646, 'eval_accuracy': 0.853448275862069, 'eval_f1': 0.7859663191659984, 'eval_runtime': 4.8549, 'eval_samples_per_second': 23.893, 'eval_steps_per_second': 5.973, 'epoch': 5.0}
{'loss': 0.4595, 'grad_norm': 0.6073949337005615, 'learning_rate': 2.413793103448276e-05, 'epoch': 5.17}
{'loss': 0.325, 'grad_norm': 0.20305407047271729, 'learning_rate': 2.1982758620689654e-05, 'epoch': 5.6}


  0%|          | 0/29 [00:00<?, ?it/s]

{'eval_loss': 0.43665608763694763, 'eval_accuracy': 0.8706896551724138, 'eval_f1': 0.8346485186655149, 'eval_runtime': 4.8617, 'eval_samples_per_second': 23.86, 'eval_steps_per_second': 5.965, 'epoch': 6.0}
{'loss': 0.3942, 'grad_norm': 0.6542053818702698, 'learning_rate': 1.9827586206896554e-05, 'epoch': 6.03}
{'loss': 0.3345, 'grad_norm': 35.637569427490234, 'learning_rate': 1.767241379310345e-05, 'epoch': 6.47}
{'loss': 0.2957, 'grad_norm': 9.415839195251465, 'learning_rate': 1.5517241379310346e-05, 'epoch': 6.9}


  0%|          | 0/29 [00:00<?, ?it/s]

{'eval_loss': 0.5658952593803406, 'eval_accuracy': 0.8706896551724138, 'eval_f1': 0.8346485186655149, 'eval_runtime': 4.878, 'eval_samples_per_second': 23.78, 'eval_steps_per_second': 5.945, 'epoch': 7.0}
{'loss': 0.2399, 'grad_norm': 0.26714515686035156, 'learning_rate': 1.336206896551724e-05, 'epoch': 7.33}
{'loss': 0.2999, 'grad_norm': 0.3731180429458618, 'learning_rate': 1.1206896551724138e-05, 'epoch': 7.76}


  0%|          | 0/29 [00:00<?, ?it/s]

{'eval_loss': 0.7242525219917297, 'eval_accuracy': 0.8706896551724138, 'eval_f1': 0.8346485186655149, 'eval_runtime': 4.8591, 'eval_samples_per_second': 23.873, 'eval_steps_per_second': 5.968, 'epoch': 8.0}
{'loss': 0.2446, 'grad_norm': 0.5044949054718018, 'learning_rate': 9.051724137931036e-06, 'epoch': 8.19}
{'loss': 0.2601, 'grad_norm': 0.298328697681427, 'learning_rate': 6.896551724137932e-06, 'epoch': 8.62}


  0%|          | 0/29 [00:00<?, ?it/s]

{'eval_loss': 1.014742136001587, 'eval_accuracy': 0.8706896551724138, 'eval_f1': 0.8346485186655149, 'eval_runtime': 4.856, 'eval_samples_per_second': 23.888, 'eval_steps_per_second': 5.972, 'epoch': 9.0}
{'loss': 0.1183, 'grad_norm': 0.06588847190141678, 'learning_rate': 4.741379310344828e-06, 'epoch': 9.05}
{'loss': 0.2691, 'grad_norm': 0.20073652267456055, 'learning_rate': 2.586206896551724e-06, 'epoch': 9.48}
{'loss': 0.1696, 'grad_norm': 0.5768171548843384, 'learning_rate': 4.3103448275862073e-07, 'epoch': 9.91}


  0%|          | 0/29 [00:00<?, ?it/s]

{'eval_loss': 0.971697211265564, 'eval_accuracy': 0.8706896551724138, 'eval_f1': 0.8346485186655149, 'eval_runtime': 4.8722, 'eval_samples_per_second': 23.809, 'eval_steps_per_second': 5.952, 'epoch': 10.0}
{'train_runtime': 746.5941, 'train_samples_per_second': 6.201, 'train_steps_per_second': 1.554, 'train_loss': 0.3631989271476351, 'epoch': 10.0}


  0%|          | 0/29 [00:00<?, ?it/s]

[User 52171] Final eval metrics: {'eval_loss': 0.43665608763694763, 'eval_accuracy': 0.8706896551724138, 'eval_f1': 0.8346485186655149, 'eval_runtime': 4.8615, 'eval_samples_per_second': 23.861, 'eval_steps_per_second': 5.965, 'epoch': 10.0}


training_args.bin:   0%|          | 0.00/5.43k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/598M [00:00<?, ?B/s]

Upload 4 LFS files:   0%|          | 0/4 [00:00<?, ?it/s]

events.out.tfevents.1740087261.DN0a1e6b6f.SUNet.13394.65:   0%|          | 0.00/14.6k [00:00<?, ?B/s]

events.out.tfevents.1740088013.DN0a1e6b6f.SUNet.13394.66:   0%|          | 0.00/457 [00:00<?, ?B/s]

[User 52171] Done! Best model saved at: user_models/user_52171

=== Training model for User 53024 (Answers=596) ===


Map:   0%|          | 0/476 [00:00<?, ? examples/s]

Map:   0%|          | 0/120 [00:00<?, ? examples/s]

Some weights of ModernBertForSequenceClassification were not initialized from the model checkpoint at answerdotai/ModernBERT-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


[User 53024] Starting from scratch...


  0%|          | 0/1190 [00:00<?, ?it/s]

{'loss': 0.8828, 'grad_norm': 7.452171802520752, 'learning_rate': 4.7899159663865554e-05, 'epoch': 0.42}
{'loss': 0.6879, 'grad_norm': 4.814461708068848, 'learning_rate': 4.579831932773109e-05, 'epoch': 0.84}


  0%|          | 0/30 [00:00<?, ?it/s]

{'eval_loss': 0.7603553533554077, 'eval_accuracy': 0.5833333333333334, 'eval_f1': 0.4298245614035087, 'eval_runtime': 5.0616, 'eval_samples_per_second': 23.708, 'eval_steps_per_second': 5.927, 'epoch': 1.0}
{'loss': 0.6734, 'grad_norm': 3.785757541656494, 'learning_rate': 4.369747899159664e-05, 'epoch': 1.26}
{'loss': 0.7698, 'grad_norm': 2.3328115940093994, 'learning_rate': 4.159663865546219e-05, 'epoch': 1.68}


  0%|          | 0/30 [00:00<?, ?it/s]

{'eval_loss': 0.6613768339157104, 'eval_accuracy': 0.5916666666666667, 'eval_f1': 0.5433091646526099, 'eval_runtime': 5.0549, 'eval_samples_per_second': 23.739, 'eval_steps_per_second': 5.935, 'epoch': 2.0}
{'loss': 0.6702, 'grad_norm': 5.849123954772949, 'learning_rate': 3.949579831932773e-05, 'epoch': 2.1}
{'loss': 0.6543, 'grad_norm': 2.9164631366729736, 'learning_rate': 3.739495798319328e-05, 'epoch': 2.52}
{'loss': 0.6787, 'grad_norm': 4.397680282592773, 'learning_rate': 3.529411764705883e-05, 'epoch': 2.94}


  0%|          | 0/30 [00:00<?, ?it/s]

{'eval_loss': 0.6741365790367126, 'eval_accuracy': 0.5833333333333334, 'eval_f1': 0.4298245614035087, 'eval_runtime': 5.0557, 'eval_samples_per_second': 23.736, 'eval_steps_per_second': 5.934, 'epoch': 3.0}
{'loss': 0.6164, 'grad_norm': 1.8681111335754395, 'learning_rate': 3.319327731092437e-05, 'epoch': 3.36}
{'loss': 0.6924, 'grad_norm': 1.5994086265563965, 'learning_rate': 3.1092436974789916e-05, 'epoch': 3.78}


  0%|          | 0/30 [00:00<?, ?it/s]

{'eval_loss': 0.6491380929946899, 'eval_accuracy': 0.65, 'eval_f1': 0.6442420681551116, 'eval_runtime': 5.0451, 'eval_samples_per_second': 23.786, 'eval_steps_per_second': 5.946, 'epoch': 4.0}
{'loss': 0.6526, 'grad_norm': 1.7433885335922241, 'learning_rate': 2.8991596638655467e-05, 'epoch': 4.2}
{'loss': 0.6026, 'grad_norm': 5.737093925476074, 'learning_rate': 2.689075630252101e-05, 'epoch': 4.62}


  0%|          | 0/30 [00:00<?, ?it/s]

{'eval_loss': 0.709734320640564, 'eval_accuracy': 0.5833333333333334, 'eval_f1': 0.586123511904762, 'eval_runtime': 5.0591, 'eval_samples_per_second': 23.719, 'eval_steps_per_second': 5.93, 'epoch': 5.0}
{'loss': 0.6296, 'grad_norm': 6.263751983642578, 'learning_rate': 2.4789915966386556e-05, 'epoch': 5.04}
{'loss': 0.5697, 'grad_norm': 1.4917885065078735, 'learning_rate': 2.26890756302521e-05, 'epoch': 5.46}
{'loss': 0.5982, 'grad_norm': 4.141619682312012, 'learning_rate': 2.058823529411765e-05, 'epoch': 5.88}


  0%|          | 0/30 [00:00<?, ?it/s]

{'eval_loss': 0.8315696716308594, 'eval_accuracy': 0.6, 'eval_f1': 0.5201159840521928, 'eval_runtime': 5.1732, 'eval_samples_per_second': 23.197, 'eval_steps_per_second': 5.799, 'epoch': 6.0}
{'loss': 0.6184, 'grad_norm': 18.486391067504883, 'learning_rate': 1.8487394957983196e-05, 'epoch': 6.3}
{'loss': 0.547, 'grad_norm': 5.931907653808594, 'learning_rate': 1.638655462184874e-05, 'epoch': 6.72}


  0%|          | 0/30 [00:00<?, ?it/s]

{'eval_loss': 0.8988639712333679, 'eval_accuracy': 0.625, 'eval_f1': 0.6068989333128694, 'eval_runtime': 5.0599, 'eval_samples_per_second': 23.716, 'eval_steps_per_second': 5.929, 'epoch': 7.0}
{'loss': 0.5518, 'grad_norm': 4.1270222663879395, 'learning_rate': 1.4285714285714285e-05, 'epoch': 7.14}
{'loss': 0.5409, 'grad_norm': 2.6953299045562744, 'learning_rate': 1.2184873949579832e-05, 'epoch': 7.56}
{'loss': 0.4963, 'grad_norm': 1.6217085123062134, 'learning_rate': 1.008403361344538e-05, 'epoch': 7.98}


  0%|          | 0/30 [00:00<?, ?it/s]

{'eval_loss': 0.9226388931274414, 'eval_accuracy': 0.6166666666666667, 'eval_f1': 0.6081481481481481, 'eval_runtime': 5.0733, 'eval_samples_per_second': 23.653, 'eval_steps_per_second': 5.913, 'epoch': 8.0}
{'loss': 0.4777, 'grad_norm': 2.020547389984131, 'learning_rate': 7.983193277310924e-06, 'epoch': 8.4}
{'loss': 0.5702, 'grad_norm': 8.358563423156738, 'learning_rate': 5.882352941176471e-06, 'epoch': 8.82}


  0%|          | 0/30 [00:00<?, ?it/s]

{'eval_loss': 1.0809396505355835, 'eval_accuracy': 0.625, 'eval_f1': 0.6068989333128694, 'eval_runtime': 5.0716, 'eval_samples_per_second': 23.661, 'eval_steps_per_second': 5.915, 'epoch': 9.0}
{'loss': 0.4343, 'grad_norm': 3.0617306232452393, 'learning_rate': 3.7815126050420167e-06, 'epoch': 9.24}
{'loss': 0.4701, 'grad_norm': 5.63592529296875, 'learning_rate': 1.680672268907563e-06, 'epoch': 9.66}


  0%|          | 0/30 [00:00<?, ?it/s]

{'eval_loss': 1.1174461841583252, 'eval_accuracy': 0.6166666666666667, 'eval_f1': 0.5998168498168499, 'eval_runtime': 5.1184, 'eval_samples_per_second': 23.445, 'eval_steps_per_second': 5.861, 'epoch': 10.0}
{'train_runtime': 768.4239, 'train_samples_per_second': 6.194, 'train_steps_per_second': 1.549, 'train_loss': 0.6095558214588326, 'epoch': 10.0}


  0%|          | 0/30 [00:00<?, ?it/s]

[User 53024] Final eval metrics: {'eval_loss': 0.6491380929946899, 'eval_accuracy': 0.65, 'eval_f1': 0.6442420681551116, 'eval_runtime': 5.1385, 'eval_samples_per_second': 23.353, 'eval_steps_per_second': 5.838, 'epoch': 10.0}


events.out.tfevents.1740088046.DN0a1e6b6f.SUNet.13394.67:   0%|          | 0.00/14.6k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/598M [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/5.43k [00:00<?, ?B/s]

Upload 4 LFS files:   0%|          | 0/4 [00:00<?, ?it/s]

events.out.tfevents.1740088819.DN0a1e6b6f.SUNet.13394.68:   0%|          | 0.00/457 [00:00<?, ?B/s]

[User 53024] Done! Best model saved at: user_models/user_53024

=== Training model for User 52989 (Answers=597) ===


Map:   0%|          | 0/477 [00:00<?, ? examples/s]

Map:   0%|          | 0/120 [00:00<?, ? examples/s]

Some weights of ModernBertForSequenceClassification were not initialized from the model checkpoint at answerdotai/ModernBERT-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


[User 52989] Starting from scratch...


  0%|          | 0/1200 [00:00<?, ?it/s]

{'loss': 0.8602, 'grad_norm': 8.630634307861328, 'learning_rate': 4.791666666666667e-05, 'epoch': 0.42}
{'loss': 0.5949, 'grad_norm': 13.008700370788574, 'learning_rate': 4.5833333333333334e-05, 'epoch': 0.83}


  0%|          | 0/30 [00:00<?, ?it/s]

{'eval_loss': 0.5777505040168762, 'eval_accuracy': 0.7583333333333333, 'eval_f1': 0.6541074249605056, 'eval_runtime': 5.0686, 'eval_samples_per_second': 23.675, 'eval_steps_per_second': 5.919, 'epoch': 1.0}
{'loss': 0.7408, 'grad_norm': 7.796112060546875, 'learning_rate': 4.375e-05, 'epoch': 1.25}
{'loss': 0.5667, 'grad_norm': 2.4560654163360596, 'learning_rate': 4.166666666666667e-05, 'epoch': 1.67}


  0%|          | 0/30 [00:00<?, ?it/s]

{'eval_loss': 0.5868874788284302, 'eval_accuracy': 0.7583333333333333, 'eval_f1': 0.6541074249605056, 'eval_runtime': 5.0912, 'eval_samples_per_second': 23.57, 'eval_steps_per_second': 5.893, 'epoch': 2.0}
{'loss': 0.6168, 'grad_norm': 5.061364650726318, 'learning_rate': 3.958333333333333e-05, 'epoch': 2.08}
{'loss': 0.5861, 'grad_norm': 1.5025814771652222, 'learning_rate': 3.7500000000000003e-05, 'epoch': 2.5}
{'loss': 0.583, 'grad_norm': 1.9211422204971313, 'learning_rate': 3.541666666666667e-05, 'epoch': 2.92}


  0%|          | 0/30 [00:00<?, ?it/s]

{'eval_loss': 0.5279809832572937, 'eval_accuracy': 0.775, 'eval_f1': 0.7136933797909408, 'eval_runtime': 5.0739, 'eval_samples_per_second': 23.65, 'eval_steps_per_second': 5.913, 'epoch': 3.0}
{'loss': 0.5406, 'grad_norm': 2.9999606609344482, 'learning_rate': 3.3333333333333335e-05, 'epoch': 3.33}
{'loss': 0.5453, 'grad_norm': 4.639827728271484, 'learning_rate': 3.125e-05, 'epoch': 3.75}


  0%|          | 0/30 [00:00<?, ?it/s]

{'eval_loss': 0.531540036201477, 'eval_accuracy': 0.7583333333333333, 'eval_f1': 0.7314814814814815, 'eval_runtime': 5.0533, 'eval_samples_per_second': 23.747, 'eval_steps_per_second': 5.937, 'epoch': 4.0}
{'loss': 0.6444, 'grad_norm': 1.5316095352172852, 'learning_rate': 2.916666666666667e-05, 'epoch': 4.17}
{'loss': 0.5219, 'grad_norm': 5.579716205596924, 'learning_rate': 2.7083333333333332e-05, 'epoch': 4.58}
{'loss': 0.5984, 'grad_norm': 8.292757987976074, 'learning_rate': 2.5e-05, 'epoch': 5.0}


  0%|          | 0/30 [00:00<?, ?it/s]

{'eval_loss': 0.6141159534454346, 'eval_accuracy': 0.725, 'eval_f1': 0.6875044268681384, 'eval_runtime': 5.0267, 'eval_samples_per_second': 23.873, 'eval_steps_per_second': 5.968, 'epoch': 5.0}
{'loss': 0.4082, 'grad_norm': 0.41211292147636414, 'learning_rate': 2.2916666666666667e-05, 'epoch': 5.42}
{'loss': 0.7011, 'grad_norm': 4.033714294433594, 'learning_rate': 2.0833333333333336e-05, 'epoch': 5.83}


  0%|          | 0/30 [00:00<?, ?it/s]

{'eval_loss': 0.5704391598701477, 'eval_accuracy': 0.7583333333333333, 'eval_f1': 0.6924854819976771, 'eval_runtime': 5.057, 'eval_samples_per_second': 23.729, 'eval_steps_per_second': 5.932, 'epoch': 6.0}
{'loss': 0.4311, 'grad_norm': 0.5008652210235596, 'learning_rate': 1.8750000000000002e-05, 'epoch': 6.25}
{'loss': 0.4868, 'grad_norm': 2.4662160873413086, 'learning_rate': 1.6666666666666667e-05, 'epoch': 6.67}


  0%|          | 0/30 [00:00<?, ?it/s]

{'eval_loss': 0.745458722114563, 'eval_accuracy': 0.725, 'eval_f1': 0.6875044268681384, 'eval_runtime': 5.0615, 'eval_samples_per_second': 23.708, 'eval_steps_per_second': 5.927, 'epoch': 7.0}
{'loss': 0.5115, 'grad_norm': 0.6501550078392029, 'learning_rate': 1.4583333333333335e-05, 'epoch': 7.08}
{'loss': 0.4245, 'grad_norm': 4.758299827575684, 'learning_rate': 1.25e-05, 'epoch': 7.5}
{'loss': 0.4696, 'grad_norm': 4.277594566345215, 'learning_rate': 1.0416666666666668e-05, 'epoch': 7.92}


  0%|          | 0/30 [00:00<?, ?it/s]

{'eval_loss': 0.7030568718910217, 'eval_accuracy': 0.725, 'eval_f1': 0.6797340360338276, 'eval_runtime': 5.0552, 'eval_samples_per_second': 23.738, 'eval_steps_per_second': 5.934, 'epoch': 8.0}
{'loss': 0.401, 'grad_norm': 3.054361581802368, 'learning_rate': 8.333333333333334e-06, 'epoch': 8.33}
{'loss': 0.4697, 'grad_norm': 2.5863826274871826, 'learning_rate': 6.25e-06, 'epoch': 8.75}


  0%|          | 0/30 [00:00<?, ?it/s]

{'eval_loss': 0.8112096190452576, 'eval_accuracy': 0.7333333333333333, 'eval_f1': 0.6763592148688553, 'eval_runtime': 5.0657, 'eval_samples_per_second': 23.689, 'eval_steps_per_second': 5.922, 'epoch': 9.0}
{'loss': 0.4455, 'grad_norm': 9.950785636901855, 'learning_rate': 4.166666666666667e-06, 'epoch': 9.17}
{'loss': 0.3896, 'grad_norm': 0.6737775802612305, 'learning_rate': 2.0833333333333334e-06, 'epoch': 9.58}
{'loss': 0.4771, 'grad_norm': 1.022727370262146, 'learning_rate': 0.0, 'epoch': 10.0}


  0%|          | 0/30 [00:00<?, ?it/s]

{'eval_loss': 0.9093745350837708, 'eval_accuracy': 0.7333333333333333, 'eval_f1': 0.6763592148688553, 'eval_runtime': 5.0647, 'eval_samples_per_second': 23.693, 'eval_steps_per_second': 5.923, 'epoch': 10.0}
{'train_runtime': 771.4893, 'train_samples_per_second': 6.183, 'train_steps_per_second': 1.555, 'train_loss': 0.5422897291183472, 'epoch': 10.0}


  0%|          | 0/30 [00:00<?, ?it/s]

[User 52989] Final eval metrics: {'eval_loss': 0.531540036201477, 'eval_accuracy': 0.7583333333333333, 'eval_f1': 0.7314814814814815, 'eval_runtime': 5.0555, 'eval_samples_per_second': 23.737, 'eval_steps_per_second': 5.934, 'epoch': 10.0}


events.out.tfevents.1740089629.DN0a1e6b6f.SUNet.13394.70:   0%|          | 0.00/457 [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/5.43k [00:00<?, ?B/s]

events.out.tfevents.1740088853.DN0a1e6b6f.SUNet.13394.69:   0%|          | 0.00/14.9k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/598M [00:00<?, ?B/s]

Upload 4 LFS files:   0%|          | 0/4 [00:00<?, ?it/s]

[User 52989] Done! Best model saved at: user_models/user_52989

=== Training model for User 52912 (Answers=602) ===


Map:   0%|          | 0/481 [00:00<?, ? examples/s]

Map:   0%|          | 0/121 [00:00<?, ? examples/s]

Some weights of ModernBertForSequenceClassification were not initialized from the model checkpoint at answerdotai/ModernBERT-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


[User 52912] Starting from scratch...


  0%|          | 0/1210 [00:00<?, ?it/s]

{'loss': 0.9676, 'grad_norm': 27.799015045166016, 'learning_rate': 4.793388429752066e-05, 'epoch': 0.41}
{'loss': 0.9183, 'grad_norm': 7.1295928955078125, 'learning_rate': 4.586776859504133e-05, 'epoch': 0.83}


  0%|          | 0/31 [00:00<?, ?it/s]

{'eval_loss': 0.6630536317825317, 'eval_accuracy': 0.7355371900826446, 'eval_f1': 0.6449266091821679, 'eval_runtime': 5.1295, 'eval_samples_per_second': 23.589, 'eval_steps_per_second': 6.043, 'epoch': 1.0}
{'loss': 0.5681, 'grad_norm': 2.273611068725586, 'learning_rate': 4.3801652892561984e-05, 'epoch': 1.24}
{'loss': 0.686, 'grad_norm': 1.8391352891921997, 'learning_rate': 4.1735537190082645e-05, 'epoch': 1.65}


  0%|          | 0/31 [00:00<?, ?it/s]

{'eval_loss': 0.7324346303939819, 'eval_accuracy': 0.7107438016528925, 'eval_f1': 0.5905697289096499, 'eval_runtime': 5.1266, 'eval_samples_per_second': 23.602, 'eval_steps_per_second': 6.047, 'epoch': 2.0}
{'loss': 0.5246, 'grad_norm': 2.4367523193359375, 'learning_rate': 3.9669421487603306e-05, 'epoch': 2.07}
{'loss': 0.5731, 'grad_norm': 1.0966717004776, 'learning_rate': 3.760330578512397e-05, 'epoch': 2.48}
{'loss': 0.5257, 'grad_norm': 1.4551324844360352, 'learning_rate': 3.553719008264463e-05, 'epoch': 2.89}


  0%|          | 0/31 [00:00<?, ?it/s]

{'eval_loss': 0.633296549320221, 'eval_accuracy': 0.6776859504132231, 'eval_f1': 0.626153874387506, 'eval_runtime': 5.1358, 'eval_samples_per_second': 23.56, 'eval_steps_per_second': 6.036, 'epoch': 3.0}
{'loss': 0.5936, 'grad_norm': 1.6217870712280273, 'learning_rate': 3.347107438016529e-05, 'epoch': 3.31}
{'loss': 0.6187, 'grad_norm': 1.3563333749771118, 'learning_rate': 3.1404958677685955e-05, 'epoch': 3.72}


  0%|          | 0/31 [00:00<?, ?it/s]

{'eval_loss': 0.5985391139984131, 'eval_accuracy': 0.7107438016528925, 'eval_f1': 0.5905697289096499, 'eval_runtime': 5.1493, 'eval_samples_per_second': 23.498, 'eval_steps_per_second': 6.02, 'epoch': 4.0}
{'loss': 0.5549, 'grad_norm': 1.8407971858978271, 'learning_rate': 2.9338842975206616e-05, 'epoch': 4.13}
{'loss': 0.5625, 'grad_norm': 1.2623966932296753, 'learning_rate': 2.7272727272727273e-05, 'epoch': 4.55}
{'loss': 0.5858, 'grad_norm': 2.4956252574920654, 'learning_rate': 2.5206611570247934e-05, 'epoch': 4.96}


  0%|          | 0/31 [00:00<?, ?it/s]

{'eval_loss': 0.5971818566322327, 'eval_accuracy': 0.6859504132231405, 'eval_f1': 0.5783503484038244, 'eval_runtime': 5.1328, 'eval_samples_per_second': 23.574, 'eval_steps_per_second': 6.04, 'epoch': 5.0}
{'loss': 0.4879, 'grad_norm': 3.381999969482422, 'learning_rate': 2.3140495867768598e-05, 'epoch': 5.37}
{'loss': 0.5963, 'grad_norm': 2.9258294105529785, 'learning_rate': 2.1074380165289255e-05, 'epoch': 5.79}


  0%|          | 0/31 [00:00<?, ?it/s]

{'eval_loss': 0.7665849924087524, 'eval_accuracy': 0.71900826446281, 'eval_f1': 0.6095063610509329, 'eval_runtime': 5.1599, 'eval_samples_per_second': 23.45, 'eval_steps_per_second': 6.008, 'epoch': 6.0}
{'loss': 0.5645, 'grad_norm': 1.4010648727416992, 'learning_rate': 1.900826446280992e-05, 'epoch': 6.2}
{'loss': 0.4543, 'grad_norm': 0.5545366406440735, 'learning_rate': 1.694214876033058e-05, 'epoch': 6.61}


  0%|          | 0/31 [00:00<?, ?it/s]

{'eval_loss': 0.8971520066261292, 'eval_accuracy': 0.6363636363636364, 'eval_f1': 0.604594330400782, 'eval_runtime': 5.1208, 'eval_samples_per_second': 23.629, 'eval_steps_per_second': 6.054, 'epoch': 7.0}
{'loss': 0.4319, 'grad_norm': 4.506198406219482, 'learning_rate': 1.487603305785124e-05, 'epoch': 7.02}
{'loss': 0.4426, 'grad_norm': 1.2599655389785767, 'learning_rate': 1.2809917355371901e-05, 'epoch': 7.44}
{'loss': 0.5685, 'grad_norm': 2.227383613586426, 'learning_rate': 1.0743801652892564e-05, 'epoch': 7.85}


  0%|          | 0/31 [00:00<?, ?it/s]

{'eval_loss': 0.8760265111923218, 'eval_accuracy': 0.7024793388429752, 'eval_f1': 0.6341096919609317, 'eval_runtime': 5.1541, 'eval_samples_per_second': 23.476, 'eval_steps_per_second': 6.015, 'epoch': 8.0}
{'loss': 0.4361, 'grad_norm': 5.771166801452637, 'learning_rate': 8.677685950413224e-06, 'epoch': 8.26}
{'loss': 0.4627, 'grad_norm': 4.15530252456665, 'learning_rate': 6.611570247933885e-06, 'epoch': 8.68}


  0%|          | 0/31 [00:00<?, ?it/s]

{'eval_loss': 0.8976389765739441, 'eval_accuracy': 0.6694214876033058, 'eval_f1': 0.6278649580084987, 'eval_runtime': 5.1304, 'eval_samples_per_second': 23.585, 'eval_steps_per_second': 6.042, 'epoch': 9.0}
{'loss': 0.4004, 'grad_norm': 0.8380221724510193, 'learning_rate': 4.5454545454545455e-06, 'epoch': 9.09}
{'loss': 0.4342, 'grad_norm': 2.080738067626953, 'learning_rate': 2.4793388429752066e-06, 'epoch': 9.5}
{'loss': 0.3526, 'grad_norm': 3.6189234256744385, 'learning_rate': 4.132231404958678e-07, 'epoch': 9.92}


  0%|          | 0/31 [00:00<?, ?it/s]

{'eval_loss': 0.9439116716384888, 'eval_accuracy': 0.6776859504132231, 'eval_f1': 0.6404896799407949, 'eval_runtime': 5.1277, 'eval_samples_per_second': 23.597, 'eval_steps_per_second': 6.046, 'epoch': 10.0}
{'train_runtime': 775.9672, 'train_samples_per_second': 6.199, 'train_steps_per_second': 1.559, 'train_loss': 0.5522258017673966, 'epoch': 10.0}


  0%|          | 0/31 [00:00<?, ?it/s]

[User 52912] Final eval metrics: {'eval_loss': 0.6630536317825317, 'eval_accuracy': 0.7355371900826446, 'eval_f1': 0.6449266091821679, 'eval_runtime': 5.1462, 'eval_samples_per_second': 23.513, 'eval_steps_per_second': 6.024, 'epoch': 10.0}


events.out.tfevents.1740089663.DN0a1e6b6f.SUNet.13394.71:   0%|          | 0.00/14.9k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/598M [00:00<?, ?B/s]

Upload 4 LFS files:   0%|          | 0/4 [00:00<?, ?it/s]

events.out.tfevents.1740090444.DN0a1e6b6f.SUNet.13394.72:   0%|          | 0.00/457 [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/5.43k [00:00<?, ?B/s]

[User 52912] Done! Best model saved at: user_models/user_52912

=== Training model for User 52991 (Answers=616) ===


Map:   0%|          | 0/492 [00:00<?, ? examples/s]

Map:   0%|          | 0/124 [00:00<?, ? examples/s]

Some weights of ModernBertForSequenceClassification were not initialized from the model checkpoint at answerdotai/ModernBERT-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


[User 52991] Starting from scratch...


  0%|          | 0/1230 [00:00<?, ?it/s]

{'loss': 0.5668, 'grad_norm': 0.4901951849460602, 'learning_rate': 4.796747967479675e-05, 'epoch': 0.41}
{'loss': 0.5076, 'grad_norm': 1.2764301300048828, 'learning_rate': 4.59349593495935e-05, 'epoch': 0.81}


  0%|          | 0/31 [00:00<?, ?it/s]

{'eval_loss': 0.482997328042984, 'eval_accuracy': 0.8790322580645161, 'eval_f1': 0.8224421985324658, 'eval_runtime': 5.2289, 'eval_samples_per_second': 23.715, 'eval_steps_per_second': 5.929, 'epoch': 1.0}
{'loss': 0.4347, 'grad_norm': 0.13889920711517334, 'learning_rate': 4.390243902439025e-05, 'epoch': 1.22}
{'loss': 0.4526, 'grad_norm': 0.16600972414016724, 'learning_rate': 4.186991869918699e-05, 'epoch': 1.63}


  0%|          | 0/31 [00:00<?, ?it/s]

{'eval_loss': 0.4928465485572815, 'eval_accuracy': 0.8790322580645161, 'eval_f1': 0.8224421985324658, 'eval_runtime': 5.2511, 'eval_samples_per_second': 23.614, 'eval_steps_per_second': 5.904, 'epoch': 2.0}
{'loss': 0.3294, 'grad_norm': 0.2907229959964752, 'learning_rate': 3.983739837398374e-05, 'epoch': 2.03}
{'loss': 0.4205, 'grad_norm': 4.962575435638428, 'learning_rate': 3.780487804878049e-05, 'epoch': 2.44}
{'loss': 0.2756, 'grad_norm': 0.18938970565795898, 'learning_rate': 3.577235772357724e-05, 'epoch': 2.85}


  0%|          | 0/31 [00:00<?, ?it/s]

{'eval_loss': 0.4606013000011444, 'eval_accuracy': 0.8790322580645161, 'eval_f1': 0.8224421985324658, 'eval_runtime': 5.2367, 'eval_samples_per_second': 23.679, 'eval_steps_per_second': 5.92, 'epoch': 3.0}
{'loss': 0.2404, 'grad_norm': 0.19231177866458893, 'learning_rate': 3.373983739837399e-05, 'epoch': 3.25}
{'loss': 0.38, 'grad_norm': 9.687187194824219, 'learning_rate': 3.170731707317073e-05, 'epoch': 3.66}


  0%|          | 0/31 [00:00<?, ?it/s]

{'eval_loss': 0.4357147216796875, 'eval_accuracy': 0.8790322580645161, 'eval_f1': 0.8224421985324658, 'eval_runtime': 5.23, 'eval_samples_per_second': 23.71, 'eval_steps_per_second': 5.927, 'epoch': 4.0}
{'loss': 0.3263, 'grad_norm': 0.22143788635730743, 'learning_rate': 2.9674796747967482e-05, 'epoch': 4.07}
{'loss': 0.2769, 'grad_norm': 0.09173323214054108, 'learning_rate': 2.764227642276423e-05, 'epoch': 4.47}
{'loss': 0.3232, 'grad_norm': 0.21085265278816223, 'learning_rate': 2.5609756097560977e-05, 'epoch': 4.88}


  0%|          | 0/31 [00:00<?, ?it/s]

{'eval_loss': 0.51115882396698, 'eval_accuracy': 0.8870967741935484, 'eval_f1': 0.8411081757508343, 'eval_runtime': 6.2189, 'eval_samples_per_second': 19.939, 'eval_steps_per_second': 4.985, 'epoch': 5.0}
{'loss': 0.2659, 'grad_norm': 3.0059568881988525, 'learning_rate': 2.3577235772357724e-05, 'epoch': 5.28}
{'loss': 0.2722, 'grad_norm': 0.7431789636611938, 'learning_rate': 2.1544715447154475e-05, 'epoch': 5.69}


  0%|          | 0/31 [00:00<?, ?it/s]

{'eval_loss': 0.5154097080230713, 'eval_accuracy': 0.8790322580645161, 'eval_f1': 0.8224421985324658, 'eval_runtime': 6.1841, 'eval_samples_per_second': 20.052, 'eval_steps_per_second': 5.013, 'epoch': 6.0}
{'loss': 0.2107, 'grad_norm': 4.669829845428467, 'learning_rate': 1.9512195121951222e-05, 'epoch': 6.1}
{'loss': 0.2287, 'grad_norm': 0.20341315865516663, 'learning_rate': 1.747967479674797e-05, 'epoch': 6.5}
{'loss': 0.2302, 'grad_norm': 0.22016245126724243, 'learning_rate': 1.5447154471544717e-05, 'epoch': 6.91}


  0%|          | 0/31 [00:00<?, ?it/s]

{'eval_loss': 0.5740196108818054, 'eval_accuracy': 0.8790322580645161, 'eval_f1': 0.8224421985324658, 'eval_runtime': 6.1829, 'eval_samples_per_second': 20.055, 'eval_steps_per_second': 5.014, 'epoch': 7.0}
{'loss': 0.1827, 'grad_norm': 0.03740270808339119, 'learning_rate': 1.3414634146341466e-05, 'epoch': 7.32}
{'loss': 0.2432, 'grad_norm': 0.23656319081783295, 'learning_rate': 1.1382113821138211e-05, 'epoch': 7.72}


  0%|          | 0/31 [00:00<?, ?it/s]

{'eval_loss': 0.7198987007141113, 'eval_accuracy': 0.8870967741935484, 'eval_f1': 0.8411081757508343, 'eval_runtime': 6.1601, 'eval_samples_per_second': 20.13, 'eval_steps_per_second': 5.032, 'epoch': 8.0}
{'loss': 0.1199, 'grad_norm': 0.4932897984981537, 'learning_rate': 9.34959349593496e-06, 'epoch': 8.13}
{'loss': 0.1739, 'grad_norm': 0.023669864982366562, 'learning_rate': 7.317073170731707e-06, 'epoch': 8.54}
{'loss': 0.186, 'grad_norm': 3.7607533931732178, 'learning_rate': 5.2845528455284555e-06, 'epoch': 8.94}


  0%|          | 0/31 [00:00<?, ?it/s]

{'eval_loss': 0.7503798604011536, 'eval_accuracy': 0.8790322580645161, 'eval_f1': 0.8361837403583133, 'eval_runtime': 6.0544, 'eval_samples_per_second': 20.481, 'eval_steps_per_second': 5.12, 'epoch': 9.0}
{'loss': 0.1787, 'grad_norm': 0.5876972079277039, 'learning_rate': 3.2520325203252037e-06, 'epoch': 9.35}
{'loss': 0.1441, 'grad_norm': 0.1225999966263771, 'learning_rate': 1.2195121951219514e-06, 'epoch': 9.76}


  0%|          | 0/31 [00:00<?, ?it/s]

{'eval_loss': 0.8547011613845825, 'eval_accuracy': 0.8790322580645161, 'eval_f1': 0.8361837403583133, 'eval_runtime': 6.0903, 'eval_samples_per_second': 20.36, 'eval_steps_per_second': 5.09, 'epoch': 10.0}
{'train_runtime': 862.1013, 'train_samples_per_second': 5.707, 'train_steps_per_second': 1.427, 'train_loss': 0.2862761319168215, 'epoch': 10.0}


  0%|          | 0/31 [00:00<?, ?it/s]

[User 52991] Final eval metrics: {'eval_loss': 0.51115882396698, 'eval_accuracy': 0.8870967741935484, 'eval_f1': 0.8411081757508343, 'eval_runtime': 6.1075, 'eval_samples_per_second': 20.303, 'eval_steps_per_second': 5.076, 'epoch': 10.0}


training_args.bin:   0%|          | 0.00/5.43k [00:00<?, ?B/s]

Upload 4 LFS files:   0%|          | 0/4 [00:00<?, ?it/s]

events.out.tfevents.1740090477.DN0a1e6b6f.SUNet.13394.73:   0%|          | 0.00/14.9k [00:00<?, ?B/s]

events.out.tfevents.1740091345.DN0a1e6b6f.SUNet.13394.74:   0%|          | 0.00/457 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/598M [00:00<?, ?B/s]

[User 52991] Done! Best model saved at: user_models/user_52991

=== Training model for User 30844 (Answers=635) ===


Map:   0%|          | 0/508 [00:00<?, ? examples/s]

Map:   0%|          | 0/127 [00:00<?, ? examples/s]

Some weights of ModernBertForSequenceClassification were not initialized from the model checkpoint at answerdotai/ModernBERT-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


[User 30844] Starting from scratch...


  0%|          | 0/1270 [00:00<?, ?it/s]

{'loss': 0.4845, 'grad_norm': 0.5062860250473022, 'learning_rate': 4.8031496062992124e-05, 'epoch': 0.39}
{'loss': 0.4435, 'grad_norm': 4.17603063583374, 'learning_rate': 4.606299212598425e-05, 'epoch': 0.79}


  0%|          | 0/32 [00:00<?, ?it/s]

{'eval_loss': 0.30602675676345825, 'eval_accuracy': 0.9291338582677166, 'eval_f1': 0.8950024104129841, 'eval_runtime': 6.2567, 'eval_samples_per_second': 20.298, 'eval_steps_per_second': 5.115, 'epoch': 1.0}
{'loss': 0.3976, 'grad_norm': 0.5468829870223999, 'learning_rate': 4.409448818897638e-05, 'epoch': 1.18}
{'loss': 0.3124, 'grad_norm': 0.20963113009929657, 'learning_rate': 4.21259842519685e-05, 'epoch': 1.57}
{'loss': 0.4365, 'grad_norm': 3.075038433074951, 'learning_rate': 4.015748031496063e-05, 'epoch': 1.97}


  0%|          | 0/32 [00:00<?, ?it/s]

{'eval_loss': 0.2606280744075775, 'eval_accuracy': 0.9291338582677166, 'eval_f1': 0.8950024104129841, 'eval_runtime': 6.3164, 'eval_samples_per_second': 20.106, 'eval_steps_per_second': 5.066, 'epoch': 2.0}
{'loss': 0.3369, 'grad_norm': 0.25327402353286743, 'learning_rate': 3.818897637795276e-05, 'epoch': 2.36}
{'loss': 0.4183, 'grad_norm': 0.5543381571769714, 'learning_rate': 3.622047244094489e-05, 'epoch': 2.76}


  0%|          | 0/32 [00:00<?, ?it/s]

{'eval_loss': 0.34263139963150024, 'eval_accuracy': 0.9291338582677166, 'eval_f1': 0.8950024104129841, 'eval_runtime': 6.3494, 'eval_samples_per_second': 20.002, 'eval_steps_per_second': 5.04, 'epoch': 3.0}
{'loss': 0.293, 'grad_norm': 0.09615466743707657, 'learning_rate': 3.425196850393701e-05, 'epoch': 3.15}
{'loss': 0.3759, 'grad_norm': 4.894548416137695, 'learning_rate': 3.228346456692913e-05, 'epoch': 3.54}
{'loss': 0.3738, 'grad_norm': 23.126298904418945, 'learning_rate': 3.0314960629921263e-05, 'epoch': 3.94}


  0%|          | 0/32 [00:00<?, ?it/s]

{'eval_loss': 0.2754558324813843, 'eval_accuracy': 0.9291338582677166, 'eval_f1': 0.8950024104129841, 'eval_runtime': 6.296, 'eval_samples_per_second': 20.172, 'eval_steps_per_second': 5.083, 'epoch': 4.0}
{'loss': 0.3784, 'grad_norm': 0.2950327396392822, 'learning_rate': 2.8346456692913388e-05, 'epoch': 4.33}
{'loss': 0.3665, 'grad_norm': 5.1982293128967285, 'learning_rate': 2.637795275590551e-05, 'epoch': 4.72}


  0%|          | 0/32 [00:00<?, ?it/s]

{'eval_loss': 0.3120919167995453, 'eval_accuracy': 0.9291338582677166, 'eval_f1': 0.8950024104129841, 'eval_runtime': 6.3264, 'eval_samples_per_second': 20.075, 'eval_steps_per_second': 5.058, 'epoch': 5.0}
{'loss': 0.234, 'grad_norm': 0.1373729407787323, 'learning_rate': 2.440944881889764e-05, 'epoch': 5.12}
{'loss': 0.3821, 'grad_norm': 0.3650893568992615, 'learning_rate': 2.2440944881889763e-05, 'epoch': 5.51}
{'loss': 0.2703, 'grad_norm': 0.19874192774295807, 'learning_rate': 2.0472440944881892e-05, 'epoch': 5.91}


  0%|          | 0/32 [00:00<?, ?it/s]

{'eval_loss': 0.32579952478408813, 'eval_accuracy': 0.9212598425196851, 'eval_f1': 0.8910546017813347, 'eval_runtime': 6.344, 'eval_samples_per_second': 20.019, 'eval_steps_per_second': 5.044, 'epoch': 6.0}
{'loss': 0.416, 'grad_norm': 0.5192802548408508, 'learning_rate': 1.8503937007874017e-05, 'epoch': 6.3}
{'loss': 0.2084, 'grad_norm': 0.03886600583791733, 'learning_rate': 1.6535433070866142e-05, 'epoch': 6.69}


  0%|          | 0/32 [00:00<?, ?it/s]

{'eval_loss': 0.40465831756591797, 'eval_accuracy': 0.9133858267716536, 'eval_f1': 0.8870743008975729, 'eval_runtime': 6.3239, 'eval_samples_per_second': 20.083, 'eval_steps_per_second': 5.06, 'epoch': 7.0}
{'loss': 0.2228, 'grad_norm': 0.38913843035697937, 'learning_rate': 1.4566929133858267e-05, 'epoch': 7.09}
{'loss': 0.2811, 'grad_norm': 0.23647604882717133, 'learning_rate': 1.2598425196850394e-05, 'epoch': 7.48}
{'loss': 0.1048, 'grad_norm': 0.1569388061761856, 'learning_rate': 1.062992125984252e-05, 'epoch': 7.87}


  0%|          | 0/32 [00:00<?, ?it/s]

{'eval_loss': 0.5937107801437378, 'eval_accuracy': 0.9212598425196851, 'eval_f1': 0.902550920804321, 'eval_runtime': 6.3057, 'eval_samples_per_second': 20.141, 'eval_steps_per_second': 5.075, 'epoch': 8.0}
{'loss': 0.2332, 'grad_norm': 0.4931308329105377, 'learning_rate': 8.661417322834646e-06, 'epoch': 8.27}
{'loss': 0.1436, 'grad_norm': 0.1287035048007965, 'learning_rate': 6.692913385826772e-06, 'epoch': 8.66}


  0%|          | 0/32 [00:00<?, ?it/s]

{'eval_loss': 0.5503928661346436, 'eval_accuracy': 0.9212598425196851, 'eval_f1': 0.902550920804321, 'eval_runtime': 6.294, 'eval_samples_per_second': 20.178, 'eval_steps_per_second': 5.084, 'epoch': 9.0}
{'loss': 0.2253, 'grad_norm': 4.9874372482299805, 'learning_rate': 4.7244094488188975e-06, 'epoch': 9.06}
{'loss': 0.1103, 'grad_norm': 0.043202582746744156, 'learning_rate': 2.755905511811024e-06, 'epoch': 9.45}
{'loss': 0.1614, 'grad_norm': 1.008959174156189, 'learning_rate': 7.874015748031496e-07, 'epoch': 9.84}


  0%|          | 0/32 [00:00<?, ?it/s]

{'eval_loss': 0.5883382558822632, 'eval_accuracy': 0.9212598425196851, 'eval_f1': 0.902550920804321, 'eval_runtime': 6.2972, 'eval_samples_per_second': 20.168, 'eval_steps_per_second': 5.082, 'epoch': 10.0}
{'train_runtime': 947.6092, 'train_samples_per_second': 5.361, 'train_steps_per_second': 1.34, 'train_loss': 0.3030680712752455, 'epoch': 10.0}


  0%|          | 0/32 [00:00<?, ?it/s]

[User 30844] Final eval metrics: {'eval_loss': 0.5937107801437378, 'eval_accuracy': 0.9212598425196851, 'eval_f1': 0.902550920804321, 'eval_runtime': 6.3053, 'eval_samples_per_second': 20.142, 'eval_steps_per_second': 5.075, 'epoch': 10.0}


model.safetensors:   0%|          | 0.00/598M [00:00<?, ?B/s]

events.out.tfevents.1740091377.DN0a1e6b6f.SUNet.13394.75:   0%|          | 0.00/15.1k [00:00<?, ?B/s]

Upload 4 LFS files:   0%|          | 0/4 [00:00<?, ?it/s]

training_args.bin:   0%|          | 0.00/5.43k [00:00<?, ?B/s]

events.out.tfevents.1740092331.DN0a1e6b6f.SUNet.13394.76:   0%|          | 0.00/457 [00:00<?, ?B/s]

[User 30844] Done! Best model saved at: user_models/user_30844

=== Training model for User 52174 (Answers=646) ===


Map:   0%|          | 0/516 [00:00<?, ? examples/s]

Map:   0%|          | 0/130 [00:00<?, ? examples/s]

Some weights of ModernBertForSequenceClassification were not initialized from the model checkpoint at answerdotai/ModernBERT-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


[User 52174] Starting from scratch...


  0%|          | 0/1290 [00:00<?, ?it/s]

{'loss': 0.8784, 'grad_norm': 3.485852003097534, 'learning_rate': 4.8062015503875976e-05, 'epoch': 0.39}
{'loss': 0.7015, 'grad_norm': 8.442926406860352, 'learning_rate': 4.6124031007751936e-05, 'epoch': 0.78}


  0%|          | 0/33 [00:00<?, ?it/s]

{'eval_loss': 0.7320469617843628, 'eval_accuracy': 0.6384615384615384, 'eval_f1': 0.4975803539183821, 'eval_runtime': 5.4588, 'eval_samples_per_second': 23.815, 'eval_steps_per_second': 6.045, 'epoch': 1.0}
{'loss': 0.669, 'grad_norm': 4.5944976806640625, 'learning_rate': 4.418604651162791e-05, 'epoch': 1.16}
{'loss': 0.6395, 'grad_norm': 3.814293384552002, 'learning_rate': 4.2248062015503877e-05, 'epoch': 1.55}
{'loss': 0.6333, 'grad_norm': 4.437618255615234, 'learning_rate': 4.0310077519379843e-05, 'epoch': 1.94}


  0%|          | 0/33 [00:00<?, ?it/s]