In [1]:
import json
import os
import pandas as pd
import numpy as np
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, hamming_loss
from imblearn.over_sampling import RandomOverSampler, SMOTE
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments,
    AutoModel
)
import torch
from datasets import Dataset
from sklearn.neighbors import NearestNeighbors

In [2]:
# -----------------------------
# 1. Load Data
# -----------------------------
def load_license_data(json_folder):
    license_data = []
    for filename in os.listdir(json_folder):
        if filename.endswith(".json"):
            license_name = filename[:-5]
            filepath = os.path.join(json_folder, filename)
            with open(filepath, "r", encoding="utf-8") as f:
                data = json.load(f)
                license_data.append({
                    "license_name": license_name,
                    "family": data["family"],
                    "labels": data["labels"],
                    "text": data["text"],
                })
    return license_data

json_folder = "../../data/processed/preprocessed_licenses_json_2"
license_data = load_license_data(json_folder)
df = pd.DataFrame(license_data)

# Drop rows that are missing or empty labels
df.dropna(subset=["labels"], inplace=True)
df = df[df["labels"].apply(lambda x: len(x) > 0)]


In [3]:
# -----------------------------
# 2. Encode Labels (Multi-Label)
# -----------------------------
mlb = MultiLabelBinarizer()
multi_hot_labels = mlb.fit_transform(df["labels"])
df["multi_hot_labels"] = list(multi_hot_labels)
num_labels = len(mlb.classes_)

In [5]:
# -----------------------------
# 3. Split Data
# -----------------------------
train_df, val_df = train_test_split(df, test_size=0.1, random_state=42)
val_df = val_df.drop(columns=["labels"])
val_df = val_df.rename(columns={"multi_hot_labels": "labels"})

# --- Print Label Counts Before Oversampling ---
print("\n--- Label Distribution Before Oversampling (Training Data) ---")
original_train_labels = [label for sublist in train_df["labels"] for label in sublist]
label_counts_before = pd.Series(original_train_labels).value_counts()
print(label_counts_before)


--- Label Distribution Before Oversampling (Training Data) ---
Use in distributed software (right/INFORMATION)                316
Display copyright notice (obligation/INFORMATION)              315
Display license in the source (obligation/INFORMATION)         312
Display license in binary (obligation/INFORMATION)             306
Permissive (right/INFORMATION)                                 226
Endorsement prohibited (prohibition/INFORMATION)               151
Patent grant (other/INFORMATION)                                72
No further restrictions permitted (prohibition/INFORMATION)     57
License upgrade allowed (right/INFORMATION)                     56
Deprecated License (other/INFORMATION)                          23
Doing Business with US (other/ALARM)                            23
Usage notice in advertisement (obligation/INFORMATION)          11
Severe patent retaliation (other/ALARM)                         10
Copyleft (network protective) (obligation/ALARM)                 

In [6]:
# -----------------------------
# 4. Generate BERT Embeddings ---
# -----------------------------
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model_name = "answerdotai/ModernBERT-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name).to(device)

def get_embeddings(texts, batch_size=16):
    embeddings = []
    for i in range(0, len(texts), batch_size):
        batch = texts[i:i+batch_size]
        inputs = tokenizer(batch, return_tensors="pt", padding=True, truncation=True, max_length=512).to(device)
        with torch.no_grad():
            outputs = model(**inputs)
        # Use [CLS] token embedding
        batch_embeddings = outputs.last_hidden_state[:, 0, :].cpu().numpy()
        embeddings.append(batch_embeddings)
    return np.vstack(embeddings)

In [14]:
# -----------------------------
# 5. Oversampling (ML-ROS) ---
# -----------------------------
def ml_ros(X, y, min_samples=50):
    y = np.array(y)
    resampled_indices = []
    for label in range(y.shape[1]):
        label_indices = np.where(y[:, label] == 1)[0]
        if len(label_indices) < min_samples:
            additional_samples = np.random.choice(
                label_indices,
                size=min_samples - len(label_indices),
                replace=True
            )
            resampled_indices.extend(additional_samples)
    return [X[i] for i in resampled_indices], y[resampled_indices]

train_texts = train_df["text"].tolist()
train_labels = np.array(train_df["multi_hot_labels"].tolist())

# Apply ML-ROS
train_texts_resampled, train_labels_resampled = ml_ros(train_texts, train_labels, min_samples=50)

# Update training data
train_df = pd.DataFrame({
    "text": train_texts_resampled,
    "multi_hot_labels": list(train_labels_resampled)
})

In [7]:
# -----------------------------
# 6. Oversampling (ML-SMOTE) ---
# -----------------------------

def ml_smote(X_text, y, min_samples=50, max_samples=500, k_neighbors=5):
    np.random.seed(42)
    y = np.array(y)
    X_text = np.array(X_text)
    X_resampled = []
    y_resampled = []
    
    for label_idx in range(y.shape[1]):
        y_binary = y[:, label_idx]
        current_count = np.sum(y_binary)
        
        # Skip if label meets min_samples and is below max_samples
        if current_count >= min_samples and current_count <= max_samples:
            continue
        
        # Apply SMOTE only if needed
        if current_count < min_samples:
            sampling_strategy = {1: min_samples}
        elif current_count > max_samples:
            sampling_strategy = {1: max_samples}
        
        # Generate embeddings and apply SMOTE
        embeddings = get_embeddings(X_text.tolist())
        smote = SMOTE(
            sampling_strategy=sampling_strategy,
            random_state=42,
            k_neighbors=min(k_neighbors, len(embeddings[y_binary == 1]) - 1)
        )
        X_res_emb, y_res = smote.fit_resample(embeddings, y_binary)
        
        # Map synthetic embeddings to text
        new_samples_mask = np.arange(len(y_res)) >= len(y_binary)
        synthetic_embeddings = X_res_emb[new_samples_mask]
        nn = NearestNeighbors(n_neighbors=1, metric="cosine").fit(embeddings)
        _, indices = nn.kneighbors(synthetic_embeddings)
        
        # Update resampled data
        synthetic_texts = X_text[indices].flatten()
        synthetic_labels = y[indices].reshape(-1, y.shape[1])
        X_resampled.extend(synthetic_texts.tolist())
        y_resampled.extend(synthetic_labels.tolist())
    
    return X_resampled, np.array(y_resampled)
# --- Apply ML-SMOTE ---
train_texts = train_df["text"].tolist()
train_labels = np.array(train_df["multi_hot_labels"].tolist())

# Generate synthetic data
resampled_texts, resampled_labels = ml_smote(
    train_texts,
    train_labels,
    min_samples=50,
    k_neighbors=3
)

# Update training DataFrame
train_df = pd.DataFrame({
    "text": resampled_texts,
    "labels": list(resampled_labels)
})


In [8]:
# -----------------------------
# 6. Print Label Counts After Oversampling  ---
# -----------------------------
print("\n--- Label Distribution After Oversampling (Training Data) ---")

# Convert multi_hot_labels to numpy array first
multi_hot_array = np.array(train_df["labels"].tolist())

# Now inverse_transform works
resampled_labels_flat = [label for sublist in mlb.inverse_transform(multi_hot_array) for label in sublist]
label_counts_after = pd.Series(resampled_labels_flat).value_counts()
print(label_counts_after)


--- Label Distribution After Oversampling (Training Data) ---
Display copyright notice (obligation/INFORMATION)              462
Display license in the source (obligation/INFORMATION)         462
Use in distributed software (right/INFORMATION)                448
Display license in binary (obligation/INFORMATION)             427
Patent grant (other/INFORMATION)                               282
No further restrictions permitted (prohibition/INFORMATION)    269
Endorsement prohibited (prohibition/INFORMATION)               191
Permissive (right/INFORMATION)                                 187
License upgrade allowed (right/INFORMATION)                    108
Deprecated License (other/INFORMATION)                          91
Copyleft (network protective) (obligation/ALARM)                88
Doing Business with US (other/ALARM)                            73
Unclear or Ambiguous (other/ALARM)                              66
Severe patent retaliation (other/ALARM)                         61

In [9]:
# -----------------------------
# 7. Tokenization
# -----------------------------
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize(batch):
    return tokenizer(
        batch["text"],
        padding="max_length",
        truncation=True,
        max_length=512,
        return_tensors="pt"
    )

train_dataset = Dataset.from_pandas(train_df).map(tokenize, batched=True)
val_dataset = Dataset.from_pandas(val_df).map(tokenize, batched=True)

train_dataset.set_format("torch", columns=["input_ids", "attention_mask", "labels"])
val_dataset.set_format("torch", columns=["input_ids", "attention_mask", "labels"])


Map:   0%|          | 0/478 [00:00<?, ? examples/s]

Map:   0%|          | 0/36 [00:00<?, ? examples/s]

In [10]:
# -----------------------------
# 8. Compute Class Weights
# -----------------------------
label_counts = resampled_labels.sum(axis=0)
class_weights = torch.tensor([1.0 / count for count in label_counts], dtype=torch.float32)

In [11]:
# -----------------------------
# 9. Model Setup
# -----------------------------
class CustomTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        # Ignore the new parameter with **kwargs
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits
        
        loss_fct = torch.nn.BCEWithLogitsLoss(weight=class_weights.to(logits.device))
        loss = loss_fct(logits, labels.float())
        
        return (loss, outputs) if return_outputs else loss

model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=num_labels,
    problem_type="multi_label_classification",
    label2id={label: str(i) for i, label in enumerate(mlb.classes_)},
    id2label={str(i): label for i, label in enumerate(mlb.classes_)}
)

Some weights of ModernBertForSequenceClassification were not initialized from the model checkpoint at answerdotai/ModernBERT-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [12]:
# -----------------------------
# 10. Training
# -----------------------------
def compute_metrics(p):
    preds = torch.sigmoid(torch.tensor(p.predictions)).cpu().numpy() > 0.5
    labels = p.label_ids
    
    return {
        "f1_macro": f1_score(labels, preds, average="macro", zero_division=0),
        "hamming_loss": hamming_loss(labels, preds)
    }

training_args = TrainingArguments(
    output_dir="../../model/Bert4.0",
    evaluation_strategy="epoch",     # Evaluate every epoch
    save_strategy="epoch",
    logging_strategy="steps",          
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=10,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model="f1_macro",
    save_total_limit=2,
    logging_dir="./logs",
    logging_steps=10,
    log_level="info",       # Show detailed logs
    report_to="none",       # Disable external logging
    disable_tqdm=False 
)

trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics
)

trainer.train()
trainer.evaluate()

The following columns in the training set don't have a corresponding argument in `ModernBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `ModernBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 478
  Num Epochs = 10
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 300
  Number of trainable parameters = 149,627,165


  0%|          | 0/300 [00:00<?, ?it/s]

{'loss': 0.0052, 'grad_norm': 0.01451078336685896, 'learning_rate': 1.9333333333333333e-05, 'epoch': 0.33}
{'loss': 0.0036, 'grad_norm': 0.02002893015742302, 'learning_rate': 1.866666666666667e-05, 'epoch': 0.67}


The following columns in the evaluation set don't have a corresponding argument in `ModernBertForSequenceClassification.forward` and have been ignored: license_name, text, family, __index_level_0__. If license_name, text, family, __index_level_0__ are not expected by `ModernBertForSequenceClassification.forward`,  you can safely ignore this message.

***** Running Evaluation *****
  Num examples = 36
  Batch size = 16


{'loss': 0.0027, 'grad_norm': 0.024952979758381844, 'learning_rate': 1.8e-05, 'epoch': 1.0}


  0%|          | 0/3 [00:00<?, ?it/s]

Saving model checkpoint to ../../model/Bert4.0\checkpoint-30
Configuration saved in ../../model/Bert4.0\checkpoint-30\config.json


{'eval_loss': 0.002591557800769806, 'eval_f1_macro': 0.3784563290504735, 'eval_hamming_loss': 0.14367816091954022, 'eval_runtime': 35.4604, 'eval_samples_per_second': 1.015, 'eval_steps_per_second': 0.085, 'epoch': 1.0}


Model weights saved in ../../model/Bert4.0\checkpoint-30\model.safetensors
Deleting older checkpoint [..\..\model\Bert4.0\checkpoint-258] due to args.save_total_limit


{'loss': 0.0018, 'grad_norm': 0.02489512972533703, 'learning_rate': 1.7333333333333336e-05, 'epoch': 1.33}
{'loss': 0.0013, 'grad_norm': 0.024218115955591202, 'learning_rate': 1.6666666666666667e-05, 'epoch': 1.67}


The following columns in the evaluation set don't have a corresponding argument in `ModernBertForSequenceClassification.forward` and have been ignored: license_name, text, family, __index_level_0__. If license_name, text, family, __index_level_0__ are not expected by `ModernBertForSequenceClassification.forward`,  you can safely ignore this message.

***** Running Evaluation *****
  Num examples = 36
  Batch size = 16


{'loss': 0.001, 'grad_norm': 0.028273258358240128, 'learning_rate': 1.6000000000000003e-05, 'epoch': 2.0}


  0%|          | 0/3 [00:00<?, ?it/s]

Saving model checkpoint to ../../model/Bert4.0\checkpoint-60
Configuration saved in ../../model/Bert4.0\checkpoint-60\config.json


{'eval_loss': 0.0022137858904898167, 'eval_f1_macro': 0.4814762310069234, 'eval_hamming_loss': 0.11590038314176246, 'eval_runtime': 34.8145, 'eval_samples_per_second': 1.034, 'eval_steps_per_second': 0.086, 'epoch': 2.0}


Model weights saved in ../../model/Bert4.0\checkpoint-60\model.safetensors
Deleting older checkpoint [..\..\model\Bert4.0\checkpoint-30] due to args.save_total_limit


{'loss': 0.0008, 'grad_norm': 0.018079984933137894, 'learning_rate': 1.5333333333333334e-05, 'epoch': 2.33}
{'loss': 0.0007, 'grad_norm': 0.018146738409996033, 'learning_rate': 1.4666666666666666e-05, 'epoch': 2.67}


The following columns in the evaluation set don't have a corresponding argument in `ModernBertForSequenceClassification.forward` and have been ignored: license_name, text, family, __index_level_0__. If license_name, text, family, __index_level_0__ are not expected by `ModernBertForSequenceClassification.forward`,  you can safely ignore this message.

***** Running Evaluation *****
  Num examples = 36
  Batch size = 16


{'loss': 0.0006, 'grad_norm': 0.01552626397460699, 'learning_rate': 1.4e-05, 'epoch': 3.0}


  0%|          | 0/3 [00:00<?, ?it/s]

Saving model checkpoint to ../../model/Bert4.0\checkpoint-90
Configuration saved in ../../model/Bert4.0\checkpoint-90\config.json


{'eval_loss': 0.0025293370708823204, 'eval_f1_macro': 0.4527024678033163, 'eval_hamming_loss': 0.13984674329501914, 'eval_runtime': 36.1188, 'eval_samples_per_second': 0.997, 'eval_steps_per_second': 0.083, 'epoch': 3.0}


Model weights saved in ../../model/Bert4.0\checkpoint-90\model.safetensors
Deleting older checkpoint [..\..\model\Bert4.0\checkpoint-90] due to args.save_total_limit


{'loss': 0.0005, 'grad_norm': 0.00918611604720354, 'learning_rate': 1.3333333333333333e-05, 'epoch': 3.33}
{'loss': 0.0005, 'grad_norm': 0.01590833067893982, 'learning_rate': 1.2666666666666667e-05, 'epoch': 3.67}


The following columns in the evaluation set don't have a corresponding argument in `ModernBertForSequenceClassification.forward` and have been ignored: license_name, text, family, __index_level_0__. If license_name, text, family, __index_level_0__ are not expected by `ModernBertForSequenceClassification.forward`,  you can safely ignore this message.

***** Running Evaluation *****
  Num examples = 36
  Batch size = 16


{'loss': 0.0004, 'grad_norm': 0.014464384876191616, 'learning_rate': 1.2e-05, 'epoch': 4.0}


  0%|          | 0/3 [00:00<?, ?it/s]

Saving model checkpoint to ../../model/Bert4.0\checkpoint-120
Configuration saved in ../../model/Bert4.0\checkpoint-120\config.json


{'eval_loss': 0.0023317087907344103, 'eval_f1_macro': 0.4667112911912737, 'eval_hamming_loss': 0.12547892720306514, 'eval_runtime': 35.5254, 'eval_samples_per_second': 1.013, 'eval_steps_per_second': 0.084, 'epoch': 4.0}


Model weights saved in ../../model/Bert4.0\checkpoint-120\model.safetensors
Deleting older checkpoint [..\..\model\Bert4.0\checkpoint-120] due to args.save_total_limit


{'loss': 0.0003, 'grad_norm': 0.010994451120495796, 'learning_rate': 1.1333333333333334e-05, 'epoch': 4.33}
{'loss': 0.0003, 'grad_norm': 0.009732961654663086, 'learning_rate': 1.0666666666666667e-05, 'epoch': 4.67}


The following columns in the evaluation set don't have a corresponding argument in `ModernBertForSequenceClassification.forward` and have been ignored: license_name, text, family, __index_level_0__. If license_name, text, family, __index_level_0__ are not expected by `ModernBertForSequenceClassification.forward`,  you can safely ignore this message.

***** Running Evaluation *****
  Num examples = 36
  Batch size = 16


{'loss': 0.0004, 'grad_norm': 0.01314868126064539, 'learning_rate': 1e-05, 'epoch': 5.0}


  0%|          | 0/3 [00:00<?, ?it/s]

Saving model checkpoint to ../../model/Bert4.0\checkpoint-150
Configuration saved in ../../model/Bert4.0\checkpoint-150\config.json


{'eval_loss': 0.0023138225078582764, 'eval_f1_macro': 0.4777162501477119, 'eval_hamming_loss': 0.11877394636015326, 'eval_runtime': 35.5936, 'eval_samples_per_second': 1.011, 'eval_steps_per_second': 0.084, 'epoch': 5.0}


Model weights saved in ../../model/Bert4.0\checkpoint-150\model.safetensors
Deleting older checkpoint [..\..\model\Bert4.0\checkpoint-150] due to args.save_total_limit


{'loss': 0.0003, 'grad_norm': 0.008449296467006207, 'learning_rate': 9.333333333333334e-06, 'epoch': 5.33}
{'loss': 0.0003, 'grad_norm': 0.007912036962807178, 'learning_rate': 8.666666666666668e-06, 'epoch': 5.67}


The following columns in the evaluation set don't have a corresponding argument in `ModernBertForSequenceClassification.forward` and have been ignored: license_name, text, family, __index_level_0__. If license_name, text, family, __index_level_0__ are not expected by `ModernBertForSequenceClassification.forward`,  you can safely ignore this message.

***** Running Evaluation *****
  Num examples = 36
  Batch size = 16


{'loss': 0.0002, 'grad_norm': 0.0036654588766396046, 'learning_rate': 8.000000000000001e-06, 'epoch': 6.0}


  0%|          | 0/3 [00:00<?, ?it/s]

Saving model checkpoint to ../../model/Bert4.0\checkpoint-180
Configuration saved in ../../model/Bert4.0\checkpoint-180\config.json


{'eval_loss': 0.002555279992520809, 'eval_f1_macro': 0.5210911593714286, 'eval_hamming_loss': 0.11973180076628352, 'eval_runtime': 35.3205, 'eval_samples_per_second': 1.019, 'eval_steps_per_second': 0.085, 'epoch': 6.0}


Model weights saved in ../../model/Bert4.0\checkpoint-180\model.safetensors
Deleting older checkpoint [..\..\model\Bert4.0\checkpoint-60] due to args.save_total_limit


{'loss': 0.0002, 'grad_norm': 0.002646500011906028, 'learning_rate': 7.333333333333333e-06, 'epoch': 6.33}
{'loss': 0.0002, 'grad_norm': 0.005542600993067026, 'learning_rate': 6.666666666666667e-06, 'epoch': 6.67}


The following columns in the evaluation set don't have a corresponding argument in `ModernBertForSequenceClassification.forward` and have been ignored: license_name, text, family, __index_level_0__. If license_name, text, family, __index_level_0__ are not expected by `ModernBertForSequenceClassification.forward`,  you can safely ignore this message.

***** Running Evaluation *****
  Num examples = 36
  Batch size = 16


{'loss': 0.0002, 'grad_norm': 0.00858584325760603, 'learning_rate': 6e-06, 'epoch': 7.0}


  0%|          | 0/3 [00:00<?, ?it/s]

Saving model checkpoint to ../../model/Bert4.0\checkpoint-210
Configuration saved in ../../model/Bert4.0\checkpoint-210\config.json


{'eval_loss': 0.0025097783654928207, 'eval_f1_macro': 0.4836582635889781, 'eval_hamming_loss': 0.12260536398467432, 'eval_runtime': 35.2875, 'eval_samples_per_second': 1.02, 'eval_steps_per_second': 0.085, 'epoch': 7.0}


Model weights saved in ../../model/Bert4.0\checkpoint-210\model.safetensors
Deleting older checkpoint [..\..\model\Bert4.0\checkpoint-210] due to args.save_total_limit


{'loss': 0.0002, 'grad_norm': 0.014329814352095127, 'learning_rate': 5.333333333333334e-06, 'epoch': 7.33}
{'loss': 0.0002, 'grad_norm': 0.00268304324708879, 'learning_rate': 4.666666666666667e-06, 'epoch': 7.67}


The following columns in the evaluation set don't have a corresponding argument in `ModernBertForSequenceClassification.forward` and have been ignored: license_name, text, family, __index_level_0__. If license_name, text, family, __index_level_0__ are not expected by `ModernBertForSequenceClassification.forward`,  you can safely ignore this message.

***** Running Evaluation *****
  Num examples = 36
  Batch size = 16


{'loss': 0.0002, 'grad_norm': 0.21031861007213593, 'learning_rate': 4.000000000000001e-06, 'epoch': 8.0}


  0%|          | 0/3 [00:00<?, ?it/s]

Saving model checkpoint to ../../model/Bert4.0\checkpoint-240
Configuration saved in ../../model/Bert4.0\checkpoint-240\config.json


{'eval_loss': 0.002568166935816407, 'eval_f1_macro': 0.5057306169576671, 'eval_hamming_loss': 0.1206896551724138, 'eval_runtime': 35.5781, 'eval_samples_per_second': 1.012, 'eval_steps_per_second': 0.084, 'epoch': 8.0}


Model weights saved in ../../model/Bert4.0\checkpoint-240\model.safetensors
Deleting older checkpoint [..\..\model\Bert4.0\checkpoint-240] due to args.save_total_limit


{'loss': 0.0001, 'grad_norm': 0.004804142285138369, 'learning_rate': 3.3333333333333333e-06, 'epoch': 8.33}
{'loss': 0.0002, 'grad_norm': 0.005920608527958393, 'learning_rate': 2.666666666666667e-06, 'epoch': 8.67}


The following columns in the evaluation set don't have a corresponding argument in `ModernBertForSequenceClassification.forward` and have been ignored: license_name, text, family, __index_level_0__. If license_name, text, family, __index_level_0__ are not expected by `ModernBertForSequenceClassification.forward`,  you can safely ignore this message.

***** Running Evaluation *****
  Num examples = 36
  Batch size = 16


{'loss': 0.0002, 'grad_norm': 0.0026111965999007225, 'learning_rate': 2.0000000000000003e-06, 'epoch': 9.0}


  0%|          | 0/3 [00:00<?, ?it/s]

Saving model checkpoint to ../../model/Bert4.0\checkpoint-270
Configuration saved in ../../model/Bert4.0\checkpoint-270\config.json


{'eval_loss': 0.0025854813866317272, 'eval_f1_macro': 0.5096564012076986, 'eval_hamming_loss': 0.11302681992337164, 'eval_runtime': 35.6819, 'eval_samples_per_second': 1.009, 'eval_steps_per_second': 0.084, 'epoch': 9.0}


Model weights saved in ../../model/Bert4.0\checkpoint-270\model.safetensors
Deleting older checkpoint [..\..\model\Bert4.0\checkpoint-270] due to args.save_total_limit


{'loss': 0.0001, 'grad_norm': 0.004353752825409174, 'learning_rate': 1.3333333333333334e-06, 'epoch': 9.33}
{'loss': 0.0001, 'grad_norm': 0.0017765768570825458, 'learning_rate': 6.666666666666667e-07, 'epoch': 9.67}


Saving model checkpoint to ../../model/Bert4.0\checkpoint-300
Configuration saved in ../../model/Bert4.0\checkpoint-300\config.json


{'loss': 0.0002, 'grad_norm': 0.004672541283071041, 'learning_rate': 0.0, 'epoch': 10.0}


Model weights saved in ../../model/Bert4.0\checkpoint-300\model.safetensors
Deleting older checkpoint [..\..\model\Bert4.0\checkpoint-300] due to args.save_total_limit
The following columns in the evaluation set don't have a corresponding argument in `ModernBertForSequenceClassification.forward` and have been ignored: license_name, text, family, __index_level_0__. If license_name, text, family, __index_level_0__ are not expected by `ModernBertForSequenceClassification.forward`,  you can safely ignore this message.

***** Running Evaluation *****
  Num examples = 36
  Batch size = 16


  0%|          | 0/3 [00:00<?, ?it/s]

Saving model checkpoint to ../../model/Bert4.0\checkpoint-300
Configuration saved in ../../model/Bert4.0\checkpoint-300\config.json


{'eval_loss': 0.002577125560492277, 'eval_f1_macro': 0.5120264583871932, 'eval_hamming_loss': 0.11398467432950192, 'eval_runtime': 35.3171, 'eval_samples_per_second': 1.019, 'eval_steps_per_second': 0.085, 'epoch': 10.0}


Model weights saved in ../../model/Bert4.0\checkpoint-300\model.safetensors
Deleting older checkpoint [..\..\model\Bert4.0\checkpoint-300] due to args.save_total_limit


Training completed. Do not forget to share your model on huggingface.co/models =)


Loading best model from ../../model/Bert4.0\checkpoint-180 (score: 0.5210911593714286).
The following columns in the evaluation set don't have a corresponding argument in `ModernBertForSequenceClassification.forward` and have been ignored: license_name, text, family, __index_level_0__. If license_name, text, family, __index_level_0__ are not expected by `ModernBertForSequenceClassification.forward`,  you can safely ignore this message.

***** Running Evaluation *****
  Num examples = 36
  Batch size = 16


{'train_runtime': 20177.0467, 'train_samples_per_second': 0.237, 'train_steps_per_second': 0.015, 'train_loss': 0.0007656728650908917, 'epoch': 10.0}


  0%|          | 0/3 [00:00<?, ?it/s]

{'eval_loss': 0.002555279992520809,
 'eval_f1_macro': 0.5210911593714286,
 'eval_hamming_loss': 0.11973180076628352,
 'eval_runtime': 35.431,
 'eval_samples_per_second': 1.016,
 'eval_steps_per_second': 0.085,
 'epoch': 10.0}

In [13]:
trainer.save_model("../../model/Bert")  # Saves model and tokenizer
tokenizer.save_pretrained("../../model/Bert")

Saving model checkpoint to ../../model/Bert
Configuration saved in ../../model/Bert\config.json
Model weights saved in ../../model/Bert\model.safetensors
tokenizer config file saved in ../../model/Bert\tokenizer_config.json
Special tokens file saved in ../../model/Bert\special_tokens_map.json


('../../model/Bert\\tokenizer_config.json',
 '../../model/Bert\\special_tokens_map.json',
 '../../model/Bert\\tokenizer.json')