In [1]:
# -----------------------------
# 0. Import modules
# -----------------------------
import json
import os
import random
import pandas as pd
import numpy as np
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, hamming_loss
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments,
    MarianMTModel,
    MarianTokenizer
)
import torch
from datasets import Dataset
import nlpaug.augmenter.word as naw
from nlpaug.util import Action

In [3]:
# -----------------------------
# 1. Load Data
# -----------------------------
def load_license_data(json_folder):
    license_data = []
    for filename in os.listdir(json_folder):
        if filename.endswith(".json"):
            license_name = filename[:-5]
            filepath = os.path.join(json_folder, filename)
            with open(filepath, "r", encoding="utf-8") as f:
                data = json.load(f)
                license_data.append({
                    "license_name": license_name,
                    "family": data["family"],
                    "labels": data["labels"],
                    "text": data["text"],
                })
    return license_data

json_folder = "../../data/processed/preprocessed_licenses_json_2"
license_data = load_license_data(json_folder)
df = pd.DataFrame(license_data)

# Drop rows with missing or empty labels
df.dropna(subset=["labels"], inplace=True)
df = df[df["labels"].apply(lambda x: len(x) > 0)]

In [4]:
# -----------------------------
# 2. Encode Labels
# -----------------------------
mlb = MultiLabelBinarizer()
df["labels"] = list(mlb.fit_transform(df["labels"]))
num_labels = len(mlb.classes_)

In [5]:
# -----------------------------
# 3. Split Data (Train, Validation, Test)
# -----------------------------
train_df, temp_df = train_test_split(df, test_size=0.3, random_state=42)
val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)

In [6]:
# -----------------------------
# 4. Data Augmentation
# -----------------------------
# Initialize backtranslation models globally for efficiency
en_to_fr_model_name = 'Helsinki-NLP/opus-mt-en-fr'
fr_to_en_model_name = 'Helsinki-NLP/opus-mt-fr-en'
tokenizer_en_to_fr = MarianTokenizer.from_pretrained(en_to_fr_model_name)
model_en_to_fr = MarianMTModel.from_pretrained(en_to_fr_model_name)
tokenizer_fr_to_en = MarianTokenizer.from_pretrained(fr_to_en_model_name)
model_fr_to_en = MarianMTModel.from_pretrained(fr_to_en_model_name)

def backtranslate(text):
    # Translate English to French
    inputs = tokenizer_en_to_fr.prepare_seq2seq_batch([text], return_tensors="pt")
    translated = model_en_to_fr.generate(**inputs)
    french_text = tokenizer_en_to_fr.batch_decode(translated, skip_special_tokens=True)[0]
    # Translate French back to English
    inputs = tokenizer_fr_to_en.prepare_seq2seq_batch([french_text], return_tensors="pt")
    translated = model_fr_to_en.generate(**inputs)
    backtranslated_text = tokenizer_fr_to_en.batch_decode(translated, skip_special_tokens=True)[0]
    return backtranslated_text

def augment_text(text, aug_p=0.2, aug_max=3):
    """
    Randomly choose between contextual augmentation and backtranslation.
    """
    augmenters = ['contextual', 'backtranslation']
    chosen = random.choice(augmenters)
    if chosen == 'contextual':
        aug = naw.ContextualWordEmbsAug(
            model_path='bert-base-uncased',
            action=Action.SUBSTITUTE,
            aug_p=aug_p,
            aug_max=aug_max,
            device='cuda' if torch.cuda.is_available() else 'cpu'
        )
        try:
            augmented = aug.augment(text)
            return augmented[0] if isinstance(augmented, list) and len(augmented) > 0 else text
        except Exception:
            return text
    elif chosen == 'backtranslation':
        try:
            return backtranslate(text)
        except Exception:
            return text

def augment_minority_classes(df, min_samples=50, aug_factor=3):
    augmented_texts = []
    augmented_labels = []
    
    # Compute label counts and identify minority labels
    label_matrix = np.array(df["labels"].tolist())
    label_counts = label_matrix.sum(axis=0)
    minority_labels = np.where(label_counts < min_samples)[0]
    
    for _, row in df.iterrows():
        text = row["text"]
        labels = row["labels"]
        sample_labels = np.where(np.array(labels) == 1)[0]
        # Augment if any of the labels are in minority group
        if any(label in minority_labels for label in sample_labels):
            for _ in range(aug_factor):
                new_text = augment_text(text)
                augmented_texts.append(new_text)
                augmented_labels.append(labels)
    
    augmented_df = pd.DataFrame({
        "text": augmented_texts,
        "labels": list(augmented_labels)
    })
    
    # Combine original and augmented data
    return pd.concat([df, augmented_df], ignore_index=True)

print("Original training size:", len(train_df))
train_df = augment_minority_classes(train_df, min_samples=50, aug_factor=3)
print("Augmented training size:", len(train_df))

tokenizer_config.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


source.spm:   0%|          | 0.00/778k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/802k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.34M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.42k [00:00<?, ?B/s]



pytorch_model.bin:   0%|          | 0.00/301M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


source.spm:   0%|          | 0.00/802k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/778k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.34M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/301M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.42k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/301M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

Original training size: 247


model.safetensors:   0%|          | 0.00/301M [00:00<?, ?B/s]

`prepare_seq2seq_batch` is deprecated and will be removed in version 5 of HuggingFace Transformers. Use the regular
`__call__` method to prepare your inputs and targets.

Here is a short example:

model_inputs = tokenizer(src_texts, text_target=tgt_texts, ...)

If you either need to use different keyword arguments for the source and target texts, you should do two calls like
this:

model_inputs = tokenizer(src_texts, ...)
labels = tokenizer(text_target=tgt_texts, ...)
model_inputs["labels"] = labels["input_ids"]

See the documentation of your specific tokenizer for more details on the specific arguments to the tokenizer of choice.
For a more complete example, see the implementation of `prepare_seq2seq_batch`.



Augmented training size: 679


In [7]:
# -----------------------------
# 5. Tokenization
# -----------------------------
model_name = "answerdotai/ModernBERT-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize(batch):
    encoding = tokenizer(
        batch["text"],
        padding="max_length",
        truncation=True,
        max_length=512
    )
    return encoding

# Prepare Hugging Face datasets
train_dataset = Dataset.from_pandas(train_df).map(tokenize, batched=True)
val_dataset = Dataset.from_pandas(val_df).map(tokenize, batched=True)
test_dataset = Dataset.from_pandas(test_df).map(tokenize, batched=True)

# Set format for PyTorch (ensure "labels" is preserved)
train_dataset = train_dataset.with_format("torch", columns=["input_ids", "attention_mask", "labels"])
val_dataset = val_dataset.with_format("torch", columns=["input_ids", "attention_mask", "labels"])
test_dataset = test_dataset.with_format("torch", columns=["input_ids", "attention_mask", "labels"])


Map:   0%|          | 0/679 [00:00<?, ? examples/s]

Map:   0%|          | 0/53 [00:00<?, ? examples/s]

Map:   0%|          | 0/54 [00:00<?, ? examples/s]

In [8]:
# -----------------------------
# 6. Focal Loss Implementation
# -----------------------------
class FocalLoss(torch.nn.Module):
    def __init__(self, alpha=0.25, gamma=2.0):
        super(FocalLoss, self).__init__()
        self.alpha = alpha
        self.gamma = gamma

    def forward(self, logits, labels):
        bce_loss = torch.nn.functional.binary_cross_entropy_with_logits(
            logits, labels, reduction="none"
        )
        pt = torch.exp(-bce_loss)
        focal_loss = self.alpha * (1 - pt) ** self.gamma * bce_loss
        return focal_loss.mean()

In [21]:
# -----------------------------
# 7. Custom Trainer with Focal Loss
# -----------------------------
class CustomTrainer(Trainer):
    def compute_loss(self, model, inputs, num_items_in_batch=None, return_outputs=False):
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits
        loss_fct = FocalLoss(alpha=0.25, gamma=2.0)
        loss = loss_fct(logits, labels.float())
        return (loss, outputs) if return_outputs else loss


In [22]:
# -----------------------------
# 8. Model Setup
# -----------------------------
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=num_labels,
    problem_type="multi_label_classification",
    label2id={label: str(i) for i, label in enumerate(mlb.classes_)},
    id2label={str(i): label for i, label in enumerate(mlb.classes_)}
)

loading configuration file config.json from cache at C:\Users\NPARSHO\.cache\huggingface\hub\models--answerdotai--ModernBERT-base\snapshots\8949b909ec900327062f0ebf497f51aef5e6f0c8\config.json
Model config ModernBertConfig {
  "_name_or_path": "answerdotai/ModernBERT-base",
  "architectures": [
    "ModernBertForMaskedLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 50281,
  "classifier_activation": "gelu",
  "classifier_bias": false,
  "classifier_dropout": 0.0,
  "classifier_pooling": "mean",
  "cls_token_id": 50281,
  "decoder_bias": true,
  "deterministic_flash_attn": false,
  "embedding_dropout": 0.0,
  "eos_token_id": 50282,
  "global_attn_every_n_layers": 3,
  "global_rope_theta": 160000.0,
  "gradient_checkpointing": false,
  "hidden_activation": "gelu",
  "hidden_size": 768,
  "id2label": {
    "10": "Display license in binary (obligation/INFORMATION)",
    "11": "Display license in the source (obligation/INFORMATION)",
    "12": "Doing Busines

In [23]:
# -----------------------------
# 9. Training Configuration
# -----------------------------
def compute_metrics(p):
    preds = torch.sigmoid(torch.tensor(p.predictions)).cpu().numpy() > 0.5
    labels = p.label_ids
    return {
        "f1_macro": f1_score(labels, preds, average="macro", zero_division=0),
        "f1_micro": f1_score(labels, preds, average="micro", zero_division=0),
        "hamming_loss": hamming_loss(labels, preds)
    }

training_args = TrainingArguments(
    output_dir="../../model/Bert4.0",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=10,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model="f1_macro",
    save_total_limit=2,
    logging_dir="./logs",
    logging_strategy="steps",
    logging_steps=10,
    report_to="none",
    seed=42,
    log_level="info",
    disable_tqdm=False
)

trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics
)

PyTorch: setting up devices


In [24]:
# -----------------------------
# 10. Train the Model
# -----------------------------
trainer.train()

The following columns in the training set don't have a corresponding argument in `ModernBertForSequenceClassification.forward` and have been ignored: text, license_name, family. If text, license_name, family are not expected by `ModernBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 679
  Num Epochs = 10
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 430
  Number of trainable parameters = 149,627,165


  0%|          | 0/430 [00:00<?, ?it/s]

{'loss': 0.0338, 'grad_norm': 0.3166827857494354, 'learning_rate': 1.9534883720930235e-05, 'epoch': 0.23}
{'loss': 0.0235, 'grad_norm': 0.17391929030418396, 'learning_rate': 1.9069767441860468e-05, 'epoch': 0.47}
{'loss': 0.0198, 'grad_norm': 0.08449245989322662, 'learning_rate': 1.86046511627907e-05, 'epoch': 0.7}
{'loss': 0.0184, 'grad_norm': 0.40008246898651123, 'learning_rate': 1.813953488372093e-05, 'epoch': 0.93}


The following columns in the evaluation set don't have a corresponding argument in `ModernBertForSequenceClassification.forward` and have been ignored: __index_level_0__, text, license_name, family. If __index_level_0__, text, license_name, family are not expected by `ModernBertForSequenceClassification.forward`,  you can safely ignore this message.

***** Running Evaluation *****
  Num examples = 53
  Batch size = 16


  0%|          | 0/4 [00:00<?, ?it/s]

Saving model checkpoint to ../../model/Bert4.0\checkpoint-43
Configuration saved in ../../model/Bert4.0\checkpoint-43\config.json


{'eval_loss': 0.019957410171628, 'eval_f1_macro': 0.3462024504798812, 'eval_f1_micro': 0.7861885790172642, 'eval_hamming_loss': 0.10474951203643461, 'eval_runtime': 38.5264, 'eval_samples_per_second': 1.376, 'eval_steps_per_second': 0.104, 'epoch': 1.0}


Deleting older checkpoint [..\..\model\Bert4.0\checkpoint-180] due to args.save_total_limit


{'loss': 0.0161, 'grad_norm': 0.1603558212518692, 'learning_rate': 1.7674418604651163e-05, 'epoch': 1.16}
{'loss': 0.0163, 'grad_norm': 0.11087103188037872, 'learning_rate': 1.7209302325581396e-05, 'epoch': 1.4}
{'loss': 0.0142, 'grad_norm': 0.11978786438703537, 'learning_rate': 1.674418604651163e-05, 'epoch': 1.63}
{'loss': 0.0153, 'grad_norm': 0.1317322552204132, 'learning_rate': 1.6279069767441862e-05, 'epoch': 1.86}


The following columns in the evaluation set don't have a corresponding argument in `ModernBertForSequenceClassification.forward` and have been ignored: __index_level_0__, text, license_name, family. If __index_level_0__, text, license_name, family are not expected by `ModernBertForSequenceClassification.forward`,  you can safely ignore this message.

***** Running Evaluation *****
  Num examples = 53
  Batch size = 16


  0%|          | 0/4 [00:00<?, ?it/s]

Saving model checkpoint to ../../model/Bert4.0\checkpoint-86
Configuration saved in ../../model/Bert4.0\checkpoint-86\config.json


{'eval_loss': 0.018072564154863358, 'eval_f1_macro': 0.4128804492248801, 'eval_f1_micro': 0.8036410923276983, 'eval_hamming_loss': 0.09824333116460637, 'eval_runtime': 38.0437, 'eval_samples_per_second': 1.393, 'eval_steps_per_second': 0.105, 'epoch': 2.0}


Deleting older checkpoint [..\..\model\Bert4.0\checkpoint-43] due to args.save_total_limit


{'loss': 0.0134, 'grad_norm': 0.13580729067325592, 'learning_rate': 1.5813953488372095e-05, 'epoch': 2.09}
{'loss': 0.0111, 'grad_norm': 0.15702365338802338, 'learning_rate': 1.5348837209302328e-05, 'epoch': 2.33}
{'loss': 0.0118, 'grad_norm': 0.13769187033176422, 'learning_rate': 1.488372093023256e-05, 'epoch': 2.56}
{'loss': 0.0109, 'grad_norm': 0.2078876495361328, 'learning_rate': 1.441860465116279e-05, 'epoch': 2.79}


The following columns in the evaluation set don't have a corresponding argument in `ModernBertForSequenceClassification.forward` and have been ignored: __index_level_0__, text, license_name, family. If __index_level_0__, text, license_name, family are not expected by `ModernBertForSequenceClassification.forward`,  you can safely ignore this message.

***** Running Evaluation *****
  Num examples = 53
  Batch size = 16


  0%|          | 0/4 [00:00<?, ?it/s]

Saving model checkpoint to ../../model/Bert4.0\checkpoint-129
Configuration saved in ../../model/Bert4.0\checkpoint-129\config.json


{'eval_loss': 0.018003562465310097, 'eval_f1_macro': 0.459576141809573, 'eval_f1_micro': 0.8258575197889182, 'eval_hamming_loss': 0.08588158750813273, 'eval_runtime': 37.9465, 'eval_samples_per_second': 1.397, 'eval_steps_per_second': 0.105, 'epoch': 3.0}


Deleting older checkpoint [..\..\model\Bert4.0\checkpoint-86] due to args.save_total_limit


{'loss': 0.0094, 'grad_norm': 0.21581168472766876, 'learning_rate': 1.3953488372093025e-05, 'epoch': 3.02}
{'loss': 0.0073, 'grad_norm': 0.24782583117485046, 'learning_rate': 1.3488372093023257e-05, 'epoch': 3.26}
{'loss': 0.0065, 'grad_norm': 0.39637893438339233, 'learning_rate': 1.302325581395349e-05, 'epoch': 3.49}
{'loss': 0.0072, 'grad_norm': 0.1957440823316574, 'learning_rate': 1.2558139534883723e-05, 'epoch': 3.72}
{'loss': 0.0073, 'grad_norm': 0.1842920482158661, 'learning_rate': 1.2093023255813954e-05, 'epoch': 3.95}


The following columns in the evaluation set don't have a corresponding argument in `ModernBertForSequenceClassification.forward` and have been ignored: __index_level_0__, text, license_name, family. If __index_level_0__, text, license_name, family are not expected by `ModernBertForSequenceClassification.forward`,  you can safely ignore this message.

***** Running Evaluation *****
  Num examples = 53
  Batch size = 16


  0%|          | 0/4 [00:00<?, ?it/s]

Saving model checkpoint to ../../model/Bert4.0\checkpoint-172
Configuration saved in ../../model/Bert4.0\checkpoint-172\config.json


{'eval_loss': 0.02190111204981804, 'eval_f1_macro': 0.5560896391172055, 'eval_f1_micro': 0.8343711083437111, 'eval_hamming_loss': 0.08653220559531555, 'eval_runtime': 37.9979, 'eval_samples_per_second': 1.395, 'eval_steps_per_second': 0.105, 'epoch': 4.0}


Deleting older checkpoint [..\..\model\Bert4.0\checkpoint-129] due to args.save_total_limit


{'loss': 0.0049, 'grad_norm': 0.12041410803794861, 'learning_rate': 1.1627906976744187e-05, 'epoch': 4.19}
{'loss': 0.005, 'grad_norm': 0.2955976724624634, 'learning_rate': 1.116279069767442e-05, 'epoch': 4.42}
{'loss': 0.0046, 'grad_norm': 0.1834060549736023, 'learning_rate': 1.0697674418604651e-05, 'epoch': 4.65}
{'loss': 0.0037, 'grad_norm': 0.14258672297000885, 'learning_rate': 1.0232558139534884e-05, 'epoch': 4.88}


The following columns in the evaluation set don't have a corresponding argument in `ModernBertForSequenceClassification.forward` and have been ignored: __index_level_0__, text, license_name, family. If __index_level_0__, text, license_name, family are not expected by `ModernBertForSequenceClassification.forward`,  you can safely ignore this message.

***** Running Evaluation *****
  Num examples = 53
  Batch size = 16


  0%|          | 0/4 [00:00<?, ?it/s]

Saving model checkpoint to ../../model/Bert4.0\checkpoint-215
Configuration saved in ../../model/Bert4.0\checkpoint-215\config.json


{'eval_loss': 0.025575857609510422, 'eval_f1_macro': 0.5298706859995225, 'eval_f1_micro': 0.8297055057618438, 'eval_hamming_loss': 0.08653220559531555, 'eval_runtime': 38.0917, 'eval_samples_per_second': 1.391, 'eval_steps_per_second': 0.105, 'epoch': 5.0}


Deleting older checkpoint [..\..\model\Bert4.0\checkpoint-215] due to args.save_total_limit


{'loss': 0.003, 'grad_norm': 0.11537239700555801, 'learning_rate': 9.767441860465117e-06, 'epoch': 5.12}
{'loss': 0.0028, 'grad_norm': 0.2227599322795868, 'learning_rate': 9.30232558139535e-06, 'epoch': 5.35}
{'loss': 0.0029, 'grad_norm': 0.07481295615434647, 'learning_rate': 8.837209302325582e-06, 'epoch': 5.58}
{'loss': 0.0024, 'grad_norm': 0.07811135798692703, 'learning_rate': 8.372093023255815e-06, 'epoch': 5.81}


The following columns in the evaluation set don't have a corresponding argument in `ModernBertForSequenceClassification.forward` and have been ignored: __index_level_0__, text, license_name, family. If __index_level_0__, text, license_name, family are not expected by `ModernBertForSequenceClassification.forward`,  you can safely ignore this message.

***** Running Evaluation *****
  Num examples = 53
  Batch size = 16


  0%|          | 0/4 [00:00<?, ?it/s]

Saving model checkpoint to ../../model/Bert4.0\checkpoint-258
Configuration saved in ../../model/Bert4.0\checkpoint-258\config.json


{'eval_loss': 0.027957003563642502, 'eval_f1_macro': 0.5689999521851361, 'eval_f1_micro': 0.8430379746835444, 'eval_hamming_loss': 0.08067664281067013, 'eval_runtime': 37.9694, 'eval_samples_per_second': 1.396, 'eval_steps_per_second': 0.105, 'epoch': 6.0}


Deleting older checkpoint [..\..\model\Bert4.0\checkpoint-172] due to args.save_total_limit


{'loss': 0.0025, 'grad_norm': 0.08982890099287033, 'learning_rate': 7.906976744186048e-06, 'epoch': 6.05}
{'loss': 0.0017, 'grad_norm': 0.13825491070747375, 'learning_rate': 7.44186046511628e-06, 'epoch': 6.28}
{'loss': 0.0024, 'grad_norm': 0.11251609772443771, 'learning_rate': 6.976744186046513e-06, 'epoch': 6.51}
{'loss': 0.0018, 'grad_norm': 0.10781411081552505, 'learning_rate': 6.511627906976745e-06, 'epoch': 6.74}
{'loss': 0.0021, 'grad_norm': 0.1306561827659607, 'learning_rate': 6.046511627906977e-06, 'epoch': 6.98}


The following columns in the evaluation set don't have a corresponding argument in `ModernBertForSequenceClassification.forward` and have been ignored: __index_level_0__, text, license_name, family. If __index_level_0__, text, license_name, family are not expected by `ModernBertForSequenceClassification.forward`,  you can safely ignore this message.

***** Running Evaluation *****
  Num examples = 53
  Batch size = 16


  0%|          | 0/4 [00:00<?, ?it/s]

Saving model checkpoint to ../../model/Bert4.0\checkpoint-301
Configuration saved in ../../model/Bert4.0\checkpoint-301\config.json


{'eval_loss': 0.02834116667509079, 'eval_f1_macro': 0.5546838381525192, 'eval_f1_micro': 0.8378033205619413, 'eval_hamming_loss': 0.0826284970722186, 'eval_runtime': 37.9501, 'eval_samples_per_second': 1.397, 'eval_steps_per_second': 0.105, 'epoch': 7.0}


Deleting older checkpoint [..\..\model\Bert4.0\checkpoint-301] due to args.save_total_limit


{'loss': 0.0015, 'grad_norm': 0.10110499709844589, 'learning_rate': 5.58139534883721e-06, 'epoch': 7.21}
{'loss': 0.0015, 'grad_norm': 0.06459742039442062, 'learning_rate': 5.116279069767442e-06, 'epoch': 7.44}
{'loss': 0.0014, 'grad_norm': 0.07300381362438202, 'learning_rate': 4.651162790697675e-06, 'epoch': 7.67}
{'loss': 0.0012, 'grad_norm': 0.04339738190174103, 'learning_rate': 4.186046511627907e-06, 'epoch': 7.91}


The following columns in the evaluation set don't have a corresponding argument in `ModernBertForSequenceClassification.forward` and have been ignored: __index_level_0__, text, license_name, family. If __index_level_0__, text, license_name, family are not expected by `ModernBertForSequenceClassification.forward`,  you can safely ignore this message.

***** Running Evaluation *****
  Num examples = 53
  Batch size = 16


  0%|          | 0/4 [00:00<?, ?it/s]

Saving model checkpoint to ../../model/Bert4.0\checkpoint-344
Configuration saved in ../../model/Bert4.0\checkpoint-344\config.json


{'eval_loss': 0.029063384979963303, 'eval_f1_macro': 0.5727424216553582, 'eval_f1_micro': 0.8418367346938775, 'eval_hamming_loss': 0.08067664281067013, 'eval_runtime': 37.9462, 'eval_samples_per_second': 1.397, 'eval_steps_per_second': 0.105, 'epoch': 8.0}


Deleting older checkpoint [..\..\model\Bert4.0\checkpoint-258] due to args.save_total_limit


{'loss': 0.0012, 'grad_norm': 0.07509707659482956, 'learning_rate': 3.72093023255814e-06, 'epoch': 8.14}
{'loss': 0.0013, 'grad_norm': 0.08574200421571732, 'learning_rate': 3.2558139534883724e-06, 'epoch': 8.37}
{'loss': 0.0011, 'grad_norm': 0.04984118789434433, 'learning_rate': 2.790697674418605e-06, 'epoch': 8.6}
{'loss': 0.0011, 'grad_norm': 0.06070873886346817, 'learning_rate': 2.3255813953488376e-06, 'epoch': 8.84}


The following columns in the evaluation set don't have a corresponding argument in `ModernBertForSequenceClassification.forward` and have been ignored: __index_level_0__, text, license_name, family. If __index_level_0__, text, license_name, family are not expected by `ModernBertForSequenceClassification.forward`,  you can safely ignore this message.

***** Running Evaluation *****
  Num examples = 53
  Batch size = 16


  0%|          | 0/4 [00:00<?, ?it/s]

Saving model checkpoint to ../../model/Bert4.0\checkpoint-387
Configuration saved in ../../model/Bert4.0\checkpoint-387\config.json


{'eval_loss': 0.029338618740439415, 'eval_f1_macro': 0.5742807905023848, 'eval_f1_micro': 0.8421052631578947, 'eval_hamming_loss': 0.08002602472348731, 'eval_runtime': 38.0896, 'eval_samples_per_second': 1.391, 'eval_steps_per_second': 0.105, 'epoch': 9.0}


Deleting older checkpoint [..\..\model\Bert4.0\checkpoint-344] due to args.save_total_limit


{'loss': 0.001, 'grad_norm': 0.07109622657299042, 'learning_rate': 1.86046511627907e-06, 'epoch': 9.07}
{'loss': 0.0009, 'grad_norm': 0.026853298768401146, 'learning_rate': 1.3953488372093025e-06, 'epoch': 9.3}
{'loss': 0.0011, 'grad_norm': 0.058654945343732834, 'learning_rate': 9.30232558139535e-07, 'epoch': 9.53}
{'loss': 0.0009, 'grad_norm': 0.04897912219166756, 'learning_rate': 4.651162790697675e-07, 'epoch': 9.77}


Saving model checkpoint to ../../model/Bert4.0\checkpoint-430
Configuration saved in ../../model/Bert4.0\checkpoint-430\config.json


{'loss': 0.0011, 'grad_norm': 0.09619215875864029, 'learning_rate': 0.0, 'epoch': 10.0}


The following columns in the evaluation set don't have a corresponding argument in `ModernBertForSequenceClassification.forward` and have been ignored: __index_level_0__, text, license_name, family. If __index_level_0__, text, license_name, family are not expected by `ModernBertForSequenceClassification.forward`,  you can safely ignore this message.

***** Running Evaluation *****
  Num examples = 53
  Batch size = 16


  0%|          | 0/4 [00:00<?, ?it/s]

Saving model checkpoint to ../../model/Bert4.0\checkpoint-430
Configuration saved in ../../model/Bert4.0\checkpoint-430\config.json


{'eval_loss': 0.029989423230290413, 'eval_f1_macro': 0.574884529448395, 'eval_f1_micro': 0.8401015228426396, 'eval_hamming_loss': 0.08197787898503578, 'eval_runtime': 36.4537, 'eval_samples_per_second': 1.454, 'eval_steps_per_second': 0.11, 'epoch': 10.0}




Training completed. Do not forget to share your model on huggingface.co/models =)


Loading best model from ../../model/Bert4.0\checkpoint-430 (score: 0.574884529448395).


{'train_runtime': 31296.18, 'train_samples_per_second': 0.217, 'train_steps_per_second': 0.014, 'train_loss': 0.006912035828586235, 'epoch': 10.0}


TrainOutput(global_step=430, training_loss=0.006912035828586235, metrics={'train_runtime': 31296.18, 'train_samples_per_second': 0.217, 'train_steps_per_second': 0.014, 'total_flos': 2314178867374080.0, 'train_loss': 0.006912035828586235, 'epoch': 10.0})

In [25]:
trainer.save_model("../../model/Electra")  # Saves model and tokenizer
tokenizer.save_pretrained("../../model/Electra")

Saving model checkpoint to ../../model/Electra
Configuration saved in ../../model/Electra\config.json


('../../model/Electra\\tokenizer_config.json',
 '../../model/Electra\\special_tokens_map.json',
 '../../model/Electra\\tokenizer.json')

In [None]:




# ===================================================
# Inference & Explainability Section
# ===================================================
# After training, set your model to evaluation mode.
model.eval()

# -----------------------------
# A. Simple Prediction Function
# -----------------------------
def predict(text):
    inputs = tokenizer(text, return_tensors="pt", padding="max_length", truncation=True, max_length=512)
    inputs = {k: v.to(model.device) for k, v in inputs.items()}
    with torch.no_grad():
        outputs = model(**inputs)
    logits = outputs.logits
    probs = torch.sigmoid(logits)
    return probs.detach().cpu().numpy()

# Example prediction
sample_text = "Your sample open source license text here."
pred_probs = predict(sample_text)
print("Predicted probabilities:", pred_probs)

# -----------------------------
# B. Integrated Gradients with Captum
# -----------------------------
from captum.attr import IntegratedGradients

def interpret_with_integrated_gradients(text, target_label_idx=0):
    inputs = tokenizer(text, return_tensors="pt", padding="max_length", truncation=True, max_length=512)
    inputs = {k: v.to(model.device) for k, v in inputs.items()}
    
    def forward_func(input_ids, attention_mask):
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        # Return logits for the target label
        return logits[:, target_label_idx]
    
    ig = IntegratedGradients(forward_func)
    attributions, delta = ig.attribute(inputs["input_ids"],
                                       additional_forward_args=(inputs["attention_mask"],),
                                       return_convergence_delta=True)
    # Sum attributions across embedding dimensions
    attributions_sum = attributions.sum(dim=-1).squeeze(0)
    tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])
    return tokens, attributions_sum.cpu().detach().numpy(), delta.cpu().detach().numpy()

tokens, attributions, delta = interpret_with_integrated_gradients(sample_text, target_label_idx=0)
print("Tokens:", tokens)
print("Integrated Gradients Attributions:", attributions)

# -----------------------------
# C. LIME Explanation
# -----------------------------
from lime.lime_text import LimeTextExplainer

def explain_with_lime(text):
    explainer = LimeTextExplainer(class_names=mlb.classes_)
    
    def predict_proba(texts):
        inputs = tokenizer(texts, return_tensors="pt", padding="max_length", truncation=True, max_length=512)
        inputs = {k: v.to(model.device) for k, v in inputs.items()}
        outputs = model(**inputs)
        logits = outputs.logits
        probs = torch.sigmoid(logits)
        return probs.detach().cpu().numpy()
    
    exp = explainer.explain_instance(text, predict_proba, num_features=10, num_samples=100)
    return exp.as_list()

lime_explanation = explain_with_lime(sample_text)
print("LIME Explanation:", lime_explanation)

# -----------------------------
# D. SHAP Explanation
# -----------------------------
import shap

def explain_with_shap(text):
    # Define a prediction function for SHAP
    def predict_fn(texts):
        inputs = tokenizer(texts, return_tensors="pt", padding="max_length", truncation=True, max_length=512)
        inputs = {k: v.to(model.device) for k, v in inputs.items()}
        outputs = model(**inputs)
        logits = outputs.logits
        probs = torch.sigmoid(logits)
        return probs.detach().cpu().numpy()
    
    # Create a SHAP explainer (this may take time on first run)
    explainer = shap.Explainer(predict_fn, tokenizer)
    shap_values = explainer([text])
    return shap_values

shap_values = explain_with_shap(sample_text)
# Visualize the SHAP values for text explanation (this will open a plot in supported environments)
shap.plots.text(shap_values)
