In [3]:
from datetime import datetime
from pathlib import Path

from transformers import TrainingArguments

from src.minilm.config import DataArguments, DataConfig, DatasetSource, ModelConfig
from src.minilm.data import get_tokenized_datasets
from src.minilm.trainer import DistillationConfig, DistillationTrainer

## Datasets Config

In [4]:
# Training corpus: BookCorpus, read from its "text" column.
# NOTE(review): `is_hf=True` presumably marks `name` as a Hugging Face Hub
# dataset id -- confirm against DatasetSource.
dataset_source_1 = DatasetSource(
    name="bookcorpus/bookcorpus",
    column="text",
    is_hf=True,
)

# Disabled second source (English Wikipedia, "20220301.en" subset).
# To use it, uncomment the definition and add it to the `sources` list
# passed to DataConfig.
# dataset_source_2 = DatasetSource(
#     name="legacy-datasets/wikipedia",
#     subset="20220301.en",
#     column="text",
#     is_hf=True,
# )

In [5]:
# Training data is built from a single source (BookCorpus).
# NOTE(review): ".cache" is presumably where downloaded/processed data is
# stored -- confirm against DataConfig.
data_config = DataConfig(
    cache_dir=".cache",
    sources=[dataset_source_1],
)

In [6]:
# 512 equals the teacher's `max_position_embeddings` (bert-base-uncased).
data_args = DataArguments(
    max_seq_len=512,
    train_config=data_config,
)

## Model Config

In [7]:
model_config = ModelConfig(
    # --- Teacher ---
    input_model_dir="google-bert/bert-base-uncased",
    model_type="bert",
    teacher_layer=12,  # value used for all models in the paper
    # --- Student: 6 layers x 768 hidden size, 12 attention heads ---
    student_num_layers=6,
    student_hidden_size=768,
    student_attention_heads=12,
    # --- Distillation ---
    num_relation_heads=48,  # paper: 48 for base models, 64 for large models
    # --- Misc ---
    cache_dir=".cache",
)

## Training Config

In [17]:
# Timestamped run directory: results/<model id with "/" replaced by "-">_<timestamp>.
# Because the timestamp comes from `datetime.now()`, re-running this cell
# yields a new directory name each time.
dt = datetime.now().strftime("%Y-%b-%d_%H-%M-%S")
output_dir = Path(
    "results",
    f"{model_config.input_model_dir.replace('/', '-')}_{dt}",
)
output_dir

PosixPath('results/google-bert-bert-base-uncased_2025-Feb-10_09-51-42')

In [29]:
training_args = TrainingArguments(
    # --- Output / checkpointing ---
    output_dir=output_dir,
    save_strategy="steps",
    save_steps=500,  # reference command uses 50_000
    save_total_limit=5,
    # --- Logging ---
    logging_steps=100,  # reference command uses 1_000
    # --- Schedule ---
    max_steps=400_000,
    warmup_steps=4_000,
    # --- Batch size (reference command uses 256 per device) ---
    per_device_train_batch_size=32,
    # --- Optimizer ---
    learning_rate=6e-4,
    weight_decay=0.01,
    adam_beta1=0.9,
    adam_beta2=0.999,
    adam_epsilon=1e-6,
    # --- Reproducibility / distributed ---
    seed=42,
    ddp_find_unused_parameters=True,
    # --- Best-model tracking ---
    # `load_best_model_at_end` is currently disabled; the two settings below
    # describe the metric that would be used for it (lower loss is better).
    # load_best_model_at_end=True,
    metric_for_best_model="loss",
    greater_is_better=False,
)

## Distillation Config

In [30]:
# Bundle the data, model and training settings into the single config object
# that DistillationTrainer is constructed from.
config = DistillationConfig(
    data_args=data_args,
    model_config=model_config,
    training_args=training_args,
)

## Distillation Trainer

In [31]:
# Constructing the trainer loads the teacher model and creates the student
# model, then logs the student configuration (see the log output below).
trainer = DistillationTrainer(config=config)

[32m2025-02-10 09:58:56.642[0m | [1mINFO    [0m | [36msrc.minilm.trainer.distillation[0m:[36m_create_teacher[0m:[36m87[0m - [1mLoading teacher model...[0m
[32m2025-02-10 09:58:56.827[0m | [1mINFO    [0m | [36msrc.minilm.trainer.distillation[0m:[36m_create_student[0m:[36m95[0m - [1mCreating student model...[0m
[32m2025-02-10 09:58:57.331[0m | [1mINFO    [0m | [36msrc.minilm.trainer.distillation[0m:[36m_create_student[0m:[36m108[0m - [1mStudent configuration:
BertConfig {
  "_name_or_path": "google-bert/bert-base-uncased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 6,
  "pad_token

### Create Datasets

In [12]:
# Tokenize the configured sources with the trainer's tokenizer.
# "do_not_pad" leaves sequences unpadded at tokenization time.
# NOTE(review): padding is presumably applied later, per batch -- confirm
# against the collator used by DistillationTrainer.
train_dataset, eval_dataset = get_tokenized_datasets(
    tokenizer=trainer.tokenizer,
    tokenization_kwargs={"padding": "do_not_pad"},
    data_args=data_args,
)

Tokenizing bookcorpus/bookcorpus (num_proc=11):   0%|          | 0/74004228 [00:00<?, ? examples/s]

[32m2025-02-10 09:37:33.152[0m | [1mINFO    [0m | [36msrc.minilm.data.utils[0m:[36mprepare_dataset[0m:[36m116[0m - [1mCreated dataset with 74004228 samples from 1 sources[0m


In [32]:
# Start distillation. With max_steps=400_000 this is a long-running cell; the
# recorded output below was interrupted manually (KeyboardInterrupt) after
# step 1000.
trainer.train(train_dataset, eval_dataset)

[32m2025-02-10 09:59:02.897[0m | [1mINFO    [0m | [36msrc.minilm.trainer.distillation[0m:[36mtrain[0m:[36m158[0m - [1mStarting training...[0m


Step,Training Loss
100,0.9888
200,0.7711
300,0.6898
400,0.6461
500,0.62
600,0.5986
700,0.5881
800,0.5697
900,0.5612
1000,0.5574


KeyboardInterrupt: 

I found the minilmv2 repository. I want to recreate it in my repo but with improvements. 
The idea is to go over the code step by step, refactoring and optimizing it, making it more Pythonic, and generally making it better and easier to understand. This will also help me get to know the code, because I am new to it.
Once we have an overview of the whole project, we will start improving the overall structure. Here is the link to the repository: https://github.com/LazarusNLP/minilmv2.bb — it is a fork of the original (https://github.com/bloomberg/minilmv2.bb), but I think it contains some improvements, which is why I want to use both the fork and the original.
I want to turn this into a library with documentation, but I think it is best to leave that for the end.
I have already created a repo, cloned it, and implemented most of the scripts. This is my project structure:
minilmv2-py3.10➜  minilmv2 git:(feat/train) ✗ tree
.
├── README.md
├── configs
│   ├── default
│   │   ├── train_config.json
│   │   └── val_config.json
│   └── examples
│       ├── custom_dataset.json
│       ├── mixed_sources.json
│       └── wikipedia_books.json
├── distillation.ipynb
├── docs
│   └── index.md
├── examples
│   ├── dataset_basic_usage.py
│   └── distillation_usage.py
├── poetry.lock
├── pyproject.toml
├── src
│   └── minilm
│       ├── __init__.py
│       ├── __pycache__
│       │   └── __init__.cpython-310.pyc
│       ├── config
│       │   ├── __init__.py
│       │   ├── __pycache__
│       │   │   ├── __init__.cpython-310.pyc
│       │   │   ├── data_config.cpython-310.pyc
│       │   │   ├── model_config.cpython-310.pyc
│       │   │   ├── parsers.cpython-310.pyc
│       │   │   ├── parsers_old.cpython-310.pyc
│       │   │   └── training_config.cpython-310.pyc
│       │   ├── data_config.py
│       │   ├── model_config.py
│       │   ├── parsers.py
│       │   └── training_config.py
│       ├── data
│       │   ├── __init__.py
│       │   ├── __pycache__
│       │   │   ├── __init__.cpython-310.pyc
│       │   │   ├── data_utils.cpython-310.pyc
│       │   │   ├── data_utils_org.cpython-310.pyc
│       │   │   └── utils.cpython-310.pyc
│       │   └── utils.py
│       ├── models
│       │   ├── __init__.py
│       │   ├── __pycache__
│       │   │   ├── __init__.cpython-310.pyc
│       │   │   ├── minilm.cpython-310.pyc
│       │   │   └── minilmv2_old.cpython-310.pyc
│       │   ├── minilm.py
│       │   └── minilmv2_old.py
│       ├── scripts
│       │   └── run_distillation.py
│       ├── trainer
│       │   ├── __init__.py
│       │   ├── __pycache__
│       │   │   ├── __init__.cpython-310.pyc
│       │   │   └── distillation.cpython-310.pyc
│       │   └── distillation.py
│       └── utils
├── test.ipynb
├── test_output
│   ├── checkpoint-100
│   │   ├── model.safetensors
│   │   ├── optimizer.pt
│   │   ├── rng_state.pth
│   │   ├── scheduler.pt
│   │   ├── trainer_state.json
│   │   └── training_args.bin
│   ├── checkpoint-125
│   │   ├── model.safetensors
│   │   ├── optimizer.pt
│   │   ├── rng_state.pth
│   │   ├── scheduler.pt
│   │   ├── trainer_state.json
│   │   └── training_args.bin
│   ├── checkpoint-25
│   │   ├── model.safetensors
│   │   ├── optimizer.pt
│   │   ├── rng_state.pth
│   │   ├── scheduler.pt
│   │   ├── trainer_state.json
│   │   └── training_args.bin
│   └── checkpoint-50
│       ├── model.safetensors
│       ├── optimizer.pt
│       ├── rng_state.pth
│       ├── scheduler.pt
│       ├── trainer_state.json
│       └── training_args.bin
└── tests
    └── __init__.py

25 directories, 68 files

Questions:
1. I want to test my project and see whether I get the same (or close) results as they did. Here are the results they report in their README:
|                      | qnli  | qqp   | rte   | sst2  | mnli  | Avg    |
|----------------------|-------|-------|-------|-------|-------|--------|
| MiniLM 6x768 (ours)  | 89.05 | 90.47 | 60.65 | 91.63 | 82.92 |  82.94 |
| MiniLM 6x384 (ours)  | 89.44 | 90.47 | 63.18 | 91.28 | 82.59 | 83.392 |
| MiniLM 6x768 (paper) |  90.8 |  91.1 |  72.1 |  92.4 |  84.2 |  86.12 |
| MiniLM 6x384 (paper) | 90.24 | 90.51 | 66.43 | 91.17 | 82.91 |  84.25 |

And here is the command they provide for starting the training run:
CMD="python -m minilmv2.run_distillation -- "
# Set seed to ensure reproducibility. Set to -1 for no seed
SEED=42

# Arguments are grouped into three sections: data_params, training_params
# and model_params. The model_params below describe a student with 6 layers
# and hidden size 384, distilled from the bert-base-uncased checkpoint.
ARGS="data_params \
    --train_config ./train_config.json \
    training_params \
      --per_device_train_batch_size 256 \
      --learning_rate 6e-4 \
      --adam_epsilon 1e-6 \
      --adam_beta1 0.9 \
      --adam_beta2 0.999 \
      --weight_decay 0.01 \
      --max_steps 400000 \
      --save_steps 50000 \
      --logging_steps 1000 \
      --warmup_steps 4000 \
    --ddp_find_unused_parameters  true\
    --output_dir ./out\
    --seed=${SEED}\
    model_params \
     --input_model_dir ./model/bert-base-uncased/ \
      --student_hidden_size 384 \
      --student_num_layers 6 \
      --student_attention_heads 12 \
      --L 12 \
      --num_relation_heads 48\
"

$CMD $ARGS
My question is: which model is this command for? I do not understand their model naming. In `MiniLM 6x768`, what do 6 and 768 mean? How should I set my parameters to reproduce both of their models?

2. I want to obtain models with the same two student architectures as theirs, but using a different model as the teacher: ModernBERT. I will give you the configs of both the original BERT model and ModernBERT; please tell me how to set the parameters so that I get the same student models as in their results. Since the architectures are different, do I need to change anything?


* google-bert/bert-base-uncased
* Config:
BertConfig {
  "_attn_implementation_autoset": true,
  "_name_or_path": "bert-base-uncased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.48.2",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}
* Architecture:
<bound method Module.parameters of BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
        )
        (intermediate): BertIntermediate(
          (dense): Linear(in_features=768, out_features=3072, bias=True)
          (intermediate_act_fn): GELUActivation()
        )
        (output): BertOutput(
          (dense): Linear(in_features=3072, out_features=768, bias=True)
          (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
  )
  (pooler): BertPooler(
    (dense): Linear(in_features=768, out_features=768, bias=True)
    (activation): Tanh()
  )
)>

But it does not work for the new ModernBERT for some reason:
* model: answerdotai/ModernBERT-base
* Config:
ModernBertConfig {
  "_attn_implementation_autoset": true,
  "_name_or_path": "answerdotai/ModernBERT-base",
  "architectures": [
    "ModernBertForMaskedLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 50281,
  "classifier_activation": "gelu",
  "classifier_bias": false,
  "classifier_dropout": 0.0,
  "classifier_pooling": "mean",
  "cls_token_id": 50281,
  "decoder_bias": true,
  "deterministic_flash_attn": false,
  "embedding_dropout": 0.0,
  "eos_token_id": 50282,
  "global_attn_every_n_layers": 3,
  "global_rope_theta": 160000.0,
  "gradient_checkpointing": false,
  "hidden_activation": "gelu",
  "hidden_size": 768,
  "initializer_cutoff_factor": 2.0,
  "initializer_range": 0.02,
  "intermediate_size": 1152,
  "layer_norm_eps": 1e-05,
  "local_attention": 128,
  "local_rope_theta": 10000.0,
  "max_position_embeddings": 8192,
  "mlp_bias": false,
  "mlp_dropout": 0.0,
  "model_type": "modernbert",
  "norm_bias": false,
  "norm_eps": 1e-05,
  "num_attention_heads": 12,
  "num_hidden_layers": 22,
  "pad_token_id": 50283,
  "position_embedding_type": "absolute",
  "reference_compile": null,
  "repad_logits_with_grad": false,
  "sep_token_id": 50282,
  "sparse_pred_ignore_index": -100,
  "sparse_prediction": false,
  "torch_dtype": "float32",
  "transformers_version": "4.48.2",
  "vocab_size": 50368
}
* Architecture:
<bound method Module.parameters of ModernBertModel(
  (embeddings): ModernBertEmbeddings(
    (tok_embeddings): Embedding(50368, 768, padding_idx=50283)
    (norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (drop): Dropout(p=0.0, inplace=False)
  )
  (layers): ModuleList(
    (0): ModernBertEncoderLayer(
      (attn_norm): Identity()
      (attn): ModernBertAttention(
        (Wqkv): Linear(in_features=768, out_features=2304, bias=False)
        (rotary_emb): ModernBertRotaryEmbedding()
        (Wo): Linear(in_features=768, out_features=768, bias=False)
        (out_drop): Identity()
      )
      (mlp_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (mlp): ModernBertMLP(
        (Wi): Linear(in_features=768, out_features=2304, bias=False)
        (act): GELUActivation()
        (drop): Dropout(p=0.0, inplace=False)
        (Wo): Linear(in_features=1152, out_features=768, bias=False)
      )
    )
    (1-21): 21 x ModernBertEncoderLayer(
      (attn_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (attn): ModernBertAttention(
        (Wqkv): Linear(in_features=768, out_features=2304, bias=False)
        (rotary_emb): ModernBertRotaryEmbedding()
        (Wo): Linear(in_features=768, out_features=768, bias=False)
        (out_drop): Identity()
      )
      (mlp_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (mlp): ModernBertMLP(
        (Wi): Linear(in_features=768, out_features=2304, bias=False)
        (act): GELUActivation()
        (drop): Dropout(p=0.0, inplace=False)
        (Wo): Linear(in_features=1152, out_features=768, bias=False)
      )
    )
  )
  (final_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
)>