In [4]:
%load_ext autoreload
%autoreload 2

import sys
from pathlib import Path
sys.path.append('../')

from torch import nn, Tensor, save
from transformers import AutoTokenizer

import src.utils as ut
import src.trainer as trn
import src.finetuning as ft

from src.utils import LLAMA_MODEL_ID, login_to_hf_hub
from src.txt_dataset import TokenizedTxtDataset


login_to_hf_hub()
TOKENIZER = AutoTokenizer.from_pretrained(LLAMA_MODEL_ID)

ut.gpu_mem_info()


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
Returning secret from environment variable `HF_API_KEY`=`hf...Gk`
The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: fineGrained).
Your token has been saved to /home/teo/.cache/huggingface/token
Login successful


{'gpu_mem_used_MB': 4950.07,
 'gpu_mem_free_MB': 1070.6,
 'gpu_mem_total_MB': 6020.66}

In [2]:
model = ut.load_raw_model()
print("before installing LoRA layers: ", ut.gpu_mem_info())

ft.freeze_and_install_lora(model, lora_rank=16)
print("after installing LoRA layers: ", ut.gpu_mem_info())

before installing LoRA layers:  {'gpu_mem_used_MB': 2569.8, 'gpu_mem_free_MB': 3450.86, 'gpu_mem_total_MB': 6020.66}
Parámetros sin LoRA: 167,772,160 || Parámetros con LoRA: 3,407,872  || Porcentaje de parámetros con LoRA: 1.99%
after installing LoRA layers:  {'gpu_mem_used_MB': 3119.25, 'gpu_mem_free_MB': 2901.41, 'gpu_mem_total_MB': 6020.66}


In [3]:
text_fpaths = [
    Path("../data/Estatutos-Universidad-de-los-Andes-2020-ratificados-MEN-RQ.preprocessed.txt"),
    Path("../data/reglamento-maestria-web-2024.preprocessed.txt")
]

# English versions:

# text_fpaths = [
#    Path("../data/Estatutos-Universidad-de-los-Andes-2020-ratificados-MEN-RQ.translated.txt"),
#    Path("../data/reglamento-maestria-web-2024.translated.txt"),
#]

train_ds = TokenizedTxtDataset(text_fpaths,
                block_size=128,
                stride=64,
                tokenizer=TOKENIZER,
                start_pct=1.0,
                end_pct=100.0
            )

valid_ds = TokenizedTxtDataset(text_fpaths,
                block_size=128,
                stride=64,
                tokenizer=TOKENIZER,
                start_pct=0.0,
                end_pct=1.0
            )

print("len(ds):", len(train_ds), "max_stride_mult:", train_ds.max_stride_mult)

DEVICE = ut.module_device(model)
print(f"DEVICE: {DEVICE}")
TRAIN_BATCH_SIZE = 1
VALID_BATCH_SIZE = 8
LEARNING_RATE = 2e-4

trainer = trn.Trainer(
    train_ds=train_ds,
    train_batch_size=TRAIN_BATCH_SIZE,
    valid_ds=valid_ds,
    valid_batch_size=VALID_BATCH_SIZE,
    lr=LEARNING_RATE,
    device=DEVICE
)

def pred_next_token_loss(model: nn.Module, batch: dict[str, Tensor]) -> Tensor:
    return model(input_ids=batch['input_ids'],
                 attention_mask=batch['attention_mask'],
                 labels=batch['input_ids']).loss

trainer.train(model,
              loss_fun=pred_next_token_loss,
              max_steps=70,
              accum_grad_steps=8)

../data/Estatutos-Universidad-de-los-Andes-2020-ratificados-MEN-RQ.preprocessed.txt             :  51,028 bytes,    819 lines
../data/reglamento-maestria-web-2024.preprocessed.txt                                           :  60,045 bytes,    838 lines
concat_files_to_str: returning 113,571 characters, whole text at: tokenized_txt_dataset.concat.txt
TokenizedTxtDs.__init__: len(all_text)=113,571
n_toks_raw: 28428 start_idx: 284 end_idx: 28428
n_toks_sampled:  28,144  input_ids length (padded):  28,160 block_size: 128 n_blocks: 220
../data/Estatutos-Universidad-de-los-Andes-2020-ratificados-MEN-RQ.preprocessed.txt             :  51,028 bytes,    819 lines
../data/reglamento-maestria-web-2024.preprocessed.txt                                           :  60,045 bytes,    838 lines
concat_files_to_str: returning 113,571 characters, whole text at: tokenized_txt_dataset.concat.txt
TokenizedTxtDs.__init__: len(all_text)=113,571
n_toks_raw: 28428 start_idx: 0 end_idx: 284
n_toks_sampled:     28

Step    5 - Train loss: 2.174                     (tokens/sec: 1435) - estimating loss on 'validation' dataset: 100%|██████████| 1/1 [00:00<00:00,  6.66it/s]


Step    5 -                    Valid loss: 3.224


Step   10 - Train loss: 2.056                     (tokens/sec: 1567) - estimating loss on 'validation' dataset: 100%|██████████| 1/1 [00:00<00:00,  6.89it/s]


Step   10 -                    Valid loss: 3.018


Step   15 - Train loss: 1.928                     (tokens/sec: 1552) - estimating loss on 'validation' dataset: 100%|██████████| 1/1 [00:00<00:00,  6.48it/s]


Step   15 -                    Valid loss: 3.109


Step   20 - Train loss: 1.941                     (tokens/sec: 1549) - estimating loss on 'validation' dataset: 100%|██████████| 1/1 [00:00<00:00,  6.41it/s]


Step   20 -                    Valid loss: 3.191


Step   25 - Train loss: 1.891                     (tokens/sec: 1544) - estimating loss on 'validation' dataset: 100%|██████████| 1/1 [00:00<00:00,  6.34it/s]


Step   25 -                    Valid loss: 3.345


Step   30 - Train loss: 1.925                     (tokens/sec: 1547) - estimating loss on 'validation' dataset: 100%|██████████| 1/1 [00:00<00:00,  6.30it/s]


Step   30 -                    Valid loss: 3.478


Step   35 - Train loss: 1.893                     (tokens/sec: 1542) - estimating loss on 'validation' dataset: 100%|██████████| 1/1 [00:00<00:00,  6.22it/s]


Step   35 -                    Valid loss: 1.943


Step   40 - Train loss: 1.797                     (tokens/sec: 1538) - estimating loss on 'validation' dataset: 100%|██████████| 1/1 [00:00<00:00,  6.32it/s]


Step   40 -                    Valid loss: 1.134


Step   45 - Train loss: 1.818                     (tokens/sec: 1539) - estimating loss on 'validation' dataset: 100%|██████████| 1/1 [00:00<00:00,  6.32it/s]


Step   45 -                    Valid loss: 1.084


Step   50 - Train loss: 1.736                     (tokens/sec: 1534) - estimating loss on 'validation' dataset: 100%|██████████| 1/1 [00:00<00:00,  6.28it/s]


Step   50 -                    Valid loss: 1.043


Step   55 - Train loss: 1.752                     (tokens/sec: 1535) - estimating loss on 'validation' dataset: 100%|██████████| 1/1 [00:00<00:00,  6.32it/s]


Step   55 -                    Valid loss: 0.995
train_dl exhausted, resetting... epoch_cnt=1


Step   60 - Train loss: 1.445                     (tokens/sec: 1536) - estimating loss on 'validation' dataset: 100%|██████████| 1/1 [00:00<00:00,  6.32it/s]


Step   60 -                    Valid loss: 0.984


Step   65 - Train loss: 1.401                     (tokens/sec: 1526) - estimating loss on 'validation' dataset: 100%|██████████| 1/1 [00:00<00:00,  6.24it/s]


Step   65 -                    Valid loss: 1.000


Step   69 - Train loss: 1.426                     (tokens/sec: 1525) - estimating loss on 'validation' dataset: 100%|██████████| 1/1 [00:00<00:00,  6.22it/s]


Step   69 -                    Valid loss: 0.989


Step   70 - Train loss: 1.361                     (tokens/sec: 1511) - estimating loss on 'validation' dataset: 100%|██████████| 1/1 [00:00<00:00,  6.35it/s]

Step   70 -                    Valid loss: 0.988





LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 2048)
    (layers): ModuleList(
      (0-15): 16 x LlamaDecoderLayer(
        (self_attn): LlamaSdpaAttention(
          (q_proj): LoraLinear(
            (linear_layer): Linear(in_features=2048, out_features=2048, bias=False)
          )
          (k_proj): LoraLinear(
            (linear_layer): Linear(in_features=2048, out_features=512, bias=False)
          )
          (v_proj): LoraLinear(
            (linear_layer): Linear(in_features=2048, out_features=512, bias=False)
          )
          (o_proj): LoraLinear(
            (linear_layer): Linear(in_features=2048, out_features=2048, bias=False)
          )
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=2048, out_features=8192, bias=False)
          (up_proj): Linear(in_features=2048, out_features=8192, bias=False)
          (down_proj): Linear(in_features=8192, out_features=2

In [None]:
save(model, "r16-e70-spa-v1.ckpt")