In [1]:
%load_ext autoreload
%autoreload 2

import sys
sys.path.append('../')
from groq import Groq
from src.utils import get_secret

In [2]:
api_keys = get_secret("GROQ_API_KEYS").split(';')
client = Groq(api_key=api_keys[2])

Returning secret from environment variable `GROQ_API_KEYS`=`gs...a9`


In [3]:
PROMPT_TMPL = """Please translate the following single choice question and the 4 answer options regarding regulations of Los Andes University to English:
PREGUNTA: {question}

{options}

Your answer should consist of only 5 lines.
The expected format of those lines is:
QUESTION: <translation of PREGUNTA to *English*>
OPTION A: <translation of Opción A. to *English*>
OPTION B: <translation of Opción B. to *English*>
OPTION C: <translation of Opción C. to *English*>
OPTION D: <translation of Opción D. to *English*>
"""

GROQ_MODEL = "llama-3.2-1b-preview"

# rec = "¿Qué requisito de grado puede variar según el programa específico de maestría?,El promedio acumulado mínimo,El trabajo de grado,La asistencia a clases,El requisito de inglés".split(",")
rec="¿Cuál es el nivel de promedio acumulado necesario para recibir el grado Cum Laude?,Tener promedio acumulado superior a 4.5,Estar en el 3% superior del promedio histórico,Tener promedio acumulado de 5.0,Promedio superior a 4.0"

rec_parts = rec.split(",")
question = rec_parts[0]
options = [f"Opción {let}. {ans}" for let, ans in zip("ABCD", rec_parts[1:5])]

prompt = PROMPT_TMPL.format(question=question, options="\n".join(options))

print(f"====BEGIN-PROMPT===\n{prompt}\n=====END-PROMPT======\n")

chat_completion = client.chat.completions.create(
                        messages=[{
                            "role": "user",
                            "content": prompt
                        }],
                        model=GROQ_MODEL,
                        temperature=0.0
                    )

completion = chat_completion.choices[0].message.content
print(f"====BEGIN-COMPLETION===\n{completion}\n=======END-COMPLETION=======")

====BEGIN-PROMPT===
Please translate the following single choice question and the 4 answer options regarding regulations of Los Andes University to English:
PREGUNTA: ¿Cuál es el nivel de promedio acumulado necesario para recibir el grado Cum Laude?

Opción A. Tener promedio acumulado superior a 4.5
Opción B. Estar en el 3% superior del promedio histórico
Opción C. Tener promedio acumulado de 5.0
Opción D. Promedio superior a 4.0

Your answer should consist of only 5 lines.
The expected format of those lines is:
QUESTION: <translation of PREGUNTA to *English*>
OPTION A: <translation of Opción A. to *English*>
OPTION B: <translation of Opción B. to *English*>
OPTION C: <translation of Opción C. to *English*>
OPTION D: <translation of Opción D. to *English*>


====BEGIN-COMPLETION===
QUESTION: Nivel de promedio acumulado necesario para Cum Laude
Opción A: Nivel de promedio acumulado superior a 4.5
Opción B: Nivel de promedio histórico superior al 3%
Opción C: Nivel de promedio acumulado 

In [4]:
PROMPT_TMPL = """Please translate the following to English:
"{sentence}"
Please provide ONLY A SINGLE English translation"""

GROQ_MODEL = "llama-3.2-1b-preview"

# rec = "¿Qué requisito de grado puede variar según el programa específico de maestría?,El promedio acumulado mínimo,El trabajo de grado,La asistencia a clases,El requisito de inglés".split(",")
rec="¿Cuál es el nivel de promedio acumulado necesario para recibir el grado Cum Laude?,Tener promedio acumulado superior a 4.5,Estar en el 3% superior del promedio histórico,Tener promedio acumulado de 5.0,Promedio superior a 4.0"

rec_parts = rec.split(",")
question = rec_parts[0]
options = [f"Opción {let}. {ans}" for let, ans in zip("ABCD", rec_parts[1:5])]

for a_str in rec_parts:
    prompt = PROMPT_TMPL.format(sentence=a_str)

    print(f"====BEGIN-PROMPT===\n{prompt}\n=====END-PROMPT======")

    chat_completion = client.chat.completions.create(
                            messages=[{
                                "role": "user",
                                "content": prompt
                            }],
                            model=GROQ_MODEL,
                            temperature=0.0
                        )

    completion = chat_completion.choices[0].message.content
    print(f"====BEGIN-COMPLETION===\n{completion}\n=======END-COMPLETION=======\n")

====BEGIN-PROMPT===
Please translate the following to English:
"¿Cuál es el nivel de promedio acumulado necesario para recibir el grado Cum Laude?"
Please provide ONLY A SINGLE English translation
====BEGIN-COMPLETION===
"¿Cuál es el nivel de promedio acumulado necesario para recibir el grado Cum Laude?" 

 translates to "What is the accumulated average required to receive the Cum Laude degree?"

====BEGIN-PROMPT===
Please translate the following to English:
"Tener promedio acumulado superior a 4.5"
Please provide ONLY A SINGLE English translation
====BEGIN-COMPLETION===
"Total acumulado superior a 4.5"

====BEGIN-PROMPT===
Please translate the following to English:
"Estar en el 3% superior del promedio histórico"
Please provide ONLY A SINGLE English translation
====BEGIN-COMPLETION===
"Estar en el 3% superior del promedio histórico" translates to "Being at the 3% above the historical average".

====BEGIN-PROMPT===
Please translate the following to English:
"Tener promedio acumulado de

In [5]:
from pathlib import Path
import pandas as pd
import torch as pt
from torch import nn, Tensor
from torch.utils.data import DataLoader

from transformers import AutoTokenizer, AutoModelForCausalLM

import src.utils as ut
import src.trainer as trn
import src.finetuning as ft

from src.utils import LLAMA_MODEL_ID, login_to_hf_hub, load_test_df

login_to_hf_hub()
TOKENIZER = AutoTokenizer.from_pretrained(LLAMA_MODEL_ID)

ut.gpu_mem_info()

Returning secret from environment variable `HF_API_KEY`=`hf...Gk`
The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: fineGrained).
Your token has been saved to /home/teo/.cache/huggingface/token
Login successful


{'gpu_mem_used_MB': 188.15,
 'gpu_mem_free_MB': 5832.51,
 'gpu_mem_total_MB': 6020.66}

In [6]:
from src.txt_dataset import TokenizedTxtDataset

text_fpaths = [
    Path("../data/Estatutos-Universidad-de-los-Andes-2020-ratificados-MEN-RQ.translated.txt"),
    Path("../data/reglamento-maestria-web-2024.translated.txt"),
]

train_ds = TokenizedTxtDataset(text_fpaths,
                block_size=128,
                stride=64,
                tokenizer=TOKENIZER,
                start_pct=0.0,
                end_pct=90.0
            )

valid_ds = TokenizedTxtDataset(text_fpaths,
                block_size=128,
                stride=64,
                tokenizer=TOKENIZER,
                start_pct=90.0,
                end_pct=100.0
            )

print("len(ds):", len(train_ds), "max_stride_mult:", train_ds.max_stride_mult)

../data/Estatutos-Universidad-de-los-Andes-2020-ratificados-MEN-RQ.translated.txt               :  49,682 bytes,    689 lines
../data/reglamento-maestria-web-2024.translated.txt                                             :  54,466 bytes,    686 lines
concat_files_to_str: returning 108,502 characters, whole text at: tokenized_txt_dataset.concat.txt
TokenizedTxtDs.__init__: len(all_text)=108,502
n_toks_raw: 20674 start_idx: 0 end_idx: 18606
n_toks_sampled:  18,606  input_ids length (padded):  18,688 block_size: 128 n_blocks: 146
../data/Estatutos-Universidad-de-los-Andes-2020-ratificados-MEN-RQ.translated.txt               :  49,682 bytes,    689 lines
../data/reglamento-maestria-web-2024.translated.txt                                             :  54,466 bytes,    686 lines
concat_files_to_str: returning 108,502 characters, whole text at: tokenized_txt_dataset.concat.txt
TokenizedTxtDs.__init__: len(all_text)=108,502
n_toks_raw: 20674 start_idx: 18606 end_idx: 20674
n_toks_sampled:   

In [1]:
train_dl = DataLoader(train_ds, batch_size=4, shuffle=False)
for batch in train_dl:
    break

## batch

NameError: name 'DataLoader' is not defined

In [8]:
model = ut.load_raw_model()
ut.gpu_mem_info()

{'gpu_mem_used_MB': 2662.79,
 'gpu_mem_free_MB': 3357.87,
 'gpu_mem_total_MB': 6020.66}

In [9]:
ft.freeze_and_install_lora(model, lora_rank=16)
ut.gpu_mem_info()

Parámetros sin LoRA: 167,772,160 || Parámetros con LoRA: 3,407,872  || Porcentaje de parámetros con LoRA: 1.99%


{'gpu_mem_used_MB': 3212.25,
 'gpu_mem_free_MB': 2808.41,
 'gpu_mem_total_MB': 6020.66}

In [10]:

DEVICE = ut.module_device(model)
print(f"DEVICE: {DEVICE}")
TRAIN_BATCH_SIZE = 1
VALID_BATCH_SIZE = 8
LEARNING_RATE = 2e-4

trainer = trn.Trainer(
    train_ds=train_ds,
    train_batch_size=TRAIN_BATCH_SIZE,
    valid_ds=valid_ds,
    valid_batch_size=VALID_BATCH_SIZE,
    lr=LEARNING_RATE,
    device=DEVICE
)

DEVICE: cuda

Initializing trainer: device: cuda - len(train_ds)=292, len(valid_ds)=34


In [None]:
def pred_next_token_loss(model: nn.Module, batch: dict[str, Tensor]) -> Tensor:
    return model(input_ids=batch['input_ids'],
                 attention_mask=batch['attention_mask'],
                 labels=batch['input_ids']).loss

trainer.train(model,
              loss_fun=pred_next_token_loss,
              max_steps=70,
              accum_grad_steps=8)

Entrenando por 70 pasos. 1 epochs
Nota Importante:
    El `Train Loss` que se reporta se calcula únicamente sobre los datos de los últimos 8 pasos de entrenamiento.
    El `Valid Loss` es sobre *todos* los datos de validación


Step    5 - Train loss: 3.049                     (tokens/sec: 1326) - estimating loss on 'validation' dataset: 100%|██████████| 5/5 [00:00<00:00,  6.08it/s]


Step    5 -                    Valid loss: 2.288


Step   10 - Train loss: 2.624                     (tokens/sec: 1424) - estimating loss on 'validation' dataset: 100%|██████████| 5/5 [00:00<00:00,  6.07it/s]


Step   10 -                    Valid loss: 2.202


Step   15 - Train loss: 2.451                     (tokens/sec: 1431) - estimating loss on 'validation' dataset: 100%|██████████| 5/5 [00:00<00:00,  6.05it/s]


Step   15 -                    Valid loss: 2.166


Step   20 - Train loss: 2.454                     (tokens/sec: 1432) - estimating loss on 'validation' dataset: 100%|██████████| 5/5 [00:00<00:00,  5.98it/s]


Step   20 -                    Valid loss: 2.163


Step   25 - Train loss: 2.372                     (tokens/sec: 1426) - estimating loss on 'validation' dataset: 100%|██████████| 5/5 [00:00<00:00,  6.00it/s]


Step   25 -                    Valid loss: 2.159


Step   30 - Train loss: 2.337                     (tokens/sec: 1424) - estimating loss on 'validation' dataset: 100%|██████████| 5/5 [00:00<00:00,  6.02it/s]


Step   30 -                    Valid loss: 2.146


Step   35 - Train loss: 2.295                     (tokens/sec: 1387) - estimating loss on 'validation' dataset: 100%|██████████| 5/5 [00:00<00:00,  6.01it/s]


Step   35 -                    Valid loss: 2.136
train_dl exhausted, resetting... epoch_cnt=1


Step   40 - Train loss: 2.017                     (tokens/sec: 1421) - estimating loss on 'validation' dataset: 100%|██████████| 5/5 [00:00<00:00,  5.97it/s]


Step   40 -                    Valid loss: 2.153


Step   45 - Train loss: 1.945                     (tokens/sec: 1096) - estimating loss on 'validation' dataset: 100%|██████████| 5/5 [00:00<00:00,  6.00it/s]


Step   45 -                    Valid loss: 2.196


Step   50 - Train loss: 1.930                     (tokens/sec: 1415) - estimating loss on 'validation' dataset: 100%|██████████| 5/5 [00:00<00:00,  5.97it/s]


Step   50 -                    Valid loss: 2.222


Step   55 - Train loss: 1.884                     (tokens/sec: 1419) - estimating loss on 'validation' dataset: 100%|██████████| 5/5 [00:00<00:00,  5.97it/s]


Step   55 -                    Valid loss: 2.208


Step   60 - Train loss: 1.758                     (tokens/sec: 1411) - estimating loss on 'validation' dataset: 100%|██████████| 5/5 [00:00<00:00,  6.01it/s]


Step   60 -                    Valid loss: 2.206


Step   65 - Train loss: 1.765                     (tokens/sec: 1417) - estimating loss on 'validation' dataset: 100%|██████████| 5/5 [00:00<00:00,  5.93it/s]


Step   65 -                    Valid loss: 2.210


Step   69 - Train loss: 1.881                     (tokens/sec: 1413) - estimating loss on 'validation' dataset: 100%|██████████| 5/5 [00:00<00:00,  5.95it/s]


Step   69 -                    Valid loss: 2.215


Step   70 - Train loss: 1.628                     (tokens/sec: 1400) - estimating loss on 'validation' dataset: 100%|██████████| 5/5 [00:00<00:00,  5.98it/s]

Step   70 -                    Valid loss: 2.222





LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 2048)
    (layers): ModuleList(
      (0-15): 16 x LlamaDecoderLayer(
        (self_attn): LlamaSdpaAttention(
          (q_proj): LoraLinear(
            (linear_layer): Linear(in_features=2048, out_features=2048, bias=False)
          )
          (k_proj): LoraLinear(
            (linear_layer): Linear(in_features=2048, out_features=512, bias=False)
          )
          (v_proj): LoraLinear(
            (linear_layer): Linear(in_features=2048, out_features=512, bias=False)
          )
          (o_proj): LoraLinear(
            (linear_layer): Linear(in_features=2048, out_features=2048, bias=False)
          )
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=2048, out_features=8192, bias=False)
          (up_proj): Linear(in_features=2048, out_features=8192, bias=False)
          (down_proj): Linear(in_features=8192, out_features=2

In [15]:
pt.save(model, "../data/r16-e70.ckpt")
