In [1]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, get_scheduler
from bitsandbytes.optim import Adam8bit,PagedAdam32bit
from peft import LoraConfig, get_peft_model, PeftModel, PeftConfig
from peft import prepare_model_for_kbit_training
import torch
from IPython.display import  clear_output
import time
import gc,os
from torch.utils.data import Dataset, DataLoader

device = 'cuda' if torch.cuda.is_available() else 'cpu'

DEFAULT_MODEL = '/home/nas/buffer/mohan.dash/llama_3_2_3B'#"meta-llama/Llama-3.2-3B-Instruct"
TOKENIZER_PATH = "llama_odia_tokenizer"

bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type='nf4',
        bnb_4bit_compute_dtype=torch.bfloat16,
        bnb_4bit_use_double_quant=False,
    )


model = AutoModelForCausalLM.from_pretrained(
    DEFAULT_MODEL,
    quantization_config=bnb_config,
    use_safetensors=True,
    device_map=device,
)

print(model.get_memory_footprint()/(1024*1024)) 

tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_PATH, use_safetensors=True)
tokenizer.pad_token_id = tokenizer.eos_token_id

def flush():
    torch.cuda.empty_cache()
    gc.collect()
    torch.cuda.empty_cache()
    gc.collect()


2025-04-25 07:33:13.340424: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1745566393.349761 1573652 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1745566393.352604 1573652 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-04-25 07:33:13.362996: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

2095.841064453125


In [2]:
LORA_ADAPTER_DIR = '/home/nas/buffer/mohan.dash/llama_3_finetuned/adapter'
OPTIMIZER_CKPT_DIR = '/home/nas/buffer/mohan.dash/llama_3_finetuned'

In [5]:
# Resize the model's token embeddings to match the tokenizer's vocab size
model.resize_token_embeddings(len(tokenizer))

embedding_state_dict = torch.load(f"{OPTIMIZER_CKPT_DIR}/embedding_weights.pt", map_location=device)
lm_head_state_dict = torch.load(f"{OPTIMIZER_CKPT_DIR}/lm_head_weights.pt", map_location=device)
# Load the trained embeddings and LM head
model.model.embed_tokens.load_state_dict(embedding_state_dict)
model.lm_head.load_state_dict(lm_head_state_dict)

The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


  embedding_state_dict = torch.load(f"{OPTIMIZER_CKPT_DIR}/embedding_weights.pt", map_location=device)
  lm_head_state_dict = torch.load(f"{OPTIMIZER_CKPT_DIR}/lm_head_weights.pt", map_location=device)


<All keys matched successfully>

In [6]:
model = PeftModel.from_pretrained(model, LORA_ADAPTER_DIR, is_trainable=True) # Biggest change in this script

In [13]:

question='ତୁମେ କିଏ?'
chat_template = f'''<|begin_of_text|> <|start_header_id|>user<|end_header_id|>\n\n{question}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n'''
inputs = tokenizer(chat_template , return_tensors="pt").to(device)
# print(prompt)

model.eval()


# with model.disable_adapter():
output = model.generate(
    **inputs,
    do_sample=True,
    max_new_tokens=256,
    repetition_penalty=1.3,
    temperature=0.7,         # Optional: smooth randomness
    top_k=50,                # Optional: top-k sampling
    top_p=0.9                # Optional: nucleus sampling
)
processed_text = tokenizer.decode(output[0], skip_special_tokens=False)
print(processed_text)

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


<|begin_of_text|><|begin_of_text|> <|start_header_id|>user<|end_header_id|>

ତୁମେ କିଏ?<|eot_id|><|start_header_id|>assistant<|end_header_id|>

ନିର୍ଦ୍ଦିଷ୍ଟ ଭାବେ କୃଷି (କୃଷି, ଅନ୍ୟାନ୍ୟ ମହିଳା)।।।।।।।।।।।।।।।।।।।।।।।।।।।।କୁ ଆପଣାଇବାର କ୍ଷମତା ଏବଂ ବିଶାଳ ଖର୍ଚ୍ଚରେ ପରିବର୍ତ୍ତନ ଆଣିବା ପାଇଁ ଗୁରୁତ୍ୱପୂର୍ଣ୍ଣ ସମାନତା ଯୋଗୁଁ ଜଣେ ନିର୍ଭରଶୀଳ ଥିଲା।।।।।।।।।।। ମୁକ୍ତ ରୟାଲ କୋର୍ଟଙ୍କ ଦ୍ୱାରା ଉପଯୁକ୍ତ ହୋଇଥାଏ, ଏବଂ ଏହାକୁ ମହିଳାମାନଙ୍କ ପ୍ରତିରକ୍ଷା ଦଳର ପର୍ଯ୍ୟବେକ୍ଷଣ କରାଯାଇପାରିବ ନାହିଁ।।।।।।।।।।।।।।।।।।।।।।।।। ଇଣ୍ଟରନେଟ୍ ବିରୋଧୀ ଗୋଷ୍ଠୀଙ୍କୁ କେତେକ ଗୁରୁତ୍ୱ ବିରୁଦ୍ଧ ବୋଲି ଦାଖଲ କରୁଛି।।।।।।।।।।।।।।।।।।।।। ନିର୍ଗମନକାରୀଙ୍କୁ 
