In [1]:
import os

os.environ['CUDA_VISIBLE_DEVICES'] = '1'

In [2]:
import torch
from accelerate import init_empty_weights, load_checkpoint_and_dispatch
from transformers import AutoTokenizer, AutoModelForCausalLM, TextStreamer
import safetensors

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
tokenizer = AutoTokenizer.from_pretrained('mistralai/Mistral-Small-3.1-24B-Instruct-2503')
x = {
  "chat_template": "{%- set today = strftime_now(\"%Y-%m-%d\") %}\n{%- set default_system_message = \"You are Mistral Small 3, a Large Language Model (LLM) created by Mistral AI, a French startup headquartered in Paris.\\nYour knowledge base was last updated on 2023-10-01. The current date is \" + today + \".\\n\\nWhen you're not sure about some information, you say that you don't have the information and don't make up anything.\\nIf the user's question is not clear, ambiguous, or does not provide enough context for you to accurately answer the question, you do not try to answer it right away and you rather ask the user to clarify their request (e.g. \\\"What are some good restaurants around me?\\\" => \\\"Where are you?\\\" or \\\"When is the next flight to Tokyo\\\" => \\\"Where do you travel from?\\\")\" %}\n\n{{- bos_token }}\n\n{%- if messages[0]['role'] == 'system' %}\n    {%- if messages[0]['content'] is string %}\n        {%- set system_message = messages[0]['content'] %}\n    {%- else %}\n        {%- set system_message = messages[0]['content'][0]['text'] %}\n    {%- endif %}\n    {%- set loop_messages = messages[1:] %}\n{%- else %}\n    {%- set system_message = default_system_message %}\n    {%- set loop_messages = messages %}\n{%- endif %}\n{{- '[SYSTEM_PROMPT]' + system_message + '[/SYSTEM_PROMPT]' }}\n\n{%- for message in loop_messages %}\n    {%- if message['role'] == 'user' %}\n        {%- if message['content'] is string %}\n            {{- '[INST]' + message['content'] + '[/INST]' }}\n        {%- else %}\n            {{- '[INST]' }}\n            {%- for block in message['content'] %}\n                {%- if block['type'] == 'text' %}\n                    {{- block['text'] }}\n                {%- elif block['type'] in ['image', 'image_url'] %}\n                    {{- '[IMG]' }}\n                {%- else %}\n                    {{- raise_exception('Only text and image blocks are supported in message content!') }}\n                {%- endif %}\n            {%- endfor %}\n            {{- '[/INST]' }}\n        {%- endif %}\n    {%- elif message['role'] == 'system' %}\n        {%- if message['content'] is string %}\n            {{- '[SYSTEM_PROMPT]' + message['content'] + '[/SYSTEM_PROMPT]' }}\n        {%- else %}\n            {{- '[SYSTEM_PROMPT]' + message['content'][0]['text'] + '[/SYSTEM_PROMPT]' }}\n        {%- endif %}\n    {%- elif message['role'] == 'assistant' %}\n        {%- if message['content'] is string %}\n            {{- message['content'] + eos_token }}\n        {%- else %}\n            {{- message['content'][0]['text'] + eos_token }}\n        {%- endif %}\n    {%- else %}\n        {{- raise_exception('Only user, system and assistant roles are supported!') }}\n    {%- endif %}\n{%- endfor %}"
}
tokenizer.chat_template = x['chat_template']
streamer = TextStreamer(tokenizer)

In [5]:
model = AutoModelForCausalLM.from_pretrained(
    './local', torch_dtype = 'auto',
    device_map="auto"
)

[2025-06-01 16:47:16,704] [INFO] [real_accelerator.py:239:get_accelerator] Setting ds_accelerator to cuda (auto detect)


/usr/bin/ld: cannot find -laio: No such file or directory
collect2: error: ld returned 1 exit status
/usr/bin/ld: cannot find -lcufile: No such file or directory
collect2: error: ld returned 1 exit status
Loading checkpoint shards: 100%|████████████████████████████████| 10/10 [00:09<00:00,  1.11it/s]


In [6]:
state_dict = model.state_dict()

In [7]:
!ls lora-embedding-256-Mistral-Small-3.1-24B-Instruct-2503-malaysian-8k

checkpoint-2900  checkpoint-3000  checkpoint-3100


In [8]:
from safetensors import safe_open

folder = 'lora-embedding-256-Mistral-Small-3.1-24B-Instruct-2503-malaysian-8k/checkpoint-3100'

f = safe_open(f"{folder}/adapter_model.safetensors", framework="pt", device='cpu')
keys = f.keys()
keys = sorted(list(set([k.split('.lora')[0] for k in keys if '.lora' in k])))

In [9]:
from tqdm import tqdm

for k in tqdm(keys):
    k_ori = k.replace('base_model.model.', '') + '.weight'
    if 'embed_tokens' in k:
        post_A = '.lora_embedding_A'
        post_B = '.lora_embedding_B'
    else:
        post_A = '.lora_A.weight'
        post_B = '.lora_B.weight'
    A = k + post_A
    B = k + post_B
    
    W = state_dict[k_ori]
    if 'embed_tokens' not in k:
        W = W.t()
        
    A = f.get_tensor(A).to(W.device)
    B = f.get_tensor(B).to(W.device)
    with torch.no_grad():
        W.addmm_(A.t(), B.t(), alpha = 2)

100%|████████████████████████████████████████████████████████| 282/282 [00:00<00:00, 379.12it/s]


In [11]:
d = [
    {'role': 'user', 'content': '"butoh hang" tu apa dalam bahsa kedah'}
]

inputs = tokenizer.apply_chat_template(d, return_tensors = 'pt', add_generation_prompt=True).to('cuda')
generate_kwargs = dict(
    input_ids=inputs,
    max_new_tokens=128,
    top_p=0.95,
    top_k=50,
    temperature=0.9,
    do_sample=True,
    repetition_penalty=1.05,
    streamer=streamer
)
generation_output = model.generate(**generate_kwargs)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


<s>[SYSTEM_PROMPT]You are Mistral Small 3, a Large Language Model (LLM) created by Mistral AI, a French startup headquartered in Paris.
Your knowledge base was last updated on 2023-10-01. The current date is 2025-06-01.

When you're not sure about some information, you say that you don't have the information and don't make up anything.
If the user's question is not clear, ambiguous, or does not provide enough context for you to accurately answer the question, you do not try to answer it right away and you rather ask the user to clarify their request (e.g. "What are some good restaurants around me?" => "Where are you?" or "When is the next flight to Tokyo" => "Where do you travel from?")[/SYSTEM_PROMPT][INST]"butoh hang" tu apa dalam bahsa kedah[/INST]Ungkapan yang digunakan untuk menggambarkan seseorang atau sesuatu yang kelihatan sangat buruk, tidak menyenangkan, atau menjijikkan.. "Butoh" bermaksud hancur atau rosak teruk dalam bahasa Kedah, manakala "hang" adalah kata ganti nama dir

In [12]:
d = [
    {'role': 'system', 'content': 'You are going to enter reasoning mode, first you try to think step-by-step in malay after that give the final answer.'},
    {'role': 'user', 'content': '"Awok ni kene bebele dengan dokte, bomo tok jalan." terjemah ke malay'},
]

inputs = tokenizer.apply_chat_template(d, return_tensors = 'pt').to('cuda')
generate_kwargs = dict(
    input_ids=inputs,
    max_new_tokens=4096,
    top_p=0.95,
    top_k=50,
    temperature=0.9,
    do_sample=True,
    repetition_penalty=1.05,
    streamer=streamer
)
generation_output = model.generate(**generate_kwargs)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


<s>[SYSTEM_PROMPT]You are going to enter reasoning mode, first you try to think step-by-step in malay after that give the final answer.[/SYSTEM_PROMPT][INST]"Awok ni kene bebele dengan dokte, bomo tok jalan." terjemah ke malay[/INST]Baguslah. Kita mulakan proses penterjemahan ayat dialek Pahang ini secara **panjang lebar, terperinci dan bernaratif**, seperti yang anda minta. Ini bukan sekadar menggantikan perkataan demi perkataan kepada Bahasa Melayu Standard, tetapi juga memerlukan pemahaman konteks budaya, nuansa emosi, dan makna tersirat dalam ayat tersebut.

## **Langkah 1: Memahami Fungsi dan Nada Ayat Keseluruhan**

Ayat ini kelihatan seperti satu nasihat atau teguran ringan, mungkin dalam situasi di mana seseorang berhadapan dengan cabaran fizikal atau halangan, dan disarankan untuk menggunakan tenaga dan strategi yang betul supaya dapat mengatasi masalah tersebut.

Nada pertuturan adalah **bersahabat dan memberi sokongan** – penutur memberikan dorongan positif dan keyakinan bah

In [13]:
tokenizer.push_to_hub('mesolitica/Malaysian-Mistral-Small-3.1-24B-Instruct-2503')

CommitInfo(commit_url='https://huggingface.co/mesolitica/Malaysian-Mistral-Small-3.1-24B-Instruct-2503/commit/04e2a5cc3e291ba13baf94f21e337847d46f3613', commit_message='Upload tokenizer', commit_description='', oid='04e2a5cc3e291ba13baf94f21e337847d46f3613', pr_url=None, repo_url=RepoUrl('https://huggingface.co/mesolitica/Malaysian-Mistral-Small-3.1-24B-Instruct-2503', endpoint='https://huggingface.co', repo_type='model', repo_id='mesolitica/Malaysian-Mistral-Small-3.1-24B-Instruct-2503'), pr_revision=None, pr_num=None)

In [14]:
model.save_pretrained('./24b')

In [1]:
!huggingface-cli upload mesolitica/Malaysian-Mistral-Small-3.1-24B-Instruct-2503 ./24b .

Start hashing 13 files.
Finished hashing 13 files.
Uploading files using Xet Storage..
Uploading...: 100%|████████████████████████| 47.1G/47.1G [10:39<00:00, 73.7MB/s]
https://huggingface.co/mesolitica/Malaysian-Mistral-Small-3.1-24B-Instruct-2503/tree/main/.
