In [1]:
import os

os.environ['CUDA_VISIBLE_DEVICES'] = '0'

In [2]:
import torch
from accelerate import init_empty_weights, load_checkpoint_and_dispatch
from transformers import AutoTokenizer, AutoModelForCausalLM, TextStreamer
import safetensors

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
tokenizer = AutoTokenizer.from_pretrained('Qwen/Qwen3-14B')
streamer = TextStreamer(tokenizer)

In [4]:
model = AutoModelForCausalLM.from_pretrained(
    'Qwen/Qwen3-14B', torch_dtype = 'auto',
    device_map="auto"
)

[2025-06-01 16:49:52,583] [INFO] [real_accelerator.py:239:get_accelerator] Setting ds_accelerator to cuda (auto detect)


/usr/bin/ld: cannot find -laio: No such file or directory
collect2: error: ld returned 1 exit status
/usr/bin/ld: cannot find -lcufile: No such file or directory
collect2: error: ld returned 1 exit status
Loading checkpoint shards: 100%|██████████████████████████████████| 8/8 [00:05<00:00,  1.44it/s]


In [5]:
state_dict = model.state_dict()

In [6]:
!ls lora-embedding-256-qwen3-14b-malaysian-8k

checkpoint-1000  checkpoint-1100  checkpoint-900


In [7]:
from safetensors import safe_open

folder = 'lora-embedding-256-qwen3-14b-malaysian-8k/checkpoint-1100'

f = safe_open(f"{folder}/adapter_model.safetensors", framework="pt", device='cpu')
keys = f.keys()
keys = sorted(list(set([k.split('.lora')[0] for k in keys if '.lora' in k])))

In [8]:
from tqdm import tqdm

for k in tqdm(keys):
    k_ori = k.replace('base_model.model.', '') + '.weight'
    if 'embed_tokens' in k:
        post_A = '.lora_embedding_A'
        post_B = '.lora_embedding_B'
    else:
        post_A = '.lora_A.weight'
        post_B = '.lora_B.weight'
    A = k + post_A
    B = k + post_B
    
    W = state_dict[k_ori]
    if 'embed_tokens' not in k:
        W = W.t()
        
    A = f.get_tensor(A).to(W.device)
    B = f.get_tensor(B).to(W.device)
    with torch.no_grad():
        W.addmm_(A.t(), B.t(), alpha = 2)

100%|████████████████████████████████████████████████████████| 282/282 [00:00<00:00, 548.56it/s]


In [9]:
d = [
    {'role': 'user', 'content': '"butoh hang" tu apa dalam bahsa kedah'}
]

inputs = tokenizer.apply_chat_template(d, return_tensors = 'pt', add_generation_prompt=True, enable_thinking = False).to('cuda')
generate_kwargs = dict(
    input_ids=inputs,
    max_new_tokens=128,
    top_p=0.95,
    top_k=50,
    temperature=0.9,
    do_sample=True,
    repetition_penalty=1.05,
    streamer=streamer
)
generation_output = model.generate(**generate_kwargs)

The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


<|im_start|>user
"butoh hang" tu apa dalam bahsa kedah<|im_end|>
<|im_start|>assistant
<think>

</think>

Miskin atau takde duit. "Butoh hang" adalah ungkapan dialek Kedah yang digunakan untuk menggambarkan keadaan seseorang yang kekurangan wang atau dalam keadaan miskin. "Butoh" bermaksud tidak ada atau kurang, manakala "hang" adalah kata ganti nama diri kedua yang setara dengan "anda" atau "engkau" dalam Bahasa Melayu standard. Ungkapan ini biasanya digunakan dalam percakapan harian untuk menyatakan keadaan kewangan s


In [10]:
d = [
    {'role': 'system', 'content': 'You are going to enter reasoning mode, first you try to think step-by-step in malay after that give the final answer.'},
    {'role': 'user', 'content': '"Awok ni kene bebele dengan dokte, bomo tok jalan." terjemah ke malay'},
]

inputs = tokenizer.apply_chat_template(d, return_tensors = 'pt').to('cuda')
generate_kwargs = dict(
    input_ids=inputs,
    max_new_tokens=4096,
    top_p=0.95,
    top_k=50,
    temperature=0.9,
    do_sample=True,
    repetition_penalty=1.05,
    streamer=streamer
)
generation_output = model.generate(**generate_kwargs)

<|im_start|>system
You are going to enter reasoning mode, first you try to think step-by-step in malay after that give the final answer.<|im_end|>
<|im_start|>user
"Awok ni kene bebele dengan dokte, bomo tok jalan." terjemah ke malay<|im_end|>
<|im_start|>assistant
<think>

</think>

"Dia perlu mendapatkan nasihat daripada doktor, seorang pekerja pejabat tidak boleh melakukan itu."<|im_end|>


In [11]:
d = [
    {'role': 'system', 'content': 'You are going to enter reasoning mode, first you try to think step-by-step in malay after that give the final answer.'},
    {'role': 'user', 'content': 'camne nk code softmax dalam cuda'},
]

inputs = tokenizer.apply_chat_template(d, return_tensors = 'pt').to('cuda')
generate_kwargs = dict(
    input_ids=inputs,
    max_new_tokens=512,
    top_p=0.95,
    top_k=50,
    temperature=0.9,
    do_sample=True,
    repetition_penalty=1.05,
    streamer=streamer
)
generation_output = model.generate(**generate_kwargs)

<|im_start|>system
You are going to enter reasoning mode, first you try to think step-by-step in malay after that give the final answer.<|im_end|>
<|im_start|>user
camne nk code softmax dalam cuda<|im_end|>
<|im_start|>assistant
<think>

</think>

Untuk melaksanakan fungsi softmax dalam CUDA, anda boleh mengikuti langkah-langkah berikut:

1. Tentukan kernel untuk melakukan operasi softmax pada GPU.

```c++
__global__ void softmax_kernel(float *d_input, float *d_output, int N) {
    int index = threadIdx.x + blockIdx.x * blockDim.x;
    float max_val = -INFINITY;

    // Cari nilai maksimum dalam baris
    if (index < N) {
        for (int i = 0; i < N; i++) {
            if (d_input[index * N + i] > max_val) {
                max_val = d_input[index * N + i];
            }
        }
    }

    // Terapkan fungsi softmax
    if (index < N) {
        for (int i = 0; i < N; i++) {
            d_output[index * N + i] = expf(d_input[index * N + i] - max_val);
            for (int j = 0; j <

In [12]:
tokenizer.push_to_hub('mesolitica/Malaysian-Qwen3-14B')

No files have been modified since last commit. Skipping to prevent empty commit.


CommitInfo(commit_url='https://huggingface.co/mesolitica/Malaysian-Qwen3-14B/commit/49ce6bdb69219d86793a923c13622a90dfa0c003', commit_message='Upload tokenizer', commit_description='', oid='49ce6bdb69219d86793a923c13622a90dfa0c003', pr_url=None, repo_url=RepoUrl('https://huggingface.co/mesolitica/Malaysian-Qwen3-14B', endpoint='https://huggingface.co', repo_type='model', repo_id='mesolitica/Malaysian-Qwen3-14B'), pr_revision=None, pr_num=None)

In [13]:
# model.push_to_hub('mesolitica/Malaysian-Qwen3-14B')

In [14]:
model.save_pretrained('./14b')

In [1]:
!huggingface-cli upload mesolitica/Malaysian-Qwen3-14B ./14b .

Start hashing 9 files.
Finished hashing 9 files.
Uploading files using Xet Storage..
Uploading...: 100%|████████████████████████| 29.5G/29.5G [07:29<00:00, 65.7MB/s]
https://huggingface.co/mesolitica/Malaysian-Qwen3-14B/tree/main/.
