In [3]:
pip install datasets

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Note: you may need to restart the kernel to use updated packages.


In [1]:
#Trail 1; Multilingual Tokenizer; Suboptimal tokenization. 


from tokenizers import Tokenizer, models, trainers, pre_tokenizers, decoders
from transformers import PreTrainedTokenizerFast
import os

# 1. Gather multilingual corpus
corpus_files = [
    '/Users/abu/Downloads/Trail2/Yakut_Train/SH_merged_texts.txt',  # Yakut
    '/Users/abu/Downloads/Trail2/Russian_Train/RU_merged_texts.txt',  # Russian
    '/Users/abu/Downloads/Trail2/English_Train/EN_merged_texts.txt'   # English
]

# Create tokenizer with BPE model
tokenizer = Tokenizer(models.BPE(
    unk_token="<unk>",
    fuse_unk=True, 
))

# pre-tokenization
tokenizer.pre_tokenizer = pre_tokenizers.Sequence([
    pre_tokenizers.WhitespaceSplit(), 
    pre_tokenizers.Punctuation(),
])


# 4. Special tokens
special_tokens = [
    "<|begin_of_text|>", 
    "<|end_of_text|>", 
    "<unk>", 
    "<pad>",
    "<en>",  # English language token
    "<ru>",  # Russian language token
    "<sah>"  # Yakut language token
]

# Enhanced trainer configuration
trainer = trainers.BpeTrainer(
    vocab_size=32000,  # Reduced for better efficiency
    special_tokens=special_tokens,
    min_frequency=2,
    show_progress=True,
    limit_alphabet=1000,
    initial_alphabet=pre_tokenizers.ByteLevel.alphabet(),
    continuing_subword_prefix="",  
)

# 6. Train on multilingual data
print("Training improved multilingual tokenizer...")
tokenizer.train(files=corpus_files, trainer=trainer)
print("Training complete!")

# 7. Configure processing
tokenizer.decoder = decoders.Sequence([
    decoders.Replace("▁", ""), 
    decoders.ByteLevel()
])

# 8. Save tokenizer
tokenizer.save("yakut_tokenizer.json")

# 9. Load as HF tokenizer
hf_tokenizer = PreTrainedTokenizerFast(
    tokenizer_file="yakut_tokenizer.json",
    bos_token="<|begin_of_text|>",
    eos_token="<|end_of_text|>",
    unk_token="<unk>",
    pad_token="<pad>",
    additional_special_tokens=["<en>", "<ru>", "<sah>"]
)

# 10. Test all languages
def print_tokenization(text):
    tokens = hf_tokenizer.tokenize(text)
    decoded = hf_tokenizer.convert_tokens_to_string(tokens)
    print(f"\nInput: {text}")
    print(f"Tokens: {tokens}")
    print(f"Decoded: {decoded}")

print("\nTesting tokenization:")
print_tokenization("<|begin_of_text|><sah>Мин аатым Кэскил.")  
print_tokenization("<|begin_of_text|><ru>Привет, как дела?")   
print_tokenization("<|begin_of_text|><en>To fix this issue")    

# 11. Save tokenizer
save_path = "yakut_tokenizer"
os.makedirs(save_path, exist_ok=True)
hf_tokenizer.save_pretrained(save_path)
print(f"\nTokenizer saved to: {save_path}")

Training improved multilingual tokenizer...



Training complete!

Testing tokenization:

Input: <|begin_of_text|><sah>Мин аатым Кэскил.
Tokens: ['<|begin_of_text|>', '<sah>', 'Мин', 'аа', 'тым', 'Кэскил', '.']
Decoded: <|begin_of_text|><sah>МинаатымКэскил.

Input: <|begin_of_text|><ru>Привет, как дела?
Tokens: ['<|begin_of_text|>', '<ru>', 'При', 'вет', ',', 'как', 'дела', '?']
Decoded: <|begin_of_text|><ru>Привет,какдела?

Input: <|begin_of_text|><en>To fix this issue
Tokens: ['<|begin_of_text|>', '<en>', 'To', 'fix', 'this', 'issue']
Decoded: <|begin_of_text|><en>Tofixthisissue

Tokenizer saved to: yakut_tokenizer


In [5]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
import torch.nn as nn

# 1. Load your Yakut tokenizer
tokenizer_path = "/Users/abu/Downloads/Trail2/yakut_tokenizer"
yakut_tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)

# 2. Load Llama 3 model and tokenizer
model_name = "meta-llama/Llama-3.2-1B"
token = "" #hf_secret_token removed due to obvious reasons

# Load model in float16 but convert to float32 for operations
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    token=token,
    torch_dtype=torch.float16
).to(torch.float32)  # Convert to float32 for compatibility

# Load the ORIGINAL tokenizer for Llama
original_tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    token=token
)

# 3. Get vocabulary sizes
original_vocab_size = model.config.vocab_size
new_vocab_size = len(yakut_tokenizer)
print(f"Original vocab size: {original_vocab_size}")
print(f"New vocab size: {new_vocab_size}")

# 4. Create new embedding matrix with improved initialization
print("\nCreating new embeddings...")
with torch.no_grad():
    # Get original embeddings in float32
    original_embeddings = model.get_input_embeddings().weight.data.clone().to(torch.float32)
    
    # Create new embedding layer in float32
    new_embeddings = nn.Embedding(new_vocab_size, model.config.hidden_size, dtype=torch.float32)
    
    # Initialize with normal distribution (matches original training)
    nn.init.normal_(
        new_embeddings.weight,
        mean=0.0,
        std=model.config.initializer_range
    )
    
    # Get vocabulary mappings
    original_vocab = original_tokenizer.get_vocab()
    new_vocab = yakut_tokenizer.get_vocab()
    
    # Copy embeddings for overlapping tokens
    for token, new_idx in new_vocab.items():
        if token in original_vocab:
            old_idx = original_vocab[token]
            new_embeddings.weight[new_idx] = original_embeddings[old_idx].clone()
    
    # Handle Llama 3's special tokens
    # Find a suitable base embedding for new tokens
    base_embedding = original_embeddings.mean(dim=0)
    if "<|begin_of_text|>" in original_vocab:
        bot_idx = original_vocab["<|begin_of_text|>"]
        base_embedding = original_embeddings[bot_idx].clone()
    
    # Special initialization for language tokens
    lang_tokens = {"<en>", "<ru>", "<sah>"}
    for token in lang_tokens:
        if token in new_vocab:
            # Initialize with base embedding
            new_embeddings.weight[new_vocab[token]] = base_embedding.clone()

    # Replace model embeddings
    model.set_input_embeddings(new_embeddings)
    model.config.vocab_size = new_vocab_size
    
    # Handle output embeddings
    if model.config.tie_word_embeddings:
        # Update tied weights
        model.lm_head.weight = new_embeddings.weight
    else:
        # Create new output layer if not tied
        new_lm_head = nn.Linear(
            model.config.hidden_size, 
            new_vocab_size, 
            bias=False,
            dtype=torch.float32
        )
        new_lm_head.weight.data.copy_(new_embeddings.weight.data)
        model.lm_head = new_lm_head

# 5. Configure padding
if yakut_tokenizer.pad_token is None:
    # Use existing pad token if available, otherwise use EOS
    if "<pad>" in yakut_tokenizer.get_vocab():
        yakut_tokenizer.pad_token = "<pad>"
    else:
        yakut_tokenizer.pad_token = yakut_tokenizer.eos_token
    
model.config.pad_token_id = yakut_tokenizer.pad_token_id

# Convert model back to float16 for efficiency
model = model.to(torch.float16)

# 6. Save the custom model
save_path = "yakut-llama-model"
model.save_pretrained(save_path)
yakut_tokenizer.save_pretrained(save_path)
print(f"\nCustom model saved to: {save_path}")

# 7. Test generation with improved parameters
test_text = "<|begin_of_text|><sah>Мин аатым Кэскил."
inputs = yakut_tokenizer(test_text, return_tensors="pt")

# Move inputs to same device and dtype as model
device = model.device
dtype = model.dtype
input_ids = inputs["input_ids"].to(device)
attention_mask = inputs["attention_mask"].to(device)

print("\nTest inference:")
with torch.no_grad():
    # Better generation parameters
    generated = model.generate(
        input_ids=input_ids,
        attention_mask=attention_mask,
        max_new_tokens=20,
        do_sample=True,
        top_k=40,
        top_p=0.9,
        temperature=0.8,
        repetition_penalty=1.1,
        pad_token_id=yakut_tokenizer.pad_token_id
    )
    decoded = yakut_tokenizer.decode(generated[0], skip_special_tokens=False)
    print(f"Generated text: {decoded}")

Original vocab size: 128256
New vocab size: 32000

Creating new embeddings...

Custom model saved to: yakut-llama-model

Test inference:
Generated text: <|begin_of_text|><sah>МинаатымКэскил. 1st
, 2nd
, 3rd
, 4th
,


In [11]:
# 7. Test generation with improved parameters
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
import torch.nn as nn

tokenizer_path = "/Users/abu/Downloads/Trail2/yakut-llama-model"
yakut_tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)
model = AutoModelForCausalLM.from_pretrained(tokenizer_path)

test_text = "<|begin_of_text|><en> My name is "
inputs = yakut_tokenizer(test_text, return_tensors="pt")

# Move inputs to same device and dtype as model
device = model.device
dtype = model.dtype
input_ids = inputs["input_ids"].to(device)
attention_mask = inputs["attention_mask"].to(device)

print("\nTest inference:")
with torch.no_grad():
    # Better generation parameters
    generated = model.generate(
        input_ids=input_ids,
        attention_mask=attention_mask,
        max_new_tokens=10,
        do_sample=True,
        top_k=10,
        top_p=0.1,
        temperature=0.9,
        repetition_penalty=1.1,
        pad_token_id=yakut_tokenizer.pad_token_id
    )
    decoded = [yakut_tokenizer.decode([token_id], skip_special_tokens=False)  for token_id in generated[0]]
    decoded_tokens = " ".join(decoded)
    print(f"Generated text: {decoded_tokens}")
    print("-" * 80)


Test inference:
Generated text: <|begin_of_text|> <en> My name is j ess ica 
 I ve been here for 2
--------------------------------------------------------------------------------
