In [1]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, get_scheduler
from bitsandbytes.optim import Adam8bit,PagedAdam32bit
from peft import LoraConfig, get_peft_model, PeftModel, PeftConfig
from peft import prepare_model_for_kbit_training
import torch
from IPython.display import  clear_output
import time
import gc
from torch.utils.data import Dataset, DataLoader
from tokenizers import ByteLevelBPETokenizer

# Run on the GPU when CUDA is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'



# Load a Multilingual Dataset

In [2]:
from datasets import load_dataset

# Download (or read from the local cache) the Odia domain-context corpus.
# An alternative corpus that was tried here: "OdiaGenAI/hardcode_odia_qa_105".
dataset = load_dataset(path='OdiaGenAI/odia_domain_context_train_v1')

In [3]:
# Peek at one record of the train split to inspect its fields.
dataset["train"][50]

{'instruction': 'ଓଡ଼ିଶାର ଉଦୟଗିରି ଏବଂ ଖଣ୍ଡଗିରି ଗୁମ୍ଫାର ଇତିହାସ କ’ଣ?',
 'input': '',
 'output': 'ଉଦୟଗିରି ଏବଂ ଖଣ୍ଡଗିରି ଗୁମ୍ଫା ଭାରତର ଓଡ଼ିଶା ରାଜ୍ୟରେ ଅବସ୍ଥିତ ପ୍ରାଚୀନ ଗୁମ୍ଫା ପରିସରର ଏକ ସମୂହ। ଏହି ଗୁମ୍ଫା ଗୁଡିକ ଖ୍ରୀଷ୍ଟପୂର୍ବ ଦ୍ୱିତୀୟ ଶତାବ୍ଦୀ ଏବଂ ପ୍ରଥମ ଶତାବ୍ଦୀ ମଧ୍ୟରେ ଜୈନ ସନ୍ନ୍ୟାସୀମାନଙ୍କ ଦ୍ୱାରା ବାଲୁକା ପଥରରେ ଖୋଦିତ ହୋଇଥିବା ଜଣାଯାଏ।\nଉଦୟଗିରି ଗୁମ୍ଫା ଗୁଡିକ ବୃହତ ଏବଂ ଅଧିକ ଜଟିଳ ହୋଇଥିବାବେଳେ ଖଣ୍ଡଗିରି ଗୁମ୍ଫା ଗୁଡିକ ଛୋଟ ଏବଂ ଡିଜାଇନ ଦୃଷ୍ଟିରୁ ସରଳ। ଏହି ପ୍ରାଚୀନ ଗୁମ୍ଫା ଗୁଡିକ ପ୍ରାଚୀନ ଭାରତୀୟ ସଭ୍ୟତାର ସ୍ଥାପତ୍ୟ କୌଶଳ ଏବଂ କଳାତ୍ମକ ଶ୍ରେଷ୍ଠତାର ପ୍ରମାଣ।\nବିଗତ ବର୍ଷମାନଙ୍କରେ ଏହି ଗୁମ୍ଫା ସାରା ବିଶ୍ୱରୁ ପର୍ଯ୍ୟଟକଙ୍କ ଆକର୍ଷଣ କେନ୍ଦ୍ର ପାଲଟିଛି ଏବଂ ଭାରତୀୟ ପ୍ରତ୍ନତାତ୍ୱିକ ସର୍ବେକ୍ଷଣ ଦ୍ୱାରା ସଂରକ୍ଷିତ ହୋଇଛି। ଭାରତର ସମୃଦ୍ଧ ସାଂସ୍କୃତିକ ଐତିହ୍ୟ ପ୍ରତି ଆଗ୍ରହୀ ଯେକୌଣସି ବ୍ୟକ୍ତି ଏହି ଗୁମ୍ଫାକୁ ଦେଖିବା ଉଚିତ।'}

In [4]:
def get_training_corpus(dataset):
    """Lazily yield one text line per example of the ``'train'`` split.

    Each yielded string is the example's ``'instruction'`` and ``'output'``
    fields, individually stripped of surrounding whitespace and joined by a
    single space.

    Args:
        dataset: Mapping with a ``'train'`` entry that iterates over
            dict-like examples holding ``'instruction'`` and ``'output'``
            strings.

    Yields:
        str: ``"<instruction> <output>"`` for every training example.
    """
    yield from (
        ' '.join((record['instruction'].strip(), record['output'].strip()))
        for record in dataset['train']
    )

In [5]:
# Dump the corpus to disk, one example per write, each terminated by a newline.
with open("odia_corpus.txt", "w", encoding="utf-8") as corpus_file:
    corpus_file.writelines(
        text + "\n" for text in get_training_corpus(dataset)
    )

In [6]:
import sentencepiece as spm

# Settings consumed by the SentencePiece training cell.
data_file = 'odia_corpus.txt'        # plain-text corpus written above
tokenizer_name = 'odia_tokenizer'    # used as the SentencePiece model prefix
vocab_size = 3000                    # number of pieces to learn
num_threads = 8                      # trainer worker threads



In [9]:
# Train a BPE SentencePiece model on the Odia corpus.
#
# The later cells load the model from "odia_tokenizer/odia_tokenizer.model",
# so the output must go into a directory named after the tokenizer. A bare
# `model_prefix=tokenizer_name` would write it to the working directory.
import os

os.makedirs(tokenizer_name, exist_ok=True)

spm.SentencePieceTrainer.train(
    input=data_file,
    # Produces <tokenizer_name>/<tokenizer_name>.model and .vocab
    model_prefix=os.path.join(tokenizer_name, tokenizer_name),
    vocab_size=vocab_size,
    num_threads=num_threads,
    model_type="bpe",
    max_sentence_length=1073741824,  # 2**30, so that long lines are not skipped
    shuffle_input_sentence="true",
    character_coverage=1.0,
    hard_vocab_limit="false",
    unk_id=0,
    bos_id=1,
    eos_id=2,
    pad_id=-1,  # -1 disables the padding piece
    unk_piece='<unk>',
    # BOS/EOS pieces are spelled like the Llama 3 special tokens.
    bos_piece='<|begin_of_text|>',
    eos_piece='<|eot_id|>',
    pad_piece='<|eot_id|>',
    unk_surface='⁇',
)

sentencepiece_trainer.cc(78) LOG(INFO) Starts training with : 
trainer_spec {
  input: odia_corpus.txt
  input_format: 
  model_prefix: odia_tokenizer
  model_type: BPE
  vocab_size: 3000
  self_test_sample_size: 0
  character_coverage: 1
  input_sentence_size: 0
  shuffle_input_sentence: 1
  seed_sentencepiece_size: 1000000
  shrinking_factor: 0.75
  max_sentence_length: 1073741824
  num_threads: 8
  num_sub_iterations: 2
  max_sentencepiece_length: 16
  split_by_unicode_script: 1
  split_by_number: 1
  split_by_whitespace: 1
  split_digits: 0
  pretokenization_delimiter: 
  treat_whitespace_as_suffix: 0
  allow_whitespace_only_pieces: 0
  required_chars: 
  byte_fallback: 0
  vocabulary_output_piece_score: 1
  train_extremely_large_corpus: 0
  seed_sentencepieces_file: 
  hard_vocab_limit: 0
  use_all_vocab: 0
  unk_id: 0
  bos_id: 1
  eos_id: 2
  pad_id: -1
  unk_piece: <unk>
  bos_piece: <|begin_of_text|>
  eos_piece: <|eot_id|>
  pad_piece: <|eot_id|>
  unk_surface: ⁇
  enable_dif

# Test the tokenizer

In [11]:
# Load the trained SentencePiece model for a quick round-trip check.
odia_model_path = 'odia_tokenizer/odia_tokenizer.model'
sp = spm.SentencePieceProcessor(model_file=odia_model_path)

In [12]:
odia_text='ଓଡ଼ିଶାର ଉଦୟଗିରି ଏବଂ ଖଣ୍ଡଗିରି ଗୁମ୍ଫାର ଇତିହାସ କ’ଣ?'

# Encode once and reuse the ids. Printing with two separate statements avoids
# building a tuple of print() results, which the notebook would echo as
# `(None, None)`.
odia_ids = sp.Encode(odia_text)
print(odia_ids)
print(len(odia_ids))

[262, 413, 2833, 1854, 25, 2273, 1854, 2194, 2871, 6, 872, 3, 2896, 2837, 2858]
15


(None, None)

In [14]:
# Decode the first six ids of the encoding above back into text.
leading_ids = [262, 413, 2833, 1854, 25, 2273]
pp = sp.Decode(leading_ids)
pp

'ଓଡ଼ିଶାର ଉଦୟଗିରି ଏବଂ ଖଣ୍ଡ'

# Extend the LLAMA Tokenizer

In [15]:
import os
from transformers import AutoTokenizer
from sentencepiece import sentencepiece_model_pb2 as sp_pb2_model

# Load the original Llama tokenizer and the trained SentencePiece model
# (as a raw ModelProto) so their vocabularies can be compared and merged.
llama_model_id = "meta-llama/Llama-3.2-3B-Instruct"
original_tokenizer = AutoTokenizer.from_pretrained(llama_model_id)

new_tokenizer_spm = sp_pb2_model.ModelProto()
# Read through a context manager so the file handle is closed promptly
# instead of being left to the garbage collector.
with open("odia_tokenizer/odia_tokenizer.model", "rb") as model_file:
    new_tokenizer_spm.ParseFromString(model_file.read())

# Token strings already known to Llama; used for the membership test when merging.
original_tokenizer_tokenset = set(original_tokenizer.get_vocab().keys())
print(f"Number of tokens before merge: {len(original_tokenizer_tokenset)}")
print(f"Number of tokens in new tokenizer: {len(new_tokenizer_spm.pieces)}")

Number of tokens before merge: 128256
Number of tokens in new tokenizer: 3000


In [16]:
# Keep only the SentencePiece pieces whose string is not already a key in the
# Llama vocabulary, then register them as additional tokens.
# NOTE(review): pieces are compared and added verbatim, so SentencePiece's
# word-boundary marker and meta pieces such as '<unk>' are presumably carried
# over unchanged. Confirm that this is intended.
new_pieces = [
    entry.piece
    for entry in new_tokenizer_spm.pieces
    if entry.piece not in original_tokenizer_tokenset
]
original_tokenizer.add_tokens(new_pieces)

merged_vocab = set(original_tokenizer.get_vocab().keys())
print(f"Number of tokens after merge: {len(merged_vocab)}")

Number of tokens after merge: 131142


In [17]:

# Persist the extended tokenizer to a local directory so that it can be
# reloaded from disk later.
extended_tokenizer_save_path = 'llama_odia_tokenizer'

os.makedirs(extended_tokenizer_save_path, exist_ok=True)
original_tokenizer.save_pretrained(save_directory=extended_tokenizer_save_path)

print(f"Tokenizer saved to {extended_tokenizer_save_path}")

Tokenizer saved to llama_odia_tokenizer


In [18]:
from tokenizers import Tokenizer
# Sanity check: every id of the original Llama tokenizer must map to the same
# token string in the extended tokenizer that was just saved.
tok1 = AutoTokenizer.from_pretrained(llama_model_id)
extended_json_path = os.path.join(extended_tokenizer_save_path, "tokenizer.json")
tok2 = Tokenizer.from_file(extended_json_path)

original_vocab_size = len(tok1)
for i in range(original_vocab_size):
    expected_token = tok1.convert_ids_to_tokens(i)
    actual_token = tok2.id_to_token(i)
    assert expected_token == actual_token, f"Token mismatch at index {i}."

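# Restart the kernel here. The cells below re-import their dependencies and reload the extended tokenizer from `llama_odia_tokenizer`.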

# Test using the new LLAMA Tokenizer

In [4]:
from tokenizers import Tokenizer
from transformers import AutoTokenizer
import os

# Reload the extended tokenizer from disk and prepare a sample sentence that
# ends with the end-of-turn marker.
extended_tokenizer_save_path = 'llama_odia_tokenizer'
tok = AutoTokenizer.from_pretrained(
    extended_tokenizer_save_path,
)
odia_text = 'ଓଡ଼ିଶାର ଉଦୟଗିରି ଏବଂ ଖଣ୍ଡଗିରି ଗୁମ୍ଫାର ଇତିହାସ କ’ଣ?<|eot_id|>'

In [5]:
# Round-trip the sample: show the ids, how many there are, and the decoded text.
encoded_sample = tok(odia_text)
tok_ids = encoded_sample["input_ids"]

print(tok_ids)
print(len(tok_ids))

decoded_sample = tok.decode(tok_ids)
print(decoded_sample)

[128000, 131101, 128709, 131077, 128260, 220, 131092, 131074, 131069, 130100, 220, 131072, 131061, 131082, 220, 131100, 128385, 130100, 220, 128378, 128296, 129791, 131058, 220, 131080, 129099, 220, 131060, 131113, 131073, 30, 128009]
32
<|begin_of_text|>ଓଡ଼ିଶାର ଉଦୟଗିରି ଏବଂ ଖଣ୍ଡଗିରି ଗୁମ୍ଫାର ଇତିହାସ କ’ଣ?<|eot_id|>
