## Install and Load Required Packages

In [3]:
# %%capture
# Optional one-time setup. To use it, uncomment the lines of this cell; `%%capture`
# is a cell magic and has to remain the very first line of the cell.
# !pip install unsloth
# Also get the latest nightly Unsloth!
# !pip uninstall unsloth -y && pip install --upgrade --no-cache-dir --no-deps git+https://github.com/unslothai/unsloth.git

In [4]:
import os
import torch
import mlflow
from trl import SFTTrainer
from unsloth import FastLanguageModel
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported
from mlflow.models import infer_signature

# Point MLflow at the tracking server. The MLFLOW_TRACKING_URI environment variable
# takes precedence so the notebook can run against another server without editing
# code; when it is unset or empty, the local default server is used.
mlflow.set_tracking_uri(os.environ.get("MLFLOW_TRACKING_URI") or "http://127.0.0.1:5000")

## Load Model and Tokenizer

In [5]:
# Base checkpoint and loading options; they are collected into `load_model_info`
# and expanded as keyword arguments when the model is loaded.
model_name = "unsloth/Meta-Llama-3.1-8B-Instruct"
max_seq_length = 2048  # Free to choose; RoPE scaling is handled internally by Unsloth.
dtype = None           # None = auto-detect. Float16 for Tesla T4/V100, Bfloat16 for Ampere+.
load_in_4bit = True    # 4-bit quantization lowers memory usage; may be set to False.

load_model_info = dict(
    model_name=model_name,
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
)
load_model_info

{'model_name': 'unsloth/Meta-Llama-3.1-8B-Instruct',
 'max_seq_length': 2048,
 'dtype': None,
 'load_in_4bit': True}

In [6]:
# Load the base model and its tokenizer using the options gathered in `load_model_info`.
model, tokenizer = FastLanguageModel.from_pretrained(**load_model_info)
EOS_TOKEN = tokenizer.eos_token   # appended to every training example by formatting_prompts_func

==((====))==  Unsloth 2024.12.12: Fast Llama patching. Transformers: 4.47.1.
   \\   /|    GPU: NVIDIA GeForce RTX 3090. Max memory: 23.688 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu121. CUDA: 8.6. CUDA Toolkit: 12.1. Triton: 3.1.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.29.post1. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


## Configure Model

In [7]:
# LoRA adapter settings. The dict is expanded as keyword arguments into
# FastLanguageModel.get_peft_model and is also what gets logged to MLflow as params.
model_configurations = dict(
    r=16,
    target_modules=[
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj",
    ],
    lora_alpha=16,
    lora_dropout=0,
    bias="none",
    use_gradient_checkpointing="unsloth",
    random_state=3407,
    use_rslora=False,
    loftq_config=None,
)
model_configurations

{'r': 16,
 'target_modules': ['q_proj',
  'k_proj',
  'v_proj',
  'o_proj',
  'gate_proj',
  'up_proj',
  'down_proj'],
 'lora_alpha': 16,
 'lora_dropout': 0,
 'bias': 'none',
 'use_gradient_checkpointing': 'unsloth',
 'random_state': 3407,
 'use_rslora': False,
 'loftq_config': None}

In [8]:
# Attach the LoRA adapters described by `model_configurations` to the loaded model.
# NOTE(review): `model` is rebound to the result, so re-running this cell passes the
# already-adapted model back in; re-run the model loading cell first.
model = FastLanguageModel.get_peft_model(model, ** model_configurations)

Unsloth 2024.12.12 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


## Prompt Template

In [9]:
# Alpaca-style prompt shared by training and inference.
# The first `{}` is filled with the Banglish input and the second `{}` with the
# Bangla target; at inference the second slot is filled with an empty string so the
# model continues after "### Response:".
alpaca_prompt = """You are a powerful Banglish-to-Bangla model. Given the banglish text, your job is to translate the banglish into Bangla.

### Instruction:
Convert the Input (Banglish) to Bangla

### Input:
{}

### Response:
{}"""

## Load and Format Data

In [10]:
def formatting_prompts_func(examples):
    """Build the training text for a batch of examples.

    Args:
        examples: Batched mapping with parallel lists under "rm" (placed in the
            prompt's Input slot) and "bn" (placed in the Response slot).

    Returns:
        A dict with a single "text" key holding one filled-in prompt per pair,
        each terminated with EOS_TOKEN.
    """
    pairs = zip(examples["rm"], examples["bn"])
    return {
        "text": [alpaca_prompt.format(source, target) + EOS_TOKEN for source, target in pairs],
    }

In [11]:
from datasets import load_dataset

# Fetch the train split and add a "text" column holding the fully formatted prompt
# for each example (the formatter is applied batch-wise).
dataset = load_dataset(
    "SKNahin/bengali-transliteration-data",
    split="train",
).map(formatting_prompts_func, batched=True)

Map: 100%|██████████| 5006/5006 [00:00<00:00, 122572.60 examples/s]


## Setting Up the Trainer

In [12]:
# Keyword arguments that are expanded into TrainingArguments when the trainer is built.
# Per-device batch 4 x gradient accumulation 4 gives a total batch of 16 per optimizer step.
training_configurations = dict(
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    warmup_steps=5,
    max_steps=180,
    learning_rate=2e-4,
    logging_steps=1,
    optim="adamw_8bit",
    weight_decay=0.01,
    lr_scheduler_type="linear",
    seed=3407,
    output_dir="outputs",
    report_to="mlflow",
)

training_configurations

{'per_device_train_batch_size': 4,
 'gradient_accumulation_steps': 4,
 'warmup_steps': 5,
 'max_steps': 180,
 'learning_rate': 0.0002,
 'logging_steps': 1,
 'optim': 'adamw_8bit',
 'weight_decay': 0.01,
 'lr_scheduler_type': 'linear',
 'seed': 3407,
 'output_dir': 'outputs',
 'report_to': 'mlflow'}

In [13]:
# Pick the mixed-precision mode once: bf16 when supported, fp16 otherwise.
use_bf16 = is_bfloat16_supported()

training_args = TrainingArguments(
    **training_configurations,
    fp16=not use_bf16,
    bf16=use_bf16,
)

# Supervised fine-tuning over the pre-formatted "text" column of the dataset.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    dataset_num_proc=4,
    packing=False,  # Packing can make training 5x faster for short sequences.
    args=training_args,
)

Map (num_proc=4): 100%|██████████| 5006/5006 [00:01<00:00, 3524.35 examples/s]


In [14]:
# Record GPU memory before training; `start_gpu_memory` and `max_memory` are reused
# after training to report how much additional memory training reserved.
gpu_stats = torch.cuda.get_device_properties(0)
max_memory = round(gpu_stats.total_memory / 2**30, 3)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 2**30, 3)
print(
    f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.",
    f"{start_gpu_memory} GB of memory reserved.",
    sep="\n",
)

GPU = NVIDIA GeForce RTX 3090. Max memory = 23.688 GB.
6.004 GB of memory reserved.


## Train

In [15]:
# Run fine-tuning; the returned TrainOutput (step count, loss, runtime metrics) is
# kept in `trainer_stats` for the reporting and MLflow logging cells below.
trainer_stats = trainer.train()
trainer_stats

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 5,006 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 4 | Gradient Accumulation steps = 4
\        /    Total batch size = 16 | Total steps = 180
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss
1,2.2966
2,2.3098
3,2.3019
4,2.1746
5,2.0139
6,1.7579
7,1.4653
8,1.2227
9,1.0327
10,0.9442


🏃 View run outputs at: http://127.0.0.1:5000/#/experiments/0/runs/deeb7032534443cb9a41bb3c4997576f
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/0


TrainOutput(global_step=180, training_loss=0.6615929517481062, metrics={'train_runtime': 495.7765, 'train_samples_per_second': 5.809, 'train_steps_per_second': 0.363, 'total_flos': 1.9242339167698944e+16, 'train_loss': 0.6615929517481062, 'epoch': 0.5750798722044729})

In [16]:
# Report training time and peak GPU memory, both in absolute terms and relative to
# the memory that was already reserved before training started.
train_seconds = trainer_stats.metrics['train_runtime']
used_memory = round(torch.cuda.max_memory_reserved() / 2**30, 3)
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
used_percentage = round(used_memory / max_memory * 100, 3)
lora_percentage = round(used_memory_for_lora / max_memory * 100, 3)

report_lines = [
    f"{train_seconds} seconds used for training.",
    f"{round(train_seconds / 60, 2)} minutes used for training.",
    f"Peak reserved memory = {used_memory} GB.",
    f"Peak reserved memory for training = {used_memory_for_lora} GB.",
    f"Peak reserved memory % of max memory = {used_percentage} %.",
    f"Peak reserved memory for training % of max memory = {lora_percentage} %.",
]
print(*report_lines, sep="\n")

495.7765 seconds used for training.
8.26 minutes used for training.
Peak reserved memory = 7.305 GB.
Peak reserved memory for training = 1.301 GB.
Peak reserved memory % of max memory = 30.838 %.
Peak reserved memory for training % of max memory = 5.492 %.


## Inference

In [17]:
# Call Unsloth's `for_inference` on the model before generating.
FastLanguageModel.for_inference(model)

# Fill only the Input slot; the Response slot stays empty for the model to complete.
prompt = alpaca_prompt.format("Ami valo achi", "")
inputs = tokenizer([prompt], return_tensors="pt").to("cuda")

outputs = model.generate(**inputs, max_new_tokens=64, use_cache=True)
# Decoded text contains the prompt followed by the generated response.
res = tokenizer.batch_decode(outputs)
res

['<|begin_of_text|>You are a powerful Banglish-to-Bangla model. Given the banglish text, your job is to translate the banglish into Bangla.\n\n### Instruction:\nConvert the Input (Banglish) to Bangla\n\n### Input:\nAmi valo achi\n\n### Response:\nআমি ভালো আছি<|eot_id|>']

In [18]:
def extract_last_bangla_text(input_text: str) -> str:
    """Return the text that follows the last "Response:" marker.

    The single character directly after the marker (the newline in the prompt
    template) is skipped. Nothing else is stripped, so special tokens such as
    an end-of-turn marker remain in the result.

    Args:
        input_text: Decoded model output containing the prompt and the response.

    Returns:
        The text after the last marker, or an empty string when the marker is
        absent (previously an arbitrary slice of the input was returned).
    """
    _, marker, tail = input_text.rpartition("Response:")
    if not marker:
        # No response section at all: do not return a meaningless slice.
        return ""
    return tail[1:]

In [19]:
# Keep only the part of the first decoded sequence that follows "### Response:".
extract_last_bangla_text(res[0].strip())

'আমি ভালো আছি<|eot_id|>'

## Saving, loading finetuned models

In [21]:
# Local saving: model and tokenizer go into the same directory.
save_dir = "Llama-3.1-Banglish-to-Bangla-V1"
model.save_pretrained(save_dir)

# Push to HuggingFace (token is read from the HF_TOKEN environment variable)
# model.push_to_hub("mdhasnainali/banglish-to-bangla", token=os.environ.get('HF_TOKEN'))
# tokenizer.push_to_hub("mdhasnainali/banglish-to-bangla", token=os.environ.get('HF_TOKEN'))

# Kept as the last expression so the cell displays the saved tokenizer file paths.
tokenizer.save_pretrained(save_dir)

('Llama-3.1-Banglish-to-Bangla-V1/tokenizer_config.json',
 'Llama-3.1-Banglish-to-Bangla-V1/special_tokens_map.json',
 'Llama-3.1-Banglish-to-Bangla-V1/tokenizer.json')

## MLflow

In [22]:
# Infer the MLflow model signature from one real example: Banglish string in,
# Bangla string out, plus the generation parameters exposed at serving time.
sample = dataset[1]

# NOTE(review): the default for max_new_tokens is max_seq_length (2048), whereas the
# inference cell above generates with max_new_tokens=64 - confirm this is intended.
inference_params = {
    "max_new_tokens": max_seq_length,
    "repetition_penalty": 1.15,
    "return_full_text": False,
}

signature = infer_signature(
    model_input=sample["rm"],
    model_output=sample["bn"],
    params=inference_params,
)
signature

inputs: 
  [string (required)]
outputs: 
  [string (required)]
params: 
  ['max_new_tokens': long (default: 2048), 'repetition_penalty': double (default: 1.15), 'return_full_text': boolean (default: False)]

In [23]:
# Serving-time prompt template for the MLflow model, derived from the training
# template `alpaca_prompt` so the two can never drift apart.
# The Input slot is filled with the literal "{prompt}" placeholder that MLflow
# substitutes with the user's text, and the Response slot is left empty, so the
# template ends right after "### Response:\n".
alpaca_prompt_mlflow = alpaca_prompt.format("{prompt}", "")

In [25]:
# Log the fine-tuned model to MLflow in a dedicated run.
with mlflow.start_run(run_name="Llama-3.1-Banglish-to-Bangla-V1") as run:
    # Keep the run id so the model can be registered from this run afterwards.
    run_id = run.info.run_id
    # Only the LoRA settings are logged as params here; `training_configurations`
    # is not part of this call.
    mlflow.log_params(model_configurations)
    mlflow.log_metrics(trainer_stats.metrics)
    mlflow.transformers.log_model(
        transformers_model={"model": trainer.model, "tokenizer": tokenizer},
        prompt_template=alpaca_prompt_mlflow,
        signature=signature,
        artifact_path="model",  # location inside the run: runs:/<run_id>/model
    )

Device set to use cuda:0
2025/01/03 20:23:32 INFO mlflow.transformers: Overriding save_pretrained to False for PEFT models, following the Transformers behavior. The PEFT adaptor and config will be saved, but the base model weights will not and reference to the HuggingFace Hub repository will be logged instead.
2025/01/03 20:23:40 INFO mlflow.transformers: Skipping saving pretrained model weights to disk as the save_pretrained argumentis set to False. The reference to the HuggingFace Hub repository unsloth/meta-llama-3.1-8b-instruct-bnb-4bit will be logged instead.
2025/01/03 20:23:41 INFO mlflow.transformers: text-generation pipelines saved with prompt templates have the `return_full_text` pipeline kwarg set to False by default. To override this behavior, provide a `model_config` dict with `return_full_text` set to `True` when saving the model.
2025/01/03 20:23:41 INFO mlflow.transformers: A local checkpoint path or PEFT model is given as the `transformers_model`. To avoid loading the 

🏃 View run Llama-3.1-Banglish-to-Bangla-V1 at: http://127.0.0.1:5000/#/experiments/0/runs/d042ae9810d340f5baba6385f0394eeb
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/0


### Register the Model

In [27]:
# Register the model that the logging cell stored in MLflow.
# NOTE: this rebinds `model_name`, which earlier held the base checkpoint id.
model_name = 'Llama-3.1-Banglish-to-Bangla-V1'

# `run_id` comes from the logging cell above; a hard-coded id would point at a stale
# run after Restart & Run All. The model was logged with artifact_path="model", so
# the URI has to end in "/model" (the previous "/model_name" suffix referenced a
# path that was never logged).
model_uri = f'runs:/{run_id}/model'

with mlflow.start_run(run_id=run_id):
    mlflow.register_model(model_uri=model_uri, name=model_name)

Successfully registered model 'Llama-3.1-Banglish-to-Bangla-V1'.
2025/01/03 21:05:02 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: Llama-3.1-Banglish-to-Bangla-V1, version 1


🏃 View run Llama-3.1-Banglish-to-Bangla-V1 at: http://127.0.0.1:5000/#/experiments/0/runs/d042ae9810d340f5baba6385f0394eeb
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/0


Created version '1' of model 'Llama-3.1-Banglish-to-Bangla-V1'.
