#Install and setup

##1. Install Unsloth & Load Model

In [1]:
# @title 1. Install Dependencies
%%capture
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
!pip install --no-deps "xformers<0.0.27" "trl<0.9.0" peft accelerate bitsandbytes psutil

In [2]:
# @title Import Unsloth
from unsloth import FastLanguageModel
print("‚úÖ Unsloth imported!")

ü¶• Unsloth: Will patch your computer to enable 2x faster free finetuning.
ü¶• Unsloth Zoo will now patch everything to make training faster!
‚úÖ Unsloth imported!


In [3]:
# @title Load Qwen2.5-3B Model
import torch

max_seq_length = 2048
dtype = None
load_in_4bit = True

model_name = "unsloth/Qwen2.5-3B-Instruct-bnb-4bit"

print(f"üîÑ Loading {model_name}...")
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=model_name,
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
)

# Add LoRA Adapters (The Fine-Tuning Layer)
# We target "all-linear" modules to help it learn the new language patterns better
model = FastLanguageModel.get_peft_model(
    model,
    r=16, # Rank: 16 is standard. 32 or 64 is better for new languages but uses more VRAM.
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj",
                    "gate_proj", "up_proj", "down_proj"],
    lora_alpha=16,
    lora_dropout=0, # Optimized to 0 for Unsloth
    bias="none",    # Optimized to "none" for Unsloth
    use_gradient_checkpointing="unsloth",
    random_state=3407,
    use_rslora = False,
    loftq_config = None,
)

print("‚úÖ Model Loaded & LoRA Adapters Attached!")

üîÑ Loading unsloth/Qwen2.5-3B-Instruct-bnb-4bit...
==((====))==  Unsloth 2025.12.9: Fast Qwen2 patching. Transformers: 4.57.3.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.9.0+cu126. CUDA: 7.5. CUDA Toolkit: 12.6. Triton: 3.5.0
\        /    Bfloat16 = FALSE. FA [Xformers = None. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/2.05G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/271 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

added_tokens.json:   0%|          | 0.00/605 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/614 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/11.4M [00:00<?, ?B/s]

Unsloth 2025.12.9 patched 36 layers with 36 QKV layers, 36 O layers and 36 MLP layers.


‚úÖ Model Loaded & LoRA Adapters Attached!


#Dataformatting and Training

##2. Format Data & Start Training

In [5]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [8]:
# @title 2. Format Data & Start Training - REAL FIX

# STEP 1: Inject psutil into builtins BEFORE importing SFTTrainer
import psutil
import builtins
builtins.psutil = psutil  # Make psutil available to ALL modules, including cached ones

# STEP 2: NOW import everything
from unsloth import is_bfloat16_supported
from trl import SFTTrainer
from transformers import TrainingArguments
from datasets import load_dataset

print("‚úÖ psutil injected into builtins")
print(f"‚úÖ psutil available: {psutil.__version__}\n")

# Rest of training code...
dataset = load_dataset("Redgerd/roman-urdu-alpaca-qa-mix", split="train")

alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Response:
{}"""

EOS_TOKEN = tokenizer.eos_token

def formatting_prompts_func(examples):
    instructions = examples["instruction"]
    inputs = examples["input"]
    outputs = examples["output"]
    texts = []
    for instruction, input, output in zip(instructions, inputs, outputs):
        text = alpaca_prompt.format(instruction, input, output) + EOS_TOKEN
        texts.append(text)
    return {"text": texts}

dataset = dataset.map(formatting_prompts_func, batched=True)

print("üöÄ Starting Trainer Setup...\n")

trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    dataset_num_proc=2,
    packing=False,
    args=TrainingArguments(
        per_device_train_batch_size=2,
        gradient_accumulation_steps=4,
        warmup_steps=5,
        max_steps=500,
        save_steps=100,
        output_dir="/tmp/RomanUrduModel",
        learning_rate=2e-4,
        fp16=not is_bfloat16_supported(),
        bf16=is_bfloat16_supported(),
        logging_steps=1,
        optim="adamw_8bit",
        weight_decay=0.01,
        lr_scheduler_type="linear",
        seed=3407,
    ),
)

print("üî• Training Started!\n")
trainer.train()

print("\n‚úÖ Training Complete!")


‚úÖ psutil injected into builtins
‚úÖ psutil available: 5.9.5



Map:   0%|          | 0/1489 [00:00<?, ? examples/s]

üöÄ Starting Trainer Setup...



Unsloth: Tokenizing ["text"] (num_proc=6):   0%|          | 0/1489 [00:00<?, ? examples/s]

The model is already on multiple devices. Skipping the move to device specified in `args`.


üî• Training Started!



==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 1,489 | Num Epochs = 3 | Total steps = 500
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 29,933,568 of 3,115,872,256 (0.96% trained)
  | |_| | '_ \/ _` / _` |  _/ -_)
wandb: (1) Create a W&B account
wandb: (2) Use an existing W&B account
wandb: (3) Don't visualize my results
wandb: Enter your choice:

 3


wandb: You chose "Don't visualize my results"


wandb: Detected [huggingface_hub.inference, openai] in use.
wandb: Use W&B Weave for improved LLM call tracing. Install Weave with `pip install weave` then add `import weave` to the top of your script.
wandb: For more information, check out the docs at: https://weave-docs.wandb.ai/


Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss
1,4.2091
2,3.8457
3,4.1687
4,3.301
5,3.6058
6,3.2325
7,3.4118
8,3.184
9,3.0969
10,2.7754




0,1
train/epoch,‚ñÅ‚ñÅ‚ñÅ‚ñÅ‚ñÅ‚ñÇ‚ñÇ‚ñÇ‚ñÇ‚ñÇ‚ñÉ‚ñÉ‚ñÉ‚ñÉ‚ñÉ‚ñÑ‚ñÑ‚ñÑ‚ñÑ‚ñÖ‚ñÖ‚ñÖ‚ñÖ‚ñÖ‚ñÜ‚ñÜ‚ñÜ‚ñÜ‚ñÜ‚ñÜ‚ñÜ‚ñá‚ñá‚ñá‚ñá‚ñá‚ñà‚ñà‚ñà‚ñà
train/global_step,‚ñÅ‚ñÅ‚ñÅ‚ñÅ‚ñÇ‚ñÇ‚ñÇ‚ñÇ‚ñÇ‚ñÇ‚ñÇ‚ñÉ‚ñÉ‚ñÉ‚ñÉ‚ñÉ‚ñÉ‚ñÑ‚ñÑ‚ñÑ‚ñÖ‚ñÖ‚ñÖ‚ñÖ‚ñÖ‚ñÖ‚ñÖ‚ñÜ‚ñÜ‚ñÜ‚ñÜ‚ñÜ‚ñÜ‚ñá‚ñá‚ñá‚ñà‚ñà‚ñà‚ñà
train/grad_norm,‚ñÅ‚ñÅ‚ñÅ‚ñÅ‚ñÇ‚ñÇ‚ñÇ‚ñÇ‚ñÇ‚ñÉ‚ñÑ‚ñÉ‚ñÉ‚ñÉ‚ñÉ‚ñÉ‚ñÑ‚ñÇ‚ñÑ‚ñÖ‚ñÖ‚ñÖ‚ñÖ‚ñá‚ñÜ‚ñÜ‚ñÖ‚ñÑ‚ñÑ‚ñá‚ñÜ‚ñÜ‚ñÖ‚ñÖ‚ñÜ‚ñÜ‚ñà‚ñá‚ñá‚ñÜ
train/learning_rate,‚ñà‚ñà‚ñà‚ñà‚ñà‚ñá‚ñá‚ñá‚ñá‚ñá‚ñá‚ñÜ‚ñÜ‚ñÜ‚ñÜ‚ñÜ‚ñÖ‚ñÖ‚ñÖ‚ñÖ‚ñÖ‚ñÖ‚ñÖ‚ñÑ‚ñÑ‚ñÑ‚ñÑ‚ñÑ‚ñÉ‚ñÉ‚ñÉ‚ñÉ‚ñÉ‚ñÉ‚ñÇ‚ñÇ‚ñÅ‚ñÅ‚ñÅ‚ñÅ
train/loss,‚ñà‚ñà‚ñÜ‚ñÖ‚ñá‚ñÜ‚ñÜ‚ñÜ‚ñÖ‚ñÉ‚ñÖ‚ñÑ‚ñÑ‚ñÑ‚ñÖ‚ñÑ‚ñÉ‚ñÅ‚ñÉ‚ñÑ‚ñÑ‚ñÑ‚ñÉ‚ñÉ‚ñÑ‚ñÑ‚ñÉ‚ñÉ‚ñÇ‚ñÉ‚ñÇ‚ñÇ‚ñÉ‚ñÇ‚ñÇ‚ñÇ‚ñÇ‚ñÇ‚ñÅ‚ñÉ

0,1
total_flos,2.0021117180780544e+16
train/epoch,2.67651
train/global_step,500.0
train/grad_norm,2.22145
train/learning_rate,0.0
train/loss,1.0717
train_loss,1.72605
train_runtime,2104.4412
train_samples_per_second,1.901
train_steps_per_second,0.238



‚úÖ Training Complete!


Now, the model is saved on drive

#Testing after training

In [9]:
# @title 3. Test Model Performance

from transformers import TextStreamer

# 1. Enable Fast Inference Mode
print("‚ö° Enabling fast inference mode...")
from unsloth import FastLanguageModel
FastLanguageModel.for_inference(model)
print("‚úÖ Model ready for inference!\n")

# 2. Define the Prompt Format (Same as training)
alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Response:
{}"""

# 3. Test Cases (Roman Urdu - Different from training examples)
test_cases = [
    ("Pakistan mein sabse purani University kaun si hai?", ""),
    ("Urdu kehlaqikhayaat kya hein?", ""),
    ("Machine Learning aur Deep Learning mein kya farq hai?", ""),
    ("Ramazan mein roza rakhne ke fawaiid kya hain?", ""),
    ("Lahore mein ghoomne ke liye best jagahein kaunsi hain?", ""),
    ("Sardi se bachaao ke liye kya tips hain?", ""),
    ("Smartphone ka istemaal sahih tareeka kya hai?", ""),
    ("Internet ki security ko secure rakhne ke liye kya karein?", ""),
]

# 4. Run Tests
print("ü§ñ Starting Model Performance Test...\n" + "="*50)

for i, (question, context) in enumerate(test_cases, 1):
    print(f"\nüìù Test Case #{i}")
    print(f"‚ùì Question: {question}")

    # Format the prompt
    prompt = alpaca_prompt.format(
        question,  # instruction
        context,   # input (optional context)
        ""         # response (leave blank for generation)
    )

    # Tokenize
    inputs = tokenizer([prompt], return_tensors="pt").to("cuda")

    # Generate with optimized parameters
    outputs = model.generate(
        **inputs,
        max_new_tokens=256,
        temperature=0.7,
        top_p=0.9,
        use_cache=True,
        do_sample=True,
    )

    # Decode and clean response
    full_response = tokenizer.batch_decode(outputs, skip_special_tokens=False)[0]

    # Extract only the answer part (after "### Response:")
    try:
        answer = full_response.split("### Response:")[-1].strip()
        answer = answer.replace("<|end_of_text|>", "").strip()
    except:
        answer = full_response

    print(f"üí° Answer: {answer[:300]}...")  # Show first 300 chars
    print("-" * 50)

print("\n" + "="*50)
print("‚úÖ Testing Complete!")
print("="*50)


‚ö° Enabling fast inference mode...
‚úÖ Model ready for inference!

ü§ñ Starting Model Performance Test...

üìù Test Case #1
‚ùì Question: Pakistan mein sabse purani University kaun si hai?
üí° Answer: Pakistan mein sab se bara university Punjab University hai jahan University Road rakhti hai. Yeh 1882 mein Muhammad Ali Jinnah ne qaim karaya tha. Is university par 400 million rupees ka bait haq paida hua hai.[1] Yeh Pakistan ke zyada tar schoolon aur madaris ko mutaliq tor par ahem kora hai. In sa...
--------------------------------------------------

üìù Test Case #2
‚ùì Question: Urdu kehlaqikhayaat kya hein?
üí° Answer: Urdu ki mukhtalif kahlayaath hain. Yeh Hindustani zabanon mein se aik hai aur is mein Brij Basha, Pahari, Awadhi, Kala Dehli, Awadhi aur Braj bain ul Arabi bain-ul-Arabiyat ki tashkeel hasil hai. Urdu aik qisam ki zaban hai jis ki mukhtalif shakal ghair masawi mumtaz musabqat ke saath aik shakal mei...
--------------------------------------------------

üìù Tes

In [10]:
# @title 4. Comprehensive Evaluation: NLG + Deployment Metrics

import subprocess
import sys

print("üì¶ Installing evaluation libraries...")
subprocess.check_call([sys.executable, "-m", "pip", "install", "-q",
                       "rouge_score", "nltk", "pandas"])

import time
import torch
import numpy as np
import pandas as pd
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from nltk.translate.meteor_score import meteor_score
from rouge_score import rouge_scorer
import nltk
from datetime import datetime

nltk.download('punkt', quiet=True)
nltk.download('wordnet', quiet=True)
nltk.download('averaged_perceptron_tagger', quiet=True)

print("‚úÖ Libraries ready!\n")

# ============= TEST DATASET =============

reference_dataset = [
    {
        "question": "Pakistan ka sab se bara shehr kaunsa hai?",
        "reference": "Pakistan ka sab se bara shehr Karachi hai, jahan ki aabadi 15 million se zyada hai."
    },
    {
        "question": "Urdu kehlaqikhayaat kya hein?",
        "reference": "Urdu ki khususiaat mein Nastaliq likhai, Persian aur Arabic shabd shamil hain."
    },
    {
        "question": "Machine Learning aur Deep Learning mein kya farq hai?",
        "reference": "ML mein features manually diye jate hain, DL mein model apne-aap features seekhta hai."
    },
    {
        "question": "Ramazan mein roza rakhne ke fawaiid kya hain?",
        "reference": "Roza rakhne se sehat behtar hoti hai aur nafs par kontrol hasil hota hai."
    },
    {
        "question": "Lahore mein ghoomne ke liye best jagahein kaunsi hain?",
        "reference": "Badshahi Mosque, Lahore Fort, Data Darbar, aur Shalimar Gardens mashhoor jagahein hain."
    }
]

alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Response:
{}"""

rouge_scorer_obj = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
smoothing_function = SmoothingFunction().method4
results = []

# ============= HELPER FUNCTIONS =============

def get_model_size():
    """Calculate model parameters and size"""
    total_params = sum(p.numel() for p in model.parameters())
    trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    return {
        "total_params": total_params,
        "trainable_params": trainable_params,
        "param_ratio": (trainable_params / total_params * 100) if total_params > 0 else 0,
        "model_size_mb": (total_params * 4) / (1024**2)
    }

def get_memory_usage():
    """Get GPU memory usage"""
    if torch.cuda.is_available():
        torch.cuda.synchronize()
        memory_allocated = torch.cuda.memory_allocated() / (1024**3)
        memory_reserved = torch.cuda.memory_reserved() / (1024**3)
        return {
            "gpu_memory_allocated_gb": round(memory_allocated, 3),
            "gpu_memory_reserved_gb": round(memory_reserved, 3)
        }
    return {"gpu_memory_allocated_gb": 0, "gpu_memory_reserved_gb": 0}

# ============= EVALUATION =============

print("üöÄ Starting Comprehensive Evaluation...\n" + "="*70)

inference_times = []
token_counts = []

for idx, item in enumerate(reference_dataset, 1):
    question = item['question']
    reference = item['reference']

    print(f"\nüìù Test Case #{idx}: {question[:40]}...")

    start_time = time.time()

    inputs = tokenizer([alpaca_prompt.format(question, "", "")], return_tensors="pt").to("cuda")
    outputs = model.generate(**inputs, max_new_tokens=256, temperature=0.7, top_p=0.9, use_cache=True, do_sample=True)

    inference_time = time.time() - start_time
    inference_times.append(inference_time)

    full_response = tokenizer.batch_decode(outputs, skip_special_tokens=False)[0]
    model_answer = full_response.split("### Response:")[-1].strip().replace("<|end_of_text|>", "")

    generated_tokens = outputs.shape[1] - inputs['input_ids'].shape[1]
    token_counts.append(generated_tokens)

    # ===== NLG METRICS =====
    ref_tokens = [reference.split()]
    pred_tokens = model_answer.split()

    bleu1 = sentence_bleu(ref_tokens, pred_tokens, weights=(1, 0, 0, 0), smoothing_function=smoothing_function)
    bleu2 = sentence_bleu(ref_tokens, pred_tokens, weights=(0.5, 0.5, 0, 0), smoothing_function=smoothing_function)
    bleu3 = sentence_bleu(ref_tokens, pred_tokens, weights=(0.33, 0.33, 0.33, 0), smoothing_function=smoothing_function)
    bleu4 = sentence_bleu(ref_tokens, pred_tokens, weights=(0.25, 0.25, 0.25, 0.25), smoothing_function=smoothing_function)

    rouge_scores = rouge_scorer_obj.score(reference, model_answer)
    rouge1_f = rouge_scores['rouge1'].fmeasure
    rouge2_f = rouge_scores['rouge2'].fmeasure
    rougeL_f = rouge_scores['rougeL'].fmeasure

    try:
        meteor = meteor_score([reference.split()], pred_tokens)
    except:
        meteor = 0.0

    pred_length = len(pred_tokens)
    ref_length = len(reference.split())

    results.append({
        "Question": question[:50],
        "BLEU-1": round(bleu1, 4),
        "BLEU-2": round(bleu2, 4),
        "BLEU-3": round(bleu3, 4),
        "BLEU-4": round(bleu4, 4),
        "ROUGE-1": round(rouge1_f, 4),
        "ROUGE-2": round(rouge2_f, 4),
        "ROUGE-L": round(rougeL_f, 4),
        "METEOR": round(meteor, 4),
        "Inference_Time_s": round(inference_time, 4),
        "Tokens_per_Sec": round(generated_tokens / inference_time, 2),
    })

    print(f"   ‚úÖ BLEU-4: {bleu4:.4f}, ROUGE-L: {rougeL_f:.4f}, ‚è±Ô∏è {inference_time:.4f}s")

# ============= RESULTS TABLE =============

df_results = pd.DataFrame(results)

print("\n" + "="*70)
print("üìä DETAILED RESULTS")
print("="*70)
print(df_results.to_string(index=False))

# ============= AGGREGATE NLG METRICS =============

print("\n" + "="*70)
print("üìà AGGREGATE NLG METRICS")
print("="*70)

nlg_metrics = {
    "BLEU-1": df_results['BLEU-1'].mean(),
    "BLEU-2": df_results['BLEU-2'].mean(),
    "BLEU-3": df_results['BLEU-3'].mean(),
    "BLEU-4": df_results['BLEU-4'].mean(),
    "ROUGE-1": df_results['ROUGE-1'].mean(),
    "ROUGE-2": df_results['ROUGE-2'].mean(),
    "ROUGE-L": df_results['ROUGE-L'].mean(),
    "METEOR": df_results['METEOR'].mean(),
}

for metric, value in nlg_metrics.items():
    print(f"‚úÖ Average {metric}: {value:.4f}")

# ============= DEPLOYMENT METRICS =============

print("\n" + "="*70)
print("‚öôÔ∏è  DEPLOYMENT & PERFORMANCE METRICS")
print("="*70)

model_size_metrics = get_model_size()
print(f"\nüìä Model Size Metrics:")
print(f"   ‚Ä¢ Total Parameters: {model_size_metrics['total_params']:,}")
print(f"   ‚Ä¢ Trainable Parameters (LoRA): {model_size_metrics['trainable_params']:,}")
print(f"   ‚Ä¢ Trainable %: {model_size_metrics['param_ratio']:.2f}%")
print(f"   ‚Ä¢ Model Size (float32): {model_size_metrics['model_size_mb']:.2f} MB")
print(f"   ‚Ä¢ Model Size (int8): {model_size_metrics['model_size_mb']/4:.2f} MB")
print(f"   ‚Ä¢ Model Size (int4): {model_size_metrics['model_size_mb']/8:.2f} MB")

avg_inference_time = np.mean(inference_times)
std_inference_time = np.std(inference_times)
avg_tokens_per_sec = np.mean([token_counts[i]/inference_times[i] for i in range(len(inference_times))])

print(f"\n‚è±Ô∏è  Inference Speed Metrics:")
print(f"   ‚Ä¢ Avg Inference Time: {avg_inference_time:.4f}s")
print(f"   ‚Ä¢ Std Deviation: {std_inference_time:.4f}s")
print(f"   ‚Ä¢ Min Time: {min(inference_times):.4f}s")
print(f"   ‚Ä¢ Max Time: {max(inference_times):.4f}s")
print(f"   ‚Ä¢ Avg Tokens/Sec: {avg_tokens_per_sec:.2f}")
print(f"   ‚Ä¢ Avg Generated Tokens: {np.mean(token_counts):.0f}")

memory_metrics = get_memory_usage()
print(f"\nüíæ Memory Metrics:")
print(f"   ‚Ä¢ GPU Allocated: {memory_metrics['gpu_memory_allocated_gb']:.3f} GB")
print(f"   ‚Ä¢ GPU Reserved: {memory_metrics['gpu_memory_reserved_gb']:.3f} GB")

print(f"\nüì± Edge Device Suitability:")
latency_per_token = (avg_inference_time*1000)/np.mean(token_counts)
print(f"   ‚Ä¢ Latency/Token: {latency_per_token:.2f}ms")
print(f"   ‚Ä¢ Real-time Chat: {'‚úÖ YES' if avg_inference_time < 2.0 else '‚ùå NO'}")
print(f"   ‚Ä¢ CPU Deployable: {'‚úÖ YES' if model_size_metrics['model_size_mb'] < 5000 else '‚ö†Ô∏è  Requires optimization'}")
print(f"   ‚Ä¢ Mobile (int8): {'‚úÖ YES' if model_size_metrics['model_size_mb']/4 < 1000 else '‚ö†Ô∏è  LARGE'}")

# ============= SUMMARY =============

print("\n" + "="*70)
print("üìã EVALUATION SUMMARY")
print("="*70)

summary = {
    "Avg BLEU-1": f"{nlg_metrics['BLEU-1']:.4f}",
    "Avg BLEU-4": f"{nlg_metrics['BLEU-4']:.4f}",
    "Avg ROUGE-L": f"{nlg_metrics['ROUGE-L']:.4f}",
    "Avg METEOR": f"{nlg_metrics['METEOR']:.4f}",
    "Inference Time": f"{avg_inference_time:.4f}s",
    "Tokens/Sec": f"{avg_tokens_per_sec:.2f}",
    "Model Size (MB)": f"{model_size_metrics['model_size_mb']:.2f}",
    "GPU Memory (GB)": f"{memory_metrics['gpu_memory_allocated_gb']:.3f}",
    "Trainable Params": f"{model_size_metrics['trainable_params']:,}",
}

for key, value in summary.items():
    print(f"{key:.<40} {value}")

# ============= SAVE RESULTS =============

print("\n" + "="*70)
print("üíæ SAVING RESULTS")
print("="*70)

df_results.to_csv("/tmp/evaluation_results.csv", index=False)
print(f"‚úÖ Results saved: /tmp/evaluation_results.csv")

import json
with open('/tmp/evaluation_report.json', 'w', encoding='utf-8') as f:
    json.dump({"summary": summary, "nlg_metrics": nlg_metrics}, f, indent=2)
print(f"‚úÖ JSON saved: /tmp/evaluation_report.json")

print("\n" + "="*70)
print("‚úÖ EVALUATION COMPLETE!")
print("="*70)


üì¶ Installing evaluation libraries...
‚úÖ Libraries ready!

üöÄ Starting Comprehensive Evaluation...

üìù Test Case #1: Pakistan ka sab se bara shehr kaunsa hai...
   ‚úÖ BLEU-4: 0.1902, ROUGE-L: 0.4615, ‚è±Ô∏è 3.2665s

üìù Test Case #2: Urdu kehlaqikhayaat kya hein?...
   ‚úÖ BLEU-4: 0.0079, ROUGE-L: 0.0938, ‚è±Ô∏è 50.8816s

üìù Test Case #3: Machine Learning aur Deep Learning mein ...
   ‚úÖ BLEU-4: 0.0050, ROUGE-L: 0.0833, ‚è±Ô∏è 23.1368s

üìù Test Case #4: Ramazan mein roza rakhne ke fawaiid kya ...
   ‚úÖ BLEU-4: 0.0042, ROUGE-L: 0.0620, ‚è±Ô∏è 20.6935s

üìù Test Case #5: Lahore mein ghoomne ke liye best jagahei...
   ‚úÖ BLEU-4: 0.0043, ROUGE-L: 0.0333, ‚è±Ô∏è 15.8244s

üìä DETAILED RESULTS
                                          Question  BLEU-1  BLEU-2  BLEU-3  BLEU-4  ROUGE-1  ROUGE-2  ROUGE-L  METEOR  Inference_Time_s  Tokens_per_Sec
         Pakistan ka sab se bara shehr kaunsa hai?  0.2299  0.2199  0.2084  0.1902   0.4615   0.3333   0.4615  0.2934            3.26

In [11]:
# @title
# # @title 5. Evaluation with Referenced Test Cases + Metrics

# import time
# import torch
# import numpy as np
# import pandas as pd
# from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
# from nltk.translate.meteor_score import meteor_score
# from rouge_score import rouge_scorer
# import nltk

# nltk.download('punkt', quiet=True)
# nltk.download('wordnet', quiet=True)
# nltk.download('averaged_perceptron_tagger', quiet=True)

# print("‚úÖ Libraries ready!\n")

# # ============= EXACT TEST CASES FROM REFERENCED CODE =============

# alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

# ### Instruction:
# {}

# ### Input:
# {}

# ### Response:
# {}"""

# # EXACT test cases from referenced code
# test_cases = [
#     "Pakistan ka sab se bara shehr kaunsa hai?",
#     "Biryani banane ka tareeqa batao.",
#     "Artificial Intelligence kya hoti hai?",
#     "Garmi se bachne ke liye kya karna chahiye?"
# ]

# # Reference answers (baseline expectations)
# reference_answers = [
#     "Pakistan ka sab se bara shehr Karachi hai. Yeh Sindh ka shahri markaz hai aur aabadi ke lihaz se Pakistan ka sab se bada shehr hai.",
#     "Biryani banane ke liye pehle chawal ko 20-30 minute pani mein bhigo dijiye. Gosht ko masaloun ke saath 30 minute marinate kijiye. Ek bartan mein ghee laga kar chawal aur gosht ko alternate layers mein rakhiye. Neeche ki aag zor ke saath aur upar kam rahe. 45 minute tak pakane dijiye.",
#     "Artificial Intelligence ya AI ek computer science ka field hai jis mein machines ko insaani zehanat ki tarah sochne aur seekhne ki salaahiat di jati hai. ML aur DL AI ke mukhtalif fields hain.",
#     "Garmi se bachne ke liye thande pani mein naan duboke rakhen, barfi ya frozen mango pulp khayein, garmi ke samay bahar na niklen, AC mein rahen aur electrolytes lein."
# ]

# print("üöÄ Starting Evaluation with Referenced Test Cases...\n" + "="*70)

# # ============= SETUP =============

# rouge_scorer_obj = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
# smoothing_function = SmoothingFunction().method4
# results = []
# inference_times = []
# token_counts = []

# # ============= EVALUATION LOOP =============

# for idx, (question, reference) in enumerate(zip(test_cases, reference_answers), 1):
#     print(f"\n{'='*70}")
#     print(f"üìù Test Case #{idx}")
#     print(f"{'='*70}")
#     print(f"‚ùì Question: {question}")

#     # ===== INFERENCE =====
#     start_time = time.time()

#     inputs = tokenizer([alpaca_prompt.format(question, "", "")], return_tensors="pt").to("cuda")
#     outputs = model.generate(**inputs, max_new_tokens=128, use_cache=True)

#     inference_time = time.time() - start_time
#     inference_times.append(inference_time)

#     # Decode (exactly as referenced code)
#     decoded = tokenizer.batch_decode(outputs)[0]
#     model_answer = decoded.split("### Response:")[-1].strip().replace("<|end_of_text|>", "")

#     print(f"üí° Model Answer: {model_answer[:150]}...")

#     generated_tokens = outputs.shape[1] - inputs['input_ids'].shape[1]
#     token_counts.append(generated_tokens)

#     # ===== NLG METRICS =====
#     ref_tokens = [reference.split()]
#     pred_tokens = model_answer.split()

#     bleu1 = sentence_bleu(ref_tokens, pred_tokens, weights=(1, 0, 0, 0), smoothing_function=smoothing_function)
#     bleu2 = sentence_bleu(ref_tokens, pred_tokens, weights=(0.5, 0.5, 0, 0), smoothing_function=smoothing_function)
#     bleu3 = sentence_bleu(ref_tokens, pred_tokens, weights=(0.33, 0.33, 0.33, 0), smoothing_function=smoothing_function)
#     bleu4 = sentence_bleu(ref_tokens, pred_tokens, weights=(0.25, 0.25, 0.25, 0.25), smoothing_function=smoothing_function)

#     rouge_scores = rouge_scorer_obj.score(reference, model_answer)
#     rouge1_f = rouge_scores['rouge1'].fmeasure
#     rouge2_f = rouge_scores['rouge2'].fmeasure
#     rougeL_f = rouge_scores['rougeL'].fmeasure

#     try:
#         meteor = meteor_score([reference.split()], pred_tokens)
#     except:
#         meteor = 0.0

#     results.append({
#         "Test_Case": idx,
#         "Question": question,
#         "BLEU-1": round(bleu1, 4),
#         "BLEU-2": round(bleu2, 4),
#         "BLEU-3": round(bleu3, 4),
#         "BLEU-4": round(bleu4, 4),
#         "ROUGE-1": round(rouge1_f, 4),
#         "ROUGE-2": round(rouge2_f, 4),
#         "ROUGE-L": round(rougeL_f, 4),
#         "METEOR": round(meteor, 4),
#         "Inference_Time_s": round(inference_time, 4),
#         "Generated_Tokens": generated_tokens,
#         "Tokens_per_Sec": round(generated_tokens / inference_time, 2),
#     })

#     print(f"\nüìä Metrics:")
#     print(f"   BLEU-1: {bleu1:.4f}  |  BLEU-4: {bleu4:.4f}")
#     print(f"   ROUGE-1: {rouge1_f:.4f}  |  ROUGE-L: {rougeL_f:.4f}")
#     print(f"   METEOR: {meteor:.4f}")
#     print(f"   ‚è±Ô∏è  {inference_time:.4f}s  |  {generated_tokens/inference_time:.2f} tokens/sec")

# # ============= RESULTS TABLE =============

# print("\n" + "="*70)
# print("üìä DETAILED RESULTS TABLE")
# print("="*70)

# df_results = pd.DataFrame(results)
# print("\n" + df_results[['Test_Case', 'Question', 'BLEU-1', 'BLEU-4', 'ROUGE-1', 'ROUGE-L', 'METEOR', 'Inference_Time_s']].to_string(index=False))

# # ============= AGGREGATE METRICS =============

# print("\n" + "="*70)
# print("üìà AGGREGATE METRICS (4 test cases)")
# print("="*70)

# aggregate_metrics = {
#     "BLEU-1": df_results['BLEU-1'].mean(),
#     "BLEU-2": df_results['BLEU-2'].mean(),
#     "BLEU-3": df_results['BLEU-3'].mean(),
#     "BLEU-4": df_results['BLEU-4'].mean(),
#     "ROUGE-1": df_results['ROUGE-1'].mean(),
#     "ROUGE-2": df_results['ROUGE-2'].mean(),
#     "ROUGE-L": df_results['ROUGE-L'].mean(),
#     "METEOR": df_results['METEOR'].mean(),
# }

# print("\n‚úÖ Average NLG Metrics:")
# for metric, value in aggregate_metrics.items():
#     print(f"   {metric}: {value:.4f}")

# # ============= PERFORMANCE =============

# avg_inference_time = np.mean(inference_times)
# avg_tokens_per_sec = np.mean([token_counts[i]/inference_times[i] for i in range(len(inference_times))])

# print("\n" + "="*70)
# print("‚è±Ô∏è  PERFORMANCE METRICS")
# print("="*70)

# print(f"""
# ‚úÖ Inference Speed:
#    Avg Time: {avg_inference_time:.4f}s
#    Min Time: {min(inference_times):.4f}s
#    Max Time: {max(inference_times):.4f}s
#    Tokens/Sec: {avg_tokens_per_sec:.2f}
#    Latency/Token: {(avg_inference_time*1000)/np.mean(token_counts):.2f}ms
# """)

# # ============= COMPARISON =============

# print("="*70)
# print("üîÑ COMPARISON: Referenced vs Previous Evaluation")
# print("="*70)

# comparison = {
#     "Metric": ["BLEU-1", "BLEU-4", "ROUGE-1", "ROUGE-L", "METEOR"],
#     "Previous (5 cases)": ["0.0742", "0.0423", "0.1532", "0.1468", "0.1374"],
#     "Referenced (4 cases)": [
#         f"{aggregate_metrics['BLEU-1']:.4f}",
#         f"{aggregate_metrics['BLEU-4']:.4f}",
#         f"{aggregate_metrics['ROUGE-1']:.4f}",
#         f"{aggregate_metrics['ROUGE-L']:.4f}",
#         f"{aggregate_metrics['METEOR']:.4f}",
#     ]
# }

# df_comp = pd.DataFrame(comparison)
# print("\n" + df_comp.to_string(index=False))

# # ============= FINAL SUMMARY =============

# print("\n" + "="*70)
# print("‚úÖ EVALUATION COMPLETE!")
# print("="*70)

# # Save results
# df_results.to_csv("/tmp/evaluation_referenced_testcases.csv", index=False)
# print(f"‚úÖ Results saved: /tmp/evaluation_referenced_testcases.csv")


‚úÖ Libraries ready!

üöÄ Starting Evaluation with Referenced Test Cases...

üìù Test Case #1
‚ùì Question: Pakistan ka sab se bara shehr kaunsa hai?
üí° Model Answer: Pakistan ka sab se bara shehar Punjab ki provincial division Jalandhar ke wida mein baqaida hai. Yeh Pakistan ka sab se bara maholiyati shehar aur Pun...

üìä Metrics:
   BLEU-1: 0.2267  |  BLEU-4: 0.0942
   ROUGE-1: 0.3301  |  ROUGE-L: 0.2524
   METEOR: 0.3965
   ‚è±Ô∏è  10.8071s  |  11.84 tokens/sec

üìù Test Case #2
‚ùì Question: Biryani banane ka tareeqa batao.
üí° Model Answer: Yaqeenan, aap ne biryani ka khana bohat chhotay se chhotay poday mein pakai hain, lekin yeh bohat saaf tareeqa bhi suna liya hai.
Aap ko aik mahin bae...

üìä Metrics:
   BLEU-1: 0.1538  |  BLEU-4: 0.0086
   ROUGE-1: 0.2069  |  ROUGE-L: 0.1379
   METEOR: 0.1145
   ‚è±Ô∏è  10.1153s  |  12.65 tokens/sec

üìù Test Case #3
‚ùì Question: Artificial Intelligence kya hoti hai?
üí° Model Answer: Artificial Intelligence (AI) aik mukhtasar sho

##4. Launch Public Web App (Gradio)

In [None]:
# @title 4. Launch Public Web App (Gradio)
!pip install -q gradio

import gradio as gr
from transformers import TextStreamer
from unsloth import FastLanguageModel

# 1. Enable fast inference
FastLanguageModel.for_inference(model)

def chat_response(message, history):
    prompt = alpaca_prompt.format(
        message,  # instruction
        "",       # input
        "",       # response (blank)
    )

    inputs = tokenizer([prompt], return_tensors="pt").to("cuda")
    outputs = model.generate(
        **inputs,
        max_new_tokens=256,
        use_cache=True,
    )

    decoded_output = tokenizer.batch_decode(outputs)[0]
    cleaned_response = decoded_output.split("### Response:")[-1].strip().replace("<|end_of_text|>", "")
    return cleaned_response

demo = gr.ChatInterface(
    fn=chat_response,
    title="ü§ñ My Roman Urdu AI",
    description="Mujh se kuch bhi poochain! Main aap ke sawalat ka jawab dene ke liye tayaar hoon. Aap kya janna chahte hain?",

    examples=[
        "Pakistan ka sab say bda shehar konsa hai?",
        "Biryani ki recipe batao",
        "Gujranwala kyun mashhoor hai?",
        "Defence Raya kahan waqia hai?"
    ],
    theme="soft",
)

print("üöÄ Launching... Click the link that ends in '.gradio.live' below!")
demo.launch(share=True, debug=True)


  self.chatbot = Chatbot(


üöÄ Launching... Click the link that ends in '.gradio.live' below!
Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
* Running on public URL: https://624b09eb6ae8a09f1d.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


#Save to GGUF (16-bit)

Code is continue in the same testing/inferences colab file : https://colab.research.google.com/drive/1ob1OQgX7OoTNNAUGn1nsfywOx8y7Yc7h?usp=sharing

In [None]:
# # @title 5. Save Model to GGUF (For Deployment)
# # Ye code aapke model ko ek single file (model.gguf) mein convert karke
# # Google Drive mein save kar dega.

# # 1. Save to GGUF (16-bit)
# # Ye file thori bari hogi lekin quality best hogi.
# if False: model.save_pretrained_gguf("model_f16", tokenizer, quantization_method = "f16")

# # 2. Save to GGUF (8-bit Quantized) - RECOMMENDED
# # Ye file size mein choti hogi aur fast chalegi.
# print("üíæ Saving model to Google Drive as GGUF...")
# model.save_pretrained_gguf(
#     "/content/drive/My Drive/RomanUrduModel/model_q8", # Folder in Drive
#     tokenizer,
#     quantization_method = "q8_0" # 8-bit quantization
# )
# print("‚úÖ DONE! Check your Google Drive folder for 'model_q8.gguf'")

#Move code from drive to hugging face

For this code is on testing/inferences colab file : https://colab.research.google.com/drive/1ob1OQgX7OoTNNAUGn1nsfywOx8y7Yc7h?usp=sharing

In [None]:
# # @title 6. Push Model to Hugging Face Hub
# from huggingface_hub import login

# # 1. Login (Paste your token when asked)
# # Yahan apna "Write" token paste karein
# hf_token = "I will add it here" # <--- Paste Token Here
# login(token=hf_token)

# # 2. Upload to Hugging Face
# username = "mfareedkhan"
# model_name = "Roman-Urdu-Qwen2.5-3B-Instruct-4bit"

# print("‚òÅÔ∏è Uploading model to Hugging Face...")
# model.push_to_hub_gguf(
#     f"{username}/{model_name}",
#     tokenizer,
#     quantization_method = "q8_0",
#     token = hf_token
# )

# print(f"‚úÖ DONE! Model is live at: https://huggingface.co/{username}/{model_name}")