In [1]:
# Cell 1: Fix Libraries
!pip install -q -U "bitsandbytes>=0.46.1"
!pip install -q -U accelerate transformers datasets
print("✅ Libraries updated. NOW GO TO 'RUNTIME > RESTART SESSION' to make it work.")

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.1/59.1 MB[0m [31m18.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m515.2/515.2 kB[0m [31m10.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m47.6/47.6 MB[0m [31m11.7 MB/s[0m eta [36m0:00:00[0m
[?25h✅ Libraries updated. NOW GO TO 'RUNTIME > RESTART SESSION' to make it work.


In [2]:
from google.colab import files
import os

# Upload the file(s) through the Colab file picker.
EXPECTED_UPLOAD = "synthetic_qa_pairs.jsonl"
print(f"Please upload '{EXPECTED_UPLOAD}'...")
uploaded = files.upload()

# Verify upload: list everything that arrived ...
for filename in uploaded:
    print(f"User uploaded file: {filename}")

# ... and warn (without aborting) when the file we asked for is not among them.
if EXPECTED_UPLOAD not in uploaded:
    print(f"⚠️ '{EXPECTED_UPLOAD}' was not found among the uploaded files.")

Please upload 'synthetic_qa_pairs.jsonl'...


Saving urdu_covid_passages_min.jsonl to urdu_covid_passages_min.jsonl
Saving urdu_covid_passages.tsv to urdu_covid_passages.tsv
Saving urdu_covid_corpus_clean.jsonl to urdu_covid_corpus_clean.jsonl
Saving synthetic_qa_pairs.jsonl to synthetic_qa_pairs.jsonl
Saving eval_queries.jsonl to eval_queries.jsonl
Saving hard_negatives.jsonl to hard_negatives.jsonl
User uploaded file: urdu_covid_passages_min.jsonl
User uploaded file: urdu_covid_passages.tsv
User uploaded file: urdu_covid_corpus_clean.jsonl
User uploaded file: synthetic_qa_pairs.jsonl
User uploaded file: eval_queries.jsonl
User uploaded file: hard_negatives.jsonl


In [1]:
# Cell 2: Load Model for Linguistic Testing
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

model_id = "Qwen/Qwen2.5-1.5B-Instruct"

# 4-bit NF4 weight quantization with float16 compute.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
)

print(f"⏳ Loading {model_id}...")
tokenizer = AutoTokenizer.from_pretrained(model_id)
# Only fall back to EOS when the checkpoint ships no pad token of its own;
# overwriting an existing pad token makes padding indistinguishable from EOS.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# trust_remote_code is intentionally not enabled: it allows code from the Hub
# repository to run locally, and this architecture is expected to be supported
# natively by the installed transformers release.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    device_map="auto",
)
print("✅ Model Loaded.")

⏳ Loading Qwen/Qwen2.5-1.5B-Instruct...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/660 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/3.09G [00:00<?, ?B/s]

Loading weights:   0%|          | 0/338 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/242 [00:00<?, ?B/s]

✅ Model Loaded.


In [3]:
# Cell 3: Fluency Score (Perplexity)
import json
import torch
import os
from tqdm import tqdm
from google.colab import files

# Upload Corpus if missing
CORPUS_PATH = "urdu_covid_corpus_clean.jsonl"
MIN_CHARS = 50        # keep only passages strictly longer than this many characters
N_TEST_SAMPLES = 50   # number of passages scored below

if not os.path.exists(CORPUS_PATH):
    print(f"Please upload '{CORPUS_PATH}'...")
    files.upload()

# Load text: one JSON object per line, passage stored under 'text' or 'content'.
texts = []
with open(CORPUS_PATH, "r", encoding="utf-8") as f:
    for line in f:
        line = line.strip()
        if not line:
            continue  # tolerate blank lines instead of crashing in json.loads
        doc = json.loads(line)
        # Fall back to 'content' when 'text' is missing, empty or null.
        t = doc.get('text') or doc.get('content') or ''
        if isinstance(t, str) and len(t) > MIN_CHARS:
            texts.append(t)

assert texts, f"No passages longer than {MIN_CHARS} characters found in {CORPUS_PATH}"

# Test on the first N_TEST_SAMPLES passages
test_subset = texts[:N_TEST_SAMPLES]

def calculate_perplexity(model, tokenizer, data):
    """Return the token-weighted perplexity of ``model`` over ``data``.

    Each text is tokenized on its own (truncated to 512 tokens), scored with
    the model's language-modelling loss, and the per-text mean losses are
    combined into one corpus-level average before exponentiating.

    Args:
        model: Causal LM called as ``model(**inputs, labels=...)`` whose output
            exposes a scalar ``.loss``; must provide ``.eval()`` and ``.device``.
        tokenizer: Callable returning a mapping with ``"input_ids"`` that
            supports ``.to(device)``.
        data: Sized iterable of strings to score.

    Returns:
        float: ``exp`` of the average loss per predicted token, or ``nan`` when
        no text contributes at least one predicted token (e.g. empty ``data``).
    """
    model.eval()
    total_loss = 0.0
    total_tokens = 0
    print(f"Calculating Perplexity on {len(data)} samples...")

    with torch.no_grad():
        for text in tqdm(data):
            inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512).to(model.device)

            # With labels == input_ids the causal-LM loss is a mean over the
            # shifted positions, i.e. seq_len - 1 predictions, so that is the
            # weight needed to undo the mean (not seq_len).
            n_predicted = inputs["input_ids"].size(1) - 1
            if n_predicted < 1:
                # Nothing to predict for a 0/1-token text; its loss would be NaN.
                continue

            outputs = model(**inputs, labels=inputs["input_ids"])

            total_loss += outputs.loss.item() * n_predicted
            total_tokens += n_predicted

    if total_tokens == 0:
        # Perplexity is undefined without any scored token.
        return float("nan")
    return torch.exp(torch.tensor(total_loss / total_tokens)).item()

# Score the held-out subset and print the result framed by a 40-character rule.
score = calculate_perplexity(model, tokenizer, test_subset)

rule = "=" * 40
print("\n" + rule)
print(f"🧠 URDU LINGUISTIC SCORE: {score:.2f}")
print(rule)

# Rough reading guide for the number above.
for legend_line in (
    "Interpretation:",
    "• < 30 : Native Speaker Level (Excellent)",
    "• 30-60: Fluent Second Language (Good)",
    "• > 100: Broken Grammar (Poor)",
):
    print(legend_line)

Calculating Perplexity on 50 samples...


100%|██████████| 50/50 [00:07<00:00,  6.96it/s]


🧠 URDU LINGUISTIC SCORE: 14.15
Interpretation:
• < 30 : Native Speaker Level (Excellent)
• 30-60: Fluent Second Language (Good)
• > 100: Broken Grammar (Poor)





In [4]:
# Cell 4: Grammar Generation Test
prompts = [
    "ایک مختصر کہانی لکھیں: 'بارش کا پہلا قطرہ'", # Creative writing
    "اگر میں روزانہ ورزش کروں تو میرے جسم میں کیا تبدیلیاں آئیں گی؟", # Conditional (If/Then)
]

# Sampling is stochastic; seed it so reruns in the same environment are comparable.
torch.manual_seed(0)

print("📝 GENERATING SAMPLES...")
for p in prompts:
    # Render the single-turn chat into the model's prompt format, then tokenize it.
    chat_text = tokenizer.apply_chat_template([{"role":"user", "content":p}], tokenize=False, add_generation_prompt=True)
    inputs = tokenizer(chat_text, return_tensors="pt").to(model.device)

    outputs = model.generate(**inputs, max_new_tokens=100, do_sample=True, temperature=0.7)

    # generate() returns prompt + continuation; keep only the new tokens rather
    # than splitting the decoded text on the word 'assistant', which would cut
    # any answer that happens to contain that word.
    prompt_len = inputs["input_ids"].shape[1]
    response = tokenizer.decode(outputs[0][prompt_len:], skip_special_tokens=True).strip()

    print(f"\n🔹 Prompt: {p}")
    print(f"🔸 Response: {response}")

📝 GENERATING SAMPLES...

🔹 Prompt: ایک مختصر کہانی لکھیں: 'بارش کا پہلا قطرہ'
🔸 Response: "برچس، ارگ نے بارش کا پہلا قطرہ بنائی۔"

🔹 Prompt: اگر میں روزانہ ورزش کروں تو میرے جسم میں کیا تبدیلیاں آئیں گی؟
🔸 Response: ایک ورزش کرنا اسٹیشن کے طور پر بھی لیکن، روزانہ ورزش کرنا اسٹیشن کے ساتھ نہیں ممکن ہو۔ دوستی شروع کرے آپ کو خاصیت فراطبیل کرتا ہو۔

1. وقت: ورزش کرنا م
