# **1 - Quran Dataset Preprocessing**

**1.1 - Normalized Quranic Text**

In [None]:
import json

# Source text file (one "surah|ayah|text" entry per line) and the JSON file
# that this cell produces from it.
text_file = '/content/1_Quran_arabic_tanzil.txt'
output_file = '/content/2_Quran_normalized_arabic.json'

# One {"surah", "ayah", "text"} dict per parsed line, in file order.
data = []

with open(text_file, 'r', encoding='utf-8') as source:
    for raw_line in source:
        # Split off only the first two fields so that any further '|'
        # characters remain part of the verse text.
        fields = raw_line.strip().split('|', 2)

        # Blank lines and lines that lack all three fields are ignored.
        if len(fields) < 3:
            continue

        surah_field, ayah_field, verse_text = fields
        data.append({
            "surah": int(surah_field),
            "ayah": int(ayah_field),
            "text": verse_text,
        })

with open(output_file, 'w', encoding='utf-8') as sink:
    json.dump(data, sink, ensure_ascii=False, indent=2)

print("Conversion completed! JSON saved to:", output_file)


Conversion completed! JSON saved to: /content/Quran_normalized_arabic.json


In [None]:
import json

def add_quran_id(input_path: str, output_path: str):
    """Number every record of a JSON file and write the result to a new file.

    Args:
        input_path: Path of a UTF-8 JSON file holding a sequence of dict records.
        output_path: Path the numbered records are written to (UTF-8, indent=2,
            non-ASCII characters kept as-is).

    Each record receives a ``quran_id`` key whose value is its 1-based position
    in the input sequence. A short summary line is printed when done.
    """
    with open(input_path, "r", encoding="utf-8") as source:
        records = json.load(source)

    # IDs follow file order: first record -> 1, second -> 2, ...
    for offset, record in enumerate(records):
        record["quran_id"] = offset + 1

    with open(output_path, "w", encoding="utf-8") as sink:
        json.dump(records, sink, ensure_ascii=False, indent=2)

    total = len(records)
    print(f"Added quran_id (1..{total}) and saved → {output_path}")

# Read the JSON records from input_path, give each one a sequential quran_id,
# and write the numbered records to output_path.
input_path = "/content/2_Quran_normalized_arabic.json"
output_path = "/content/3_Quran_normalized_arabic_with_ids.json"

add_quran_id(input_path, output_path)


Added quran_id (1..6236) and saved → /content/Quran_normalized_arabic_with_ids.json


In [None]:
import json
import re

# input_raw: JSON records that are loaded and passed to strip_prefixed_bismillah.
# output_fixed: where the records are saved after the Bismillah prefix has been
# stripped from the first ayah of surahs 2 and up.
input_raw = "/content/3_Quran_normalized_arabic_with_ids.json"
output_fixed = "/content/4_Quran_normalized_arabic_final.json"

# Matches the Bismillah phrase (exactly as spelled in the pattern) at the start
# of a string, together with any run of separator characters that directly
# follows it: spaces, zero-width / directional marks, commas, colons,
# semicolons, tatweel and dashes.
BISMILLAH_PREFIX_RE = re.compile(
    r"^بسم الله الرحمن الرحيم[ \u200c\u200f\u200e،,:;ـ\-–—]*"
)


def _select_text_key(rec):
    """Return the key whose value should be both read and written for ``rec``.

    Prefers the first of ``"text"`` / ``"text_ar"`` that holds non-empty
    content. If neither has content, falls back to the first of the two keys
    that exists at all, or ``None`` when the record has neither key.
    """
    for key in ("text", "text_ar"):
        if rec.get(key):
            return key
    for key in ("text", "text_ar"):
        if key in rec:
            return key
    return None


def strip_prefixed_bismillah(records):
    """Strip a leading Bismillah from the first ayah of every surah from 2 on.

    Args:
        records: Iterable of dicts. The surah number is read from ``surah_id``
            (falling back to ``surah``), the ayah number from ``ayah_id``
            (falling back to ``ayah``), and the verse text from ``text``
            (falling back to ``text_ar``).

    Returns:
        A list containing the same dict objects, in the same order. Records
        are modified in place; only records with surah >= 2 and ayah == 1 are
        touched.

    The stripped text is written back to the same key it was read from.
    Previously the write-back target was chosen by key presence alone, so a
    record with an empty ``text`` and a populated ``text_ar`` had its stripped
    text stored under ``text`` while ``text_ar`` kept the prefix.

    If stripping would leave nothing (the ayah consists only of the
    Bismillah), a warning is printed and the original text is kept.
    """
    fixed = []
    for rec in records:
        surah = rec.get("surah_id") or rec.get("surah")
        ayah = rec.get("ayah_id") or rec.get("ayah")

        # Decide once which field is the verse text so that reading and
        # writing can never disagree.
        text_key = _select_text_key(rec)
        text = (rec.get(text_key) or "") if text_key is not None else ""

        if surah is not None and ayah is not None:
            if surah >= 2 and ayah == 1:
                new_text = BISMILLAH_PREFIX_RE.sub("", text).strip()

                if not new_text:
                    print(f"[WARN] Surah {surah} Ayah {ayah}: stripping Bismillah removed all text, keeping original.")
                    new_text = text

                if text_key is not None:
                    rec[text_key] = new_text

        fixed.append(rec)

    return fixed

# Load the records, strip the prefixed Bismillah, spot-check one verse, and
# save the cleaned records.
with open(input_raw, "r", encoding="utf-8") as f:
    data = json.load(f)

fixed_data = strip_prefixed_bismillah(data)

# Spot check: show Surah 2, Ayah 1 after stripping. The surah/ayah numbers are
# looked up with the same key fallbacks that strip_prefixed_bismillah uses
# ("surah_id" or "surah", "ayah_id" or "ayah"); checking only the "*_id" keys
# never matched records that carry "surah"/"ayah", so nothing was printed.
for rec in fixed_data:
    rec_surah = rec.get("surah_id") or rec.get("surah")
    rec_ayah = rec.get("ayah_id") or rec.get("ayah")
    if rec_surah == 2 and rec_ayah == 1:
        print("Surah 2, Ayah 1 after fix:", rec.get("text") or rec.get("text_ar"))
        break

with open(output_fixed, "w", encoding="utf-8") as f:
    json.dump(fixed_data, f, ensure_ascii=False, indent=2)

print("Saved cleaned Quran to:", output_fixed)


Saved cleaned Quran to: /content/Quran_normalized_arabic_final.json


**1.2 - Quran Urdu & English Normalization**

In [None]:
%%capture
!pip install camel-tools
!pip install pyarabic
!pip install clean-text
!pip install ftfy
!pip install unidecode --quiet

import re
import json
import unicodedata

from camel_tools.utils.dediac import dediac_ar
from pyarabic import araby
from cleantext import clean
from ftfy import fix_text


In [None]:
# raw_dataset: input file holding the records whose Urdu/English text is normalized.
# output_dataset: where the normalized records are saved as JSON.
# NOTE(review): raw_dataset is not written by any cell visible in this notebook —
# confirm where this file comes from.
raw_dataset = "/content/3_Al_Quran_Dataset_FINAL.json"
output_dataset = "/content/5_Quran_normalized_ur_en.json"


In [None]:
import re
import unicodedata

# Punctuation characters (Urdu/Arabic and ASCII) that are replaced by a space.
URDU_PUNCT_RE = re.compile(r"[،۔؛؟!,:;]")

# Arabic-script marks that are treated as diacritics here and deleted outright.
DIACRITICS_RE = re.compile(r"[\u0610-\u061A\u064B-\u065F\u0670\u06D6-\u06ED]")

# Letter variants (keys) and the single form each one is rewritten to (values).
URDU_MAP = {
    "ي": "ی",
    "ى": "ی",
    "ئ": "ی",
    "ؤ": "و",
    "ك": "ک",
    "ھ": "ہ",
    "ة": "ہ",
}

# Anything outside the U+0600-U+06FF block, ASCII digits and whitespace.
NON_URDU_ALLOWED = re.compile(r"[^\u0600-\u06FF0-9\s]")


def normalize_urdu(text: str) -> str:
    """Return a normalized form of an Urdu string.

    Steps, in order: NFC normalization, letter-variant replacement via
    URDU_MAP, diacritic removal, punctuation -> space, tatweel removal,
    every other character outside U+0600-U+06FF / digits / whitespace ->
    space, and finally whitespace collapsing with the ends trimmed.

    A falsy input (``None`` or ``""``) yields ``""``.
    """
    if not text:
        return ""

    cleaned = unicodedata.normalize("NFC", text)

    # Rewrite each letter variant to its mapped form.
    for variant in URDU_MAP:
        cleaned = cleaned.replace(variant, URDU_MAP[variant])

    # Diacritics vanish; punctuation becomes a word boundary.
    cleaned = URDU_PUNCT_RE.sub(" ", DIACRITICS_RE.sub("", cleaned))

    # Tatweel is deleted before the catch-all filter blanks out the rest.
    cleaned = NON_URDU_ALLOWED.sub(" ", cleaned.replace("ـ", ""))

    single_spaced = re.sub(r"\s+", " ", cleaned)
    return single_spaced.strip()


In [None]:

def normalize_english(text: str) -> str:
    """Return a normalized form of an English string.

    The text is passed through ``fix_text`` and then ``clean`` (with the
    lower-casing, unicode-fixing, URL, punctuation and emoji flags enabled and
    the digit flag disabled); afterwards whitespace runs are collapsed to one
    space and the ends are trimmed.

    A falsy input (``None`` or ``""``) yields ``""``.
    """
    if text:
        clean_options = {
            "lower": True,
            "fix_unicode": True,
            "no_urls": True,
            "no_punct": True,
            "no_emoji": True,
            "no_digits": False,
        }
        cleaned = clean(fix_text(text), **clean_options)
        single_spaced = re.sub(r"\s+", " ", cleaned)
        return single_spaced.strip()

    return ""


In [None]:

def enrich_record(item: dict) -> dict:
    """Build the output record for one raw dataset entry.

    Metadata fields and ``text_ar`` are copied unchanged (a missing one raises
    ``KeyError``). ``normalized_ur`` and ``normalized_en`` are computed from
    ``text_ur`` and ``text_en``, each defaulting to an empty string when absent.
    """
    # Fields copied verbatim, listed in the order they appear in the output.
    passthrough_fields = (
        "quran_id",
        "juz",
        "surah_id",
        "ayah_id",
        "source",
        "transliteration",
        "surah_name_ar",
        "surah_name_ur",
        "surah_name_en",
        "surah_type",
        "text_ar",
    )

    enriched = {field: item[field] for field in passthrough_fields}
    enriched["normalized_ur"] = normalize_urdu(item.get("text_ur", ""))
    enriched["normalized_en"] = normalize_english(item.get("text_en", ""))
    return enriched

In [None]:
def load_and_preprocess_dataset(path: str):
    """Load a dataset file and return its records after ``enrich_record``.

    Args:
        path: UTF-8 file containing either one JSON array or one JSON document
            per line.

    Returns:
        A list with ``enrich_record`` applied to every loaded record, in file order.
    """
    with open(path, "r", encoding="utf-8") as handle:
        content = handle.read().strip()

    if content.startswith("["):
        # The whole file is a single JSON array.
        parsed = json.loads(content)
    else:
        # Otherwise parse line by line, ignoring blank lines.
        parsed = [json.loads(row) for row in content.splitlines() if row.strip()]

    return list(map(enrich_record, parsed))

def save_preprocessed_json(records, output_path):
    """Write ``records`` to ``output_path`` as indented UTF-8 JSON.

    Non-ASCII characters are written as-is rather than escaped. A confirmation
    line naming the output path is printed afterwards.
    """
    dump_options = {"ensure_ascii": False, "indent": 2}
    with open(output_path, "w", encoding="utf-8") as sink:
        json.dump(records, sink, **dump_options)

    print(f"\nSaved JSON dataset to {output_path}")

# Run the Urdu/English normalization end to end: load and enrich every record
# from raw_dataset, then write the result to output_dataset.
processed_data = load_and_preprocess_dataset(raw_dataset)
save_preprocessed_json(processed_data, output_dataset)


Saved JSON dataset to /content/Quran_normalized_ur_en.json


**1.3 - Merging Normalized Arabic with Normalized Urdu and English**

In [None]:
import json

# Inputs: the Urdu/English records and the Arabic records.
# Output: one merged JSON document per line (JSONL).
data_ur_en = "/content/5_Quran_normalized_ur_en.json"
data_ar = "/content/4_Quran_normalized_arabic_final.json"
output_full = "/content/6_Quran_Dataset_preprocessed_full.jsonl"

with open(data_ur_en, "r", encoding="utf-8") as f:
    ur_en = json.load(f)

with open(data_ar, "r", encoding="utf-8") as f:
    ar_only = json.load(f)

# Arabic text indexed by quran_id, used to join the two datasets.
ar_map = {}
for item in ar_only:
    ar_map[item["quran_id"]] = item["text"]

# quran_ids present in the Urdu/English data but absent from the Arabic data.
missing_ids = []

with open(output_full, "w", encoding="utf-8") as out:

    for rec in ur_en:
        qid = rec["quran_id"]

        # The original Arabic field is replaced by the text from ar_map.
        rec.pop("text_ar", None)

        try:
            rec["normalized_ar"] = ar_map[qid]
        except KeyError:
            rec["normalized_ar"] = ""
            missing_ids.append(qid)

        print(json.dumps(rec, ensure_ascii=False), file=out)

print(f"JSONL merged successfully → {output_full}")
print("Missing Arabic quran_ids:", missing_ids)


JSONL merged successfully → /content/Quran_Dataset_preprocessed_full.jsonl
Missing Arabic quran_ids: []


In [None]:
import json

# INPUT: JSONL file read line by line and passed through reformat_record.
# OUTPUT: JSONL file that receives the reformatted records.
INPUT = "/content/6_Quran_Dataset_preprocessed_full.jsonl"
OUTPUT = "/content/7_Quran_Preprocessed_FINAL.jsonl"

def reformat_record(rec):
    """Return a new dict holding only the final fields of ``rec``, in a fixed order.

    Keys of ``rec`` that are not listed below are dropped; a listed key that is
    missing from ``rec`` raises ``KeyError``.
    """
    ordered_fields = (
        # identifiers
        "quran_id",
        "juz",
        "surah_id",
        "ayah_id",
        "source",
        # surah metadata
        "transliteration",
        "surah_name_ar",
        "surah_name_ur",
        "surah_name_en",
        "surah_type",
        # normalized text
        "normalized_ar",
        "normalized_ur",
        "normalized_en",
    )
    return {field: rec[field] for field in ordered_fields}

# Stream INPUT line by line, reformat each record, and write it to OUTPUT.
# Both files are managed by a single `with` statement so the output handle is
# flushed and closed even if a line fails to parse or lacks an expected key;
# the earlier bare open()/close() pair leaked the handle in that case. INPUT is
# opened first so that a missing input no longer truncates an existing OUTPUT.
with open(INPUT, "r", encoding="utf-8") as f, \
        open(OUTPUT, "w", encoding="utf-8") as output_file:
    for line in f:
        # Skip blank lines (e.g. a trailing newline) instead of failing in json.loads.
        if not line.strip():
            continue
        rec = json.loads(line)
        new_rec = reformat_record(rec)
        output_file.write(json.dumps(new_rec, ensure_ascii=False) + "\n")

print("Saved reformatted file →", OUTPUT)


Saved reformatted file → /content/Quran_Preprocessed_FINAL.jsonl
