# **1 - Hadith (Bukhari) Dataset Preprocessing**

**1.1 - Hadith Urdu & English Normalization**

In [1]:
%%capture
!pip install camel-tools
!pip install pyarabic
!pip install clean-text
!pip install ftfy
!pip install unidecode --quiet

import re
import json
import unicodedata

from camel_tools.utils.dediac import dediac_ar
from pyarabic import araby
from cleantext import clean
from ftfy import fix_text


In [2]:
# Input path: the raw hadith JSON handed to load_and_preprocess_dataset further down.
raw_dataset = "/content/6_Al_Hadith_Dataset_FINAL.json"
# Output path: where save_preprocessed_json writes the normalized records.
output_dataset = "/content/1_Al_Hadith_Dataset_Preprocessed.json"


In [3]:
import re
import unicodedata

# Urdu/Arabic punctuation plus a few ASCII marks; each match becomes a space.
URDU_PUNCT_RE = re.compile(r"[،۔؛؟!,:;]")

# Arabic-block marks (vowel signs and annotation signs in these ranges)
# that are deleted outright rather than replaced by a space.
DIACRITICS_RE = re.compile(r"[\u0610-\u061A\u064B-\u065F\u0670\u06D6-\u06ED]")

# Character-for-character folds applied before any stripping.
# NOTE(review): the ھ -> ہ fold also rewrites the he inside aspirated digraphs
# (e.g. بھ); confirm that losing this distinction is intended.
URDU_MAP = {
    "ي": "ی",
    "ى": "ی",
    "ئ": "ی",
    "ؤ": "و",
    "ك": "ک",
    "ھ": "ہ",
    "ة": "ہ",
}

# Matches everything outside the Arabic Unicode block (U+0600-U+06FF),
# ASCII digits, and whitespace.
NON_URDU_ALLOWED = re.compile(r"[^\u0600-\u06FF0-9\s]")


def normalize_urdu(text: str) -> str:
    """Return a normalized, punctuation-free form of an Urdu string.

    The pipeline runs in this order: NFC composition, the URDU_MAP character
    folds, deletion of DIACRITICS_RE matches, punctuation to spaces, tatweel
    removal, replacement of every NON_URDU_ALLOWED match with a space, and
    finally whitespace collapsing and trimming.

    Args:
        text: The string to normalize. Any falsy value (``""``, ``None``)
            yields an empty string.

    Returns:
        The normalized string with single spaces between tokens.
    """
    if not text:
        return ""

    # Compose first so that base letter + combining mark pairs which have a
    # precomposed form survive the diacritic stripping below.
    result = unicodedata.normalize("NFC", text)

    for variant, canonical in URDU_MAP.items():
        result = result.replace(variant, canonical)

    result = DIACRITICS_RE.sub("", result)
    # Punctuation separates words, so it turns into a space; tatweel is a
    # purely visual elongation, so it is dropped without leaving a gap.
    result = URDU_PUNCT_RE.sub(" ", result).replace("ـ", "")
    result = NON_URDU_ALLOWED.sub(" ", result)

    return re.sub(r"\s+", " ", result).strip()


In [5]:
def normalize_english(text: str) -> str:
    """Return a cleaned, whitespace-collapsed form of an English string.

    The text is first passed through ``fix_text`` and then through ``clean``
    with lower-casing, Unicode fixing, URL handling, punctuation handling and
    emoji handling switched on and digit handling switched off. The exact
    effect of each flag is defined by the clean-text library.

    Args:
        text: The string to normalize. Any falsy value (``""``, ``None``)
            yields an empty string.

    Returns:
        The cleaned string with runs of whitespace collapsed to one space and
        no leading or trailing whitespace.
    """
    if not text:
        return ""

    cleaning_flags = {
        "lower": True,
        "fix_unicode": True,
        "no_urls": True,
        "no_punct": True,
        "no_emoji": True,
        "no_digits": False,
    }
    cleaned = clean(fix_text(text), **cleaning_flags)

    return re.sub(r"\s+", " ", cleaned).strip()


In [None]:
# Example input record (these are the fields read by enrich_record):
# {
#         "hadith_id": 1,
#         "source": "Hadith",
#         "book": "Sahih Bukhari",
#         "book_id": 1,
#         "chapter_id": 1,
#         "chapter_name_ar": "كتاب بدء الوحى",
#         "chapter_name_ur": "وحی کے بیان میں",
#         "chapter_name_en": "Revelation",
#         "arabic_text": "حَدَّثَنَا الْحُمَيْدِيُّ عَبْدُ اللَّهِ بْنُ الزُّبَيْرِ، قَالَ حَدَّثَنَا سُفْيَانُ، قَالَ حَدَّثَنَا يَحْيَى بْنُ سَعِيدٍ الأَنْصَارِيُّ، قَالَ أَخْبَرَنِي مُحَمَّدُ بْنُ إِبْرَاهِيمَ التَّيْمِيُّ، أَنَّهُ سَمِعَ عَلْقَمَةَ بْنَ وَقَّاصٍ اللَّيْثِيَّ، يَقُولُ سَمِعْتُ عُمَرَ بْنَ الْخَطَّابِ ـ رضى الله عنه ـ عَلَى الْمِنْبَرِ قَالَ سَمِعْتُ رَسُولَ اللَّهِ صلى الله عليه وسلم يَقُولُ ‏ \"‏ إِنَّمَا الأَعْمَالُ بِالنِّيَّاتِ، وَإِنَّمَا لِكُلِّ امْرِئٍ مَا نَوَى، فَمَنْ كَانَتْ هِجْرَتُهُ إِلَى دُنْيَا يُصِيبُهَا أَوْ إِلَى امْرَأَةٍ يَنْكِحُهَا فَهِجْرَتُهُ إِلَى مَا هَاجَرَ إِلَيْهِ ‏\"‏‏.‏",
#         "urdu_text": "آپ صلی اللہ علیہ وسلم فرما رہے تھے کہ تمام اعمال کا دارومدار نیت پر ہے اور ہر عمل کا نتیجہ ہر انسان کو اس کی نیت کے مطابق ہی ملے گا ۔ پس جس کی ہجرت ( ترک وطن ) دولت دنیا حاصل کرنے کے لیے ہو یا کسی عورت سے شادی کی غرض ہو ۔ پس اس کی ہجرت ان ہی چیزوں کے لیے ہو گی جن کے حاصل کرنے کی نیت سے اس نے ہجرت کی ہے ۔",
#         "english_text": "I heard Allah's Messenger (ﷺ) saying, \"The reward of deeds depends upon the intentions and every person will get the reward according to what he has intended. So whoever emigrated for worldly benefits or for a woman to marry, his emigration was for what he emigrated for.\"",
#         "narrator": "Narrated 'Umar bin Al-Khattab (May Allah be pleased with him):",
#         "status": "Sahih"
#     },

In [6]:
def enrich_record(item: dict) -> dict:
    """Build the preprocessed record for one raw hadith entry.

    Identifier, metadata and ``arabic_text`` fields are copied through
    unchanged. ``urdu_text`` and ``english_text`` are not copied; their
    normalized forms are stored as ``normalized_ur`` and ``normalized_en``.

    Args:
        item: Raw record. Every copied field must be present; ``urdu_text``
            and ``english_text`` are optional and default to ``""``.

    Returns:
        A new dict with the copied fields followed by the two normalized
        text fields.

    Raises:
        KeyError: If any copied field is missing from ``item``.
    """
    # Order matters: it fixes the key order of the emitted record.
    copied_fields = (
        "hadith_id",
        "chapter_id",
        "book_id",
        "book",
        "source",
        "chapter_name_ar",
        "chapter_name_ur",
        "chapter_name_en",
        "narrator",
        "status",
        "arabic_text",
    )

    record = {field: item[field] for field in copied_fields}
    record["normalized_ur"] = normalize_urdu(item.get("urdu_text", ""))
    record["normalized_en"] = normalize_english(item.get("english_text", ""))
    return record

In [7]:
def load_and_preprocess_dataset(path: str):
    """Load raw records from a JSON array or JSON Lines file and enrich each one.

    The format is detected from the content: if the text (ignoring
    surrounding whitespace) starts with ``[`` it is parsed as one JSON array;
    otherwise every non-blank line is parsed as its own JSON document.

    Args:
        path: Path of the UTF-8 encoded input file, with or without a BOM.

    Returns:
        A list holding ``enrich_record(rec)`` for every loaded record, in
        file order. An empty file yields an empty list.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the array or any line is not valid JSON.
    """
    # "utf-8-sig" reads plain UTF-8 unchanged but drops a leading BOM, which
    # would otherwise hide the "[" and make json.loads reject the text.
    with open(path, "r", encoding="utf-8-sig") as f:
        raw = f.read().strip()

    if raw.startswith("["):
        data = json.loads(raw)
    else:
        # Split on "\n" only. str.splitlines() also breaks on U+2028, U+2029
        # and U+0085, which may appear unescaped inside JSON strings and would
        # cut such a record in half. Text-mode reading has already turned
        # "\r\n" and "\r" into "\n".
        data = [json.loads(line) for line in raw.split("\n") if line.strip()]

    return [enrich_record(rec) for rec in data]

def save_preprocessed_json(records, output_path):
    """Write ``records`` to ``output_path`` as indented UTF-8 JSON.

    Non-ASCII characters are written as-is rather than as ``\\uXXXX``
    escapes. A confirmation line is printed once the file has been written.

    Args:
        records: JSON-serializable object to write.
        output_path: Destination file path; an existing file is overwritten.
    """
    with open(output_path, mode="w", encoding="utf-8") as sink:
        json.dump(records, sink, indent=2, ensure_ascii=False)

    confirmation = f"\nSaved JSON dataset to {output_path}"
    print(confirmation)

# Run stage 1.1: load and normalize every record from raw_dataset, then
# write the result to output_dataset.
processed_data = load_and_preprocess_dataset(raw_dataset)
save_preprocessed_json(processed_data, output_dataset)


Saved JSON dataset to /content/1_Al_Hadith_Dataset_Preprocessed.json


**1.2 - Removing Arabic Text**

In [9]:
# Stage 1.2: read the preprocessed records, drop the "arabic_text" field from
# each one, and write the result as JSON Lines (one record per line).
input_path = "/content/1_Al_Hadith_Dataset_Preprocessed.json"
output_path = "/content/2_Bukhari_Hadith_Preprocessed_FINAL.jsonl"

# The input format is chosen by file extension: ".jsonl" is parsed line by
# line (blank lines skipped), anything else as a single JSON document.
with open(input_path, "r", encoding="utf-8") as f:
    if input_path.endswith(".jsonl"):
        data = [json.loads(line) for line in f if line.strip()]
    else:
        data = json.load(f)

print(f"Loaded {len(data)} records")

# Records are modified in place; those without the field are left untouched.
for item in data:
    item.pop("arabic_text", None)

with open(output_path, "w", encoding="utf-8") as f:
    for obj in data:
        # print() appends the "\n" that terminates each JSON Lines record.
        print(json.dumps(obj, ensure_ascii=False), file=f)

print(f"\nSaved JSONL with {len(data)} records ---> {output_path}")


Loaded 7563 records

Saved JSONL with 7563 records ---> /content/2_Bukhari_Hadith_Preprocessed_FINAL.jsonl
