# **1 - Quran Datasets Download**

In [None]:
import requests
from pathlib import Path

def download_file(url: str, save_path: Path) -> bool:
    """Download ``url`` and store the response body at ``save_path``.

    The body is written as the raw bytes received from the server. Going
    through ``response.text`` would first decode the payload with whatever
    charset ``requests`` derives (or guesses) and then re-encode it, which can
    corrupt Arabic/Urdu text when the guess is wrong.

    Args:
        url: Address of the file to fetch.
        save_path: Destination file; missing parent directories are created.

    Returns:
        True when the file was downloaded and written, False when the request
        or the write failed (the reason is printed).
    """
    try:
        response = requests.get(url, timeout=20)
        response.raise_for_status()
    except requests.RequestException as e:
        print(f'Could not download {url}\nReason: {e}')
        return False

    try:
        save_path.parent.mkdir(parents=True, exist_ok=True)
        save_path.write_bytes(response.content)
    except OSError as e:
        # Keep the boolean contract for disk problems as well.
        print(f'Could not save {save_path}\nReason: {e}')
        return False

    print(f"[OK] Saved: {save_path}")
    return True

# Quran dataset list: local file name -> download URL.
datasets = {
    "Quran_ar.json": "https://cdn.jsdelivr.net/npm/quran-json@3.1.2/dist/quran.json",
    "Quran_ur.json": "https://cdn.jsdelivr.net/npm/quran-json@3.1.2/dist/quran_ur.json",
    "Quran_en.json": "https://cdn.jsdelivr.net/npm/quran-json@3.1.2/dist/quran_en.json",
}

# base directory in Google Colab..
DATA_DIR = Path("/content/Quran_datasets")

print("Downloading the Quran Datasets (Arabic / Urdu / English)\n")

# download_file() reports failure through its return value; remember the
# failed files so the summary below does not claim success unconditionally.
failed_files = []
for filename, url in datasets.items():
    save_path = DATA_DIR / filename
    if not download_file(url, save_path):
        failed_files.append(filename)

if failed_files:
    print(f"\nFailed to download {len(failed_files)} of {len(datasets)} datasets: {', '.join(failed_files)}")
else:
    print("\nAll Quran datasets downloaded successfully!")
print(f"\nFiles are stored in: {DATA_DIR.resolve()}")


Downloading the Quran Datasets (Arabic / Urdu / English)

[OK] Saved: /content/Quran_datasets/Quran_ar.json
[OK] Saved: /content/Quran_datasets/Quran_ur.json
[OK] Saved: /content/Quran_datasets/Quran_en.json

All Quran datasets downloaded successfully!

Files are stored in: /content/Quran_datasets


**1.1 - Quran Datasets Merge**

In [None]:
import json
from pathlib import Path

DATA_DIR = Path("/content/Quran_datasets")

# Load the three downloaded editions (Arabic text, Urdu and English translations).
with open(DATA_DIR / "Quran_ar.json", "r", encoding="utf-8") as f:
    quran_ar = json.load(f)

with open(DATA_DIR / "Quran_ur.json", "r", encoding="utf-8") as f:
    quran_ur = json.load(f)

with open(DATA_DIR / "Quran_en.json", "r", encoding="utf-8") as f:
    quran_en = json.load(f)

print("Datasets loaded successfully!")

# zip() stops at the shortest input without complaint, so a truncated or
# mismatched file would otherwise produce a short or misaligned merge.
if not (len(quran_ar) == len(quran_en) == len(quran_ur)):
    raise ValueError(
        "Surah count mismatch between editions: "
        f"ar={len(quran_ar)}, en={len(quran_en)}, ur={len(quran_ur)}"
    )

merged_quran = []
global_quran_id = 1  # running verse number across the whole merged list

for surah_ar, surah_en, surah_ur in zip(quran_ar, quran_en, quran_ur):

    surah_id = surah_ar["id"]
    surah_name_ar = surah_ar["name"]
    surah_name_en = surah_en["translation"]
    surah_name_ur = surah_ur["translation"]

    transliteration = surah_ar["transliteration"]
    surah_type = surah_ar["type"]

    verses_ar = surah_ar["verses"]
    verses_en = surah_en["verses"]
    verses_ur = surah_ur["verses"]

    # Same reasoning as above, one level down: every edition must provide
    # the same number of verses for this surah.
    if not (len(verses_ar) == len(verses_en) == len(verses_ur)):
        raise ValueError(
            f"Verse count mismatch in surah {surah_id}: "
            f"ar={len(verses_ar)}, en={len(verses_en)}, ur={len(verses_ur)}"
        )

    for v_ar, v_en, v_ur in zip(verses_ar, verses_en, verses_ur):

        ayah_id = v_ar["id"]

        doc = {
            "doc_id": f"Q_{surah_id}_{ayah_id}",
            "quran_id": global_quran_id,
            "surah_id": surah_id,
            "ayah_id": ayah_id,
            "source": "Quran",

            "surah_name_ar": surah_name_ar,
            "surah_name_en": surah_name_en,
            "surah_name_ur": surah_name_ur,

            "transliteration": transliteration,
            "surah_type": surah_type,

            "text_ar": v_ar["text"],
            "text_en": v_en["translation"],
            "text_ur": v_ur["translation"]
        }

        merged_quran.append(doc)
        global_quran_id += 1

OUTPUT_PATH = Path("/content/1_Quran_merged.json")
OUTPUT_PATH.write_text(json.dumps(merged_quran, ensure_ascii=False, indent=2), encoding="utf-8")

print(f"\nMerged Quran dataset created!\n\nSaved at: {OUTPUT_PATH}")
print(f"\nTotal verses merged: {len(merged_quran)}")


Datasets loaded successfully!

Merged Quran dataset created!

Saved at: /content/Quran_merged.json

Total verses merged: 6236


**1.2 - Juz in Quran Dataset**

In [None]:
import json
from pathlib import Path

# Read back the merged verse list written by the merge step.
MERGED_PATH = Path("/content/1_Quran_merged.json")

with MERGED_PATH.open("r", encoding="utf-8") as merged_file:
    merged_quran = json.load(merged_file)

record_count = len(merged_quran)
print(f"Merged Quran records loaded: {record_count}")

# Juz boundaries..
# One entry per juz (1-30). "start" and "end" are (surah_id, ayah_id) pairs;
# both ends are treated as inclusive by is_in_range() below. Each range starts
# right after the previous one ends, so the table is contiguous from (1, 1)
# to (114, 6).
# NOTE(review): printed mushaf editions differ on a handful of juz boundaries -
# confirm these values follow the edition this project intends to use.
juz_boundaries = [
    {"juz": 1,  "start": (1, 1),   "end": (2, 141)},
    {"juz": 2,  "start": (2, 142), "end": (2, 252)},
    {"juz": 3,  "start": (2, 253), "end": (3, 91)},
    {"juz": 4,  "start": (3, 92),  "end": (4, 23)},
    {"juz": 5,  "start": (4, 24),  "end": (4, 147)},
    {"juz": 6,  "start": (4, 148), "end": (5, 82)},
    {"juz": 7,  "start": (5, 83),  "end": (6, 110)},
    {"juz": 8,  "start": (6, 111), "end": (7, 87)},
    {"juz": 9,  "start": (7, 88),  "end": (8, 40)},
    {"juz": 10, "start": (8, 41),  "end": (9, 93)},
    {"juz": 11, "start": (9, 94),  "end": (11, 5)},
    {"juz": 12, "start": (11, 6),  "end": (12, 52)},
    {"juz": 13, "start": (12, 53), "end": (15, 1)},
    {"juz": 14, "start": (15, 2),  "end": (16, 128)},
    {"juz": 15, "start": (17, 1),  "end": (18, 74)},
    {"juz": 16, "start": (18, 75), "end": (20, 135)},
    {"juz": 17, "start": (21, 1),  "end": (22, 78)},
    {"juz": 18, "start": (23, 1),  "end": (25, 20)},
    {"juz": 19, "start": (25, 21), "end": (27, 59)},
    {"juz": 20, "start": (27, 60), "end": (29, 44)},
    {"juz": 21, "start": (29, 45), "end": (33, 30)},
    {"juz": 22, "start": (33, 31), "end": (36, 21)},
    {"juz": 23, "start": (36, 22), "end": (39, 31)},
    {"juz": 24, "start": (39, 32), "end": (41, 46)},
    {"juz": 25, "start": (41, 47), "end": (45, 37)},
    {"juz": 26, "start": (46, 1),  "end": (51, 30)},
    {"juz": 27, "start": (51, 31), "end": (57, 29)},
    {"juz": 28, "start": (58, 1),  "end": (66, 12)},
    {"juz": 29, "start": (67, 1),  "end": (77, 50)},
    {"juz": 30, "start": (78, 1),  "end": (114, 6)},
]

def is_in_range(surah, ayah, start, end):
    """Tell whether verse (surah, ayah) lies between ``start`` and ``end``.

    ``start`` and ``end`` are (surah, ayah) pairs; both ends are inclusive.
    Verses are ordered by surah first and by ayah second, which is exactly
    the ordering Python applies when comparing tuples.
    """
    first_surah, first_ayah = start
    last_surah, last_ayah = end

    position = (surah, ayah)
    return (first_surah, first_ayah) <= position <= (last_surah, last_ayah)

# assigning the Juz numbers..
# Every ayah must land in exactly one range. Ayahs that match no range are
# collected instead of being silently left without a "juz" field.
unassigned = []
for doc in merged_quran:
    surah = doc["surah_id"]
    ayah = doc["ayah_id"]

    # assign juz by range lookup (first matching range wins)...
    juz = next(
        (
            item["juz"]
            for item in juz_boundaries
            if is_in_range(surah, ayah, item["start"], item["end"])
        ),
        None,
    )

    if juz is None:
        unassigned.append(doc.get("doc_id", f"{surah}:{ayah}"))
    else:
        doc["juz"] = juz

if unassigned:
    # Do not write an incomplete file and then report success.
    raise ValueError(
        f"{len(unassigned)} ayahs fall outside every juz range, e.g. {unassigned[:10]}"
    )

OUTPUT_PATH = Path("/content/1_Quran_merged_juz.json")

OUTPUT_PATH.write_text(
    json.dumps(merged_quran, ensure_ascii=False, indent=2),
    encoding="utf-8"
)

print("\nJuz added successfully!")
print(f"\nSaved updated file to: {OUTPUT_PATH}")


Merged Quran records loaded: 6236

Juz added successfully!

Saved updated file to: /content/Quran_merged_juz.json


**1.3 - Quran Dataset Analysis**

In [None]:
import json
import re
from pathlib import Path

DATA_PATH = Path("/content/1_Quran_merged_juz.json")
quran = json.loads(DATA_PATH.read_text(encoding="utf-8"))

print(f"Loaded {len(quran)} ayahs.\n")

required_fields = [
    "doc_id","quran_id","surah_id","ayah_id",
    "surah_name_ar","surah_name_en","surah_name_ur",
    "transliteration","surah_type",
    "text_ar","text_en","text_ur",
    "juz"
]

pat = re.compile(r"^Q_\d+_\d+$")
arabic_pat = re.compile(r"[\u0600-\u06FF]")


def _is_nonblank_text(value):
    """Return True when ``value`` is a string with non-whitespace content."""
    return isinstance(value, str) and value.strip() != ""


def _is_within(value, low, high):
    """Return True when ``value`` is a number in the inclusive range [low, high]."""
    return isinstance(value, (int, float)) and low <= value <= high


# Each check reads fields with .get() and guards the type first, so a record
# with a missing or None field makes the check FAIL instead of aborting the
# whole report with a KeyError/TypeError.
checks = {
    # 1 - required fields present and neither "" nor None
    "required_fields": all(
        d.get(k) not in ("", None) for d in quran for k in required_fields
    ),
    # 2 - doc_id format
    "doc_id_format": all(
        isinstance(d.get("doc_id"), str) and pat.match(d["doc_id"]) is not None
        for d in quran
    ),
    # 3 - arabic text contains at least one character of the Arabic block
    "arabic_text": all(
        isinstance(d.get("text_ar"), str) and arabic_pat.search(d["text_ar"]) is not None
        for d in quran
    ),
    # 4 - english translation is a non-empty string
    "english_translation": all(_is_nonblank_text(d.get("text_en")) for d in quran),
    # 5 - urdu translation is a non-empty string
    "urdu_translation": all(_is_nonblank_text(d.get("text_ur")) for d in quran),
    # 6 - quran_id equals the 1-based position in the file
    "quran_id_order": all(
        d.get("quran_id") == i for i, d in enumerate(quran, start=1)
    ),
    # 7 - surah_id valid
    "surah_range": all(_is_within(d.get("surah_id"), 1, 114) for d in quran),
    # 8 - juz valid
    "juz_range": all(_is_within(d.get("juz"), 1, 30) for d in quran),
}

print("Required Fields Check:", "PASS" if checks["required_fields"] else "FAIL")
print("\ndoc_id Format Check:", "PASS" if checks["doc_id_format"] else "FAIL")

print("\nArabic Text Check:", "PASS" if checks["arabic_text"] else "FAIL")
print("Urdu Translation Check:", "PASS" if checks["urdu_translation"] else "FAIL")
print("English Translation Check:", "PASS" if checks["english_translation"] else "FAIL")

print("\nquran_id Order Check:", "PASS" if checks["quran_id_order"] else "FAIL")
print("Surah Range Check:", "PASS" if checks["surah_range"] else "FAIL")
print("Juz Range Check:", "PASS" if checks["juz_range"] else "FAIL")


Loaded 6236 ayahs.

Required Fields Check: PASS

doc_id Format Check: PASS

Arabic Text Check: PASS
Urdu Translation Check: PASS
English Translation Check: PASS

quran_id Order Check: PASS
Surah Range Check: PASS
Juz Range Check: PASS


In [None]:
import json
import pandas as pd
import numpy as np
import os

file_path = '/content/1_Quran_merged_juz.json'

if not os.path.exists(file_path):
    print(f"ERROR: File not found at: {file_path}\n")
    print("Please check the path and try again.")
else:
    print(f"Scanning {file_path}...")

    with open(file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    df = pd.DataFrame(data)

    # ensuring that quran_id is numeric..
    # Using errors = 'coerce' turns non-numeric values into NaN..
    df['quran_id'] = pd.to_numeric(df['quran_id'], errors='coerce')

    # Rows whose id is missing, non-numeric or fractional. They are counted
    # explicitly: simply dropping them (as dropna() would) lets a file with
    # broken ids still be reported as a perfect sequence.
    invalid_mask = df['quran_id'].isna() | (df['quran_id'] % 1 != 0)
    invalid_count = int(invalid_mask.sum())
    valid_rows = df.loc[~invalid_mask]

    print(f"\nVALIDATION REPORT FOR 'quran_id")

    if invalid_count:
        print(f"FAILED: Found {invalid_count} rows with a missing or non-integer Quran ID.")

    # 1 - checking for duplicates (among the valid ids)..
    duplicates = valid_rows[valid_rows.duplicated('quran_id', keep=False)]
    if not duplicates.empty:
        print(f"FAILED: Found {duplicates['quran_id'].nunique()} duplicate Quran IDs.")
        print("Duplicate IDs:", duplicates['quran_id'].unique().tolist()[:20], "...")
    else:
        print("\nDuplicates: None")

    # 2 - checking for the missing Numbers (gaps in sequence)...
    valid_ids = valid_rows['quran_id'].astype(int)

    if valid_ids.empty:
         print("FAILED: 'quran_id' column is empty or invalid.")
    else:
        max_id = int(valid_ids.max())
        expected_ids = set(range(1, max_id + 1))
        actual_ids = set(valid_ids)

        missing_ids = sorted(expected_ids - actual_ids)

        if missing_ids:
            print(f"FAILED: Sequence is broken. Found {len(missing_ids)} missing IDs.")
            print(f"First 10 missing IDs: {missing_ids[:10]}")
        else:
            print("Sequence: Perfect (1 to Last)")

        # 3 - checking Start
        min_id = int(valid_ids.min())
        if min_id != 1:
            print(f"WARNING: Sequence starts at {min_id}, not 1.")
        else:
            print("Start: Sequence starts at 1.")

        # 4 - total Count Check..
        print("\nSUMMARY")
        print(f"\nTotal Rows: {len(df)}")
        print(f"Max Quran ID found: {max_id}")

        is_clean = (
            invalid_count == 0
            and duplicates.empty
            and not missing_ids
            and min_id == 1
        )
        if is_clean:
            print("\nRESULT: PERFECT SEQUENCE. 'quran_id' is clean.")
        else:
            print("\nRESULT: DATASET HAS ERRORS. Fix duplicates/missing IDs before using.")


Scanning /content/Quran_merged_juz.json...

VALIDATION REPORT FOR 'quran_id

Duplicates: None
Sequence: Perfect (1 to Last)
Start: Sequence starts at 1.

SUMMARY

Total Rows: 6236
Max Quran ID found: 6236

RESULT: PERFECT SEQUENCE. 'quran_id' is clean.


# **2 - Hadith Dataset from Hadith API**

In [None]:
import requests
import json
import time
import sys

# Settings for the Hadith API download below.
# NOTE(review): API_KEY is a credential stored in plain text in the notebook.
# Anyone with access to this file can use it; consider rotating the key and
# loading it from an environment variable or a secrets store instead.
API_KEY = "$2y$10$bxpfV3LpNWhUKJvdVcOqe9dLd4jeM2x55z9gegNcONYekiI4maS"
BASE_URL = "https://hadithapi.com/api/hadiths"
BOOK_SLUG = "sahih-bukhari"
# Relative path: the file is created in the current working directory.
OUTPUT_FILE = "1_Sahih_Bukhari_HadithAPI.json"

PER_PAGE = 50       # hadiths requested per page
BASE_DELAY = 1.5    # pause (seconds) between two successful page requests
MAX_RETRIES = 3     # attempts per page before giving up

all_hadith = []
page = 1

print(f"Starting Download for {BOOK_SLUG}\n")

try:
    while True:
        url = f"{BASE_URL}?book={BOOK_SLUG}&paginate={PER_PAGE}&page={page}&apiKey={API_KEY}"

        success = False    # this page was fetched and parsed
        finished = False   # no further pages need to be requested

        for attempt in range(MAX_RETRIES):
            try:
                print(f"Fetching page {page} (Attempt {attempt + 1})...", end=" ")
                resp = requests.get(url, timeout=10)

                if resp.status_code == 429:
                    print("\nToo Many Requests! Cooling down for 30 seconds...")
                    time.sleep(30)
                    continue

                if resp.status_code != 200:
                    print(f"Error: {resp.status_code}")
                    time.sleep(5)
                    continue

                data = resp.json()
                # "or {}" / "or []" also cover keys that are present but null.
                hadiths = data.get("hadiths") or {}
                hadith_list = hadiths.get("data") or []

                success = True

                if not hadith_list:
                    # An empty page means the end of the data: stop paging
                    # instead of requesting ever higher page numbers.
                    print("\nNo more data found. Download complete!")
                    finished = True
                    break

                for h in hadith_list:
                    book = h.get("book") or {}
                    chapter = h.get("chapter") or {}
                    formatted = {
                        "doc_id": f"H_Bukhari_{h.get('hadithNumber', 'Unknown')}",
                        "source": "Hadith",
                        "book": "Sahih Bukhari",
                        "book_id": book.get("id"),
                        "chapter_id": chapter.get("id"),
                        "hadith_id": h.get("hadithNumber"),
                        "bukhari_id": h.get("id"),

                        "chapter_name_ar": chapter.get("chapterArabic"),
                        "chapter_name_en": chapter.get("chapterEnglish"),
                        "chapter_name_ur": chapter.get("chapterUrdu"),

                        "arabic_text": h.get("hadithArabic"),
                        "english_text": h.get("hadithEnglish"),
                        "urdu_text": h.get("hadithUrdu"),

                        "narrator": h.get("englishNarrator"),
                        "status": h.get("status", "Sahih"),
                        "tags": []
                    }
                    all_hadith.append(formatted)

                print(f"Done. (Total: {len(all_hadith)})")

                last_page = hadiths.get("last_page")
                if last_page and page >= last_page:
                    print("Reached the last page defined by API.")
                    finished = True

                break

            except (requests.exceptions.RequestException, ValueError) as e:
                # ValueError covers a body that is not valid JSON.
                print(f"\nNetwork Error: {e}. Retrying in 5s")
                time.sleep(5)

        if not success:
            print(f"\nFailed to fetch page {page} after {MAX_RETRIES} attempts. Stopping safely.")
            break

        if finished:
            break

        page += 1
        time.sleep(BASE_DELAY)

except KeyboardInterrupt:
    print("\nScript stopped manually by user.")

finally:
    # Runs on normal completion, on failure and on manual interruption, so
    # whatever was collected so far is never lost.
    print(f"\nSaving {len(all_hadith)} Hadiths to file.")
    if all_hadith:
        with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
            json.dump(all_hadith, f, ensure_ascii=False, indent=2)
        print(f"\nSuccessfully saved to {OUTPUT_FILE}")
    else:
        print("No data collected to save.")


Starting Download for sahih-bukhari

Fetching page 1 (Attempt 1)... Done. (Total: 50)
Fetching page 2 (Attempt 1)... Done. (Total: 100)
Fetching page 3 (Attempt 1)... Done. (Total: 150)
Fetching page 4 (Attempt 1)... Done. (Total: 200)
Fetching page 5 (Attempt 1)... Done. (Total: 250)
Fetching page 6 (Attempt 1)... Done. (Total: 300)
Fetching page 7 (Attempt 1)... Done. (Total: 350)
Fetching page 8 (Attempt 1)... Done. (Total: 400)
Fetching page 9 (Attempt 1)... Done. (Total: 450)
Fetching page 10 (Attempt 1)... Done. (Total: 500)
Fetching page 11 (Attempt 1)... Done. (Total: 550)
Fetching page 12 (Attempt 1)... Done. (Total: 600)
Fetching page 13 (Attempt 1)... Done. (Total: 650)
Fetching page 14 (Attempt 1)... Done. (Total: 700)
Fetching page 15 (Attempt 1)... Done. (Total: 750)
Fetching page 16 (Attempt 1)... Done. (Total: 800)
Fetching page 17 (Attempt 1)... Done. (Total: 850)
Fetching page 18 (Attempt 1)... Done. (Total: 900)
Fetching page 19 (Attempt 1)... Done. (Total: 950)
Fetc

**2.1 - Hadith Dataset Sorting**

In [None]:
import json
import re

# Raw API dump to read, and the sorted copy this step produces.
input_filename, output_filename = (
    "/content/1_Sahih_Bukhari_HadithAPI.json",
    "2_Sahih_Bukhari_sorted.json",
)

print("Loading file...")
with open(input_filename, mode="r", encoding="utf-8") as source:
    data = json.loads(source.read())

def get_sort_key(item):
    """Return the first run of digits in ``item["hadith_id"]`` as an int.

    The id is converted to text first, so numeric and textual ids are both
    accepted. Records without an id, or whose id contains no digit, sort
    first with key 0.
    """
    first_number = re.search(r'\d+', str(item.get("hadith_id", "0")))
    return int(first_number.group()) if first_number else 0

print("\nSorting hadiths...")
# Work on a copy so the loaded list keeps its original order.
sorted_data = list(data)
sorted_data.sort(key=get_sort_key)

print(f"\nSaving to {output_filename}...")
with open(output_filename, mode="w", encoding="utf-8") as sink:
    json.dump(sorted_data, sink, ensure_ascii=False, indent=2)

print("\nDone! Your file is now perfectly sorted.")


Loading file...

Sorting hadiths...

Saving to Sahih_Bukhari_sorted.json...

Done! Your file is now perfectly sorted.


**2.2 - Hadith Dataset Analysis**

In [None]:
import pandas as pd

FILE_PATH = "/content/2_Sahih_Bukhari_sorted.json"

# File extension -> function that loads such a file into a DataFrame.
_READERS = {
    ".json": pd.read_json,
    ".csv": pd.read_csv,
    ".ndjson": lambda path: pd.read_json(path, lines=True),
}

for _suffix, _reader in _READERS.items():
    if FILE_PATH.endswith(_suffix):
        df = _reader(FILE_PATH)
        break
else:
    raise ValueError("Unsupported file format")

# One row per distinct chapter (id plus its three names), ordered by id.
_chapter_columns = ["chapter_id", "chapter_name_ar", "chapter_name_en", "chapter_name_ur"]
chapters = df[_chapter_columns].drop_duplicates()
chapters = chapters.sort_values("chapter_id")
chapters = chapters.reset_index(drop=True)

print("Beautiful Chapter Listing (Your Dataset)")

for _, row in chapters.iterrows():
    listing = [
        f"\nChapter ---> {row['chapter_id']}",
        f"Arabic   :   {row['chapter_name_ar']}",
        f"Urdu   :   {row['chapter_name_ur']}",
        f"English   :   {row['chapter_name_en']}",
    ]
    print("\n".join(listing))

print(f"\nTotal Chapters Found: {len(chapters)}")


Beautiful Chapter Listing (Your Dataset)

Chapter ---> 1
Arabic   :   كتاب بدء الوحى
Urdu   :   وحی کے بیان میں
English   :   Revelation

Chapter ---> 2
Arabic   :   كتاب الإيمان
Urdu   :   ایمان کے بیان میں
English   :   Belief

Chapter ---> 3
Arabic   :   كتاب العلم
Urdu   :   علم کے بیان میں
English   :   Knowledge

Chapter ---> 4
Arabic   :   كتاب الوضوء
Urdu   :   وضو کے بیان میں
English   :   Ablutions (Wudu')

Chapter ---> 5
Arabic   :   كتاب الغسل
Urdu   :   غسل کے احکام و مسائل
English   :   Bathing (Ghusl)

Chapter ---> 6
Arabic   :   كتاب الحيض
Urdu   :   حیض کے احکام و مسائل
English   :   Menstrual Periods

Chapter ---> 7
Arabic   :   كتاب التيمم
Urdu   :   تیمم کے احکام و مسائل
English   :   Rubbing hands and feet with dust (Tayammum)

Chapter ---> 8
Arabic   :   كتاب الصلاة
Urdu   :   نماز کے احکام و مسائل
English   :   Prayers (Salat)

Chapter ---> 9
Arabic   :   كتاب مواقيت الصلاة
Urdu   :   اوقات نماز کے بیان میں
English   :   Times of the Prayers

Chapter ---> 10
Arab

In [None]:
import json
import pandas as pd

file_path = "/content/2_Sahih_Bukhari_sorted.json"

# The file is normally one JSON array. If it cannot be parsed that way, fall
# back to reading it as one JSON object per line. Only parse/shape errors
# trigger the fallback, and both reads close their file handle.
try:
    with open(file_path, 'r', encoding='utf-8') as f:
        df = pd.DataFrame(json.load(f))
except ValueError:
    with open(file_path, 'r', encoding='utf-8') as f:
        data = [json.loads(line) for line in f if line.strip()]
    df = pd.DataFrame(data)

print("Dataset Overview\n")
print(f"Total Books: {df['book'].nunique()}")
print(f"Total Hadiths: {len(df)}")
print(f"Total Chapters: {df['chapter_id'].nunique()}")

print("\nData Quality Checks")

nulls = df.isnull().sum()

if nulls.sum() == 0:
    print("\nMissing values: 0")
else:
    print("\n---> Missing values found:\n")
    for col in nulls[nulls>0].index:
        missing_ids = df[df[col].isnull()]['doc_id'].tolist()
        print(f"  {col}: {len(missing_ids)} -> {missing_ids}")

for col in ['arabic_text', 'urdu_text', 'english_text']:
    # One mask for "null or blank". The count is taken from the mask itself;
    # OR-ing two separate counts bit by bit does not give a row count.
    empty_mask = df[col].isnull() | (df[col].fillna('').astype(str).str.strip() == '')
    empty = int(empty_mask.sum())
    if empty == 0:
        print(f"\nEmpty {col}: 0")
    else:
        empty_ids = df.loc[empty_mask, 'doc_id'].tolist()
        print(f"\nEmpty {col}: {empty} -> {empty_ids}")

duplicate_count = int(df['doc_id'].duplicated().sum())
if duplicate_count == 0:
    print("\nDuplicate IDs: 0")
else:
    print(f"---> Duplicates found: {duplicate_count}")


Dataset Overview

Total Books: 1
Total Hadiths: 7276
Total Chapters: 99

Data Quality Checks

---> Missing values found:

  english_text: 2 -> ['H_Bukhari_1830', 'H_Bukhari_6119']
  narrator: 13 -> ['H_Bukhari_537', 'H_Bukhari_583', 'H_Bukhari_656', 'H_Bukhari_850', 'H_Bukhari_897', 'H_Bukhari_1250', 'H_Bukhari_1259', 'H_Bukhari_1324', 'H_Bukhari_1646', 'H_Bukhari_1848', 'H_Bukhari_2124', 'H_Bukhari_2173', 'H_Bukhari_4080']

Empty arabic_text: 0

Empty urdu_text: 0

Empty english_text: 2 -> ['H_Bukhari_1830', 'H_Bukhari_6119']

Duplicate IDs: 0


**2.3 - Hadith Dataset Refining**

In [None]:
import json
import pandas as pd

file_path = "/content/2_Sahih_Bukhari_sorted.json"
# Named explicitly: the step that follows in this notebook reads
# "/content/3_Sahih_Bukhari_sorted_fixed.json", whereas deriving the name from
# file_path produced "2_Sahih_Bukhari_sorted_fixed.json", a file that step
# never opens.
output_file = "/content/3_Sahih_Bukhari_sorted_fixed.json"

# The file is normally one JSON array; fall back to one JSON object per line
# only when it cannot be parsed as a whole.
try:
    with open(file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)
except json.JSONDecodeError:
    with open(file_path, 'r', encoding='utf-8') as f:
        data = [json.loads(line) for line in f if line.strip()]

# Hand-supplied English text for the records that lack it: doc_id -> text.
english_fixes = {
    'H_Bukhari_1830': 'While we were in the company of the Prophet (ﷺ) in a cave at Mina, when Surat-wal-Mursalat were revealed and he recited it and I heard it (directly) from his mouth as soon as he recited its revelation. Suddenly a snake sprang at us and the Prophet (ﷺ) said (ordered us): "Kill it." We ran to kill it but it escaped quickly. The Prophet (ﷺ) said, "It has escaped your evil and you too have escaped its evil."',
    'H_Bukhari_6119': 'The Prophet (ﷺ) was more shy (from Haya\': pious shyness from committing religeous indiscretions) than a veiled virgin girl.',
}

print("Fixing missing data...")

for item in data:
    doc_id = item.get('doc_id')

    if doc_id in english_fixes:
        item['english_text'] = english_fixes[doc_id]
        print(f"\nFixed: {doc_id} - Added English text")

    if item.get('narrator') in (None, ''):
        item['narrator'] = 'Unknown'
        print(f"\nFixed: {doc_id} - Added narrator as 'Unknown'")

with open(output_file, 'w', encoding = 'utf-8') as f:
    json.dump(data, f, ensure_ascii = False, indent=2)

print(f"\nFixed dataset saved to: {output_file}")

df = pd.DataFrame(data)
print("\nVerification:")
print(f"Missing english_text: {df['english_text'].isnull().sum()}")
print(f"Missing narrator: {df['narrator'].isnull().sum()}")


Fixing missing data...

Fixed: H_Bukhari_537 - Added narrator as 'Unknown'

Fixed: H_Bukhari_583 - Added narrator as 'Unknown'

Fixed: H_Bukhari_656 - Added narrator as 'Unknown'

Fixed: H_Bukhari_850 - Added narrator as 'Unknown'

Fixed: H_Bukhari_897 - Added narrator as 'Unknown'

Fixed: H_Bukhari_1250 - Added narrator as 'Unknown'

Fixed: H_Bukhari_1259 - Added narrator as 'Unknown'

Fixed: H_Bukhari_1324 - Added narrator as 'Unknown'

Fixed: H_Bukhari_1646 - Added narrator as 'Unknown'

Fixed: H_Bukhari_1830 - Added English text

Fixed: H_Bukhari_1848 - Added narrator as 'Unknown'

Fixed: H_Bukhari_2124 - Added narrator as 'Unknown'

Fixed: H_Bukhari_2173 - Added narrator as 'Unknown'

Fixed: H_Bukhari_4080 - Added narrator as 'Unknown'

Fixed: H_Bukhari_6119 - Added English text

Fixed dataset saved to: /content/Sahih_Bukhari_sorted_fixed.json

Verification:
Missing english_text: 0
Missing narrator: 0


In [None]:
import json

import copy

INPUT_FILE = "/content/3_Sahih_Bukhari_sorted_fixed.json"
OUTPUT_FILE = "4_Sahih_Bukhari_sorted_fixed_final.json"

with open(INPUT_FILE, "r", encoding="utf-8") as f:
    data = json.load(f)

expanded = []
skipped = []  # raw id values (or parts of them) that produced no record

for item in data:

    raw = str(item.get("hadith_id")).strip()

    # str.isdecimal() is used rather than isdigit(): isdigit() also accepts
    # characters such as superscript digits, which int() cannot convert.
    if "," in raw:
        # A comma-separated id stands for several hadith numbers:
        # emit one record per purely numeric part.
        parts = [p.strip() for p in raw.split(",")]
        ids = [int(p) for p in parts if p.isdecimal()]
        skipped.extend(p for p in parts if not p.isdecimal())
    else:
        # Plain number, or a number with extra characters around it:
        # keep only the digits.
        digits = "".join(c for c in raw if c.isdecimal())
        ids = [int(digits)] if digits else []
        if not ids:
            skipped.append(raw)

    for hid in ids:
        # Split records get a deep copy so nested values (e.g. the "tags"
        # list) are not shared between them; a single id reuses the record.
        entry = item if len(ids) == 1 else copy.deepcopy(item)
        entry["hadith_id"] = hid
        entry["doc_id"] = f"H_Bukhari_{hid}"
        expanded.append(entry)

# sorting by official hadith number...
expanded_sorted = sorted(expanded, key=lambda x: x["hadith_id"])

# assigning the bukhari_id = order (1 to last)
for idx, item in enumerate(expanded_sorted, start=1):
    item["bukhari_id"] = idx

    # delete original_id if present...
    item.pop("original_id", None)

with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
    json.dump(expanded_sorted, f, ensure_ascii=False, indent=2)

print(f"Total final hadith: {len(expanded_sorted)}")
print("\nSaved to:", OUTPUT_FILE)

if skipped:
    # Only shown when something was dropped, so a clean run prints as before.
    print(f"\nSkipped {len(skipped)} unparseable hadith id value(s): {skipped[:20]}")


Total final hadith: 7582

Saved to: Sahih_Bukhari_sorted_fixed_final.json
