In [1]:
# Input root: listed with os.listdir by the extraction cell, one sub-directory
# per book, each holding the per-hadith chain JSON files.
chain_data_dir = "/home/ansary/work/hadith/chain_data/"
# Output root for the per-hadith chain/relations JSON files.
# NOTE: later cells rebind `save_data_dir` to other folders, so re-run this
# cell before re-running the extraction cell.
save_data_dir = "/home/ansary/work/hadith/chain_data_hadith_wise/"

In [2]:
import os
import json
from tqdm import tqdm
import pandas as pd
# Accumulates one relation dict per chain link across every processed hadith;
# it is converted to a DataFrame once the extraction loop has finished.
all_relations = []

def extract_narration_chain_and_relations(data, hadith_id):
    """Extract the narration chain and the per-link relations of one hadith.

    Args:
        data: Parsed JSON mapping. ``data[hadith_id][0]["_source"]`` is read
            and is expected to provide ``"nodes"`` (dicts with ``id``,
            ``label`` and ``node_type``) and ``"links"`` (dicts with
            ``source`` and ``target`` node ids). A link's ``target`` is
            treated as the teacher and its ``source`` as the narrator who
            heard from that teacher.
        hadith_id: Key of the hadith inside ``data``; also used in warnings.

    Returns:
        A tuple ``(chain, relations, starts_with_prophet)``:

        * ``chain`` maps a 0-based position to ``{"id", "label", "type"}``,
          walking from the start node along teacher -> student links.
        * ``relations`` holds one dict per link with the keys ``narrator``,
          ``teacher``, ``student``, ``student_label``, ``teacher_label`` and
          ``narrator_label``. ``student`` is the first narrator (in link
          order) who lists this narrator as teacher, or ``None``.
        * ``starts_with_prophet`` is True when position 0 is the Prophet node.

        ``(None, None, None)`` is returned, after printing a warning, when
        the structure is invalid, nodes/links are empty, or no start exists.

    Raises:
        KeyError: If a node or link lacks an expected key, or if a link
            references a node id that is absent from ``nodes``.
    """
    # Extract the relevant source data
    try:
        source_data = data[hadith_id][0]["_source"]
    except (KeyError, IndexError):
        print(f"Warning: Invalid data structure for hadith_id {hadith_id}. Skipping.")
        return None, None, None

    nodes = source_data.get("nodes", [])
    links = source_data.get("links", [])

    if not nodes or not links:
        print(f"Warning: No nodes or links found for hadith_id {hadith_id}. Skipping.")
        return None, None, None

    # Map node IDs to their labels and types
    node_dict = {
        node["id"]: {"label": node["label"], "type": node["node_type"]}
        for node in nodes
    }

    # Forward narration dictionary (teacher -> student). When a teacher has
    # several students the last link wins, so the chain follows one branch.
    narration_dict = {link["target"]: link["source"] for link in links}

    # Find the Prophet node (if exists); the generator stops at the first hit
    prophet_id = next(
        (node["id"] for node in nodes if node["node_type"] == "Prophet"),
        None,
    )

    # Without a Prophet node, start from a teacher that is never a student.
    # The first such teacher in link order is used so the result is stable
    # across runs (iterating a set of strings is not).
    start_id = prophet_id
    if start_id is None:
        student_ids = {link["source"] for link in links}
        start_id = next(
            (link["target"] for link in links if link["target"] not in student_ids),
            None,
        )

    # `is None` rather than truthiness: an id such as 0 is a valid node id
    if start_id is None:
        print(f"Warning: No starting point found for hadith_id {hadith_id}. Skipping.")
        return None, None, None

    # Build the chain forward from the starting point (preferably Prophet)
    chain = {}
    current_id = start_id
    position = 0
    visited = set()  # guards against cycles in the link graph

    while current_id is not None and current_id not in visited:
        if current_id not in node_dict:
            break
        chain[position] = {
            "id": current_id,
            "label": node_dict[current_id]["label"],
            "type": node_dict[current_id]["type"],
        }
        visited.add(current_id)
        current_id = narration_dict.get(current_id)
        position += 1

    # Determine if chain starts with Prophet
    starts_with_prophet = (prophet_id is not None and chain.get(0, {}).get("id") == prophet_id)

    # Build one relation per link and remember, for every teacher, the first
    # relation in which it appears as teacher (that relation's narrator is the
    # teacher's first student).
    relations = []
    first_relation_by_teacher = {}
    for link in links:
        teacher_id = link["target"]
        narrator_id = link["source"]
        relation = {
            "narrator": narrator_id,
            "teacher": teacher_id,
            "student": None,
            "student_label": None,
            "teacher_label": node_dict[teacher_id]["label"],
            "narrator_label": node_dict[narrator_id]["label"],
        }
        relations.append(relation)
        first_relation_by_teacher.setdefault(teacher_id, relation)

    # Attach the student via a single lookup instead of a nested scan
    for relation in relations:
        student_relation = first_relation_by_teacher.get(relation["narrator"])
        if student_relation is not None:
            relation["student"] = student_relation["narrator"]
            relation["student_label"] = student_relation["narrator_label"]

    return chain, relations, starts_with_prophet

# Main execution: walk every book folder, convert each hadith JSON into its
# chain/relations file and collect all relations into one table.
books = os.listdir(chain_data_dir)

for book in tqdm(books, desc="Processing books"):
    book_in_dir = os.path.join(chain_data_dir, book)
    book_out_dir = os.path.join(save_data_dir, book)
    jsons = os.listdir(book_in_dir)

    for j in tqdm(jsons, desc=f"Processing JSONs in {book}", leave=False):
        # The hadith id is the file name up to its first dot
        hadith_id = j.partition(".")[0]
        file_path = os.path.join(book_in_dir, j)

        try:
            with open(file_path, "r", encoding="utf-8") as f:
                data = json.load(f)
            chain, relations, starts_with_prophet = extract_narration_chain_and_relations(data, hadith_id)

            # Only hadiths that produced both a chain and relations are kept
            if chain is not None and relations is not None:
                _save = {
                    "chain": chain,
                    "relations": relations,
                    "start_with_prophet": starts_with_prophet,
                }
                _save_path = os.path.join(book_out_dir, j)
                os.makedirs(book_out_dir, exist_ok=True)
                with open(_save_path, "w+", encoding="utf-8") as f:
                    json.dump(_save, f, indent=2, ensure_ascii=False)

                # Tag the relations with their origin only after the file has
                # been written, so the saved JSON stays free of these keys.
                for rel in relations:
                    rel.update(book=book, hadith=hadith_id)
                    all_relations.append(rel)

        except json.JSONDecodeError:
            print(f"Error: Failed to decode JSON in {file_path}. Skipping.")
        except Exception as e:
            print(f"Error processing {file_path}: {str(e)}. Skipping.")


all_relations = pd.DataFrame(all_relations)
all_relations

Processing books: 100%|██████████| 12/12 [00:55<00:00,  4.62s/it]


Unnamed: 0,narrator,teacher,student,student_label,teacher_label,narrator_label,book,hadith
0,a-19,7863,,,نافع مولى ابن عمر,مالك بن أنس (موطأ مالك رواية يحيى الليثي),19,19-190
1,7863,4967,a-19,مالك بن أنس (موطأ مالك رواية يحيى الليثي),عبد الله بن عمر بن الخطاب,نافع مولى ابن عمر,19,19-190
2,a-19,8055,,,هشام بن عروة بن الزبير,مالك بن أنس (موطأ مالك رواية يحيى الليثي),19,19-524
3,8055,553,a-19,مالك بن أنس (موطأ مالك رواية يحيى الليثي),أسماء بنت أبي بكر,هشام بن عروة بن الزبير,19,19-524
4,a-19,7272,,,الزهري,مالك بن أنس (موطأ مالك رواية يحيى الليثي),19,19-1521
...,...,...,...,...,...,...,...,...
490882,7863,3146,6641,الليث بن سعد,زيد بن عبد الله بن عمر بن الخطاب,نافع مولى ابن عمر,137,137-2065
490883,465,6641,a-137,أبو محمد عبد الله الدارمي (سنن الدارمي),الليث بن سعد,أحمد بن عبد الله اليربوعي,137,137-2065
490884,8101,p-1,4894,عبد الله بن عبد الرحمن بن أبي بكر الصديق,رسول الله ﷺ,هند بنت أبي أمية زوج رسول الله,137,137-2065
490885,3146,4894,7863,نافع مولى ابن عمر,عبد الله بن عبد الرحمن بن أبي بكر الصديق,زيد بن عبد الله بن عمر بن الخطاب,137,137-2065


# BOOK MAP CREATION

In [3]:
# Build {book id -> book folder name}. The id is the text before the first
# "-" in the name of the first file that os.listdir reports for the folder.
book_wise_hadith_dir = "/home/ansary/work/hadith/hadith_data_bookwise"
books = os.listdir(book_wise_hadith_dir)
book_map = {}
for book in books:
    first_file = os.listdir(os.path.join(book_wise_hadith_dir, book))[0]
    book_id = first_file.partition("-")[0]
    book_map[book_id] = book
book_map

{'158': 'صحيح مسلم',
 '184': 'سنن أبي داود',
 '137': 'سنن الدارمي',
 '146': 'صحيح البخاري',
 '173': 'سنن ابن ماجه',
 '345': 'صحيح ابن خزيمة',
 '454': 'صحيح ابن حبان',
 '19': 'موطأ مالك رواية يحيى الليثي',
 '195': 'جامع الترمذي',
 '121': 'مسند أحمد بن حنبل',
 '594': 'المستدرك على الصحيحين',
 '319': 'سنن النسائى الصغرى'}

In [4]:
# Replace the numeric book id in the `book` column with the book title.
# Values that are already titles are passed through unchanged so the cell can
# be re-run safely; an id missing from `book_map` still raises KeyError.
_book_titles = set(book_map.values())
all_relations["book"] = all_relations["book"].map(
    lambda book_id: book_id if book_id in _book_titles else book_map[book_id]
)
all_relations.to_csv("/home/ansary/work/hadith/alminasaScrapper/data/datarelation_data.csv",index=False)
all_relations

Unnamed: 0,narrator,teacher,student,student_label,teacher_label,narrator_label,book,hadith
0,a-19,7863,,,نافع مولى ابن عمر,مالك بن أنس (موطأ مالك رواية يحيى الليثي),موطأ مالك رواية يحيى الليثي,19-190
1,7863,4967,a-19,مالك بن أنس (موطأ مالك رواية يحيى الليثي),عبد الله بن عمر بن الخطاب,نافع مولى ابن عمر,موطأ مالك رواية يحيى الليثي,19-190
2,a-19,8055,,,هشام بن عروة بن الزبير,مالك بن أنس (موطأ مالك رواية يحيى الليثي),موطأ مالك رواية يحيى الليثي,19-524
3,8055,553,a-19,مالك بن أنس (موطأ مالك رواية يحيى الليثي),أسماء بنت أبي بكر,هشام بن عروة بن الزبير,موطأ مالك رواية يحيى الليثي,19-524
4,a-19,7272,,,الزهري,مالك بن أنس (موطأ مالك رواية يحيى الليثي),موطأ مالك رواية يحيى الليثي,19-1521
...,...,...,...,...,...,...,...,...
490882,7863,3146,6641,الليث بن سعد,زيد بن عبد الله بن عمر بن الخطاب,نافع مولى ابن عمر,سنن الدارمي,137-2065
490883,465,6641,a-137,أبو محمد عبد الله الدارمي (سنن الدارمي),الليث بن سعد,أحمد بن عبد الله اليربوعي,سنن الدارمي,137-2065
490884,8101,p-1,4894,عبد الله بن عبد الرحمن بن أبي بكر الصديق,رسول الله ﷺ,هند بنت أبي أمية زوج رسول الله,سنن الدارمي,137-2065
490885,3146,4894,7863,نافع مولى ابن عمر,عبد الله بن عبد الرحمن بن أبي بكر الصديق,زيد بن عبد الله بن عمر بن الخطاب,سنن الدارمي,137-2065


In [8]:

save_data_dir="/home/ansary/work/hadith/chain_data_narrator_data/"
# Create the output folder up front; without it the first open() below fails
# with FileNotFoundError on a machine where the folder does not exist yet.
os.makedirs(save_data_dir, exist_ok=True)
df=all_relations.copy()
# Group by narrator: one JSON file per narrator id
for narrator, group in tqdm(df.groupby("narrator")):
    # Unique teachers, each with the list of hadiths narrated from them
    teachers = group.groupby(["teacher", "teacher_label"])["hadith"].apply(list).reset_index()
    teachers = teachers.rename(columns={"teacher": "id", "teacher_label": "label"}).to_dict(orient="records")

    # Unique students, each with the list of hadiths passed on to them.
    # groupby skips missing keys by default, so rows without a student
    # contribute nothing here.
    students = group.groupby(["student", "student_label"])["hadith"].apply(list).reset_index()
    students = students.rename(columns={"student": "id", "student_label": "label"}).to_dict(orient="records")

    # Construct JSON structure
    result = {
        "narrator": narrator,
        "teachers": teachers,
        "students": students
    }

    # Save as JSON file; os.path.join does not depend on a trailing slash
    with open(os.path.join(save_data_dir, f"{narrator}.json"), "w", encoding="utf-8") as f:
        json.dump(result, f, indent=4, ensure_ascii=False)

100%|██████████| 10985/10985 [01:27<00:00, 125.54it/s]


In [10]:

save_data_dir = "/home/ansary/work/hadith/chain_data_bookwise_narrator/"
df = all_relations.copy()


def _hadiths_per_person(rows, id_col, label_col):
    """Return one ``{"id", "label", "hadith"}`` record per distinct person.

    ``rows`` is grouped on ``(id_col, label_col)`` and the ``hadith`` values
    of each group are collected into a list.
    """
    grouped = rows.groupby([id_col, label_col])["hadith"].apply(list).reset_index()
    renamed = grouped.rename(columns={id_col: "id", label_col: "label"})
    return renamed.to_dict(orient="records")


# One folder per book, one JSON file per narrator inside it
for book, book_group in df.groupby("book"):
    book_dir = os.path.join(save_data_dir, book)
    os.makedirs(book_dir, exist_ok=True)

    for narrator, group in tqdm(book_group.groupby("narrator")):
        teachers = _hadiths_per_person(group, "teacher", "teacher_label")
        students = _hadiths_per_person(group, "student", "student_label")

        result = {"narrator": narrator, "teachers": teachers, "students": students}

        # Write the narrator's record into the folder of the current book
        file_path = os.path.join(book_dir, f"{narrator}.json")
        with open(file_path, "w", encoding="utf-8") as f:
            json.dump(result, f, indent=4, ensure_ascii=False)

100%|██████████| 6158/6158 [00:51<00:00, 120.34it/s]
100%|██████████| 2962/2962 [00:22<00:00, 129.03it/s]
100%|██████████| 3667/3667 [00:29<00:00, 124.97it/s]
100%|██████████| 3530/3530 [00:28<00:00, 122.46it/s]
100%|██████████| 2081/2081 [00:16<00:00, 124.76it/s]
100%|██████████| 2859/2859 [00:23<00:00, 123.75it/s]
100%|██████████| 3653/3653 [00:30<00:00, 119.83it/s]
100%|██████████| 2081/2081 [00:17<00:00, 119.91it/s]
100%|██████████| 1478/1478 [00:13<00:00, 113.53it/s]
100%|██████████| 1723/1723 [00:13<00:00, 125.92it/s]
100%|██████████| 5624/5624 [00:46<00:00, 121.61it/s]
100%|██████████| 482/482 [00:04<00:00, 114.59it/s]
