In [1]:
# Input root: per-book sub-folders of chain JSON files (globbed as "*/*.json" in the next cell).
chain_data_dir="/home/ansary/work/hadith/chain_data_book_wise/"
# Input root: per-book sub-folders of hadith JSON files (globbed as "*/*.json" in the next cell).
hadith_data_dir="/home/ansary/work/hadith/hadith_data_bookwise/"
# Output root: created on demand; receives one sub-folder per book with the exported isnad JSON files.
# NOTE(review): these are absolute, machine-specific paths — adjust before running elsewhere.
save_data_dir="/home/ansary/work/hadith/isnad_data_bookwise/"

In [None]:
import os 
from glob import glob 
from tqdm import tqdm
import pandas as pd 
# Make sure the output root exists before anything is written to it.
os.makedirs(save_data_dir, exist_ok=True)

# glob() returns a fully built list, so wrapping it in tqdm only drew an
# instantly finished progress bar. Sorting makes the row order reproducible
# across runs and file systems.
hadiths = sorted(glob(os.path.join(hadith_data_dir, "*/*.json")))
chains = sorted(glob(os.path.join(chain_data_dir, "*/*.json")))

hdf = pd.DataFrame({"hadith_path": hadiths})
cdf = pd.DataFrame({"chain_path": chains})

# The join key is the JSON file name (extension included).
hdf["hadith_id"] = hdf["hadith_path"].map(os.path.basename)
cdf["hadith_id"] = cdf["chain_path"].map(os.path.basename)
# The book is the name of the directory that directly contains the hadith file.
hdf["book"] = hdf["hadith_path"].map(lambda path: os.path.basename(os.path.dirname(path)))

# Resulting columns, in order: hadith_path, hadith_id, book, chain_path.
# The outer join keeps hadiths without a chain (chain_path is NaN) and chains
# without a hadith (hadith_path and book are NaN).
# NOTE(review): the key is the file name only; if the same file name can occur
# under two different books, rows get paired across books — confirm that the
# ids are globally unique.
df = hdf.merge(cdf, on=["hadith_id"], how="outer")
df

In [3]:
import re
from bs4 import BeautifulSoup

def extract_isnad(hadith_text):
    """
    Return the hadith markup with its first matn anchor removed.

    Args:
        hadith_text (str): Hadith markup; it is parsed with the 'html.parser' backend.

    Returns:
        str: The re-serialised markup without the first <a class="matn"> element
            (nothing is removed when no such element exists), stripped of
            leading and trailing whitespace. Narrator anchors outside the matn
            are left in place, so their ids stay available.
    """
    markup = BeautifulSoup(hadith_text, 'html.parser')
    matn_anchor = markup.find('a', class_='matn')

    # Nothing to cut out: hand back the parsed markup as it is.
    if not matn_anchor:
        return str(markup).strip()

    # Drop the matn element together with everything nested inside it.
    matn_anchor.decompose()
    return str(markup).strip()

In [4]:
from bs4 import BeautifulSoup

def check_additional_narrators(raw_isnad, chain):
    """
    Check if there are additional narrators in raw_isnad not present in chain data.

    Narrator ids are compared as strings: ids read from HTML attributes are
    always strings, whereas ids loaded from JSON chain data may be numbers, and
    a string never equals a number in a set comparison.

    Args:
        raw_isnad (str): HTML string containing the isnad with <a class="rawy"> narrator tags.
        chain (dict | None): Mapping whose values are dicts with 'id' and 'type'
            keys; only values whose type is 'Narrator' are considered.
            An empty mapping or None means "no chain narrators".

    Returns:
        tuple: (True, set of str) with the ids found only in raw_isnad when
            there are any; (False, None) otherwise.
    """
    # Step 1: collect the narrator ids that appear in the markup.
    soup = BeautifulSoup(raw_isnad, 'html.parser')
    raw_isnad_ids = set()
    for tag in soup.find_all('a', class_='rawy'):
        tag_id = tag.get('id')
        # An anchor without an id cannot be matched against the chain; skip it
        # instead of raising KeyError.
        if tag_id is not None:
            raw_isnad_ids.add(str(tag_id))

    # Step 2: collect the ids of chain entries that are narrators.
    chain_narrator_ids = {
        str(entry['id'])
        for entry in (chain or {}).values()
        if entry.get('type') == 'Narrator' and entry.get('id') is not None
    }

    # Step 3: ids present in the markup but missing from the chain.
    additional_ids = raw_isnad_ids - chain_narrator_ids

    # Step 4: report the result.
    if additional_ids:
        return True, additional_ids
    return False, None

In [5]:
from bs4 import BeautifulSoup
import arabic_reshaper

def process_hadith_text(hadith_text):
    """
    Process hadith text to exclude matn and extract narrator IDs before it, with Arabic reshaping.

    Everything from the first <a class="matn"> element to the end of the
    document is discarded, including bare text that follows the matn outside
    of any tag.

    Args:
        hadith_text (str): Full hadith markup with isnad and matn.

    Returns:
        tuple: (str, list) - The text before the matn after passing through
            arabic_reshaper.reshape (not reversed), and the ids of the
            <a class="rawy"> tags before the matn in document order.

    Raises:
        ValueError: If the markup contains no <a class="matn"> element.
    """
    # Parse the hadith text
    soup = BeautifulSoup(hadith_text, 'html.parser')

    # Find the matn tag
    matn_tag = soup.find('a', class_='matn')
    if not matn_tag:
        raise ValueError("No matn tag found in the hadith text.")

    # Remove everything that follows the matn in document order.
    # find_all_next() without a filter yields tags only, so text nodes after
    # the matn used to survive and leak into the returned text. Removing the
    # following siblings at every ancestor level covers tags and text alike
    # while keeping the ancestors, which open before the matn.
    node = matn_tag
    while node is not None:
        # Snapshot the siblings first: extracting while iterating the live
        # generator would cut the traversal short.
        for following in list(node.next_siblings):
            following.extract()
        node = node.parent

    # Finally drop the matn itself together with its contents.
    matn_tag.decompose()

    # Extract narrator IDs in serial order (before matn); anchors without an
    # id attribute are skipped instead of raising KeyError.
    narrator_ids = [
        tag['id']
        for tag in soup.find_all('a', class_='rawy')
        if tag.get('id') is not None
    ]

    # Get the processed text and reshape it for original reading order (right-to-left)
    processed_text = soup.get_text()
    reshaped_text = arabic_reshaper.reshape(processed_text)  # Reshape without reversing

    return reshaped_text, narrator_ids

In [None]:
import json
# Export one JSON file per hadith to <save_data_dir>/<book>/<hadith_id> with
# the keys raw_isnad, grade, narrators and chain.
for idx in tqdm(range(len(df))):
    # Read the columns by name so the loop does not depend on column order.
    row=df.iloc[idx]
    hadith_path=row["hadith_path"]
    # The outer merge leaves NaN here for chains that have no hadith file;
    # there is nothing to export for such rows, and open(NaN) would raise.
    if not isinstance(hadith_path,str):
        continue
    book=row["book"]
    hadith_id=row["hadith_id"]
    data={}
    # Read explicitly as UTF-8 (the encoding the output is written in) instead
    # of relying on the platform default.
    with open(hadith_path,"r",encoding="utf-8") as f:
        hadith_data=json.load(f)
    # Hadiths whose 'matn_with_tashkeel' field is empty are not exported.
    if hadith_data["_source"]['matn_with_tashkeel']:
        # process_hadith_text raises ValueError when the markup has no matn
        # anchor; as before, that stops the run.
        processed_text, narrator_ids = process_hadith_text(hadith_data['_source']["hadith"])
        data["raw_isnad"]=processed_text
        data["grade"]=hadith_data['_source']["rulings"]
        data["narrators"]=narrator_ids
        chain_path=row["chain_path"]
        # chain_path is NaN (not a str) when no chain file matched this hadith.
        if isinstance(chain_path,str):
            with open(chain_path,"r",encoding="utf-8") as f:
                chain_data=json.load(f)
            data["chain"]=chain_data["chain"]
        else:
            data["chain"]={}
        save_book=os.path.join(save_data_dir,book)
        os.makedirs(save_book,exist_ok=True)
        save_json=os.path.join(save_book,hadith_id)
        # ensure_ascii=False keeps the Arabic text readable in the output file.
        with open(save_json,"w",encoding="utf-8") as f:
            json.dump(data,f,ensure_ascii=False,indent=2)
        