- [x] WbW Quran English And Urdu
- [x] Full Quran Text from Tanzil XML
- [x] Full Quran Translation From Tanzil XML
- [x] Morphology Loading and initial Structuring to pandas
- [x] Root words loading from CSV produced by semi manual extraction from html
- [x] Root meaning assignment to morphology pandas
- [ ] Extracting Lemma from Features of morphology df to new column

### Imports and File Locations

In [94]:
import pandas as pd
import numpy as np
from constants import BUCKWALTER2UNICODE as b2u
from constants import ROOT2BUCK as r2b
import xml.etree.ElementTree as ET 
import json


#display(pos.BUCKWALTER2UNICODE)
# Input data locations. All paths are relative, so they resolve against the
# notebook's current working directory.

# Morphology file and word-by-word JSON files (Arabic, English, Urdu) plus the
# root-word CSV.
morphologyFile = 'data/corpus-morphology/quranic-corpus-morphology-0.4.txt'
enWordsFile    = 'data/hablullah/word-translation/en-word.json'
urWordsFile    = 'data/hablullah/word-translation/ur-word.json'
arWordsFile    = 'data/hablullah/word/word.json'
arRootsFile    = 'data/rootWords/allRoots.csv'

# Ayah text in IndoPak script, attached to the ayah frame by attachindoPak.
indoPakScript  = 'data/hablullah/ayah-text/indopak.json'

# Tanzil XML Quran text files.
tanzil_simple_Quran = 'data/AlQuran/quran-simple.xml'
tanzil_uthmani_Quran = 'data/AlQuran/quran-uthmani.xml'


# Tanzil metadata XML (read below for the suras, juzs and rukus sections).
tanzil_Quran_metadata = 'data/metadata/quran-data.xml'

# Tanzil Urdu translation XML.
tanzil_ur_modudi = 'data/translations/ur/ur.maududi.xml'

#### Functions for loading the corpus.quran.com morphology data and the word-by-word Quran text and translations published by Hablullah on GitHub

In [95]:

"""
Morphology file loading to Pandas and assigning global ayah index from 1 to 6236
File is downloaded from corpus.quran.com
"""
def loadMorphology(morphologyFilePath, headerRows=57):
    """Load the morphology file into a DataFrame with global ayah/word numbers.

    Args:
        morphologyFilePath: Path of the tab-separated morphology file.
        headerRows: Number of leading lines to skip before the data rows
            (57 by default, the value this notebook has always used).

    Returns:
        DataFrame indexed by (Surah, Ayah, Word) and sorted on that index, with
        the columns LOCATION, FORM, TAG, FEATURES, SubWord (kept as text),
        globalAyah and globalWord. globalAyah numbers every distinct
        (Surah, Ayah) from 1 upwards in index order; globalWord does the same
        for every distinct (Surah, Ayah, Word).
    """
    df = pd.read_csv(
        morphologyFilePath,
        names=['LOCATION', 'FORM', 'TAG', 'FEATURES'],
        skiprows=headerRows,
        sep='\t',
    )
    # LOCATION looks like "(1:1:1:2)": surah:ayah:word:subword.
    parts = df['LOCATION'].str.strip('()').str.split(':', expand=True)
    df[['Surah', 'Ayah', 'Word', 'SubWord']] = parts
    df[['Surah', 'Ayah', 'Word']] = df[['Surah', 'Ayah', 'Word']].apply(pd.to_numeric)
    df = df.set_index(['Surah', 'Ayah', 'Word']).sort_index()

    # ngroup() numbers the groups in sorted key order and returns a Series
    # aligned row-for-row with df, so no cross-index alignment between a
    # two-level and a three-level MultiIndex is needed.
    df['globalAyah'] = df.groupby(level=['Surah', 'Ayah']).ngroup() + 1
    df['globalWord'] = df.groupby(level=['Surah', 'Ayah', 'Word']).ngroup() + 1
    return df

"""
Word By Word Quran Data is acquired from Hablullah from github 
"""
def loadArabicWords(arWords):
    """Read the word-by-word JSON file and number its words globally.

    The JSON is read with orient='index', re-indexed by (ayah, position) and
    sorted. Each distinct (ayah, position) pair then receives a running number
    starting at 1, stored in the 'globalWordwbw' column.
    """
    words = (
        pd.read_json(arWords, orient='index')
        .set_index(['ayah', 'position'])
        .sort_index()
    )
    # One row per distinct (ayah, position), in sorted key order.
    distinct = words.groupby(['ayah', 'position']).count()
    numbering = pd.Series(np.arange(1, len(distinct) + 1), index=distinct.index)
    words['globalWordwbw'] = numbering
    return words

def attachEnglishWbW(df, enWords):
    """Attach the English word-by-word translation as an 'en' column.

    Args:
        df: Word frame carrying a 'globalWordwbw' column.
        enWords: Path of the JSON file to read with orient='index'; it must
            yield exactly one column, which is renamed to 'en'.

    Returns:
        The merge of df with the translations, matching 'globalWordwbw'
        against the JSON index.
    """
    # Read the file that was passed in; the module-level enWordsFile was
    # previously used here regardless of the argument.
    df_en = pd.read_json(enWords, orient='index')
    df_en.columns = ['en']
    return df.merge(df_en, left_on='globalWordwbw', right_index=True)

def attachUrduWbW(df, urWords):
    """Attach the Urdu word-by-word translation as a 'ur' column.

    Args:
        df: Word frame carrying a 'globalWordwbw' column.
        urWords: Path of the JSON file to read with orient='index'; it must
            yield exactly one column, which is renamed to 'ur'.

    Returns:
        The merge of df with the translations, matching 'globalWordwbw'
        against the JSON index.
    """
    # Read the file that was passed in; the module-level urWordsFile was
    # previously used here regardless of the argument.
    df_ur = pd.read_json(urWords, orient='index')
    df_ur.columns = ['ur']
    return df.merge(df_ur, left_on='globalWordwbw', right_index=True)

### Tanzil Parser from XML Files Provided by Tanzil.org

In [96]:
"""Tanzil XML Meta Parser"""  
def getTanzilMetaDataSuras(xmlFile):
    """Build a per-sura metadata frame from the <suras> section of an XML file.

    Every child element of each top-level <suras> node contributes one row.
    The row label is the child's 'index' attribute cast to int32; the columns
    hold the attribute values as read from the XML (text).
    """
    fields = ['ayas', 'start', 'name', 'tname', 'ename', 'type', 'order', 'rukus']
    document_root = ET.parse(xmlFile).getroot()
    sura_nodes = [
        node
        for container in document_root.findall("./suras")
        for node in container
    ]
    labels = [node.attrib['index'] for node in sura_nodes]
    records = [tuple(node.attrib[field] for field in fields) for node in sura_nodes]
    return pd.DataFrame(
        records,
        columns=fields,
        index=pd.Index(labels).astype('int32'),
    )

"""Surah Ayah Text From Tanzil Quran"""
def getDFSurahAyahIndexed(xmlFile):
    """Read every ayah of a Quran XML file into a (sura, ayah)-indexed frame.

    Each child of each top-level <sura> element becomes one row with columns
    SurahName (the sura's 'name'), Ayah (the child's 'text') and Bismillah
    (the child's 'bismillah' attribute, or 0 when it is absent). The index is
    a MultiIndex named (SurahNumber, AyahNumber) built from the integer
    'index' attributes.
    """
    document_root = ET.parse(xmlFile).getroot()
    keys = []
    rows = []
    for sura in document_root.findall("./sura"):
        for aya in sura:
            keys.append((int(sura.attrib['index']), int(aya.attrib['index'])))
            rows.append([
                sura.attrib['name'],
                aya.attrib['text'],
                aya.attrib.get('bismillah', 0),
            ])
    labels = pd.MultiIndex.from_tuples(keys, names=['SurahNumber', 'AyahNumber'])
    return pd.DataFrame(rows, columns=['SurahName', 'Ayah', 'Bismillah'], index=labels)

"""Attaching Juzz info to DF"""
def setDFSurahAyahIndexedJuzs(df, xmlFile):
    """Add a 'juz' column to a (sura, ayah)-indexed frame, in place.

    For every child of the XML's <juzs> section, all rows from that child's
    (sura, aya) position to the end of the frame receive the child's 'index'
    attribute, so later entries overwrite the tail written by earlier ones.

    Args:
        df: Frame whose sorted index supports label slicing by (sura, ayah).
        xmlFile: Path of the metadata XML containing the <juzs> section.

    Returns:
        The same df object. 'juz' holds the XML attribute text (e.g. '1');
        rows located before the first listed start keep the integer 0.
    """
    root = ET.parse(xmlFile).getroot()

    # Declare the column as object up front: it deliberately mixes the int 0
    # placeholder with attribute strings. Writing strings into an int64 column
    # is a silent upcast that pandas deprecates.
    df['juz'] = pd.Series([0] * len(df), index=df.index, dtype=object)
    for juz in root.findall("./juzs/*"):
        start = (int(juz.attrib['sura']), int(juz.attrib['aya']))
        df.loc[start:, ['juz']] = juz.attrib['index']
    return df

"""Attaching Ruku info to DF"""
def setDFSurahAyahIndexedRukus(df, xmlFile):
    """Add 'ruku' and 'sub_ruku' columns to a (sura, ayah)-indexed frame, in place.

    For every child of the XML's <rukus> section, all rows from that child's
    (sura, aya) position to the end of the frame receive the child's 'index'
    attribute as 'ruku' and a per-sura running counter as 'sub_ruku'. The
    counter restarts at 1 whenever the 'sura' attribute differs from the
    previous entry's.

    Args:
        df: Frame whose sorted index supports label slicing by (sura, ayah).
        xmlFile: Path of the metadata XML containing the <rukus> section.

    Returns:
        The same df object. 'ruku' holds the XML attribute text, 'sub_ruku'
        holds integers; rows before the first listed start keep 0 in both.
    """
    root = ET.parse(xmlFile).getroot()

    # 'ruku' mixes the int 0 placeholder with attribute strings, so make it an
    # object column explicitly instead of relying on int64 being upcast when
    # the first string is written (pandas deprecates that silent upcast).
    df['ruku'] = pd.Series([0] * len(df), index=df.index, dtype=object)
    df['sub_ruku'] = 0

    sub_ruku = 0
    pre_sura = None  # 'sura' attribute of the previous entry, None before the first
    for ruku in root.findall("./rukus/*"):
        sura = ruku.attrib['sura']
        sub_ruku = sub_ruku + 1 if sura == pre_sura else 1
        start = (int(sura), int(ruku.attrib['aya']))
        # One assignment per column: the text goes to 'ruku' and the integer to
        # 'sub_ruku' without depending on how a tuple is broadcast across columns.
        df.loc[start:, 'ruku'] = ruku.attrib['index']
        df.loc[start:, 'sub_ruku'] = sub_ruku
        pre_sura = sura
    return df
def attachindoPak(df, indoPak):
    """Attach the IndoPak-script ayah text as an 'indoPakText' column.

    The JSON at `indoPak` is read with orient='index' and must yield exactly
    one column. Rows of df are matched to it through their 'autoIndex' value.
    """
    script_text = pd.read_json(indoPak, orient='index').set_axis(['indoPakText'], axis=1)
    return pd.merge(df, script_text, left_on='autoIndex', right_index=True)
def getTranslation(xmlFile):
    """Read a translation XML file into a (sura, ayah)-indexed frame.

    Parses the same layout as getDFSurahAyahIndexed: each child of each
    top-level <sura> element yields one row with SurahName, Ayah (the child's
    'text') and Bismillah (the child's 'bismillah' attribute, or 0 if absent).
    """
    document_root = ET.parse(xmlFile).getroot()
    # Flatten to (sura element, ayah element) pairs in document order.
    pairs = [
        (sura, aya)
        for sura in document_root.findall("./sura")
        for aya in sura
    ]
    labels = pd.MultiIndex.from_tuples(
        [(int(sura.attrib['index']), int(aya.attrib['index'])) for sura, aya in pairs]
    )
    labels.set_names(['SurahNumber', 'AyahNumber'], inplace=True)
    rows = [
        [
            sura.attrib['name'],
            aya.attrib['text'],
            aya.attrib['bismillah'] if 'bismillah' in aya.attrib else 0,
        ]
        for sura, aya in pairs
    ]
    return pd.DataFrame(rows, columns=['SurahName', 'Ayah', 'Bismillah'], index=labels)

#### Some utility Functions for Transliterations and root words extracted from HTML files provided by studyquran.co.uk

In [97]:


"""BuckWalter to Unicode Converter"""
def buckToUniString(buck):
    """Convert a Buckwalter-transliterated string through the b2u table.

    Args:
        buck: String whose characters are looked up one by one in b2u.

    Returns:
        The concatenated b2u values, or None as soon as a character has no
        entry in the table. An empty input yields an empty string.
    """
    try:
        return "".join(b2u[ch] for ch in buck)
    except KeyError:
        # Only a missing table entry means "not convertible"; any other error
        # is a real bug and should surface instead of being swallowed.
        return None
"""Load Root Words"""
def rootWordsToPanda(rootsFile=None):
    """Load the root-word CSV into a two-column DataFrame.

    Args:
        rootsFile: Path of the headerless CSV to read. When omitted, the
            module-level arRootsFile is used, as before.

    Returns:
        DataFrame with the columns "Root" and "Meanings".
    """
    path = arRootsFile if rootsFile is None else rootsFile
    return pd.read_csv(path, names=["Root", "Meanings"])
def rootToUni(root):
    """Turn a dash-separated root such as "Siin-Miim-Waw" into spaced letters.

    Each dash-separated name is trimmed, mapped through r2b and converted with
    buckToUniString; a single space follows every converted letter. Names that
    are missing from r2b are skipped, and the whole root is printed so the gap
    can be spotted.
    """
    converted = ""
    for name in (part.strip() for part in root.split("-")):
        if name not in r2b:
            print(root)
            continue
        converted = converted + buckToUniString(r2b[name]) + " "
    return converted
def rootToBck(root):
    """Turn a dash-separated root such as "Siin-Miim-Waw" into its r2b codes.

    Each dash-separated name is trimmed and replaced by its r2b entry; the
    entries are concatenated without separators. Names missing from r2b are
    skipped, and the whole root is printed so the gap can be spotted.
    """
    names = [part.strip() for part in root.split("-")]
    codes = ""
    for name in names:
        known = name in r2b
        if not known:
            print(root)
        else:
            codes = codes + r2b[name]
    return codes


#### Loading Morphology and WBW

`display(mor_df.groupby('TAG').count())
 display(mor_df.loc[mor_df['TAG'] == 'V'])`

In [141]:
# Build the two word-level frames: mor_df from the morphology file and wbw_df
# from the word-by-word JSON files (Arabic, then English and Urdu columns).
mor_df = loadMorphology(morphologyFile)             # Morphology file to Pandas
wbw_df = loadArabicWords(arWordsFile)               # WBW JSON to Pandas for Arabic Words
wbw_df = attachEnglishWbW(wbw_df,enWordsFile)       # WBW JSON English to Mega DF
wbw_df = attachUrduWbW(wbw_df,urWordsFile)          # WBW JSON Urdu to Mega DF
wbw_df.tail(2)

Unnamed: 0_level_0,Unnamed: 1_level_0,uthmani,nastaliq,globalWordwbw,en,ur
ayah,position,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
6236,2,الْجِنَّةِ,ٱلۡجِنَّةِ,77428,the jinn,جنوں میں
6236,3,وَ النَّاسِ۠,وَٱلنَّاسِ,77429,and men,اور انسانوں میں


#### Loading Mushaf Provided by Tanzil and attaching MetaData to loaded DF

In [99]:
AlQuranDF = getDFSurahAyahIndexed(tanzil_uthmani_Quran)                     # Load the Mushaf XML into a frame indexed by (Surah, Ayah)
AlQuranDF = AlQuranDF.sort_index()                                         # Sort the index so label range slicing works
AlQuranDF['autoIndex'] = np.arange(1,AlQuranDF.shape[0]+1)                 # 1-based running ayah number over the whole frame
AlQuranDF['globalIndex'] = np.arange(0,AlQuranDF.shape[0])                 # 0-based running ayah number over the whole frame
AlQuranDF = setDFSurahAyahIndexedJuzs(AlQuranDF,tanzil_Quran_metadata)     # Add the 'juz' column
AlQuranDF = setDFSurahAyahIndexedRukus(AlQuranDF,tanzil_Quran_metadata)    # Add the 'ruku' and 'sub_ruku' columns
AlQuranDF = attachindoPak(AlQuranDF,indoPakScript)                         # Add the 'indoPakText' column, matched on autoIndex

SurahMetaDF = getTanzilMetaDataSuras(tanzil_Quran_metadata)                # Per-sura metadata from the <suras> section
SurahMetaDF = SurahMetaDF.sort_index()  


# Urdu translation, indexed by (Surah, Ayah) like AlQuranDF.
trans_ur_maududi = getTranslation(tanzil_ur_modudi)
AlQuranDF.head()


Unnamed: 0_level_0,Unnamed: 1_level_0,SurahName,Ayah,Bismillah,autoIndex,globalIndex,juz,ruku,sub_ruku,indoPakText
SurahNumber,AyahNumber,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,1,الفاتحة,بِسْمِ ٱللَّهِ ٱلرَّحْمَـٰنِ ٱلرَّحِيمِ,0,1,0,1,1,1,بِسۡمِ اللهِ الرَّحۡمٰنِ الرَّحِيۡمِ
1,2,الفاتحة,ٱلْحَمْدُ لِلَّهِ رَبِّ ٱلْعَـٰلَمِينَ,0,2,1,1,1,1,اَلۡحَمۡدُ لِلّٰهِ رَبِّ الۡعٰلَمِيۡنَۙ‏
1,3,الفاتحة,ٱلرَّحْمَـٰنِ ٱلرَّحِيمِ,0,3,2,1,1,1,الرَّحۡمٰنِ الرَّحِيۡمِۙ‏
1,4,الفاتحة,مَـٰلِكِ يَوْمِ ٱلدِّينِ,0,4,3,1,1,1,مٰلِكِ يَوۡمِ الدِّيۡنِؕ‏
1,5,الفاتحة,إِيَّاكَ نَعْبُدُ وَإِيَّاكَ نَسْتَعِينُ,0,5,4,1,1,1,اِيَّاكَ نَعۡبُدُ وَاِيَّاكَ نَسۡتَعِيۡنُؕ‏


#### Code to get Ayah or range of Ayah from DF 
`display(AlQuranDF.loc[[(2,6),(2,6)]])`

`display(AlQuranDF.loc[(2,283):(3,1)])`

#### Loading roots and meanings converted from HTML PROnline to CSV key value pairs and assigning BuckWalter and Unicode

`rootsDF.Root.to_csv('rootsKeys.csv')
 rootsDF = rootsDF.groupby(['Root']).size().nlargest(20)
`

In [100]:
# Root list with meanings, plus two derived columns: TRILETTER (space-separated
# letters from rootToUni) and BUCK (the concatenated r2b codes from rootToBck).
roots_df = rootWordsToPanda()
roots_df['TRILETTER'] =  roots_df.Root.apply(lambda x: rootToUni(x))
roots_df['BUCK']      =  roots_df.Root.apply(lambda x: rootToBck(x))

#### Morphology Feature extraction 
- [x] Get Root Words if present in feature
- [x] Join with Root words meaning 
- [x] Get Lemma if present

In [101]:
def featureRootExtract(f):
    """Return the value of the ROOT feature in a '|'-separated feature string.

    Args:
        f: Feature string such as "STEM|POS:N|LEM:{som|ROOT:smw|M|GEN".

    Returns:
        The text after "ROOT:" in the first feature whose key is exactly
        "ROOT", or None when there is no such feature.
    """
    for feature in f.split("|"):
        # Compare the key exactly: a substring test would also match another
        # feature whose value merely contains "ROOT". partition keeps the
        # whole value even if it contains further colons.
        key, sep, value = feature.partition(":")
        if sep and key == "ROOT":
            return value
    return None
def featureLemmaExtract(f):
    """Return the LEM feature of a feature string, converted by buckToUniString.

    Args:
        f: Feature string such as "STEM|POS:N|LEM:{som|ROOT:smw|M|GEN".

    Returns:
        buckToUniString applied to the text after "LEM:" in the first feature
        whose key is exactly "LEM", or None when there is no such feature.
    """
    for feature in f.split("|"):
        # Compare the key exactly rather than by substring, and keep the whole
        # value after the first colon so a lemma containing ':' is not cut.
        key, sep, value = feature.partition(":")
        if sep and key == "LEM":
            return buckToUniString(value)
    return None

In [135]:
#roots_df.loc[roots_df['BUCK'] == 'rbb']
# Derive the root code and the lemma from each row's FEATURES string.
mor_df['BUCKMOR']  =  mor_df.FEATURES.apply(lambda x: featureRootExtract(x))
mor_df['LEMMA']  =  mor_df.FEATURES.apply(lambda x: featureLemmaExtract(x))
# Left-join the root meanings on the root code. merge() discards mor_df's
# index, so set_axis puts it back; this needs the merge to return exactly one
# row per mor_df row.
# NOTE(review): presumably BUCK is unique in roots_df; duplicates would add
# rows and make set_axis fail with a length mismatch - confirm.
mor_root_df = mor_df.merge(roots_df,how='left',left_on='BUCKMOR', right_on='BUCK').set_axis(mor_df.index)
mor_root_df = mor_root_df.drop(columns= ['BUCKMOR'])
# Left-join the word-by-word columns on the global word number, restoring the
# (Surah, Ayah, Word) index the same way.
mor_root_wbw_df = mor_root_df.merge(wbw_df,how='left',left_on='globalWord',right_on='globalWordwbw').set_axis(mor_df.index)
mor_root_wbw_df = mor_root_wbw_df.drop(columns= ['globalWordwbw'])

# Show every row of surah 1, ayah 1.
mor_root_wbw_df.loc[ (1,1,) , : ]

Unnamed: 0_level_0,LOCATION,FORM,TAG,FEATURES,SubWord,globalAyah,globalWord,LEMMA,Root,Meanings,TRILETTER,BUCK,uthmani,nastaliq,en,ur
Word,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
1,(1:1:1:1),bi,P,PREFIX|bi+,1,1,1,,,,,,بِسْمِ,بِسۡمِ,In (the) name,ساتھ نام
1,(1:1:1:2),somi,N,STEM|POS:N|LEM:{som|ROOT:smw|M|GEN,2,1,1,ٱسْم,Siin-Miim-Waw,"to be high/lofty, raised, name, attribute. sa...",س م و,smw,بِسْمِ,بِسۡمِ,In (the) name,ساتھ نام
2,(1:1:2:1),{ll~ahi,PN,STEM|POS:PN|LEM:{ll~ah|ROOT:Alh|GEN,1,1,2,ٱللَّه,Alif-Lam-ha,"to serve, worship or adore; to protect, grant...",ا ل ه,Alh,اللّٰهِ,ٱللَّهِ,(of) Allah,اللہ کے
3,(1:1:3:1),{l,DET,PREFIX|Al+,1,1,3,,,,,,الرَّحْمٰنِ,ٱلرَّحۡمَٰنِ,the Most Gracious,جو بے حد مہربان ہے
3,(1:1:3:2),r~aHoma`ni,ADJ,STEM|POS:ADJ|LEM:r~aHoma`n|ROOT:rHm|MS|GEN,2,1,3,رَّحْمَٰن,Ra-Ha-Miim,"Rahima - He favored, benefited, pardoned, or f...",ر ح م,rHm,الرَّحْمٰنِ,ٱلرَّحۡمَٰنِ,the Most Gracious,جو بے حد مہربان ہے
4,(1:1:4:1),{l,DET,PREFIX|Al+,1,1,4,,,,,,الرَّحِیْمِ,ٱلرَّحِيمِ,the Most Merciful,بار بار رحم فرمانے والا ہے
4,(1:1:4:2),r~aHiymi,ADJ,STEM|POS:ADJ|LEM:r~aHiym|ROOT:rHm|MS|GEN,2,1,4,رَّحِيم,Ra-Ha-Miim,"Rahima - He favored, benefited, pardoned, or f...",ر ح م,rHm,الرَّحِیْمِ,ٱلرَّحِيمِ,the Most Merciful,بار بار رحم فرمانے والا ہے


In [140]:
#display(merged_wbw.loc[ (1,1,) ,  : ])
#display(mor_df.loc[ (1,1,) , : ])
#display(mor_df)
#grouped = merged_wbw.groupby(['Surah','Ayah','Word'])

#grouped.apply(print)
#.loc[ (1,1,) , : ]
# Write the merged frame as a JSON array of records, keeping non-ASCII text
# unescaped. to_json returns None when given a path, so `result` is None here.
result = mor_root_wbw_df.to_json('./Morphology.json',orient="records",force_ascii=False)
#print(json.dumps(result,indent=4,ensure_ascii=False))

### APIs to get Quran Data

In [75]:
def getSurasMeta(num=0):
    """Return sura metadata records, for all suras or for a single one.

    Args:
        num: 0 (default) for every sura, or a sura number from 1 to 114.

    Returns:
        None when num is outside 0..114. Otherwise a list of dicts, one per
        sura, holding the SurahMetaDF columns plus an 'index' key with the
        sura number. For a single sura the list has at most one element.

    Side effects:
        With num == 0 the full record list is also written to
        './SurasMeta.json'. A single-sura lookup does not touch that file, so
        it can no longer overwrite the full export.
    """
    if num > 114 or num < 0:
        return None

    # assign() works on a copy, so the shared SurahMetaDF does not gain an
    # extra 'index' column as a side effect of calling this function.
    meta = SurahMetaDF.assign(index=SurahMetaDF.index)

    if num == 0:
        meta.to_json('./SurasMeta.json', orient="records", force_ascii=False)
    else:
        meta = meta[meta.index == num]

    # to_json returns None when it is given a path, so the records to return
    # are serialised separately, without a path, before being parsed.
    return json.loads(meta.to_json(orient="records"))

In [76]:
# Calling with the default num=0 takes the branch that writes ./SurasMeta.json.
res = getSurasMeta()
#print(json.dumps(res, indent=4))
#display(AlQuranDF.loc[[(114,1),(114,6)]])

In [59]:
def getAyahByChapter(Surah=0, Ayah=1):
    """Return one ayah with its IndoPak text and Urdu translation.

    Args:
        Surah: Sura number, 1..114.
        Ayah: Ayah number within the sura, starting at 1.

    Returns:
        A dict with the keys 'Ayah', 'autoIndex', 'indoPakText' (from
        AlQuranDF) and 'tr' (the 'Ayah' column of trans_ur_maududi), or None
        when Surah or Ayah is out of range.
    """
    # Suras and ayahs are numbered from 1; 0 (the default Surah) and negative
    # values are rejected here instead of failing inside the lookups below.
    if Surah < 1 or Surah > 114 or Ayah < 1:
        return None
    info = getSurasMeta(Surah)
    if not info or Ayah > int(info[0]["ayas"]):
        return None

    key = [(Surah, Ayah)]
    # lines=True emits one JSON object per row; a single row is selected, so
    # the output parses as exactly one object.
    text_json = AlQuranDF.loc[key, ['Ayah', 'autoIndex', 'indoPakText']].to_json(orient="records", lines=True)
    res = json.loads(text_json)
    translation_json = trans_ur_maududi.loc[key, ['Ayah']].to_json(orient="records", lines=True)
    res['tr'] = json.loads(translation_json)['Ayah']
    return res

In [61]:
# Pretty-print surah 114, ayah 5; ensure_ascii=False keeps the text unescaped.
print(json.dumps(getAyahByChapter(114,5),indent=4,ensure_ascii=False))

{
    "Ayah": "الَّذِي يُوَسْوِسُ فِي صُدُورِ النَّاسِ",
    "autoIndex": 6235,
    "indoPakText": "الَّذِىۡ يُوَسۡوِسُ فِىۡ صُدُوۡرِ النَّاسِۙ‏",
    "tr": "جو لوگوں کے دلوں میں وسوسے ڈالتا ہے"
}


In [74]:
# Write the ayah frame as a JSON array of records with non-ASCII text unescaped.
# NOTE(review): orient="records" omits the frame's index from the output - confirm
# the sura/ayah numbers are not needed downstream.
AlQuranDF.to_json('./QuranText.json',orient="records",force_ascii=False)