# Load Dataset of Digital Quran

In [None]:
import pandas as pd
import xml.etree.ElementTree as ET

## This data is taken from the Mushaf Al-Madinah (Hafs narration, version 18) published by the King Fahd Quran Complex. You can download the data and fonts from "https://qurancomplex.gov.sa/techquran/dev/" (Unicode Uthmanic Font, Hafs Narration).
### أُخذ نص هذا المصحف من مصحف المدينة الرقمي، الإصدار رقم 18 برواية حفص، ويمكن تنزيله من الرابط أعلاه

In [None]:
# Read the Hafs v18 CSV file into a DataFrame.
mushaf_madinah = pd.read_csv('data/hafsData_v18.csv')
# Show the first rows as a quick look at the loaded columns.
mushaf_madinah.head()

In [None]:
# Print the column names, dtypes and non-null counts of the raw data.
mushaf_madinah.info()

In [None]:
# Give the raw columns the names used by the rest of the notebook
# (one in-place rename covering all four columns).
mushaf_madinah.rename(
    columns={
        'id': 'aya_index',
        'aya_text_emlaey': 'spoken_aya_text',
        'sora': 'sura_mushaf_order',
        'sora_name_ar': 'sura_name',
    },
    inplace=True,
)
# `quran` is a second name for the same DataFrame object, not a copy.
quran = mushaf_madinah

## Some Quran metadata from Tanzil: "https://tanzil.net/docs/"

In [None]:
# Parse the Tanzil metadata XML and index each entry under <suras> by its
# integer 'index' attribute; the value is [order, type] as raw strings.
tanzil_quran_metadata_root = ET.parse('data/tanzil_quran_metadata.xml').getroot()
suras = {
    int(sura_node.attrib['index']): [sura_node.attrib['order'], sura_node.attrib['type']]
    for sura_node in tanzil_quran_metadata_root.find('suras')
}

# Revelation order of the sura each row belongs to, as an integer.
quran['sura_revel_order'] = quran['sura_mushaf_order'].map(
    lambda sura_no: int(suras[sura_no][0])
)

# Revelation place label: 'Meccan' entries get 'مكية', every other type 'مدنية'.
quran['sura_revel_place'] = quran['sura_mushaf_order'].map(
    lambda sura_no: 'مكية' if suras[sura_no][1] == 'Meccan' else 'مدنية'
)

# Calculate Jomal (حساب الجُمّل Abjad numerals )
Here is a useful link: "http://www.islamnoon.com/new/Derasat/Erhasat/e_hesab.htm"

In [None]:
# Jomal (Abjad numeral) value of every letter form that is counted.
letter_jomal = {'ء': 1 ,'أ': 1 ,'إ': 1 ,'ا': 1 ,'ٱ': 1 ,'آ': 1 ,'ى': 1 ,'ب': 2 ,'ت': 400 ,'ة': 400 ,'ث': 500 ,'ج': 3 ,'ح': 8 ,'خ': 600 ,'د': 4 ,'ذ': 700 ,'ر': 200 ,'ز': 7 ,'س': 60 ,'ش': 300 ,'ص': 90 ,'ض': 800 ,'ط': 9 ,'ظ': 900 ,'ع': 70 ,'غ': 1000 ,'ف': 80 ,'ق': 100 ,'ك': 20 ,'ل': 30 ,'م': 40 ,'ن': 50 ,'ه': 5 ,'و': 6 ,'ؤ': 6 ,'ي': 10 ,'ئ': 10}

# Characters treated as letters when scanning text. Derived from the jomal
# table so the two can never disagree: the previous hand-written set left out
# 'ة' (ta marbuta), which made every letter list, count and jomal sum skip it
# even though the table above assigns it a value.
quran_letters = set(letter_jomal)

def get_aya_words(aya):
    """Split an aya string into its words, dropping non-word markers.

    The last whitespace-separated token is always discarded (presumably the
    aya-number glyph that closes each aya in the source text -- TODO confirm).
    A leading '۞' token is removed and a trailing '۩' mark is stripped from
    the final word.

    Args:
        aya: The aya text as a single string.

    Returns:
        A list of word strings. The list is empty when nothing is left once
        the markers are removed (e.g. empty or single-token input).
    """
    words = aya.split()[:-1]
    # Guard each positional access: the list may already be empty here.
    if words and words[0] == '۞':
        words = words[1:]
    if words and words[-1].endswith('۩'):
        words[-1] = words[-1][:-1]
        # A stand-alone '۩' token leaves an empty string behind; that is not
        # a word, so drop it instead of counting it.
        if not words[-1]:
            words.pop()
    return words

def get_letters(text):
    """Return, in reading order, every character of *text* that is a letter.

    A character counts as a letter when it is a member of the module-level
    ``quran_letters`` set; whitespace and every other character are skipped.

    Args:
        text: The string to scan.

    Returns:
        A list of single-character strings.
    """
    return [
        char
        for token in text.split()
        for char in token
        if char in quran_letters
    ]

def get_invariant_letters(text):
    """Return the letters of *text* with variant forms folded to a base letter.

    Only characters found in the module-level ``quran_letters`` set are kept.
    Hamza/alif forms become 'ا', 'ة' becomes 'ت', 'ؤ' becomes 'و' and 'ئ'
    becomes 'ي'; every other letter is kept unchanged. A folding only takes
    effect for characters that ``quran_letters`` actually contains.

    Args:
        text: The string to scan.

    Returns:
        A list of single-character strings, in reading order.
    """
    # Variant form -> base letter. Letters not listed map to themselves.
    base_form = {
        'ء': 'ا',
        'آ': 'ا',
        'أ': 'ا',
        'إ': 'ا',
        'ا': 'ا',
        'ى': 'ا',
        'ٱ': 'ا',
        'ة': 'ت',
        'ت': 'ت',
        'ؤ': 'و',
        'و': 'و',
        'ئ': 'ي',
        'ي': 'ي',
    }
    folded = []
    for token in text.split():
        for char in token:
            if char not in quran_letters:
                continue
            folded.append(base_form.get(char, char))
    return folded

def calculate_text_jomal(text):
    """Return the jomal total of *text*.

    The letters are extracted with ``get_letters`` and each one is looked up
    in the module-level ``letter_jomal`` table.

    Args:
        text: The string whose letters are summed.

    Returns:
        The sum of the letter values; 0 when the text contains no letters.
    """
    return sum(letter_jomal[letter] for letter in get_letters(text))
    
def calculate_letters_jomal(letters):
    """Return the jomal total of an iterable of letters.

    Args:
        letters: Iterable of single-character strings, each of which must be
            a key of the module-level ``letter_jomal`` table.

    Returns:
        The sum of the letter values; 0 for an empty iterable.
    """
    return sum(letter_jomal[letter] for letter in letters)

def calculate_cumulative_sum(number):
    """Return 0 + 1 + ... + *number*.

    Args:
        number: Upper bound of the sum (inclusive), an integer.

    Returns:
        The sum of all integers from 0 to *number*; 0 when *number* is
        negative, because the range is then empty.
    """
    return sum(range(number + 1))

def get_aya_indexes_list(aya):
    """Placeholder that is not implemented yet; currently returns None."""
    pass


# Sura name features

In [None]:
# Sum 0..order for both sura orderings.
quran['sura_cumulative_mushaf_order'] = quran['sura_mushaf_order'].apply(calculate_cumulative_sum)
quran['sura_cumulative_revel_order'] = quran['sura_revel_order'].apply(calculate_cumulative_sum)

# Letters of the sura name, raw and with variant forms folded.
quran['sura_name_letters'] = quran['sura_name'].apply(get_letters)
quran['sura_name_letters_count'] = quran['sura_name_letters'].apply(len)
quran['invariant_sura_name_letters'] = quran['sura_name'].apply(get_invariant_letters)
quran['sura_name_invariant_letters_count'] = quran['invariant_sura_name_letters'].apply(len)

# Jomal of the name and its running total down the rows.
# NOTE(review): cumsum accumulates over every row of `quran`; if there is one
# row per aya, a sura's name value is added once per aya -- confirm intended.
quran['sura_name_jomal'] = quran['sura_name_letters'].apply(calculate_letters_jomal)
quran['sura_name_cumulative_jomal'] = quran['sura_name_jomal'].cumsum()

# Distinct letters of the name, raw and folded.
quran['sura_name_unique_letters'] = quran['sura_name_letters'].apply(set)
quran['sura_name_unique_letters_count'] = quran['sura_name_unique_letters'].apply(len)
quran['sura_name_unique_invariant_letters'] = quran['invariant_sura_name_letters'].apply(set)
quran['sura_name_unique_invariant_letters_count'] = quran['sura_name_unique_invariant_letters'].apply(len)

# Jomal of the distinct letters and its running total down the rows.
quran['sura_name_unique_letters_jomal'] = quran['sura_name_unique_letters'].apply(calculate_letters_jomal)
quran['sura_name_unique_letters_cumulative_jomal'] = quran['sura_name_unique_letters_jomal'].cumsum()

# Preview the new columns for the first ten rows.
quran.head(10)[[
    'sura_cumulative_mushaf_order',
    'sura_cumulative_revel_order',
    'sura_name_letters',
    'sura_name_letters_count',
    'invariant_sura_name_letters',
    'sura_name_invariant_letters_count',
    'sura_name_jomal',
    'sura_name_cumulative_jomal',
    'sura_name_unique_letters',
    'sura_name_unique_letters_count',
    'sura_name_unique_invariant_letters',
    'sura_name_unique_invariant_letters_count',
    'sura_name_unique_letters_jomal',
    'sura_name_unique_letters_cumulative_jomal',
]]

# written (uthmani) text features

In [None]:
# Words of the written (Uthmani) aya text.
quran['aya_words'] = quran['aya_text'].apply(get_aya_words)
quran['aya_words_count'] = quran['aya_words'].apply(len)

# Letters of the aya, raw and with variant forms folded.
quran['aya_letters'] = quran['aya_text'].apply(get_letters)
quran['aya_letters_count'] = quran['aya_letters'].apply(len)
quran['aya_invariant_letters'] = quran['aya_text'].apply(get_invariant_letters)
quran['aya_invariant_letters_count'] = quran['aya_invariant_letters'].apply(len)

# Jomal of the aya and its running total down the rows.
quran['aya_jomal'] = quran['aya_letters'].apply(calculate_letters_jomal)
quran['aya_cumulative_jomal'] = quran['aya_jomal'].cumsum()

# Distinct letters of the aya, raw and folded.
quran['aya_unique_letters'] = quran['aya_letters'].apply(set)
quran['aya_unique_letters_count'] = quran['aya_unique_letters'].apply(len)
quran['aya_unique_invariant_letters'] = quran['aya_invariant_letters'].apply(set)
quran['aya_unique_invariant_letters_count'] = quran['aya_unique_invariant_letters'].apply(len)

# Jomal of the distinct letters and its running total down the rows.
quran['aya_unique_letters_jomal'] = quran['aya_unique_letters'].apply(calculate_letters_jomal)
quran['aya_unique_letters_commulative_jomal'] = quran['aya_unique_letters_jomal'].cumsum()

# Preview the new columns for the first ten rows.
quran.head(10)[[
    'aya_words',
    'aya_words_count',
    'aya_letters',
    'aya_letters_count',
    'aya_invariant_letters',
    'aya_invariant_letters_count',
    'aya_jomal',
    'aya_cumulative_jomal',
    'aya_unique_letters',
    'aya_unique_letters_count',
    'aya_unique_invariant_letters',
    'aya_unique_invariant_letters_count',
    'aya_unique_letters_jomal',
    'aya_unique_letters_commulative_jomal',
]]

# Spoken text features (note: the text written the way it sounds)

In [None]:
# Words of the spoken aya text (plain whitespace split).
quran['spoken_aya_words'] = quran['spoken_aya_text'].str.split()
quran['spoken_aya_words_count'] = quran['spoken_aya_words'].apply(len)

# Letters of the spoken aya, raw and with variant forms folded.
quran['spoken_aya_letters'] = quran['spoken_aya_text'].apply(get_letters)
quran['spoken_aya_letters_count'] = quran['spoken_aya_letters'].apply(len)
quran['spoken_aya_invariant_letters'] = quran['spoken_aya_text'].apply(get_invariant_letters)
quran['spoken_aya_invariant_letters_count'] = quran['spoken_aya_invariant_letters'].apply(len)

# Jomal of the spoken aya and its running total down the rows.
quran['spoken_aya_jomal'] = quran['spoken_aya_letters'].apply(calculate_letters_jomal)
quran['spoken_aya_cumulative_jomal'] = quran['spoken_aya_jomal'].cumsum()

# Distinct letters of the spoken aya, raw and folded.
quran['spoken_aya_unique_letters'] = quran['spoken_aya_letters'].apply(set)
quran['spoken_aya_unique_letters_count'] = quran['spoken_aya_unique_letters'].apply(len)
quran['spoken_aya_unique_invariant_letters'] = quran['spoken_aya_invariant_letters'].apply(set)
quran['spoken_aya_unique_invariant_letters_count'] = quran['spoken_aya_unique_invariant_letters'].apply(len)

# Jomal of the distinct letters and its running total down the rows.
quran['spoken_aya_unique_letters_jomal'] = quran['spoken_aya_unique_letters'].apply(calculate_letters_jomal)
quran['spoken_aya_unique_letters_commulative_jomal'] = quran['spoken_aya_unique_letters_jomal'].cumsum()

# Preview the new columns for the first ten rows.
quran.head(10)[[
    'spoken_aya_words',
    'spoken_aya_words_count',
    'spoken_aya_letters',
    'spoken_aya_letters_count',
    'spoken_aya_invariant_letters',
    'spoken_aya_invariant_letters_count',
    'spoken_aya_jomal',
    'spoken_aya_cumulative_jomal',
    'spoken_aya_unique_letters',
    'spoken_aya_unique_letters_count',
    'spoken_aya_unique_invariant_letters',
    'spoken_aya_unique_invariant_letters_count',
    'spoken_aya_unique_letters_jomal',
    'spoken_aya_unique_letters_commulative_jomal',
]]

# Fix some spacing issues

### fix spaces in aya of spoken text

In [None]:
# Pick the spoken word lists of the rows whose spoken word count differs
# from the written word count. The selected Series holds the very same list
# objects as `quran`, so editing a list in place also changes `quran`.
count_mismatch = quran['spoken_aya_words_count'] != quran['aya_words_count']
words_to_fix = quran.loc[count_mismatch, 'spoken_aya_words']
len(words_to_fix)

In [None]:
# For each mismatched row (in `words_to_fix` order), the position of the first
# of two adjacent spoken words that must be glued into a single word.
# A negative position counts from the end of the word list.
merge_starts = [
    0, 0, 14, 9, 0, 0, 0, 0, 16, 0,      # rows 0-9
    0, 0, -4, 0, 0, 0, 0, 0, 0, 17,      # rows 10-19
    0, 1, 0, 0, 0, 6, 0, 0, 1, 1,        # rows 20-29
    3, 12, 8, 6, 24, 0, 0, 0, 0, 0,      # rows 30-39
    0, 0, 0, 11, 0, 0, 0, 0, 0, 0,       # rows 40-49
    7, 0, 0, 1, 12, 11, 0, 1, 0, 0,      # rows 50-59
    0,                                   # row 60
]

for row, start in enumerate(merge_starts):
    words = words_to_fix.iloc[row]
    stop = start + 2
    # Replace the two-word slice with their concatenation, editing the list
    # object in place.
    words[start:stop] = [''.join(words[start:stop])]

In [None]:
# Recompute the spoken word counts from the current word lists.
quran['spoken_aya_words_count'] = quran['spoken_aya_words'].apply(len)

In [None]:
# Count the rows whose spoken and written word counts still differ.
still_mismatched = quran['spoken_aya_words_count'] != quran['aya_words_count']
len(quran.loc[still_mismatched, 'spoken_aya_words'])

# save our dataset to new files

### rearrange columns

In [None]:
# Keep only the columns below, in this order: sura-level features first,
# then written-text features, then spoken-text features.
quran = quran[[
    'sura_name',
    'sura_mushaf_order',
    'sura_revel_order',
    'sura_revel_place',
    'sura_cumulative_mushaf_order',
    'sura_cumulative_revel_order',
    'sura_name_letters',
    'sura_name_letters_count',
    'invariant_sura_name_letters',
    'sura_name_invariant_letters_count',
    'sura_name_jomal',
    'sura_name_cumulative_jomal',
    'sura_name_unique_letters',
    'sura_name_unique_letters_count',
    'sura_name_unique_invariant_letters',
    'sura_name_unique_invariant_letters_count',
    'sura_name_unique_letters_jomal',
    'sura_name_unique_letters_cumulative_jomal',
    'aya_index',
    'aya_no',
    'aya_text',
    'aya_words',
    'aya_words_count',
    'aya_letters',
    'aya_letters_count',
    'aya_invariant_letters',
    'aya_invariant_letters_count',
    'aya_jomal',
    'aya_cumulative_jomal',
    'aya_unique_letters',
    'aya_unique_letters_count',
    'aya_unique_invariant_letters',
    'aya_unique_invariant_letters_count',
    'aya_unique_letters_jomal',
    'aya_unique_letters_commulative_jomal',
    'spoken_aya_text',
    'spoken_aya_words',
    'spoken_aya_words_count',
    'spoken_aya_letters',
    'spoken_aya_letters_count',
    'spoken_aya_invariant_letters',
    'spoken_aya_invariant_letters_count',
    'spoken_aya_jomal',
    'spoken_aya_cumulative_jomal',
    'spoken_aya_unique_letters',
    'spoken_aya_unique_letters_count',
    'spoken_aya_unique_invariant_letters',
    'spoken_aya_unique_invariant_letters_count',
    'spoken_aya_unique_letters_jomal',
    'spoken_aya_unique_letters_commulative_jomal',
]]
# Print the resulting column list, dtypes and non-null counts.
quran.info()

### Save Dataset

In [None]:
# Make sure the output folder exists first: writing into a missing directory
# fails, which would otherwise break this cell on a fresh checkout.
import os
os.makedirs('generated_dataset', exist_ok=True)

# Write the feature table once as CSV and once as an Excel workbook,
# without the DataFrame index.
quran.to_csv('generated_dataset/quran_Hafs_numeric_features.csv',index = False)
quran.to_excel('generated_dataset/quran_Hafs_numeric_features.xlsx',index = False)