In [10]:
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import xml.etree.ElementTree as ET

# Data: Mushaf Al-Madinah, Hafs narration, version 18, from the King Fahad Quran Complex. Their digital Quran and fonts are available at "https://dm.qurancomplex.gov.sa/"

In [11]:
# Map the raw CSV headers onto the column names used in the rest of the notebook.
# All renames are applied in a single pass, so the original 'sora' column becomes
# 'sora_quran_order' while 'sora_name_ar' takes over the name 'sora'.
column_names = {
    'id': 'aya_index',
    'aya_text_emlaey': 'plain_text_aya',
    'aya_text': 'uthmani_text_aya',
    'sora': 'sora_quran_order',
    'sora_name_ar': 'sora',
    'line_start': 'aya_line_start',
    'line_end': 'aya_line_end',
}
mushaf_madinah = pd.read_csv('data/hafsData_v18.csv').rename(columns=column_names)
# `quran` is the working frame; it is the same object as `mushaf_madinah`, not a copy.
quran = mushaf_madinah
quran.head()

## Additional Quran metadata from Tanzil: "https://tanzil.net/docs/"

In [12]:
# Parse the Tanzil metadata XML and build two lookup tables from it.
tanzil_quran_metadata_root = ET.parse('data/tanzil_quran_metadata.xml').getroot()
# sura index -> [value of the 'order' attribute, value of the 'type' attribute]
suras = {}
# (sura index, aya number) -> 'index' attribute of the entry listed under <hizbs>
quarter = {}
for child in tanzil_quran_metadata_root.find('suras'):
    suras[int(child.attrib['index'])] = [child.attrib['order'], child.attrib['type']]
for child in tanzil_quran_metadata_root.find('hizbs'):
    quarter[int(child.attrib['sura']), int(child.attrib['aya'])] = int(child.attrib['index'])

# The 'order' attribute is kept as the string read from the XML.
quran['sora_revel_order'] = quran['sora_quran_order'].apply(lambda x: suras[x][0])
# Translate the 'type' attribute to Arabic: 'Meccan' -> 'مكية', anything else -> 'مدنية'.
quran['sora_revel_place'] = quran['sora_quran_order'].apply(lambda x: 'مكية' if suras[x][1] == 'Meccan' else 'مدنية')

# Only ayat that have an entry under <hizbs> get a value from the lookup; every
# other aya inherits the value of the closest preceding aya via forward fill.
# `.ffill()` replaces the deprecated `fillna(method='ffill')` spelling.
quarter_index = quran.apply(
    lambda row: quarter.get((row['sora_quran_order'], row['aya_no'])), axis=1
)
# Dividing the entry index by 4 expresses the position in hizb units.
quran['hizb'] = quarter_index.ffill() / 4

# Calculate Jomal (حساب الجمل, Abjad numerals)
Here is a useful link: "http://www.islamnoon.com/new/Derasat/Erhasat/e_hesab.htm"

In [13]:
# Every distinct non-whitespace character that occurs anywhere in the plain text.
quran_letters = {
    ch
    for chunk in ''.join(quran['plain_text_aya']).split()
    for ch in chunk
}
# 'ٱ' is added explicitly so that it is always accepted by get_letters
# (presumably it only occurs in the Uthmani text -- TODO confirm).
quran_letters.add('ٱ')

# Jomal (Abjad numeral) value assigned to each character.
chars_jomal = {
    'ٱ': 1,
    'ؤ': 6,
    'ا': 1,
    'أ': 1,
    'ش': 300,
    'ء': 1,
    'ذ': 700,
    'ت': 400,
    'ر': 200,
    'ى': 1,
    'ز': 7,
    'ص': 90,
    'د': 4,
    'ل': 30,
    'إ': 1,
    'ك': 20,
    'س': 60,
    'ط': 9,
    'ض': 800,
    'غ': 1000,
    'آ': 1,
    'ن': 50,
    'ح': 8,
    'ظ': 900,
    'ئ': 10,
    'ي': 10,
    'ف': 80,
    'م': 40,
    'ق': 100,
    'ة': 400,
    'خ': 600,
    'ب': 2,
    'ج': 3,
    'و': 6,
    'ث': 500,
    'ه': 5,
    'ع': 70,
}

def calculate_aya_jomal(letters):
    """Return the sum of the Jomal values of `letters`.

    `letters` is any iterable of single characters; each one is looked up in
    the module-level `chars_jomal` table. An empty iterable yields 0.

    Raises:
        KeyError: if a character has no entry in `chars_jomal`.
    """
    return sum(chars_jomal[letter] for letter in letters)
def get_letters(text):
    """Return the letters of `text` as a list, with letter variants unified.

    Characters that are not in the module-level `quran_letters` set are
    dropped. The remaining characters are kept in order, after folding the
    variants below onto one base letter each; any other letter is kept as is.
    """
    # variant -> base letter
    base_letter = {
        'ء': 'ا',
        'آ': 'ا',
        'أ': 'ا',
        'إ': 'ا',
        'ى': 'ا',
        'ٱ': 'ا',
        'ة': 'ت',
        'ؤ': 'و',
        'ئ': 'ي',
    }
    return [base_letter.get(ch, ch) for ch in text if ch in quran_letters]

def get_words(aya):
    """Split an aya string into its words, without the surrounding markers.

    The text is split on whitespace and then cleaned up:

    * the last token is always dropped (presumably the aya-number marker that
      ends each Uthmani aya -- confirm against the data file);
    * a leading '۞' token is dropped;
    * a trailing '۩' is stripped from the last word; if the mark was a token
      of its own, that token is removed instead of leaving an empty word.

    Returns a list of words. The list is empty when nothing is left after the
    clean-up (for example for an empty string), instead of raising IndexError.
    """
    words = aya.split()[:-1]
    if words and words[0] == '۞':
        words = words[1:]
    if words and words[-1].endswith('۩'):
        words[-1] = words[-1][:-1]
        # The mark stood alone: do not report an empty string as a word.
        if not words[-1]:
            words.pop()
    return words

# Sora name features

In [14]:
# Letter-level features of the sora name stored on each row.
sora_name_letters = quran['sora'].apply(get_letters)
quran['sora_name_letters'] = sora_name_letters
quran['sora_name_letters_count'] = sora_name_letters.apply(len)

# The same features restricted to the distinct letters of the name.
sora_name_unique_letters = sora_name_letters.apply(set)
quran['sora_name_unique_aya_letters'] = sora_name_unique_letters
quran['sora_name_unique_aya_letters_count'] = sora_name_unique_letters.apply(len)

# Jomal totals over all letters and over the distinct letters only.
quran['sora_name_jomal'] = sora_name_letters.apply(calculate_aya_jomal)
quran['sora_name_unique_aya_letters_jomal'] = sora_name_unique_letters.apply(calculate_aya_jomal)

# Plain text features (note: the plain text is written the way it sounds when spoken)

In [15]:
# Word-level features: whitespace-separated words of the plain text.
plain_words = quran['plain_text_aya'].str.split()
quran['plain_text_aya_words'] = plain_words
quran['plain_text_aya_words_count'] = plain_words.apply(len)

# Letter-level features: the words are glued together before extracting letters.
plain_letters = plain_words.apply(lambda words: get_letters(''.join(words)))
quran['plain_text_aya_letters'] = plain_letters
quran['plain_text_aya_letters_count'] = plain_letters.apply(len)

# The same features restricted to the distinct letters of each aya.
plain_unique_letters = plain_letters.apply(set)
quran['plain_text_unique_aya_letters'] = plain_unique_letters
quran['plain_text_unique_aya_letters_count'] = plain_unique_letters.apply(len)

# Jomal totals over all letters and over the distinct letters only.
quran['plain_text_aya_jomal'] = plain_letters.apply(calculate_aya_jomal)
quran['plain_text_unique_aya_letters_jomal'] = plain_unique_letters.apply(calculate_aya_jomal)

# uthmani text features

In [16]:
# Word-level features: get_words removes the markers that are not words.
uthmani_words = quran['uthmani_text_aya'].apply(get_words)
quran['uthmani_text_aya_words'] = uthmani_words
quran['uthmani_text_aya_words_count'] = uthmani_words.apply(len)

# Letter-level features: the words are glued together before extracting letters.
uthmani_letters = uthmani_words.apply(lambda words: get_letters(''.join(words)))
quran['uthmani_text_aya_letters'] = uthmani_letters
quran['uthmani_text_aya_letters_count'] = uthmani_letters.apply(len)

# The same features restricted to the distinct letters of each aya.
uthmani_unique_letters = uthmani_letters.apply(set)
quran['uthmani_text_unique_aya_letters'] = uthmani_unique_letters
quran['uthmani_text_unique_aya_letters_count'] = uthmani_unique_letters.apply(len)

# Jomal totals over all letters and over the distinct letters only.
quran['uthmani_text_aya_jomal'] = uthmani_letters.apply(calculate_aya_jomal)
quran['uthmani_text_unique_aya_letters_jomal'] = uthmani_unique_letters.apply(calculate_aya_jomal)

### Fix spaces in words (61 places with unnecessary spaces in the plain text)

In [17]:
# Rows whose word count differs between the plain (spoken) text and the
# Uthmani (written) text. In each of them one plain-text word is split in two
# by an extra space, so the two neighbouring pieces are glued back together.
#
# merge_offsets[i] is the position, inside the i-th mismatching row, of the
# first of the two pieces to merge. A negative offset counts from the end of
# the row's word list.
merge_offsets = [
    0, 0, 14, 9, 0, 0, 0, 0, 16, 0,
    0, 0, -4, 0, 0, 0, 0, 0, 0, 17,
    0, 1, 0, 0, 0, 6, 0, 0, 1, 1,
    3, 12, 8, 6, 24, 0, 0, 0, 0, 0,
    0, 0, 0, 11, 0, 0, 0, 0, 0, 0,
    7, 0, 0, 1, 12, 11, 0, 1, 0, 0,
    0,
]

count_mismatch = quran['plain_text_aya_words_count'] != quran['uthmani_text_aya_words_count']
words_to_fix = quran.loc[count_mismatch, 'plain_text_aya_words']

# The offsets are tied to the exact order of the mismatching rows. Too few
# rows means the data no longer matches this table, so stop with a clear
# message. No rows at all means the fix was already applied (the cell was
# re-run); the loop below then simply does nothing.
if 0 < len(words_to_fix) < len(merge_offsets):
    raise ValueError(
        'expected %d ayat with a word-count mismatch, found only %d; '
        'the hard-coded merge offsets do not match this data'
        % (len(merge_offsets), len(words_to_fix))
    )

# The Series holds the very same list objects as quran['plain_text_aya_words'],
# so editing a list in place also fixes the value stored in `quran`.
for words, start in zip(words_to_fix, merge_offsets):
    words[start:start + 2] = [''.join(words[start:start + 2])]

# Refresh the counts now that the word lists have changed.
quran['plain_text_aya_words_count'] = quran['plain_text_aya_words'].apply(len)

### rearrange columns

In [18]:
# Final column layout of the dataset, grouped by what the columns describe.
ordered_columns = [
    # position in the Mushaf
    'page',
    'jozz',
    'hizb',
    # sora
    'sora',
    'sora_quran_order',
    'sora_revel_order',
    'sora_revel_place',
    'sora_name_letters',
    'sora_name_letters_count',
    'sora_name_jomal',
    'sora_name_unique_aya_letters',
    'sora_name_unique_aya_letters_count',
    'sora_name_unique_aya_letters_jomal',
    # aya
    'aya_index',
    'aya_no',
    'aya_line_start',
    'aya_line_end',
    # Uthmani text
    'uthmani_text_aya',
    'uthmani_text_aya_words',
    'uthmani_text_aya_words_count',
    'uthmani_text_aya_letters',
    'uthmani_text_aya_letters_count',
    'uthmani_text_unique_aya_letters',
    'uthmani_text_unique_aya_letters_count',
    'uthmani_text_aya_jomal',
    'uthmani_text_unique_aya_letters_jomal',
    # plain text
    'plain_text_aya',
    'plain_text_aya_words',
    'plain_text_aya_words_count',
    'plain_text_aya_letters',
    'plain_text_aya_letters_count',
    'plain_text_unique_aya_letters',
    'plain_text_unique_aya_letters_count',
    'plain_text_aya_jomal',
    'plain_text_unique_aya_letters_jomal',
]
# Columns that are not listed above are dropped from the frame.
quran = quran[ordered_columns]

In [19]:
# Print a summary of the final frame (columns, non-null counts, dtypes) as a
# quick check that every selected column is present and filled.
quran.info()

# Save the dataset to a new file

In [26]:
# Write the final frame to a CSV file in the current working directory,
# without the row index.
# NOTE(review): the columns that hold lists or sets are written as their text
# representation, so they come back as plain strings when the CSV is re-read.
quran.to_csv('quran_Hafs_numeric_features.csv',index = False)