-
Notifications
You must be signed in to change notification settings - Fork 1
/
parts.py
108 lines (100 loc) · 2.76 KB
/
parts.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
# coding=utf8
import codecs, re
from pyquery import PyQuery as pq
import Levenshtein
from idlelib.IOBinding import encoding
import tashaphyne.normalize as norm
from nltk import stem
# Stemmer shared by the commentary fragments and the Quran lines below.
isri = stem.ISRIStemmer()

# Read the translated commentary HTML; the context manager releases the
# handle as soon as the text is in memory.
with codecs.open('data/output-trans.html', encoding='utf-8') as almizan_file:
    almizan = almizan_file.read()

# Fragments that fail normalisation are logged here. Opened as UTF-8 text so
# that non-ASCII markup can actually be written instead of raising on encode.
errors = codecs.open('data/parts_errors.txt', 'w', encoding='utf-8')

# Quran text, one line per aya; later code expects lines of the form
# "sura|aya|text".
with codecs.open('data/quran.txt', encoding='utf-8') as quran_file:
    quran = quran_file.readlines()

# Receives a dump of every <em> tag once tagging has finished.
tags = codecs.open('data/part-tags.html', 'w', encoding='utf-8')

# Parsed commentary document that the rest of the script queries and mutates.
d = pq(almizan)
# Minimum Levenshtein similarity for two tokens to count as the same word.
MATCH_THRESHOLD = 0.7


def _parse_section_range(rel):
    """Return (sura, aya_begin, aya_end) parsed from a section ``rel`` value.

    The sura is the first run of digits, the first aya is the first
    ``-<digits>`` run and the last aya is the first ``:<digits>`` run.
    Returns None when any of the three is missing.
    """
    sura = re.search(r'\d+', rel)
    begin = re.search(r'\-\d+', rel)
    end = re.search(r'\:\d+', rel)
    if not (sura and begin and end):
        return None
    return int(sura.group(0)), int(begin.group(0)[1:]), int(end.group(0)[1:])


def _prepare_aya(raw_aya):
    """Normalise one Quran line and return (normalised_line, word_tokens)."""
    aya = norm.normalize_searchtext(raw_aya)
    aya = aya.replace(u'ي', u'ی').replace(u'ك', u'ک')
    # NOTE(review): the stemmer is applied to the whole line, not per word;
    # kept as-is because the matching threshold depends on it.
    aya = isri.stem(aya)
    # Drop the last two characters (presumably the line terminator), split on
    # spaces, '|' and line breaks, then discard the two leading tokens, which
    # are the sura and aya numbers.
    tokens = re.split("[ |\r\n]", aya[:-2])[2:]
    return aya, tokens


def _tokens_similar(left, right, threshold):
    """Return True when the two tokens are more similar than *threshold*."""
    try:
        return Levenshtein.ratio(left, right) > threshold
    except Exception:
        # A token pair that cannot be compared is treated as a mismatch.
        return False


def _find_span(aya_tokens, part_tokens, threshold=MATCH_THRESHOLD):
    """Return (start, end) of the first window matching *part_tokens*.

    Every token of the window must be similar to the token at the same
    position in *part_tokens*. Returns None when no window matches.
    """
    width = len(part_tokens)
    for start in range(len(aya_tokens) - width + 1):
        if all(_tokens_similar(aya_tokens[start + offset], token, threshold)
               for offset, token in enumerate(part_tokens)):
            return start, start + width
    return None


for sec in d("div"):
    # Find the ayas covered by this section, then tag each <em> fragment
    # with the aya and token span it quotes.
    sec = pq(sec)
    sec_str = sec.attr('rel')
    if sec_str is None:
        continue
    print(sec_str)

    parsed = _parse_section_range(sec_str)
    if parsed is None:
        # A malformed rel used to crash the whole run; skip the section.
        print('malformed rel, section skipped')
        continue
    sura, aya_begin, aya_end = parsed

    sec_ayas = []
    for aya_index in range(aya_begin, aya_end + 1):
        # The trailing '|' stops "2|1" from matching "2|10|...".
        prefix = '%d|%d|' % (sura, aya_index)
        for pos, line in enumerate(quran):
            if line.startswith(prefix):
                sec_ayas.append(line)
                # Consume everything up to the match; this relies on the
                # sections appearing in Quran order.
                quran = quran[pos + 1:]
                break

    # Normalise each aya once per section rather than once per fragment.
    prepared_ayas = [_prepare_aya(line) for line in sec_ayas]

    # NOTE(review): counter is treated as a per-section fragment index used
    # for progress output; confirm it was not meant to count failures only.
    counter = 1
    for part in sec.find("em"):
        part = pq(part)
        raw_text = part[0].text
        try:
            part_text = isri.stem(norm.normalize_searchtext(raw_text))
        except Exception:
            # Fall back to the raw text and record the offending fragment.
            part_text = raw_text
            try:
                errors.write(part.outerHtml())
            except Exception:
                # Logging is best effort; a failed log write must not stop
                # the tagging run.
                pass
        print(counter)
        counter += 1

        # Empty and very short fragments are not matched.
        if raw_text is None or len(raw_text) <= 2:
            continue

        part_tokens = re.split(" ", part_text)
        for aya, aya_tokens in prepared_ayas:
            span = _find_span(aya_tokens, part_tokens)
            if span is None:
                continue
            start, end = span
            # Sura and aya numbers are read back from the normalised line.
            sura_num = int(re.search(r'\d+', aya).group(0))
            aya_num = int(re.search(r'\|\d+', aya).group(0)[1:])
            part.attr('rel', '%s-%s/%s-%s' % (sura_num, aya_num, start, end))
            # Only the first matching aya is recorded for a fragment.
            break
# Save the annotated document, then dump every <em> tag (one per line) to the
# tags file. Both output handles are closed even if writing fails part-way.
skipped_tags = 0
try:
    d.root.write('data/output-parts.html', encoding='utf-8')
    for tag in d('em'):
        try:
            tags.write(pq(tag).outerHtml())
            tags.write('\n')
        except Exception:
            # Best effort: a tag that cannot be serialised is skipped, but
            # counted so the loss is visible.
            skipped_tags += 1
finally:
    tags.close()
    errors.close()
if skipped_tags:
    print('%d em tags could not be written' % skipped_tags)
print('aya parts tagged!\n')