Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.
Download ZIP
Browse files

Added function to strip cantillation marks and vowels

  • Loading branch information...
commit ce22709abe32e622edd20a79d5fd46e1217ed578 1 parent 4f577c4
@jag3773 jag3773 authored
Showing 2 changed files with 306,798 additions and 1 deletion.
  1. +14 −1 oxlos-import/ConvertWLC.py
  2. +306,784 −0 oxlos-import/wlc_cons.txt
View
15 oxlos-import/ConvertWLC.py
@@ -39,6 +39,8 @@ def __init__(self):
"Jer","Lam","Ezek","Dan","Hos","Joel","Amos","Obad","Jonah","Mic","Nah","Hab","Zeph","Hag","Zech","Mal"]
self.wlcflat = 'wlc_flat.txt'
self.wlcnocant = 'wlc_nocant.txt'
+ self.wlcconsfile = 'wlc_cons.txt'
+ self.vowels = [u'\u05B0', u'\u05B1', u'\u05B2', u'\u05B3', u'\u05B4', u'\u05B5', u'\u05B6', u'\u05B7', u'\u05B8', u'\u05B9', u'\u05BA', u'\u05BB', u'\u05BC', u'\u05BD', u'\u05BE', u'\u05BF', u'\u05C0', u'\u05C1', u'\u05C2', u'\u05C3', u'\u05C4', u'\u05C5', u'\u05C6', u'\u05C7', u'\u05C8', u'\u05C9', u'\u05CA', u'\u05CB', u'\u05CC', u'\u05CD', u'\u05CE', u'\u05CF']
self.cant = [u'\u0590', u'\u0591', u'\u0592', u'\u0593', u'\u0594', u'\u0595', u'\u0596', u'\u0597', u'\u0598', u'\u0599', u'\u059A', u'\u059B', u'\u059C', u'\u059D', u'\u059E', u'\u059F',u'\u05A0', u'\u05A1', u'\u05A2', u'\u05A3', u'\u05A4', u'\u05A5', u'\u05A6', u'\u05A7', u'\u05A8', u'\u05A9', u'\u05AA', u'\u05AB', u'\u05AC', u'\u05AD', u'\u05AE', u'\u05AF']
def normalize(self, data):
@@ -46,13 +48,21 @@ def normalize(self, data):
data = data.replace(cant, '')
return data
+ def keepconsonantsonly(self, data):
+ for cant in self.cant:
+ data = data.replace(cant, '')
+ for vowel in self.vowels:
+ data = data.replace(vowel, '')
+ return data
+
def transform(self):
print 'Trying to delete old %s' % self.wlcflat
try: os.remove(self.wlcflat)
except: pass
- print "Creating flat file..."
+ print "Creating flat files..."
self.wlcf = open(self.wlcflat, 'w')
self.wlcnocantf = open(self.wlcnocant, 'w')
+ self.wlcconsf = open(self.wlcconsfile, 'w')
for self.book in self.books:
print self.book
bookxml = minidom.parse('./%s/%s.xml' % (self.bookdir, self.book))
@@ -72,13 +82,16 @@ def transform(self):
# Previous format: '%s, %d, %d, %s, %s'
print >> self.wlcf, '%s %d:%d.%d\t%s\t%s' % (self.book.strip('.xml'), self.c, self.v, self.elnum, el.attributes['lemma'].value.encode('utf-8'), el.firstChild.data.encode('utf-8'))
print >> self.wlcnocantf, '%s %d:%d.%d\t%s\t%s' % (self.book.strip('.xml'), self.c, self.v, self.elnum, el.attributes['lemma'].value.encode('utf-8'), self.normalize(el.firstChild.data).encode('utf-8'))
+ print >> self.wlcconsf, '%s %d:%d.%d\t%s\t%s' % (self.book.strip('.xml'), self.c, self.v, self.elnum, el.attributes['lemma'].value.encode('utf-8'), self.keepconsonantsonly(el.firstChild.data).encode('utf-8'))
except KeyError:
print >> self.wlcf, '%s %d:%d.%d\t%s\t%s' % (self.book.strip('.xml'), self.c, self.v, self.elnum, '0', el.firstChild.data.encode('utf-8'))
print >> self.wlcnocantf, '%s %d:%d.%d\t%s\t%s' % (self.book.strip('.xml'), self.c, self.v, self.elnum, '0', self.normalize(el.firstChild.data).encode('utf-8'))
+ print >> self.wlcconsf, '%s %d:%d.%d\t%s\t%s' % (self.book.strip('.xml'), self.c, self.v, self.elnum, '0', self.keepconsonantsonly(el.firstChild.data).encode('utf-8'))
self.v += 1
self.c += 1
self.wlcf.close()
self.wlcnocantf.close()
+ self.wlcconsf.close()
if __name__ == '__main__':
c = ConvertWLC()
View
306,784 oxlos-import/wlc_cons.txt
306,784 additions, 0 deletions not shown
Please sign in to comment.
Something went wrong with that request. Please try again.