# Pahlavi Corpus Builder

Links:
- [extracting MS Word data](https://towardsdatascience.com/how-to-extract-data-from-ms-word-documents-using-python-ed3fbb48c122)
- [navigating MS Word XML data](https://virantha.com/2013/08/16/reading-and-writing-microsoft-word-docx-files-with-python/)

In [169]:
import os, zipfile, re, glob, nltk
from bs4 import BeautifulSoup

In [2]:
#set home directory path
hdir = os.path.expanduser('~')

#pahlavi corpus directory
pah_path = hdir + "/Box/Notes/Digital_Humanities/Corpora/pahlavi_corpus/"

### Glob the corpus

In [3]:
pah_files = glob.glob(pah_path + r'/*.docx')

pah_xml_corpus = {}
for longname in pah_files:
    document = zipfile.ZipFile(longname)
    txt = zipfile.ZipFile.read(document, 'word/document.xml', pwd=None)
    start = os.path.basename(longname)
    short = os.path.splitext(start)
    pah_xml_corpus[short[0]] = txt

### Assemble simple corpus divided by MS Word paragraph breaks

In [4]:
pahlavi_corpus = {}
for work in pah_xml_corpus:
    tree = BeautifulSoup(pah_xml_corpus[work])
    paras = tree.find_all("w:p")
    document = {}
    for i in range(len(paras)):
        if len(paras[i].get_text()) > 0:
            document[i] = paras[i].get_text()
    pahlavi_corpus[work] = document

In [111]:
# Example:
#pahlavi_corpus["Dēnkard 5"][100]

### Extracting the Line Numbers

In [151]:
pahlavi_corpus_lines = {}
for work in pahlavi_corpus:
    segment = {}
    for para in pahlavi_corpus[work]:
        num_pattern = re.compile(r"^[0-9]{0,3}\.[0-9]{0,3}\.?[0-9]{0,3}")
        num_match = re.match(num_pattern, pahlavi_corpus[work][para])
        if num_match is not None:
            num = num_match.group(0)
            position = num_match.end()
            line = pahlavi_corpus[work][para][position:]
        else:
            num = "--"
            line = pahlavi_corpus[work][para]
        
        segment[para] = [num, line]
        
    pahlavi_corpus_lines[work] = segment
    

In [163]:
# Example:
#pahlavi_corpus_lines["Dēnkard 5"][10]

type(pahlavi_corpus_lines["Dēnkard 5"])

dict

### Tokenizing

In [171]:
pahlavi_corpus_toks = {}
for fn in pahlavi_corpus_lines.():
    toks = nltk.word_tokenize(pahlavi_corpus_lines[fn])
    trans_nar_ext_toks[fn] = [num, toks]

SyntaxError: can't assign to literal (<ipython-input-171-25319ac0783d>, line 2)

In [170]:
nltk.word_tokenize("went to the store")

['went', 'to', 'the', 'store']

In [166]:
pahlavi_corpus_lines.items()


dict_items([('afsōn tab ud xōn abāz estādan', {0: ['--', 'afsōn tab ud xōn abāz estādan'], 1: ['--', '[TD4a398]'], 2: ['--', 'šnāyišn ī dādār Ohrmazd ud wispān yazdān ud hamāg yazdān ī mēnōyān gētīyān ∵ '], 3: ['--', 'afsōn tab '], 4: ['--', 'surahī . virsā . anmō . karošī . vašā . ahrąs . ahrs ∵'], 5: ['--', "<wlyy’m’n'> ∵ ī mād duxt ∵ rasīdan pad nām ī ān tan 3 tāg ud mād 3 tāg "], 6: ['--', 'ud agar tab ī čahārom ast ān grāy ī abar mayān abgand ud ēk az <snyD> ud ēk az <snyD> '], 7: ['--', 'ud agar mard pad bāzūg ud agar zan pad bāzūg bastan ∵'], 9: ['--', 'afsōn ud nīrang xōn abāz estādan rāy wāz sūd wahišt ud ēw tan roz afsōn 7 bār ayāb 10 ī bār ayāb 21 bār padiš abar guftan'], 10: ['--', 'afsōn ud ēn kō . kārō . nāga . kārō . '], 11: ['--', 'āb az xānag kē bastan bast pad framān ī nēw Frēdōn az kōf andar āmad tan ēw <’sp̄wlys> nihuft tan ēw <’sp̄wlys> paymōxtan ud 9 tabarzīn pad dast dārēd aniiāi ']}), ('ARDĀ WIRĀZ', {0: ['--', 'ARDĀ WIRĀZ-NĀMAG'], 1: ['--', '[K20] = Ka'], 

In [172]:
type(pahlavi_corpus_lines["Dēnkard 5"][10][1])

str