# Pahlavi Corpus Builder

Links:
- [extracting MS Word data](https://towardsdatascience.com/how-to-extract-data-from-ms-word-documents-using-python-ed3fbb48c122)
- [navigating MS Word XML data](https://virantha.com/2013/08/16/reading-and-writing-microsoft-word-docx-files-with-python/)

In [47]:
import os, zipfile, re, glob, nltk, pickle
import pandas as pd
from bs4 import BeautifulSoup
from collections import defaultdict

# explanation of default dict: https://www.geeksforgeeks.org/defaultdict-in-python/

In [2]:
#set home directory path
hdir = os.path.expanduser('~')

#pahlavi corpus directory
pah_path = hdir + "/Box/Notes/Digital_Humanities/Corpora/pahlavi_corpus/"

#pickle path
pickle_path = hdir + "/Box/Notes/Digital_Humanities/Corpora/pickled_tokenized_cleaned_corpora"

### Glob the corpus

In [3]:
pah_files = glob.glob(pah_path + r'/*.docx')

pah_xml_corpus = {}
for longname in pah_files:
    document = zipfile.ZipFile(longname)
    txt = zipfile.ZipFile.read(document, 'word/document.xml', pwd=None)
    start = os.path.basename(longname)
    short = os.path.splitext(start)
    pah_xml_corpus[short[0]] = txt

### Assemble simple corpus divided by MS Word paragraph breaks

In [4]:
pahlavi_corpus = {}
for work in pah_xml_corpus:
    tree = BeautifulSoup(pah_xml_corpus[work])
    paras = tree.find_all("w:p")
    document = {}
    for i in range(len(paras)):
        if len(paras[i].get_text()) > 0:
            document[i] = paras[i].get_text()
    pahlavi_corpus[work] = document

In [11]:
# Example:
pahlavi_corpus["nĪrang ī āb"]

{0: 'nĪrang ī āb',
 1: 'Also WD.89[ML / 4050 / 303v]',
 2: '[ML / 4050 / 303v]',
 3: '[TUL 11263_294r]',
 4: '[DZ 4010_ 292r]',
 5: '[Nik 4040_ 294v]',
 6: ' Nīrang ī āb ud pādyāb yaštan',
 7: 'fradom kār ēn kū ōy kū āb ud  pādyāb // kunēd naxust xwēš-tan pad baršnūm bē abāyēd šustan ',
 8: 'ud ka-š // 3 3 \\\\ 3 šabag dāšt bawēd āb pad karbās  ī pad-pādyāb pālūdan ud pad  ǰāmag ī // \\\\ pad-pādyāb andar kunišn ',
 9: 'gōmēz  az gāw ī gušn ka nē ān ī wādag  // šāyēd \\\\ bē kunišn ',
 10: 'ud andar ǰāmag ī pad-pādy<āb  xūbīhā andar kunišn ',
 11: 'u-š sar  // \\\\ bē nihumbišn ud az xrafstar ud abārīg  rēmanīh pad pahrēz dārišn ',
 12: 'ud aw>ēšān kē āb  // \\\\ ud g[ōmēz yazēnd yaštan tan p<ad baršnūm bē šōyišn ',
 13: 'ka 3 3 3 > šabag xub [dāšt // ēg-išān 30 <gām frāz kunišn ',
 14: 'yašt ī 3 paywa \\\\ nd abāg kas ī h[u-xēmtar ī awest<wārtar ī // rāst>-Abestāg \\\\ tar (narm-Abestāg-tar) xūb-n[ērangtar ud dēn-āgāhtar [ML_304r] bē < kunišn // ',
 15: 'ud ān kas kē zōdīh k>unēd šab<

### Extracting the Line Numbers

In [5]:
# currently does not work for works that lack line numbers, e.g. nĪrang ī āb

pahlavi_corpus_lines = {}
for work in pahlavi_corpus:
    segment = {}
    # reset the paragraph and line number for each work
    prev_para = None
    prev_num_match = None
    for para in pahlavi_corpus[work]:
        num_pattern = re.compile(r'^.*(\.[0-9]{1,3}){1,3}')
        num_match = re.match(num_pattern, pahlavi_corpus[work][para])
        if num_match is not None:
            num = num_match.group(0)
            position = num_match.end()
            line = pahlavi_corpus[work][para][position:]
        elif prev_num_match is not None:
            pahlavi_corpus[work][prev_para] += pahlavi_corpus[work][para]
        else:
            num = "--"
            line = pahlavi_corpus[work][para]
        
        segment[num] = line
        prev_num_match = num_match
        prev_para = para
        
    pahlavi_corpus_lines[work] = segment
    

In [30]:
#pahlavi_corpus_lines.keys()
pahlavi_corpus_lines["nĪrang ī āb"]

{'--': '[TUL 11263_295v col.]', 'Also WD.89': ''}

### Flat Indexing

In [44]:
# list of tuples

#doc = pahlavi_corpus_lines["ARDĀ WIRĀZ"]
#sum([[(ln, pos, tok) for pos, tok in enumerate(line.split())] for ln, line in doc.items()], [])

# any advantage to using nltk.word_tokenize() instead of split()?

pahlavi_flat_corpus = []
for work in pahlavi_corpus_lines:
    doc = pahlavi_corpus_lines[work]
    output = sum([[(work, ln, pos, tok) for pos, tok in enumerate(line.split())] for ln, line in doc.items()], [])
    pahlavi_flat_corpus += output

In [46]:
#pahlavi_flat_corpus

### Pass to Dataframe

In [51]:
# pass to dataframe: pd.DataFrame([(1,2,3), (2,3,4)], columns=['a', 'b', 'c'])

#pd.DataFrame(pahlavi_flat_corpus, columns=['title', 'line', 'index', 'token'])
pd.DataFrame(pahlavi_flat_corpus, columns=['title', 'line', 'index', 'token']).to_csv(os.path.join(pickle_path,r'pahlavi_corpus.csv'), index=False)