# Corpora Cleaning, Tokenizing, Pickling

### Libraries

In [48]:
import sys
sys.path.append('/Users/Enkidu/Documents/digital_humanities/python_routines/')

import arabic_cleaning as ac

In [49]:
import nltk, os, glob, pickle

## Corpus Globbing Section

### Corpus Build: Machine-Readable Central Asian Persian Texts
Corpus of complete texts edited by others composed in early modern Transoxania.

In [50]:
# Glob pattern for the Transoxania corpus, assembled from adjacent string
# literals. A backslash at the end of a line inside a raw string is NOT a line
# continuation: the backslash and the newline both end up in the path, so the
# pattern can never match a real directory.
trans_corpus_pattern = (
    '/Users/Enkidu/Box Sync/Notes/Digital Humanities/'
    'Corpora/machine_readable_persian_transoxania_texts//**/*.txt'
)
trans_corpus_files = glob.glob(trans_corpus_pattern, recursive=True)

# Build {file name without extension: full text of the file}.
trans_corpus = {}
for longname in trans_corpus_files:
    # Decode explicitly instead of relying on the platform default encoding.
    # NOTE(review): assumes the corpus files are UTF-8 -- confirm.
    with open(longname, encoding='utf-8') as f:
        txt = f.read()
    short, _ext = os.path.splitext(os.path.basename(longname))
    trans_corpus[short] = txt

# Display the loaded text names as the cell output.
trans_corpus.keys()


dict_keys(['khumuli', 'samarat', 'ikromcha', 'proshenie_k_general-gubernatory_ser721', 'rasskaz_praviteli_shahrisabz_ser724', 'prisoedineniia_samarkand_ser723', 'damla_abid_akhund_ser722', 'tarikh-i_jadida_tashkent_ser725', 'tuhfa-ahli-bukhara_ser25', 'darbandi_alexiii_coronation_ser728', 'tuhfa-i_taib_ser726'])

### Corpus Build: Machine-Readable Indo-Persian Texts
Corpus of complete texts edited by others composed in early modern India.

In [51]:
# Glob pattern for the Indo-Persian corpus, assembled from adjacent string
# literals. A backslash-newline inside a raw string is kept verbatim (it is
# not a line continuation), which would embed "\" + newline in the path and
# make the pattern match nothing.
indo_corpus_pattern = (
    '/Users/Enkidu/Box Sync/Notes/Digital Humanities/'
    'Corpora/indo-persian_corpora//**/*.txt'
)
indo_corpus_files = glob.glob(indo_corpus_pattern, recursive=True)

# Build {file name without extension: full text of the file}.
indo_corpus = {}
for longname in indo_corpus_files:
    # Decode explicitly instead of relying on the platform default encoding.
    # NOTE(review): assumes the corpus files are UTF-8 -- confirm.
    with open(longname, encoding='utf-8') as f:
        txt = f.read()
    short, _ext = os.path.splitext(os.path.basename(longname))
    indo_corpus[short] = txt
    
# Add in personally transcribed texts (every .txt under "Indian Manuscripts",
# searched recursively).

indo_man_files = glob.glob(r'/Users/Enkidu/Box Sync/Notes/Primary Sources/non-machine-readable_notes/Indian Manuscripts//**/*.txt', recursive=True)

# Build {file name without extension: full text of the file}.
indo_man = {}
for longname in indo_man_files:
    # Decode explicitly instead of relying on the platform default encoding.
    # NOTE(review): assumes the transcriptions are saved as UTF-8 -- confirm.
    with open(longname, encoding='utf-8') as f:
        txt = f.read()
    short, _ext = os.path.splitext(os.path.basename(longname))
    indo_man[short] = txt


#indo_corpus.keys()
# Display the loaded manuscript names as the cell output.
indo_man.keys()

dict_keys(['ratan_lal_tuhfa-i_dakkan_ser783', 'jilani_salar_al-intizam_ser789', 'qanuncha_adalat_ser788', 'dustur_al-amal_ser790', 'gawhar_khan_waqa-i_shaykh_dalil_ser796', 'hidayat-i_zururiyya_kotwali_ser791', 'muhammad_al-madrasi_minhaj_al-adala_ser801', 'shahjahanpuri_yadgar-i_makhan_lal_ser780', 'awrangabadi_gul-i_rana_ser794'])

### Corpus Build: Manuscript Notes
Corpus based on partially transcribed manuscripts from early modern Transoxania.

In [52]:
# Glob pattern for the non-machine-readable manuscript notes (single directory,
# not recursive). Assembled from adjacent string literals: a backslash-newline
# inside a raw string stays in the string, so it would corrupt the path.
nmr_pattern = (
    '/Users/Enkidu/Box Sync/Notes/'
    'Primary Sources/non-machine-readable_notes/bactriana_notes/*.txt'
)
nmr_files = glob.glob(nmr_pattern)

# Build {file name without extension: full text of the file}.
raw_notes_corpus = {}
for longname in nmr_files:
    # `with` guarantees the handle is closed even if read() raises; the
    # encoding is explicit rather than platform-dependent.
    # NOTE(review): assumes the notes are saved as UTF-8 -- confirm.
    with open(longname, encoding='utf-8') as f:
        txt = f.read()
    short, _ext = os.path.splitext(os.path.basename(longname))
    raw_notes_corpus[short] = txt

# Adding in Markdown-stage files, not yet converted to XML.
# The pattern is assembled from adjacent string literals: a backslash-newline
# inside a raw string stays in the string, so it would corrupt the path.
md_pattern = (
    '/Users/Enkidu/Box Sync/Notes/'
    'Primary Sources/transcription_markdown_drafting_stage1/'
    'document_conversion_backlog/pre-parser_backlog/**/*.txt'
)
md_files = glob.glob(md_pattern, recursive=True)

# Build {file name without extension: full text of the file}.
md_notes_corpus = {}
for longname in md_files:
    # `with` guarantees the handle is closed even if read() raises; the
    # encoding is explicit rather than platform-dependent.
    # NOTE(review): assumes the drafts are saved as UTF-8 -- confirm.
    with open(longname, encoding='utf-8') as f:
        txt = f.read()
    short, _ext = os.path.splitext(os.path.basename(longname))
    md_notes_corpus[short] = txt

#raw_notes_corpus.keys()
#md_notes_corpus.keys()
#md_notes_corpus['apsa_524']




### Corpus Build: XML Documents
Corpus based on transcribed XML documents from early modern Transoxania.

In [53]:
# Python 3.5 and newer supports recursive **/ functionality, i.e. cycle through all subdirectories.

xml_files = glob.glob(r'/Users/Enkidu/Box Sync/Notes/Primary Sources/xml_notes_stage2/**/*.xml', recursive=True)


# Loop through the file names and build a dictionary of
# key (file name without extension): value (raw text content, markup included).

xml_corpus = {}
for longname in xml_files:
    # `with` guarantees the handle is closed even if read() raises; the
    # encoding is explicit rather than platform-dependent.
    # NOTE(review): assumes the XML files are UTF-8 -- confirm.
    with open(longname, encoding='utf-8') as f:
        txt = f.read()
    short, _ext = os.path.splitext(os.path.basename(longname))
    xml_corpus[short] = txt

#xml_corpus['TsGARUz_i126_1_601_6_ser187']
#xml_corpus.keys()


### Defunct method: [creating an NLTK corpus](http://www.nltk.org/book/ch02.html#loading-your-own-corpus)

```python

os.chdir('/Users/Enkidu/Documents/digital_humanities/jupyter_notebooks')
corpus_root = 'machine_readable_persian_transoxania_texts'
turkestan_corpus = PlaintextCorpusReader(corpus_root, '.*')
turkestan_corpus.fileids()
```

## Persian Literature Digital Corpus
Massive corpus of Persian literature, pulled from Ganjoor (http://ganjoor.net/) by Roshan (https://persdigumd.github.io/PDL/)

*Corpus pre-cleaned, tokenized, and pickled by a separate script. (Cleaning takes a long time and this corpus rarely changes, so it does not need to be re-run here.)*

In [45]:
# Load the pre-tokenized Persian literature corpus produced by a separate script.
# SECURITY NOTE: pickle.load can execute arbitrary code while unpickling; only
# load pickle files that you created yourself.
# `with` closes the file even if unpickling fails.
with open('/Users/Enkidu/Box Sync/Notes/Digital Humanities/Corpora/corpus_scripts/persian_lit_toks.pkl', 'rb') as f:
    pers_lit_toks = pickle.load(f)

In [47]:
#pers_lit_toks.keys()
#pers_lit_toks["hafez.masnavi"][:50]
#pers_lit_toks['ferdowsi.shahnameh']

#type (pers_lit_toks['ferdowsi.shahnameh'][5])

## Cleaning

Now a function is pulled from an external file (arabic_cleaning.py). Previous method saved for posterity:

```python
clean_edited_i = {}
for fn in raw_edited_corpus:
    clean_edited_i[fn] = re.sub(r'ي', 'ی', raw_edited_corpus[fn])

clean_edited = {}
for fn in clean_edited_i:
    clean_edited[fn] = re.sub(r'[^آابپتثجچحخدذرزژسشصضطظعغفقکگلمنوهس ي یی ]', '', clean_edited_i[fn])
```


### Cleaning edited texts and notes

In [54]:
# Run every raw corpus through the shared cleaner in arabic_cleaning.py.

## TO DO: figure out more efficient way of doing the for loop, add in ک swaps too

def _clean_corpus(raw_corpus):
    """Apply ac.clean_document to each text, keeping the same name keys."""
    cleaned = {}
    for name, text in raw_corpus.items():
        cleaned[name] = ac.clean_document(text)
    return cleaned


# Transoxania corpus (transcribed by others)
clean_trans = _clean_corpus(trans_corpus)

# Indo-Persian corpus (transcribed by others)
clean_indo = _clean_corpus(indo_corpus)

# Manual Indo-Persian corpus (transcribed by me)
clean_indo_man = _clean_corpus(indo_man)

# XML-stage texts
clean_xml = _clean_corpus(xml_corpus)

# Raw notes
clean_notes = _clean_corpus(raw_notes_corpus)

# Markdown notes
clean_markdown = _clean_corpus(md_notes_corpus)


#clean_trans['ikromcha'][:1000]
#clean_trans['ikromcha'][:1000]

#clean_markdown['apsa_76']

#clean_xml['ser561']

#clean_indo['mu_vol1'][:1000]

### Cleaning XML documents

*Dormant XML cleaning method using BeautifulSoup (still in use for Persian literature tokenization in separate script)*

```python
bstree = bs4.BeautifulSoup(clean_xml["ser561"], 'lxml')


print(bstree.get_text())

clean_xml = {}
for fn in raw_xml:
    bstree = bs4.BeautifulSoup(raw_xml[fn], 'lxml')
    clean_xml[fn] = bstree.get_text()
    
clean_xml['TsGARUZ_i126_1_1986_1_ser201']
```



## Tokenizing

In [57]:

def _tokenize_corpus(clean_corpus):
    """Tokenize each cleaned text with nltk.word_tokenize.

    Takes a dict of {text name: cleaned string} and returns a new dict of
    {text name: list of tokens} with the same keys.
    """
    return {fn: nltk.word_tokenize(txt) for fn, txt in clean_corpus.items()}


# Transoxania texts edited by others
edited_toks = _tokenize_corpus(clean_trans)

# Indo-Persian texts edited by others
indo_toks = _tokenize_corpus(clean_indo)

# Indo-Persian manuscripts transcribed manually
indo_toks_man = _tokenize_corpus(clean_indo_man)

# Raw manuscript notes
notes_toks = _tokenize_corpus(clean_notes)

# Markdown-stage notes. The earlier loop stored these tokens in notes_toks,
# which left markdown_toks empty and mixed the markdown notes into the
# manuscript-notes corpus; they now land in their own dictionary.
markdown_toks = _tokenize_corpus(clean_markdown)

# XML-stage documents
xml_toks = _tokenize_corpus(clean_xml)
    

In [None]:
#xml_toks['TsGARUz_R-2678_ser184'][50:70]

#notes_toks["jung_i_mahzar_va_rivayat_al_biruni_9767"]

#indo_toks['mu_vol1'][:50]

### Pickling Corpora

In [112]:
# Persist the six per-corpus token dictionaries as a single tuple; anything
# that unpickles this file must unpack them in the same order.
corpora_bundle = (
    edited_toks,
    indo_toks,
    indo_toks_man,
    notes_toks,
    markdown_toks,
    xml_toks,
)
with open("pickled_refined_data/corpora.pkl", "wb") as out_file:
    pickle.dump(corpora_bundle, out_file)

### Merging Corpuses

Sub-Corpuses:
- (a) Indic Persian Manuscripts: corpus + manuscript notes
- (b) Transoxania Manuscripts: corpus + manuscript notes
- (c) Persian literature
- (d) Documents (right now all together, regardless of location)

Combined Corpora:
- (i) History: a + b
- (ii) Literature: c
- (iii) Documents: d


In [63]:
# Merging dictionaries: https://www.webucator.com/how-to/how-merge-dictionaries-python.cfm

def _merged(*corpora):
    """Return a new dict layering the given dicts left to right.

    When the same key appears in more than one dict, the right-most value wins.
    """
    combined = {}
    for corpus in corpora:
        combined.update(corpus)
    return combined


# --- Sub-corpora ---

# (a) All Indic manuscripts: manual transcriptions + edited corpus
india_man_toks = _merged(indo_toks_man, indo_toks)

# (b) All Transoxania manuscripts: edited corpus + manuscript notes
trans_man_toks = _merged(edited_toks, notes_toks)

# (d) All documents: XML-stage + markdown-stage
doc_corpus_toks = _merged(xml_toks, markdown_toks)


# --- Combined corpora ---

# (i) Historical manuscripts: a + b
hist_corpus_toks = _merged(india_man_toks, trans_man_toks)


# Meta-corpus: history + documents + Persian literature
combined_corpus_toks = _merged(hist_corpus_toks, doc_corpus_toks, pers_lit_toks)



In [61]:
#combined_corpus_toks['mu_vol1'][:50]
#pers_lit_toks["hafez.masnavi"][:50]

### Pickling combined corpora

In [113]:
# Persist the merged corpora as a single tuple; anything that unpickles this
# file must unpack them in the same order.
meta_bundle = (
    india_man_toks,
    trans_man_toks,
    doc_corpus_toks,
    hist_corpus_toks,
    combined_corpus_toks,
)
with open("pickled_refined_data/meta_corpora.pkl", "wb") as out_file:
    pickle.dump(meta_bundle, out_file)

### Losing corpus hierarchy for simple token lists

In [65]:
# Flatten each {text name: token list} corpus into one long token list.
# The per-text designation is lost; tokens keep the dictionaries' iteration order.

doc_toks = [tok for toks in doc_corpus_toks.values() for tok in toks]

hist_toks = [tok for toks in hist_corpus_toks.values() for tok in toks]

lit_toks = [tok for toks in pers_lit_toks.values() for tok in toks]

combined_toks = [tok for toks in combined_corpus_toks.values() for tok in toks]


In [69]:
#hist_toks[100:125]

### Pickling Raw Tokens

In [114]:
# Persist the four flat token lists as a single tuple; anything that unpickles
# this file must unpack them in the same order.
raw_token_bundle = (doc_toks, hist_toks, lit_toks, combined_toks)
with open("pickled_refined_data/raw_tokens.pkl", "wb") as out_file:
    pickle.dump(raw_token_bundle, out_file)