# Corpora Cleaning, Tokenizing, Pickling

### Libraries

In [1]:
import arabic_cleaning as ac

In [2]:
import nltk, os, glob, pickle

### Paths

Home Directory

In [26]:
# Everything is rooted at the current user's home directory.
hdir = os.path.expanduser("~")

# Root of the external (pre-existing) corpora.
ext_corp_path = f"{hdir}/Box/Notes/Digital_Humanities/Corpora"

# Root of the internal (self-created) corpora.
int_corp_path = f"{hdir}/Box/Notes/Primary_Sources"

# Directory the pickled, tokenized, cleaned corpora are written to;
# it sits directly under the external corpora root.
pickle_path = f"{ext_corp_path}/pickled_tokenized_cleaned_corpora"

##### Pre-existing Corpora

In [4]:
# Each pre-existing corpus lives in its own folder under ext_corp_path.

# Indic narrative sources
indo_path = f"{ext_corp_path}/indo-persian_corpora"

# Transoxania narrative sources (Persian)
trans_path = f"{ext_corp_path}/machine_readable_persian_transoxania_texts"

# Khiva documents
khiva_path = f"{ext_corp_path}/khiva_khanate_chancery_corpus"

# Muscovite Persian diplomatic documents
musc_path = f"{ext_corp_path}/khorezm_muscovy_diplomatic"

# Persian literature
perslit_path = f"{ext_corp_path}/persian_literature_digital_corpus_roshan"

# Turkic narrative sources
turk_path = f"{ext_corp_path}/turkic_corpora"

##### Self-created Corpora

In [5]:
# Each self-created corpus lives in its own folder under int_corp_path.

# Indian narrative sources
indo_man_path = f"{int_corp_path}/non-machine-readable_notes/indian_manuscripts"

# Transoxania narrative sources
trans_man_path = f"{int_corp_path}/non-machine-readable_notes/bactriana_notes"

# Transoxania documents
trans_man_docs_path = f"{int_corp_path}/xml_notes_stage2/bukhara_xml"

# Hyderabad documents
hyd_man_docs_path = f"{int_corp_path}/xml_notes_stage2/hyderabad_xml"

# Indian documents (misc. transcribed)
indo_man_docs_path = f"{int_corp_path}/xml_notes_stage2/indic_corpus_xml"


##### Unorganized Documents

In [6]:
# Documents that have not yet been sorted into a regional corpus.

# Already converted to XML, waiting to be sorted
parser_xml_path = f"{int_corp_path}/xml_notes_stage2/parser_depository"

# XML transcriptions in the old system, yet to be updated
xml_old_sys_path = f"{int_corp_path}/xml_notes_stage2/xml_transcriptions_old_system"

# Drafts still at the markdown stage
markdown_path = f"{int_corp_path}/transcription_markdown_drafting_stage1"

# Markdown backlog (old system); a sub-folder of the markdown drafting folder
md_backlog_path = f"{markdown_path}/document_conversion_backlog"

## Corpus Globbing Section

### Pre-existing Corpora

#### Indic Narrative
Thackston corpus

In [7]:
# Thackston corpus: collect every .txt file under indo_path, at any depth.
indo_corpus_files = glob.glob(indo_path + r'//**/*.txt', recursive=True)

# Build {file name without extension: full text of that file}.
# The encoding is pinned to UTF-8 because a bare open() decodes with the
# platform's default encoding, which is not UTF-8 on every machine.
# (assumes the corpus files are UTF-8 encoded -- TODO confirm)
# NOTE(review): keys are base names only, so two files with the same name in
# different sub-directories would silently overwrite one another.
indo_corpus = {}
for longname in indo_corpus_files:
    with open(longname, encoding="utf-8") as f:
        txt = f.read()
    short = os.path.splitext(os.path.basename(longname))
    indo_corpus[short[0]] = txt

indo_corpus.keys()

dict_keys(['ain-i_akbari_murty', 'an1_nassim', 'an2_nassim', 'an3_nassim', 'badauni_muntakhab_al-tawwarikh', 'jahangirnama', 'mu_vol1', 'mu_vol2', 'mu_vol3', 'psn1', 'psn2', 'psn3', 'siyar_al-mutaakhirin1', 'siyar_al-mutaakhirin2', 'sjn1', 'sjn2', 'sjn3'])

#### Transoxania Narrative

In [8]:
# Collect every .txt file under trans_path, at any depth.
trans_corpus_files = glob.glob(trans_path + r'//**/*.txt', recursive=True)

# Build {file name without extension: full text of that file}.
# The encoding is pinned to UTF-8 because a bare open() decodes with the
# platform's default encoding, which is not UTF-8 on every machine.
# (assumes the corpus files are UTF-8 encoded -- TODO confirm)
# NOTE(review): keys are base names only, so two files with the same name in
# different sub-directories would silently overwrite one another.
trans_corpus = {}
for longname in trans_corpus_files:
    with open(longname, encoding="utf-8") as f:
        txt = f.read()
    short = os.path.splitext(os.path.basename(longname))
    trans_corpus[short[0]] = txt

trans_corpus.keys()

dict_keys(['ikromcha', 'khumuli', 'samarat', 'damla_abid_akhund_ser722', 'darbandi_alexiii_coronation_ser728', 'prisoedineniia_samarkand_ser723', 'proshenie_k_general-gubernatory_ser721', 'rasskaz_praviteli_shahrisabz_ser724', 'tarikh-i_jadida_tashkent_ser725', 'tuhfa-ahli-bukhara_ser25', 'tuhfa-i_taib_ser726'])

#### Persian Literature
*See below*

#### Khiva Documents

In [9]:
# Collect every .txt file under khiva_path, at any depth.
khiva_corpus_files = glob.glob(khiva_path + r'//**/*.txt', recursive=True)

# Build {file name without extension: full text of that file}.
# The encoding is pinned to UTF-8 because a bare open() decodes with the
# platform's default encoding, which is not UTF-8 on every machine.
# (assumes the corpus files are UTF-8 encoded -- TODO confirm)
# NOTE(review): keys are base names only, so two files with the same name in
# different sub-directories would silently overwrite one another.
khiva_corpus = {}
for longname in khiva_corpus_files:
    with open(longname, encoding="utf-8") as f:
        txt = f.read()
    short = os.path.splitext(os.path.basename(longname))
    khiva_corpus[short[0]] = txt

#khiva_corpus.keys()

#### Turkic Documents
*TBD*

#### Muscovite Persian diplomatic documents
*TBD*

### Self-created Corpora

#### Indic Narrative

In [10]:
# Collect every .txt file under indo_man_path, at any depth.
indo_man_files = glob.glob(indo_man_path + r'//**/*.txt', recursive=True)

# Build {file name without extension: full text of that file}.
# The encoding is pinned to UTF-8 because a bare open() decodes with the
# platform's default encoding, which is not UTF-8 on every machine.
# (assumes the note files are UTF-8 encoded -- TODO confirm)
# NOTE(review): keys are base names only, so two files with the same name in
# different sub-directories would silently overwrite one another.
indo_man = {}
for longname in indo_man_files:
    with open(longname, encoding="utf-8") as f:
        txt = f.read()
    short = os.path.splitext(os.path.basename(longname))
    indo_man[short[0]] = txt

#indo_corpus.keys()
indo_man.keys()

dict_keys(['awrangabadi_gul-i_rana_ser794', 'dustur_al-amal_ser790', 'gawhar_khan_waqa-i_shaykh_dalil_ser796', 'hidayat-i_zururiyya_kotwali_ser791', 'husayni_waqay-i_dakkan_ser779', 'jilani_salar_al-intizam_ser789', 'muhammad_al-madrasi_minhaj_al-adala_ser801', 'qanuncha_adalat_ser788', 'ratan_lal_tuhfa-i_dakkan_ser783', 'shahjahanpuri_yadgar-i_makhan_lal_ser780'])

#### Transoxania Narrative
Corpus based on partially transcribed manuscripts from early modern Transoxania.

In [11]:
# Collect the .txt files directly inside trans_man_path (not recursive).
trans_man_files = glob.glob(trans_man_path + r'/*.txt')

# Build {file name without extension: full text of that file}.
# `with` guarantees the handle is closed even if read() raises, and the
# encoding is pinned to UTF-8 instead of the platform default.
# (assumes the note files are UTF-8 encoded -- TODO confirm)
trans_man = {}
for longname in trans_man_files:
    with open(longname, encoding="utf-8") as f:
        txt = f.read()
    short = os.path.splitext(os.path.basename(longname))
    trans_man[short[0]] = txt

#trans_man.keys()

#### Transoxania Documents
Qushbegi documents at XML stage

In [12]:
# Collect the .xml files directly inside trans_man_docs_path (not recursive).
trans_man_doc_files = glob.glob(trans_man_docs_path + r'/*.xml')

# Build {file name without extension: raw XML text of that file}.
# `with` guarantees the handle is closed even if read() raises, and the
# encoding is pinned to UTF-8 instead of the platform default.
# (assumes the XML files are UTF-8 encoded -- TODO confirm)
trans_man_docs = {}
for longname in trans_man_doc_files:
    with open(longname, encoding="utf-8") as f:
        txt = f.read()
    short = os.path.splitext(os.path.basename(longname))
    trans_man_docs[short[0]] = txt

#trans_man_docs['ser179']

#### Hyderabad Documents

In [13]:
# Hyderabad Documents

# Collect the .xml files directly inside hyd_man_docs_path (not recursive).
hyd_man_doc_files = glob.glob(hyd_man_docs_path + r'/*.xml')

# Build {file name without extension: raw XML text of that file}.
# `with` guarantees the handle is closed even if read() raises, and the
# encoding is pinned to UTF-8 instead of the platform default.
# (assumes the XML files are UTF-8 encoded -- TODO confirm)
hyd_man_docs = {}
for longname in hyd_man_doc_files:
    with open(longname, encoding="utf-8") as f:
        txt = f.read()
    short = os.path.splitext(os.path.basename(longname))
    hyd_man_docs[short[0]] = txt

hyd_man_docs.keys()
# Note: nothing in that folder yet.


dict_keys([])

#### Indic Documents
Misc. Indic documents other than those from the Nizam State collection

In [14]:
# Collect the .xml files directly inside indo_man_docs_path (not recursive).
ind_man_doc_files = glob.glob(indo_man_docs_path + r'/*.xml')

# Build {file name without extension: raw XML text of that file}.
# `with` guarantees the handle is closed even if read() raises, and the
# encoding is pinned to UTF-8 instead of the platform default.
# (assumes the XML files are UTF-8 encoded -- TODO confirm)
ind_man_docs = {}
for longname in ind_man_doc_files:
    with open(longname, encoding="utf-8") as f:
        txt = f.read()
    short = os.path.splitext(os.path.basename(longname))
    ind_man_docs[short[0]] = txt

ind_man_docs.keys()

dict_keys(['ser818'])

### Unorganized Documents
E.g. documents still at the markdown stage, and not yet sorted by region.

#### XML, pre-sorted

In [15]:
# Collect the .xml files directly inside parser_xml_path (not recursive).
xml_presort_files = glob.glob(parser_xml_path + r'/*.xml')

# Build {file name without extension: raw XML text of that file}.
# `with` guarantees the handle is closed even if read() raises, and the
# encoding is pinned to UTF-8 instead of the platform default.
# (assumes the XML files are UTF-8 encoded -- TODO confirm)
xml_presort_docs = {}
for longname in xml_presort_files:
    with open(longname, encoding="utf-8") as f:
        txt = f.read()
    short = os.path.splitext(os.path.basename(longname))
    xml_presort_docs[short[0]] = txt

xml_presort_docs.keys()

dict_keys(['ser560', 'ser808', 'ser809', 'ser811', 'ser812', 'ser813', 'ser814', 'ser815', 'ser816', 'ser817', 'ser842', 'ser843', 'ser857', 'ser876', 'ser877', 'ser898'])

#### XML, old system

In [16]:
# Collect every .xml file under xml_old_sys_path, at any depth.
xml_oldsys_files = glob.glob(xml_old_sys_path + r'//**/*.xml', recursive=True)

# Build {file name without extension: raw XML text of that file}.
# `with` guarantees the handle is closed even if read() raises, and the
# encoding is pinned to UTF-8 instead of the platform default.
# (assumes the XML files are UTF-8 encoded -- TODO confirm)
# NOTE(review): keys are base names only, so two files with the same name in
# different sub-directories would silently overwrite one another.
xml_oldsys_docs = {}
for longname in xml_oldsys_files:
    with open(longname, encoding="utf-8") as f:
        txt = f.read()
    short = os.path.splitext(os.path.basename(longname))
    xml_oldsys_docs[short[0]] = txt

xml_oldsys_docs.keys()

dict_keys(['IVANUz_1936_ser185', 'NLR_f-940_ser190', 'RGVIA_400-1-1015_ser143', 'TsGARUz_i126-1-938-2_ser82', 'TsGARUz_i126_1_1160_ser193', 'TsGARUZ_i126_1_1729_101_ser213', 'TsGARUz_i126_1_1730_19_ser218', 'TsGARUz_i126_1_1730_22_ser217', 'TsGARUz_i126_1_1730_2_ser188', 'TsGARUZ_i126_1_1730_81_ser227', 'TsGARUZ_i126_1_1986_1_ser201', 'TsGARUz_i126_1_1990_20_ser186', 'TsGARUZ_i126_1_1990_3_ser192', 'TsGARUz_R-2678_ser184'])

### Pickling XML Corpora

In [23]:
# Merges
# When the same key occurs in more than one source dict, the entry from the
# dict merged last replaces the earlier one.

## All final-stage XML documents
combo_xml_final = dict(ind_man_docs)
combo_xml_final.update(hyd_man_docs)
combo_xml_final.update(trans_man_docs)

## All XML documents, every stage
combo_xml_all = dict(combo_xml_final)
combo_xml_all.update(xml_oldsys_docs)
combo_xml_all.update(xml_presort_docs)

combo_xml_all.keys()

dict_keys(['ser818', 'ser179', 'ser183', 'ser187', 'ser212', 'ser215', 'ser237', 'ser537', 'ser561', 'ser596', 'ser626', 'ser706', 'ser72', 'ser91', 'IVANUz_1936_ser185', 'NLR_f-940_ser190', 'RGVIA_400-1-1015_ser143', 'TsGARUz_i126-1-938-2_ser82', 'TsGARUz_i126_1_1160_ser193', 'TsGARUZ_i126_1_1729_101_ser213', 'TsGARUz_i126_1_1730_19_ser218', 'TsGARUz_i126_1_1730_22_ser217', 'TsGARUz_i126_1_1730_2_ser188', 'TsGARUZ_i126_1_1730_81_ser227', 'TsGARUZ_i126_1_1986_1_ser201', 'TsGARUz_i126_1_1990_20_ser186', 'TsGARUZ_i126_1_1990_3_ser192', 'TsGARUz_R-2678_ser184', 'ser560', 'ser808', 'ser809', 'ser811', 'ser812', 'ser813', 'ser814', 'ser815', 'ser816', 'ser817', 'ser842', 'ser843', 'ser857', 'ser876', 'ser877', 'ser898'])

In [27]:
# The unsorted XML sub-directories are not pickled on their own; they are
# only included through combo_xml_all.
# The five dicts are written as one tuple, in the order listed here.
with open(f"{pickle_path}/xml_corpora.pkl", "wb") as pkl_file:
    pickle.dump(
        (
            ind_man_docs,
            hyd_man_docs,
            trans_man_docs,
            combo_xml_final,
            combo_xml_all,
        ),
        pkl_file,
    )

#### Markdown Stage
Transcribed docs, yet to be ported over to XML

In [18]:
# Collect the files directly inside markdown_path (not recursive).
# NOTE(review): this section is described as the markdown stage, yet the
# pattern matches *.xml -- confirm the intended extension.
markdown_files = glob.glob(markdown_path + r'/*.xml')

# Build {file name without extension: full text of that file}.
# `with` guarantees the handle is closed even if read() raises, and the
# encoding is pinned to UTF-8 instead of the platform default.
# (assumes the files are UTF-8 encoded -- TODO confirm)
markdown_docs = {}
for longname in markdown_files:
    with open(longname, encoding="utf-8") as f:
        txt = f.read()
    short = os.path.splitext(os.path.basename(longname))
    markdown_docs[short[0]] = txt

markdown_docs.keys()
# Will be empty if everything was recently parsed and transferred, per workflow

dict_keys([])

#### Markdown, old system

In [28]:
# Collect every .txt file under md_backlog_path, at any depth.
markdown_old_files = glob.glob(md_backlog_path + r'//**/*.txt', recursive=True)

# Build {file name without extension: full text of that file}.
# `with` guarantees the handle is closed even if read() raises, and the
# encoding is pinned to UTF-8 instead of the platform default.
# (assumes the files are UTF-8 encoded -- TODO confirm)
# NOTE(review): keys are base names only, so two files with the same name in
# different sub-directories would silently overwrite one another.
markdown_old_docs = {}
for longname in markdown_old_files:
    with open(longname, encoding="utf-8") as f:
        txt = f.read()
    short = os.path.splitext(os.path.basename(longname))
    markdown_old_docs[short[0]] = txt

#markdown_old_docs.keys()

## Persian Literature Digital Corpus
Massive corpus of Persian literature, pulled from Ganjoor (http://ganjoor.net/) by Roshan (https://persdigumd.github.io/PDL/)

*This corpus is pre-cleaned, tokenized, and pickled by a separate script. (Cleaning takes a long time, and the corpus rarely changes, so that step does not need to be re-run here.)*

In [31]:
# Load the Persian literature tokens that a separate script pickled earlier.
# `with` closes the file even if unpickling raises.
# NOTE(review): pickle.load can execute arbitrary code from the file it
# reads; only load a pickle you produced yourself or otherwise trust.
with open(perslit_path + '/persian_lit_toks.pkl', 'rb') as f:
    pers_lit_toks = pickle.load(f)

In [33]:
#pers_lit_toks.keys()
#pers_lit_toks["hafez.masnavi"][:50]
#pers_lit_toks['ferdowsi.shahnameh']

#type (pers_lit_toks['ferdowsi.shahnameh'][5])

### Cleaning edited texts and notes

In [30]:
# TODO: could all of the corpora below be cleaned in a single pass?
# (An earlier attempt at that crashed the computer.)
#
#   indo_corpus, trans_corpus, khiva_corpus
#   indo_man, trans_man
#   trans_man_docs, hyd_man_docs, ind_man_docs
#   xml_presort_docs, xml_oldsys_docs, markdown_docs, markdown_old_docs


def _clean_corpus(corpus):
    """Return a new dict with ac.clean_document applied to every value of `corpus`.

    Keys and their order are carried over unchanged.
    """
    return {name: ac.clean_document(text) for name, text in corpus.items()}


# Pre-existing corpora
clean_indo = _clean_corpus(indo_corpus)
clean_trans = _clean_corpus(trans_corpus)
clean_khiva = _clean_corpus(khiva_corpus)

# Self-transcribed narrative notes
clean_indo_man = _clean_corpus(indo_man)
clean_trans_man = _clean_corpus(trans_man)

# Self-transcribed documents at the XML stage
clean_trans_man_docs = _clean_corpus(trans_man_docs)
clean_hyd_man_docs = _clean_corpus(hyd_man_docs)
clean_ind_man_docs = _clean_corpus(ind_man_docs)

# Unorganized documents
clean_xml_presort_docs = _clean_corpus(xml_presort_docs)
clean_xml_oldsys_docs = _clean_corpus(xml_oldsys_docs)
clean_markdown_docs = _clean_corpus(markdown_docs)
clean_markdown_old_docs = _clean_corpus(markdown_old_docs)


# Spot checks (uncomment to inspect):
#clean_trans['ikromcha'][:1000]
#clean_xml['ser561']
#clean_indo['mu_vol1'][:1000]

## Tokenizing

In [36]:

def _tokenize_corpus(clean_corpus):
    """Tokenize every cleaned text of one corpus.

    Args:
        clean_corpus: dict mapping a text's name to its cleaned text, as
            produced by the cleaning cell.

    Returns:
        A new dict with the same keys, in the same order, whose values are
        the results of nltk.word_tokenize for each text.
    """
    return {name: nltk.word_tokenize(text) for name, text in clean_corpus.items()}


# External Corpora Toks

indo_nar_ext_toks = _tokenize_corpus(clean_indo)
trans_nar_ext_toks = _tokenize_corpus(clean_trans)
# NOTE(review): khiva_doc_toks is not merged into any combined corpus or
# written to any pickle in the cells of this notebook -- confirm that this
# is intentional.
khiva_doc_toks = _tokenize_corpus(clean_khiva)


# Manually Entered Manuscript Toks

indo_nar_toks = _tokenize_corpus(clean_indo_man)
trans_nar_toks = _tokenize_corpus(clean_trans_man)


# Clean XML-stage Document Toks

trans_xml_toks = _tokenize_corpus(clean_trans_man_docs)
hyd_xml_toks = _tokenize_corpus(clean_hyd_man_docs)
indo_xml_toks = _tokenize_corpus(clean_ind_man_docs)


# Unorganized Document Toks (pre-sorted XML, old-system XML, markdown stages)

presort_xml_toks = _tokenize_corpus(clean_xml_presort_docs)
oldsys_xml_toks = _tokenize_corpus(clean_xml_oldsys_docs)
md_stage_toks = _tokenize_corpus(clean_markdown_docs)
md_oldsys_toks = _tokenize_corpus(clean_markdown_old_docs)



*First-stage combinations*: Collapse unsorted documents

In [37]:
# Collapse the four unsorted token dicts into one. When a key occurs in more
# than one of them, the entry from the dict listed last wins.
unsorted_doc_toks = {
    name: toks
    for corpus in (presort_xml_toks, oldsys_xml_toks, md_stage_toks, md_oldsys_toks)
    for name, toks in corpus.items()
}

In [40]:
#unsorted_doc_toks['ser560']

### Pickling Corpora

In [42]:
# Re-assigns pickle_path to the same value it is given in the Paths section.
pickle_path = f"{hdir}/Box/Notes/Digital_Humanities/Corpora/pickled_tokenized_cleaned_corpora"

In [43]:
# Write the eight per-corpus token dicts as one tuple, in the order listed.
# (presumably readers unpack the tuple by position -- keep the order stable)
with open(f"{pickle_path}/corpora.pkl", "wb") as pkl_file:
    pickle.dump(
        (
            unsorted_doc_toks,
            indo_xml_toks,
            hyd_xml_toks,
            trans_xml_toks,
            trans_nar_toks,
            indo_nar_toks,
            trans_nar_ext_toks,
            indo_nar_ext_toks,
        ),
        pkl_file,
    )

### Merging Corpora

[Explanation on merging dictionaries](https://www.webucator.com/how-to/how-merge-dictionaries-python.cfm)

Combos:
- Indic narrative sources (combining external and self-transcribed)
- Transoxania narrative sources (combining external and self-transcribed)
- All narrative sources
- All Persian documents

In [44]:

# In every merge below, when a key occurs in more than one source dict the
# entry from the dict merged last replaces the earlier one.

# Indic narrative sources: self-transcribed, then external
comb_india_nar_toks = dict(indo_nar_toks)
comb_india_nar_toks.update(indo_nar_ext_toks)

# Transoxania narrative sources: self-transcribed, then external
comb_trans_nar_toks = dict(trans_nar_toks)
comb_trans_nar_toks.update(trans_nar_ext_toks)

# All narrative sources
nar_corpus_toks = dict(comb_trans_nar_toks)
nar_corpus_toks.update(comb_india_nar_toks)

# All documents
doc_corpus_toks = {
    name: toks
    for corpus in (unsorted_doc_toks, indo_xml_toks, hyd_xml_toks, trans_xml_toks)
    for name, toks in corpus.items()
}

# Meta-Corpus (except Persian lit)
combined_corpus_toks = dict(nar_corpus_toks)
combined_corpus_toks.update(doc_corpus_toks)

# Mega-Corpus
mega_corpus_toks = dict(combined_corpus_toks)
mega_corpus_toks.update(pers_lit_toks)


In [46]:
#combined_corpus_toks['mu_vol1'][:50]
#mega_corpus_toks["hafez.masnavi"][:50]

### Pickling combined corpora

In [47]:
# Write the six combined token dicts as one tuple, in the order listed.
# (presumably readers unpack the tuple by position -- keep the order stable)
with open(f"{pickle_path}/meta_corpora.pkl", "wb") as pkl_file:
    pickle.dump(
        (
            comb_india_nar_toks,
            comb_trans_nar_toks,
            nar_corpus_toks,
            doc_corpus_toks,
            combined_corpus_toks,
            mega_corpus_toks,
        ),
        pkl_file,
    )

### Losing corpus hierarchy for simple token lists

In [49]:
# Flattened token lists: the per-text grouping is dropped and every token of
# a corpus ends up in a single list, in the order the texts are stored in
# the source dict.

raw_doc_toks = [tok for toks in doc_corpus_toks.values() for tok in toks]

raw_nar_toks = [tok for toks in nar_corpus_toks.values() for tok in toks]

raw_lit_toks = [tok for toks in pers_lit_toks.values() for tok in toks]

raw_combo_toks = [tok for toks in combined_corpus_toks.values() for tok in toks]


In [51]:
#raw_combo_toks[100:125]

### Pickling Raw Tokens

In [53]:
# Write the four flattened token lists as one tuple, in the order listed.
with open(f"{pickle_path}/raw_tokens.pkl", "wb") as pkl_file:
    pickle.dump(
        (
            raw_doc_toks,
            raw_nar_toks,
            raw_lit_toks,
            raw_combo_toks,
        ),
        pkl_file,
    )

# Archive
(Old methods, now memorialized in markdown.)

----


### Defunct method: [creating an NLTK corpus](http://www.nltk.org/book/ch02.html#loading-your-own-corpus)

```python

os.chdir('/Users/Enkidu/Documents/digital_humanities/jupyter_notebooks')
corpus_root = 'machine_readable_persian_transoxania_texts'
turkestan_corpus = PlaintextCorpusReader(corpus_root, '.*')
turkestan_corpus.fileids()
```

Cleaning: Now a function is pulled from an external file (arabic_cleaning.py). Previous method saved for posterity:

```python
clean_edited_i = {}
for fn in raw_edited_corpus:
    clean_edited_i[fn] = re.sub(r'ي', 'ی', raw_edited_corpus[fn])

clean_edited = {}
for fn in clean_edited_i:
    clean_edited[fn] = re.sub(r'[^آابپتثجچحخدذرزژسشصضطظعغفقکگلمنوهس ي یی ]', '', clean_edited_i[fn])
```


### Cleaning XML documents

*Dormant XML cleaning method using BeautifulSoup (still in use for Persian literature tokenization in separate script)*

```python
bstree = bs4.BeautifulSoup(clean_xml["ser561"], 'lxml')


print(bstree.get_text())

clean_xml = {}
for fn in raw_xml:
    bstree = bs4.BeautifulSoup(raw_xml[fn], 'lxml')
    clean_xml[fn] = bstree.get_text()
    
clean_xml['TsGARUZ_i126_1_1986_1_ser201']
```

