### Libraries

In [1]:
import nltk, os, glob, re, bs4

## Corpus Globbing Section

### Corpus Build: Machine-Readable Central Asian Texts
Corpus of complete texts edited by others composed in early modern Transoxania.

In [74]:
trans_corpus_files = glob.glob(r'/Users/Enkidu/Box Sync/Notes/Digital Humanities/\
Corpora/machine_readable_persian_transoxania_texts/*.txt')

trans_corpus = {}
for longname in trans_corpus_files:
    f = open(longname)
    txt = f.read()
    f.close()
    start = os.path.basename(longname)
    short = os.path.splitext(start)
    trans_corpus[short[0]] = txt
    

#trans_corpus['samarat']


### Corpus Build: Manuscript Notes
Corpus based on partially transcribed manuscripts from early modern Transoxania.

In [62]:
nmr_files = glob.glob(r'/Users/Enkidu/Box Sync/Notes/\
Primary Sources/non-machine-readable_notes/bactriana_notes/*.txt')

raw_notes_corpus = {}
for longname in nmr_files:
    f = open(longname)
    txt = f.read()
    f.close()
    start = os.path.basename(longname)
    short = os.path.splitext(start)
    raw_notes_corpus[short[0]] = txt

### Corpus Build: XML Documents
Corpus based on transcribed XML documents early modern Transoxania.

In [77]:
# Python 3.5 and newer supports recursive **/ functionality, i.e. cycle through all subdirectories.

xml_files = glob.glob(r'/Users/Enkidu/Box Sync/Notes/Primary Sources/xml_notes_stage2/**/*.xml', recursive=True)


# For-loop through file names and build a dictionary of key (filename): value (text content)

xml_corpus = {}
for longname in xml_files:
    f = open(longname)
    txt = f.read()
    f.close()
    start = os.path.basename(longname)
    short = os.path.splitext(start)
    xml_corpus[short[0]] = txt

#xml_corpus['TsGARUz_i126_1_601_6_ser187']


### Defunct method: [creating an NLTK corpus](http://www.nltk.org/book/ch02.html#loading-your-own-corpus)

```python

os.chdir('/Users/Enkidu/Documents/digital_humanities/jupyter_notebooks')
corpus_root = 'machine_readable_persian_transoxania_texts'
turkestan_corpus = PlaintextCorpusReader(corpus_root, '.*')
turkestan_corpus.fileids()
```

## Cleaning

### Cleaning edited texts and notes

In [6]:
# Clean edited texts

## TO DO: figure out more efficient way of doing the for loop, add in ک swaps too

clean_edited_i = {}
for fn in raw_edited_corpus:
    clean_edited_i[fn] = re.sub(r'ي', 'ی', raw_edited_corpus[fn])

clean_edited = {}
for fn in clean_edited_i:
    clean_edited[fn] = re.sub(r'[^آابپتثجچحخدذرزژسشصضطظعغفقکگلمنوهس ي یی ]', '', clean_edited_i[fn])

#clean_edited['ikromcha'][:1000]
#raw_edited_corpus['ikromcha'][:1000]



clean_notes_i = {}
for fn in raw_notes_corpus:
    clean_notes_i[fn] = re.sub(r'ي', 'ی', raw_notes_corpus[fn])

clean_notes = {}
for fn in clean_notes_i:
    clean_notes[fn] = re.sub(r'[^آابپتثجچحخدذرزژسشصضطظعغفقکگلمنوهس ي یی ]', '', clean_notes_i[fn])

clean_notes['jung_i_rivayat_al_biruni_4798'][600:650]

'اتقیای رحم الله که هر یک مقتدای  دین راهنمای رسوم '

### Cleaning XML documents

In [7]:
# bstree = bs4.BeautifulSoup(raw_xml["ser561"], 'lxml')

#print(bstree.get_text())


## TO DO: still need to strip out some of the non-Arabic script stuff, like above.

clean_xml = {}
for fn in raw_xml:
    bstree = bs4.BeautifulSoup(raw_xml[fn], 'lxml')
    clean_xml[fn] = bstree.get_text()
    
#clean_xml['TsGARUZ_i126_1_1986_1_ser201']

## Tokenizing

In [8]:

edited_toks = {}
for (fn, txt) in clean_edited.items():
    toks = nltk.word_tokenize(txt)
    edited_toks[fn] = toks

notes_toks = {}
for (fn, txt) in clean_notes.items():
    toks = nltk.word_tokenize(txt)
    notes_toks[fn] = toks


xml_toks = {}
for (fn, txt) in clean_xml.items():
    toks = nltk.word_tokenize(txt)
    xml_toks[fn] = toks
    

xml_toks['TsGARUz_R-2678_ser184'][50:70]

['عبد',
 'الاحد',
 'گذشته',
 'گی',
 'این',
 'است',
 'داملا',
 'حسن',
 'آخوند',
 'بخاری',
 'الاصل',
 'از',
 'گذر',
 'کاکلهً',
 'خورد',
 'بسیار',
 'ملا',
 'درس',
 'گو',
 'بوده']

### Merging Corpuses

In [9]:
#Merging dictionaries: https://www.webucator.com/how-to/how-merge-dictionaries-python.cfm


#Combined corpus
combined_corpus_toks = {**edited_toks, **notes_toks, **xml_toks}


#Combined Tokens (loses corpus text designation)
combined_toks = []
for (fn, text) in list(combined_corpus_toks.items()):
    combined_toks.extend(combined_corpus_toks[fn])



## Persian Literature Digital Corpus
Massive corpus of Persian literature, pulled from Ganjur (http://ganjoor.net/) by Roshan (https://persdigumd.github.io/PDL/)

In [10]:
## ISSUE: A lot of the corpus is in Unicode format, some of it is in Arabic script. Is the unicode automatically converted?

#Globbing

files = glob.glob(r'/Users/Enkidu/Documents/digital_humanities/\
persian_literature_digital_corpus_roshan/data/**/*.xml', recursive=True)


#Assembling Corpus

raw_perslit_corpus = {}
for longname in files:
        f = open(longname)
        txt = f.read()
        f.close()
        start = longname.rindex('/')+1
        short = longname[start:-4]
        raw_perslit_corpus[short] = txt

        
#Cleaning Text
## TO DO: still need to strip out some of the non-Arabic script stuff, like above.


clean_perslit = {}
for fn in raw_perslit_corpus:
    bstree = bs4.BeautifulSoup(raw_perslit_corpus[fn], 'lxml')
    clean_perslit[fn] = bstree.get_text()

#Tokenizing Text
 
perslit_toks = {}
for (fn, txt) in clean_perslit.items():
    toks = nltk.word_tokenize(txt)
    perslit_toks[fn] = toks
        
#clean_perslit['ferdousi.shahname.pdl'][5000:5800]
perslit_toks['ferdousi.shahname.pdl'][50:70]

['University',
 'of',
 'Maryland',
 'College',
 'ParkGanjoor',
 'corpus',
 'به',
 'نام',
 'خداوند',
 'جان',
 'و',
 'خرد',
 'کزین',
 'برتر',
 'اندیشه',
 'برنگذرد',
 'خداوند',
 'نام',
 'و',
 'خداوند']

## Stats

### Word Frequency

In [11]:
#pah_freq = nltk.FreqDist(pah_toks_clean)
#pah_freq.most_common(20)

combo_freq = nltk.FreqDist(combined_toks)


long_toks = [x for x in combined_toks if len(x)>10]
long_freq = nltk.FreqDist(long_toks)
long_freq.most_common(10)


[('چنانچهجنانجه', 68),
 ('بزروارپزروار', 64),
 ('اینچنیناینجنین', 25),
 ('i126-1-1730', 7),
 ('اشتغالداشته', 6),
 ('بماوراالنهر', 5),
 ('همچنانهمجنان', 5),
 ('اجمعینتیمنا', 5),
 ('همچنینهمجنین', 4),
 ('اشتغالنموده', 3)]

### Bigrams

In [12]:
bigrams = nltk.ngrams(combined_toks, 2)
bi_freq = nltk.FreqDist(bigrams)
bi_freq.most_common(10)


[(('بعد', 'از'), 1065),
 (('و', 'از'), 741),
 (('و', 'در'), 664),
 (('اند', 'و'), 613),
 (('قدس', 'سره'), 502),
 (('است', 'و'), 453),
 (('از', 'ان'), 391),
 (('که', 'در'), 379),
 (('را', 'از'), 363),
 (('بوده', 'اند'), 361)]

### Trigrams

In [13]:
trigrams = nltk.ngrams(combined_toks, 3)
tri_freq = nltk.FreqDist(trigrams)
tri_freq.most_common(10)

[(('بعد', 'از', 'ان'), 192),
 (('بوده', 'اند', 'و'), 124),
 (('الله', 'علیه', 'و'), 105),
 (('علیه', 'و', 'سلم'), 103),
 (('و', 'بعد', 'از'), 103),
 (('و', 'از', 'ایشان'), 86),
 (('بشرایطه', 'یا', 'نی'), 81),
 (('فرموده', 'اند', 'ه'), 80),
 (('درین', 'مسله', 'که'), 75),
 (('ابو', 'الفیض', 'خان'), 71)]

### Muchos-Grams

In [14]:
muchos_grams = nltk.ngrams(combined_toks, 10)
muchos_freq = nltk.FreqDist(muchos_grams)
#muchos_freq.most_common(5)

### Conditional Frequency

In [15]:
# ConditionalFreqDist() takes a list of pairs.
# Generator variable uses itself up upon assignment, so need to recreate above

## TO DO: How to integrate Regex searches into this?

bigrams_cfd = nltk.ngrams(combined_toks, 2)

cfd = nltk.ConditionalFreqDist(bigrams_cfd)

In [16]:
cfd['درین'].most_common(5)

[('مسله', 80), ('باب', 36), ('اثنا', 27), ('آوان', 21), ('ایام', 18)]

### Concordance

In [23]:
# for whatever reason you can't just use the concordance method on a string;
# you have to convert it to an NLTK Text type one way or another

trans_corpus = nltk.Text(combined_toks)

trans_corpus.concordance('نقشب')

no matches
