### Libraries

In [101]:
import nltk, os
import pandas as pd

### Importing Pahlavi Corpus

In [102]:
os.chdir('/Users/Enkidu/Documents/digital_humanities/pahlavi_corpus_digital/')
f = open('pahlavi_corpus_comp_ling.txt')
pahlavi_text = f.read()
f.close()

### Tokenizing

In [103]:
pah_toks = nltk.word_tokenize(pahlavi_text)

len(pah_toks)

1036918

In [104]:
'handarz' in pah_toks

True

### Cleaning the Tokens

*next steps*: still messing up boolean logic in latter part of the string comprehension

In [105]:
pah_toks_clean = [x for x in pah_toks if x.isalpha() or '-' in x]

In [106]:
pah_toks_clean = [x for x in pah_toks if x.lower() not in ('colophon', 'n', 'not', 'texts', '...')]

In [107]:
pah_toks_clean = [x for x in pah_toks_clean if len(x) > 2]
print ('Number of clean tokens: ', str(len(pah_toks_clean)))
print ('Tokens reduced by', str(int(len(pah_toks_clean)/len(pah_toks)*100)), 'percent in cleaning process.')

Number of clean tokens:  487022
Tokens reduced by 46 percent in cleaning process.


In [108]:
'not' in pah_toks_clean
# don't understand why 'not' is still in the list

False

In [141]:
pah_long_toks = [x for x in pah_toks_clean if '-' in x and len(x)>12]
long_freq = nltk.FreqDist(pah_long_toks)
long_freq.most_common(10)

[('bowandag-menišnīh', 97),
 ('meh-dādestānīh', 50),
 ('dūdag-sālārīh', 39),
 ('fraškerd-kerdārīh', 38),
 ('ham-dādestānīh', 36),
 ('bē-widerišnīh', 35),
 ('hamē-rawišnīh', 27),
 ('harwisp-āgāhīh', 26),
 ('rāst-gōwišnīh', 24),
 ('azabar-nibišt', 23)]

### Word Frequency

*next steps*: need to figure out how to better take advantage of the iterator (same for below) 

In [109]:
pah_freq = nltk.FreqDist(pah_toks_clean)
pah_freq.most_common(20)

[('pad', 19534),
 ('andar', 5979),
 ('abar', 4963),
 ('bawēd', 4604),
 ('čiyōn', 3774),
 ('u-š', 3623),
 ('rāy', 2915),
 ('ast', 2660),
 ('harw', 2478),
 ('estēd', 2428),
 ('būd', 2389),
 ('xwēš', 2258),
 ('tan', 2194),
 ('guft', 2156),
 ('Ohrmazd', 2066),
 ('kerd', 2014),
 ('dād', 1937),
 ('dēn', 1898),
 ('ayāb', 1752),
 ('abāg', 1710)]

In [110]:
help(pah_freq)

Help on FreqDist in module nltk.probability object:

class FreqDist(collections.Counter)
 |  A frequency distribution for the outcomes of an experiment.  A
 |  frequency distribution records the number of times each outcome of
 |  an experiment has occurred.  For example, a frequency distribution
 |  could be used to record the frequency of each word type in a
 |  document.  Formally, a frequency distribution can be defined as a
 |  function mapping from each sample to the number of times that
 |  sample occurred as an outcome.
 |  
 |  Frequency distributions are generally constructed by running a
 |  number of experiments, and incrementing the count for a sample
 |  every time it is an outcome of an experiment.  For example, the
 |  following code will produce a frequency distribution that encodes
 |  how often each word occurs in a text:
 |  
 |      >>> from nltk.tokenize import word_tokenize
 |      >>> from nltk.probability import FreqDist
 |      >>> sent = 'This is an example s

In [111]:
pah_freq.tabulate(10)

  pad andar  abar bawēd čiyōn   u-š   rāy   ast  harw estēd 
19534  5979  4963  4604  3774  3623  2915  2660  2478  2428 


### Bigrams

In [112]:
pah_bi = nltk.ngrams(pah_toks_clean, 2)
pah_bi_freq = nltk.FreqDist(pah_bi)
pah_bi_freq.most_common(10)

[(('nigēz', 'weh-dēn'), 374),
 (('weh-dēn', 'hād'), 361),
 (('u-šān', 'ēn-iz'), 341),
 (('ōwōn', 'dāšt'), 339),
 (('ēn-iz', 'ōwōn'), 337),
 (('pad', 'tan'), 312),
 (('guft', 'estēd'), 275),
 (('kerd', 'estēd'), 260),
 (('bawēd', 'pad'), 251),
 (('u-š', 'pad'), 249)]

### Trigrams

In [113]:
pah_tri = nltk.ngrams(pah_toks_clean, 3)
pah_tri_freq = nltk.FreqDist(pah_tri)
pah_tri_freq.most_common(10)

[(('nigēz', 'weh-dēn', 'hād'), 355),
 (('u-šān', 'ēn-iz', 'ōwōn'), 331),
 (('ēn-iz', 'ōwōn', 'dāšt'), 328),
 (('ēdōn', 'bawēd', 'čiyōn'), 88),
 (('bawēd', 'ēd-iz', 'rāy'), 80),
 (('menišn', 'gōwišn', 'kunišn'), 73),
 (('gōwēd', 'Srōš-ahlaw', 'Ādur-yazd'), 73),
 (('Srōš-ahlaw', 'Ādur-yazd', 'ruwān'), 72),
 (('frazaft', 'pad', 'drōd'), 69),
 (('pad', 'drōd', 'šādīh'), 62)]

### Quad-grams

In [114]:
pah_quad = nltk.ngrams(pah_toks_clean, 4)
pah_quad_freq = nltk.FreqDist(pah_quad)
pah_quad_freq.most_common(10)

[(('u-šān', 'ēn-iz', 'ōwōn', 'dāšt'), 326),
 (('gōwēd', 'Srōš-ahlaw', 'Ādur-yazd', 'ruwān'), 63),
 (('mēnōy', 'xrad', 'passox', 'kerd'), 62),
 (('frazaft', 'pad', 'drōd', 'šādīh'), 59),
 (('pursīd', 'dānāg', 'mēnōy', 'xrad'), 59),
 (('Srōš-ahlaw', 'Ādur-yazd', 'ruwān', 'druwand'), 41),
 (('u-m', 'pursīd', 'tan', 'wināh'), 41),
 (('pad', 'drōd', 'šādīh', 'rāmišn'), 40),
 (('pursīd', 'tan', 'wināh', 'kerd'), 39),
 (('wināh', 'kerd', 'ruwān', 'ōwōn'), 34)]

In [115]:
pah_tri_phrases = pah_tri_freq.most_common()
pah_tri_phrases = [x for x in pah_tri_phrases if x[1]>1]
pah_tri_phrases

[(('nigēz', 'weh-dēn', 'hād'), 355),
 (('u-šān', 'ēn-iz', 'ōwōn'), 331),
 (('ēn-iz', 'ōwōn', 'dāšt'), 328),
 (('ēdōn', 'bawēd', 'čiyōn'), 88),
 (('bawēd', 'ēd-iz', 'rāy'), 80),
 (('menišn', 'gōwišn', 'kunišn'), 73),
 (('gōwēd', 'Srōš-ahlaw', 'Ādur-yazd'), 73),
 (('Srōš-ahlaw', 'Ādur-yazd', 'ruwān'), 72),
 (('frazaft', 'pad', 'drōd'), 69),
 (('pad', 'drōd', 'šādīh'), 62),
 (('mēnōy', 'xrad', 'passox'), 62),
 (('xrad', 'passox', 'kerd'), 62),
 (('pursīd', 'dānāg', 'mēnōy'), 60),
 (('dānāg', 'mēnōy', 'xrad'), 60),
 (('ast', 'ēdōn', 'gōwēd'), 55),
 (('humad', 'hūxt', 'huwaršt'), 52),
 (('u-m', 'dīd', 'ruwān'), 52),
 (('tan', 'wināh', 'kerd'), 49),
 (('gōwēd', 'pad', 'dēn'), 45),
 (('Ādur-yazd', 'ruwān', 'druwand'), 44),
 (('xwāstag', 'pad', 'stūrīh'), 43),
 (('pad', 'tan', 'pasēn'), 42),
 (('u-m', 'pursīd', 'tan'), 41),
 (('pursīd', 'tan', 'wināh'), 41),
 (('čiyōn', 'man', 'dānam'), 41),
 (('drōd', 'šādīh', 'rāmišn'), 40),
 (('andar', 'ham', 'dar'), 40),
 (('u-š', 'guft', 'Ohrmazd'), 39),


In [116]:
df = pd.DataFrame([(x, y, z, c) for (x, y, z), c in pah_tri_phrases], columns=['tok1', 'tok2', 'tok3', 'count'])

In [117]:
df

Unnamed: 0,tok1,tok2,tok3,count
0,nigēz,weh-dēn,hād,355
1,u-šān,ēn-iz,ōwōn,331
2,ēn-iz,ōwōn,dāšt,328
3,ēdōn,bawēd,čiyōn,88
4,bawēd,ēd-iz,rāy,80
5,menišn,gōwišn,kunišn,73
6,gōwēd,Srōš-ahlaw,Ādur-yazd,73
7,Srōš-ahlaw,Ādur-yazd,ruwān,72
8,frazaft,pad,drōd,69
9,pad,drōd,šādīh,62


In [118]:
df.to_csv('pahlavi_trigrams.csv')

In [119]:
type(pah_bi_freq)

nltk.probability.FreqDist

### Conditional Frequency

In [120]:
# ConditionalFreqDist() takes a list of pairs.
# Generator variable uses itself up upon assignment, so need to recreate above

pah_bi2 = nltk.ngrams(pah_toks_clean, 2)
pah_bi2 = list(pah_bi2)

pah_cfd = nltk.ConditionalFreqDist(pah_bi2)

In [121]:
pah_cfd['draxt'].most_common(8)

[('xwānēnd', 6),
 ('ī-š', 3),
 ('nišīnēd', 3),
 ('homānāg', 2),
 ('dād', 2),
 ('rust', 2),
 ('mēwag', 2),
 ('draxt', 2)]

### Export Data

In [122]:
# for whatever reason you can't just use the concordance method on a string;
# you have to convert it to an NLTK Text type one way or another

pahlavi_corpus = nltk.Text(pah_toks_clean)

pahlavi_corpus.concordance('draxt')

Displaying 25 of 25 matches:
R PT25 MĀH FRAWARDĪN RŌZ HORDAD PT26 DRAXT ASŪRĪG PT27 WIZĀRIŠN ČATRANG NIHIŠN 
 zamīg abar K20b_14v āmad hēnd čiyōn draxt homānāg tāg abar rēšag azēr rust u-š
š hamē waxšīd 6d.6 nazdīk Wan Gōkarn draxt dād pad abāz-dārišnīh zarmān duš-pid
.6* nazdīk Wan Harwisp-tōhmag Gōkarn draxt dād pad abāz-dārišnīh zarmān duš-pid
 šud ast ruwān nūn-iz pad hangōšīdag draxt rust estēd kē-š bar ēwēnag mardōm 14
 šud ast ruwān nūn-iz pad hangōšīdag draxt rust estād kē-š bar ēwēnag mardōm 14
 harw xwarēd a-hōš bawēd u-š Gōkaran draxt xwānēnd čiyōn gōwēd hōm dūr-ōš u-š p
r harw xwarēd a-hōš bawēd u-š Gōkarn draxt xwānēnd čiyōn gōwēd hōm dūr-ōš u-š p
 ast 16.6 urwar and ēwēnag bawēd dār draxt mēwag ǰōrdāy gul sprahm tērag abzāra
mag 16.6* urwar and ēwēnag bawēd dār draxt mēwag ǰōrdāg gul sprahm tārag ōzārag
ār spēd-dār šamšār abārīg ēwēnag dār draxt xwānēnd 16.9 harw čē-š bar pad xwār-
-bār mardōmān šāyēd šāyēd sālwār ast draxt xwānēnd TD2_61r/117 16.10* harw pad 
10* harw pa