# Text Deciphering Tool

In [1]:
import pickle, re, nltk, os

In [2]:
import numpy as np
import pandas as pd

from pandas import DataFrame, Series

In [3]:
#set home directory path
hdir = os.path.expanduser('~')

Sister files:
- Pickled corpora cleaned in text_cleaning_tokenizing
- Corpora stats in corpora_statistics

## I. Importing Corpora



In [4]:
pickle_path = hdir + "/Box/Notes/Digital_Humanities/Corpora/pickled_tokenized_cleaned_corpora"

In [5]:
with open(pickle_path + "/corpora.pkl", "rb") as f:
    unsorted_doc_toks,\
                indo_xml_toks, hyd_xml_toks, trans_xml_toks,\
                trans_nar_toks, indo_nar_toks,\
                trans_nar_ext_toks, indo_nar_ext_toks = pickle.load(f)

In [6]:
#trans_xml_toks.keys()

In [7]:
with open(pickle_path + "/meta_corpora.pkl", "rb") as f:
    comb_india_nar_toks, comb_trans_nar_toks, nar_corpus_toks, doc_corpus_toks,\
                combined_corpus_toks, mega_corpus_toks = pickle.load(f)

In [76]:

#"خان" in combined_corpus_toks["tarikh_i_baljuvan_al_biruni_2663iii_ser412"]
        


## II. Importing Raw Tokens
I.e. tokens without parent text designation, i.e. format necessary for many NLTK routines.

In [9]:
with open(pickle_path + "/raw_tokens.pkl", "rb") as f:
    raw_doc_toks, raw_nar_toks, raw_lit_toks, raw_combo_toks = pickle.load(f)

In [10]:
#raw_combo_toks[100:125]

## III. Importing Datasets

- Von Melzer Persian Lexicon
- Glossary
- Place Names

In [13]:
# dataset path

ds_path = hdir + "/Box/Notes/Digital_Humanities/Datasets"

In [158]:
# Von Melzer
meltzer = pd.read_csv(ds_path + "/von_melzer.csv")

In [159]:
#meltzer["Präs.-Stamm"].sample(5)
#meltzer.sample(10)

In [40]:
# Locations
locations = pd.read_csv(ds_path + '/exported_database_data/locations.csv', names=['UID', 'Ar_Names', \
                                                'Lat_Name', 'Nickname', 'Type'])
# Social Roles
roles = pd.read_csv(ds_path + '/exported_database_data/roles.csv', names=['UID', 'Term', 'Emic', 'Etic', 'Scope'])

# Glossary
glossary = pd.read_csv(ds_path + '/exported_database_data/glossary.csv', names=['UID', 'Term', \
                                                'Eng_Term', 'Translation', 'Transliteration', 'Scope', 'Tags'])

___
___

# Basic Search

Regex reminders:
- Just the word itself: `^مال$`

In [160]:
search_term = re.compile(r"ب.د")

### Von Melzer Persion Dictionary

In [161]:
melz_query_mask = meltzer["Präs.-Stamm"].str.contains(search_term, na=False)
melz_query = meltzer[melz_query_mask]
melz_query

Unnamed: 0,UID,Volume,Unnamed: 2,Persisch,Präs.-Stamm,Transkription,Deutsch,Bemerkung,Quellenangaben
45,46,I,45,46,‫ﺁب بادﻩ رنگ‬,āb-e bāde-rang,blutige Tränen,,FN I 9b
62,63,I,62,63,‫ﺁب بردار‬,āb-bar-dār,sinnvoll; gedankenreich,,Haïm 1934:I 1045b
63,64,I,63,64,‫ﺁب بردن‬,āb bordan,Wasser führen,[durchgestr.],ʿAṭṭār
73,74,I,73,74,‫ﺁب بقدر یک سنگ‬,āb be-qadr⁺-e jek sang,Wasser für einen Mühlstein,,Rosen 1890:100
74,75,I,74,75,‫ﺁب بند‬,āb-band,Damm (m.); Deich (m.),,Haïm 1934:I 1045b
...,...,...,...,...,...,...,...,...,...
65269,65270,IV,8242,4964,(‫معترض بودن )که‬,mo ke,ʾtarez⁺ būdan (+Akk.; ),bekennen; eingestehen; gestehen (was; daß),Īrānšahr I 317/6
65280,65281,IV,8253,4975,‫معتکف پردﻩ بودن‬,mo,ʾtakef⁺-e parde būdan,stets hinter dem Vorhang verweilen,Ḥāfiẓ (Dīvān 192/3V.)
65413,65414,IV,8386,5108,‫معشوق… بودن‬,ma,ʾšūq⁺-e… būdan,geliebt werden (von),Īrānšahr I 184/24
65442,65443,IV,8415,5137,‫معطل… بودن‬,moʾattal⁺-e… būdan,,warten (auf),Hinz 1942:87


### Database Terms

#### (a) Technical Lexicon

In [150]:
glos_query_mask = glossary["Term"].str.contains(search_term, na=False)
glos_query = glossary[glos_query_mask]
glos_query

Unnamed: 0,UID,Term,Eng_Term,Translation,Transliteration,Scope,Tags
49,51,آق قرا پل,aq qara pul,,,transoxania,money
157,160,آلیق,aliq,,,transoxania,taxes


#### (b) Social Roles

In [46]:
roles_query_mask = roles["Emic"].str.contains(search_term, na=False)
roles_query = roles[roles_query_mask]
roles_query

Unnamed: 0,UID,Term,Emic,Etic,Scope
22,23,Qazi,قاضی,,Islamic
35,36,Qazi al-Quza,قاضی القضاة,,Islamic
36,37,Qazi-yi Kalan,قاضی کلان,,Transoxania
48,49,Qazi-yi Askari,قاضی عسکری,,Transoxania


#### (c) Place Names

In [142]:
loc_query_mask = locations["Ar_Names"].str.contains(search_term, na=False)
loc_query = locations[loc_query_mask]
loc_query

Unnamed: 0,UID,Ar_Names,Lat_Name,Nickname,Type
2,3,بخارا,Bukhāra (city),Bukhara,citykhanate
97,101,کاکانکاگانینگی بخارا,KāgānNew Bukhara,Kagan,districtcity
453,5007,ارک بخارا,,,neighborhoodguzar


### Corpus Tokens

In [61]:
search_term = re.compile(r"قوشبیگی")

In [62]:
combo_freq = nltk.FreqDist(raw_doc_toks)
toks = [x for x in combo_freq if re.match(search_term, x)]
toks[:5]

['قوشبیگی', 'قوشبیگیاه']

### Keyword in Context

In [63]:
conc0 = sum([trans_corpus.concordance_list(x) for x in toks], [])
conc1 = [c.line for c in conc0]
print('\n'.join(conc1))

ر جناب وزارت پناهی قبله گاهی میر کل قوشبیگی معروض رای انور عالیجاهی زبده الا له
نویسنده گان ایلچی خانه نوشته اند که قوشبیگی برای دولتخانه از دختران نویسنده گان
پگاه تا پشین بعرک امیر رفته در نزد قوشبیگیاه ایستاده خبرداری دولت خاتمه کرده و 


### Custom KWIC (beta)

In [67]:
# Better KWIC: need to (a) list source,
# and (b) have the ability to have multiple tokens in a row.

In [69]:
five_grams = nltk.ngrams(combined_corpus_toks, 5)

In [70]:
five_grams = list(five_grams)
five_grams[5][2] == "پانصد"

False

In [66]:
search_toks = [x for x in five_grams if five_grams[x][2] is "پانصد"]
search_toks[:5]

TypeError: list indices must be integers or slices, not tuple

___
___

# Conditional Frequency

*Meta-Corpus*

In [98]:
# ConditionalFreqDist() takes a list of pairs.
# Generator variable uses itself up upon assignment, so need to recreate above

bigrams_cfd = nltk.ngrams(raw_combo_toks, 2)

cfd = nltk.ConditionalFreqDist(bigrams_cfd)

### Simple Conditional Frequency:

*Meta-Corpus*

In [101]:
search_term = r"قاضی"

In [122]:
print (search_term, " is most commonly followed by:\n")
cfd[search_term].most_common(5)

مقرر  is most commonly followed by:



[('شد', 299), ('گشت', 191), ('شده', 161), ('بود', 113), ('گشته', 103)]

*Document Corpus*

In [105]:
bigrams_doc_fd = nltk.ngrams(raw_doc_toks, 2)

cfd_doc = nltk.ConditionalFreqDist(bigrams_doc_fd)

In [115]:
search_term = "مقرر"

In [121]:
print ("\nin the documents corpus, ", search_term, " is most commonly followed by: \n")
cfd_doc[search_term].most_common(5)


in the documents corpus,  مقرر  is most commonly followed by: 



[('و', 28), ('شده', 6), ('بود', 6), ('گشته', 4), ('است', 4)]

### Third term, if first two known:

*Document Corpus (Meta-Corpus simply too computationally costly)*

In [123]:
tri0 = nltk.ngrams(raw_doc_toks, 3)
tri1 = [((a, b), c) for (a, b, c) in tri0]
cfd1 = nltk.ConditionalFreqDist(tri1)

In [126]:
first_term = "بعد"
second_term = "از"

In [130]:
print ("The pair ", first_term, second_term, " is most commonly followed by :\n")

cfd1[(first_term, second_term)]

The pair  بعد از  is most commonly followed by :



FreqDist({'ان': 34, 'ختم': 7, 'انقلاب': 3, 'چند': 2, 'آن': 2, 'قرن': 2, 'تبلیغ': 2, 'انکه': 2, 'ماه': 1, 'اظهار': 1, ...})

### Reversed conditional frequency, i.e. if second word in sequence known but not first

*Meta-Corpus*

In [140]:
search_term = "بخارا"

In [137]:
bi0 = nltk.ngrams(raw_doc_toks, 2)
bir = [(b, a) for (a, b) in bi0]
cfdr = nltk.ConditionalFreqDist(bir)

In [141]:
print ("The term ", search_term, " is most commonly preceded by:\n")

cfdr[search_term].most_common(15)

The term  بخارا  is most commonly preceded by:



[('بدرون', 7),
 ('در', 5),
 ('شهر', 3),
 ('اعلم', 3),
 ('امیر', 2),
 ('انقلاب', 1),
 ('به', 1),
 ('ایلچی', 1),
 ('فاخره', 1),
 ('صدارت', 1),
 ('ممالک', 1),
 ('پادشاهان', 1),
 ('امیران', 1),
 ('بازار', 1),
 ('های', 1)]