# Text Deciphering Tool

In [1]:
import pickle, re, nltk, os

In [2]:
import numpy as np
import pandas as pd

from pandas import DataFrame, Series

In [3]:
#set home directory path
hdir = os.path.expanduser('~')

Sister files:
- Pickled corpora cleaned in text_cleaning_tokenizing
- Corpora stats in corpora_statistics

## I. Importing Corpora



In [4]:
pickle_path = hdir + "/Box/Notes/Digital_Humanities/Corpora/pickled_tokenized_cleaned_corpora"

In [5]:
with open(pickle_path + "/corpora.pkl", "rb") as f:
    unsorted_doc_toks,\
                indo_xml_toks, hyd_xml_toks, trans_xml_toks,\
                trans_nar_toks, indo_nar_toks,\
                trans_nar_ext_toks, indo_nar_ext_toks = pickle.load(f)

In [9]:
#trans_xml_toks.keys()

In [9]:
with open(pickle_path + "/meta_corpora.pkl", "rb") as f:
    icomb_india_nar_toks, comb_trans_nar_toks, nar_corpus_toks, doc_corpus_toks,\
                combined_corpus_toks, mega_corpus_toks = pickle.load(f)

In [30]:

#"خان" in combined_corpus_toks["tarikh_i_baljuvan_al_biruni_2663iii_ser412"]
        


## II. Importing Raw Tokens
I.e. tokens without parent text designation, i.e. format necessary for many NLTK routines.

In [12]:
with open(pickle_path + "/raw_tokens.pkl", "rb") as f:
    raw_doc_toks, raw_nar_toks, raw_lit_toks, raw_combo_toks = pickle.load(f)

In [14]:
#raw_combo_toks[100:125]

['خواب',
 'گردیم',
 'چندین',
 'حکیمها',
 'آمده',
 'دیده',
 'پاسخ',
 'علاجی',
 'کرده',
 'نتوانستند',
 'عین',
 'وقت',
 'خدمت',
 'دولتخانه',
 'عالی',
 'میباشد',
 'بنابران',
 'بخذمت',
 'ذی',
 'شفرها',
 'شان',
 'شد',
 'که',
 'اگر',
 'مهربانی']

## III. Importing Datasets

- Von Melzer Persian Lexicon
- Glossary
- Place Names

In [8]:
# Von Melzer
meltzer = pd.read_csv("data_master/von_melzer.csv")

In [9]:
#meltzer["Transkription"].sample(5)
#meltzer.head()

In [10]:
# Locations
locations = pd.read_csv('data/locations.csv', names=['UID', 'Ar_Names', \
                                                'Lat_Name', 'Nickname', 'Type'])

# Social Roles

# Glossary

glossary = pd.read_csv('data/glossary.csv', names=['UID', 'Term', \
                                                'Eng_Term', 'Translation', 'Transliteration', 'Scope', 'Tags'])

___
___

# Basic Search

Regex reminders:
- Just the word itself: `^مال$`

In [15]:
search_term = re.compile(r"..احب")

### Von Melzer Persion Dictionary

In [78]:
melz_query_mask = meltzer["Präs.-Stamm"].str.contains(search_term, na=False)
melz_query = meltzer[melz_query_mask]
melz_query

Unnamed: 0,UID,Volume,Unnamed: 2,Persisch,Präs.-Stamm,Transkription,Deutsch,Bemerkung,Quellenangaben
2241,2242,I,2241,2242,‫اﺟلۀ صاحب منصبان‬,aǧelle⁺-je sāheb⁺ mansab⁺-ān,die höchsten der Offiziere,,Kāva I/11:6/3b
2381,2382,I,2381,2382,(Pl. ‫احبوش )احابیش‬,ohbūš⁺ (Pl. ahābīš),Horde (f.),,
9570,9571,I,9570,9571,‫آن صاحب‬,ān sāheb⁺,jener Herr (Briefanrede an Europäer),,Beck 1915:221
18128,18129,I,18128,6640,‫بی صاحبی‬,bī-sāheb⁺ī,Herrenlosigkeit (f.),,Ibrāhīm Baik (Siyāḥatnāma 26/3)
25903,25904,II,7119,2751,‫تصاحب‬,tasāhob⁺ A,nei,gnung (f.),Šaibānī 1314:I 76/8
25904,25905,II,7120,2752,‫تصاحب کردن‬,tasāhob⁺ kardan (+Akk.) b,ese,tzen,Īrān 6015/3a
25905,25906,II,7121,2753,‫تصاحب کردن‬,tasāhob⁺ kardan (+Akk.) i,n B,esitz nehmen; sich aneignen,"Ramażānī 1315:21/21, 246/11, Šaibānī 1314:I 34..."
33354,33355,II,14570,1318,‫حکیم صاحب‬,hakīm⁺ sāheb⁺,Herr Doktor!,,Qaraǧadaġī (Mūsī Žūrdān 21) 1197
34012,34013,II,15228,183,‫خاقان صاحب قران‬,ḫāqān⁺⁺-e sāheb⁺-qerān⁺,der Großherr mit der günstigen Stern- stellung...,,"Qulī-Ḫān (Sifāratnāma 14/6, 18, 20/19)"
38057,38058,III,464,166,5 ‫درﺟۀ صاحب منصفی‬,daraǧe⁺-je sāheb⁺- mansaf⁺ī,Rangabzeichen der Offiziere [rect...,ﻲ,‬Īrānšahr I 21/20f.


### Database Terms

#### (a) Technical Lexicon

In [24]:
glos_query_mask = glossary["Term"].str.contains(search_term, na=False)
glos_query = glossary[glos_query_mask]
glos_query

Unnamed: 0,UID,Term,Eng_Term,Translation,Transliteration,Scope,Tags


#### (b) Social Roles

#### (c) Place Names

In [25]:
loc_query_mask = locations["Ar_Names"].str.contains(search_term, na=False)
loc_query = locations[loc_query_mask]
loc_query

Unnamed: 0,UID,Ar_Names,Lat_Name,Nickname,Type


### Corpus Tokens

In [16]:
combo_freq = nltk.FreqDist(raw_doc_toks)
toks = [x for x in combo_freq if re.match(search_term, x)]
toks[:5]

['اخاحب']

### Keyword in Context

In [17]:
search_term = re.compile(r"عیدین")

In [19]:
trans_corpus = nltk.Text(raw_doc_toks)

In [20]:
conc0 = sum([trans_corpus.concordance_list(x) for x in toks], [])
conc1 = [c.line for c in conc0]
print('\n'.join(conc1))

احوال خیرمال عمده الامرا ظهیز الفقرا اخاحب الحنات جامع الخیرات مربعی اعلما معین


### Custom KWIC (beta)

In [21]:
five_grams = nltk.ngrams(raw_doc_toks, 5)

In [22]:
five_grams = list(five_grams)
five_grams[5][2] == "پانصد"

False

In [23]:
search_toks = [x for x in five_grams if five_grams[x][2] is "پانصد"]
search_toks[:5]

TypeError: list indices must be integers or slices, not tuple

___
___

# Conditional Frequency

*Meta-Corpus*

In [19]:
# ConditionalFreqDist() takes a list of pairs.
# Generator variable uses itself up upon assignment, so need to recreate above

bigrams_cfd = nltk.ngrams(combined_toks, 2)

cfd = nltk.ConditionalFreqDist(bigrams_cfd)

### Simple Conditional Frequency:

*Meta-Corpus*

In [54]:
search_term = ""

In [55]:
cfd[search_term].most_common(5)

[('را', 164), ('ای', 112), ('می', 91), ('وار', 80), ('به', 51)]

*Document Corpus*

In [75]:
bigrams_doc_fd = nltk.ngrams(doc_toks, 2)

cfd_doc = nltk.ConditionalFreqDist(bigrams_doc_fd)

In [76]:
search_term = "مقرر"
cfd_doc[search_term].most_common(5)

[('و', 27), ('بود', 6), ('شده', 5), ('گشته', 3), ('است', 3)]

### Third term, if first two known:

*Document Corpus (Meta-Corpus simply too computationally costly)*

In [60]:
tri0 = nltk.ngrams(doc_toks, 3)
tri1 = [((a, b), c) for (a, b, c) in tri0]
cfd1 = nltk.ConditionalFreqDist(tri1)

In [87]:
first_term = "و"
second_term = "موفوره"

cfd1[(first_term, second_term)]

FreqDist({})

### Reversed conditional frequency, i.e. if second word in sequence known but not first

*Meta-Corpus*

In [25]:
search_term = "راک"

In [26]:
bi0 = nltk.ngrams(combined_toks, 2)
bir = [(b, a) for (a, b) in bi0]
cfdr = nltk.ConditionalFreqDist(bir)

cfdr[search_term].most_common(15)

[('ی', 2), ('اثر', 2), ('مهر', 1), ('پ', 1), ('سجه', 1), ('بودند', 1)]