# Text Deciphering Tool

In [1]:
import pickle, re, nltk, os

In [2]:
import numpy as np
import pandas as pd

from pandas import DataFrame, Series

In [3]:
#set home directory path
hdir = os.path.expanduser('~')

Sister files:
- Pickled corpora cleaned in text_cleaning_tokenizing
- Corpora stats in corpora_statistics

## I. Importing Corpora



In [4]:
pickle_path = hdir + "/Box/Notes/Digital_Humanities/Corpora/pickled_tokenized_cleaned_corpora"

In [5]:
# Load the eight tokenized corpora that are stored together in corpora.pkl.
# NOTE(review): pickle.load can execute arbitrary code -- only open pickles you produced yourself.
with open(pickle_path + "/corpora.pkl", "rb") as f:
    (
        unsorted_doc_toks,
        indo_xml_toks,
        hyd_xml_toks,
        trans_xml_toks,
        trans_nar_toks,
        indo_nar_toks,
        trans_nar_ext_toks,
        indo_nar_ext_toks,
    ) = pickle.load(f)

In [6]:
# Load the six combined ("meta") corpora that are stored together in meta_corpora.pkl.
# NOTE(review): pickle.load can execute arbitrary code -- only open pickles you produced yourself.
with open(pickle_path + "/meta_corpora.pkl", "rb") as f:
    (
        comb_india_nar_toks,
        comb_trans_nar_toks,
        nar_corpus_toks,
        doc_corpus_toks,
        combined_corpus_toks,
        mega_corpus_toks,
    ) = pickle.load(f)

In [7]:

#"خان" in combined_corpus_toks["tarikh_i_baljuvan_al_biruni_2663iii_ser412"]
        


## II. Importing Raw Tokens
That is, tokens without a parent-text designation: the flat format that many NLTK routines require.

In [8]:
with open(pickle_path + "/raw_tokens.pkl", "rb") as f:
    raw_doc_toks, raw_nar_toks, raw_lit_toks, raw_combo_toks = pickle.load(f)

In [9]:
#indo_nar_toks.keys()

## III. Importing Datasets

- Von Melzer Persian Lexicon
- Glossary
- Place Names

In [10]:
# dataset path

ds_path = hdir + "/Box/Notes/Digital_Humanities/Datasets"

In [11]:
# Von Melzer
meltzer = pd.read_csv(ds_path + "/von_melzer.csv")

In [12]:
#meltzer["Präs.-Stamm"].sample(5)
#meltzer.sample(10)

In [13]:
# Locations
# Locations
locations = pd.read_csv(
    ds_path + '/exported_database_data/locations.csv',
    names=['UID', 'Ar_Names', 'Lat_Name', 'Nickname', 'Type'],
)

# Social Roles
roles = pd.read_csv(
    ds_path + '/exported_database_data/roles.csv',
    names=['UID', 'Term', 'Emic', 'Etic', 'Scope'],
)

# Glossary
glossary = pd.read_csv(
    ds_path + '/exported_database_data/glossary.csv',
    names=['UID', 'Term', 'Eng_Term', 'Translation', 'Transliteration', 'Scope', 'Tags'],
)

___
___

In [14]:
dehkhoda = pd.read_csv(ds_path + "/dehkhoda_dictionary.csv", names=['Term', 'Definition'])

In [15]:
#dehkhoda.sample(10)

# Basic Search

Regex reminders:
- Just the word itself: `^مال$`

In [16]:
search_term = re.compile(r"ب.د")

### Von Melzer Persian Dictionary

In [17]:
melz_query_mask = meltzer["Präs.-Stamm"].str.contains(search_term, na=False)
melz_query = meltzer[melz_query_mask]
melz_query

Unnamed: 0,UID,Volume,Unnamed: 2,Persisch,Präs.-Stamm,Transkription,Deutsch,Bemerkung,Quellenangaben
45,46,I,45,46,‫ﺁب بادﻩ رنگ‬,āb-e bāde-rang,blutige Tränen,,FN I 9b
62,63,I,62,63,‫ﺁب بردار‬,āb-bar-dār,sinnvoll; gedankenreich,,Haïm 1934:I 1045b
63,64,I,63,64,‫ﺁب بردن‬,āb bordan,Wasser führen,[durchgestr.],ʿAṭṭār
73,74,I,73,74,‫ﺁب بقدر یک سنگ‬,āb be-qadr⁺-e jek sang,Wasser für einen Mühlstein,,Rosen 1890:100
74,75,I,74,75,‫ﺁب بند‬,āb-band,Damm (m.); Deich (m.),,Haïm 1934:I 1045b
75,76,I,75,76,‫ﺁب بندان‬,āb-bandān,Stausee (m.); Wasserbehälter (m.),,Haïm 1934:I 1045b
76,77,I,76,77,‫ﺁب بندى‬,āb-bandī,Abdichtung (f.) (eines Lecks),,Haïm 1934:I 1045b
181,182,I,181,182,(‫ﺁب دادن )به‬,āb dādan (be),tränken (wen),,Haïm 1931:II 742b
182,183,I,182,183,(‫ﺁب دادن )…را‬,āb dādan (…rā),begießen; wässern (wen),,
183,184,I,183,184,(‫ﺁب دادن )…را‬,āb dādan (…rā),tränken (wen),,


### Database Terms

#### (a) Technical Lexicon

In [18]:
glos_query_mask = glossary["Term"].str.contains(search_term, na=False)
glos_query = glossary[glos_query_mask]
glos_query

Unnamed: 0,UID,Term,Eng_Term,Translation,Transliteration,Scope,Tags
105,107,سه بندی,sih-bandi (sebundy),troops employed in the collection of revenues,,indic,taxesmilitary
118,121,جمعبندی,jama-bandi,,jamaʿ-bandī,indic,taxes
126,129,بنده نوازا,banda-nawaza,"O servant-succoring, world-sheltering lord",banda-nawāzā,,salutationhonorific


#### (b) Social Roles

In [19]:
roles_query_mask = roles["Emic"].str.contains(search_term, na=False)
roles_query = roles[roles_query_mask]
roles_query

Unnamed: 0,UID,Term,Emic,Etic,Scope
13,14,Naqshbandiyya,نقشبندیه,,Islamic
14,15,Naqshbandiyya-Mujaddidiyya,نقشبندیه مجددیه,,Islamic
65,66,Maktab-dar,مکتب دار,,Islamic
190,193,Shahbandar,شهبندر,consul,Ottoman


#### (c) Place Names

In [20]:
loc_query_mask = locations["Ar_Names"].str.contains(search_term, na=False)
loc_query = locations[loc_query_mask]
loc_query

Unnamed: 0,UID,Ar_Names,Lat_Name,Nickname,Type
52,53,خیرآباد,Khayrābād,,tuman
59,60,فتحاباد,Fatḥābād,Fathabad,
65,67,آبادی,Ābādī,,
80,82,بغداد,Baghdād,Baghdad,
99,103,شیر آباد,Shīr ĀbādSherabad,,town
101,105,رامپورمسطفی آباد,RāmpūrMustafabad,,city
119,123,قبادیان,QabādiyānКабадиан,,villagevilayat
129,133,استرآبادگرگان,AstarabadGurgan,Astarabad,city
145,149,عشق آباد,ʿIshq ĀbādAshgabat,Ashgabat,city
152,156,دهبید,Dahbīd,,village


### Corpus Tokens

In [21]:
search_term = re.compile(r"کوبکار.?")

In [22]:
combo_freq = nltk.FreqDist(raw_doc_toks)
toks = [x for x in combo_freq if re.match(search_term, x)]
toks[:5]

['کوبکاری']

### Keyword in Context

### NLTK Concordance

In [23]:

# `concordance` is a method of nltk.Text, not of a plain token sequence,
# so the raw tokens have to be wrapped in an nltk.Text object first

trans_corpus = nltk.Text(raw_combo_toks)

#trans_corpus.concordance('خانه')



### Regex Concordance

*Tokens in corpus regex matching the string:*

In [24]:
toks = [x for x in combo_freq if re.match(r'...خوی', x)]
toks[:5]

['اندخوی']

In [25]:
# Gather the concordance hits of every matched token (in token order),
# then print one context line per hit.
conc0 = [hit for tok in toks for hit in trans_corpus.concordance_list(tok)]
conc1 = [hit.line for hit in conc0]
print('\n'.join(conc1))

سیده اند سی و هفتم اخوند داملا طاهر اندخوی و ایشان فی سبیل الله مال و جان فدا 
 بسیار خذمتها کرده اند و ایشان ص در اندخوی بسجاده شیخی نشسته اند سی و هشتم خلی
خینه ریش سیاه میانه قد ولد رحیم بای اندخوی همگی و تمامی سکینات چهار باب دوکان 
ه از برای تحصیل علوم دینیه از ولایت اندخوی آمده در بخارای شریف استقامت کرده تح
ت یکماه بسیار تر شد که از طرف ولایت اندخوی و اقچه نرسیده بکرکی آدم نمی اید تا 


### Custom KWIC (beta)

In [26]:
# Better KWIC: need to (a) list source,
# and (b) have the ability to have multiple tokens in a row.

In [42]:
combined_corpus_toks["al_biruni_card_catalog_suleimanov_fond"][192]

IndexError: list index out of range

In [54]:
five_grams = {k:list(nltk.ngrams(v, 5)) for (k,v) in combined_corpus_toks.items() if len(v) >= 5}

In [57]:
re.match('abcq', 'abcdef')

In [62]:
s = 'abc'
def find_doc(d, s, pos=2):
    """Yield the n-grams of one document whose token at ``pos`` matches ``s``.

    Parameters
    ----------
    d : iterable of token sequences
        The n-grams of a single document (e.g. one value of ``five_grams``).
    s : str or compiled pattern
        Regular expression, anchored at the start of the token (``match``
        semantics, not ``search``).
    pos : int, optional
        Index of the token inside each n-gram that is tested against ``s``.
        Defaults to 2, i.e. the middle token of a five-gram, which is what
        this function previously hard-coded.

    Yields
    ------
    str
        The tokens of each matching n-gram joined with single spaces.
    """
    # Compile once instead of resolving the pattern again for every n-gram.
    pattern = re.compile(s)
    for v in d:
        if pattern.match(v[pos]) is not None:
            yield ' '.join(v)
list(find_doc(five_grams['al_biruni_card_catalog_suleimanov_fond'], 'ف.'))

['الوقفیه الروایه فی مسایل الهدایه']

In [70]:
# Add formatting: right-align the left context and print the keyword in bold.
# NOTE: the ">" alignment may need to be reversed for an RTL script; it is unclear how it is interpreted there.
# It is also unclear how bold renders for Arabic script; the middle word could be given a different color instead.

def print_align(v, m):
    """Print n-grams as aligned keyword-in-context lines.

    Each n-gram is printed as ``<left context> <keyword> <right context>``,
    with the left context right-aligned to a common width and the keyword
    wrapped in ANSI bold escape codes.

    Parameters
    ----------
    v : iterable of token sequences
        The n-grams to print. May be a generator; it is materialised once
        because it has to be traversed twice (width pass, then print pass).
    m : int
        Index of the keyword token inside each n-gram.

    Returns
    -------
    None
        Output goes to stdout. Nothing is printed for empty input.
    """
    rows = list(v)
    if not rows:
        # max() over an empty sequence would raise ValueError.
        return

    # Width of the widest left context; each token is counted with one
    # trailing space, exactly as before.
    plen = max(sum(len(tok) + 1 for tok in row[:m]) for row in rows)
    for row in rows:
        pre = ' '.join(row[:m])
        mid = row[m]
        pos = ' '.join(row[m+1:])
        print(f'{pre:>{plen}s} \033[1m{mid}\033[0m {pos}')

In [69]:
def find_corpus(c, s):
    """Yield labelled matches of regex ``s`` across every document in ``c``.

    Parameters
    ----------
    c : mapping
        Maps a document name to that document's n-grams (the shape of
        ``five_grams``).
    s : str or compiled pattern
        Regular expression passed on to ``find_doc``.

    Yields
    ------
    str
        ``"<document name padded to 50 columns>: <matching n-gram>"``.
    """
    # Iterate over the corpus that was passed in; the previous version ignored
    # `c` and always read the global `five_grams`.
    for k, d in c.items():
        for m in find_doc(d, s):
            yield f'{k:50s}: {m}'
print('\n'.join(find_corpus(five_grams, 'پ.سند')))

jung_i_rivayat_al_biruni_4798                     : را ستاره پرسند و آنچه
ikromcha                                          : دارم م پرسند که بحضرت
khumuli                                           : آن ستم پرسند بجز حواله
tarikh-i_jadida_tashkent_ser725                   : خواهید ازمن پرسند هلاکوخان کفت
ain-i_akbari_murty                                : پایه برافرازد پرسنده نادان و
ain-i_akbari_murty                                : پیشین ولاد پرسنده زهره و
an1_nassim                                        : بساتین هرچه پرسند جواب دانسته
an1_nassim                                        : جهانبانی می پرسند جواب میدهند
badauni_muntakhab_al-tawwarikh                    : ازو می پرسند که پیرزاده
badauni_muntakhab_al-tawwarikh                    : ضیغه عیسی پرسند آن زمان
badauni_muntakhab_al-tawwarikh                    : ازو می پرسند که اول
jahangirnama                                      : کنگاش می پرسند بعضی می
jahangirnama                                      : بوده می پرسند که فتح


In [None]:
five_grams = {k:list(v) for (k,v) in five_grams.items()}

In [None]:
# `five_grams` is a dict of {document name: [5-gram tuples]}. list() on a dict keeps
# only its keys, so flatten the per-document lists into one list of 5-gram tuples.
# NOTE(review): this rebinds `five_grams` from a dict to a list, so cells that expect
# the dict (e.g. find_corpus) must be run before this one -- consider a separate name.
if isinstance(five_grams, dict):
    five_grams = [gram for doc_grams in five_grams.values() for gram in doc_grams]
five_grams[5][2] == "پانصد"

In [None]:
five_grams[5][3]

In [None]:
search_toks = [x for x in five_grams if x[2] == "پانصد"]
search_toks[:5]

___
___

# Conditional Frequency

*Meta-Corpus*

In [None]:
# ConditionalFreqDist() takes an iterable of (condition, sample) pairs.
# The n-gram generator is exhausted once it has been iterated over (not on assignment),
# so it has to be recreated here rather than reused from above.

bigrams_cfd = nltk.ngrams(raw_combo_toks, 2)

cfd = nltk.ConditionalFreqDist(bigrams_cfd)

### Simple Conditional Frequency:

*Meta-Corpus*

In [None]:
search_term = r"جهد"

In [None]:
print (search_term, " is most commonly followed by:\n")
cfd[search_term].most_common(5)

*Document Corpus*

In [None]:
bigrams_doc_fd = nltk.ngrams(raw_doc_toks, 2)

cfd_doc = nltk.ConditionalFreqDist(bigrams_doc_fd)

In [None]:
search_term = "بداند"

In [None]:
print ("\nin the documents corpus, ", search_term, " is most commonly followed by: \n")
cfd_doc[search_term].most_common(5)

### Third term, if first two known:

*Document Corpus (Meta-Corpus simply too computationally costly)*

In [None]:
tri0 = nltk.ngrams(raw_doc_toks, 3)
tri1 = [((a, b), c) for (a, b, c) in tri0]
cfd1 = nltk.ConditionalFreqDist(tri1)

In [None]:
first_term = "بکار"
second_term = "برد"

In [None]:
print ("The pair ", first_term, second_term, " is most commonly followed by :\n")

cfd1[(first_term, second_term)]

### Reversed conditional frequency, i.e. if second word in sequence known but not first

*Corpus used below: `raw_lit_toks`*

In [None]:
search_term = "دلربا"

In [None]:
bi0 = nltk.ngrams(raw_lit_toks, 2)
bir = [(b, a) for (a, b) in bi0]
cfdr = nltk.ConditionalFreqDist(bir)

In [None]:
print ("The term ", search_term, " is most commonly preceded by:\n")

cfdr[search_term].most_common(15)

## Functions

## Multi-Search

In [None]:
def multi_dic(term):
    """Search the glossary, Dehkhoda and Von Melzer tables for ``term`` and print the hits.

    ``term`` is compiled as a regular expression and matched with
    ``Series.str.contains`` (missing values count as non-matches) against:

    - ``glossary["Term"]``       (shows columns UID, Term, Translation)
    - ``dehkhoda["Term"]``       (shows all columns)
    - ``meltzer["Präs.-Stamm"]`` (shows columns Präs.-Stamm, Deutsch)

    The three result frames are printed under their headings. The function
    returns whatever ``print`` returns, i.e. ``None``.
    """
    pattern = re.compile(term)

    def _rows_matching(frame, column):
        # Rows of `frame` whose `column` contains the pattern.
        return frame[frame[column].str.contains(pattern, na=False)]

    glos_hits = _rows_matching(glossary, "Term")[["UID", "Term", "Translation"]]
    dehkhoda_hits = _rows_matching(dehkhoda, "Term")
    melz_hits = _rows_matching(meltzer, "Präs.-Stamm")[["Präs.-Stamm", "Deutsch"]]

    return print("Glossary \n\n", glos_hits, "\n\n\n",
                 "Dehkhoda \n\n", dehkhoda_hits, "\n\n\n",
                 "Von_Meltzer \n\n", melz_hits)




In [None]:
#multi_dic ("ب.د")

## Simple Conditional Frequency Tool

In [None]:
def confreq(term, corpus=raw_combo_toks):
    """Print the five tokens that most often directly follow ``term`` in ``corpus``.

    Parameters
    ----------
    term : str
        The token whose successors are counted.
    corpus : sequence of tokens, optional
        Flat token sequence to build bigrams from. Defaults to
        ``raw_combo_toks`` (bound when this function is defined).

    Returns
    -------
    None
        The result of ``print``; the counts are only written to stdout.
    """
    # Conditional frequency over bigrams: condition = first token, sample = second token.
    followers = nltk.ConditionalFreqDist(nltk.ngrams(corpus, 2))[term].most_common(5)
    return print(term, " is most commonly followed by:\n\n", followers)


In [None]:
confreq ("خان")

In [None]:
    # Inline version of confreq() for a fixed term. The printed message now names the
    # term; the previous copy started with " is most commonly followed by" and no subject.
    bigrams_cfd = nltk.ngrams(raw_combo_toks, 2)
    cfd = nltk.ConditionalFreqDist(bigrams_cfd)
    output = cfd["خانه"].most_common(5)
    result = print ("خانه", " is most commonly followed by:\n\n", output)

## Regex Concordance

In [None]:
def regcon (term, corpus=raw_combo_toks):
    """Print concordance lines for every token in ``corpus`` that matches regex ``term``.

    Parameters
    ----------
    term : str
        Regular expression, anchored at the start of each token (``match``
        semantics, not ``search``).
    corpus : sequence of tokens, optional
        Flat token sequence to search. Defaults to ``raw_combo_toks`` (bound
        when this function is defined); pass another token list to override.

    Returns
    -------
    None
        The result of ``print``; the concordance is only written to stdout.
    """
    search_term = re.compile(term)

    # Take the vocabulary from `corpus` itself. The previous version built `freq`
    # but then filtered the global `combo_freq`, so the argument had no effect.
    freq = nltk.FreqDist(corpus)
    toks = [x for x in freq if search_term.match(x)]

    # Likewise, build the concordance over `corpus` rather than the global `trans_corpus`.
    # NOTE: this wraps the corpus in a new nltk.Text on every call.
    text = nltk.Text(corpus)
    conc1 = [c.line for x in toks for c in text.concordance_list(x)]

    result = print('\n'.join(conc1))

    return result

In [None]:
regcon ("خ.ن", raw_nar_toks)