# Text Deciphering Tool

In [2]:
import pickle, re, nltk, os

In [3]:
import numpy as np
import pandas as pd

from pandas import DataFrame, Series

In [4]:
# Home directory of the current user; the data paths below are built from it.
hdir = os.path.expanduser("~")

Sister files:
- Pickled corpora cleaned in text_cleaning_tokenizing
- Corpora stats in corpora_statistics

## I. Importing Corpora



In [5]:
# Folder that holds the pickled, tokenized and cleaned corpora.
pickle_path = f"{hdir}/Box/Notes/Digital_Humanities/Corpora/pickled_tokenized_cleaned_corpora"

In [6]:
# Corpora pickled by the text_cleaning_tokenizing sister notebook.
# Unpacking is positional, so the order of the names below matters.
# NOTE: pickle.load executes whatever the file contains -- only load pickles you created yourself.
with open(pickle_path + "/corpora.pkl", "rb") as f:
    (
        unsorted_doc_toks,
        indo_xml_toks, hyd_xml_toks, trans_xml_toks,
        trans_nar_toks, indo_nar_toks,
        trans_nar_ext_toks, indo_nar_ext_toks, khiva_doc_toks,
    ) = pickle.load(f)

In [7]:
# Combined ("meta") corpora; unpacking is positional, so keep the names in this order.
with open(pickle_path + "/meta_corpora.pkl", "rb") as f:
    (
        comb_india_nar_toks, comb_trans_nar_toks, nar_corpus_toks, doc_corpus_toks,
        comb_india_toks, comb_trans_toks, comb_turk_toks,
        combined_corpus_toks, mega_corpus_toks,
    ) = pickle.load(f)

In [8]:

#"خان" in combined_corpus_toks["tarikh_i_baljuvan_al_biruni_2663iii_ser412"]
        


## II. Importing Raw Tokens
Tokens without parent-text designation, i.e. the format required by many NLTK routines.

In [9]:
# Raw token sequences, one per (sub-)corpus; unpacking is positional.
with open(pickle_path + "/raw_tokens.pkl", "rb") as f:
    (
        raw_doc_toks, raw_nar_toks, raw_indo_toks,
        raw_trans_toks, raw_lit_toks, raw_combo_toks, raw_turk_toks,
    ) = pickle.load(f)

In [10]:
#indo_nar_toks.keys()

## III. Importing Pre-processed NLTK Data

In [11]:
# Folder that holds the pre-computed, pickled NLTK objects.
pickle_data_path = f"{hdir}/Box/Notes/Digital_Humanities/Corpora/pickled_nltk_data"

In [16]:
# NLTK word frequencies, one table per (sub-)corpus; unpacking is positional.
with open(pickle_data_path + "/frequencies.pkl", "rb") as f:
    (
        combo_freq, pers_lit_freq,
        indo_freq, trans_freq,
        nar_freq, doc_freq,
        turk_freq,
    ) = pickle.load(f)

In [13]:
# NLTK conditional frequency dictionaries (raw tokens); unpacking is positional.
with open(pickle_data_path + "/cfd.pkl", "rb") as f:
    (
        combo_cfd,
        indo_cfd, trans_cfd,
        nar_cfd, doc_cfd,
        turk_cfd,
    ) = pickle.load(f)
    

In [14]:
# NLTK 5-grams (tokens by work); unpacking is positional.
with open(pickle_data_path + "/fivegrams.pkl", "rb") as f:
    (
        combo_five_grams,
        indo_five_grams, trans_five_grams,
        nar_five_grams, doc_five_grams,
        turk_five_grams,
    ) = pickle.load(f)

In [15]:
# Three-way conditional frequency dictionaries (raw tokens); unpacking is positional.
with open(pickle_data_path + "/tri_cfd.pkl", "rb") as f:
    (
        combo_tricfd,
        indo_tricfd, trans_tricfd,
        nar_tricfd, doc_tricfd,
        turk_tricfd,
    ) = pickle.load(f)


## IV. Importing Datasets

- Von Melzer Persian Lexicon
- Glossary
- Place Names

In [83]:
# Folder that holds the CSV datasets (lexicon, glossary, place names, ...).
ds_path = f"{hdir}/Box/Notes/Digital_Humanities/Datasets"

In [121]:
# Von Melzer Persian lexicon (column names come from the file's own header row).
meltzer = pd.read_csv(f"{ds_path}/von_melzer.csv")

In [85]:
#meltzer["Präs.-Stamm"].sample(5)
#meltzer.sample(10)

In [86]:
# Database exports: column names are supplied here via `names=`.

# Locations
locations = pd.read_csv(
    ds_path + '/exported_database_data/locations.csv',
    names=['UID', 'Ar_Names', 'Lat_Name', 'Nickname', 'Type'],
)

# Social Roles
roles = pd.read_csv(
    ds_path + '/exported_database_data/roles.csv',
    names=['UID', 'Term', 'Emic', 'Etic', 'Scope'],
)

# Glossary
glossary = pd.read_csv(
    ds_path + '/exported_database_data/glossary.csv',
    names=['UID', 'Term', 'Eng_Term', 'Translation', 'Transliteration', 'Scope', 'Tags'],
)

___
___

In [87]:
# Dehkhoda dictionary: two columns, headword and definition.
dehkhoda = pd.read_csv(
    f"{ds_path}/dehkhoda_dictionary.csv",
    names=['Term', 'Definition'],
)

In [88]:
#dehkhoda.sample(10)

# Basic Search

Regex reminders:
- Just the word itself: `^مال$`

In [89]:
# "." stands for any single character, i.e. one unknown letter between the two known ones.
search_term = re.compile("ب.د")

### Von Melzer Persian Dictionary

In [122]:
# Keep the lexicon rows whose "Präs.-Stamm" value contains a match; missing values count as no match.
melz_query_mask = meltzer["Präs.-Stamm"].str.contains(search_term, na=False)
melz_query = meltzer.loc[melz_query_mask]
melz_query

Unnamed: 0,UID,Volume,Unnamed: 2,Persisch,Präs.-Stamm,Transkription,Deutsch,Bemerkung,Quellenangaben
62,63,I,62,63,‫ﺁب بردار‬,āb-bar-dār,sinnvoll; gedankenreich,,Haïm 1934:I 1045b
188,189,I,188,189,‫ﺁب دار‬,āb-dār,(blankes) Schwert,,"Firdausī (Šāhnāma, FǦ I 46/8)"
189,190,I,189,190,‫ﺁب دار‬,āb-dār,blank; blinkend; glänzend; strahlend,,"Rūdakī (LF 33/9), Firdausī, Hugo 72/27a, Ku 15/12"
190,191,I,190,191,‫ﺁب دار‬,āb-dār,hell,,Sūzanī
191,192,I,191,192,‫ﺁب دار‬,āb-dār,Mundschenk,,Polak 1865:II 58
192,193,I,192,193,‫ﺁب دار‬,āb-dār,reich; mächtig,,Sanāʾī (FǦ I 46/6)
193,194,I,193,194,‫ﺁب دار‬,āb-dār,saftig,,"FN I 10b, Armaġān XVIII 304/21"
194,195,I,194,195,‫ﺁب دار‬,āb-dār,schön,,"Rūdakī (LF 33/9), Sūzanī, Farruḫī (Dīvān 2/19)..."
195,196,I,195,196,‫ﺁب دار‬,āb-dār,wassergefüllt,,FN I 10b
196,197,I,196,197,‫ﺁب دار‬,āb-dār,wasserhältig; wasserreich,,Firdausī


### Database Terms

#### (a) Technical Lexicon

In [91]:
# Keep the glossary rows whose "Term" contains a match; missing values count as no match.
glos_query_mask = glossary["Term"].str.contains(search_term, na=False)
glos_query = glossary.loc[glos_query_mask]
glos_query

Unnamed: 0,UID,Term,Eng_Term,Translation,Transliteration,Scope,Tags
105,107,سه بندی,sih-bandi (sebundy),troops employed in the collection of revenues,,indic,taxesmilitary
118,121,جمعبندی,jama-bandi,,jamaʿ-bandī,indic,taxes
126,129,بنده نوازا,banda-nawaza,"O servant-succoring, world-sheltering lord",banda-nawāzā,,salutationhonorific


#### (b) Social Roles

In [92]:
# Keep the social-role rows whose "Emic" term contains a match; missing values count as no match.
roles_query_mask = roles["Emic"].str.contains(search_term, na=False)
roles_query = roles.loc[roles_query_mask]
roles_query

Unnamed: 0,UID,Term,Emic,Etic,Scope
13,14,Naqshbandiyya,نقشبندیه,,Islamic
14,15,Naqshbandiyya-Mujaddidiyya,نقشبندیه مجددیه,,Islamic
65,66,Maktab-dar,مکتب دار,,Islamic
190,193,Shahbandar,شهبندر,consul,Ottoman


#### (c) Place Names

In [93]:
# Keep the place-name rows whose "Ar_Names" value contains a match; missing values count as no match.
loc_query_mask = locations["Ar_Names"].str.contains(search_term, na=False)
loc_query = locations.loc[loc_query_mask]
loc_query

Unnamed: 0,UID,Ar_Names,Lat_Name,Nickname,Type
52,53,خیرآباد,Khayrābād,,tuman
59,60,فتحاباد,Fatḥābād,Fathabad,
65,67,آبادی,Ābādī,,
80,82,بغداد,Baghdād,Baghdad,
99,103,شیر آباد,Shīr ĀbādSherabad,,town
101,105,رامپورمسطفی آباد,RāmpūrMustafabad,,city
119,123,قبادیان,QabādiyānКабадиан,,villagevilayat
129,133,استرآبادگرگان,AstarabadGurgan,Astarabad,city
145,149,عشق آباد,ʿIshq ĀbādAshgabat,Ashgabat,city
152,156,دهبید,Dahbīd,,village


### Corpus Tokens

In [184]:
# Pattern for the corpus-token search below; "." stands for one unknown character.
search_term = re.compile("د.ر")

In [186]:
# Keep this table under its own name. Rebinding `combo_freq` here would silently
# replace the combined-corpus frequencies loaded from frequencies.pkl, which
# match_freq() and regcf() read later in the notebook.
doc_tok_freq = nltk.FreqDist(raw_doc_toks)

# re.match anchors at the start of the token, so these are tokens *beginning* with the pattern.
toks = [x for x in doc_tok_freq if re.match(search_term, x)]
toks[:5]

AttributeError: 'list' object has no attribute 'most_common'

### Keyword in Context

### NLTK Concordance

In [96]:

# The concordance methods live on nltk.Text rather than on plain sequences,
# so the raw token sequence is wrapped in a Text object first.
trans_corpus = nltk.Text(raw_combo_toks)

#trans_corpus.concordance('خانه')



### Regex Concordance

*Tokens in the corpus that match the regex:*

In [97]:
# Tokens that start with any three characters followed by the known ending.
toks = [tok for tok in combo_freq if re.match(r'...خوی', tok) is not None]
toks[:5]

['اندخوی']

In [98]:
# Gather the concordance hits of every matching token into one flat list, then print their lines.
conc0 = [hit for tok in toks for hit in trans_corpus.concordance_list(tok)]
conc1 = [hit.line for hit in conc0]
print('\n'.join(conc1))

سیده اند سی و هفتم اخوند داملا طاهر اندخوی و ایشان فی سبیل الله مال و جان فدا 
 بسیار خذمتها کرده اند و ایشان ص در اندخوی بسجاده شیخی نشسته اند سی و هشتم خلی
خینه ریش سیاه میانه قد ولد رحیم بای اندخوی همگی و تمامی سکینات چهار باب دوکان 
ه از برای تحصیل علوم دینیه از ولایت اندخوی آمده در بخارای شریف استقامت کرده تح
ت یکماه بسیار تر شد که از طرف ولایت اندخوی و اقچه نرسیده بکرکی آدم نمی اید تا 


### Custom KWIC (beta)

In [99]:
# Build the 5-grams of every work; works with fewer than five tokens are left out.
five_grams = {
    title: list(nltk.ngrams(tokens, 5))
    for title, tokens in combined_corpus_toks.items()
    if len(tokens) >= 5
}

In [100]:
# Find in Document
## This function takes a dictionary of 5-grams as the first argument,
## a regex search term as the second argument, and returns the sequence of 5 words

def find_doc(d, s):
    """Yield, joined with spaces, every word sequence in ``d`` whose third word matches ``s``.

    d: iterable of word sequences (the 5-grams of one work).
    s: regex, applied with ``re.match`` and therefore anchored at the start of the word.
    """
    for gram in d:
        if re.match(s, gram[2]) is None:
            continue
        yield ' '.join(gram)
            

# Note: Return sends a specified value back to its caller
# whereas Yield can produce a sequence of values.


# Example:
## list(find_doc(five_grams['al_biruni_card_catalog_suleimanov_fond'], 'ف.'))

In [101]:
# Find Corpus
## Produces a generator object with the KWIC with associated work title

def find_corpus(c, s):
    """Yield one keyword-in-context line per match in corpus ``c``, prefixed with the work title.

    c: dict mapping a work title to that work's 5-grams.
    s: regex handed on to find_doc(), which tests it against the middle word.

    The corpus passed in as ``c`` is the one searched; the earlier version
    ignored the argument and always read the global ``five_grams``.
    """
    for title, grams in c.items():
        for hit in find_doc(grams, s):
            # Title is left-aligned and padded to 50 characters so the contexts line up.
            yield f'{title:50s}: {hit}'
            

In [102]:
# Formatting

def print_align(v, m):
    """Print word sequences with the keyword column right-aligned and the keyword in bold.

    v: iterable of word sequences (e.g. 5-gram tuples); a one-shot generator is fine.
    m: index of the keyword inside each sequence.

    Prints nothing when ``v`` is empty.
    """
    # v is traversed twice (once to measure, once to print), so materialise it first;
    # otherwise a generator argument is already exhausted when the print loop starts.
    rows = list(v)
    if not rows:
        return  # max() below would raise ValueError on an empty sequence

    # Width of the widest left context (each word counted with one separating space).
    plen = max(sum(len(z) + 1 for z in x[:m]) for x in rows)
    for x in rows:
        pre = ' '.join(x[:m])
        mid = x[m]
        pos = ' '.join(x[m+1:])
        # \033[1m ... \033[0m switches bold on and off in ANSI-capable output.
        print(f'{pre:>{plen}s} \033[1m{mid}\033[0m {pos}')

In [103]:
# print_align() slices every item as a sequence of words, so it needs the raw 5-gram
# tuples rather than the pre-formatted "title: context" strings find_corpus() yields.
print_align([g for d in five_grams.values() for g in d if re.match('پ.سند', g[2])], 2)

In [104]:
# Calling find_corpus() only creates the generator; nothing is searched until it is iterated.
find_corpus(five_grams, 'پ.سند')

<generator object find_corpus at 0x1a23b78cf0>

In [158]:
# One keyword-in-context line per match, each prefixed with the title of the work.
print(*find_corpus(five_grams, '^من.قر?$'), sep='\n')

haji_nimatallah_tazkirat_al_shuara_i_muhtaram_al_biruni_2252_ii: در علم منطق بقراط را
haji_nimatallah_tazkirat_al_shuara_i_muhtaram_al_biruni_2252_ii: نجوم و منطق و فلسفه
manaqib_wa_maqamat_i_sayyid_muhammad_ataallah_shaykh_al_islam_pnb_200_ser42: لاف از منطق و حکمت
mujaddidi_manaqib_al_ahmadiyyah_va_maqamat_al_saidiyyah_al_biruni_2933ii: کتابهای علم منطق خواندم دلم
musayyab_bukhari_atallah_khwaja_maqamat_i_mashayikh_spbgu_854_ff_676b-718b_ser40: تامل نحو منطق کلام حکمت
musayyab_bukhari_muhammad_sharif_maqamat_i_mashayikh_spbgu_854_ff_763b-893a_ser40: در علم منطق نیز اجتهاد
sadr_zia_tazkar_i_ashar                           : سلمان در منطق و فلسفه
tazkirat_al_shuara_i_abd_al_azim_shari_al_biruni_3396iii: نحو و منطق و صلاف
tazkirat_al_shuara_i_abd_al_azim_shari_al_biruni_3396iii: در علوم منطق و حکمت
tazkirat_al_shuara_i_abd_al_azim_shari_al_biruni_3396iii: بلاغت و منطق و فقه
tazkirat_al_shuara_i_abd_al_azim_shari_al_biruni_3396iii: نحو و منطق و بلاغت
topical_card_catalog_index_al_biruni_

___
___

# Conditional Frequency

*Meta-Corpus*

In [106]:
# ConditionalFreqDist() takes an iterable of (condition, sample) pairs; here each
# bigram (word, next_word) makes the first word the condition.
# nltk.ngrams() returns a one-shot generator: once the ConditionalFreqDist below
# has consumed it, `bigrams_cfd` is empty and must be recreated before any reuse.

bigrams_cfd = nltk.ngrams(raw_combo_toks, 2)

cfd = nltk.ConditionalFreqDist(bigrams_cfd)

### Simple Conditional Frequency:

*Meta-Corpus*

In [107]:
# Exact token to look up (used as a dictionary key below, not as a regex).
search_term = "جهد"

In [108]:
# Five most frequent successors of the term in the combined raw tokens.
print(f"{search_term}  is most commonly followed by:\n")
cfd[search_term].most_common(5)

جهد  is most commonly followed by:



[('و', 15), ('بلیغ', 10), ('تمام', 8), ('بکار', 6), ('بتقدیم', 5)]

*Document Corpus*

In [109]:
# Same bigram construction as for the meta-corpus, restricted to the document tokens.
# `bigrams_doc_fd` is a one-shot generator and is exhausted by the line that follows.
bigrams_doc_fd = nltk.ngrams(raw_doc_toks, 2)

cfd_doc = nltk.ConditionalFreqDist(bigrams_doc_fd)

In [110]:
# Exact token to look up in the document-corpus bigrams.
search_term = 'بداند'

In [111]:
# Five most frequent successors of the term within the document corpus.
print(f"\nin the documents corpus,  {search_term}  is most commonly followed by: \n")
cfd_doc[search_term].most_common(5)


in the documents corpus,  بداند  is most commonly followed by: 



[('منصب', 1), ('مسموع', 1)]

### Third term, if first two known:

*Document Corpus (Meta-Corpus simply too computationally costly)*

In [112]:
# Re-shape every trigram (a, b, c) into ((a, b), c): the first two words form
# the condition, the third word is the sample that gets counted.
tri0 = nltk.ngrams(raw_doc_toks, 3)
tri1 = [(gram[:2], gram[2]) for gram in tri0]
cfd1 = nltk.ConditionalFreqDist(tri1)

In [159]:
# The two known words, in reading order.
first_term, second_term = "باید", "که"

In [160]:
# Frequency table of the words that follow the pair in the document corpus.
print(f"The pair  {first_term} {second_term}  is most commonly followed by :\n")

cfd1[first_term, second_term]

The pair  باید که  is most commonly followed by :



FreqDist({'مومی': 4, 'مشار': 2, 'لوازم': 1, 'موضعمذکور': 1, 'بر': 1, 'قصبه': 1, 'بهبودی': 1, 'وجه': 1, 'قرار': 1, 'اراضی': 1, ...})

### Reversed conditional frequency, i.e. if second word in sequence known but not first

*Literature Corpus (`raw_lit_toks`)*

In [115]:
# Known second word; the lookup below lists the words that precede it.
search_term = 'دلربا'

In [116]:
# Swap every bigram (a, b) to (b, a) so that the *second* word becomes the
# condition and the word preceding it is what gets counted.
bi0 = nltk.ngrams(raw_lit_toks, 2)
bir = [pair[::-1] for pair in bi0]
cfdr = nltk.ConditionalFreqDist(bir)

In [117]:
# Fifteen most frequent predecessors of the term.
print(f"The term  {search_term}  is most commonly preceded by:\n")

cfdr[search_term].most_common(15)

The term  دلربا  is most commonly preceded by:



[('آن', 14),
 ('و', 11),
 ('حسن', 5),
 ('صنم', 4),
 ('روی', 4),
 ('خیالات', 4),
 ('جمال', 3),
 ('رخ', 3),
 ('تو', 3),
 ('یار', 3),
 ('های', 3),
 ('شوخ', 2),
 ('رود', 2),
 ('ای', 2),
 ('بدان', 2)]

# Utility Functions

In [141]:
def corpora_guide ():
    """Print a quick reference that maps each corpus description to its variable name.

    Takes no arguments and returns None; the guide is written to stdout.
    """
    # Adjacent string literals are concatenated by the parser. The eight leading
    # spaces inside each literal reproduce the layout the guide has always had.
    print(
        "\tCombined Token Corpora:\n"
        "        \t Narrative Sources from India: comb_india_nar_toks\n"
        "        \t Narrative Sources from Transoxania: comb_trans_nar_toks\n\n"
        "        \t All Narrative Sources: nar_corpus_toks\n"
        "        \t All Document Sources: doc_corpus_toks\n\n"
        "        \t Documents and Narrative Sources: combined_corpus_toks\n"
        "        \t Mega Corpus including Persian lit. corpus: mega_corpus_toks\n\n\n"
        "        Individual Corpora:\n"
        "        \t External Indic Corpus: indo_nar_ext_toks\n"
        "        \t External Transoxania Corpus: trans_nar_ext_toks\n\n"
        "        \t Khiva Turkic Document Corpus: khiva_doc_toks\n\n"
        "        \t Internal India Narrative Corpus: indo_nar_toks\n"
        # "Corpus" was misspelled "orpus" in the printed guide.
        "        \t Internal Transoxania Narrative Corpus: trans_nar_toks\n\n"
        "        \t XML-stage Transoxania Documents: trans_xml_toks\n"
        "        \t XML-stage Indic Documents: indo_xml_toks\n"
        "        \t XML-stage Hyderabad Documents: hyd_xml_toks\n\n"
        "        \t"
    )

In [142]:
# Print the quick reference of corpus variable names.
corpora_guide()

	Combined Token Corpora:
        	 Narrative Sources from India: comb_india_nar_toks
        	 Narrative Sources from Transoxania: comb_trans_nar_toks

        	 All Narrative Sources: nar_corpus_toks
        	 All Document Sources: doc_corpus_toks

        	 Documents and Narrative Sources: combined_corpus_toks
        	 Mega Corpus including Persian lit. corpus: mega_corpus_toks


        Individual Corpora:
        	 External Indic Corpus: indo_nar_ext_toks
        	 External Transoxania Corpus: trans_nar_ext_toks

        	 Khiva Turkic Document Corpus: khiva_doc_toks

        	 Internal India Narrative Corpus: indo_nar_toks
        	 Internal Transoxania Narrative orpus: trans_nar_toks

        	 XML-stage Transoxania Documents: trans_xml_toks
        	 XML-stage Indic Documents: indo_xml_toks
        	 XML-stage Hyderabad Documents: hyd_xml_toks

        	


### Frequency

[Another way of doing max value](https://stackoverflow.com/questions/268272/getting-key-with-maximum-value-in-dictionary):

```python
def keywithmaxval(d):
     """ a) create a list of the dict's keys and values; 
         b) return the key with the max value"""  
     v=list(d.values())
     k=list(d.keys())
     return k[v.index(max(v))]
```

In [101]:
def best_match (term, corpus):
    """Return the most frequent key of ``corpus`` that matches the regex ``term``.

    term:   regex (string or compiled pattern); applied with ``match``, i.e.
            anchored at the start of each key.
    corpus: frequency dictionary mapping token -> count.

    Returns ``[matching_term, frequency]``, or ``None`` when no key matches.
    If several matches share the highest count, the first one in the
    dictionary's iteration order is returned.
    """
    pattern = re.compile(term)
    hits = [tok for tok in corpus if pattern.match(tok)]
    if not hits:
        return None

    # max() finds the top entry in one pass; sorting every match is unnecessary.
    top = max(hits, key=corpus.get)
    return [top, corpus[top]]

In [165]:
#help (best_match)

#best_match ("error", nar_freq)

# Example: the most frequent token in the document frequencies that starts with the pattern.
best_match("د.رو", doc_freq)

['داروی', 6]

In [119]:
def match_freq(term):
    """Print the most frequent corpus tokens that match the regex ``term``.

    Reports, in this order:
      * the best match in the combined corpus (``combo_freq``) and, when at
        least three tokens match, the second and third most frequent matches;
      * the best match in each sub-corpus (documents, narrative, Indic, Transoxania);
      * the best match in the Persian literature corpus.

    Sections without any match are skipped. Returns None.
    """
    pattern = re.compile(term)

    top = best_match(term, combo_freq)
    if top is not None:
        print ("Most likely match in corpus:\n\n\t",
               top[0], "appearing ", top[1], "times;\n")

    # Rank every matching token by frequency, highest first. The runners-up are
    # taken from this ranking; previously the "third" match was picked from an
    # alphabetical sort, and exactly three matches were not reported at all.
    ranked = sorted((tok for tok in combo_freq if pattern.match(tok)),
                    key=combo_freq.get, reverse=True)
    if len(ranked) >= 3:
        second, third = ranked[1], ranked[2]
        print ("\tfollowed by:\n\t",
               second, "appearing ", combo_freq[second], "times, and \n\t",
               third, "appearing ", combo_freq[third], "times\n\n")

    print ("Most likely matches in sub-corpora:\n")

    # (label, frequency table, line ending) -- one entry per sub-corpus, so
    # best_match() runs once per table instead of two or three times.
    sub_corpora = (
        ("\tDocuments:", doc_freq, "times;\n"),
        ("\tNarrative texts:", nar_freq, "times;\n\n"),
        ("\tIndic texts:", indo_freq, "times;\n"),
        ("\tTransoxania texts:", trans_freq, "times;\n"),
    )
    for label, freq, ending in sub_corpora:
        hit = best_match(term, freq)
        if hit is not None:
            print (label, hit[0], "appearing ", hit[1], ending)

    print ("\nMost likely matches in Persian literature corpus:\n\t")

    lit_hit = best_match(term, pers_lit_freq)
    if lit_hit is not None:
        print ("\t", lit_hit[0], "appearing ", lit_hit[1], "times;\n")
    
    

    


In [111]:
#match_freq("error")

## Multi-Search

In [123]:
def multi_dic (term):
    """Look up the regex ``term`` in the corpus frequencies and in all three dictionaries.

    First prints the corpus frequency report (match_freq), then the matching
    rows of the glossary, the Dehkhoda dictionary and the Von Melzer lexicon.
    Everything is printed; the function returns None.
    """
    match_freq(term)

    search_term = re.compile(term)

    # na=False: rows with a missing value in the searched column count as no match.
    glos_query_mask = glossary["Term"].str.contains(search_term, na=False)
    glos_query = glossary.loc[glos_query_mask, ["UID", "Term", "Translation"]]

    dehkhoda_query_mask = dehkhoda["Term"].str.contains(search_term, na=False)
    dehkhoda_query = dehkhoda[dehkhoda_query_mask]

    melz_query_mask = meltzer["Präs.-Stamm"].str.contains(search_term, na=False)
    melz_query = meltzer.loc[melz_query_mask, ["Präs.-Stamm", "Deutsch"]]

    # The bare `glos_query` style expression statements were removed: inside a
    # function they display nothing. print() returns None, so there is no
    # result worth binding or returning either.
    print ("Glossary \n\n", glos_query,"\n\n\n",
           "Dehkhoda \n\n", dehkhoda_query,"\n\n\n",
           "Von_Meltzer \n\n", melz_query)


In [124]:
# Example: frequency report plus dictionary entries for terms beginning with the pattern.
multi_dic ("^سند")

Most likely match in corpus:

	 سند appearing  355 times;

	followed by
	 سندر appearing  116 times, and 
	 سندان appearing  6 times


Most likely matches in sub-corpora:

	Documents: سند appearing  60 times;

	Narrative texts: سند apprearing 295 times;


	Indic texts: سند appearing  263 times;

	Transoxania texts: سند appearing  32 times;


Most likely matches in Persian literature corpus:
	
	 سندان appearing  254 times;

Glossary 

      UID Term Translation
258  261  سند         NaN 


 Dehkhoda 

           Term                                         Definition
10448     سند    1 برگه، بنچاق، قباله، قواله، قولنامه، مدرک 2 ...
10449     سند   صفت 1 حرام‌زاده، سندره، زنازاده، روسپی زاده، و...
10450  سندساز   صفت 1 سوء‌استفاده‌چی، مختلس 2 ترفندباف، دروغ‌پ...
10451    سنده                                & پیشاب، شاش، ادرار 
10452   سندیت            1 ارزش، اصالت، اعتبار 2 ملاک، مناط، حجت 
10453  سندیکا                                      اتحادیه، انجمن 


 Von_Meltzer 

 Empty DataFr

## Simple Conditional Frequency Tool

In [173]:
def confreq (term):
    """Print the five most common successors of the exact token ``term``.

    Uses the combined conditional frequencies (``combo_cfd``) and, when the
    term occurs there, also the document sub-corpus (``doc_cfd``).
    Prints "no results" when the combined corpus has no successors for the term.
    """
    followers = combo_cfd[term]
    if len(followers) == 0:
        print ("no results")
    else:
        print (term, " is most commonly followed by:\n\n", followers.most_common(5))
        print("\nWithin sub-corpora:\n")

        # Still need to fill out sub-corpora

        doc_followers = doc_cfd[term]
        if len(doc_followers) > 0 :
            print ("\tDocuments:", term, " is most commonly followed by:\n\n\t", doc_followers.most_common(5))

   

In [185]:
def regcf (term):
    """Resolve the regex ``term`` to its most frequent match, then print that word's successors.

    The best match is looked up in ``combo_freq``; its five most common
    successors are read from ``combo_cfd``. Prints nothing when no token matches.
    """
    hit = best_match(term, combo_freq)  # looked up once instead of four times
    if hit is None:
        return

    word, count = hit
    print ("The most likely match for ", term, " is ", word,
           "(", count, ").\n")
    print ("Conditional frequency (combined corpus):\n\t", combo_cfd[word].most_common(5))

    # Fill out subcorpora
        
        # Fill out subcorpora
    

In [186]:
#regcf("د.رو")

### Custom KWIC

In [187]:
# Find in Document

def find_doc(d, s):
    """Yield the word sequences of one work whose middle word matches a regex.

    d: iterable of 5-grams (word sequences) belonging to a single work.
    s: regex tested against the third word with ``re.match`` (anchored at the word's start).

    Each hit is yielded as a single string, its words joined with spaces.
    """
    keyword_slot = 2  # third word, i.e. the centre of a 5-gram
    for words in d:
        if re.match(s, words[keyword_slot]):
            yield ' '.join(words)
            

# Note: Return sends a specified value back to its caller
# whereas Yield can produce a sequence of values.


# Example:
## list(find_doc(five_grams['al_biruni_card_catalog_suleimanov_fond'], 'ف.'))

In [199]:
# Find Corpus
## Produces a generator object with the KWIC with associated work title

def find_corpus(c, s):
    """Yield a "title: context" line for every 5-gram in corpus ``c`` that find_doc() accepts.

    c: dict mapping a work title to that work's 5-grams.
    s: regex handed on to find_doc().
    """
    for title in c:
        for kwic_line in find_doc(c[title], s):
            # Title is left-aligned and padded to 50 characters.
            yield f'{title:50s}: {kwic_line}'
            

In [200]:
def kwic(term, corpus=combo_five_grams):
    """Print every keyword-in-context line for the regex ``term``, one per line.

    corpus: dict of work title -> 5-grams; defaults to the combined 5-grams.
    """
    hits = find_corpus(corpus, term)
    print('\n'.join(hits))
    
    # todo: organize this by best match
    

In [206]:
#kwic("د.رو", indo_five_grams)

In [None]:
# TODO: functions for 3-part confreq, and reverse confreq