### Custom Frequency Dictionary

1) Import dependencies

In [1]:
import os
import regex as re
import math
import string
from collections import Counter
import pandas as pd
import numpy as np
import nltk
import PyPDF2

2) Import Standard English Frequency Dictionary

In [2]:
# SymSpell frequency dictionary: one "<word> <count>" entry per line.
# The file is parsed line by line. Joining all lines and re-splitting them with
# a regex truncates the final count (a trailing "12345" is rewritten as
# "1234 5") and splits any token that begins with digits.
d_tup = []
# utf-8-sig also accepts plain UTF-8/ASCII and drops a leading BOM if present
with open('frequency_dictionary_en_82_765.txt', 'r', encoding='utf-8-sig') as myfile:
    for line in myfile:
        parts = line.lower().split()
        # keep only well-formed "<word> <count>" lines; skip blanks and junk
        if len(parts) == 2 and parts[1].isdecimal():
            d_tup.append((parts[0], parts[1]))

# assign counter: word -> dictionary frequency
C = Counter()
for word, count in d_tup:
    C[word] = int(count)

3) Import English Book with Special Words

In [3]:
# Read every page of the PDF into a (page, text) dataframe.
# The file is opened in a context manager so the handle is released even if
# PyPDF2 raises while parsing the document or extracting a page.
# NOTE(review): PdfFileReader / numPages / getPage / extractText are the legacy
# PyPDF2 names; PyPDF2 3.x removed them in favour of PdfReader / len(reader.pages)
# / reader.pages[i] / extract_text(). Pin PyPDF2<3 or migrate when upgrading.
with open('The-Bhagavad-Gita-Translation-by-Shri-Purohit-Swami.pdf', 'rb') as pdfFileObj:
    # creating a pdf reader object
    pdfReader = PyPDF2.PdfFileReader(pdfFileObj)

    # calculate page quantity
    num_pages = pdfReader.numPages

    # collect the page index and the flattened text of each page
    page_list = []
    text_list = []
    for i in range(num_pages):
        pageObj = pdfReader.getPage(i)
        page_list.append(i)
        # put each page on a single line
        text = re.sub(r"\n", " ", pageObj.extractText())
        # replace the "ﬂ" ligature character with a space
        # (presumably an extraction artifact in this PDF -- TODO confirm)
        text = re.sub(r"ﬂ"," ", text)
        text_list.append(text)

# write to dataframe
data = pd.DataFrame.from_dict({'page':page_list, 'text':text_list})
data.head()



Unnamed: 0,page,text
0,0,TThhee BBhhaaggaavvaadd GGiittaa Translation...
1,1,A NOTE ABOUT THE TRANSLATOR Shri Purohit Swam...
2,2,CONTENTS ONE: THE DESPONDENCY OF ARJUNA.........
3,3,"PREFACE The Bhagavad Gita , the greatest dev..."
4,4,1 1 ONE: THE DESPONDENCY OF ARJUNA The King ...


In [4]:
# Summarise the dataframe: index range, column names, non-null counts and dtypes
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 55 entries, 0 to 54
Data columns (total 2 columns):
page    55 non-null int64
text    55 non-null object
dtypes: int64(1), object(1)
memory usage: 960.0+ bytes


In [5]:
# Display the extracted text stored in the row labelled 3
data.loc[3, 'text']

'  PREFACE The Bhagavad Gita , the greatest devotional book of Hinduism, has long been recognized  as one of the world™s spiritual classics and a guide to all on the path of Truth. It is sometimes known as the Song of the Lord or the Gospel of the Lord Shri Krishna.  According to Western scholarship, it was composed later than the Vedas and the   Upanishads Œ probably between the fifth and second centuries before Christ. It is a  fragment, part of the sixth book of the epic poem The Mahabaratha.     The Mahabaratha tells of the Pandavas, Prince Arjuna and his four brothers, growing up  in north India at the court of their uncle, the blind King Dhritarashtra, after the death of  their father, the previous ruler. There is always great rivalry between the Pandavas or sons  of Pandu and the Kauravas, the one hundred sons of Dhritarashtra. Eventually the old  king gives his nephews some land of their own but his eldest son, Duryodhana, defeats   Yudhisthira, the eldest Pandava, by cheating 

4) Process the Text

In [6]:
# all pages as one string; pages are joined with a space so the last word of
# one page cannot fuse with the first word of the next page
TEXT = " ".join(data['text'])
# swap out special characters for spaces (each run collapses to one space)
TEXT = re.sub(r"[^a-zA-Z0-9]+", ' ', TEXT)
# swap out numbers for spaces (one space per digit)
TEXT = re.sub(r"\d"," ", TEXT)
# number of characters (not words) in the cleaned text
len(TEXT)

109133

In [7]:
# Split the lower-cased text into word tokens
tokens = nltk.word_tokenize(TEXT.lower())
# The vocabulary is the sorted list of distinct tokens
vocab = sorted(set(tokens))
# Number of distinct tokens
vocab_size = len(vocab)
vocab_size

2899

In [8]:
# get frequency distribution: how many times each token occurs in the corpus
fdist = nltk.FreqDist(tokens)
# show the five most frequent tokens together with their counts
fdist.most_common(5)

[('the', 1597), ('and', 744), ('of', 744), ('is', 430), ('in', 384)]

In [9]:
# Special words: corpus tokens missing from the standard dictionary C,
# excluding tokens that occur exactly once (treated as one-off typos)
# and tokens that are a single character long.
special = [
    token
    for token in fdist
    if token not in C and not (fdist[token] == 1 or len(token) == 1)
]
print(special)

['shri', 'purohit', 'bhagavad', 'gita', 'dnyana', 'mahabaratha', 'pandavas', 'dhritarashtra', 'pandu', 'duryodhana', 'sanjaya', 'bheeshma', 'kurukshetra', 'drona', 'drupada', 'bheema', 'virata', 'soubhadra', 'droupadi', 'karna', 'kuru', 'conches', 'kunti', 'unmanifest', 'thinkest', 'centered', 'viwaswana', 'manu', 'knowest', 'apana', 'om', 'narada', 'vyasa', 'fillest']


In [10]:
# Frequency table for the corpus vocabulary.
# Drop words that occur exactly once and words that are one character long.
kept_words = [w for w in vocab if not (fdist[w] == 1 or len(w) == 1)]

w_list = list(kept_words)               # the word itself
f_list = [fdist[w] for w in kept_words]  # count in the corpus
d_list = [C[w] for w in kept_words]      # count in the dictionary (0 if absent)

df_words = pd.DataFrame({'word':w_list,'freq':f_list,'dict':d_list})
# show five randomly chosen rows
df_words.sample(5)

Unnamed: 0,word,freq,dict
29,air,3,160850401
891,procreate,2,104450
129,blind,3,16485480
134,body,26,136560842
3,abnegation,3,26265


In [11]:
# frequency scale factor: the dictionary count of 'the' divided by the
# corpus count of 'the'
# NOTE(review): raises ZeroDivisionError if 'the' never occurs in the corpus
scale = C['the']/fdist['the']
scale

14487070.232936757

5) Update the Dictionary and Save

In [12]:
# Update the dictionary from the corpus: every word kept in df_words gets its
# corpus frequency multiplied by `scale`, whether or not it was already in C.
# Existing entries are overwritten and special words are added. One assignment
# covers both cases because df_words['freq'] already holds fdist[word]. No
# blanket try/except is used, so a failure here surfaces instead of silently
# dropping a word.
for word, freq in zip(df_words['word'], df_words['freq']):
    C[word] = int(scale * freq)

# spot-check one of the special words
C['drupada']

43461210

In [13]:
# Order the (word, count) pairs by count, highest first, for output
Cout = C.most_common()

In [14]:
# Save the dictionary as one "<word> <count>" line per entry, in Cout order
with open("custom_dict_bg.txt", encoding='utf-8', mode='w') as fp:
    fp.writelines('{} {}\n'.format(tag, count) for tag, count in Cout)

6) Proceed to the SymSpell worksheet in a separate notebook.