In [1]:
#Import the required modules

#NLP Modules
import spacy
nlp = spacy.load('en_core_web_sm')
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer


#Statistics Modules
import pandas as pd
import numpy as np
from scipy.sparse import coo_matrix

In [2]:
#Read the text file

# The context manager closes the handle even if read() raises,
# which the manual open()/close() pair did not guarantee.
with open('History-Class6.txt', 'r') as fp:
    text = fp.read()

In [3]:
#Collect named entities as the first group of raw keywords.

doc = nlp(text)

# Keep an entity only when it is not a numeric label (ORDINAL / CARDINAL),
# its vocab entry is flagged alphabetic, and it is a single whitespace-delimited word.
ent_list = {
    ent.text
    for ent in doc.ents
    if ent.label_ not in ("ORDINAL", "CARDINAL")
    and nlp.vocab[ent.text].is_alpha
    and len(ent.text.split()) < 2
}

#Dictionary holding the keyword groups to demarcate.
#Later, the key-value pairs section will be segmented with the key being the main clue.
raw_keywords = {'Entities': ent_list}

print(ent_list)

{'Buddhist', 'Vajjis', 'Atranjikhera', 'Forests', 'today', 'Bihar', 'Priests', 'Ganga', 'Pataliputra', 'RepublicThis', 'Vassakara', 'Vedas', 'Mahajanapadas', 'Europe', 'Appointments', 'Mahavira', 'Citizens', 'Egypt', 'Indian', 'Vedic', 'Bimbisara', 'Chaityas', 'Ajatasattu', 'Delhi', 'Gupta', 'Vajji', 'Meerut', 'Macedonia', 'Athens', 'India', 'Etah', 'Yamuna', 'Rajgir', 'Alexander', 'Kings', 'Rajagriha', 'Rajas', 'Shankaran', 'Hastinapur', 'Buddha'}


In [4]:
#Pre-processing the text.
# NOTE: `text` is rebound from a str to a list of tokens at the end of this cell,
# so the file-reading cell must be run again before re-running this one.

# 1. Remove markup tags, in literal (<b>) or HTML-escaped (&lt;b&gt;) form.
#    This has to happen BEFORE non-letters are stripped: once '<', '>', '&' and ';'
#    have been replaced by spaces, no tag pattern can match any more.
#    Tags are replaced with a plain space so no 'lt' / 'gt' tokens are introduced.
text = re.sub(r"(?:<|&lt;)/?[a-zA-Z][^<>\n]*?(?:>|&gt;)", " ", text)

# 2. Replace every non-letter (punctuation, digits, symbols) with a space.
#    After this step only ASCII letters and spaces remain, so no separate
#    "special characters" pass is needed.
text = re.sub(r'[^a-zA-Z]', ' ', text)

# 3. Convert to lower case
text = text.lower()

# 4. Convert to list (split() also collapses runs of whitespace)
text = text.split()

print('PRE-PROCESSED TEXT:', text[:20], sep = "\n\n")

PRE-PROCESSED TEXT:

['kingdoms', 'kings', 'and', 'an', 'early', 'republic', 'election', 'day', 'shankaran', 'woke', 'up', 'to', 'see', 'his', 'grandparents', 'all', 'ready', 'to', 'go', 'and']


In [5]:
#Extract common words / word-count

#Frequency of every word in the pre-processed token list.
word_c = {}
for word in text:
    word_c[word] = word_c.get(word, 0) + 1

#Choose mechanism to set threshold to decide stop words.
# a. Divide the total number of words in the text by x
# b. Divide the number of words in the dictionary by x
#Chosen mechanism = percentile (97th)

frequencies = list(word_c.values())
threshold = np.percentile(frequencies, 97)

#Creating custom stop words list.
#Seed words first, then every word whose frequency is strictly above the threshold.

s_w = ['chapter', 'ncert', 'class', 'vi']                #NEED TO POSSIBLY INCLUDE THE CHAPTER NAME IN THIS LIST
s_w.extend(k for k, v in word_c.items() if v > threshold)

#Adding custom stop words

new_sw = []                                               #Record of the custom words actually added.
def custom_stop_words(word):
    """Flag `word` as a stop word in the loaded spaCy pipeline.

    Does nothing when the vocab entry for `word` is already marked as a stop
    word. Otherwise the word is appended to the module-level `new_sw` list,
    added to `nlp.Defaults.stop_words`, and its vocab entry is flagged.
    """
    if nlp.vocab[word].is_stop:
        return
    new_sw.append(word)
    nlp.Defaults.stop_words.add(word)
    nlp.vocab[word].is_stop = True

#Try to register every candidate; new_sw ends up holding only the ones that were new.
for candidate in s_w:
    custom_stop_words(candidate)

print("STOP WORDS ADDED:", new_sw, sep="\n\n")

STOP WORDS ADDED:

['chapter', 'ncert', 'class', 'vi', 'people']


In [6]:
#Lemmatization

#Re-join the token list into one string so spaCy can parse it, then keep each token's lemma.
doc = nlp(" ".join(text))
lemmatized_list = [token.lemma_ for token in doc]

print("LEMMATIZED LIST:", lemmatized_list[:20], sep = "\n\n")

LEMMATIZED LIST:

['kingdom', 'king', 'and', 'an', 'early', 'republic', 'election', 'day', 'shankaran', 'wake', 'up', 'to', 'see', '-PRON-', 'grandparent', 'all', 'ready', 'to', 'go', 'and']


In [7]:
#Removing stop words

#Parse the lemmas again and drop stop words, punctuation and the '-PRON-' placeholder lemma.
doc = nlp(" ".join(lemmatized_list))
filtered_list = [
    token.text
    for token in doc
    if not (token.is_stop or token.is_punct or token.text == '-PRON-')
]

print("FILTERED TEXT:",filtered_list[:20], sep = "\n\n")

FILTERED TEXT:

['kingdom', 'king', 'early', 'republic', 'election', 'day', 'shankaran', 'wake', 'grandparent', 'ready', 'vote', 'want', 'reach', 'polling', 'booth', 'shankaran', 'want', 'know', 'excited', 'somewhat']


In [8]:
#Text Feature Extraction

# nlp.Defaults.stop_words is a set; CountVectorizer documents stop_words as
# 'english', a list, or None, and recent scikit-learn releases reject other types,
# so hand it a list.
cv = CountVectorizer(max_df=0.8, stop_words=list(nlp.Defaults.stop_words),
                     max_features=10000, ngram_range=(1,3))
# Each element of filtered_list (one token) is treated as a separate document.
X = cv.fit_transform(filtered_list)

#Most frequently occurring words
def get_top_n_words(text, n=None):
    """Return the `n` most frequent vocabulary words as (word, count) pairs.

    `text` is an iterable of documents; a default CountVectorizer is fitted on
    it and the per-word counts are summed over all documents. Pairs are ordered
    by descending count. With n=None every pair is returned.
    """
    vec = CountVectorizer().fit(text)
    # Column sums of the document-term matrix: one total per vocabulary index.
    totals = vec.transform(text).sum(axis=0)
    words_freq = []
    for word, idx in vec.vocabulary_.items():
        words_freq.append((word, totals[0, idx]))
    words_freq.sort(key=lambda pair: pair[1], reverse=True)
    return words_freq[:n]
top_words = get_top_n_words(filtered_list, n=20)

#Tabulate the (word, count) pairs for display.
top_df = pd.DataFrame(top_words, columns=["Word", "Freq"])
print(top_df)

         Word  Freq
0        king    14
1        raja    14
2       ruler    13
3     perform    13
4        year    12
5      priest    11
6   sacrifice    10
7     kingdom     9
8      ritual     9
9     magadha     9
10       want     8
11        man     8
12   janapada     8
13       know     7
14       find     7
15       mean     7
16      varna     7
17  different     7
18      study     7
19  important     7


In [9]:
#Keyword extraction tf-idf method.

tfidf_transformer = TfidfTransformer(smooth_idf=True, use_idf=True)
tfidf_transformer.fit(X)

# get feature names
# get_feature_names() was removed from scikit-learn in favour of
# get_feature_names_out(); prefer the new method and fall back on old versions.
# .tolist() turns the returned array into plain Python strings so the keywords
# print exactly as before.
if hasattr(cv, "get_feature_names_out"):
    feature_names = cv.get_feature_names_out().tolist()
else:
    feature_names = cv.get_feature_names()

# fetch document for which keywords needs to be extracted
doc = " ".join(filtered_list)

#generate tf-idf for the given document
tf_idf_vector = tfidf_transformer.transform(cv.transform([doc]))

def sort_coo(coo_matrix):
    """Return (column, value) pairs of `coo_matrix`, highest value first.

    Pairs are built from the matrix's `col` and `data` attributes and ordered
    by descending value; ties are broken by descending column index.
    """
    pairs = list(zip(coo_matrix.col, coo_matrix.data))
    pairs.sort(key=lambda pair: (pair[1], pair[0]), reverse=True)
    return pairs
 
def extract_topn_from_vector(feature_names, sorted_items, topn=10):
    """Map the first `topn` scored items to their feature names.

    `sorted_items` is a sequence of (feature index, score) pairs, already in
    the desired order. Returns a dict of feature name -> score rounded to
    three decimals, in the same order as the input.
    """
    return {
        feature_names[idx]: round(score, 3)
        for idx, score in sorted_items[:topn]
    }
#sort the tf-idf vectors by descending order of scores
sorted_items = sort_coo(tf_idf_vector.tocoo())
#extract only the top n; n here is 10
keywords = extract_topn_from_vector(feature_names, sorted_items, 10)

# now add the results to keyword dictionary
# The keywords come from lower-cased text while the entities keep the casing of
# the raw text, so a case-sensitive membership test can never match.
# Compare on lower-cased forms to drop keywords already listed as entities.
known_entities = {entity.lower() for entity in raw_keywords['Entities']}
raw_keywords['Extras'] = [k for k in keywords if k.lower() not in known_entities]
print(raw_keywords)

# tf-idf values of the keywords:
print('\n')
for k in keywords:
    print(f'{k:{15}} {keywords[k]:{5}}')

{'Entities': {'Buddhist', 'Vajjis', 'Atranjikhera', 'Forests', 'today', 'Bihar', 'Priests', 'Ganga', 'Pataliputra', 'RepublicThis', 'Vassakara', 'Vedas', 'Mahajanapadas', 'Europe', 'Appointments', 'Mahavira', 'Citizens', 'Egypt', 'Indian', 'Vedic', 'Bimbisara', 'Chaityas', 'Ajatasattu', 'Delhi', 'Gupta', 'Vajji', 'Meerut', 'Macedonia', 'Athens', 'India', 'Etah', 'Yamuna', 'Rajgir', 'Alexander', 'Kings', 'Rajagriha', 'Rajas', 'Shankaran', 'Hastinapur', 'Buddha'}, 'Extras': ['raja', 'king', 'ruler', 'perform', 'year', 'priest', 'sacrifice', 'ritual', 'magadha', 'kingdom']}


raja            0.188
king            0.188
ruler           0.177
perform         0.177
year            0.165
priest          0.154
sacrifice       0.142
ritual           0.13
magadha          0.13
kingdom          0.13
