In [1]:
#Perform all the imports

import spacy
import pandas
import numpy as np
import re

# Load spaCy's small English pipeline; later cells use `nlp` for lemmatization
# and as the holder of the stop-word set that gets extended and then restored.
nlp = spacy.load('en_core_web_sm')
# Report the size of the default stop-word set before any custom words are added.
print(len(nlp.Defaults.stop_words))

305


In [2]:
#Read the text file

# The context manager closes the handle even when read() raises, which the
# previous manual open()/close() pair did not guarantee.
with open('History-Class6.txt', 'r') as fp:
    text = fp.read()

In [3]:
#Pre-processing the text
# NOTE: `text` enters this cell as the raw string produced by the read cell and
# leaves it as a list of tokens, so the read cell must be re-run before this
# cell can be executed again.

# 1. Remove tags
# This must happen before the non-letter stripping below: once '<' and '>'
# have been turned into spaces no tag pattern can match any more. The pattern
# uses literal angle brackets; the HTML-escaped form ("&lt;/?.*?&gt;") never
# matches markup in raw text.
# NOTE(review): '.' does not cross newlines, so only tags that open and close
# on the same line are removed.
text = re.sub(r"</?.*?>", " ", text)

# 2. Remove punctuation, digits and every other non-letter character
text = re.sub(r'[^a-zA-Z]', ' ', text)

# 3. Convert to lower case
text = text.lower()

# 4. Convert to list
# split() without arguments already collapses runs of whitespace, so a separate
# whitespace-normalisation pass is unnecessary.
text = text.split()

In [7]:
#Extract common words / word-count

# Tally how often each token occurs; keys are inserted in first-seen order.
word_c = {}
for token in text:
    word_c[token] = word_c.get(token, 0) + 1
# Bare expression: only rendered by Jupyter when it is the last statement of a cell.
word_c

#Choose mechanism to set threshold to decide stop words.
# a. Divide the total number of words in the text by x
# b. Divide the number of words in the dictionary by x
#Chosen mechanism = percentile

# The cut-off is the 97th percentile of the per-word counts.
frequencies = list(word_c.values())
threshold = np.percentile(frequencies, 97)

#Creating custom stop words list.

# Every word that occurs strictly more often than the cut-off becomes a
# stop-word candidate, in dictionary (first-seen) order.
s_w = [term for term, count in word_c.items() if count > threshold]
print(s_w)

#Adding custom stop words

# Record of the words added by custom_stop_words.
new_sw = []


def custom_stop_words(word):
    """Register `word` as a spaCy stop word unless its lexeme is already flagged.

    A newly registered word is appended to the module-level `new_sw` list,
    added to `nlp.Defaults.stop_words`, and has its vocab `is_stop` flag set.
    """
    if nlp.vocab[word].is_stop:
        return
    new_sw.append(word)
    nlp.Defaults.stop_words.add(word)
    nlp.vocab[word].is_stop = True


for candidate in s_w:
    custom_stop_words(candidate)
print(new_sw)
#print(len(nlp.Defaults.stop_words))

['and', 'to', 'they', 'be', 'the', 'were', 'some', 'by', 'that', 'in', 'of', 'people', 'as', 'was', 'a', 'had', 'who', 'these', 'this']
['people']


In [8]:
#Lemmatization

# Run the pipeline over the re-joined tokens and keep the lemma of every token.
doc = nlp(" ".join(text))
lemmatized_list = [token.lemma_ for token in doc]
print(len(lemmatized_list))

2345


In [10]:
#Removing stop words

doc = nlp(" ".join(lemmatized_list))
# Keep a token's text only when it is neither a stop word, nor punctuation,
# nor the literal '-PRON-' marker.
filtered_list = [
    token.text
    for token in doc
    if not (token.is_stop or token.is_punct or token.text == '-PRON-')
]
print(len(filtered_list))

1067


In [11]:
from sklearn.feature_extraction.text import CountVectorizer
import re

# CountVectorizer documents `stop_words` as 'english', a list, or None, and
# recent scikit-learn releases validate the argument and reject a set. The
# spaCy stop-word set is therefore converted to a list (sorted so the argument
# is reproducible between runs).
# NOTE(review): each element of filtered_list is a single token's text, and
# every element is treated as its own document here, so the 2- and 3-gram
# settings are unlikely to contribute features — confirm this is intended.
cv = CountVectorizer(
    max_df=0.8,
    stop_words=sorted(nlp.Defaults.stop_words),
    max_features=10000,
    ngram_range=(1, 3),
)
X = cv.fit_transform(filtered_list)


In [12]:
#Most frequently occurring words
def get_top_n_words(corpus, n=None):
    """Return the `n` most frequent vocabulary terms of `corpus`.

    A fresh default CountVectorizer is fitted on `corpus` and the term counts
    are summed over all documents.

    Returns a list of (term, total count) tuples ordered by descending count;
    with n=None the complete ranking is returned.
    """
    vectorizer = CountVectorizer().fit(corpus)
    totals = vectorizer.transform(corpus).sum(axis=0)
    ranked = [(term, totals[0, col]) for term, col in vectorizer.vocabulary_.items()]
    ranked.sort(key=lambda pair: pair[1], reverse=True)
    return ranked[:n]
#Convert most freq words to dataframe for plotting bar plot
top_words = get_top_n_words(filtered_list, n=20)
# Each (word, count) tuple becomes one row; the column labels are set at construction.
top_df = pandas.DataFrame(top_words, columns=["Word", "Freq"])
#print(top_df)

In [13]:
from sklearn.feature_extraction.text import TfidfTransformer

tfidf_transformer = TfidfTransformer(smooth_idf=True, use_idf=True)
tfidf_transformer.fit(X)

# get feature names
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer get_feature_names_out() when the vectorizer provides it and fall back
# to the old method on older installations. tolist() keeps `feature_names` a
# plain Python list, as the old method returned.
if hasattr(cv, "get_feature_names_out"):
    feature_names = cv.get_feature_names_out().tolist()
else:
    feature_names = cv.get_feature_names()

# fetch document for which keywords needs to be extracted
# NOTE: this rebinds `doc` from the spaCy result of the earlier cells to a plain string.
doc = " ".join(filtered_list)

#generate tf-idf for the given document
tf_idf_vector = tfidf_transformer.transform(cv.transform([doc]))

In [14]:
#Function for sorting tf_idf in descending order
from scipy.sparse import coo_matrix

def sort_coo(coo_matrix):
    """Return the (column, value) pairs of `coo_matrix`, highest value first.

    Pairs are built from the object's `col` and `data` attributes. Ties on the
    value are broken by the higher column index coming first.
    """
    pairs = list(zip(coo_matrix.col, coo_matrix.data))
    pairs.sort(key=lambda pair: (pair[1], pair[0]), reverse=True)
    return pairs
 
def extract_topn_from_vector(feature_names, sorted_items, topn=10):
    """Map the first `topn` entries of `sorted_items` to their feature names.

    feature_names: sequence indexed by feature position.
    sorted_items:  sequence of (feature index, score) pairs, already ordered.
    topn:          number of leading pairs to keep.

    Returns a dict of feature name -> score rounded to 3 decimals, in the order
    the pairs appear in `sorted_items`.
    """
    return {
        feature_names[position]: round(weight, 3)
        for position, weight in sorted_items[:topn]
    }
#sort the tf-idf vectors by descending order of scores
sorted_items = sort_coo(tf_idf_vector.tocoo())
#extract only the top n; n here is 10
keywords = extract_topn_from_vector(feature_names, sorted_items, 10)

# now print the results
print("\nAbstract:")
print(doc)
print("\nKeywords:")
for term, score in keywords.items():
    print(term, score)


Abstract:
kingdom king early republic election day shankaran wake grandparent ready vote want reach polling booth shankaran want know excited somewhat impatiently grandfather explain choose ruler today man ruler choose leader ruler voting common year man ruler past raja read chapter probably choose jana year ago find change place way rajas choose man recognise rajas perform big sacrifice ashvamedha horse sacrifice ritual horse let loose wander freely guard raja s man horse wander kingdom raja stop fight allow horse pass mean accept raja want perform sacrifice strong raja invite sacrifice perform specially train priest reward gift raja organise sacrifice recognise powerful come bring gift raja central figure ritual special seat throne tiger skin charioteer companion battle field witness exploit chant tale glory relative especially wife son perform variety minor ritual raja simply spectator sit watch performance sacrifice priest perform ritual include sprinkling sacred water king ordina

In [15]:
#Restoring the stop word list.

def remove_stop_word(word):
    """Undo a stop-word registration for `word`.

    When the word's lexeme is currently flagged as a stop word, the word is
    removed from `nlp.Defaults.stop_words` and the flag is cleared; otherwise
    nothing happens.
    """
    if not nlp.vocab[word].is_stop:
        return
    nlp.Defaults.stop_words.remove(word)
    nlp.vocab[word].is_stop = False


# Only the words recorded in new_sw are reverted.
for added_word in new_sw:
    remove_stop_word(added_word)

print(len(nlp.Defaults.stop_words))

305
