In [6]:
import pandas as pd

In [9]:
# Load the question dataset from questions.csv (a path relative to the
# notebook's working directory). Later cells read the text from its 'qtext' column.
questions = pd.read_csv("questions.csv")

In [10]:
# Words per question. Splitting on runs of whitespace (rather than on a single
# " ") keeps repeated, leading or trailing spaces from being counted as empty
# "words"; a missing question counts as 0 words instead of the 1 that
# len("nan".split(" ")) would give.
questions['word_count'] = (
    questions['qtext'].fillna("").astype(str).str.split().str.len()
)
questions[['qtext','word_count']].head()

Unnamed: 0,qtext,word_count
0,Can I delay paying taxes on my advanced commis...,10
1,I received a dividend from a corporation that ...,22
2,If I purchased but did not cash a coowned bond...,17
3,I work for a state or local government or a ta...,21
4,My employer promised a bonusaward Do I need to...,14


In [11]:
# Summary statistics (count, mean, spread, quartiles) of the per-question word counts
questions['word_count'].describe()

count    1008.000000
mean       11.540675
std         4.656879
min         2.000000
25%         8.000000
50%        11.000000
75%        14.000000
max        37.000000
Name: word_count, dtype: float64

In [2]:
import re
import nltk
#nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import RegexpTokenizer
#nltk.download('wordnet') 
from nltk.stem.wordnet import WordNetLemmatizer

In [3]:
## Extra terms to ignore on top of NLTK's English stop words
new_words = ["using", "show", "result", "large", "also", "iv", "one", "two", "new", "previously", "shown"]
## Final stop-word set: NLTK's English list extended with the custom terms
stop_words = set(stopwords.words("english")).union(new_words)

In [12]:
# Build the cleaned corpus: one lower-cased, stop-word-free, lemmatised string
# per question, in the same order as the rows of `questions`.
lem = WordNetLemmatizer()  # loop-invariant, so it is created once

corpus = []
for raw in questions['qtext']:
    # A missing question becomes an empty document instead of crashing re.sub.
    raw = "" if pd.isna(raw) else str(raw)

    # Replace everything that is not an ASCII letter (punctuation, digits,
    # markup characters, ...) with a space and lower-case the rest. Only
    # letters and spaces remain afterwards, so no separate tag / digit /
    # special-character stripping is needed before tokenising.
    text = re.sub('[^a-zA-Z]', ' ', raw).lower()

    # Tokenise on whitespace, drop stop words, lemmatise what is left.
    tokens = [lem.lemmatize(word) for word in text.split()
              if word not in stop_words]
    corpus.append(" ".join(tokens))

In [14]:
from sklearn.feature_extraction.text import CountVectorizer
import re
# Count uni-, bi- and tri-grams over the cleaned corpus, ignoring terms that
# occur in more than 80% of the documents and capping the vocabulary at
# 10,000 features.
# scikit-learn >= 1.2 validates `stop_words` and only accepts a list,
# 'english' or None, so the stop-word set is passed as a (sorted, hence
# deterministic) list; older versions accept a list as well.
cv = CountVectorizer(
    max_df=0.8,
    stop_words=sorted(stop_words),
    max_features=10000,
    ngram_range=(1, 3),
)
X = cv.fit_transform(corpus)

In [15]:
# Peek at the first ten terms of the fitted vocabulary (dict iteration yields its keys)
list(cv.vocabulary_)[:10]

['delay',
 'paying',
 'tax',
 'advanced',
 'commissionsalary',
 'delay paying',
 'paying tax',
 'tax advanced',
 'advanced commissionsalary',
 'delay paying tax']

In [17]:
#Most frequently occurring words
def get_top_n_words(corpus, n=None):
    """Return the ``n`` most frequent vocabulary terms of ``corpus``.

    A default ``CountVectorizer`` is fitted on ``corpus`` and every term's
    count is summed over all documents.

    Args:
        corpus: iterable of document strings.
        n: number of leading (term, count) pairs to return; ``None`` returns
            every term.

    Returns:
        List of ``(term, total_count)`` tuples ordered by descending count.
    """
    vectorizer = CountVectorizer()
    totals = vectorizer.fit_transform(corpus).sum(axis=0)
    ranked = [(term, totals[0, col])
              for term, col in vectorizer.vocabulary_.items()]
    ranked.sort(key=lambda pair: pair[1], reverse=True)
    return ranked[:n]
import seaborn as sns

#Tabulate the 20 most frequent words as a two-column frame for plotting
top_words = get_top_n_words(corpus, n=20)
top_df = pd.DataFrame(top_words, columns=["Word", "Freq"])

#Bar chart of word frequency, with slanted tick labels so the words stay readable
sns.set(rc={'figure.figsize':(13,8)})
g = sns.barplot(data=top_df, x="Word", y="Freq")
g.set_xticklabels(g.get_xticklabels(), rotation=30)

Matplotlib is building the font cache using fc-list. This may take a moment.


[Text(0, 0, 'income'),
 Text(0, 0, 'tax'),
 Text(0, 0, 'benefit'),
 Text(0, 0, 'taxable'),
 Text(0, 0, 'plan'),
 Text(0, 0, 'payment'),
 Text(0, 0, 'employer'),
 Text(0, 0, 'rental'),
 Text(0, 0, 'distribution'),
 Text(0, 0, 'retirement'),
 Text(0, 0, 'received'),
 Text(0, 0, 'interest'),
 Text(0, 0, 'qualified'),
 Text(0, 0, 'security'),
 Text(0, 0, 'expense'),
 Text(0, 0, 'property'),
 Text(0, 0, 'report'),
 Text(0, 0, 'annuity'),
 Text(0, 0, 'social'),
 Text(0, 0, 'bond')]

In [18]:
from sklearn.feature_extraction.text import TfidfTransformer
 
# Learn the IDF weights from the corpus-wide n-gram counts in X.
tfidf_transformer = TfidfTransformer(smooth_idf=True, use_idf=True)
tfidf_transformer.fit(X)

# Vocabulary terms indexed by count-matrix column. get_feature_names() was
# removed in scikit-learn 1.2 in favour of get_feature_names_out(), so use
# whichever the installed version provides.
if hasattr(cv, "get_feature_names_out"):
    feature_names = cv.get_feature_names_out()
else:
    feature_names = cv.get_feature_names()

# Position in `corpus` of the document whose keywords are extracted below.
DOC_INDEX = 532
doc = corpus[DOC_INDEX]

# tf-idf vector of that single document (a 1 x n_features sparse matrix).
tf_idf_vector = tfidf_transformer.transform(cv.transform([doc]))

In [19]:
#Function for sorting tf_idf in descending order
from scipy.sparse import coo_matrix
def sort_coo(coo_matrix):
    """Return (column, value) pairs of ``coo_matrix`` from highest to lowest value.

    Pairs are built from the matrix's ``col`` and ``data`` attributes; ties on
    value are broken by the larger column index first.
    """
    pairs = list(zip(coo_matrix.col, coo_matrix.data))
    pairs.sort(key=lambda pair: (pair[1], pair[0]), reverse=True)
    return pairs
 
def extract_topn_from_vector(feature_names, sorted_items, topn=10):
    """Map the first ``topn`` entries of ``sorted_items`` to named scores.

    Args:
        feature_names: sequence giving the feature name for each index.
        sorted_items: sequence of ``(index, score)`` pairs, already ordered.
        topn: how many leading pairs to keep.

    Returns:
        Dict of feature name -> score rounded to 3 decimals, in the order the
        pairs appear in ``sorted_items``.
    """
    return {
        feature_names[idx]: round(score, 3)
        for idx, score in sorted_items[:topn]
    }
#rank the document's stored tf-idf entries from highest to lowest score
sorted_items = sort_coo(tf_idf_vector.tocoo())
#keep the five best-scoring terms
keywords = extract_topn_from_vector(feature_names, sorted_items, 5)

#show the cleaned document, then each keyword with its score
print("\nAbstract:")
print(doc)
print("\nKeywords:")
for term, score in keywords.items():
    print(term, score)


Abstract:
report uncollected tax

Keywords:
uncollected tax 0.491
report uncollected tax 0.491
report uncollected 0.454
uncollected 0.441
report 0.28
