In [29]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

In [16]:
# Download the NLTK stop-word corpus that stopwords.words('english') reads below.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Megha\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [21]:
# Download the WordNet corpus; WordNetLemmatizer (used in pre_process) looks words up in it.
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Megha\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\wordnet.zip.


True

In [23]:
# Download the 'omw-1.4' package as well.
# NOTE(review): presumably needed by the installed NLTK's WordNet reader for
# lemmatization to work -- confirm against the NLTK version in use.
nltk.download('omw-1.4')

[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Megha\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\omw-1.4.zip.


True

In [5]:
# Load the papers table from ./archive/papers.csv relative to the working
# directory. The path is assembled with os.path.join so the platform's own
# separator is used; a hard-coded "\\" only resolves correctly on Windows.
cwd = os.getcwd()

df = pd.read_csv(os.path.join(cwd, "archive", "papers.csv"))

In [6]:
# Preview the first five rows to check that the CSV parsed as expected.
df.head()

Unnamed: 0,id,year,title,event_type,pdf_name,abstract,paper_text
0,1,1987,Self-Organization of Associative Database and ...,,1-self-organization-of-associative-database-an...,Abstract Missing,767\n\nSELF-ORGANIZATION OF ASSOCIATIVE DATABA...
1,10,1987,A Mean Field Theory of Layer IV of Visual Cort...,,10-a-mean-field-theory-of-layer-iv-of-visual-c...,Abstract Missing,683\n\nA MEAN FIELD THEORY OF LAYER IV OF VISU...
2,100,1988,Storing Covariance by the Associative Long-Ter...,,100-storing-covariance-by-the-associative-long...,Abstract Missing,394\n\nSTORING COVARIANCE BY THE ASSOCIATIVE\n...
3,1000,1994,Bayesian Query Construction for Neural Network...,,1000-bayesian-query-construction-for-neural-ne...,Abstract Missing,Bayesian Query Construction for Neural\nNetwor...
4,1001,1994,"Neural Network Ensembles, Cross Validation, an...",,1001-neural-network-ensembles-cross-validation...,Abstract Missing,"Neural Network Ensembles, Cross\nValidation, a..."


In [7]:
# List the column names; 'title', 'abstract' and 'paper_text' are used further down.
df.columns

Index(['id', 'year', 'title', 'event_type', 'pdf_name', 'abstract',
       'paper_text'],
      dtype='object')

In [18]:
# Start from NLTK's English stop words and extend them with extra words
# that should be ignored when cleaning the papers.
stop_words = set(stopwords.words('english'))
# The number words run from "one" to "nine"; "six" is included so that the
# sequence has no gap.
new_words = ["fig", "figure", "image", "sample", "using",
             "show", "result", "large",
             "also", "one", "two", "three",
             "four", "five", "six", "seven", "eight", "nine"]
# Kept as a list so the name's type stays the same for any later cell
# that uses it.
stop_words = list(stop_words.union(new_words))
stop_words


['two',
 'these',
 'hadn',
 'against',
 'down',
 's',
 'how',
 'both',
 'show',
 'until',
 'if',
 'between',
 'all',
 'figure',
 "wouldn't",
 'of',
 'large',
 'isn',
 'up',
 're',
 'over',
 'he',
 'fig',
 'we',
 'and',
 'don',
 'three',
 'what',
 'themselves',
 'our',
 'while',
 "don't",
 'who',
 "aren't",
 'him',
 'needn',
 'but',
 'into',
 "should've",
 "she's",
 'not',
 'y',
 'shouldn',
 'were',
 'd',
 'during',
 'am',
 'this',
 'why',
 "that'll",
 "won't",
 'ours',
 'yours',
 "you're",
 'be',
 'a',
 'by',
 'below',
 'also',
 'at',
 'that',
 'there',
 'because',
 'they',
 'wasn',
 'haven',
 "mightn't",
 "mustn't",
 'herself',
 'ain',
 'result',
 'an',
 'do',
 'with',
 'will',
 'being',
 'your',
 'same',
 'yourself',
 "isn't",
 "you've",
 'or',
 'own',
 'mightn',
 'through',
 'wouldn',
 'itself',
 'for',
 "shouldn't",
 'other',
 'whom',
 'mustn',
 'off',
 'theirs',
 'more',
 'won',
 'where',
 "doesn't",
 "you'll",
 "haven't",
 'four',
 'has',
 'himself',
 'are',
 'above',
 'couldn',


In [24]:
def pre_process(text):
    """Normalise one raw document into a space-separated string of lemmas.

    Steps, in order: lower-case the text, strip markup tags, split on
    whitespace, drop stop words, drop tokens shorter than three characters,
    and lemmatize the remaining tokens with WordNet.

    Relies on the module-level ``stop_words`` collection.

    Parameters
    ----------
    text : str
        Raw document text.

    Returns
    -------
    str
        The surviving lemmatized tokens joined by single spaces.
    """
    text = text.lower()

    # Remove tags. The pattern needs literal angle brackets: an HTML-escaped
    # pattern ("&lt;...&gt;") only matches text that is itself escaped, so
    # real <tag> markup would never be stripped. A matched tag is replaced
    # by a space so the words on either side stay separate tokens.
    text = re.sub(r"</?.*?>", " ", text)

    # stop_words is a list at module level; copy it into a set once per call
    # so each membership test is a hash lookup instead of a linear scan.
    stop_set = set(stop_words)

    # Split on whitespace, then drop stop words and tokens under 3 characters.
    tokens = [
        word for word in text.split()
        if word not in stop_set and len(word) >= 3
    ]

    # Lemmatize each remaining token (WordNet's default part of speech).
    lmtzr = WordNetLemmatizer()
    return ' '.join(lmtzr.lemmatize(word) for word in tokens)

    

In [None]:
# Clean the full text of every paper. pre_process already takes a single
# string, so it is handed to apply directly.
docs = df["paper_text"].apply(pre_process)
docs

In [26]:
# Spot-check the cleaned text of the first five papers.
docs.head()

0    767 self-organization associative database app...
1    683 mean field theory layer visual cortex appl...
2    394 storing covariance associative long?term p...
3    bayesian query construction neural network mod...
4    neural network ensembles, cross validation, ac...
Name: paper_text, dtype: object

### Using TF-IDF

In [30]:
# Build the vocabulary and the document-term count matrix.
cv = CountVectorizer(
    max_df=0.95,          # skip terms present in more than 95% of documents
    max_features=10000,   # cap the vocabulary at the 10,000 most frequent terms
    ngram_range=(1, 3),   # unigrams, bigrams and trigrams
)
word_count_vector = cv.fit_transform(docs)

In [31]:
# Learn IDF weights from the count matrix (smoothed IDF, IDF re-weighting on).
tfidf_transformer = TfidfTransformer(
    smooth_idf=True,
    use_idf=True,
)
tfidf_transformer.fit(word_count_vector)

TfidfTransformer()

### Functions for keyword extraction

In [33]:
def sort_coo(coo_matrix):
    """Return (column, value) pairs of a COO matrix, highest value first.

    Ties on value are broken by column index, larger index first.
    """
    pairs = list(zip(coo_matrix.col, coo_matrix.data))
    pairs.sort(key=lambda pair: (pair[1], pair[0]), reverse=True)
    return pairs


In [34]:
def extract_topn_from_vector(feature_names, sorted_items, topn=10):
    """Get the feature names and tf-idf scores of the top n items.

    Parameters
    ----------
    feature_names : sequence
        Maps a feature index to its term.
    sorted_items : sequence of (index, score)
        Pairs already ordered best-first; only the first ``topn`` are used.
    topn : int, default 10
        Maximum number of entries to return.

    Returns
    -------
    dict
        ``{term: score rounded to 3 decimals}`` in the order the items were
        given.
    """
    # Slice first so only the requested entries are looked up and rounded.
    return {
        feature_names[idx]: round(score, 3)
        for idx, score in sorted_items[:topn]
    }

# get feature names
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement. Fall back to the old method
# only when running on a release that predates the new one.
try:
    feature_names = cv.get_feature_names_out()
except AttributeError:
    feature_names = cv.get_feature_names()




In [35]:
def get_keywords(idx, docs, topn=10):
    """Return the highest-scoring TF-IDF terms for one document.

    Relies on the module-level ``cv``, ``tfidf_transformer`` and
    ``feature_names`` objects and on the ``sort_coo`` and
    ``extract_topn_from_vector`` helpers.

    Parameters
    ----------
    idx
        Key used to look the document up as ``docs[idx]``.
    docs
        Collection of pre-processed document strings.
    topn : int, default 10
        Number of keywords to return.

    Returns
    -------
    dict
        ``{term: rounded tf-idf score}`` for the top ``topn`` terms.
    """
    # Generate tf-idf for the given document: count its terms with the
    # fitted vocabulary, then re-weight the counts.
    tf_idf_vector = tfidf_transformer.transform(cv.transform([docs[idx]]))

    # Sort the tf-idf entries by descending score.
    sorted_items = sort_coo(tf_idf_vector.tocoo())

    # Keep only the requested number of top entries.
    return extract_topn_from_vector(feature_names, sorted_items, topn)

In [36]:
def print_results(idx,keywords, df):
    """Print the title, abstract and extracted keywords for one paper.

    ``df`` is indexed as ``df['title'][idx]`` and ``df['abstract'][idx]``;
    ``keywords`` maps each term to its score and is printed one pair per line.
    """
    title = df['title'][idx]
    abstract = df['abstract'][idx]

    print("\n=====Title=====")
    print(title)
    print("\n=====Abstract=====")
    print(abstract)
    print("\n===Keywords===")
    for term, score in keywords.items():
        print(term, score)

In [37]:
# Demo: extract the keywords for the paper stored under index 941 and print
# them next to its title and abstract.
idx=941
keywords=get_keywords(idx, docs)
print_results(idx,keywords, df)


=====Title=====
Algorithms for Non-negative Matrix Factorization

=====Abstract=====
Non-negative matrix factorization (NMF) has previously been shown to 
be a useful decomposition for multivariate data. Two different multi- 
plicative algorithms for NMF are analyzed. They differ only slightly in 
the multiplicative factor used in the update rules. One algorithm can be 
shown to minimize the conventional least squares error while the other 
minimizes the generalized Kullback-Leibler divergence. The monotonic 
convergence of both algorithms can be proven using an auxiliary func- 
tion analogous to that used for proving convergence of the Expectation- 
Maximization algorithm. The algorithms can also be interpreted as diag- 
onally rescaled gradient descent, where the rescaling factor is optimally 
chosen to ensure convergence. 

===Keywords===
ht 0.654
update 0.224
update rule 0.216
auxiliary 0.165
non negative matrix 0.164
negative matrix 0.163
nmf 0.143
multiplicative 0.137
