# Publication Keyword Extraction

In [7]:
#Importing libraries and data

In [8]:
import pandas as pd

In [9]:
# Load the papers dataset from a CSV file in the current working directory.
df = pd.read_csv('papers.csv')

In [10]:
# Preview the first rows to check that the columns loaded as expected.
df.head()

Unnamed: 0,id,year,title,event_type,pdf_name,abstract,paper_text
0,1,1987,Self-Organization of Associative Database and ...,,1-self-organization-of-associative-database-an...,Abstract Missing,767\n\nSELF-ORGANIZATION OF ASSOCIATIVE DATABA...
1,10,1987,A Mean Field Theory of Layer IV of Visual Cort...,,10-a-mean-field-theory-of-layer-iv-of-visual-c...,Abstract Missing,683\n\nA MEAN FIELD THEORY OF LAYER IV OF VISU...
2,100,1988,Storing Covariance by the Associative Long-Ter...,,100-storing-covariance-by-the-associative-long...,Abstract Missing,394\n\nSTORING COVARIANCE BY THE ASSOCIATIVE\n...
3,1000,1994,Bayesian Query Construction for Neural Network...,,1000-bayesian-query-construction-for-neural-ne...,Abstract Missing,Bayesian Query Construction for Neural\nNetwor...
4,1001,1994,"Neural Network Ensembles, Cross Validation, an...",,1001-neural-network-ensembles-cross-validation...,Abstract Missing,"Neural Network Ensembles, Cross\nValidation, a..."


In [11]:
# (rows, columns) of the full dataset.
df.shape

(7241, 7)

In [12]:
# Keep only the first 5000 rows (selected by position) and all columns; every
# later cell works on this subset.
# NOTE(review): presumably done to keep processing time manageable - confirm.
df = df.iloc[:5000,:]

In [13]:
# Confirm the size of the subset after slicing.
df.shape

(5000, 7)

# Data Pre-Processing

In [14]:
df['paper_text'][0]
# Raw text of the first paper. Only the paper_text column is used for keyword
# extraction; the title and id columns are left out because the full text
# already contains the title as well as the abstract.

'767\n\nSELF-ORGANIZATION OF ASSOCIATIVE DATABASE\nAND ITS APPLICATIONS\nHisashi Suzuki and Suguru Arimoto\nOsaka University, Toyonaka, Osaka 560, Japan\nABSTRACT\nAn efficient method of self-organizing associative databases is proposed together with\napplications to robot eyesight systems. The proposed databases can associate any input\nwith some output. In the first half part of discussion, an algorithm of self-organization is\nproposed. From an aspect of hardware, it produces a new style of neural network. In the\nlatter half part, an applicability to handwritten letter recognition and that to an autonomous\nmobile robot system are demonstrated.\n\nINTRODUCTION\nLet a mapping f : X -+ Y be given. Here, X is a finite or infinite set, and Y is another\nfinite or infinite set. A learning machine observes any set of pairs (x, y) sampled randomly\nfrom X x Y. (X x Y means the Cartesian product of X and Y.) And, it computes some\nestimate j : X -+ Y of f to make small, the estimation erro

In [15]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
# nltk.word_tokenize (used in the pre-processing step) needs the Punkt
# tokenizer data, which is not part of the stopwords package. Newer NLTK
# releases look for 'punkt_tab', older ones for 'punkt', so fetch both.
for tokenizer_resource in ('punkt', 'punkt_tab'):
    nltk.download(tokenizer_resource, quiet=True)
# Stop-word lists for the filtering step; kept as the last expression so the
# cell still displays the result of this download.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/nehajoshi/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [16]:
# Build the set of English stop words that will be filtered out during
# pre-processing. The corpus file id is the lowercase 'english'; 'English'
# only resolves on case-insensitive filesystems and fails elsewhere.
stop_word = set(stopwords.words('english'))

In [17]:
# Number of stop words loaded from the NLTK corpus.
len(stop_word)

179

In [18]:
# Extra filler words that are frequent in papers but carry no topical meaning.
custom_stop_words = {
    "fig", "figure", "image", "sample", "using",
    "show", "result", "large",
    "also", "one", "two", "three",
    "four", "five", "seven", "eight", "nine",
}

# Keep the combined vocabulary as a set: it is only used for membership tests
# and its length, and a set lookup is O(1) whereas scanning a list is O(n)
# for every token of every document.
stop_words = stop_word.union(custom_stop_words)
len(stop_words)

196

In [19]:
def data_processing(text):
    """Normalise raw text into a space-separated string of stemmed tokens.

    Steps, in order: lower-case the text, replace HTML-like tags and every
    non-letter character with a space, tokenize, drop stop words, keep only
    tokens longer than three characters, and Porter-stem what remains.
    """
    # Lower-case first, then blank out tags and anything that is not a letter.
    cleaned = re.sub(r"<.*?>", ' ', text.lower())
    cleaned = re.sub(r"[^a-zA-Z]", ' ', cleaned)

    stemmer = PorterStemmer()
    stems = []
    for token in nltk.word_tokenize(cleaned):
        # Skip stop words and short tokens (three characters or fewer).
        if token in stop_words or len(token) <= 3:
            continue
        # Stemming (not lemmatization): reduce the token to its Porter stem.
        stems.append(stemmer.stem(token))
    return " ".join(stems)

In [20]:
# Quick sanity check on a small string mixing upper case, stop words, digits,
# punctuation and HTML tags.
data_processing("HELLO the the via moving dancing 54!!!! <h1> <p> hello world </p> </h1>")

'hello move danc hello world'

In [21]:
# Run the pre-processing pipeline over the full text of every paper.
docs = df['paper_text'].apply(data_processing)

In [22]:
# Preview the cleaned, stemmed text of the first few papers.
docs.head()

0    self organ associ databas applic hisashi suzuk...
1    mean field theori layer visual cortex applic a...
2    store covari associ long term potenti depress ...
3    bayesian queri construct neural network model ...
4    neural network ensembl cross valid activ learn...
Name: paper_text, dtype: object

In [23]:
# Convert the cleaned text into a sparse document-term matrix of raw counts.
# Count vectorization is applied first;
# TF-IDF weighting is applied to these counts afterwards.

from sklearn.feature_extraction.text import CountVectorizer
# max_df=0.95: ignore terms that appear in more than 95% of the documents (too common to be keywords).
# max_features=5000: keep only the 5000 most frequent terms in the vocabulary.
# ngram_range=(1,3): use unigrams, bigrams and trigrams, e.g. "neural", "neural network", "neural network model".
cv = CountVectorizer(max_df=0.95, max_features = 5000, ngram_range=(1,3))
word_hash = cv.fit_transform(docs)

In [24]:
# Show the summary of the sparse count matrix (documents x vocabulary terms).
word_hash

<5000x5000 sparse matrix of type '<class 'numpy.int64'>'
	with 3064773 stored elements in Compressed Sparse Row format>

In [25]:
# now get importance of each word using transformer
# using TF- IDF transformer

In [26]:
from sklearn.feature_extraction.text import TfidfTransformer

TF_IDF_transformer = TfidfTransformer(smooth_idf=True, use_idf=True)
# smooth_idf=True: adds one to every document frequency so the IDF computation never divides by zero.
# use_idf=True: enables inverse-document-frequency weighting, so terms that occur in fewer documents weigh more.

# fit() learns the IDF weights from the count matrix and returns the fitted
# transformer itself, so TF_IDF_transformed refers to the same object as
# TF_IDF_transformer (it is not a transformed matrix).
TF_IDF_transformed = TF_IDF_transformer.fit(word_hash)

In [27]:
# Vocabulary terms learned by the vectorizer, indexed by column position in the count matrix.
feature_names = cv.get_feature_names_out()

In [28]:
def get_keywords(index, docs, n=10):
    """Return the top-``n`` TF-IDF keywords for one document.

    index: label used to look up the document in ``docs``.
    docs: collection of pre-processed document strings.
    n: maximum number of keywords to return (default 10).

    Returns a dict mapping each keyword to its TF-IDF score rounded to three
    decimals, inserted from the highest score to the lowest.
    """
    # Count the document's terms with the fitted vectorizer, then weight them with TF-IDF.
    weighted = TF_IDF_transformer.transform(cv.transform([docs[index]]))

    # COO format exposes parallel arrays of column indices and their values.
    entries = weighted.tocoo()

    # Highest score first; ties are broken by the larger column index.
    ranked = sorted(
        zip(entries.col, entries.data),
        key=lambda pair: (pair[1], pair[0]),
        reverse=True,
    )

    return {feature_names[col]: round(weight, 3) for col, weight in ranked[:n]}
    

In [29]:
def print_keywords(index, keywords, df):
    """Print the title, abstract and extracted keywords of one paper.

    index: row label of the paper in ``df``.
    keywords: iterable of keywords (iterating a dict yields its keys).
    df: DataFrame with 'title' and 'abstract' columns.
    """
    print("\n========Title========")
    print(df['title'][index])
    # "\n" rather than "/n": start each header on a fresh line instead of
    # printing a literal "/n" in front of it.
    print("\n========Abstract========")
    print(df['abstract'][index])
    print("\n========Keywords========")
    for k in keywords:
        print(k)
        
# Demo: extract and print the keywords of a single paper.
index = 2
# get_keywords returns a dict of keyword -> score; print_keywords iterates over its keys.
keywords = get_keywords(index, docs)
print_keywords(index, keywords, df)


Storing Covariance by the Associative Long-Term Potentiation and Depression of Synaptic Strengths in the Hippocampus
Abstract Missing
synapt
weak
hippocampu
stimul
input
pathway
depress
postsynapt
phase
burst


# Pickle 

In [30]:
import pickle
# Persist the fitted vectorizer, the TF-IDF transformer and the vocabulary so
# they can be reloaded later. The context managers make sure every file is
# flushed and closed once it has been written.
with open('count_vector.pkl', 'wb') as pkl_file:
    pickle.dump(cv, pkl_file)
with open('TF_IDF_transformer.pkl', 'wb') as pkl_file:
    pickle.dump(TF_IDF_transformer, pkl_file)
with open('feature_names.pkl', 'wb') as pkl_file:
    pickle.dump(feature_names, pkl_file)

In [61]:
# ref : Artificial Intelligence - YouTube

In [62]:
# EOF