In [1]:
# Importing essential modules for the algorithms
from rake_nltk import Rake
import yake
from keybert import KeyBERT
from sklearn.decomposition import LatentDirichletAllocation
import spacy
import pytextrank


from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer

from numpy import array, log

# data processing
import pandas as pd 

### Importing the dataset
- The dataset is pre-processed from the "Pre-Processing-NLM500.ipynb" file

In [2]:
data = pd.read_pickle('Data/Processed_NLM500.pkl')
data.head()

Unnamed: 0,Doc_no,Abstract,Keywords
0,100,poor oxidation behavior major barrier increase...,"alloys,alloys with substantially improved oxid..."
1,101,key problem inspector access data small inspec...,"block,build a model of the smallest thicknesse..."
2,102,situ oxidation experiment carried mm diameter ...,"3mm diameter discs,accelerating voltage of 30k..."
3,103,study outline trial transient response analysi...,"assessment of the corrosion condition,assess t..."
4,104,result type oxidation test combined study tabl...,"adjusted to accommodate buoyancy effects,alumi..."


In [3]:
docs = data['Abstract'].tolist()

In [4]:
doc_no = data['Doc_no'].tolist()

### RAKE Algorithm
- The following code block uses the rake_nltk module to extract ketwords from the document

In [5]:
# This function takes in the text from the datafame and returns keywords
def get_rake_keywords(docs):
    r = Rake(min_length=2, max_length=3)
    sr_no = 1
    results = []
    for row in docs:
        #print(f'Processing : {sr_no}')
        r.extract_keywords_from_text(row)
        keywords = r.get_ranked_phrases()
        keywords = keywords[:10]
        keywords = ",".join(keywords)
        results.append(keywords)
        sr_no += 1
    rake_df = pd.DataFrame(zip(docs,results),columns=['Abstract','Extracted_Keywords'])
    rake_df.insert(loc=0, column="Doc_no", value=doc_no)
    return rake_df

In [6]:
# Since Rake algorithm did not work well with pre-processed data, we applied the abstract from the 
rake_df = pd.read_pickle('Data/NLM500_data.pkl')
rake_docs = rake_df['Abstract'].tolist()
rake_pred_df = get_rake_keywords(rake_docs)

In [7]:
pd.to_pickle(rake_pred_df, 'nlm_results/rake_pred_df.pkl')
rake_pred_df

Unnamed: 0,Doc_no,Abstract,Extracted_Keywords
0,100,Poor oxidation behavior is the major barrier t...,"requires careful study,broader compositional r..."
1,101,A key part of this problem is that an inspecto...,"small inspected area,provide enough informatio..."
2,102,"In situ oxidation, experiments were carried ou...","transmission electron microscope,primary beam ..."
3,103,The study outlines a trial of transient respon...,"transient response analysis,term benefits prov..."
4,104,The results from two types of oxidation test a...,"table 1 shows,horizontal tube furnaces,accommo..."
...,...,...,...
488,588,MicroCT has been applied to AM parts in variou...,"preliminary results demonstrating,hot isostati..."
489,589,Aeroengine turbine disks often consist of para...,"known hysteresis behaviour,remaining flux dens..."
490,590,Although the presented model is developed and ...,"single outer surface,carbon based materials,nu..."
491,591,Power and particle exhaust are crucial for the...,"severe technical challenge,presumably dramatic..."


### YAKE Algorithm
- The following code block uses the yake module to get keywords from the document

In [8]:
# This function takes in the text from the datafame and returns keywords
def get_yake_keywords(docs):
    kw_extractor = yake.KeywordExtractor(n=3, top=10)
    results = []
    for row in docs:
        keywords = kw_extractor.extract_keywords(row)
        keys = []
        for val in keywords:
            keys.append(val[0])
        results.append(",".join(keys))
    yake_df = pd.DataFrame(zip(docs,results),columns=['Abstract','Extracted_Keywords'])
    yake_df.insert(loc=0, column="Doc_no", value=doc_no)
    return yake_df

In [9]:
yake_pred_df = get_yake_keywords(docs)

In [10]:
pd.to_pickle(yake_pred_df, 'nlm_results/yake_pred_df.pkl')
yake_pred_df

Unnamed: 0,Doc_no,Abstract,Extracted_Keywords
0,100,poor oxidation behavior major barrier increase...,"based alloy,oxidation behavior,behavior major,..."
1,101,key problem inspector access data small inspec...,"thickness measurement,smallest thickness,minim..."
2,102,situ oxidation experiment carried mm diameter ...,"sample heated,situ oxidation,tem sample,heated..."
3,103,study outline trial transient response analysi...,"transient response,response analysis,ass long,..."
4,104,result type oxidation test combined study tabl...,"table show,time interval,test conducted,result..."
...,...,...,...
488,588,microct applied part form preliminary result d...,"average porosity,build direction,porosity stru..."
489,589,aeroengine turbine disk consist paramagnetic m...,"aeroengine turbine,turbine disk,magnetic field..."
490,590,presented model developed tested c h layer min...,"presented model,forming volatile,model develop..."
491,591,power particle exhaust crucial viability futur...,"fusion reactor,crucial viability,viability fut..."


## TF-IDF 
- The following code block uses the TF-IDF method from the Scikit-Learn module

In [11]:
def get_tf_idf_keywords(docs):
    results = []
    for row in docs:
        vectorizer = CountVectorizer(ngram_range = (2,3))
        X1 = vectorizer.fit_transform([row]) 
        features = (vectorizer.get_feature_names())
        vectorizer = TfidfVectorizer(ngram_range = (2,3))
        X2 = vectorizer.fit_transform([row])
        scores = (X2.toarray())
        sums = X2.sum(axis = 0)
        data1 = []
        for col, term in enumerate(features):
            data1.append( (term, sums[0,col] ))
        ranking = pd.DataFrame(data1, columns = ['term','rank'])
        words = (ranking.sort_values('rank', ascending = False))
        ex_keywords = []
        for term in words['term'][:10]:
            ex_keywords.append(term)
        results.append(",".join(ex_keywords))
    tf_idf_df = pd.DataFrame(zip(docs,results),columns=['Abstract','Extracted_Keywords'])
    tf_idf_df.insert(loc=0, column="Doc_no", value=doc_no)
    return tf_idf_df

In [12]:
tf_idf_pred_df = get_tf_idf_keywords(docs)

In [13]:
pd.to_pickle(tf_idf_pred_df, 'nlm_results/tf_idf_pred_df.pkl')
tf_idf_pred_df

Unnamed: 0,Doc_no,Abstract,Extracted_Keywords
0,100,poor oxidation behavior major barrier increase...,"based alloy,oxidation behavior,ti based alloy,..."
1,101,key problem inspector access data small inspec...,"thickness measurement,smallest thickness,small..."
2,102,situ oxidation experiment carried mm diameter ...,"tem sample,sample heated,accelerating voltage,..."
3,103,study outline trial transient response analysi...,"transient response,protective current,response..."
4,104,result type oxidation test combined study tabl...,"table show,time interval,test conducted,accomm..."
...,...,...,...
488,588,microct applied part form preliminary result d...,"average porosity,porosity structure,build dire..."
489,589,aeroengine turbine disk consist paramagnetic m...,"aeroengine turbine,non magnetic,magnetic field..."
490,590,presented model developed tested c h layer min...,"forming volatile,presented model,allow applyin..."
491,591,power particle exhaust crucial viability futur...,"fusion reactor,limit term material,pfc reach,p..."


### KeyBert Algorithm

In [14]:
def get_keybert_keywords(docs):
    model = KeyBERT('distilbert-base-nli-mean-tokens')
    results = []
    doc_no = 1
    for row in docs:
        keywords = model.extract_keywords(row, keyphrase_ngram_range=(2, 3), stop_words=None)
        temp_keys = []
        for val in keywords:
            temp_keys.append(val[0])
        results.append(",".join(temp_keys))
        doc_no += 1
    key_bert_df = pd.DataFrame(zip(docs,results),columns=['Abstract','Extracted_Keywords'])
    key_bert_df.insert(loc=0, column="Doc_no", value=doc_no)
    return key_bert_df

In [15]:
key_bert_pred_df = get_keybert_keywords(docs)

In [16]:
pd.to_pickle(key_bert_pred_df, 'nlm_results/key_bert_pred_df.pkl')
key_bert_pred_df

Unnamed: 0,Doc_no,Abstract,Extracted_Keywords
0,494,poor oxidation behavior major barrier increase...,"improved oxidation resistance,oxidation techni..."
1,494,key problem inspector access data small inspec...,"thickness measurement partitioning,minimum thi..."
2,494,situ oxidation experiment carried mm diameter ...,"electron detector sample,collected nanolab sca..."
3,494,study outline trial transient response analysi...,"analysis scale motorway,scale motorway bridge,..."
4,494,result type oxidation test combined study tabl...,"oxidation test combined,furnace test batch,the..."
...,...,...,...
488,494,microct applied part form preliminary result d...,"study porosity structure,reported study porosi..."
489,494,aeroengine turbine disk consist paramagnetic m...,"magnetometer magnetic remanence,flux gate magn..."
490,494,presented model developed tested c h layer min...,"nuclear fusion device,way nuclear fusion,nucle..."
491,494,power particle exhaust crucial viability futur...,"fusion reactor severe,density fusion reactor,f..."


### Latent Dirichlet Allocation

In [17]:
def display_topics(model, feature_names, no_top_words):
    topic_dict = {}
    for topic_idx, topic in enumerate(model.components_):
        topic_dict["Topic %d words" % (topic_idx)]= ['{}'.format(feature_names[i])
                        for i in topic.argsort()[:-no_top_words - 1:-1]]
        topic_dict["Topic %d weights" % (topic_idx)]= ['{:.1f}'.format(topic[i])
                        for i in topic.argsort()[:-no_top_words - 1:-1]]
    return topic_dict

In [18]:
def get_lda_keywords(docs):
    results = []
    for row in docs:
        vectorizer = CountVectorizer(ngram_range=(3,3))
        tf = vectorizer.fit_transform([row])
        tf_feature_names = vectorizer.get_feature_names()
        number_of_topics = 1
        model = LatentDirichletAllocation(n_components=number_of_topics, random_state=0)
        model.fit(tf)
        no_top_words = 10
        result_dict = display_topics(model, tf_feature_names, no_top_words)
        kw = result_dict['Topic 0 words']
        results.append(",".join(kw))
    lda = pd.DataFrame(zip(docs,results),columns=['Abstract','Extracted_Keywords'])
    lda.insert(loc=0, column="Doc_no", value=doc_no)
    return lda  

In [19]:
lda_pred_df = get_lda_keywords(docs)

In [20]:
pd.to_pickle(lda_pred_df, 'nlm_results/lda_pred_df.pkl')
lda_pred_df

Unnamed: 0,Doc_no,Abstract,Extracted_Keywords
0,100,poor oxidation behavior major barrier increase...,"ti based alloy,use ti based,expected relation ..."
1,101,key problem inspector access data small inspec...,"smallest thickness measurement,sample smallest..."
2,102,situ oxidation experiment carried mm diameter ...,"xl feg esem,esem hot stage,examined leo vp,exp..."
3,103,study outline trial transient response analysi...,"transient response analysis,ass long term,year..."
4,104,result type oxidation test combined study tabl...,"weight change test,weighing room temperature,e..."
...,...,...,...
488,588,microct applied part form preliminary result d...,"work report similar,hip treatment sample,direc..."
489,589,aeroengine turbine disk consist paramagnetic m...,"aeroengine turbine disk,turbine disk consist,f..."
490,590,presented model developed tested c h layer min...,"way nuclear fusion,fusion device possibly,devi..."
491,591,power particle exhaust crucial viability futur...,"wall exhausted volumetrically,volumetrically l..."


### PositionRank Algorithm

In [21]:
def get_textrank_keywords(docs):
    results = []
    for row in docs:
        # load a spaCy model
        nlp = spacy.load("en_core_web_sm")

        # add PyTextRank to the spaCy pipeline
        nlp.add_pipe("positionrank") #positionrank
        doc = nlp(row)

        # examine the top-ranked phrases in the document
        limit = len(doc._.phrases)
        kw = []
        if(limit > 4):
            for phrase in doc._.phrases:
                if(len(phrase.text.split()) <= 4 and len(phrase.text.split()) > 1):
                    kw.append(phrase.text)
            kw = ",".join(kw)
            results.append(kw)
        else:
            for phrase in doc._.phrases:
                kw.append(phrase.text)
            kw = ",".join(kw)
            results.append(kw)
    pos_rank_df = pd.DataFrame(zip(docs,results),columns=['Abstract','Extracted_Keywords'])
    pos_rank_df.insert(loc=0, column="Doc_no", value=doc_no)
    return pos_rank_df 

In [22]:
pos_rank_pred_df = get_textrank_keywords(docs)

In [23]:
pd.to_pickle(pos_rank_pred_df, 'nlm_results/pos_rank_pred_df.pkl')
pos_rank_pred_df

Unnamed: 0,Doc_no,Abstract,Extracted_Keywords
0,100,poor oxidation behavior major barrier increase...,"based alloy,service temperature alloy,producti..."
1,101,key problem inspector access data small inspec...,"smallest thickness inspector,underlying thickn..."
2,102,situ oxidation experiment carried mm diameter ...,"situ oxidation experiment,esem hot stage,pa ex..."
3,103,study outline trial transient response analysi...,"interruption protective current structure,impr..."
4,104,result type oxidation test combined study tabl...,"h test test,time interval specimen,furnace tem..."
...,...,...,...
488,588,microct applied part form preliminary result d...,"sample average porosity study,similar porosity..."
489,589,aeroengine turbine disk consist paramagnetic m...,"non magnetic material,aeroengine turbine disk,..."
490,590,presented model developed tested c h layer min...,"volatile reactive gas e,presented model,consti..."
491,591,power particle exhaust crucial viability futur...,"conventional concept material plasma,neutron i..."
