In [1]:
# Importing essential modules for the algorithms
from rake_nltk import Rake
import yake
from keybert import KeyBERT
from sklearn.decomposition import LatentDirichletAllocation
import spacy
import pytextrank


from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer

from numpy import array, log

# data processing
import pandas as pd 

### Importing the dataset
- The dataset is pre-processed from the "Pre-Processing-Hulth2003.ipynb" file

In [2]:
data = pd.read_pickle('Data/Processed_Hulth.pkl')
data.head()

Unnamed: 0,Doc_no,Title,Abstract,Keywords
0,6,SBC gets more serious on regulatory compliance,eye past future sbc communication week created...,"telecommunication,sbc communications,regulator..."
1,7,Anti-spam suit attempts to hold carriers accou...,lawsuit alleges sprint violated utah new anti ...,"electronic mail,legislation,telecommunication,..."
2,8,New investors get steal of a deal [Global Cros...,hutchison telecommunication singapore technolo...,"telecommunication,hutchison telecommunications..."
3,9,Achieving competitive capabilities in e-services,implication internet service operation strateg...,"corporate modelling,electronic commerce,intern..."
4,11,Does social capital determine innovation? To w...,paper deal question social capital determine i...,"corporate modelling,decision theory,manufactur..."


In [3]:
docs = data['Abstract'].tolist()

In [4]:
doc_no = data['Doc_no'].tolist()

### RAKE Algorithm
- The following code block uses the rake_nltk module to extract ketwords from the document

In [5]:
# This function takes in the text from the datafame and returns keywords
def get_rake_keywords(docs):
    r = Rake(min_length=2, max_length=3)
    sr_no = 1
    results = []
    for row in docs:
        #print(f'Processing : {sr_no}')
        r.extract_keywords_from_text(row)
        keywords = r.get_ranked_phrases()
        keywords = keywords[:7]
        keywords = ",".join(keywords)
        results.append(keywords)
        sr_no += 1
    rake_df = pd.DataFrame(zip(docs,results),columns=['Abstract','Extracted_Keywords'])
    rake_df.insert(loc=0, column="Doc_no", value=doc_no)
    return rake_df

In [6]:
# Since Rake algorithm did not work well with pre-processed data, we applied the abstract from the 
rake_df = pd.read_pickle('Data/Hulth2003_data.pkl')
rake_docs = rake_df['Abstract'][:500].tolist()
rake_pred_df = get_rake_keywords(rake_docs)

In [7]:
pd.to_pickle(rake_pred_df, 'hulth_results/rake_pred_df.pkl')
rake_pred_df

Unnamed: 0,Doc_no,Abstract,Extracted_Keywords
0,6,With one eye on the past and the other on its ...,"landing fcc approval,distance service througho..."
1,7,A lawsuit alleges that Sprint has violated Uta...,"telecom service providers,action could open,vi..."
2,8,Hutchison Telecommunications and Singapore Tec...,"lot less money,bankrupt carrier intact,origina..."
3,9,What implications does the Internet have for s...,"g ., bricks,different strategic imperatives,di..."
4,11,This paper deals with two questions: Does soci...,"reciprocal trust ).,empirical investigations r..."
...,...,...,...
495,946,We propose entanglement measures with asymptot...,"propose entanglement measures,entanglement mea..."
496,947,Characterizing entanglement in all but the sim...,"simple known closed,relevant experimental quan..."
497,948,We have calculated the concurrence of the pair...,"pairwise thermal entanglement,qubit number n,q..."
498,949,"In our letter (see ibid., vol. 296, p. 161 (20...","unknown qubit state,simple mathematics error,m..."


### YAKE Algorithm
- The following code block uses the yake module to get keywords from the document

In [8]:
# This function takes in the text from the datafame and returns keywords
def get_yake_keywords(docs):
    kw_extractor = yake.KeywordExtractor(n=2, top=7)
    results = []
    for row in docs:
        keywords = kw_extractor.extract_keywords(row)
        keys = []
        for val in keywords:
            keys.append(val[0])
        results.append(",".join(keys))
    yake_df = pd.DataFrame(zip(docs,results),columns=['Abstract','Extracted_Keywords'])
    yake_df.insert(loc=0, column="Doc_no", value=doc_no)
    return yake_df

In [9]:
yake_pred_df = get_yake_keywords(docs)

In [10]:
pd.to_pickle(yake_pred_df, 'hulth_results/yake_pred_df.pkl')
yake_pred_df

Unnamed: 0,Doc_no,Abstract,Extracted_Keywords
0,6,eye past future sbc communication week created...,"term goal,eye past,regulatory compliance,past ..."
1,7,lawsuit alleges sprint violated utah new anti ...,"lawsuit alleges,service provider,alleges sprin..."
2,8,hutchison telecommunication singapore technolo...,"hutchison telecommunication,doesnt clear,telec..."
3,9,implication internet service operation strateg...,"knowledge based,based competency,service compa..."
4,11,paper deal question social capital determine i...,"social capital,network asset,explanatory varia..."
...,...,...,...
495,946,propose entanglement measure asymptotic weak m...,"measure asymptotic,asymptotic weak,weak monoto..."
496,947,characterizing entanglement simplest case qubi...,"fully entangled,entangled fraction,action loca..."
497,948,calculated concurrence pairwise thermal entang...,"calculated concurrence,concurrence pairwise,pa..."
498,949,letter ibid vol p main question consider gener...,"ibid vol,state realize,mathematics error,gener..."


## TF-IDF 
- The following code block uses the TF-IDF method from the Scikit-Learn module

In [11]:
def get_tf_idf_keywords(docs):
    results = []
    for row in docs:
        vectorizer = CountVectorizer(ngram_range = (2,3))
        X1 = vectorizer.fit_transform([row]) 
        features = (vectorizer.get_feature_names())
        vectorizer = TfidfVectorizer(ngram_range = (2,3))
        X2 = vectorizer.fit_transform([row])
        scores = (X2.toarray())
        sums = X2.sum(axis = 0)
        data1 = []
        for col, term in enumerate(features):
            data1.append( (term, sums[0,col] ))
        ranking = pd.DataFrame(data1, columns = ['term','rank'])
        words = (ranking.sort_values('rank', ascending = False))
        ex_keywords = []
        for term in words['term'][:7]:
            ex_keywords.append(term)
        results.append(",".join(ex_keywords))
    tf_idf_df = pd.DataFrame(zip(docs,results),columns=['Abstract','Extracted_Keywords'])
    tf_idf_df.insert(loc=0, column="Doc_no", value=doc_no)
    return tf_idf_df

In [12]:
tf_idf_pred_df = get_tf_idf_keywords(docs)

In [13]:
pd.to_pickle(tf_idf_pred_df, 'hulth_results/tf_idf_pred_df.pkl')
tf_idf_pred_df

Unnamed: 0,Doc_no,Abstract,Extracted_Keywords
0,6,eye past future sbc communication week created...,"regulatory compliance,term goal,accomplish sho..."
1,7,lawsuit alleges sprint violated utah new anti ...,"act action,open door,violated utah,utah new an..."
2,8,hutchison telecommunication singapore technolo...,"bankrupt carrier,leaf bankrupt,telecommunicati..."
3,9,implication internet service operation strateg...,"knowledge based,based competency,knowledge bas..."
4,11,paper deal question social capital determine i...,"social capital,social capital innovation,capit..."
...,...,...,...
495,946,propose entanglement measure asymptotic weak m...,"asymptotic weak,entanglement measure,weak mono..."
496,947,characterizing entanglement simplest case qubi...,"entangled fraction,fully entangled fraction,fu..."
497,948,calculated concurrence pairwise thermal entang...,"aspect critical,great difference qubit,heisenb..."
498,949,letter ibid vol p main question consider gener...,"mathematics error,ibid vol,state realize,answe..."


### KeyBert Algorithm

In [14]:
def get_keybert_keywords(docs):
    model = KeyBERT('distilbert-base-nli-mean-tokens')
    results = []
    doc_no = 1
    for row in docs:
        keywords = model.extract_keywords(row, keyphrase_ngram_range=(2, 3), stop_words=None)
        temp_keys = []
        for val in keywords:
            temp_keys.append(val[0])
        results.append(",".join(temp_keys))
        doc_no += 1
    key_bert_df = pd.DataFrame(zip(docs,results),columns=['Abstract','Extracted_Keywords'])
    key_bert_df.insert(loc=0, column="Doc_no", value=doc_no)
    return key_bert_df

In [15]:
key_bert_pred_df = get_keybert_keywords(docs)

In [16]:
pd.to_pickle(key_bert_pred_df, 'hulth_results/key_bert_pred_df.pkl')
key_bert_pred_df

Unnamed: 0,Doc_no,Abstract,Extracted_Keywords
0,501,eye past future sbc communication week created...,"hope new regulatory,future sbc communication,l..."
1,501,lawsuit alleges sprint violated utah new anti ...,"sprint violated utah,violated utah new,lawsuit..."
2,501,hutchison telecommunication singapore technolo...,"deal leaf bankrupt,leaf bankrupt carrier,bankr..."
3,501,implication internet service operation strateg...,"strategy business performance,based economy re..."
4,501,paper deal question social capital determine i...,"social capital innovation,capital innovation d..."
...,...,...,...
495,501,propose entanglement measure asymptotic weak m...,"measure asymptotic weak,weak monotonicity norm..."
496,501,characterizing entanglement simplest case qubi...,"operation important quantum,important quantum ..."
497,501,calculated concurrence pairwise thermal entang...,"concurrence pairwise thermal,pairwise thermal ..."
498,501,letter ibid vol p main question consider gener...,"mathematics error normalization,mathematics er..."


### Latent Dirichlet Allocation

In [17]:
def display_topics(model, feature_names, no_top_words):
    topic_dict = {}
    for topic_idx, topic in enumerate(model.components_):
        topic_dict["Topic %d words" % (topic_idx)]= ['{}'.format(feature_names[i])
                        for i in topic.argsort()[:-no_top_words - 1:-1]]
        topic_dict["Topic %d weights" % (topic_idx)]= ['{:.1f}'.format(topic[i])
                        for i in topic.argsort()[:-no_top_words - 1:-1]]
    return topic_dict

In [18]:
def get_lda_keywords(docs):
    results = []
    for row in docs:
        vectorizer = CountVectorizer(ngram_range=(3,3))
        tf = vectorizer.fit_transform([row])
        tf_feature_names = vectorizer.get_feature_names()
        number_of_topics = 1
        model = LatentDirichletAllocation(n_components=number_of_topics, random_state=0)
        model.fit(tf)
        no_top_words = 7
        result_dict = display_topics(model, tf_feature_names, no_top_words)
        kw = result_dict['Topic 0 words']
        results.append(",".join(kw))
    lda = pd.DataFrame(zip(docs,results),columns=['Abstract','Extracted_Keywords'])
    lda.insert(loc=0, column="Doc_no", value=doc_no)
    return lda  

In [19]:
lda_pred_df = get_lda_keywords(docs)

In [20]:
pd.to_pickle(lda_pred_df, 'hulth_results/lda_pred_df.pkl')
lda_pred_df

Unnamed: 0,Doc_no,Abstract,Extracted_Keywords
0,6,eye past future sbc communication week created...,"week created unit,distance service region,help..."
1,7,lawsuit alleges sprint violated utah new anti ...,"violated utah new,utah new anti,telecom servic..."
2,8,hutchison telecommunication singapore technolo...,"telecommunication singapore technology,technol..."
3,9,implication internet service operation strateg...,"knowledge based competency,competitive capabil..."
4,11,paper deal question social capital determine i...,"social capital innovation,explanatory variable..."
...,...,...,...
495,946,propose entanglement measure asymptotic weak m...,"measure asymptotic weak,entanglement measure a..."
496,947,characterizing entanglement simplest case qubi...,"fully entangled fraction,action local unitary,..."
497,948,calculated concurrence pairwise thermal entang...,"xx chain great,thermal entanglement qubit,calc..."
498,949,letter ibid vol p main question consider gener...,"vol main question,vol author check,letter poin..."


### PositionRank Algorithm

In [21]:
def get_positionrank_keywords(docs):
    results = []
    for row in docs:
        # load a spaCy model
        nlp = spacy.load("en_core_web_sm")

        # add PyTextRank to the spaCy pipeline
        nlp.add_pipe("positionrank") #positionrank
        doc = nlp(row)

        # examine the top-ranked phrases in the document
        limit = len(doc._.phrases)
        kw = []
        if(limit > 4):
            for phrase in doc._.phrases:
                if(len(phrase.text.split()) <= 4 and len(phrase.text.split()) > 1):
                    kw.append(phrase.text)
            kw = ",".join(kw)
            results.append(kw)
        else:
            for phrase in doc._.phrases:
                kw.append(phrase.text)
            kw = ",".join(kw)
            results.append(kw)
    pos_rank_df = pd.DataFrame(zip(docs,results),columns=['Abstract','Extracted_Keywords'])
    pos_rank_df.insert(loc=0, column="Doc_no", value=doc_no)
    return pos_rank_df 

In [22]:
pos_rank_pred_df = get_positionrank_keywords(docs)

In [23]:
pd.to_pickle(pos_rank_pred_df, 'hulth_results/pos_rank_pred_df.pkl')
pos_rank_pred_df

Unnamed: 0,Doc_no,Abstract,Extracted_Keywords
0,6,eye past future sbc communication week created...,"new regulatory compliance unit,regulatory burd..."
1,7,lawsuit alleges sprint violated utah new anti ...,utah new anti spam act action open door new re...
2,8,hutchison telecommunication singapore technolo...,"global crossing lot money,hutchison telecommun..."
3,9,implication internet service operation strateg...,"investment intellectual capital,current litera..."
4,11,paper deal question social capital determine i...,"social capital,paper deal question,traditional..."
...,...,...,...
495,946,propose entanglement measure asymptotic weak m...,"asymptotic weak monotonicity,form entanglement..."
496,947,characterizing entanglement simplest case qubi...,"entanglement simplest case,readily extendable ..."
497,948,calculated concurrence pairwise thermal entang...,"odd qubit chain,critical temperature existence..."
498,949,letter ibid vol p main question consider gener...,"incorrect claim letter,simple mathematics erro..."
