In [1]:
# Importing essential modules for the algorithms
from rake_nltk import Rake
import yake
from keybert import KeyBERT
from sklearn.decomposition import LatentDirichletAllocation
import spacy
import pytextrank


from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer

from numpy import array, log

# data processing
import pandas as pd 

### Importing the dataset
- The dataset is pre-processed from the "Pre-Processing-WWW.ipynb" file

In [2]:
data = pd.read_pickle('Data/Processed_WWW.pkl')
data.head()

Unnamed: 0,Doc_no,Abstract,Keywords
0,183,eigentrust algorithm reputation management p p...,"distributed eigenvector computation,peer-to-pe..."
1,10119,simulation verification automated composition ...,"automated reasoning,daml,distributed systems,o..."
2,11785,context content based trust policy semantic we...,"named graphs,semantic web,trust mechanisms,tru..."
3,12102,meteor web service annotation framework world ...,"ontology,semantic annotation of web services,s..."
4,13109,detecting web page structure adaptive viewing ...,"adaptive hypermedia,content adaptation,mobile ..."


In [3]:
docs = data['Abstract'].tolist()

In [4]:
doc_no = data['Doc_no'].tolist()

### RAKE Algorithm
- The following code block uses the rake_nltk module to extract ketwords from the document

In [5]:
# This function takes in the text from the datafame and returns keywords
def get_rake_keywords(docs):
    r = Rake(min_length=2, max_length=3)
    sr_no = 1
    results = []
    for row in docs:
        #print(f'Processing : {sr_no}')
        r.extract_keywords_from_text(row)
        keywords = r.get_ranked_phrases()
        keywords = keywords[:7]
        keywords = ",".join(keywords)
        results.append(keywords)
        sr_no += 1
    rake_df = pd.DataFrame(zip(docs,results),columns=['Abstract','Extracted_Keywords'])
    rake_df.insert(loc=0, column="Doc_no", value=doc_no)
    return rake_df

In [6]:
# Since Rake algorithm did not work well with pre-processed data, we applied the abstract from the 
rake_df = pd.read_pickle('Data/WWW.pkl')
rake_docs = rake_df['Abstract'].tolist()
rake_pred_df = get_rake_keywords(rake_docs)

In [7]:
pd.to_pickle(rake_pred_df, 'www_results/rake_pred_df.pkl')
rake_pred_df

Unnamed: 0,Doc_no,Abstract,Extracted_Keywords
0,183,The Eigentrust algorithm for reputation manage...,"recent experience shows,global trust values,al..."
1,10119,"Simulation , verification and automated compos...","order logical language,key application area,pr..."
2,11785,Using context - and content-based trust polici...,"specific data published,specific trust policie..."
3,12102,Meteor-s web service annotation framework The ...,"first critical step,broadly adopted technology..."
4,13109,Detecting web page structure for adaptive view...,"logically related units,experimental results s..."
...,...,...,...
495,14449751,A flexible generative model for preference agg...,"social choice face,preference aggregation prob..."
496,14453157,Evaluation with informational and navigational...,"single `` entry,search result diversification,..."
497,14453752,Template-based question answering over RDF dat...,"natural language question,based question answe..."
498,14454508,ZenCrowd : leveraging probabilistic reasoning ...,"make sensible decisions,dynamically generating..."


### YAKE Algorithm
- The following code block uses the yake module to get keywords from the document

In [8]:
# This function takes in the text from the datafame and returns keywords
def get_yake_keywords(docs):
    kw_extractor = yake.KeywordExtractor(n=2, top=7)
    results = []
    for row in docs:
        keywords = kw_extractor.extract_keywords(row)
        keys = []
        for val in keywords:
            keys.append(val[0])
        results.append(",".join(keys))
    yake_df = pd.DataFrame(zip(docs,results),columns=['Abstract','Extracted_Keywords'])
    yake_df.insert(loc=0, column="Doc_no", value=doc_no)
    return yake_df

In [9]:
yake_pred_df = get_yake_keywords(docs)

In [10]:
pd.to_pickle(yake_pred_df, 'www_results/yake_pred_df.pkl')
yake_pred_df

Unnamed: 0,Doc_no,Abstract,Extracted_Keywords
0,183,eigentrust algorithm reputation management p p...,"global trust,inauthentic file,file sharing,dec..."
1,10119,simulation verification automated composition ...,"web service,service web,simulation verificatio..."
2,11785,context content based trust policy semantic we...,"content based,context content,semantic web,bas..."
3,12102,meteor web service annotation framework world ...,"web service,semantic web,semantic annotation,s..."
4,13109,detecting web page structure adaptive viewing ...,"web page,mobile device,form factor,factor devi..."
...,...,...,...
495,14449751,flexible generative model preference aggregati...,"aggregation problem,existing method,preference..."
496,14453157,evaluation informational navigational intent g...,"intuitive metric,din ndcg,preference agreement..."
497,14453752,template based question answering rdf data inc...,"rdf data,question answering,answering rdf,incr..."
498,14454508,zencrowd leveraging probabilistic reasoning cr...,"improve quality,reasoning crowdsourcing,probab..."


## TF-IDF 
- The following code block uses the TF-IDF method from the Scikit-Learn module

In [11]:
def get_tf_idf_keywords(docs):
    results = []
    for row in docs:
        vectorizer = CountVectorizer(ngram_range = (2,3))
        X1 = vectorizer.fit_transform([row]) 
        features = (vectorizer.get_feature_names())
        vectorizer = TfidfVectorizer(ngram_range = (2,3))
        X2 = vectorizer.fit_transform([row])
        scores = (X2.toarray())
        sums = X2.sum(axis = 0)
        data1 = []
        for col, term in enumerate(features):
            data1.append( (term, sums[0,col] ))
        ranking = pd.DataFrame(data1, columns = ['term','rank'])
        words = (ranking.sort_values('rank', ascending = False))
        ex_keywords = []
        for term in words['term'][:7]:
            ex_keywords.append(term)
        results.append(",".join(ex_keywords))
    tf_idf_df = pd.DataFrame(zip(docs,results),columns=['Abstract','Extracted_Keywords'])
    tf_idf_df.insert(loc=0, column="Doc_no", value=doc_no)
    return tf_idf_df

In [12]:
tf_idf_pred_df = get_tf_idf_keywords(docs)

In [13]:
pd.to_pickle(tf_idf_pred_df, 'www_results/tf_idf_pred_df.pkl')
tf_idf_pred_df

Unnamed: 0,Doc_no,Abstract,Extracted_Keywords
0,183,eigentrust algorithm reputation management p p...,"global trust value,global trust,trust value,in..."
1,10119,simulation verification automated composition ...,"web service,web service web,simulation verific..."
2,11785,context content based trust policy semantic we...,"content based trust,based trust,semantic web,t..."
3,12102,meteor web service annotation framework world ...,"web service,semantic web,data semantic,meteor ..."
4,13109,detecting web page structure adaptive viewing ...,"web page,mobile device,form factor device,smal..."
...,...,...,...
495,14449751,flexible generative model preference aggregati...,"aggregation problem,preference aggregation,acc..."
496,14453157,evaluation informational navigational intent g...,"din ndcg,intuitive metric,intent recall,metric..."
497,14453752,template based question answering rdf data inc...,"question answering,rdf data,accessing data,que..."
498,14454508,zencrowd leveraging probabilistic reasoning cr...,"reasoning crowdsourcing technique,reasoning cr..."


### KeyBert Algorithm

In [14]:
def get_keybert_keywords(docs):
    model = KeyBERT('distilbert-base-nli-mean-tokens')
    results = []
    doc_no = 1
    for row in docs:
        keywords = model.extract_keywords(row, keyphrase_ngram_range=(2, 3), stop_words=None)
        temp_keys = []
        for val in keywords:
            temp_keys.append(val[0])
        results.append(",".join(temp_keys))
        doc_no += 1
    key_bert_df = pd.DataFrame(zip(docs,results),columns=['Abstract','Extracted_Keywords'])
    key_bert_df.insert(loc=0, column="Doc_no", value=doc_no)
    return key_bert_df

In [15]:
key_bert_pred_df = get_keybert_keywords(docs)

In [16]:
pd.to_pickle(key_bert_pred_df, 'www_results/key_bert_pred_df.pkl')
key_bert_pred_df

Unnamed: 0,Doc_no,Abstract,Extracted_Keywords
0,501,eigentrust algorithm reputation management p p...,"malicious peer isolates,file algorithm decreas..."
1,501,simulation verification automated composition ...,"evolution semantic web,semantic web proliferat..."
2,501,context content based trust policy semantic we...,"semantic web trust,policy semantic web,web tru..."
3,501,meteor web service annotation framework world ...,"better framework web,web create better,better ..."
4,501,detecting web page structure adaptive viewing ...,"page designed desktop,detecting web page,desig..."
...,...,...,...
495,501,flexible generative model preference aggregati...,"aggregation problem multiple,aggregation probl..."
496,501,evaluation informational navigational intent g...,"search result diversification,diversity metric..."
497,501,template based question answering rdf data inc...,"result expressive query,question triple matche..."
498,501,zencrowd leveraging probabilistic reasoning cr...,"probabilistic reasoning crowdsourcing,task onl..."


### Latent Dirichlet Allocation

In [17]:
def display_topics(model, feature_names, no_top_words):
    topic_dict = {}
    for topic_idx, topic in enumerate(model.components_):
        topic_dict["Topic %d words" % (topic_idx)]= ['{}'.format(feature_names[i])
                        for i in topic.argsort()[:-no_top_words - 1:-1]]
        topic_dict["Topic %d weights" % (topic_idx)]= ['{:.1f}'.format(topic[i])
                        for i in topic.argsort()[:-no_top_words - 1:-1]]
    return topic_dict

In [18]:
def get_lda_keywords(docs):
    results = []
    for row in docs:
        vectorizer = CountVectorizer(ngram_range=(3,3))
        tf = vectorizer.fit_transform([row])
        tf_feature_names = vectorizer.get_feature_names()
        number_of_topics = 1
        model = LatentDirichletAllocation(n_components=number_of_topics, random_state=0)
        model.fit(tf)
        no_top_words = 7
        result_dict = display_topics(model, tf_feature_names, no_top_words)
        kw = result_dict['Topic 0 words']
        results.append(",".join(kw))
    lda = pd.DataFrame(zip(docs,results),columns=['Abstract','Extracted_Keywords'])
    lda.insert(loc=0, column="Doc_no", value=doc_no)
    return lda  

In [19]:
lda_pred_df = get_lda_keywords(docs)

In [20]:
pd.to_pickle(lda_pred_df, 'www_results/lda_pred_df.pkl')
lda_pred_df

Unnamed: 0,Doc_no,Abstract,Extracted_Keywords
0,183,eigentrust algorithm reputation management p p...,"global trust value,peer peer file,peer file sh..."
1,10119,simulation verification automated composition ...,"web service web,composition web service,implem..."
2,11785,context content based trust policy semantic we...,"content based trust,context content based,base..."
3,12102,meteor web service annotation framework world ...,"service annotation framework,web service annot..."
4,13109,detecting web page structure adaptive viewing ...,"device web page,small form factor,form factor ..."
...,...,...,...
495,14449751,flexible generative model preference aggregati...,"work formulate flexible,variety form make,gene..."
496,14453157,evaluation informational navigational intent g...,"metric intuitive metric,intuitive metric empha..."
497,14453752,template based question answering rdf data inc...,"way accessing data,identification predicate de..."
498,14454508,zencrowd leveraging probabilistic reasoning cr...,"reasoning crowdsourcing technique,probabilisti..."


### PositionRank Algorithm

In [21]:
def get_positionrank_keywords(docs):
    results = []
    for row in docs:
        # load a spaCy model
        nlp = spacy.load("en_core_web_sm")

        # add PyTextRank to the spaCy pipeline
        nlp.add_pipe("positionrank") #positionrank
        doc = nlp(row)

        # examine the top-ranked phrases in the document
        limit = len(doc._.phrases)
        kw = []
        if(limit > 4):
            for phrase in doc._.phrases:
                if(len(phrase.text.split()) <= 4 and len(phrase.text.split()) > 1):
                    kw.append(phrase.text)
            kw = ",".join(kw)
            results.append(kw)
        else:
            for phrase in doc._.phrases:
                kw.append(phrase.text)
            kw = ",".join(kw)
            results.append(kw)
    pos_rank_df = pd.DataFrame(zip(docs,results),columns=['Abstract','Extracted_Keywords'])
    pos_rank_df.insert(loc=0, column="Doc_no", value=doc_no)
    return pos_rank_df 

In [None]:
pos_rank_pred_df = get_positionrank_keywords(docs)

In [None]:
pd.to_pickle(pos_rank_pred_df, 'www_results/pos_rank_pred_df.pkl')
pos_rank_pred_df