In [1]:
# Importing essential modules for the algorithms
from rake_nltk import Rake
import yake
from keybert import KeyBERT
from sklearn.decomposition import LatentDirichletAllocation
import spacy
import pytextrank


from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer

from numpy import array, log

# data processing
import pandas as pd 

### Importing the dataset
- The dataset is pre-processed from the "Pre-Processing-SemEval2017.ipynb" file

In [2]:
data = pd.read_pickle('Data/Processed_SemEval.pkl')
data.head()

Unnamed: 0,Doc_no,Abstract,Keywords
0,100,poor oxidation behavior major barrier increase...,"alloys,alloys with substantially improved oxid..."
1,101,key problem inspector access data small inspec...,"block,build a model of the smallest thicknesse..."
2,102,situ oxidation experiment carried mm diameter ...,"3mm diameter discs,accelerating voltage of 30k..."
3,103,study outline trial transient response analysi...,"assessment of the corrosion condition,assess t..."
4,104,result type oxidation test combined study tabl...,"adjusted to accommodate buoyancy effects,alumi..."


In [3]:
docs = data['Abstract'].tolist()

In [4]:
doc_no = data['Doc_no'].tolist()

### RAKE Algorithm
- The following code block uses the rake_nltk module to extract ketwords from the document

In [5]:
# This function takes in the text from the datafame and returns keywords
def get_rake_keywords(docs):
    r = Rake(min_length=2, max_length=3)
    sr_no = 1
    results = []
    for row in docs:
        #print(f'Processing : {sr_no}')
        r.extract_keywords_from_text(row)
        keywords = r.get_ranked_phrases()
        keywords = keywords[:7]
        keywords = ",".join(keywords)
        results.append(keywords)
        sr_no += 1
    rake_df = pd.DataFrame(zip(docs,results),columns=['Abstract','Extracted_Keywords'])
    rake_df.insert(loc=0, column="Doc_no", value=doc_no)
    return rake_df

In [6]:
# Since Rake algorithm did not work well with pre-processed data, we applied the abstract from the 
rake_df = pd.read_pickle('Data/SemEval_data.pkl')
rake_docs = rake_df['Abstract'].tolist()
rake_pred_df = get_rake_keywords(rake_docs)

ValueError: Length of values (493) does not match length of index (500)

In [None]:
pd.to_pickle(rake_pred_df, 'hulth_results/rake_pred_df.pkl')
rake_pred_df

### YAKE Algorithm
- The following code block uses the yake module to get keywords from the document

In [None]:
# This function takes in the text from the datafame and returns keywords
def get_yake_keywords(docs):
    kw_extractor = yake.KeywordExtractor(n=2, top=7)
    results = []
    for row in docs:
        keywords = kw_extractor.extract_keywords(row)
        keys = []
        for val in keywords:
            keys.append(val[0])
        results.append(",".join(keys))
    yake_df = pd.DataFrame(zip(docs,results),columns=['Abstract','Extracted_Keywords'])
    yake_df.insert(loc=0, column="Doc_no", value=doc_no)
    return yake_df

In [None]:
yake_pred_df = get_yake_keywords(docs)

In [None]:
pd.to_pickle(yake_pred_df, 'hulth_results/yake_pred_df.pkl')
yake_pred_df

## TF-IDF 
- The following code block uses the TF-IDF method from the Scikit-Learn module

In [None]:
def get_tf_idf_keywords(docs):
    results = []
    for row in docs:
        vectorizer = CountVectorizer(ngram_range = (2,3))
        X1 = vectorizer.fit_transform([row]) 
        features = (vectorizer.get_feature_names())
        vectorizer = TfidfVectorizer(ngram_range = (2,3))
        X2 = vectorizer.fit_transform([row])
        scores = (X2.toarray())
        sums = X2.sum(axis = 0)
        data1 = []
        for col, term in enumerate(features):
            data1.append( (term, sums[0,col] ))
        ranking = pd.DataFrame(data1, columns = ['term','rank'])
        words = (ranking.sort_values('rank', ascending = False))
        ex_keywords = []
        for term in words['term'][:7]:
            ex_keywords.append(term)
        results.append(",".join(ex_keywords))
    tf_idf_df = pd.DataFrame(zip(docs,results),columns=['Abstract','Extracted_Keywords'])
    tf_idf_df.insert(loc=0, column="Doc_no", value=doc_no)
    return tf_idf_df

In [None]:
tf_idf_pred_df = get_tf_idf_keywords(docs)

In [None]:
pd.to_pickle(tf_idf_pred_df, 'hulth_results/tf_idf_pred_df.pkl')
tf_idf_pred_df

### KeyBert Algorithm

In [None]:
def get_keybert_keywords(docs):
    model = KeyBERT('distilbert-base-nli-mean-tokens')
    results = []
    doc_no = 1
    for row in docs:
        keywords = model.extract_keywords(row, keyphrase_ngram_range=(2, 3), stop_words=None)
        temp_keys = []
        for val in keywords:
            temp_keys.append(val[0])
        results.append(",".join(temp_keys))
        doc_no += 1
    key_bert_df = pd.DataFrame(zip(docs,results),columns=['Abstract','Extracted_Keywords'])
    key_bert_df.insert(loc=0, column="Doc_no", value=doc_no)
    return key_bert_df

In [None]:
key_bert_pred_df = get_keybert_keywords(docs)

In [None]:
pd.to_pickle(key_bert_pred_df, 'hulth_results/key_bert_pred_df.pkl')
key_bert_pred_df

### Latent Dirichlet Allocation

In [None]:
def display_topics(model, feature_names, no_top_words):
    topic_dict = {}
    for topic_idx, topic in enumerate(model.components_):
        topic_dict["Topic %d words" % (topic_idx)]= ['{}'.format(feature_names[i])
                        for i in topic.argsort()[:-no_top_words - 1:-1]]
        topic_dict["Topic %d weights" % (topic_idx)]= ['{:.1f}'.format(topic[i])
                        for i in topic.argsort()[:-no_top_words - 1:-1]]
    return topic_dict

In [None]:
def get_lda_keywords(docs):
    results = []
    for row in docs:
        vectorizer = CountVectorizer(ngram_range=(3,3))
        tf = vectorizer.fit_transform([row])
        tf_feature_names = vectorizer.get_feature_names()
        number_of_topics = 1
        model = LatentDirichletAllocation(n_components=number_of_topics, random_state=0)
        model.fit(tf)
        no_top_words = 7
        result_dict = display_topics(model, tf_feature_names, no_top_words)
        kw = result_dict['Topic 0 words']
        results.append(",".join(kw))
    lda = pd.DataFrame(zip(docs,results),columns=['Abstract','Extracted_Keywords'])
    lda.insert(loc=0, column="Doc_no", value=doc_no)
    return lda  

In [None]:
lda_pred_df = get_lda_keywords(docs)

In [None]:
pd.to_pickle(lda_pred_df, 'hulth_results/lda_pred_df.pkl')
lda_pred_df

### PositionRank Algorithm

In [None]:
def get_textrank_keywords(docs):
    results = []
    for row in docs:
        # load a spaCy model
        nlp = spacy.load("en_core_web_sm")

        # add PyTextRank to the spaCy pipeline
        nlp.add_pipe("positionrank") #positionrank
        doc = nlp(row)

        # examine the top-ranked phrases in the document
        limit = len(doc._.phrases)
        kw = []
        if(limit > 4):
            for phrase in doc._.phrases:
                if(len(phrase.text.split()) <= 4 and len(phrase.text.split()) > 1):
                    kw.append(phrase.text)
            kw = ",".join(kw)
            results.append(kw)
        else:
            for phrase in doc._.phrases:
                kw.append(phrase.text)
            kw = ",".join(kw)
            results.append(kw)
    pos_rank_df = pd.DataFrame(zip(docs,results),columns=['Abstract','Extracted_Keywords'])
    pos_rank_df.insert(loc=0, column="Doc_no", value=doc_no)
    return pos_rank_df 

In [None]:
pos_rank_pred_df = get_textrank_keywords(docs)

In [None]:
pd.to_pickle(pos_rank_pred_df, 'hulth_results/pos_rank_pred_df.pkl')
pos_rank_pred_df