# Keyword_extraction from a document

In [5]:
# Run config file
%run config.ipynb
# Reading config file
import configparser
config = configparser.ConfigParser()
config.read("config.ini")

['config.ini']

## Rake keyword extraction

In [21]:
from rake_nltk import Rake

#text: The document(s) for which to extract keywords/keyphrases
#min_length: the minimun length of words present in the keyword extracted
#max_length: the maximum length of words present in the keyword extracted

def rake_extractor(text):
    """
    Uses Rake to extract the top k("number_of_keywords") keywords from a text
    Arguments: text (list)
    Returns: list of keywords (list)
    """
    rake_keywords = []
    for i in text:
        r = Rake(min_length=int(config['rake']['min_length']), max_length=int(config['rake']['max_length']))
        r.extract_keywords_from_text(i)
        rake_keywords.append((r.get_ranked_phrases_with_scores())[:int(config['rake']['number_of_keywords'])])
    return rake_keywords

## Yake keyword extraction

In [23]:
import yake
#language: Supports english language i.e en.
#max_ngram_size: This is the maximum lenght of keywords allowed.
#numOfKeywords: Number of keywords extarcted from a sentence/paragraph.

def yake_extractor(text):
    """
    Uses YAKE to extract the top k("number_of_keywords") keywords from a text
    Arguments: text (list)
    Returns: list of keywords (list)
    """
    yake_keywords = []
    for i in text:
        custom_kw_extractor = yake.KeywordExtractor(lan=config['yake']['language'], n=int(config['yake']['max_ngram_size']), top=int(config['yake']['number_of_keywords']))
        keywords = custom_kw_extractor.extract_keywords(i)
        yake_keywords.append(keywords)

    return yake_keywords

## keyBert keyword extraxtion

In [29]:
# Keybert arguments:
#     docs: The document(s) for which to extract keywords/keyphrases
#     candidates: Candidate keywords/keyphrases to use instead of extracting them from the document(s)
#     keyphrase_ngram_range: Length, in words, of the extracted keywords/keyphrases
#     stop_words: Stopwords to remove from the document
#     top_n: Return the top n keywords/keyphrases
#     min_df: Minimum document frequency of a word across all documents
#             if keywords for multiple documents need to be extracted
#     use_maxsum: Whether to use Max Sum Similarity for the selection
#                 of keywords/keyphrases
#     use_mmr: Whether to use Maximal Marginal Relevance (MMR) for the
#                 selection of keywords/keyphrases
#     diversity: The diversity of the results between 0 and 1 if use_mmr
#                 is set to True
#     nr_candidates: The number of candidates to consider if use_maxsum is
#                     set to True
#     vectorizer: Pass in your own CountVectorizer from scikit-learn
#     highlight: Whether to print the document and highlight
#                 its keywords/keyphrases. NOTE: This does not work if
#                 multiple documents are passed.
#     seed_keywords: Seed keywords that may guide the extraction of keywords by
#                     steering the similarities towards the seeded keywords
# Returns:
#     keywords: the top n keywords for a document with their respective distances
#                       to the input document

from keybert import KeyBERT

def keyBert_extractor(text):
    """
        Uses keyBert model to extract the top k("number_of_keywords") keywords from a text
        Arguments: text (list)
        Returns: list of keywords (list)
    """
    min = int(config['keyBert']['min_length'])
    max = int(config['keyBert']['max_length'])
    
    kw_model = KeyBERT()
    senetence_keywords = kw_model.extract_keywords(text,keyphrase_ngram_range =(min,max),top_n=int(config['keyBert']['number_of_keywords']))
    return senetence_keywords

## Loading data

In [25]:
#This is to read a csv file in order to give it as an input for extracring keywords.
import pandas as pd
import numpy as np

data = pd.read_csv(config['Data']['data'])
data = data["abstractiveBART summary"].str.lower()
data = data.to_list()
data

[' access to healthcare is a priority for sanofi . we are helping to improve access in various ways, from r&d to the fight against counterfeiting . we contribute to sustainable development goal 3: ensure healthy lives and promote well-being for all at all ages . we also operate a responsible pricing policy for our medicines to ensure that they are affordable for all (see section 4.2.4, product pricing), and we design and conduct initiatives to assist vulnerable populations .',
 ' novo nordisk is one of the largest pharmaceutical companies in the world . the company is based in malaysia, new zealand, nigeria, nigeria and the philippines . indonesia, indonesia, iran, iran and israel are among many of the countries controlled by the group .',
 ' the remuneration committee of the board of directors determined the . corporate executive committee members bonuses based on the performance 2019 against the . agreed objectives . the . remuneration committee uses complete discretion inthe weighti

## Keyword extraction using all three methods for each sentence

In [30]:
rake_key = rake_extractor(data)
yake_key = yake_extractor(data)
keyBert_key = keyBert_extractor(data)

71it [00:00, 605.89it/s]


In [31]:
print(rake_key)
print("\n")
print(yake_key)
print("\n")
print(keyBert_key)

[[(9.0, 'responsible pricing policy'), (9.0, 'product pricing ),'), (9.0, 'assist vulnerable populations'), (8.0, 'see section 4'), (8.0, 'ensure healthy lives')], [(9.0, 'largest pharmaceutical companies'), (4.0, 'novo nordisk'), (4.0, 'new zealand'), (4.0, 'countries controlled'), (4.0, 'among many')], [(9.0, 'total aggregate amount'), (9.0, 'daniel oday received'), (4.0, 'drmichael heuer'), (4.0, 'directors determined'), (4.0, 'brought forward')], [(7.888888888888889, 'johnson regulatory compliance'), (7.388888888888889, 'johnson innovation introduced'), (4.388888888888889, 'johnson innovation'), (4.0, 'risk management'), (4.0, 'relevant policies')], [(9.0, 'pharmaceutical manufacturing partnership'), (9.0, 'environmental protection agencys'), (8.666666666666666, 'improving energy intensity'), (8.666666666666666, 'energy star focus'), (4.666666666666666, 'energy efficiency')], [(9.0, 'un global compact'), (9.0, 'core business practices'), (8.5, 'party risk man'), (7.9, 'ingrain huma

In [32]:
#This creates a text file, which will contain the RAKE, YAKE and keyBert extarction for each sentence for comparision purpose.
#The output will be written into Comparision_of_3_keywordExtractions.txt for all the extarction methods.

f = open("Comparision_of_3_keywordExtractions.txt", "w")
for i in range(len(data)):
    doc = data[i]
    f.write("Document:\n"+doc+"\n\n")

    f.write("RAKE Keywords:\n\n")
    for j in range(len(rake_key[i])):
        key = rake_key[i][j]
        f.write(key[1]+"  "+str(round(key[0],2))+"\n")
        
    f.write("\nYAKE Keywords:\n\n")
    for k in range(len(yake_key[i])):
        key1 = yake_key[i][k]
        f.write(key1[0]+"  "+str(round(key1[1],2))+"\n")
        
    f.write("\nKEYBERT Keywords:\n\n")  
    for l in range(len(keyBert_key[i])):
        key2 = keyBert_key[i][l]
        f.write(key2[0]+"  "+str(round(key2[1],2))+"\n")
    f.write("\n___________________________________________________________________________________\n\n")
f.close()

In [33]:
#This part is to make a unique list of keywords for each sentence, and add the values respective to each sentences for all three
# extraction methods. If that keyword was not extracted in a particular method, we will simply assign 0.
#End result will be the four list, named as final_keyword, r, y and b for rake, yake and keyBert, which will contain the 
#list of unique keywords, respective rake, yake and keyBert values for that keyword extracted.

from numpy import nan

final_keyword = []
rake_score= []
yake_score = []
keyBert_score = []

count = 0
for i in range(len(data)):
    
    keyword = set()
    
    for j in rake_key[i]:
        keyword.add(j[1])    
    for k in yake_key[i]:
        keyword.add(k[0])  
    for l in keyBert_key[i]:
        keyword.add(l[0])
        
    keyword = list(keyword)
    
    final_keyword.extend(keyword)
    
    
    for key in keyword:
        
        count = count + 1
        
        for r in rake_key[i]:
            if key == r[1]:
                rake_score.append(round(r[0],3))
                break

        if count != len(rake_score):
            rake_score.append(nan)
            
        for y in yake_key[i]:
            if key == y[0]:
                yake_score.append(round(y[1],3))
                break

        if count != len(yake_score):
            yake_score.append(nan)      
            
        for k in keyBert_key[i]:
            if key == k[0]:
                keyBert_score.append(round(k[1],3))
                break

        if count != len(keyBert_score):
            keyBert_score.append(nan)  
        

In [36]:
# Creating a dataframe.
df = {
    "keywords":final_keyword,
    "rake extraction":rake_score,
    "yake extraction":yake_score,
    "keyBert extraction":keyBert_score
}

df = pd.DataFrame(df)
df

Unnamed: 0,keywords,rake extraction,yake extraction,keyBert extraction
0,responsible pricing policy,9.0,0.001,
1,"product pricing ),",9.0,,
2,sanofi helping improve,,,0.587
3,sanofi helping,,,0.562
4,assist vulnerable populations,9.0,0.001,
...,...,...,...,...
888,investigational molecule,4.0,,
889,priority review risdiplam,,,0.614
890,entire company,4.0,,
891,risdiplam sma,,,0.540


In [37]:
#Finally creating a dataFrame, inorder to create a xlsx sheet to do a better comparision.
df.to_excel("extracted_keywords.xlsx") 