**Keyword Extraction using KeyBERT and YAKE**

Reference: https://www.analyticsvidhya.com/blog/2022/03/keyword-extraction-methods-from-documents-in-nlp/

In [None]:
!pip3 install keybert
!pip3 install yake

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting keybert
  Downloading keybert-0.7.0.tar.gz (21 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting sentence-transformers>=0.3.8
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting transformers<5.0.0,>=4.6.0
  Downloading transformers-4.28.1-py3-none-any.whl (7.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m34.2 MB/s[0m eta [36m0:00:00[0m
Collecting sentencepiece
  Downloading sentencepiece-0.1.98-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m14.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting huggingface-hub>=0.4.0
  Downlo

In [None]:
# Mount the user's Google Drive inside the Colab VM so the cells below can
# read and write files under /content/gdrive/.
from google.colab import drive

gdrive_mount_point = '/content/gdrive/'
drive.mount(gdrive_mount_point)

Mounted at /content/gdrive/


In [None]:
import pandas as pd
# Show full cell contents when displaying DataFrames. `None` means "no
# truncation"; the older `-1` sentinel is deprecated and rejected by
# recent pandas releases.
pd.set_option("display.max_colwidth", None)
from keybert import KeyBERT
import yake

In [None]:
# Define all functions

def get_source_from_excel(dataset, sheet_name='Sheet_name_1'):
  """Load one worksheet of an Excel workbook into a DataFrame.

  Args:
    dataset: Path (or file-like object) of the workbook; forwarded
      unchanged to ``pd.read_excel``.
    sheet_name: Worksheet to read. Defaults to ``'Sheet_name_1'``, the
      sheet this function previously had hard-coded, so existing callers
      are unaffected.

  Returns:
    The DataFrame produced by ``pd.read_excel`` for the requested sheet.
  """
  return pd.read_excel(dataset, sheet_name=sheet_name)

# Shared KeyBERT instance, created on first use. Constructing KeyBERT() loads
# its embedding model, so building one per call (i.e. per spreadsheet row)
# repeats that work needlessly.
_KEYBERT_MODEL = None


def keybert_extract(context, model=None):
  """Extract keywords from ``context`` with KeyBERT.

  Args:
    context: The document text to extract keywords from.
    model: Optional object exposing ``extract_keywords(text)``. When omitted,
      a single lazily-created ``KeyBERT()`` instance is reused across calls.

  Returns:
    DataFrame with columns ``keyword`` and ``normalized_weight``, one row per
    (keyword, score) pair returned by ``extract_keywords``. The score is
    stored as returned by the model; no rescaling is applied here.
  """
  global _KEYBERT_MODEL
  if model is None:
    if _KEYBERT_MODEL is None:
      _KEYBERT_MODEL = KeyBERT()
    model = _KEYBERT_MODEL
  bert_keywords = model.extract_keywords(context)
  return pd.DataFrame(bert_keywords, columns=['keyword', 'normalized_weight'])

def yake_extract(context, n=2, top=5):
  """Extract keywords from ``context`` with YAKE and rescale their scores.

  Args:
    context: The document text to extract keywords from.
    n: Maximum n-gram size passed to ``yake.KeywordExtractor`` (default 2,
      the value previously hard-coded).
    top: Number of keywords requested from YAKE (default 5, the value
      previously hard-coded).

  Returns:
    DataFrame with columns ``keyword`` and ``normalized_weight``, ordered by
    ascending YAKE score (ties broken by keyword, descending). The row labels
    from before sorting are kept.

    ``normalized_weight`` is ``1 - score / sum(scores)``, so a smaller YAKE
    score yields a larger weight. Note that a single keyword with a non-zero
    score therefore gets weight 0.0. If the scores sum to zero the weight is
    set to 1.0 instead of dividing by zero (which would produce NaN).
  """
  extractor = yake.KeywordExtractor(n=n, top=top)
  yake_df = pd.DataFrame(
      extractor.extract_keywords(context), columns=['keyword', 'weight']
  ).sort_values(by=['weight', 'keyword'], ascending=[True, False])

  total = yake_df['weight'].sum()
  if total == 0:
    # Covers both "no keywords" and "all scores are zero".
    normalized = 1.0
  else:
    normalized = 1 - yake_df['weight'] / total
  return yake_df.assign(normalized_weight=normalized).drop(columns=['weight'])

def get_keywords_from_context(context):
  """Run both extractors on ``context`` and stack their results.

  Returns a single DataFrame holding the rows from ``yake_extract`` followed
  by the rows from ``keybert_extract``. Each part keeps its own row labels,
  so labels may repeat in the combined frame.
  """
  yake_part = yake_extract(context)
  bert_part = keybert_extract(context)
  return pd.concat([yake_part, bert_part], axis=0)

def get_keywords_for_source(source):
  """Extract keywords for every row of ``source``.

  Args:
    source: DataFrame with a ``content`` column holding the text of each
      context.

  Returns:
    DataFrame with the keyword columns produced by
    ``get_keywords_from_context`` plus a ``context`` column repeating the
    text each keyword came from. Row labels restart at 0 for every context.
    An empty DataFrame is returned when there is nothing to process.
  """
  if source.empty:
    return pd.DataFrame()

  per_context = []
  for context in source['content']:
    # Blank spreadsheet cells are read as NaN/None; there is no text to
    # extract keywords from, so skip them rather than pass them on.
    if pd.isna(context):
      continue
    keywords = get_keywords_from_context(context)
    # Tag every keyword row with its originating text.
    per_context.append(keywords.reset_index(drop=True).assign(context=context))

  if not per_context:
    return pd.DataFrame()
  # Concatenate once at the end instead of re-copying a growing frame on
  # every iteration.
  return pd.concat(per_context, axis=0)


In [None]:
# Usage: load the source sheet, extract keywords for every row, and write
# the combined result back to Drive as a new workbook.
dataset_path = "/content/gdrive/My Drive/CS 677 Project/dataset/chemistryoflife_v3.xlsx"
keywords_path = "/content/gdrive/My Drive/CS 677 Project/dataset/context_keywords.xlsx"

source = get_source_from_excel(dataset_path)
context_keywords = get_keywords_for_source(source)
context_keywords.to_excel(keywords_path, sheet_name='Sheet_name_1')