Gensim method is not working in this code keyword.py while doing keyword extraction from literature with docanalysis #40

@flower1430

Description

In the Colab notebook, the following command is used to extract keywords from the literature (rake and yake work, while gensim does not):

```
!python /content/semanticClimate/keyword_extraction/code/keyword.py \
  --html_path /content/remote_agri/eupmc_result.html \
  --saving_path /content/ \
  --method 'rake'
```
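
For reference, the failing invocation differs only in the method flag:

```
!python /content/semanticClimate/keyword_extraction/code/keyword.py \
  --html_path /content/remote_agri/eupmc_result.html \
  --saving_path /content/ \
  --method 'gensim'
```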

keyword.py:

```python
"""Untitled56.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1aCn-dAMP2zpfjFRA2b_Ts5G58rJVbWKr
"""

from bs4 import BeautifulSoup
from keybert import KeyBERT
from multi_rake import Rake
from summa import keywords  # TextRank extractor; note 'keywords' is a module
import yake
import pandas as pd
import argparse
import spacy

nlp = spacy.load("en_core_web_lg")


class keyword_extraction():
  def __init__(self, html_path, saving_path, method):
    self.html_path = html_path
    self.saving_path = saving_path
    self.method = method
    self.text = ''
    self.span_list = []

  def extract_span_list(self):
    with open(self.html_path, 'r', encoding="utf-8") as f:
      html = f.read()
      soup = BeautifulSoup(html, features="html.parser")
      with open('/content/html_ex.html', 'w', encoding="utf-8") as file:
        file.write(soup.prettify())
      for span_elem in soup.find_all("span"):
        span_elem.extract()  # detach the span from the tree
        span_text = span_elem.get_text().strip()
        # remove leading and trailing space on each line
        lines = (line.strip() for line in span_text.splitlines())
        # break multi-headlines into a line each
        chunks = (phrase.strip() for line in lines for phrase in line.split("  ") if len(phrase.strip()) > 9)
        # drop blank lines and rejoin
        span_text = ' '.join(chunk for chunk in chunks if chunk)
        # skip short fragments and link/boilerplate spans
        if len(span_text) > 9 and 'http' not in span_text and 'doi' not in span_text and 'Chapter' not in span_text:
          self.span_list.append(span_text)
    return self.span_list
  
  def clean(self, df):
    def tagger(x):
      return nlp(x)[0].pos_

    def lemma(x):
      return nlp(x)[0].lemma_

    df['POS'] = df['keyword/phrase'].apply(tagger)
    df['Lemma'] = df['keyword/phrase'].apply(lemma)
    # keep only keywords that are already in lemma form
    df = df[df['keyword/phrase'] == df['Lemma']]
    df = df.drop_duplicates(subset=['score'], keep='last')
    # keep only content words
    df = df[df.POS.isin(['NOUN', 'PROPN', 'ADJ', 'ADV'])]
    # drop web/boilerplate tokens
    df = df[~df['Lemma'].isin(['http', 'https', 'publication', 'Chapter'])]
    return df.drop(columns=['Lemma'])


  def extract_text_from_html(self):
    with open(self.html_path, 'r', encoding="utf-8") as f:
      html = f.read()
      soup = BeautifulSoup(html, features="html.parser")
      # kill all script and style elements
      for script in soup(["script", "style"]):
        script.extract()
      # get text
      text = soup.get_text()
      # break into lines and remove leading and trailing space on each
      lines = (line.strip() for line in text.splitlines())
      # break multi-headlines into a line each
      chunks = (phrase.strip() for line in lines for phrase in line.split("      ") if len(phrase.strip()) > 9)
      # drop blank lines
      text = '\n '.join(chunk for chunk in chunks if chunk)
      self.text = text
      with open('text.txt', 'w', encoding="utf-8") as file:
        file.write(text)
      return self.text
  def extract_keywords_rake(self):
    rake = Rake()
    self.extract_text_from_html()
    keywords_rake = rake.apply(self.text)  # list of (phrase, score) tuples
    df_rake = pd.DataFrame(keywords_rake)
    df_rake.rename(columns={0: 'keyword/phrase', 1: 'score'}, inplace=True)
    df_rake = self.clean(df_rake)
    df_rake.to_csv(self.saving_path + 'Rake_keywords.csv', index=None)

  def extract_keywords_gensim(self):
    self.extract_text_from_html()
    # gensim 4.0 removed gensim.summarization, so summa's TextRank stands in here.
    # summa exposes a module named keywords, so call keywords.keywords(); it does
    # not accept gensim's pos_filter/lemmatize/deacc arguments.
    keywords_gensim = keywords.keywords(self.text, words=100, scores=True)
    df_gensim = pd.DataFrame(keywords_gensim)
    df_gensim.rename(columns={0: 'keyword/phrase', 1: 'score'}, inplace=True)
    df_gensim = self.clean(df_gensim)
    df_gensim.to_csv(self.saving_path + 'gensim_keywords.csv', index=None)

  def extract_keywords_yake(self):
    self.extract_text_from_html()
    kw_extractor = yake.KeywordExtractor(top=100, stopwords=None)
    keywords_yake = kw_extractor.extract_keywords(self.text)  # list of (phrase, score) tuples
    df_yake = pd.DataFrame(keywords_yake)
    df_yake.rename(columns={0: 'keyword/phrase', 1: 'score'}, inplace=True)
    df_yake = self.clean(df_yake)
    df_yake.to_csv(self.saving_path + 'yake_keywords.csv', index=None)

  def extract_keywords_textrank(self):
    self.extract_text_from_html()
    keywords_textrank = keywords.keywords(self.text, scores=True)
    df_textrank = pd.DataFrame(keywords_textrank)
    df_textrank.rename(columns={0: 'keyword/phrase', 1: 'score'}, inplace=True)
    df_textrank = self.clean(df_textrank)
    df_textrank.to_csv(self.saving_path + 'textrank_keywords.csv', index=None)

  def extract_keywords_keyBERT(self):
    self.extract_text_from_html()  # without this, self.text stays empty
    kw_model = KeyBERT(model='all-mpnet-base-v2')
    keywords_keybert = kw_model.extract_keywords(self.text,
                                                 keyphrase_ngram_range=(1, 2),
                                                 stop_words='english',
                                                 highlight=True,
                                                 top_n=10)
    df_keybert = pd.DataFrame(keywords_keybert)
    df_keybert.rename(columns={0: 'keyword/phrase', 1: 'score'}, inplace=True)
    df_keybert = self.clean(df_keybert)
    df_keybert.to_csv(self.saving_path + 'keyBERT_keywords.csv', index=None)
    
  def main(self):
    # dispatch on the method passed in, not a module-level variable
    if self.method == 'rake':
      self.extract_keywords_rake()
    elif self.method == 'yake':
      self.extract_keywords_yake()
    elif self.method == 'gensim':
      self.extract_keywords_gensim()
    elif self.method == 'textrank':
      self.extract_keywords_textrank()
    elif self.method == 'keyBERT':
      self.extract_keywords_keyBERT()


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument('--html_path',
                        required=True,
                        help='give the path where your html lives: /...')
    parser.add_argument('--saving_path',
                        required=True,
                        help='path of the folder where you want to save the files: /...')
    parser.add_argument('--method',
                        required=True, choices=['rake', 'yake', 'gensim', 'keyBERT', 'textrank'],
                        help='which method you want to use to extract keywords')

    args = parser.parse_args()

    html_path = args.html_path  # e.g. /content/semanticClimate/ipcc/ar6/wg3/Chapter06/fulltext.flow.html
    saving_path = args.saving_path  # e.g. /content/
    method = args.method

    keyword_extractions = keyword_extraction(html_path, saving_path, method)
    keyword_extractions.main()
```
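
The reported gensim failure looks like an API mismatch rather than a data problem: gensim 4.0 removed the gensim.summarization module, and the script imports summa's keywords in its place. summa's keywords is a module, not a function, and it does not take gensim's pos_filter/lemmatize/deacc arguments, so a gensim-style call raises a TypeError before any extraction happens. A minimal sketch of the failure and the working call, assuming summa is installed (the sample text is made up for illustration):

```python
from summa import keywords  # a module, not a callable function

text = ("Agricultural soils can store carbon. Improved soil management "
        "reduces greenhouse gas emissions from agriculture.")

# The gensim.summarization-style invocation raises
# TypeError: 'module' object is not callable, and pos_filter, lemmatize
# and deacc are gensim parameters that summa does not accept anyway:
# keywords(text, words=100, scores=True, pos_filter=('NN', 'ADJ'),
#          lemmatize=False, deacc=False)

# Working summa call (TextRank, the same algorithm gensim used to ship):
print(keywords.keywords(text, words=100, scores=True))
```

Either pin gensim below 4.0 and import gensim.summarization.keywords, or keep summa and call keywords.keywords() without the gensim-only arguments.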
