Gensim method is not working in this code keyword.py while doing keyword extraction from literature with docanalysis #40

@flower1430

Description

In the Colab notebook, the following command is used to extract keywords from the literature (rake and yake work, while gensim does not):

```
!python /content/semanticClimate/keyword_extraction/code/keyword.py \
  --html_path /content/remote_agri/eupmc_result.html \
  --saving_path /content/ \
  --method 'rake'
```
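
For reference, the failing invocation differs only in the method flag:

```
!python /content/semanticClimate/keyword_extraction/code/keyword.py \
  --html_path /content/remote_agri/eupmc_result.html \
  --saving_path /content/ \
  --method 'gensim'
```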

keyword.py:

```python
"""Untitled56.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1aCn-dAMP2zpfjFRA2b_Ts5G58rJVbWKr
"""

from bs4 import BeautifulSoup
from keybert import KeyBERT
from multi_rake import Rake
from summa import keywords  # TextRank extractor; note 'keywords' is a module
import yake
import pandas as pd
import argparse
import spacy

nlp = spacy.load("en_core_web_lg")


class keyword_extraction():
  def __init__(self, html_path, saving_path, method):
    self.html_path = html_path
    self.saving_path = saving_path
    self.method = method
    self.text = ''
    self.span_list = []

  def extract_span_list(self):
    with open(self.html_path, 'r', encoding="utf-8") as f:
      html = f.read()
      soup = BeautifulSoup(html, features="html.parser")
      with open('/content/html_ex.html', 'w', encoding="utf-8") as file:
        file.write(soup.prettify())
      for span_elem in soup.find_all("span"):
        span_elem.extract()  # detach the span from the tree
        span_text = span_elem.get_text().strip()
        # remove leading and trailing space on each line
        lines = (line.strip() for line in span_text.splitlines())
        # break multi-headlines into a line each
        chunks = (phrase.strip() for line in lines for phrase in line.split("  ") if len(phrase.strip()) > 9)
        # drop blank lines and rejoin
        span_text = ' '.join(chunk for chunk in chunks if chunk)
        # skip short fragments and link/boilerplate spans
        if len(span_text) > 9 and 'http' not in span_text and 'doi' not in span_text and 'Chapter' not in span_text:
          self.span_list.append(span_text)
    return self.span_list
  
  def clean(self, df):
    def tagger(x):
      return nlp(x)[0].pos_

    def lemma(x):
      return nlp(x)[0].lemma_

    df['POS'] = df['keyword/phrase'].apply(tagger)
    df['Lemma'] = df['keyword/phrase'].apply(lemma)
    # keep only keywords that are already in lemma form
    df = df[df['keyword/phrase'] == df['Lemma']]
    df = df.drop_duplicates(subset=['score'], keep='last')
    # keep only content words
    df = df[df.POS.isin(['NOUN', 'PROPN', 'ADJ', 'ADV'])]
    # drop web/boilerplate tokens
    df = df[~df['Lemma'].isin(['http', 'https', 'publication', 'Chapter'])]
    return df.drop(columns=['Lemma'])


  def extract_text_from_html(self):
    with open(self.html_path, 'r', encoding="utf-8") as f:
      html = f.read()
      soup = BeautifulSoup(html, features="html.parser")
      # kill all script and style elements
      for script in soup(["script", "style"]):
        script.extract()
      # get text
      text = soup.get_text()
      # break into lines and remove leading and trailing space on each
      lines = (line.strip() for line in text.splitlines())
      # break multi-headlines into a line each
      chunks = (phrase.strip() for line in lines for phrase in line.split("      ") if len(phrase.strip()) > 9)
      # drop blank lines
      text = '\n '.join(chunk for chunk in chunks if chunk)
      self.text = text
      with open('text.txt', 'w', encoding="utf-8") as file:
        file.write(text)
      return self.text
  def extract_keywords_rake(self):
    rake = Rake()
    self.extract_text_from_html()
    keywords_rake = rake.apply(self.text)  # list of (phrase, score) tuples
    df_rake = pd.DataFrame(keywords_rake)
    df_rake.rename(columns={0: 'keyword/phrase', 1: 'score'}, inplace=True)
    df_rake = self.clean(df_rake)
    df_rake.to_csv(self.saving_path + 'Rake_keywords.csv', index=None)

  def extract_keywords_gensim(self):
    self.extract_text_from_html()
    # gensim 4.0 removed gensim.summarization, so summa's TextRank stands in here.
    # summa exposes a module named keywords, so call keywords.keywords(); it does
    # not accept gensim's pos_filter/lemmatize/deacc arguments.
    keywords_gensim = keywords.keywords(self.text, words=100, scores=True)
    df_gensim = pd.DataFrame(keywords_gensim)
    df_gensim.rename(columns={0: 'keyword/phrase', 1: 'score'}, inplace=True)
    df_gensim = self.clean(df_gensim)
    df_gensim.to_csv(self.saving_path + 'gensim_keywords.csv', index=None)

  def extract_keywords_yake(self):
    self.extract_text_from_html()
    kw_extractor = yake.KeywordExtractor(top=100, stopwords=None)
    keywords_yake = kw_extractor.extract_keywords(self.text)  # list of (phrase, score) tuples
    df_yake = pd.DataFrame(keywords_yake)
    df_yake.rename(columns={0: 'keyword/phrase', 1: 'score'}, inplace=True)
    df_yake = self.clean(df_yake)
    df_yake.to_csv(self.saving_path + 'yake_keywords.csv', index=None)

  def extract_keywords_textrank(self):
    self.extract_text_from_html()
    keywords_textrank = keywords.keywords(self.text, scores=True)
    df_textrank = pd.DataFrame(keywords_textrank)
    df_textrank.rename(columns={0: 'keyword/phrase', 1: 'score'}, inplace=True)
    df_textrank = self.clean(df_textrank)
    df_textrank.to_csv(self.saving_path + 'textrank_keywords.csv', index=None)

  def extract_keywords_keyBERT(self):
    self.extract_text_from_html()  # without this, self.text stays empty
    kw_model = KeyBERT(model='all-mpnet-base-v2')
    keywords_keybert = kw_model.extract_keywords(self.text,
                                                 keyphrase_ngram_range=(1, 2),
                                                 stop_words='english',
                                                 highlight=True,
                                                 top_n=10)
    df_keybert = pd.DataFrame(keywords_keybert)
    df_keybert.rename(columns={0: 'keyword/phrase', 1: 'score'}, inplace=True)
    df_keybert = self.clean(df_keybert)
    df_keybert.to_csv(self.saving_path + 'keyBERT_keywords.csv', index=None)
    
  def main(self):
    # dispatch on the method passed in, not a module-level variable
    if self.method == 'rake':
      self.extract_keywords_rake()
    elif self.method == 'yake':
      self.extract_keywords_yake()
    elif self.method == 'gensim':
      self.extract_keywords_gensim()
    elif self.method == 'textrank':
      self.extract_keywords_textrank()
    elif self.method == 'keyBERT':
      self.extract_keywords_keyBERT()


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument('--html_path',
                        required=True,
                        help='give the path where your html lives: /...')
    parser.add_argument('--saving_path',
                        required=True,
                        help='path of the folder where you want to save the files: /...')
    parser.add_argument('--method',
                        required=True, choices=['rake', 'yake', 'gensim', 'keyBERT', 'textrank'],
                        help='which method you want to use to extract keywords')

    args = parser.parse_args()

    html_path = args.html_path  # e.g. /content/semanticClimate/ipcc/ar6/wg3/Chapter06/fulltext.flow.html
    saving_path = args.saving_path  # e.g. /content/
    method = args.method

    keyword_extractions = keyword_extraction(html_path, saving_path, method)
    keyword_extractions.main()
```
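
The reported gensim failure looks like an API mismatch rather than a data problem: gensim 4.0 removed the gensim.summarization module, and the script imports summa's keywords in its place. summa's keywords is a module, not a function, and it does not take gensim's pos_filter/lemmatize/deacc arguments, so a gensim-style call raises a TypeError before any extraction happens. A minimal sketch of the failure and the working call, assuming summa is installed (the sample text is made up for illustration):

```python
from summa import keywords  # a module, not a callable function

text = ("Agricultural soils can store carbon. Improved soil management "
        "reduces greenhouse gas emissions from agriculture.")

# The gensim.summarization-style invocation raises
# TypeError: 'module' object is not callable, and pos_filter, lemmatize
# and deacc are gensim parameters that summa does not accept anyway:
# keywords(text, words=100, scores=True, pos_filter=('NN', 'ADJ'),
#          lemmatize=False, deacc=False)

# Working summa call (TextRank, the same algorithm gensim used to ship):
print(keywords.keywords(text, words=100, scores=True))
```

Either pin gensim below 4.0 and import gensim.summarization.keywords, or keep summa and call keywords.keywords() without the gensim-only arguments.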
