From 0605240643bed4a6d09a34772e9689534915563d Mon Sep 17 00:00:00 2001 From: Kevin Maik Jablonka Date: Tue, 8 Feb 2022 14:17:00 +0100 Subject: [PATCH 1/3] chore: did some basic linting --- README.md | 14 +- __init__.py | 0 docanalysis/__init__.py | 2 +- docanalysis/docanalysis.py | 36 ++-- docanalysis/entity_extraction.py | 155 +++++++++------ docanalysis/extract_entities.py | 291 +++++++++++++++++----------- docanalysis/frequency_analysis.py | 16 +- demo.py => examples/demo.py | 0 pmr_demo.py => examples/pmr_demo.py | 0 setup.py | 9 +- 10 files changed, 315 insertions(+), 208 deletions(-) delete mode 100644 __init__.py rename demo.py => examples/demo.py (100%) rename pmr_demo.py => examples/pmr_demo.py (100%) diff --git a/README.md b/README.md index c1eddc5..c0c991f 100644 --- a/README.md +++ b/README.md @@ -48,7 +48,7 @@ Parameters: CORPUS_PATH: path to an existing corpus (CProject) labels_to_get: SpaCy recognizes Named-Entites and labels them. You can choose for lables you are interested by providing it as a list. For all available labels, check out the Tools Used section. ``` ## How to run? -We have created `demo.py` where you can run the package. +We have created `demo.py` in the `examples` folder where you can run the package. ``` import os @@ -74,12 +74,12 @@ with open('GPE.text', 'w') as f: f.write(str(list_with_gpe)) ``` To break this down, -|Variable snippet |What is it? | -|----------------------|----------------| -|`essential oil AND chemical composition` |Query to `pygetpapers` (EPMC default)| -|`100` |number of hits | -|stem_cell_research_300|Output directory| -|"ethics_dictionary", "ethics_key_phrases", "ethics_key_phrases.xml" |dictionary path | +| Variable snippet | What is it? | +| ------------------------------------------------------------------- | ------------------------------------- | +| `essential oil AND chemical composition` | Query to `pygetpapers` (EPMC default) | +| `100` | number of hits | +| stem_cell_research_300 | Output directory | +| "ethics_dictionary", "ethics_key_phrases", "ethics_key_phrases.xml" | dictionary path | ## What is a dictionary diff --git a/__init__.py b/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/docanalysis/__init__.py b/docanalysis/__init__.py index fc80254..2ae2839 100644 --- a/docanalysis/__init__.py +++ b/docanalysis/__init__.py @@ -1 +1 @@ -pass \ No newline at end of file +pass diff --git a/docanalysis/docanalysis.py b/docanalysis/docanalysis.py index b52d7ec..2caea84 100644 --- a/docanalysis/docanalysis.py +++ b/docanalysis/docanalysis.py @@ -1,19 +1,21 @@ -import os import logging +import os import sys -import configargparse -import coloredlogs +from functools import partialmethod from time import gmtime, strftime + +import coloredlogs +import configargparse from tqdm import tqdm -from functools import partialmethod + from docanalysis.entity_extraction import EntityExtraction -class Docanalysis: +class Docanalysis: def __init__(self): """This function makes all the constants""" self.entity_extraction = EntityExtraction() - self.version="0.0.3" + self.version = "0.0.3" def handle_logger_creation(self, args): """[summary] @@ -38,7 +40,7 @@ def handle_logger_creation(self, args): if args.logfile: self.handle_logfile(args, level) else: - coloredlogs.install(level=level, fmt='%(levelname)s: %(message)s') + coloredlogs.install(level=level, fmt="%(levelname)s: %(message)s") def handlecli(self): """Handles the command line interface using argparse""" @@ -85,7 +87,7 @@ def handlecli(self): parser.add_argument( 
"--entity_extraction", default=False, - nargs='+', + nargs="+", help="extracts specified entities chosen from a list of entities (CARDINAL, DATE, EVENT, FAC, GPE, LANGUAGE, LAW, LOC, MONEY, NORP, ORDINAL, ORG, PERCENT, PERSON, PRODUCT, QUANTITY, TIME, WORK_OF_ART, GGP, SO, TAXON, CHEBI, GO, CL)", ) parser.add_argument( @@ -121,7 +123,6 @@ def handlecli(self): help="[All] save log to specified file in output directory as well as printing to terminal", ) - if len(sys.argv) == 1: parser.print_help(sys.stderr) sys.exit() @@ -130,10 +131,19 @@ def handlecli(self): if vars(args)[arg] == "False": vars(args)[arg] = False self.handle_logger_creation(args) - self.entity_extraction.extract_entities_from_papers(args.project_name,args.dictionary,query=args.query,hits=args.hits, - make_project=args.run_pygetpapers, install_ami=False, removefalse=True, create_csv=True, - csv_name=args.output, labels_to_get=args.entity_extraction,make_ami_dict=args.make_ami_dict) - + self.entity_extraction.extract_entities_from_papers( + args.project_name, + args.dictionary, + query=args.query, + hits=args.hits, + make_project=args.run_pygetpapers, + install_ami=False, + removefalse=True, + create_csv=True, + csv_name=args.output, + labels_to_get=args.entity_extraction, + make_ami_dict=args.make_ami_dict, + ) def main(): diff --git a/docanalysis/entity_extraction.py b/docanalysis/entity_extraction.py index b4cce42..88659d6 100644 --- a/docanalysis/entity_extraction.py +++ b/docanalysis/entity_extraction.py @@ -1,19 +1,21 @@ -import os import logging +import os +import xml.etree.ElementTree as ET from glob import glob -import spacy + import pandas as pd +import spacy from bs4 import BeautifulSoup -from tqdm import tqdm -import xml.etree.ElementTree as ET from nltk import tokenize +from tqdm import tqdm try: - nlp = spacy.load('en_core_web_sm') + nlp = spacy.load("en_core_web_sm") except OSError: from spacy.cli import download - download('en_core_web_sm') - nlp = spacy.load('en_core_web_sm') + + download("en_core_web_sm") + nlp = spacy.load("en_core_web_sm") class EntityExtraction: @@ -23,15 +25,28 @@ def __init__(self): self.labels_to_get = [] logging.basicConfig(level=logging.INFO) - def extract_entities_from_papers(self, corpus_path, terms_xml_path, query=None, hits=30, - make_project=False, install_ami=False, removefalse=True, create_csv=True, - csv_name='entities.csv', labels_to_get=['GPE', 'ORG'],make_ami_dict=False): + def extract_entities_from_papers( + self, + corpus_path, + terms_xml_path, + query=None, + hits=30, + make_project=False, + install_ami=False, + removefalse=True, + create_csv=True, + csv_name="entities.csv", + labels_to_get=["GPE", "ORG"], + make_ami_dict=False, + ): self.labels_to_get = labels_to_get if make_project: if not query: - logging.warning('Please provide query as parameter') + logging.warning("Please provide query as parameter") return - logging.info(f"making project/searching {query} for {hits} hits into {corpus_path}") + logging.info( + f"making project/searching {query} for {hits} hits into {corpus_path}" + ) self.create_project_files(query, hits, corpus_path) if install_ami: logging.info(f"installing ami3 (check whether this is a good idea)") @@ -45,15 +60,19 @@ def extract_entities_from_papers(self, corpus_path, terms_xml_path, query=None, if terms_xml_path: terms = self.get_terms_from_ami_xml(terms_xml_path) self.add_if_file_contains_terms( - terms=terms, dict_with_parsed_xml=dict_with_parsed_xml) + terms=terms, dict_with_parsed_xml=dict_with_parsed_xml + ) if removefalse: 
self.remove_statements_not_having_xmldict_terms_or_entities( - dict_with_parsed_xml=dict_with_parsed_xml) + dict_with_parsed_xml=dict_with_parsed_xml + ) if create_csv: self.convert_dict_to_csv( - path=os.path.join(corpus_path, csv_name), dict_with_parsed_xml=dict_with_parsed_xml) + path=os.path.join(corpus_path, csv_name), + dict_with_parsed_xml=dict_with_parsed_xml, + ) if make_ami_dict: - self.handle_ami_dict_creation(dict_with_parsed_xml,make_ami_dict) + self.handle_ami_dict_creation(dict_with_parsed_xml, make_ami_dict) return dict_with_parsed_xml def create_project_files(self, QUERY, HITS, OUTPUT): @@ -68,8 +87,9 @@ def install_ami(self): def make_dict_with_parsed_xml(self, output): dict_with_parsed_xml = {} - all_paragraphs = glob(os.path.join( - output, '*', 'sections', '**', '[1_9]_p.xml'), recursive=True) + all_paragraphs = glob( + os.path.join(output, "*", "sections", "**", "[1_9]_p.xml"), recursive=True + ) counter = 1 logging.info(f"starting tokenization on {len(all_paragraphs)} paragraphs") for section_path in tqdm(all_paragraphs): @@ -90,11 +110,10 @@ def read_text_from_path(self, paragraph_path): tree = ET.parse(paragraph_path) root = tree.getroot() try: - xmlstr = ET.tostring(root, encoding='utf8', method='xml') - soup = BeautifulSoup(xmlstr, features='lxml') + xmlstr = ET.tostring(root, encoding="utf8", method="xml") + soup = BeautifulSoup(xmlstr, features="lxml") text = soup.get_text(separator="") - paragraph_text = text.replace( - '\n', '') + paragraph_text = text.replace("\n", "") except: paragraph_text = "empty" return paragraph_text @@ -102,31 +121,36 @@ def read_text_from_path(self, paragraph_path): def add_parsed_sections_to_dict(self, dict_with_parsed_xml): for paragraph in dict_with_parsed_xml: - doc = nlp(dict_with_parsed_xml[paragraph]['sentence']) + doc = nlp(dict_with_parsed_xml[paragraph]["sentence"]) entities, labels, position_end, position_start = self.make_required_lists() for ent in doc.ents: self.add_parsed_entities_to_lists( - entities, labels, position_end, position_start, ent) - self.add_lists_to_dict(dict_with_parsed_xml[paragraph], entities, labels, position_end, - position_start) + entities, labels, position_end, position_start, ent + ) + self.add_lists_to_dict( + dict_with_parsed_xml[paragraph], + entities, + labels, + position_end, + position_start, + ) def add_if_file_contains_terms(self, terms, dict_with_parsed_xml): for statement in dict_with_parsed_xml: dict_for_sentence = dict_with_parsed_xml[statement] - dict_for_sentence['has_terms'] = [] + dict_for_sentence["has_terms"] = [] for term in terms: - if term.lower().strip() in dict_for_sentence['sentence'].lower(): - dict_for_sentence['has_terms'].append(term) - dict_for_sentence['weight'] = len( - dict_for_sentence['has_terms']) + if term.lower().strip() in dict_for_sentence["sentence"].lower(): + dict_for_sentence["has_terms"].append(term) + dict_for_sentence["weight"] = len(dict_for_sentence["has_terms"]) def get_terms_from_ami_xml(self, xml_path): tree = ET.parse(xml_path) root = tree.getroot() terms = [] - for para in root.iter('entry'): + for para in root.iter("entry"): terms.append(para.attrib["term"]) return terms @@ -138,14 +162,18 @@ def make_required_lists(self): position_end = [] return entities, labels, position_end, position_start - def add_lists_to_dict(self, dict_for_sentence, entities, labels, position_end, position_start): + def add_lists_to_dict( + self, dict_for_sentence, entities, labels, position_end, position_start + ): - dict_for_sentence['entities'] = entities - 
dict_for_sentence['labels'] = labels - dict_for_sentence['position_start'] = position_start - dict_for_sentence['position_end'] = position_end + dict_for_sentence["entities"] = entities + dict_for_sentence["labels"] = labels + dict_for_sentence["position_start"] = position_start + dict_for_sentence["position_end"] = position_end - def add_parsed_entities_to_lists(self, entities, labels, position_end, position_start, ent=None): + def add_parsed_entities_to_lists( + self, entities, labels, position_end, position_start, ent=None + ): if ent.label_ in self.labels_to_get: entities.append(ent) labels.append(ent.label_) @@ -158,20 +186,23 @@ def convert_dict_to_csv(self, path, dict_with_parsed_xml): df = df.T for col in df: try: - df[col] = df[col].astype(str).str.replace( - "[", "").str.replace("]", "") - df[col] = df[col].astype(str).str.replace( - "'", "").str.replace("'", "") + df[col] = df[col].astype(str).str.replace("[", "").str.replace("]", "") + df[col] = df[col].astype(str).str.replace("'", "").str.replace("'", "") except: pass - df.to_csv(path, encoding='utf-8', line_terminator='\r\n') + df.to_csv(path, encoding="utf-8", line_terminator="\r\n") logging.info(f"wrote output to {path}") - def remove_statements_not_having_xmldict_terms_or_entities(self, dict_with_parsed_xml): + def remove_statements_not_having_xmldict_terms_or_entities( + self, dict_with_parsed_xml + ): statement_to_pop = [] for statement in dict_with_parsed_xml: sentect_dict = dict_with_parsed_xml[statement] - if len(sentect_dict['has_terms']) == 0 or len(sentect_dict['entities']) == 0: + if ( + len(sentect_dict["has_terms"]) == 0 + or len(sentect_dict["entities"]) == 0 + ): statement_to_pop.append(statement) for term in statement_to_pop: @@ -182,31 +213,31 @@ def extract_particular_fields(dict_with_parsed_xml, field): field_list = [] for sentence in dict_with_parsed_xml: sentect_dict = dict_with_parsed_xml[sentence] - for entity, label in zip(sentect_dict['entities'], sentect_dict['labels']): + for entity, label in zip(sentect_dict["entities"], sentect_dict["labels"]): if label == field: if entity not in field_list: field_list.append(entity) return field_list - def make_ami_dict_from_list(self,list_of_terms,title): - xml_string=f''' + def make_ami_dict_from_list(self, list_of_terms, title): + xml_string = f""" - ''' + """ for term in list_of_terms: - xml_string+=f''' + xml_string += f""" - ''' - xml_string+="" + """ + xml_string += "" return xml_string - - def write_string_to_file(self,string_to_put,title): - with open(f'{title}.xml',mode='w') as f: + + def write_string_to_file(self, string_to_put, title): + with open(f"{title}.xml", mode="w") as f: f.write(string_to_put) - - def handle_ami_dict_creation(self,result_dictionary,title): - list_of_entities=[] + + def handle_ami_dict_creation(self, result_dictionary, title): + list_of_entities = [] for entry in result_dictionary: - if 'entities' in entry: - list_of_entities+=entry['entities'] - xml_dict = self.make_ami_dict_from_list(list_of_entities,title) - self.write_string_to_file(xml_dict,f'{title}.xml') + if "entities" in entry: + list_of_entities += entry["entities"] + xml_dict = self.make_ami_dict_from_list(list_of_entities, title) + self.write_string_to_file(xml_dict, f"{title}.xml") diff --git a/docanalysis/extract_entities.py b/docanalysis/extract_entities.py index 97ba3c5..38d0726 100644 --- a/docanalysis/extract_entities.py +++ b/docanalysis/extract_entities.py @@ -1,25 +1,28 @@ -from fileinput import filename +import json +import logging import os +import re 
+import subprocess import sys -import logging +import xml.etree.ElementTree as ET +from fileinput import filename from glob import glob -import spacy + import pandas as pd -from bs4 import BeautifulSoup -from tqdm import tqdm -import xml.etree.ElementTree as ET -from nltk import tokenize -import subprocess import scispacy -import json -import re +import spacy import yake +from bs4 import BeautifulSoup +from nltk import tokenize +from tqdm import tqdm + try: - nlp = spacy.load('en_core_web_sm') + nlp = spacy.load("en_core_web_sm") except OSError: from spacy.cli import download - download('en_core_web_sm') - nlp = spacy.load('en_core_web_sm') + + download("en_core_web_sm") + nlp = spacy.load("en_core_web_sm") class DocAnalysis: @@ -29,9 +32,19 @@ def __init__(self): self.labels_to_get = [] logging.basicConfig(level=logging.INFO) - def extract_entities_from_papers(self, corpus_path, terms_xml_path, query=None, hits=30, - make_project=False, install_ami=False, removefalse=True, create_csv=True, - csv_name='entities.csv', labels_to_get=['GPE', 'ORG']): + def extract_entities_from_papers( + self, + corpus_path, + terms_xml_path, + query=None, + hits=30, + make_project=False, + install_ami=False, + removefalse=True, + create_csv=True, + csv_name="entities.csv", + labels_to_get=["GPE", "ORG"], + ): """[summary] :param query: [description] @@ -58,9 +71,11 @@ def extract_entities_from_papers(self, corpus_path, terms_xml_path, query=None, self.labels_to_get = labels_to_get if make_project: if not query: - logging.warning('Please provide query as parameter') + logging.warning("Please provide query as parameter") return - logging.info(f"making project/searching {query} for {hits} hits into {corpus_path}") + logging.info( + f"making project/searching {query} for {hits} hits into {corpus_path}" + ) self.create_project_files(query, hits, corpus_path) if install_ami: logging.info(f"installing ami3 (check whether this is a good idea)") @@ -80,14 +95,18 @@ def extract_entities_from_papers(self, corpus_path, terms_xml_path, query=None, terms = self.get_terms_from_ami_xml(terms_xml_path) # moved from (1) self.add_if_file_contains_terms( - terms=terms, dict_with_parsed_xml=dict_with_parsed_xml) + terms=terms, dict_with_parsed_xml=dict_with_parsed_xml + ) if removefalse: self.remove_statements_not_having_xmldict_terms_or_entities( - dict_with_parsed_xml=dict_with_parsed_xml) + dict_with_parsed_xml=dict_with_parsed_xml + ) if create_csv: self.convert_dict_to_csv( - path=os.path.join(corpus_path, csv_name), dict_with_parsed_xml=dict_with_parsed_xml) + path=os.path.join(corpus_path, csv_name), + dict_with_parsed_xml=dict_with_parsed_xml, + ) return dict_with_parsed_xml def create_project_files(self, QUERY, HITS, OUTPUT): @@ -102,8 +121,9 @@ def install_ami(self): def make_dict_with_parsed_xml(self, output): dict_with_parsed_xml = {} - all_paragraphs = glob(os.path.join( - output, '*', 'sections', '**', '[1_9]_p.xml'), recursive=True) + all_paragraphs = glob( + os.path.join(output, "*", "sections", "**", "[1_9]_p.xml"), recursive=True + ) counter = 1 logging.info(f"starting tokenization on {len(all_paragraphs)} paragraphs") for section_path in tqdm(all_paragraphs): @@ -124,11 +144,10 @@ def read_text_from_path(self, paragraph_path): tree = ET.parse(paragraph_path) root = tree.getroot() try: - xmlstr = ET.tostring(root, encoding='utf8', method='xml') - soup = BeautifulSoup(xmlstr, features='lxml') + xmlstr = ET.tostring(root, encoding="utf8", method="xml") + soup = BeautifulSoup(xmlstr, features="lxml") text = 
soup.get_text(separator="") - paragraph_text = text.replace( - '\n', '') + paragraph_text = text.replace("\n", "") except: paragraph_text = "empty" return paragraph_text @@ -136,31 +155,36 @@ def read_text_from_path(self, paragraph_path): def add_parsed_sections_to_dict(self, dict_with_parsed_xml): for paragraph in dict_with_parsed_xml: - doc = nlp(dict_with_parsed_xml[paragraph]['sentence']) + doc = nlp(dict_with_parsed_xml[paragraph]["sentence"]) entities, labels, position_end, position_start = self.make_required_lists() for ent in doc.ents: self.add_parsed_entities_to_lists( - entities, labels, position_end, position_start, ent) - self.add_lists_to_dict(dict_with_parsed_xml[paragraph], entities, labels, position_end, - position_start) + entities, labels, position_end, position_start, ent + ) + self.add_lists_to_dict( + dict_with_parsed_xml[paragraph], + entities, + labels, + position_end, + position_start, + ) def add_if_file_contains_terms(self, terms, dict_with_parsed_xml): for statement in dict_with_parsed_xml: dict_for_sentence = dict_with_parsed_xml[statement] - dict_for_sentence['has_terms'] = [] + dict_for_sentence["has_terms"] = [] for term in terms: - if term.lower().strip() in dict_for_sentence['sentence'].lower(): - dict_for_sentence['has_terms'].append(term) - dict_for_sentence['weight'] = len( - dict_for_sentence['has_terms']) + if term.lower().strip() in dict_for_sentence["sentence"].lower(): + dict_for_sentence["has_terms"].append(term) + dict_for_sentence["weight"] = len(dict_for_sentence["has_terms"]) def get_terms_from_ami_xml(self, xml_path): tree = ET.parse(xml_path) root = tree.getroot() terms = [] - for para in root.iter('entry'): + for para in root.iter("entry"): terms.append(para.attrib["term"]) return terms @@ -172,14 +196,18 @@ def make_required_lists(self): position_end = [] return entities, labels, position_end, position_start - def add_lists_to_dict(self, dict_for_sentence, entities, labels, position_end, position_start): + def add_lists_to_dict( + self, dict_for_sentence, entities, labels, position_end, position_start + ): - dict_for_sentence['entities'] = entities - dict_for_sentence['labels'] = labels - dict_for_sentence['position_start'] = position_start - dict_for_sentence['position_end'] = position_end + dict_for_sentence["entities"] = entities + dict_for_sentence["labels"] = labels + dict_for_sentence["position_start"] = position_start + dict_for_sentence["position_end"] = position_end - def add_parsed_entities_to_lists(self, entities, labels, position_end, position_start, ent=None): + def add_parsed_entities_to_lists( + self, entities, labels, position_end, position_start, ent=None + ): if ent.label_ in self.labels_to_get: entities.append(ent) labels.append(ent.label_) @@ -192,20 +220,23 @@ def convert_dict_to_csv(self, path, dict_with_parsed_xml): df = df.T for col in df: try: - df[col] = df[col].astype(str).str.replace( - "[", "").str.replace("]", "") - df[col] = df[col].astype(str).str.replace( - "'", "").str.replace("'", "") + df[col] = df[col].astype(str).str.replace("[", "").str.replace("]", "") + df[col] = df[col].astype(str).str.replace("'", "").str.replace("'", "") except: pass - df.to_csv(path, encoding='utf-8', line_terminator='\r\n') + df.to_csv(path, encoding="utf-8", line_terminator="\r\n") logging.info(f"wrote output to {path}") - def remove_statements_not_having_xmldict_terms_or_entities(self, dict_with_parsed_xml): + def remove_statements_not_having_xmldict_terms_or_entities( + self, dict_with_parsed_xml + ): statement_to_pop = [] 
for statement in dict_with_parsed_xml: sentect_dict = dict_with_parsed_xml[statement] - if len(sentect_dict['has_terms']) == 0 or len(sentect_dict['entities']) == 0: + if ( + len(sentect_dict["has_terms"]) == 0 + or len(sentect_dict["entities"]) == 0 + ): statement_to_pop.append(statement) for term in statement_to_pop: @@ -225,38 +256,44 @@ def extract_particular_fields(dict_with_parsed_xml, field): field_list = [] for sentence in dict_with_parsed_xml: sentect_dict = dict_with_parsed_xml[sentence] - for entity, label in zip(sentect_dict['entities'], sentect_dict['labels']): + for entity, label in zip(sentect_dict["entities"], sentect_dict["labels"]): if label == field: if entity not in field_list: field_list.append(entity) return field_list - def make_ami_dict_from_list(self,list_of_terms,title): - xml_string=f''' + def make_ami_dict_from_list(self, list_of_terms, title): + xml_string = f""" - ''' + """ for term in list_of_terms: - xml_string+=f''' + xml_string += f""" - ''' - xml_string+="" + """ + xml_string += "" return xml_string - - def write_string_to_file(self,string_to_put,title): - with open(f'{title}.xml',mode='w') as f: + + def write_string_to_file(self, string_to_put, title): + with open(f"{title}.xml", mode="w") as f: f.write(string_to_put) -# -------this section comes from metadata_analysis.py + +# -------this section comes from metadata_analysis.py # (https://github.com/petermr/crops/blob/main/metadata_analysis/metadata_analysis.py) metadata_dictionary = {} + def get_metadata_json(output_directory): WORKING_DIRECTORY = os.getcwd() - glob_results = glob.glob(os.path.join(WORKING_DIRECTORY, - output_directory, "*", 'eupmc_result.json')) + glob_results = glob.glob( + os.path.join(WORKING_DIRECTORY, output_directory, "*", "eupmc_result.json") + ) metadata_dictionary["metadata_json"] = glob_results - logging.info(f'metadata found for {len(metadata_dictionary["metadata_json"])} papers') + logging.info( + f'metadata found for {len(metadata_dictionary["metadata_json"])} papers' + ) + def get_PMCIDS(metadata_dictionary=metadata_dictionary): # gets PMCDIDs from metadata_JSON of individual papers. @@ -264,35 +301,46 @@ def get_PMCIDS(metadata_dictionary=metadata_dictionary): metadata_dictionary["PMCIDS"] = [] for metadata in metadata_dictionary["metadata_json"]: - with open(metadata, encoding='utf-8') as f: + with open(metadata, encoding="utf-8") as f: metadata_in_json = json.load(f) try: - metadata_dictionary["PMCIDS"].append( - metadata_in_json["full"]["pmcid"]) + metadata_dictionary["PMCIDS"].append(metadata_in_json["full"]["pmcid"]) except KeyError: - metadata_dictionary["PMCIDS"].append('NaN') - logging.info('getting PMCIDs') + metadata_dictionary["PMCIDS"].append("NaN") + logging.info("getting PMCIDs") + def parse_xml(output_directory, section, metadata_dictionary=metadata_dictionary): - # gets the text from XML. Clubs all the paragraphs in the section into one. + # gets the text from XML. Clubs all the paragraphs in the section into one. 
metadata_dictionary[f"{section}"] = [] for pmc in metadata_dictionary["PMCIDS"]: paragraphs = [] - section_glob = glob.glob(os.path.join(os.getcwd(), output_directory, - pmc, 'sections', '**', f'*{section}*', '**', '*.xml'), - recursive=True) + section_glob = glob.glob( + os.path.join( + os.getcwd(), + output_directory, + pmc, + "sections", + "**", + f"*{section}*", + "**", + "*.xml", + ), + recursive=True, + ) for result in section_glob: tree = ET.parse(result) root = tree.getroot() - xmlstr = ET.tostring(root, encoding='utf-8', method='xml') - soup = BeautifulSoup(xmlstr, features='lxml') + xmlstr = ET.tostring(root, encoding="utf-8", method="xml") + soup = BeautifulSoup(xmlstr, features="lxml") text = soup.get_text(separator="") - text = text.replace('\n', '') + text = text.replace("\n", "") paragraphs.append(text) - concated_paragraph = ' '.join(paragraphs) + concated_paragraph = " ".join(paragraphs) metadata_dictionary[f"{section}"].append(concated_paragraph) logging.info(f"parsing {section} section") + def get_abstract(metadata_dictionary=metadata_dictionary): # gets abstracts from the metadata json. # We might want to get the abstract from the fulltext, @@ -300,14 +348,14 @@ def get_abstract(metadata_dictionary=metadata_dictionary): TAG_RE = re.compile(r"<[^>]+>") metadata_dictionary["abstract"] = [] for metadata in metadata_dictionary["metadata_json"]: - with open(metadata, encoding='utf-8') as f: + with open(metadata, encoding="utf-8") as f: metadata_in_json = json.load(f) try: raw_abstract = metadata_in_json["full"]["abstractText"] - abstract = TAG_RE.sub(' ', raw_abstract) + abstract = TAG_RE.sub(" ", raw_abstract) metadata_dictionary["abstract"].append(abstract) except KeyError: - metadata_dictionary["abstract"].append('NaN') + metadata_dictionary["abstract"].append("NaN") logging.info("getting the abstracts") @@ -317,32 +365,36 @@ def get_keywords(metadata_dictionary=metadata_dictionary): # since the format of the metadata JSON has changed from time to time. 
metadata_dictionary["keywords"] = [] for metadata in metadata_dictionary["metadata_json"]: - with open(metadata, encoding='utf-8') as f: + with open(metadata, encoding="utf-8") as f: metadata_in_json = json.load(f) try: metadata_dictionary["keywords"].append( - metadata_in_json["full"]["keywordList"]["keyword"]) + metadata_in_json["full"]["keywordList"]["keyword"] + ) except KeyError: metadata_dictionary["keywords"].append([]) logging.info("getting the keywords from metadata") def key_phrase_extraction(section, metadata_dictionary=metadata_dictionary): - # extracts keyphrases from the blob of texts of section specified for each paper using YAKE + # extracts keyphrases from the blob of texts of section specified for each paper using YAKE metadata_dictionary["yake_keywords"] = [] for text in metadata_dictionary[f"{section}"]: custom_kw_extractor = yake.KeywordExtractor( - lan='en', n=2, top=10, features=None) + lan="en", n=2, top=10, features=None + ) keywords = custom_kw_extractor.extract_keywords(text) keywords_list = [] for kw in keywords: keywords_list.append(kw[0]) metadata_dictionary["yake_keywords"].append(keywords_list) - logging.info(f'extracted key phrases from {section}') + logging.info(f"extracted key phrases from {section}") -def get_organism(section,label_interested= 'TAXON', metadata_dictionary=metadata_dictionary): - #nlp = spacy.load("en_ner_bionlp13cg_md") +def get_organism( + section, label_interested="TAXON", metadata_dictionary=metadata_dictionary +): + # nlp = spacy.load("en_ner_bionlp13cg_md") nlp = spacy.load("en_core_sci_sm") metadata_dictionary["entities"] = [] for sci_text in metadata_dictionary[f"{section}"]: @@ -352,23 +404,28 @@ def get_organism(section,label_interested= 'TAXON', metadata_dictionary=metadata if ent.label_ == label_interested: entity.append(ent.text) metadata_dictionary["entities"].append(entity) - logging.info(F"NER using SciSpacy - looking for {label_interested}") + logging.info(f"NER using SciSpacy - looking for {label_interested}") -def convert_to_csv(path='keywords_results_yake_organism_pmcid_tps_cam_ter_c.csv', metadata_dictionary=metadata_dictionary): - # method borrowed from original docanalysis +def convert_to_csv( + path="keywords_results_yake_organism_pmcid_tps_cam_ter_c.csv", + metadata_dictionary=metadata_dictionary, +): + # method borrowed from original docanalysis df = pd.DataFrame(metadata_dictionary) - df.to_csv(path, encoding='utf-8', line_terminator='\r\n') - logging.info(f'writing the keywords to {path}') + df.to_csv(path, encoding="utf-8", line_terminator="\r\n") + logging.info(f"writing the keywords to {path}") -def convert_to_json(path='ethics_statement_2000.json', metadata_dictionary = metadata_dictionary): +def convert_to_json( + path="ethics_statement_2000.json", metadata_dictionary=metadata_dictionary +): # converts the python dictionary containing output into a JSON file json_file = json.dumps(metadata_dictionary) - f = open(path,"w", encoding='ascii') + f = open(path, "w", encoding="ascii") f.write(json_file) f.close() - logging.info(f'writing the dictionary to {path}') + logging.info(f"writing the dictionary to {path}") def look_for_a_word(section, search_for="TPS", metadata_dictionary=metadata_dictionary): @@ -377,12 +434,16 @@ def look_for_a_word(section, search_for="TPS", metadata_dictionary=metadata_dict metadata_dictionary[f"{search_for}_match"] = [] for text in metadata_dictionary[f"{section}"]: words = text.split(" ") - match_list = ([s for s in words if f"{search_for}" in s]) - 
metadata_dictionary[f"{search_for}_match"] .append(match_list) + match_list = [s for s in words if f"{search_for}" in s] + metadata_dictionary[f"{search_for}_match"].append(match_list) logging.info(f"looking for {search_for} in {section}") -def look_for_next_word(section, search_for=["number:", "no.", "No.", "number" ], metadata_dictionary=metadata_dictionary): +def look_for_next_word( + section, + search_for=["number:", "no.", "No.", "number"], + metadata_dictionary=metadata_dictionary, +): # chops the paragraph corresponding to a section into list of words # gets the word next to the matched string. metadata_dictionary[f"{search_for}_match"] = [] @@ -390,7 +451,9 @@ def look_for_next_word(section, search_for=["number:", "no.", "No.", "number" ], words = text.split(" ") words = iter(words) try: - match_list = ([next(words) for s in words if any(xs in s for xs in search_for)]) + match_list = [ + next(words) for s in words if any(xs in s for xs in search_for) + ] metadata_dictionary[f"{search_for}_match"].append(match_list) except StopIteration: metadata_dictionary[f"{search_for}_match"].append([]) @@ -398,7 +461,9 @@ def look_for_next_word(section, search_for=["number:", "no.", "No.", "number" ], logging.info(f"looking for {search_for} in {section}") -def add_if_file_contains_terms(section, metadata_dictionary=metadata_dictionary, terms=['iNaturalist']): +def add_if_file_contains_terms( + section, metadata_dictionary=metadata_dictionary, terms=["iNaturalist"] +): # method borrowed from original docanalysis metadata_dictionary["terms"] = [] for term in terms: @@ -406,30 +471,30 @@ def add_if_file_contains_terms(section, metadata_dictionary=metadata_dictionary, if term.lower() in text.lower(): metadata_dictionary["terms"].append(term) else: - metadata_dictionary["terms"].append('NaN') - logging.info(f'looking for term matches in {section}') + metadata_dictionary["terms"].append("NaN") + logging.info(f"looking for term matches in {section}") # calling all the functions -CPROJECT = os.path.join(os.path.expanduser('~'), 'ethics_statement_2000_generic') -SECTION= 'ethic' -#querying_pygetpapers_sectioning("inaturalist",'500',CPROJECT) +CPROJECT = os.path.join(os.path.expanduser("~"), "ethics_statement_2000_generic") +SECTION = "ethic" +# querying_pygetpapers_sectioning("inaturalist",'500',CPROJECT) get_metadata_json(CPROJECT) get_PMCIDS() parse_xml(CPROJECT, SECTION) get_abstract() get_keywords() key_phrase_extraction(SECTION) -#get_organism(SECTION) +# get_organism(SECTION) look_for_next_word(SECTION) -#look_for_next_word(SECTION, search_for="C.") -#look_for_next_word(SECTION, search_for='Citrus') +# look_for_next_word(SECTION, search_for="C.") +# look_for_next_word(SECTION, search_for='Citrus') add_if_file_contains_terms(SECTION) -convert_to_csv(f'ethics_{SECTION}2000.csv') +convert_to_csv(f"ethics_{SECTION}2000.csv") convert_to_json() -# -------end of code section from metadata_analysis.py +# -------end of code section from metadata_analysis.py -#TODO intergrate metadata_analyis.py to original docanalysis; -#TODO decide on functions we need from metadata_analysis.py -#TODO write methods to create ami-dictionaries from extracted entites and keywords +# TODO intergrate metadata_analyis.py to original docanalysis; +# TODO decide on functions we need from metadata_analysis.py +# TODO write methods to create ami-dictionaries from extracted entites and keywords diff --git a/docanalysis/frequency_analysis.py b/docanalysis/frequency_analysis.py index d899a9e..55e7811 100644 --- 
a/docanalysis/frequency_analysis.py +++ b/docanalysis/frequency_analysis.py @@ -1,23 +1,24 @@ - -import xml.etree.ElementTree as ET import os +import xml.etree.ElementTree as ET from collections import Counter + def get_terms_from_ami_xml(xml_path): tree = ET.parse(xml_path) root = tree.getroot() terms = [] - for para in root.iter('entry'): + for para in root.iter("entry"): terms.append(para.attrib["term"]) return terms + def frequency_counter(terms): frequency = {} # iterating over the list for item in terms: - # checking the element in dictionary + # checking the element in dictionary if item in frequency: # incrementing the counr frequency[item] += 1 @@ -29,9 +30,12 @@ def frequency_counter(terms): print(Counter(frequency).most_common()) -xml_path = os.path.join(os.getcwd(), 'ami_dict.xml') +xml_path = os.path.join(os.getcwd(), "ami_dict.xml") + + def main(): terms = get_terms_from_ami_xml(xml_path) frequency_counter(terms) -main() \ No newline at end of file + +main() diff --git a/demo.py b/examples/demo.py similarity index 100% rename from demo.py rename to examples/demo.py diff --git a/pmr_demo.py b/examples/pmr_demo.py similarity index 100% rename from pmr_demo.py rename to examples/pmr_demo.py diff --git a/setup.py b/setup.py index 9ffe7ac..bb5d719 100644 --- a/setup.py +++ b/setup.py @@ -1,12 +1,9 @@ -#!/usr/bin/env python # -*- coding: utf-8 -*- - try: from setuptools import setup except ImportError: from distutils.core import setup -import configparser -import os + with open('README.md') as readme_file: readme = readme_file.read() @@ -22,7 +19,7 @@ author_email='ayush@science.org.in', url='https://github.com/petermr/docanalysis', packages=[ - 'pygetpapers', + 'docanalysis', ], package_dir={'docanalysis': 'docanalysis'}, @@ -30,7 +27,7 @@ install_requires=requirements, license='Apache License', zip_safe=False, - keywords='research automation', + keywords=['research automation'], classifiers=[ 'Development Status :: 4 - Beta', 'Intended Audience :: Developers', From 38dd3d65ca6d1e5a6d2ee6d3b53e1e00e40d961b Mon Sep 17 00:00:00 2001 From: Kevin Maik Jablonka Date: Tue, 8 Feb 2022 14:17:15 +0100 Subject: [PATCH 2/3] chore: also format setup.py --- setup.py | 54 ++++++++++++++++++++++++++++++------------------------ 1 file changed, 30 insertions(+), 24 deletions(-) diff --git a/setup.py b/setup.py index bb5d719..1e550b2 100644 --- a/setup.py +++ b/setup.py @@ -5,40 +5,46 @@ from distutils.core import setup -with open('README.md') as readme_file: +with open("README.md") as readme_file: readme = readme_file.read() -requirements = ['pygetpapers', 'pandas', 'spacy', 'numpy', - 'matplotlib', 'tqdm', 'beautifulsoup4','nltk'] +requirements = [ + "pygetpapers", + "pandas", + "spacy", + "numpy", + "matplotlib", + "tqdm", + "beautifulsoup4", + "nltk", +] setup( - name='docanalysis', + name="docanalysis", version="0.0.3", - description='extract structured information from ethics paragraphs', + description="extract structured information from ethics paragraphs", long_description=readme, - author='Ayush Garg, Shweata N. Hegde', - author_email='ayush@science.org.in', - url='https://github.com/petermr/docanalysis', + author="Ayush Garg, Shweata N. 
Hegde", + author_email="ayush@science.org.in", + url="https://github.com/petermr/docanalysis", packages=[ - 'docanalysis', + "docanalysis", ], - package_dir={'docanalysis': - 'docanalysis'}, + package_dir={"docanalysis": "docanalysis"}, include_package_data=True, install_requires=requirements, - license='Apache License', + license="Apache License", zip_safe=False, - keywords=['research automation'], + keywords=["research automation"], classifiers=[ - 'Development Status :: 4 - Beta', - 'Intended Audience :: Developers', - 'License :: OSI Approved :: Apache Software License', - 'Natural Language :: English', - 'Programming Language :: Python :: 3.4', - 'Programming Language :: Python :: 3.5', - 'Programming Language :: Python :: 3.6', - 'Programming Language :: Python :: 3.7', - 'Programming Language :: Python :: 3.8', - 'Programming Language :: Python :: 3.9', - + "Development Status :: 4 - Beta", + "Intended Audience :: Developers", + "License :: OSI Approved :: Apache Software License", + "Natural Language :: English", + "Programming Language :: Python :: 3.4", + "Programming Language :: Python :: 3.5", + "Programming Language :: Python :: 3.6", + "Programming Language :: Python :: 3.7", + "Programming Language :: Python :: 3.8", + "Programming Language :: Python :: 3.9", ], ) From 02af78e8d49ba19036162abfe93bf2b042cc4949 Mon Sep 17 00:00:00 2001 From: Kevin Maik Jablonka Date: Tue, 8 Feb 2022 14:21:53 +0100 Subject: [PATCH 3/3] chore: remove unused imports, add dev dependencies --- docanalysis/extract_entities.py | 3 --- setup.py | 1 + 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/docanalysis/extract_entities.py b/docanalysis/extract_entities.py index 38d0726..29ffa8e 100644 --- a/docanalysis/extract_entities.py +++ b/docanalysis/extract_entities.py @@ -2,14 +2,11 @@ import logging import os import re -import subprocess -import sys import xml.etree.ElementTree as ET from fileinput import filename from glob import glob import pandas as pd -import scispacy import spacy import yake from bs4 import BeautifulSoup diff --git a/setup.py b/setup.py index 1e550b2..4ca57c9 100644 --- a/setup.py +++ b/setup.py @@ -32,6 +32,7 @@ package_dir={"docanalysis": "docanalysis"}, include_package_data=True, install_requires=requirements, + extras={"dev": ["pytest", "pytest-cov"]}, license="Apache License", zip_safe=False, keywords=["research automation"],