From 0605240643bed4a6d09a34772e9689534915563d Mon Sep 17 00:00:00 2001 From: Kevin Maik Jablonka Date: Tue, 8 Feb 2022 14:17:00 +0100 Subject: [PATCH 1/3] chore: did some basic linting --- README.md | 14 +- __init__.py | 0 docanalysis/__init__.py | 2 +- docanalysis/docanalysis.py | 36 ++-- docanalysis/entity_extraction.py | 155 +++++++++------ docanalysis/extract_entities.py | 291 +++++++++++++++++----------- docanalysis/frequency_analysis.py | 16 +- demo.py => examples/demo.py | 0 pmr_demo.py => examples/pmr_demo.py | 0 setup.py | 9 +- 10 files changed, 315 insertions(+), 208 deletions(-) delete mode 100644 __init__.py rename demo.py => examples/demo.py (100%) rename pmr_demo.py => examples/pmr_demo.py (100%) diff --git a/README.md b/README.md index c1eddc5..c0c991f 100644 --- a/README.md +++ b/README.md @@ -48,7 +48,7 @@ Parameters: CORPUS_PATH: path to an existing corpus (CProject) labels_to_get: SpaCy recognizes Named-Entites and labels them. You can choose for lables you are interested by providing it as a list. For all available labels, check out the Tools Used section. ``` ## How to run? -We have created `demo.py` where you can run the package. +We have created `demo.py` in the `examples` folder where you can run the package. ``` import os @@ -74,12 +74,12 @@ with open('GPE.text', 'w') as f: f.write(str(list_with_gpe)) ``` To break this down, -|Variable snippet |What is it? | -|----------------------|----------------| -|`essential oil AND chemical composition` |Query to `pygetpapers` (EPMC default)| -|`100` |number of hits | -|stem_cell_research_300|Output directory| -|"ethics_dictionary", "ethics_key_phrases", "ethics_key_phrases.xml" |dictionary path | +| Variable snippet | What is it? | +| ------------------------------------------------------------------- | ------------------------------------- | +| `essential oil AND chemical composition` | Query to `pygetpapers` (EPMC default) | +| `100` | number of hits | +| stem_cell_research_300 | Output directory | +| "ethics_dictionary", "ethics_key_phrases", "ethics_key_phrases.xml" | dictionary path | ## What is a dictionary diff --git a/__init__.py b/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/docanalysis/__init__.py b/docanalysis/__init__.py index fc80254..2ae2839 100644 --- a/docanalysis/__init__.py +++ b/docanalysis/__init__.py @@ -1 +1 @@ -pass \ No newline at end of file +pass diff --git a/docanalysis/docanalysis.py b/docanalysis/docanalysis.py index b52d7ec..2caea84 100644 --- a/docanalysis/docanalysis.py +++ b/docanalysis/docanalysis.py @@ -1,19 +1,21 @@ -import os import logging +import os import sys -import configargparse -import coloredlogs +from functools import partialmethod from time import gmtime, strftime + +import coloredlogs +import configargparse from tqdm import tqdm -from functools import partialmethod + from docanalysis.entity_extraction import EntityExtraction -class Docanalysis: +class Docanalysis: def __init__(self): """This function makes all the constants""" self.entity_extraction = EntityExtraction() - self.version="0.0.3" + self.version = "0.0.3" def handle_logger_creation(self, args): """[summary] @@ -38,7 +40,7 @@ def handle_logger_creation(self, args): if args.logfile: self.handle_logfile(args, level) else: - coloredlogs.install(level=level, fmt='%(levelname)s: %(message)s') + coloredlogs.install(level=level, fmt="%(levelname)s: %(message)s") def handlecli(self): """Handles the command line interface using argparse""" @@ -85,7 +87,7 @@ def handlecli(self): parser.add_argument( 
"--entity_extraction", default=False, - nargs='+', + nargs="+", help="extracts specified entities chosen from a list of entities (CARDINAL, DATE, EVENT, FAC, GPE, LANGUAGE, LAW, LOC, MONEY, NORP, ORDINAL, ORG, PERCENT, PERSON, PRODUCT, QUANTITY, TIME, WORK_OF_ART, GGP, SO, TAXON, CHEBI, GO, CL)", ) parser.add_argument( @@ -121,7 +123,6 @@ def handlecli(self): help="[All] save log to specified file in output directory as well as printing to terminal", ) - if len(sys.argv) == 1: parser.print_help(sys.stderr) sys.exit() @@ -130,10 +131,19 @@ def handlecli(self): if vars(args)[arg] == "False": vars(args)[arg] = False self.handle_logger_creation(args) - self.entity_extraction.extract_entities_from_papers(args.project_name,args.dictionary,query=args.query,hits=args.hits, - make_project=args.run_pygetpapers, install_ami=False, removefalse=True, create_csv=True, - csv_name=args.output, labels_to_get=args.entity_extraction,make_ami_dict=args.make_ami_dict) - + self.entity_extraction.extract_entities_from_papers( + args.project_name, + args.dictionary, + query=args.query, + hits=args.hits, + make_project=args.run_pygetpapers, + install_ami=False, + removefalse=True, + create_csv=True, + csv_name=args.output, + labels_to_get=args.entity_extraction, + make_ami_dict=args.make_ami_dict, + ) def main(): diff --git a/docanalysis/entity_extraction.py b/docanalysis/entity_extraction.py index b4cce42..88659d6 100644 --- a/docanalysis/entity_extraction.py +++ b/docanalysis/entity_extraction.py @@ -1,19 +1,21 @@ -import os import logging +import os +import xml.etree.ElementTree as ET from glob import glob -import spacy + import pandas as pd +import spacy from bs4 import BeautifulSoup -from tqdm import tqdm -import xml.etree.ElementTree as ET from nltk import tokenize +from tqdm import tqdm try: - nlp = spacy.load('en_core_web_sm') + nlp = spacy.load("en_core_web_sm") except OSError: from spacy.cli import download - download('en_core_web_sm') - nlp = spacy.load('en_core_web_sm') + + download("en_core_web_sm") + nlp = spacy.load("en_core_web_sm") class EntityExtraction: @@ -23,15 +25,28 @@ def __init__(self): self.labels_to_get = [] logging.basicConfig(level=logging.INFO) - def extract_entities_from_papers(self, corpus_path, terms_xml_path, query=None, hits=30, - make_project=False, install_ami=False, removefalse=True, create_csv=True, - csv_name='entities.csv', labels_to_get=['GPE', 'ORG'],make_ami_dict=False): + def extract_entities_from_papers( + self, + corpus_path, + terms_xml_path, + query=None, + hits=30, + make_project=False, + install_ami=False, + removefalse=True, + create_csv=True, + csv_name="entities.csv", + labels_to_get=["GPE", "ORG"], + make_ami_dict=False, + ): self.labels_to_get = labels_to_get if make_project: if not query: - logging.warning('Please provide query as parameter') + logging.warning("Please provide query as parameter") return - logging.info(f"making project/searching {query} for {hits} hits into {corpus_path}") + logging.info( + f"making project/searching {query} for {hits} hits into {corpus_path}" + ) self.create_project_files(query, hits, corpus_path) if install_ami: logging.info(f"installing ami3 (check whether this is a good idea)") @@ -45,15 +60,19 @@ def extract_entities_from_papers(self, corpus_path, terms_xml_path, query=None, if terms_xml_path: terms = self.get_terms_from_ami_xml(terms_xml_path) self.add_if_file_contains_terms( - terms=terms, dict_with_parsed_xml=dict_with_parsed_xml) + terms=terms, dict_with_parsed_xml=dict_with_parsed_xml + ) if removefalse: 
self.remove_statements_not_having_xmldict_terms_or_entities( - dict_with_parsed_xml=dict_with_parsed_xml) + dict_with_parsed_xml=dict_with_parsed_xml + ) if create_csv: self.convert_dict_to_csv( - path=os.path.join(corpus_path, csv_name), dict_with_parsed_xml=dict_with_parsed_xml) + path=os.path.join(corpus_path, csv_name), + dict_with_parsed_xml=dict_with_parsed_xml, + ) if make_ami_dict: - self.handle_ami_dict_creation(dict_with_parsed_xml,make_ami_dict) + self.handle_ami_dict_creation(dict_with_parsed_xml, make_ami_dict) return dict_with_parsed_xml def create_project_files(self, QUERY, HITS, OUTPUT): @@ -68,8 +87,9 @@ def install_ami(self): def make_dict_with_parsed_xml(self, output): dict_with_parsed_xml = {} - all_paragraphs = glob(os.path.join( - output, '*', 'sections', '**', '[1_9]_p.xml'), recursive=True) + all_paragraphs = glob( + os.path.join(output, "*", "sections", "**", "[1_9]_p.xml"), recursive=True + ) counter = 1 logging.info(f"starting tokenization on {len(all_paragraphs)} paragraphs") for section_path in tqdm(all_paragraphs): @@ -90,11 +110,10 @@ def read_text_from_path(self, paragraph_path): tree = ET.parse(paragraph_path) root = tree.getroot() try: - xmlstr = ET.tostring(root, encoding='utf8', method='xml') - soup = BeautifulSoup(xmlstr, features='lxml') + xmlstr = ET.tostring(root, encoding="utf8", method="xml") + soup = BeautifulSoup(xmlstr, features="lxml") text = soup.get_text(separator="") - paragraph_text = text.replace( - '\n', '') + paragraph_text = text.replace("\n", "") except: paragraph_text = "empty" return paragraph_text @@ -102,31 +121,36 @@ def read_text_from_path(self, paragraph_path): def add_parsed_sections_to_dict(self, dict_with_parsed_xml): for paragraph in dict_with_parsed_xml: - doc = nlp(dict_with_parsed_xml[paragraph]['sentence']) + doc = nlp(dict_with_parsed_xml[paragraph]["sentence"]) entities, labels, position_end, position_start = self.make_required_lists() for ent in doc.ents: self.add_parsed_entities_to_lists( - entities, labels, position_end, position_start, ent) - self.add_lists_to_dict(dict_with_parsed_xml[paragraph], entities, labels, position_end, - position_start) + entities, labels, position_end, position_start, ent + ) + self.add_lists_to_dict( + dict_with_parsed_xml[paragraph], + entities, + labels, + position_end, + position_start, + ) def add_if_file_contains_terms(self, terms, dict_with_parsed_xml): for statement in dict_with_parsed_xml: dict_for_sentence = dict_with_parsed_xml[statement] - dict_for_sentence['has_terms'] = [] + dict_for_sentence["has_terms"] = [] for term in terms: - if term.lower().strip() in dict_for_sentence['sentence'].lower(): - dict_for_sentence['has_terms'].append(term) - dict_for_sentence['weight'] = len( - dict_for_sentence['has_terms']) + if term.lower().strip() in dict_for_sentence["sentence"].lower(): + dict_for_sentence["has_terms"].append(term) + dict_for_sentence["weight"] = len(dict_for_sentence["has_terms"]) def get_terms_from_ami_xml(self, xml_path): tree = ET.parse(xml_path) root = tree.getroot() terms = [] - for para in root.iter('entry'): + for para in root.iter("entry"): terms.append(para.attrib["term"]) return terms @@ -138,14 +162,18 @@ def make_required_lists(self): position_end = [] return entities, labels, position_end, position_start - def add_lists_to_dict(self, dict_for_sentence, entities, labels, position_end, position_start): + def add_lists_to_dict( + self, dict_for_sentence, entities, labels, position_end, position_start + ): - dict_for_sentence['entities'] = entities - 
dict_for_sentence['labels'] = labels - dict_for_sentence['position_start'] = position_start - dict_for_sentence['position_end'] = position_end + dict_for_sentence["entities"] = entities + dict_for_sentence["labels"] = labels + dict_for_sentence["position_start"] = position_start + dict_for_sentence["position_end"] = position_end - def add_parsed_entities_to_lists(self, entities, labels, position_end, position_start, ent=None): + def add_parsed_entities_to_lists( + self, entities, labels, position_end, position_start, ent=None + ): if ent.label_ in self.labels_to_get: entities.append(ent) labels.append(ent.label_) @@ -158,20 +186,23 @@ def convert_dict_to_csv(self, path, dict_with_parsed_xml): df = df.T for col in df: try: - df[col] = df[col].astype(str).str.replace( - "[", "").str.replace("]", "") - df[col] = df[col].astype(str).str.replace( - "'", "").str.replace("'", "") + df[col] = df[col].astype(str).str.replace("[", "").str.replace("]", "") + df[col] = df[col].astype(str).str.replace("'", "").str.replace("'", "") except: pass - df.to_csv(path, encoding='utf-8', line_terminator='\r\n') + df.to_csv(path, encoding="utf-8", line_terminator="\r\n") logging.info(f"wrote output to {path}") - def remove_statements_not_having_xmldict_terms_or_entities(self, dict_with_parsed_xml): + def remove_statements_not_having_xmldict_terms_or_entities( + self, dict_with_parsed_xml + ): statement_to_pop = [] for statement in dict_with_parsed_xml: sentect_dict = dict_with_parsed_xml[statement] - if len(sentect_dict['has_terms']) == 0 or len(sentect_dict['entities']) == 0: + if ( + len(sentect_dict["has_terms"]) == 0 + or len(sentect_dict["entities"]) == 0 + ): statement_to_pop.append(statement) for term in statement_to_pop: @@ -182,31 +213,31 @@ def extract_particular_fields(dict_with_parsed_xml, field): field_list = [] for sentence in dict_with_parsed_xml: sentect_dict = dict_with_parsed_xml[sentence] - for entity, label in zip(sentect_dict['entities'], sentect_dict['labels']): + for entity, label in zip(sentect_dict["entities"], sentect_dict["labels"]): if label == field: if entity not in field_list: field_list.append(entity) return field_list - def make_ami_dict_from_list(self,list_of_terms,title): - xml_string=f''' + def make_ami_dict_from_list(self, list_of_terms, title): + xml_string = f""" - ''' + """ for term in list_of_terms: - xml_string+=f''' + xml_string += f""" - ''' - xml_string+="" + """ + xml_string += "" return xml_string - - def write_string_to_file(self,string_to_put,title): - with open(f'{title}.xml',mode='w') as f: + + def write_string_to_file(self, string_to_put, title): + with open(f"{title}.xml", mode="w") as f: f.write(string_to_put) - - def handle_ami_dict_creation(self,result_dictionary,title): - list_of_entities=[] + + def handle_ami_dict_creation(self, result_dictionary, title): + list_of_entities = [] for entry in result_dictionary: - if 'entities' in entry: - list_of_entities+=entry['entities'] - xml_dict = self.make_ami_dict_from_list(list_of_entities,title) - self.write_string_to_file(xml_dict,f'{title}.xml') + if "entities" in entry: + list_of_entities += entry["entities"] + xml_dict = self.make_ami_dict_from_list(list_of_entities, title) + self.write_string_to_file(xml_dict, f"{title}.xml") diff --git a/docanalysis/extract_entities.py b/docanalysis/extract_entities.py index 97ba3c5..38d0726 100644 --- a/docanalysis/extract_entities.py +++ b/docanalysis/extract_entities.py @@ -1,25 +1,28 @@ -from fileinput import filename +import json +import logging import os +import re 
+import subprocess import sys -import logging +import xml.etree.ElementTree as ET +from fileinput import filename from glob import glob -import spacy + import pandas as pd -from bs4 import BeautifulSoup -from tqdm import tqdm -import xml.etree.ElementTree as ET -from nltk import tokenize -import subprocess import scispacy -import json -import re +import spacy import yake +from bs4 import BeautifulSoup +from nltk import tokenize +from tqdm import tqdm + try: - nlp = spacy.load('en_core_web_sm') + nlp = spacy.load("en_core_web_sm") except OSError: from spacy.cli import download - download('en_core_web_sm') - nlp = spacy.load('en_core_web_sm') + + download("en_core_web_sm") + nlp = spacy.load("en_core_web_sm") class DocAnalysis: @@ -29,9 +32,19 @@ def __init__(self): self.labels_to_get = [] logging.basicConfig(level=logging.INFO) - def extract_entities_from_papers(self, corpus_path, terms_xml_path, query=None, hits=30, - make_project=False, install_ami=False, removefalse=True, create_csv=True, - csv_name='entities.csv', labels_to_get=['GPE', 'ORG']): + def extract_entities_from_papers( + self, + corpus_path, + terms_xml_path, + query=None, + hits=30, + make_project=False, + install_ami=False, + removefalse=True, + create_csv=True, + csv_name="entities.csv", + labels_to_get=["GPE", "ORG"], + ): """[summary] :param query: [description] @@ -58,9 +71,11 @@ def extract_entities_from_papers(self, corpus_path, terms_xml_path, query=None, self.labels_to_get = labels_to_get if make_project: if not query: - logging.warning('Please provide query as parameter') + logging.warning("Please provide query as parameter") return - logging.info(f"making project/searching {query} for {hits} hits into {corpus_path}") + logging.info( + f"making project/searching {query} for {hits} hits into {corpus_path}" + ) self.create_project_files(query, hits, corpus_path) if install_ami: logging.info(f"installing ami3 (check whether this is a good idea)") @@ -80,14 +95,18 @@ def extract_entities_from_papers(self, corpus_path, terms_xml_path, query=None, terms = self.get_terms_from_ami_xml(terms_xml_path) # moved from (1) self.add_if_file_contains_terms( - terms=terms, dict_with_parsed_xml=dict_with_parsed_xml) + terms=terms, dict_with_parsed_xml=dict_with_parsed_xml + ) if removefalse: self.remove_statements_not_having_xmldict_terms_or_entities( - dict_with_parsed_xml=dict_with_parsed_xml) + dict_with_parsed_xml=dict_with_parsed_xml + ) if create_csv: self.convert_dict_to_csv( - path=os.path.join(corpus_path, csv_name), dict_with_parsed_xml=dict_with_parsed_xml) + path=os.path.join(corpus_path, csv_name), + dict_with_parsed_xml=dict_with_parsed_xml, + ) return dict_with_parsed_xml def create_project_files(self, QUERY, HITS, OUTPUT): @@ -102,8 +121,9 @@ def install_ami(self): def make_dict_with_parsed_xml(self, output): dict_with_parsed_xml = {} - all_paragraphs = glob(os.path.join( - output, '*', 'sections', '**', '[1_9]_p.xml'), recursive=True) + all_paragraphs = glob( + os.path.join(output, "*", "sections", "**", "[1_9]_p.xml"), recursive=True + ) counter = 1 logging.info(f"starting tokenization on {len(all_paragraphs)} paragraphs") for section_path in tqdm(all_paragraphs): @@ -124,11 +144,10 @@ def read_text_from_path(self, paragraph_path): tree = ET.parse(paragraph_path) root = tree.getroot() try: - xmlstr = ET.tostring(root, encoding='utf8', method='xml') - soup = BeautifulSoup(xmlstr, features='lxml') + xmlstr = ET.tostring(root, encoding="utf8", method="xml") + soup = BeautifulSoup(xmlstr, features="lxml") text = 
soup.get_text(separator="") - paragraph_text = text.replace( - '\n', '') + paragraph_text = text.replace("\n", "") except: paragraph_text = "empty" return paragraph_text @@ -136,31 +155,36 @@ def read_text_from_path(self, paragraph_path): def add_parsed_sections_to_dict(self, dict_with_parsed_xml): for paragraph in dict_with_parsed_xml: - doc = nlp(dict_with_parsed_xml[paragraph]['sentence']) + doc = nlp(dict_with_parsed_xml[paragraph]["sentence"]) entities, labels, position_end, position_start = self.make_required_lists() for ent in doc.ents: self.add_parsed_entities_to_lists( - entities, labels, position_end, position_start, ent) - self.add_lists_to_dict(dict_with_parsed_xml[paragraph], entities, labels, position_end, - position_start) + entities, labels, position_end, position_start, ent + ) + self.add_lists_to_dict( + dict_with_parsed_xml[paragraph], + entities, + labels, + position_end, + position_start, + ) def add_if_file_contains_terms(self, terms, dict_with_parsed_xml): for statement in dict_with_parsed_xml: dict_for_sentence = dict_with_parsed_xml[statement] - dict_for_sentence['has_terms'] = [] + dict_for_sentence["has_terms"] = [] for term in terms: - if term.lower().strip() in dict_for_sentence['sentence'].lower(): - dict_for_sentence['has_terms'].append(term) - dict_for_sentence['weight'] = len( - dict_for_sentence['has_terms']) + if term.lower().strip() in dict_for_sentence["sentence"].lower(): + dict_for_sentence["has_terms"].append(term) + dict_for_sentence["weight"] = len(dict_for_sentence["has_terms"]) def get_terms_from_ami_xml(self, xml_path): tree = ET.parse(xml_path) root = tree.getroot() terms = [] - for para in root.iter('entry'): + for para in root.iter("entry"): terms.append(para.attrib["term"]) return terms @@ -172,14 +196,18 @@ def make_required_lists(self): position_end = [] return entities, labels, position_end, position_start - def add_lists_to_dict(self, dict_for_sentence, entities, labels, position_end, position_start): + def add_lists_to_dict( + self, dict_for_sentence, entities, labels, position_end, position_start + ): - dict_for_sentence['entities'] = entities - dict_for_sentence['labels'] = labels - dict_for_sentence['position_start'] = position_start - dict_for_sentence['position_end'] = position_end + dict_for_sentence["entities"] = entities + dict_for_sentence["labels"] = labels + dict_for_sentence["position_start"] = position_start + dict_for_sentence["position_end"] = position_end - def add_parsed_entities_to_lists(self, entities, labels, position_end, position_start, ent=None): + def add_parsed_entities_to_lists( + self, entities, labels, position_end, position_start, ent=None + ): if ent.label_ in self.labels_to_get: entities.append(ent) labels.append(ent.label_) @@ -192,20 +220,23 @@ def convert_dict_to_csv(self, path, dict_with_parsed_xml): df = df.T for col in df: try: - df[col] = df[col].astype(str).str.replace( - "[", "").str.replace("]", "") - df[col] = df[col].astype(str).str.replace( - "'", "").str.replace("'", "") + df[col] = df[col].astype(str).str.replace("[", "").str.replace("]", "") + df[col] = df[col].astype(str).str.replace("'", "").str.replace("'", "") except: pass - df.to_csv(path, encoding='utf-8', line_terminator='\r\n') + df.to_csv(path, encoding="utf-8", line_terminator="\r\n") logging.info(f"wrote output to {path}") - def remove_statements_not_having_xmldict_terms_or_entities(self, dict_with_parsed_xml): + def remove_statements_not_having_xmldict_terms_or_entities( + self, dict_with_parsed_xml + ): statement_to_pop = [] 
for statement in dict_with_parsed_xml: sentect_dict = dict_with_parsed_xml[statement] - if len(sentect_dict['has_terms']) == 0 or len(sentect_dict['entities']) == 0: + if ( + len(sentect_dict["has_terms"]) == 0 + or len(sentect_dict["entities"]) == 0 + ): statement_to_pop.append(statement) for term in statement_to_pop: @@ -225,38 +256,44 @@ def extract_particular_fields(dict_with_parsed_xml, field): field_list = [] for sentence in dict_with_parsed_xml: sentect_dict = dict_with_parsed_xml[sentence] - for entity, label in zip(sentect_dict['entities'], sentect_dict['labels']): + for entity, label in zip(sentect_dict["entities"], sentect_dict["labels"]): if label == field: if entity not in field_list: field_list.append(entity) return field_list - def make_ami_dict_from_list(self,list_of_terms,title): - xml_string=f''' + def make_ami_dict_from_list(self, list_of_terms, title): + xml_string = f""" - ''' + """ for term in list_of_terms: - xml_string+=f''' + xml_string += f""" - ''' - xml_string+="" + """ + xml_string += "" return xml_string - - def write_string_to_file(self,string_to_put,title): - with open(f'{title}.xml',mode='w') as f: + + def write_string_to_file(self, string_to_put, title): + with open(f"{title}.xml", mode="w") as f: f.write(string_to_put) -# -------this section comes from metadata_analysis.py + +# -------this section comes from metadata_analysis.py # (https://github.com/petermr/crops/blob/main/metadata_analysis/metadata_analysis.py) metadata_dictionary = {} + def get_metadata_json(output_directory): WORKING_DIRECTORY = os.getcwd() - glob_results = glob.glob(os.path.join(WORKING_DIRECTORY, - output_directory, "*", 'eupmc_result.json')) + glob_results = glob.glob( + os.path.join(WORKING_DIRECTORY, output_directory, "*", "eupmc_result.json") + ) metadata_dictionary["metadata_json"] = glob_results - logging.info(f'metadata found for {len(metadata_dictionary["metadata_json"])} papers') + logging.info( + f'metadata found for {len(metadata_dictionary["metadata_json"])} papers' + ) + def get_PMCIDS(metadata_dictionary=metadata_dictionary): # gets PMCDIDs from metadata_JSON of individual papers. @@ -264,35 +301,46 @@ def get_PMCIDS(metadata_dictionary=metadata_dictionary): metadata_dictionary["PMCIDS"] = [] for metadata in metadata_dictionary["metadata_json"]: - with open(metadata, encoding='utf-8') as f: + with open(metadata, encoding="utf-8") as f: metadata_in_json = json.load(f) try: - metadata_dictionary["PMCIDS"].append( - metadata_in_json["full"]["pmcid"]) + metadata_dictionary["PMCIDS"].append(metadata_in_json["full"]["pmcid"]) except KeyError: - metadata_dictionary["PMCIDS"].append('NaN') - logging.info('getting PMCIDs') + metadata_dictionary["PMCIDS"].append("NaN") + logging.info("getting PMCIDs") + def parse_xml(output_directory, section, metadata_dictionary=metadata_dictionary): - # gets the text from XML. Clubs all the paragraphs in the section into one. + # gets the text from XML. Clubs all the paragraphs in the section into one. 
metadata_dictionary[f"{section}"] = [] for pmc in metadata_dictionary["PMCIDS"]: paragraphs = [] - section_glob = glob.glob(os.path.join(os.getcwd(), output_directory, - pmc, 'sections', '**', f'*{section}*', '**', '*.xml'), - recursive=True) + section_glob = glob.glob( + os.path.join( + os.getcwd(), + output_directory, + pmc, + "sections", + "**", + f"*{section}*", + "**", + "*.xml", + ), + recursive=True, + ) for result in section_glob: tree = ET.parse(result) root = tree.getroot() - xmlstr = ET.tostring(root, encoding='utf-8', method='xml') - soup = BeautifulSoup(xmlstr, features='lxml') + xmlstr = ET.tostring(root, encoding="utf-8", method="xml") + soup = BeautifulSoup(xmlstr, features="lxml") text = soup.get_text(separator="") - text = text.replace('\n', '') + text = text.replace("\n", "") paragraphs.append(text) - concated_paragraph = ' '.join(paragraphs) + concated_paragraph = " ".join(paragraphs) metadata_dictionary[f"{section}"].append(concated_paragraph) logging.info(f"parsing {section} section") + def get_abstract(metadata_dictionary=metadata_dictionary): # gets abstracts from the metadata json. # We might want to get the abstract from the fulltext, @@ -300,14 +348,14 @@ def get_abstract(metadata_dictionary=metadata_dictionary): TAG_RE = re.compile(r"<[^>]+>") metadata_dictionary["abstract"] = [] for metadata in metadata_dictionary["metadata_json"]: - with open(metadata, encoding='utf-8') as f: + with open(metadata, encoding="utf-8") as f: metadata_in_json = json.load(f) try: raw_abstract = metadata_in_json["full"]["abstractText"] - abstract = TAG_RE.sub(' ', raw_abstract) + abstract = TAG_RE.sub(" ", raw_abstract) metadata_dictionary["abstract"].append(abstract) except KeyError: - metadata_dictionary["abstract"].append('NaN') + metadata_dictionary["abstract"].append("NaN") logging.info("getting the abstracts") @@ -317,32 +365,36 @@ def get_keywords(metadata_dictionary=metadata_dictionary): # since the format of the metadata JSON has changed from time to time. 
metadata_dictionary["keywords"] = [] for metadata in metadata_dictionary["metadata_json"]: - with open(metadata, encoding='utf-8') as f: + with open(metadata, encoding="utf-8") as f: metadata_in_json = json.load(f) try: metadata_dictionary["keywords"].append( - metadata_in_json["full"]["keywordList"]["keyword"]) + metadata_in_json["full"]["keywordList"]["keyword"] + ) except KeyError: metadata_dictionary["keywords"].append([]) logging.info("getting the keywords from metadata") def key_phrase_extraction(section, metadata_dictionary=metadata_dictionary): - # extracts keyphrases from the blob of texts of section specified for each paper using YAKE + # extracts keyphrases from the blob of texts of section specified for each paper using YAKE metadata_dictionary["yake_keywords"] = [] for text in metadata_dictionary[f"{section}"]: custom_kw_extractor = yake.KeywordExtractor( - lan='en', n=2, top=10, features=None) + lan="en", n=2, top=10, features=None + ) keywords = custom_kw_extractor.extract_keywords(text) keywords_list = [] for kw in keywords: keywords_list.append(kw[0]) metadata_dictionary["yake_keywords"].append(keywords_list) - logging.info(f'extracted key phrases from {section}') + logging.info(f"extracted key phrases from {section}") -def get_organism(section,label_interested= 'TAXON', metadata_dictionary=metadata_dictionary): - #nlp = spacy.load("en_ner_bionlp13cg_md") +def get_organism( + section, label_interested="TAXON", metadata_dictionary=metadata_dictionary +): + # nlp = spacy.load("en_ner_bionlp13cg_md") nlp = spacy.load("en_core_sci_sm") metadata_dictionary["entities"] = [] for sci_text in metadata_dictionary[f"{section}"]: @@ -352,23 +404,28 @@ def get_organism(section,label_interested= 'TAXON', metadata_dictionary=metadata if ent.label_ == label_interested: entity.append(ent.text) metadata_dictionary["entities"].append(entity) - logging.info(F"NER using SciSpacy - looking for {label_interested}") + logging.info(f"NER using SciSpacy - looking for {label_interested}") -def convert_to_csv(path='keywords_results_yake_organism_pmcid_tps_cam_ter_c.csv', metadata_dictionary=metadata_dictionary): - # method borrowed from original docanalysis +def convert_to_csv( + path="keywords_results_yake_organism_pmcid_tps_cam_ter_c.csv", + metadata_dictionary=metadata_dictionary, +): + # method borrowed from original docanalysis df = pd.DataFrame(metadata_dictionary) - df.to_csv(path, encoding='utf-8', line_terminator='\r\n') - logging.info(f'writing the keywords to {path}') + df.to_csv(path, encoding="utf-8", line_terminator="\r\n") + logging.info(f"writing the keywords to {path}") -def convert_to_json(path='ethics_statement_2000.json', metadata_dictionary = metadata_dictionary): +def convert_to_json( + path="ethics_statement_2000.json", metadata_dictionary=metadata_dictionary +): # converts the python dictionary containing output into a JSON file json_file = json.dumps(metadata_dictionary) - f = open(path,"w", encoding='ascii') + f = open(path, "w", encoding="ascii") f.write(json_file) f.close() - logging.info(f'writing the dictionary to {path}') + logging.info(f"writing the dictionary to {path}") def look_for_a_word(section, search_for="TPS", metadata_dictionary=metadata_dictionary): @@ -377,12 +434,16 @@ def look_for_a_word(section, search_for="TPS", metadata_dictionary=metadata_dict metadata_dictionary[f"{search_for}_match"] = [] for text in metadata_dictionary[f"{section}"]: words = text.split(" ") - match_list = ([s for s in words if f"{search_for}" in s]) - 
metadata_dictionary[f"{search_for}_match"] .append(match_list) + match_list = [s for s in words if f"{search_for}" in s] + metadata_dictionary[f"{search_for}_match"].append(match_list) logging.info(f"looking for {search_for} in {section}") -def look_for_next_word(section, search_for=["number:", "no.", "No.", "number" ], metadata_dictionary=metadata_dictionary): +def look_for_next_word( + section, + search_for=["number:", "no.", "No.", "number"], + metadata_dictionary=metadata_dictionary, +): # chops the paragraph corresponding to a section into list of words # gets the word next to the matched string. metadata_dictionary[f"{search_for}_match"] = [] @@ -390,7 +451,9 @@ def look_for_next_word(section, search_for=["number:", "no.", "No.", "number" ], words = text.split(" ") words = iter(words) try: - match_list = ([next(words) for s in words if any(xs in s for xs in search_for)]) + match_list = [ + next(words) for s in words if any(xs in s for xs in search_for) + ] metadata_dictionary[f"{search_for}_match"].append(match_list) except StopIteration: metadata_dictionary[f"{search_for}_match"].append([]) @@ -398,7 +461,9 @@ def look_for_next_word(section, search_for=["number:", "no.", "No.", "number" ], logging.info(f"looking for {search_for} in {section}") -def add_if_file_contains_terms(section, metadata_dictionary=metadata_dictionary, terms=['iNaturalist']): +def add_if_file_contains_terms( + section, metadata_dictionary=metadata_dictionary, terms=["iNaturalist"] +): # method borrowed from original docanalysis metadata_dictionary["terms"] = [] for term in terms: @@ -406,30 +471,30 @@ def add_if_file_contains_terms(section, metadata_dictionary=metadata_dictionary, if term.lower() in text.lower(): metadata_dictionary["terms"].append(term) else: - metadata_dictionary["terms"].append('NaN') - logging.info(f'looking for term matches in {section}') + metadata_dictionary["terms"].append("NaN") + logging.info(f"looking for term matches in {section}") # calling all the functions -CPROJECT = os.path.join(os.path.expanduser('~'), 'ethics_statement_2000_generic') -SECTION= 'ethic' -#querying_pygetpapers_sectioning("inaturalist",'500',CPROJECT) +CPROJECT = os.path.join(os.path.expanduser("~"), "ethics_statement_2000_generic") +SECTION = "ethic" +# querying_pygetpapers_sectioning("inaturalist",'500',CPROJECT) get_metadata_json(CPROJECT) get_PMCIDS() parse_xml(CPROJECT, SECTION) get_abstract() get_keywords() key_phrase_extraction(SECTION) -#get_organism(SECTION) +# get_organism(SECTION) look_for_next_word(SECTION) -#look_for_next_word(SECTION, search_for="C.") -#look_for_next_word(SECTION, search_for='Citrus') +# look_for_next_word(SECTION, search_for="C.") +# look_for_next_word(SECTION, search_for='Citrus') add_if_file_contains_terms(SECTION) -convert_to_csv(f'ethics_{SECTION}2000.csv') +convert_to_csv(f"ethics_{SECTION}2000.csv") convert_to_json() -# -------end of code section from metadata_analysis.py +# -------end of code section from metadata_analysis.py -#TODO intergrate metadata_analyis.py to original docanalysis; -#TODO decide on functions we need from metadata_analysis.py -#TODO write methods to create ami-dictionaries from extracted entites and keywords +# TODO intergrate metadata_analyis.py to original docanalysis; +# TODO decide on functions we need from metadata_analysis.py +# TODO write methods to create ami-dictionaries from extracted entites and keywords diff --git a/docanalysis/frequency_analysis.py b/docanalysis/frequency_analysis.py index d899a9e..55e7811 100644 --- 
a/docanalysis/frequency_analysis.py +++ b/docanalysis/frequency_analysis.py @@ -1,23 +1,24 @@ - -import xml.etree.ElementTree as ET import os +import xml.etree.ElementTree as ET from collections import Counter + def get_terms_from_ami_xml(xml_path): tree = ET.parse(xml_path) root = tree.getroot() terms = [] - for para in root.iter('entry'): + for para in root.iter("entry"): terms.append(para.attrib["term"]) return terms + def frequency_counter(terms): frequency = {} # iterating over the list for item in terms: - # checking the element in dictionary + # checking the element in dictionary if item in frequency: # incrementing the counr frequency[item] += 1 @@ -29,9 +30,12 @@ def frequency_counter(terms): print(Counter(frequency).most_common()) -xml_path = os.path.join(os.getcwd(), 'ami_dict.xml') +xml_path = os.path.join(os.getcwd(), "ami_dict.xml") + + def main(): terms = get_terms_from_ami_xml(xml_path) frequency_counter(terms) -main() \ No newline at end of file + +main() diff --git a/demo.py b/examples/demo.py similarity index 100% rename from demo.py rename to examples/demo.py diff --git a/pmr_demo.py b/examples/pmr_demo.py similarity index 100% rename from pmr_demo.py rename to examples/pmr_demo.py diff --git a/setup.py b/setup.py index 9ffe7ac..bb5d719 100644 --- a/setup.py +++ b/setup.py @@ -1,12 +1,9 @@ -#!/usr/bin/env python # -*- coding: utf-8 -*- - try: from setuptools import setup except ImportError: from distutils.core import setup -import configparser -import os + with open('README.md') as readme_file: readme = readme_file.read() @@ -22,7 +19,7 @@ author_email='ayush@science.org.in', url='https://github.com/petermr/docanalysis', packages=[ - 'pygetpapers', + 'docanalysis', ], package_dir={'docanalysis': 'docanalysis'}, @@ -30,7 +27,7 @@ install_requires=requirements, license='Apache License', zip_safe=False, - keywords='research automation', + keywords=['research automation'], classifiers=[ 'Development Status :: 4 - Beta', 'Intended Audience :: Developers', From 38dd3d65ca6d1e5a6d2ee6d3b53e1e00e40d961b Mon Sep 17 00:00:00 2001 From: Kevin Maik Jablonka Date: Tue, 8 Feb 2022 14:17:15 +0100 Subject: [PATCH 2/3] chore: also format setup.py --- setup.py | 54 ++++++++++++++++++++++++++++++------------------------ 1 file changed, 30 insertions(+), 24 deletions(-) diff --git a/setup.py b/setup.py index bb5d719..1e550b2 100644 --- a/setup.py +++ b/setup.py @@ -5,40 +5,46 @@ from distutils.core import setup -with open('README.md') as readme_file: +with open("README.md") as readme_file: readme = readme_file.read() -requirements = ['pygetpapers', 'pandas', 'spacy', 'numpy', - 'matplotlib', 'tqdm', 'beautifulsoup4','nltk'] +requirements = [ + "pygetpapers", + "pandas", + "spacy", + "numpy", + "matplotlib", + "tqdm", + "beautifulsoup4", + "nltk", +] setup( - name='docanalysis', + name="docanalysis", version="0.0.3", - description='extract structured information from ethics paragraphs', + description="extract structured information from ethics paragraphs", long_description=readme, - author='Ayush Garg, Shweata N. Hegde', - author_email='ayush@science.org.in', - url='https://github.com/petermr/docanalysis', + author="Ayush Garg, Shweata N. 
Hegde", + author_email="ayush@science.org.in", + url="https://github.com/petermr/docanalysis", packages=[ - 'docanalysis', + "docanalysis", ], - package_dir={'docanalysis': - 'docanalysis'}, + package_dir={"docanalysis": "docanalysis"}, include_package_data=True, install_requires=requirements, - license='Apache License', + license="Apache License", zip_safe=False, - keywords=['research automation'], + keywords=["research automation"], classifiers=[ - 'Development Status :: 4 - Beta', - 'Intended Audience :: Developers', - 'License :: OSI Approved :: Apache Software License', - 'Natural Language :: English', - 'Programming Language :: Python :: 3.4', - 'Programming Language :: Python :: 3.5', - 'Programming Language :: Python :: 3.6', - 'Programming Language :: Python :: 3.7', - 'Programming Language :: Python :: 3.8', - 'Programming Language :: Python :: 3.9', - + "Development Status :: 4 - Beta", + "Intended Audience :: Developers", + "License :: OSI Approved :: Apache Software License", + "Natural Language :: English", + "Programming Language :: Python :: 3.4", + "Programming Language :: Python :: 3.5", + "Programming Language :: Python :: 3.6", + "Programming Language :: Python :: 3.7", + "Programming Language :: Python :: 3.8", + "Programming Language :: Python :: 3.9", ], ) From 02af78e8d49ba19036162abfe93bf2b042cc4949 Mon Sep 17 00:00:00 2001 From: Kevin Maik Jablonka Date: Tue, 8 Feb 2022 14:21:53 +0100 Subject: [PATCH 3/3] chore: remove unused imports, add dev dependencies --- docanalysis/extract_entities.py | 3 --- setup.py | 1 + 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/docanalysis/extract_entities.py b/docanalysis/extract_entities.py index 38d0726..29ffa8e 100644 --- a/docanalysis/extract_entities.py +++ b/docanalysis/extract_entities.py @@ -2,14 +2,11 @@ import logging import os import re -import subprocess -import sys import xml.etree.ElementTree as ET from fileinput import filename from glob import glob import pandas as pd -import scispacy import spacy import yake from bs4 import BeautifulSoup diff --git a/setup.py b/setup.py index 1e550b2..4ca57c9 100644 --- a/setup.py +++ b/setup.py @@ -32,6 +32,7 @@ package_dir={"docanalysis": "docanalysis"}, include_package_data=True, install_requires=requirements, + extras={"dev": ["pytest", "pytest-cov"]}, license="Apache License", zip_safe=False, keywords=["research automation"],