Commit

fix linting and formatting

Kabir Khan committed Jul 16, 2020
1 parent 4e5da37 commit 48f537a
Showing 25 changed files with 462 additions and 380 deletions.
27 changes: 9 additions & 18 deletions docs/src/api/app.py
@@ -3,15 +3,11 @@

import os

from dotenv import load_dotenv, find_dotenv
from fastapi import FastAPI, Body
from starlette.middleware.cors import CORSMiddleware
from starlette.responses import RedirectResponse
import spacy
import uvicorn

from models import LinkingRequest, LinkingResponse, LinkingRecord

from dotenv import find_dotenv, load_dotenv
from fastapi import Body, FastAPI
from models import LinkingRecord, LinkingRequest, LinkingResponse
from starlette.responses import RedirectResponse

load_dotenv(find_dotenv())
prefix = os.getenv("CLUSTER_ROUTE_PREFIX")
@@ -27,7 +23,7 @@
openapi_prefix=prefix,
)

example_request = list(srsly.read_json('./example_request.json'))
example_request = list(srsly.read_json("./example_request.json"))

nlp = spacy.load("../tutorial/models/ann_linker")

@@ -44,18 +40,13 @@ async def link(body: LinkingRequest = Body(..., example=example_request)):
res = LinkingResponse(documents=[])
for doc in body.documents:
spacy_doc = nlp.make_doc(doc.context)
spans = [
spacy_doc.char_span(s.start, s.end, label=s.label)
for s in doc.spans
]
spans = [spacy_doc.char_span(s.start, s.end, label=s.label) for s in doc.spans]
spacy_doc.ents = [s for s in spans if s]
spacy_doc = nlp.get_pipe('ann_linker')(spacy_doc)
spacy_doc = nlp.get_pipe("ann_linker")(spacy_doc)

for i, ent in enumerate(spacy_doc.ents):
doc.spans[i].id = ent.kb_id_

res.documents.append(
LinkingRecord(spans=doc.spans, context=doc.context)
)

res.documents.append(LinkingRecord(spans=doc.spans, context=doc.context))

return res
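
The reformatted /link handler above reads body.documents, builds a spaCy Doc from each record's context, turns the submitted spans into entities, runs the ann_linker pipe, and writes each entity's kb_id_ back onto the matching span. As a quick illustration, here is a minimal client sketch; the JSON field names are inferred from that handler, while the requests dependency, host, and port 8080 (the port used by remote_ann_linker.py below) are assumptions.

# Hypothetical client for the /link endpoint above; field names inferred
# from app.py, host/port assumed.
import requests

payload = {
    "documents": [
        {
            "context": "NLP is a highly researched area of machine learning",
            "spans": [
                {"start": 0, "end": 3, "label": "SKILL"},
                {"start": 35, "end": 51, "label": "SKILL"},
            ],
        }
    ]
}

resp = requests.post("http://localhost:8080/link", json=payload)
resp.raise_for_status()
for doc in resp.json()["documents"]:
    for span in doc["spans"]:
        # Each span should now carry the linked knowledge-base id, if any.
        print(span.get("id"), span)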
1 change: 1 addition & 0 deletions docs/src/api/models.py
@@ -1,4 +1,5 @@
from typing import List

from pydantic import BaseModel


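The models.py change above is only an import reorder. For context, the request/response schema that app.py depends on looks roughly like the sketch below; this is a hedged reconstruction from how app.py accesses documents, context, spans, start, end, label, and id, and the class name LinkingSpan in particular is hypothetical, not necessarily what the real file uses.

# Hypothetical reconstruction of docs/src/api/models.py, inferred from usage
# in app.py; the actual file may differ.
from typing import List, Optional

from pydantic import BaseModel


class LinkingSpan(BaseModel):
    start: int
    end: int
    label: str
    id: Optional[str] = None  # filled with ent.kb_id_ by the /link handler


class LinkingRecord(BaseModel):
    spans: List[LinkingSpan]
    context: str


class LinkingRequest(BaseModel):
    documents: List[LinkingRecord]


class LinkingResponse(BaseModel):
    documents: List[LinkingRecord]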
8 changes: 4 additions & 4 deletions docs/src/local_ann_linker.py
@@ -1,5 +1,4 @@
import spacy
from spacy.tokens import Span

if __name__ == "__main__":

@@ -9,12 +8,13 @@
nlp = spacy.load(model_dir)

# The NER component of the en_core_web_md model doesn't actually
# recognize the aliases as entities so we'll add a
# spaCy EntityRuler component for now to extract them.
ruler = nlp.create_pipe('entity_ruler')
ruler = nlp.create_pipe("entity_ruler")
patterns = [
{"label": "SKILL", "pattern": alias}
for alias in nlp.get_pipe('ann_linker').kb.get_alias_strings() + ['machine learn']
for alias in nlp.get_pipe("ann_linker").kb.get_alias_strings()
+ ["machine learn"]
]
ruler.add_patterns(patterns)
nlp.add_pipe(ruler, before="ann_linker")
14 changes: 7 additions & 7 deletions docs/src/remote_ann_linker.py
@@ -2,22 +2,22 @@

if __name__ == "__main__":
nlp = spacy.blank("en")
aliases = ['machine learning', 'ML', 'NLP', 'researched']
ruler = nlp.create_pipe('entity_ruler', {"overwrite_ents": True})
aliases = ["machine learning", "ML", "NLP", "researched"]
ruler = nlp.create_pipe("entity_ruler", {"overwrite_ents": True})
patterns = [{"label": "SKILL", "pattern": alias} for alias in aliases]
ruler.add_patterns(patterns)

remote_ann_linker = nlp.create_pipe('remote_ann_linker', {
'base_url': "http://localhost:8080/link"
})
remote_ann_linker = nlp.create_pipe(
"remote_ann_linker", {"base_url": "http://localhost:8080/link"}
)
nlp.add_pipe(remote_ann_linker)

doc = nlp("NLP is a highly researched area of machine learning")
print([(e.text, e.label_, e.kb_id_) for e in doc.ents])

# Outputs:
# [('NLP', 'SKILL', 'a3'), ('Machine learning', 'SKILL', 'a1')]
#
# In our entities.jsonl file
# a3 => Natural Language Processing
# a1 => Machine learning
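
Note that remote_ann_linker.py assumes the FastAPI service from docs/src/api/app.py is reachable at the configured base_url. Since app.py no longer imports uvicorn, the server is launched externally; a minimal launcher sketch follows, where the port is an assumption chosen to match the base_url above.

# Hypothetical launcher for the linking API from docs/src/api/app.py.
# Equivalent to running `uvicorn app:app --port 8080` from docs/src/api.
import uvicorn

if __name__ == "__main__":
    uvicorn.run("app:app", host="0.0.0.0", port=8080)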
1 change: 1 addition & 0 deletions pyproject.toml
@@ -50,6 +50,7 @@ api = [
"python-dotenv"
]
test = [
"autoflake",
"click-completion",
"pytest >=4.4.0",
"pytest-cov",
6 changes: 3 additions & 3 deletions scripts/format.sh
@@ -1,6 +1,6 @@
#!/bin/sh -e
set -x

autoflake --remove-all-unused-imports --recursive --remove-unused-variables --in-place docs/src/ typer tests --exclude=__init__.py
black typer tests docs/src
isort --multi-line=3 --trailing-comma --force-grid-wrap=0 --combine-as --line-width 88 --recursive --thirdparty typer --apply typer tests docs/src
autoflake --remove-all-unused-imports --recursive --remove-unused-variables --in-place docs/src/ spacy_ann tests --exclude=__init__.py
black spacy_ann tests docs/src
isort --multi-line=3 --trailing-comma --force-grid-wrap=0 --combine-as --line-width 88 --recursive --thirdparty spacy_ann --apply spacy_ann tests docs/src
6 changes: 3 additions & 3 deletions spacy_ann/__init__.py
@@ -3,15 +3,15 @@
"""spaCy ANN Linker, a pipeline component for generating spaCy KnowledgeBase Alias Candidates for Entity Linking."""


__version__ = '0.2.0'
__version__ = "0.2.0"

from .ann_linker import AnnLinker
from .remote_ann_linker import RemoteAnnLinker

# TODO: Uncomment (and probably fix a bit) once this PR is merged upstream
# https://github.com/explosion/spaCy/pull/4988 to enable kb registry with
# customizable `get_candidates` function
#
# from spacy.kb import KnowledgeBase
# from spacy.tokens import Span
# from spacy.util import registry
82 changes: 50 additions & 32 deletions spacy_ann/ann_kb.py
@@ -1,35 +1,31 @@
from typing import List, Dict, Set, Tuple
import json
from collections import defaultdict
from pathlib import Path
from timeit import default_timer as timer
from typing import List, Set, Tuple

import joblib
import nmslib
from nmslib.dist import FloatIndex
import numpy as np
from preshed.maps import PreshMap
import scipy
import srsly
from nmslib.dist import FloatIndex
from sklearn.feature_extraction.text import TfidfVectorizer
import spacy
from spacy.kb import Candidate, KnowledgeBase
from spacy.tokens import Doc, Span
from spacy.util import ensure_path, to_disk, from_disk
from spacy.kb import KnowledgeBase
from spacy.util import ensure_path, from_disk, to_disk
from spacy.vocab import Vocab
import srsly
from timeit import default_timer as timer
from wasabi import Printer
from spacy_ann.types import AliasCandidate
from wasabi import Printer


class AnnKnowledgeBase(KnowledgeBase):
def __init__(self,
def __init__(
self,
vocab: Vocab,
entity_vector_length: int = 64,
k: int = 1,
m_parameter: int = 100,
ef_search: int = 200,
ef_construction: int = 2000,
n_threads: int = 60
n_threads: int = 60,
):
"""Initialize a CandidateGenerator
@@ -56,7 +52,7 @@ def _initialize(
short_aliases: Set[str],
ann_index: FloatIndex,
vectorizer: TfidfVectorizer,
alias_tfidfs: scipy.sparse.csr_matrix
alias_tfidfs: scipy.sparse.csr_matrix,
):
"""Used in `fit` and `from_disk` to initialize the CandidateGenerator with computed
# TF-IDF Vectorizer and ANN Index
@@ -97,7 +93,7 @@ def fit_index(self, verbose: bool = True):
# resulting vectors using float16, meaning they take up half the memory on disk. Unfortunately
# we can't use the float16 format to actually run the vectorizer, because of this bug in sparse
# matrix representations in scipy: https://github.com/scipy/scipy/issues/7408

msg.text(f"Fitting tfidf vectorizer on {len(kb_aliases)} aliases")
tfidf_vectorizer = TfidfVectorizer(
analyzer="char_wb", ngram_range=(3, 3), min_df=2, dtype=np.float32
@@ -109,7 +105,9 @@ def fit_index(self, verbose: bool = True):
msg.text(f"Fitting and saving vectorizer took {round(total_time)} seconds")

msg.text(f"Finding empty (all zeros) tfidf vectors")
empty_tfidfs_boolean_flags = np.array(alias_tfidfs.sum(axis=1) != 0).reshape(-1,)
empty_tfidfs_boolean_flags = np.array(alias_tfidfs.sum(axis=1) != 0).reshape(
-1,
)
number_of_non_empty_tfidfs = sum(
empty_tfidfs_boolean_flags == False
) # pylint: disable=singleton-comparison
@@ -119,14 +117,18 @@ def fit_index(self, verbose: bool = True):
f"Deleting {number_of_non_empty_tfidfs}/{total_number_of_tfidfs} aliases because their tfidf is empty"
)
# remove empty tfidf vectors, otherwise nmslib will crash
aliases = [alias for alias, flag in zip(kb_aliases, empty_tfidfs_boolean_flags) if flag]
aliases = [
alias for alias, flag in zip(kb_aliases, empty_tfidfs_boolean_flags) if flag
]
alias_tfidfs = alias_tfidfs[empty_tfidfs_boolean_flags]
assert len(aliases) == np.size(alias_tfidfs, 0)

msg.text(f"Fitting ann index on {len(aliases)} aliases")
start_time = timer()
ann_index = nmslib.init(
method="hnsw", space="cosinesimil_sparse", data_type=nmslib.DataType.SPARSE_VECTOR
method="hnsw",
space="cosinesimil_sparse",
data_type=nmslib.DataType.SPARSE_VECTOR,
)
ann_index.addDataPointBatch(alias_tfidfs)
ann_index.createIndex(index_params, print_progress=verbose)
@@ -136,7 +138,9 @@ def fit_index(self, verbose: bool = True):
total_time = end_time - start_time
msg.text(f"Fitting ann index took {round(total_time)} seconds")

self._initialize(aliases, short_aliases, ann_index, tfidf_vectorizer, alias_tfidfs)
self._initialize(
aliases, short_aliases, ann_index, tfidf_vectorizer, alias_tfidfs
)
return self

def _nmslib_knn_with_zero_vectors(
Expand All @@ -154,7 +158,7 @@ def _nmslib_knn_with_zero_vectors(
k (int): k neighbors to consider
RETURNS (Tuple[np.ndarray, np.ndarray]): Tuple of [neighbors, distances]
"""
"""

empty_vectors_boolean_flags = np.array(vectors.sum(axis=1) != 0).reshape(-1,)
empty_vectors_count = vectors.shape[0] - sum(empty_vectors_boolean_flags)
@@ -172,7 +176,9 @@ def _nmslib_knn_with_zero_vectors(
# call `knnQueryBatch` to get neighbors
original_neighbours = self.ann_index.knnQueryBatch(vectors, k=k)

neighbors, distances = zip(*[(x[0].tolist(), x[1].tolist()) for x in original_neighbours])
neighbors, distances = zip(
*[(x[0].tolist(), x[1].tolist()) for x in original_neighbours]
)
neighbors = list(neighbors)
distances = list(distances)

@@ -205,9 +211,11 @@ def get_alias_candidates(self, mention_texts: List[str]):
# `ann_index.knnQueryBatch` crashes if one of the vectors is all zeros.
# `nmslib_knn_with_zero_vectors` is a wrapper around `ann_index.knnQueryBatch`
# that addresses this issue.
batch_neighbors, batch_distances = self._nmslib_knn_with_zero_vectors(tfidfs, self.k)
batch_neighbors, batch_distances = self._nmslib_knn_with_zero_vectors(
tfidfs, self.k
)
end_time = timer()
total_time = end_time - start_time
end_time - start_time

batch_candidates = []
for mention, neighbors, distances in zip(
@@ -225,7 +233,9 @@ def get_alias_candidates(self, mention_texts: List[str]):
for neighbor_index, distance in zip(neighbors, distances):
alias = self.aliases[neighbor_index]
similarity = 1.0 - distance
alias_candidates.append(AliasCandidate(alias=alias, similarity=similarity))
alias_candidates.append(
AliasCandidate(alias=alias, similarity=similarity)
)

batch_candidates.append(alias_candidates)

@@ -258,14 +268,18 @@ def dump(self, path: Path):
"m_parameter": self.m_parameter,
"ef_search": self.ef_search,
"ef_construction": self.ef_construction,
"n_threads": self.n_threads
"n_threads": self.n_threads,
}
serializers = {
"cg_cfg": lambda p: srsly.write_json(p, cfg),
"aliases": lambda p: srsly.write_json(p.with_suffix(".json"), self.aliases),
"short_aliases": lambda p: srsly.write_json(p.with_suffix(".json"), self.short_aliases),
"short_aliases": lambda p: srsly.write_json(
p.with_suffix(".json"), self.short_aliases
),
"ann_index": lambda p: self.ann_index.saveIndex(str(p.with_suffix(".bin"))),
"tfidf_vectorizer": lambda p: joblib.dump(self.vectorizer, p.with_suffix(".joblib")),
"tfidf_vectorizer": lambda p: joblib.dump(
self.vectorizer, p.with_suffix(".joblib")
),
"tfidf_vectors_sparse": lambda p: scipy.sparse.save_npz(
p.with_suffix(".npz"), self.alias_tfidfs.astype(np.float16)
),
@@ -275,7 +289,7 @@ def dump(self, path: Path):

def load_bulk(self, path: Path):
path = ensure_path(path)

super().load_bulk(path)

aliases_path = path / "aliases.json"
@@ -299,13 +313,17 @@ def load_bulk(self, path: Path):
tfidf_vectorizer = joblib.load(tfidf_vectorizer_path)
alias_tfidfs = scipy.sparse.load_npz(tfidf_vectors_path).astype(np.float32)
ann_index = nmslib.init(
method="hnsw", space="cosinesimil_sparse", data_type=nmslib.DataType.SPARSE_VECTOR
method="hnsw",
space="cosinesimil_sparse",
data_type=nmslib.DataType.SPARSE_VECTOR,
)
ann_index.addDataPointBatch(alias_tfidfs)
ann_index.loadIndex(str(ann_index_path))
query_time_params = {"efSearch": self.ef_search}
ann_index.setQueryTimeParams(query_time_params)

self._initialize(aliases, short_aliases, ann_index, tfidf_vectorizer, alias_tfidfs)
self._initialize(
aliases, short_aliases, ann_index, tfidf_vectorizer, alias_tfidfs
)

return self
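
Most of the reformatting in ann_kb.py wraps long calls in the same TF-IDF + nmslib workflow: fit a character 3-gram vectorizer over the alias strings, drop all-zero rows (nmslib crashes on empty sparse vectors), index the remaining rows in an HNSW cosine index, and query it in batch, converting distances to similarities. The sketch below walks through that workflow end to end using the nmslib and scikit-learn APIs referenced in the diff; the alias list and index parameters are illustrative, not the package defaults.

# Minimal sketch of the TF-IDF + nmslib pattern used by AnnKnowledgeBase.
import nmslib
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

aliases = ["machine learning", "ML", "natural language processing", "NLP"]

# Character 3-gram TF-IDF over alias strings, as in fit_index above.
vectorizer = TfidfVectorizer(analyzer="char_wb", ngram_range=(3, 3), dtype=np.float32)
alias_tfidfs = vectorizer.fit_transform(aliases)

# Drop all-zero rows, otherwise nmslib will crash.
non_empty = np.array(alias_tfidfs.sum(axis=1) != 0).reshape(-1,)
aliases = [a for a, keep in zip(aliases, non_empty) if keep]
alias_tfidfs = alias_tfidfs[non_empty]

# Build the sparse cosine HNSW index and query it in batch.
index = nmslib.init(
    method="hnsw",
    space="cosinesimil_sparse",
    data_type=nmslib.DataType.SPARSE_VECTOR,
)
index.addDataPointBatch(alias_tfidfs)
index.createIndex({"efConstruction": 2000, "M": 100}, print_progress=False)
index.setQueryTimeParams({"efSearch": 200})

query_tfidfs = vectorizer.transform(["machine learn"])
neighbors, distances = zip(*index.knnQueryBatch(query_tfidfs, k=2))
for idx, dist in zip(neighbors[0], distances[0]):
    print(aliases[idx], 1.0 - dist)  # similarity = 1 - cosine distance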
