Commit

fix linting and formatting

Kabir Khan committed Jul 16, 2020
1 parent 4e5da37 commit 48f537a
Showing 25 changed files with 462 additions and 380 deletions.
27 changes: 9 additions & 18 deletions docs/src/api/app.py
@@ -3,15 +3,11 @@

import os

from dotenv import load_dotenv, find_dotenv
from fastapi import FastAPI, Body
from starlette.middleware.cors import CORSMiddleware
from starlette.responses import RedirectResponse
import spacy
import uvicorn

from models import LinkingRequest, LinkingResponse, LinkingRecord

from dotenv import find_dotenv, load_dotenv
from fastapi import Body, FastAPI
from models import LinkingRecord, LinkingRequest, LinkingResponse
from starlette.responses import RedirectResponse

load_dotenv(find_dotenv())
prefix = os.getenv("CLUSTER_ROUTE_PREFIX")
@@ -27,7 +23,7 @@
openapi_prefix=prefix,
)

example_request = list(srsly.read_json('./example_request.json'))
example_request = list(srsly.read_json("./example_request.json"))

nlp = spacy.load("../tutorial/models/ann_linker")

@@ -44,18 +40,13 @@ async def link(body: LinkingRequest = Body(..., example=example_request)):
res = LinkingResponse(documents=[])
for doc in body.documents:
spacy_doc = nlp.make_doc(doc.context)
spans = [
spacy_doc.char_span(s.start, s.end, label=s.label)
for s in doc.spans
]
spans = [spacy_doc.char_span(s.start, s.end, label=s.label) for s in doc.spans]
spacy_doc.ents = [s for s in spans if s]
spacy_doc = nlp.get_pipe('ann_linker')(spacy_doc)
spacy_doc = nlp.get_pipe("ann_linker")(spacy_doc)

for i, ent in enumerate(spacy_doc.ents):
doc.spans[i].id = ent.kb_id_

res.documents.append(
LinkingRecord(spans=doc.spans, context=doc.context)
)

res.documents.append(LinkingRecord(spans=doc.spans, context=doc.context))

return res
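
The reformatted /link handler above reads body.documents, builds a spaCy Doc from each record's context, turns the submitted spans into entities, runs the ann_linker pipe, and writes each entity's kb_id_ back onto the matching span. As a quick illustration, here is a minimal client sketch; the JSON field names are inferred from that handler, while the requests dependency, host, and port 8080 (the port used by remote_ann_linker.py below) are assumptions.

# Hypothetical client for the /link endpoint above; field names inferred
# from app.py, host/port assumed.
import requests

payload = {
    "documents": [
        {
            "context": "NLP is a highly researched area of machine learning",
            "spans": [
                {"start": 0, "end": 3, "label": "SKILL"},
                {"start": 35, "end": 51, "label": "SKILL"},
            ],
        }
    ]
}

resp = requests.post("http://localhost:8080/link", json=payload)
resp.raise_for_status()
for doc in resp.json()["documents"]:
    for span in doc["spans"]:
        # Each span should now carry the linked knowledge-base id, if any.
        print(span.get("id"), span)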
1 change: 1 addition & 0 deletions docs/src/api/models.py
@@ -1,4 +1,5 @@
from typing import List

from pydantic import BaseModel


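The models.py change above is only an import reorder. For context, the request/response schema that app.py depends on looks roughly like the sketch below; this is a hedged reconstruction from how app.py accesses documents, context, spans, start, end, label, and id, and the class name LinkingSpan in particular is hypothetical, not necessarily what the real file uses.

# Hypothetical reconstruction of docs/src/api/models.py, inferred from usage
# in app.py; the actual file may differ.
from typing import List, Optional

from pydantic import BaseModel


class LinkingSpan(BaseModel):
    start: int
    end: int
    label: str
    id: Optional[str] = None  # filled with ent.kb_id_ by the /link handler


class LinkingRecord(BaseModel):
    spans: List[LinkingSpan]
    context: str


class LinkingRequest(BaseModel):
    documents: List[LinkingRecord]


class LinkingResponse(BaseModel):
    documents: List[LinkingRecord]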
8 changes: 4 additions & 4 deletions docs/src/local_ann_linker.py
@@ -1,5 +1,4 @@
import spacy
from spacy.tokens import Span

if __name__ == "__main__":

@@ -9,12 +8,13 @@
nlp = spacy.load(model_dir)

# The NER component of the en_core_web_md model doesn't actually
# recognize the aliases as entities so we'll add a
# spaCy EntityRuler component for now to extract them.
ruler = nlp.create_pipe('entity_ruler')
ruler = nlp.create_pipe("entity_ruler")
patterns = [
{"label": "SKILL", "pattern": alias}
for alias in nlp.get_pipe('ann_linker').kb.get_alias_strings() + ['machine learn']
for alias in nlp.get_pipe("ann_linker").kb.get_alias_strings()
+ ["machine learn"]
]
ruler.add_patterns(patterns)
nlp.add_pipe(ruler, before="ann_linker")
14 changes: 7 additions & 7 deletions docs/src/remote_ann_linker.py
@@ -2,22 +2,22 @@

if __name__ == "__main__":
nlp = spacy.blank("en")
aliases = ['machine learning', 'ML', 'NLP', 'researched']
ruler = nlp.create_pipe('entity_ruler', {"overwrite_ents": True})
aliases = ["machine learning", "ML", "NLP", "researched"]
ruler = nlp.create_pipe("entity_ruler", {"overwrite_ents": True})
patterns = [{"label": "SKILL", "pattern": alias} for alias in aliases]
ruler.add_patterns(patterns)

remote_ann_linker = nlp.create_pipe('remote_ann_linker', {
'base_url': "http://localhost:8080/link"
})
remote_ann_linker = nlp.create_pipe(
"remote_ann_linker", {"base_url": "http://localhost:8080/link"}
)
nlp.add_pipe(remote_ann_linker)

doc = nlp("NLP is a highly researched area of machine learning")
print([(e.text, e.label_, e.kb_id_) for e in doc.ents])

# Outputs:
# [('NLP', 'SKILL', 'a3'), ('Machine learning', 'SKILL', 'a1')]
#
# In our entities.jsonl file
# a3 => Natural Language Processing
# a1 => Machine learning
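
Note that remote_ann_linker.py assumes the FastAPI service from docs/src/api/app.py is reachable at the configured base_url. Since app.py no longer imports uvicorn, the server is launched externally; a minimal launcher sketch follows, where the port is an assumption chosen to match the base_url above.

# Hypothetical launcher for the linking API from docs/src/api/app.py.
# Equivalent to running `uvicorn app:app --port 8080` from docs/src/api.
import uvicorn

if __name__ == "__main__":
    uvicorn.run("app:app", host="0.0.0.0", port=8080)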
1 change: 1 addition & 0 deletions pyproject.toml
@@ -50,6 +50,7 @@ api = [
"python-dotenv"
]
test = [
"autoflake",
"click-completion",
"pytest >=4.4.0",
"pytest-cov",
6 changes: 3 additions & 3 deletions scripts/format.sh
@@ -1,6 +1,6 @@
#!/bin/sh -e
set -x

autoflake --remove-all-unused-imports --recursive --remove-unused-variables --in-place docs/src/ typer tests --exclude=__init__.py
black typer tests docs/src
isort --multi-line=3 --trailing-comma --force-grid-wrap=0 --combine-as --line-width 88 --recursive --thirdparty typer --apply typer tests docs/src
autoflake --remove-all-unused-imports --recursive --remove-unused-variables --in-place docs/src/ spacy_ann tests --exclude=__init__.py
black spacy_ann tests docs/src
isort --multi-line=3 --trailing-comma --force-grid-wrap=0 --combine-as --line-width 88 --recursive --thirdparty spacy_ann --apply spacy_ann tests docs/src
6 changes: 3 additions & 3 deletions spacy_ann/__init__.py
@@ -3,15 +3,15 @@
"""spaCy ANN Linker, a pipeline component for generating spaCy KnowledgeBase Alias Candidates for Entity Linking."""


__version__ = '0.2.0'
__version__ = "0.2.0"

from .ann_linker import AnnLinker
from .remote_ann_linker import RemoteAnnLinker

# TODO: Uncomment (and probably fix a bit) once this PR is merged upstream
# https://github.com/explosion/spaCy/pull/4988 to enable kb registry with
# customizable `get_candidates` function
#
# from spacy.kb import KnowledgeBase
# from spacy.tokens import Span
# from spacy.util import registry
82 changes: 50 additions & 32 deletions spacy_ann/ann_kb.py
@@ -1,35 +1,31 @@
from typing import List, Dict, Set, Tuple
import json
from collections import defaultdict
from pathlib import Path
from timeit import default_timer as timer
from typing import List, Set, Tuple

import joblib
import nmslib
from nmslib.dist import FloatIndex
import numpy as np
from preshed.maps import PreshMap
import scipy
import srsly
from nmslib.dist import FloatIndex
from sklearn.feature_extraction.text import TfidfVectorizer
import spacy
from spacy.kb import Candidate, KnowledgeBase
from spacy.tokens import Doc, Span
from spacy.util import ensure_path, to_disk, from_disk
from spacy.kb import KnowledgeBase
from spacy.util import ensure_path, from_disk, to_disk
from spacy.vocab import Vocab
import srsly
from timeit import default_timer as timer
from wasabi import Printer
from spacy_ann.types import AliasCandidate
from wasabi import Printer


class AnnKnowledgeBase(KnowledgeBase):
def __init__(self,
def __init__(
self,
vocab: Vocab,
entity_vector_length: int = 64,
k: int = 1,
m_parameter: int = 100,
ef_search: int = 200,
ef_construction: int = 2000,
n_threads: int = 60
n_threads: int = 60,
):
"""Initialize a CandidateGenerator
@@ -56,7 +52,7 @@ def _initialize(
short_aliases: Set[str],
ann_index: FloatIndex,
vectorizer: TfidfVectorizer,
alias_tfidfs: scipy.sparse.csr_matrix
alias_tfidfs: scipy.sparse.csr_matrix,
):
"""Used in `fit` and `from_disk` to initialize the CandidateGenerator with computed
# TF-IDF Vectorizer and ANN Index
@@ -97,7 +93,7 @@ def fit_index(self, verbose: bool = True):
# resulting vectors using float16, meaning they take up half the memory on disk. Unfortunately
# we can't use the float16 format to actually run the vectorizer, because of this bug in sparse
# matrix representations in scipy: https://github.com/scipy/scipy/issues/7408

msg.text(f"Fitting tfidf vectorizer on {len(kb_aliases)} aliases")
tfidf_vectorizer = TfidfVectorizer(
analyzer="char_wb", ngram_range=(3, 3), min_df=2, dtype=np.float32
@@ -109,7 +105,9 @@ def fit_index(self, verbose: bool = True):
msg.text(f"Fitting and saving vectorizer took {round(total_time)} seconds")

msg.text(f"Finding empty (all zeros) tfidf vectors")
empty_tfidfs_boolean_flags = np.array(alias_tfidfs.sum(axis=1) != 0).reshape(-1,)
empty_tfidfs_boolean_flags = np.array(alias_tfidfs.sum(axis=1) != 0).reshape(
-1,
)
number_of_non_empty_tfidfs = sum(
empty_tfidfs_boolean_flags == False
) # pylint: disable=singleton-comparison
@@ -119,14 +117,18 @@ def fit_index(self, verbose: bool = True):
f"Deleting {number_of_non_empty_tfidfs}/{total_number_of_tfidfs} aliases because their tfidf is empty"
)
# remove empty tfidf vectors, otherwise nmslib will crash
aliases = [alias for alias, flag in zip(kb_aliases, empty_tfidfs_boolean_flags) if flag]
aliases = [
alias for alias, flag in zip(kb_aliases, empty_tfidfs_boolean_flags) if flag
]
alias_tfidfs = alias_tfidfs[empty_tfidfs_boolean_flags]
assert len(aliases) == np.size(alias_tfidfs, 0)

msg.text(f"Fitting ann index on {len(aliases)} aliases")
start_time = timer()
ann_index = nmslib.init(
method="hnsw", space="cosinesimil_sparse", data_type=nmslib.DataType.SPARSE_VECTOR
method="hnsw",
space="cosinesimil_sparse",
data_type=nmslib.DataType.SPARSE_VECTOR,
)
ann_index.addDataPointBatch(alias_tfidfs)
ann_index.createIndex(index_params, print_progress=verbose)
@@ -136,7 +138,9 @@ def fit_index(self, verbose: bool = True):
total_time = end_time - start_time
msg.text(f"Fitting ann index took {round(total_time)} seconds")

self._initialize(aliases, short_aliases, ann_index, tfidf_vectorizer, alias_tfidfs)
self._initialize(
aliases, short_aliases, ann_index, tfidf_vectorizer, alias_tfidfs
)
return self

def _nmslib_knn_with_zero_vectors(
Expand All @@ -154,7 +158,7 @@ def _nmslib_knn_with_zero_vectors(
k (int): k neighbors to consider
RETURNS (Tuple[np.ndarray, np.ndarray]): Tuple of [neighbors, distances]
"""
"""

empty_vectors_boolean_flags = np.array(vectors.sum(axis=1) != 0).reshape(-1,)
empty_vectors_count = vectors.shape[0] - sum(empty_vectors_boolean_flags)
@@ -172,7 +176,9 @@ def _nmslib_knn_with_zero_vectors(
# call `knnQueryBatch` to get neighbors
original_neighbours = self.ann_index.knnQueryBatch(vectors, k=k)

neighbors, distances = zip(*[(x[0].tolist(), x[1].tolist()) for x in original_neighbours])
neighbors, distances = zip(
*[(x[0].tolist(), x[1].tolist()) for x in original_neighbours]
)
neighbors = list(neighbors)
distances = list(distances)

@@ -205,9 +211,11 @@ def get_alias_candidates(self, mention_texts: List[str]):
# `ann_index.knnQueryBatch` crashes if one of the vectors is all zeros.
# `nmslib_knn_with_zero_vectors` is a wrapper around `ann_index.knnQueryBatch`
# that addresses this issue.
batch_neighbors, batch_distances = self._nmslib_knn_with_zero_vectors(tfidfs, self.k)
batch_neighbors, batch_distances = self._nmslib_knn_with_zero_vectors(
tfidfs, self.k
)
end_time = timer()
total_time = end_time - start_time
end_time - start_time

batch_candidates = []
for mention, neighbors, distances in zip(
@@ -225,7 +233,9 @@ def get_alias_candidates(self, mention_texts: List[str]):
for neighbor_index, distance in zip(neighbors, distances):
alias = self.aliases[neighbor_index]
similarity = 1.0 - distance
alias_candidates.append(AliasCandidate(alias=alias, similarity=similarity))
alias_candidates.append(
AliasCandidate(alias=alias, similarity=similarity)
)

batch_candidates.append(alias_candidates)

@@ -258,14 +268,18 @@ def dump(self, path: Path):
"m_parameter": self.m_parameter,
"ef_search": self.ef_search,
"ef_construction": self.ef_construction,
"n_threads": self.n_threads
"n_threads": self.n_threads,
}
serializers = {
"cg_cfg": lambda p: srsly.write_json(p, cfg),
"aliases": lambda p: srsly.write_json(p.with_suffix(".json"), self.aliases),
"short_aliases": lambda p: srsly.write_json(p.with_suffix(".json"), self.short_aliases),
"short_aliases": lambda p: srsly.write_json(
p.with_suffix(".json"), self.short_aliases
),
"ann_index": lambda p: self.ann_index.saveIndex(str(p.with_suffix(".bin"))),
"tfidf_vectorizer": lambda p: joblib.dump(self.vectorizer, p.with_suffix(".joblib")),
"tfidf_vectorizer": lambda p: joblib.dump(
self.vectorizer, p.with_suffix(".joblib")
),
"tfidf_vectors_sparse": lambda p: scipy.sparse.save_npz(
p.with_suffix(".npz"), self.alias_tfidfs.astype(np.float16)
),
@@ -275,7 +289,7 @@ def dump(self, path: Path):

def load_bulk(self, path: Path):
path = ensure_path(path)

super().load_bulk(path)

aliases_path = path / "aliases.json"
@@ -299,13 +313,17 @@ def load_bulk(self, path: Path):
tfidf_vectorizer = joblib.load(tfidf_vectorizer_path)
alias_tfidfs = scipy.sparse.load_npz(tfidf_vectors_path).astype(np.float32)
ann_index = nmslib.init(
method="hnsw", space="cosinesimil_sparse", data_type=nmslib.DataType.SPARSE_VECTOR
method="hnsw",
space="cosinesimil_sparse",
data_type=nmslib.DataType.SPARSE_VECTOR,
)
ann_index.addDataPointBatch(alias_tfidfs)
ann_index.loadIndex(str(ann_index_path))
query_time_params = {"efSearch": self.ef_search}
ann_index.setQueryTimeParams(query_time_params)

self._initialize(aliases, short_aliases, ann_index, tfidf_vectorizer, alias_tfidfs)
self._initialize(
aliases, short_aliases, ann_index, tfidf_vectorizer, alias_tfidfs
)

return self
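
Most of the reformatting in ann_kb.py wraps long calls in the same TF-IDF + nmslib workflow: fit a character 3-gram vectorizer over the alias strings, drop all-zero rows (nmslib crashes on empty sparse vectors), index the remaining rows in an HNSW cosine index, and query it in batch, converting distances to similarities. The sketch below walks through that workflow end to end using the nmslib and scikit-learn APIs referenced in the diff; the alias list and index parameters are illustrative, not the package defaults.

# Minimal sketch of the TF-IDF + nmslib pattern used by AnnKnowledgeBase.
import nmslib
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

aliases = ["machine learning", "ML", "natural language processing", "NLP"]

# Character 3-gram TF-IDF over alias strings, as in fit_index above.
vectorizer = TfidfVectorizer(analyzer="char_wb", ngram_range=(3, 3), dtype=np.float32)
alias_tfidfs = vectorizer.fit_transform(aliases)

# Drop all-zero rows, otherwise nmslib will crash.
non_empty = np.array(alias_tfidfs.sum(axis=1) != 0).reshape(-1,)
aliases = [a for a, keep in zip(aliases, non_empty) if keep]
alias_tfidfs = alias_tfidfs[non_empty]

# Build the sparse cosine HNSW index and query it in batch.
index = nmslib.init(
    method="hnsw",
    space="cosinesimil_sparse",
    data_type=nmslib.DataType.SPARSE_VECTOR,
)
index.addDataPointBatch(alias_tfidfs)
index.createIndex({"efConstruction": 2000, "M": 100}, print_progress=False)
index.setQueryTimeParams({"efSearch": 200})

query_tfidfs = vectorizer.transform(["machine learn"])
neighbors, distances = zip(*index.knnQueryBatch(query_tfidfs, k=2))
for idx, dist in zip(neighbors[0], distances[0]):
    print(aliases[idx], 1.0 - dist)  # similarity = 1 - cosine distance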
