/
base.py
103 lines (69 loc) · 2.32 KB
/
base.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
from abc import ABC, abstractmethod
from typing import Iterable
import nltk
from spacy.tokens import Doc
from oldp.apps.nlp.language_models import GermanSpacyModel, EnglishSpacyModel, SpacyModel
class DocBase(ABC):
    """Abstract interface of a processed document.

    Concrete implementations in this file are ``ArrayDoc`` and ``SpacyDoc``.
    """

    # Raw text of the document; the subclasses in this file assign it per
    # instance in their constructors.
    text = None

    @abstractmethod
    def tokens(self) -> Iterable[str]:
        """Return the tokens of the document as strings."""

    @abstractmethod
    def lemmas(self) -> Iterable[str]:
        """Return the lemmas of the document as strings."""

    @abstractmethod
    def ents(self, entity_type: str) -> Iterable:
        """Return the entities of type ``entity_type`` found in the document.

        The element type is intentionally left open: ``SpacyDoc.ents`` yields
        ``(text, start_char, end_char)`` tuples rather than plain strings.

        :param entity_type: name of the entity type to look for.
        """
class ArrayDoc(DocBase):
    """Document backed by an already computed collection of tokens.

    Only ``tokens`` is supported; ``lemmas`` and ``ents`` raise
    ``NotImplementedError``.
    """

    def __init__(self, text: str, tokens: Iterable[str]):
        """Store the raw text together with its tokens.

        :param text: raw text of the document.
        :param tokens: tokens of ``text``; kept as-is and handed out
            unchanged by ``tokens()``.
        """
        self.text = text
        self._tokens = tokens

    def tokens(self) -> Iterable[str]:
        """Return the tokens passed to the constructor (same object)."""
        return self._tokens

    def lemmas(self) -> Iterable[str]:
        """Not available for this document type.

        :raises NotImplementedError: always.
        """
        raise NotImplementedError

    def ents(self, entity_type: str) -> Iterable[str]:
        """Not available for this document type.

        :param entity_type: unused.
        :raises NotImplementedError: always.
        """
        raise NotImplementedError
class SpacyDoc(DocBase):
    """Document backed by a spaCy ``Doc`` and the model that produced it."""

    def __init__(self, text: str, doc: Doc, model: SpacyModel):
        """Store the raw text, the spaCy document and the model.

        :param text: raw text of the document.
        :param doc: spaCy document; iterated for tokens and lemmas, and its
            ``ents`` attribute is used for entities.
        :param model: model wrapper whose ``entity_name`` translates an
            entity type into the label compared against ``ent.label_``.
        """
        self.text = text
        self.doc = doc
        self.model = model

    def tokens(self) -> Iterable[str]:
        """Return a generator over the ``text`` of every token in the doc."""
        return (token.text for token in self.doc)

    def lemmas(self) -> Iterable[str]:
        """Return a generator over the ``lemma_`` of every token in the doc."""
        return (token.lemma_ for token in self.doc)

    def ents(self, entity_type: str) -> Iterable[tuple]:
        """Yield the entities of the doc that match ``entity_type``.

        :param entity_type: entity type; translated to a label through
            ``spacy_entity_name``.
        :return: generator of ``(text, start_char, end_char)`` tuples, one
            per entity whose ``label_`` equals the translated label.
        """
        # The label does not depend on the entity, so resolve it once (when
        # iteration starts) instead of once per entity.
        label = self.spacy_entity_name(entity_type)
        for ent in self.doc.ents:
            if ent.label_ == label:
                yield (ent.text, ent.start_char, ent.end_char)

    def spacy_entity_name(self, entity_type):
        """Return ``self.model.entity_name(entity_type)``."""
        return self.model.entity_name(entity_type)
class NLPBase(ABC):
    """Abstract base of the NLP pipelines that turn text into a ``DocBase``."""

    def __init__(self, lang='de'):
        """Remember the language code the pipeline was created for.

        :param lang: language code, stored as ``self.lang``.
        """
        self.lang = lang

    @abstractmethod
    def process(self, text: str) -> DocBase:
        """Process ``text`` and return the resulting document.

        :param text: raw text to process.
        """
class SpacyNLP(NLPBase):
    """NLP pipeline that processes text with a loaded spaCy model."""

    # Both are replaced by per-instance values in __init__.
    nlp = None
    model = None

    def __init__(self, lang='de'):
        """Pick the model wrapper for ``lang`` and load it.

        :param lang: ``'de'`` selects ``GermanSpacyModel``, ``'en'`` selects
            ``EnglishSpacyModel``.
        :raises ValueError: if ``lang`` is neither ``'de'`` nor ``'en'``.
        """
        super().__init__(lang=lang)

        # Reject unsupported languages up front.
        if lang not in ('de', 'en'):
            raise ValueError('Unsupported language {}'.format(lang))

        model_cls = GermanSpacyModel if lang == 'de' else EnglishSpacyModel
        self.model = model_cls()
        self.nlp = self.model.load()

    def process(self, text: str) -> DocBase:
        """Run the loaded pipeline on ``text`` and wrap the result.

        :param text: raw text to process.
        :return: a ``SpacyDoc`` holding the text, the pipeline output and the
            model wrapper.
        """
        return SpacyDoc(text, self.nlp(text), self.model)
class NltkNLP(NLPBase):
    """NLP pipeline that tokenizes text with NLTK.

    The documents it produces are ``ArrayDoc`` instances, so only tokens are
    available on them.
    """

    # Language codes used by this module mapped to the language names that
    # ``nltk.word_tokenize`` expects in its ``language`` argument.
    _NLTK_LANGUAGES = {'en': 'english', 'de': 'german'}

    def __init__(self, lang='en'):
        """Remember the language code used for tokenization.

        :param lang: language code; see ``_NLTK_LANGUAGES`` for the codes
            that select a language-specific tokenizer model.
        """
        super().__init__(lang=lang)

    def process(self, text: str) -> DocBase:
        """Tokenize ``text`` and return it as an ``ArrayDoc``.

        :param text: raw text to tokenize.
        :return: an ``ArrayDoc`` holding the text and its tokens.
        """
        # Codes without a mapping fall back to 'english', the default of
        # ``nltk.word_tokenize``, so they are tokenized as if no language
        # had been given.
        language = self._NLTK_LANGUAGES.get(self.lang, 'english')
        tokens = nltk.word_tokenize(text, language=language)
        return ArrayDoc(text, tokens)