Skip to content

Commit

Permalink
#347 related update. Wiped predefined vectorizers
Browse files Browse the repository at this point in the history
  • Loading branch information
nicolay-r committed Jun 28, 2022
1 parent 368eba5 commit 5e0ad81
Show file tree
Hide file tree
Showing 3 changed files with 35 additions and 27 deletions.
13 changes: 13 additions & 0 deletions arekit/contrib/networks/core/input/term_types.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
class TermTypes(object):
    """ Types of input terms that may occur within the
        input sequence of the neural network models.
    """

    # String identifiers for every supported input-term type.
    WORD = "word"
    ENTITY = "entity"
    FRAME = "frame"
    TOKEN = "token"

    @staticmethod
    def iter_types():
        """ Returns a list of all the supported term types,
            in a fixed, deterministic order.
        """
        return [TermTypes.WORD, TermTypes.ENTITY, TermTypes.FRAME, TermTypes.TOKEN]
21 changes: 8 additions & 13 deletions arekit/contrib/networks/core/input/terms_mapping.py
Original file line number Diff line number Diff line change
@@ -1,17 +1,13 @@
from arekit.common.data.input.terms_mapper import OpinionContainingTextTermsMapper
from arekit.common.entities.base import Entity
from arekit.common.frames.text_variant import TextFrameVariant
from arekit.contrib.networks.core.input.term_types import TermTypes


class StringWithEmbeddingNetworkTermMapping(OpinionContainingTextTermsMapper):
""" For every element returns: (word, embedded vector)
"""

WORD = "word"
ENTITY = "entity"
FRAME = "frame"
TOKEN = "token"

def __init__(self, string_entities_formatter, vectorizers):
"""
string_emb_entity_formatter:
Expand All @@ -20,31 +16,30 @@ def __init__(self, string_entities_formatter, vectorizers):
dict
"""
assert(isinstance(vectorizers, dict))
assert(self.WORD in vectorizers)
assert(self.ENTITY in vectorizers)
assert(self.FRAME in vectorizers)
assert(self.TOKEN in vectorizers)

for term_type in TermTypes.iter_types():
assert(term_type in vectorizers)

super(StringWithEmbeddingNetworkTermMapping, self).__init__(
entity_formatter=string_entities_formatter)

self.__vectorizers = vectorizers

def map_word(self, w_ind, word):
    """ Maps a plain word term onto its (value, embedded vector) pair
        using the vectorizer registered for TermTypes.WORD.
        w_ind: index of the word within the input sequence (unused here).
    """
    # Defect fixed: the span contained both the pre-commit lookup via the
    # removed `self.WORD` attribute and the updated TermTypes-based lookup;
    # only the TermTypes-based call is kept.
    value, vector = self.__vectorizers[TermTypes.WORD].create_term_embedding(term=word)
    return value, vector

def map_text_frame_variant(self, fv_ind, text_frame_variant):
    """ Maps a frame variant term onto its (value, embedded vector) pair
        using the vectorizer registered for TermTypes.FRAME.
        fv_ind: index of the frame variant within the input sequence (unused here).
    """
    assert(isinstance(text_frame_variant, TextFrameVariant))
    # Defect fixed: the span contained both the pre-commit lookup via the
    # removed `self.FRAME` attribute and the updated TermTypes-based lookup;
    # only the TermTypes-based call is kept.
    value, vector = self.__vectorizers[TermTypes.FRAME].create_term_embedding(
        term=text_frame_variant.Variant.get_value())
    return value, vector

def map_token(self, t_ind, token):
    """ Maps an auxiliary token term onto its (value, vector) pair
        using the vectorizer registered for TermTypes.TOKEN.
        It assumes to be composed for all the supported types.
    """
    value = token.get_token_value()
    # Defect fixed: the span contained both the pre-commit lookup via the
    # removed `self.TOKEN` attribute and the updated TermTypes-based lookup;
    # only the TermTypes-based call is kept.
    # NOTE(review): unlike the other map_* methods, the result of
    # create_term_embedding(...) is assigned to `vector` as a whole rather
    # than unpacked into (value, vector) -- presumably the TOKEN vectorizer
    # returns just a vector; confirm against the vectorizer implementation.
    vector = self.__vectorizers[TermTypes.TOKEN].create_term_embedding(term=t_ind)
    return value, vector

def map_entity(self, e_ind, entity):
Expand All @@ -56,6 +51,6 @@ def map_entity(self, e_ind, entity):
entity=entity)

# Vector extraction
emb_word, vector = self.__vectorizers[self.ENTITY].create_term_embedding(term=str_formatted_entity)
emb_word, vector = self.__vectorizers[TermTypes.ENTITY].create_term_embedding(term=str_formatted_entity)

return emb_word, vector
28 changes: 14 additions & 14 deletions arekit/contrib/networks/handlers/serializer.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,13 +14,12 @@
from arekit.contrib.networks.embedding import Embedding
from arekit.contrib.utils.model_io.tf_networks import DefaultNetworkIOUtils
from arekit.contrib.utils.serializer import InputDataSerializationHelper
from arekit.contrib.utils.vectorizers.bpe import BPEVectorizer
from arekit.contrib.utils.vectorizers.random_norm import RandomNormalVectorizer


class NetworksInputSerializerExperimentIteration(ExperimentIterationHandler):

def __init__(self, data_type_pipelines, save_labels_func, exp_ctx, exp_io, doc_ops, balance):
def __init__(self, data_type_pipelines, vectorizers,
save_labels_func, exp_ctx, exp_io, doc_ops, balance):
""" This handler allows to perform a data preparation for neural network models.
considering a list of the whole data_types with the related pipelines,
Expand All @@ -31,6 +30,14 @@ def __init__(self, data_type_pipelines, save_labels_func, exp_ctx, exp_io, doc_o
balance: bool
declares whether there is a need to balance Train samples
vectorizers: dict in which for every type there is an assigned Vectorizer
vectorization of term types.
{
TermType.Word: Vectorizer,
TermType.Entity: Vectorizer,
...
}
save_labels_func: function
data_type -> bool
Expand All @@ -47,15 +54,17 @@ def __init__(self, data_type_pipelines, save_labels_func, exp_ctx, exp_io, doc_o
assert(isinstance(exp_ctx, NetworkSerializationContext))
assert(isinstance(exp_io, DefaultNetworkIOUtils))
assert(isinstance(doc_ops, DocumentOperations))
assert(isinstance(vectorizers, dict))
assert(isinstance(balance, bool))
super(NetworksInputSerializerExperimentIteration, self).__init__()

self.__data_type_pipelines = data_type_pipelines
self.__exp_ctx = exp_ctx
self.__exp_io = exp_io
self.__doc_ops = doc_ops
self.__balance = balance
self.__save_labels_func = save_labels_func
self.__vectorizers = vectorizers
self.__balance = balance

# region protected methods

Expand Down Expand Up @@ -87,17 +96,8 @@ def on_iteration(self, iter_index):

term_embedding_pairs = collections.OrderedDict()

bpe_vectorizer = BPEVectorizer(embedding=self.__exp_ctx.WordEmbedding, max_part_size=3)
norm_vectorizer = RandomNormalVectorizer(vector_size=self.__exp_ctx.WordEmbedding.VectorSize,
token_offset=12345)

text_terms_mapper = StringWithEmbeddingNetworkTermMapping(
vectorizers={
StringWithEmbeddingNetworkTermMapping.WORD: bpe_vectorizer,
StringWithEmbeddingNetworkTermMapping.ENTITY: bpe_vectorizer,
StringWithEmbeddingNetworkTermMapping.FRAME: bpe_vectorizer,
StringWithEmbeddingNetworkTermMapping.TOKEN: norm_vectorizer
},
vectorizers=self.__vectorizers,
string_entities_formatter=self.__exp_ctx.StringEntityFormatter)

text_provider = NetworkSingleTextProvider(
Expand Down

0 comments on commit 5e0ad81

Please sign in to comment.