TF2 porting: category feature #667

Merged

29 commits
fc4cabe
feat: start of category feature
jimthompson5802 Mar 28, 2020
0e3013c
feat: add parameter for initializing decoder
jimthompson5802 Mar 28, 2020
bde494e
feat: add loss function and loss metric
jimthompson5802 Mar 29, 2020
21db8e3
feat: partial embedder implementation
jimthompson5802 Mar 29, 2020
9153e8e
feat: complete dense embedding implementation
jimthompson5802 Mar 29, 2020
d88a1b8
refactor: comment out obsolete code. To be deleted
jimthompson5802 Mar 29, 2020
cd78001
feat: add probabilties and predictions to predictions() method
jimthompson5802 Mar 29, 2020
7d414c8
Merge remote-tracking branch 'upstream/tf2_porting' into tf2_categori…
jimthompson5802 Mar 29, 2020
e1626f7
feat: add accuracy metric
jimthompson5802 Mar 29, 2020
553da68
refactor: code clean-up
jimthompson5802 Mar 29, 2020
66464c5
refactor: code clean-up
jimthompson5802 Mar 30, 2020
1c2f270
refactor: change default validation metric
jimthompson5802 Mar 30, 2020
7312589
refactor: add todo
jimthompson5802 Mar 30, 2020
8f581d5
refactor: softmax loss parameter changes
jimthompson5802 Mar 31, 2020
3c70152
Merge remote-tracking branch 'upstream/tf2_porting' into tf2_categori…
jimthompson5802 Apr 1, 2020
4ca282e
feat: add partial implementation of categorical sparse encoder
jimthompson5802 Apr 1, 2020
5fc8bf8
feat: initial setup for sampled softmax cross entropy loss
jimthompson5802 Apr 1, 2020
270a04b
feat: continued sampled softmax cross entropy loss
jimthompson5802 Apr 2, 2020
b7cb2bb
feat: test for handling decoder_last_hidden
jimthompson5802 Apr 3, 2020
0ec7651
feat: initial working sampled softmax cross entropy loss function
jimthompson5802 Apr 3, 2020
cbd6955
feat: category feature working softmax ce loss and sampled softmax ce…
jimthompson5802 Apr 3, 2020
2e467ef
refactor: rename custom loss wrapper class
jimthompson5802 Apr 4, 2020
6d8bb23
refactor: add custom wrapper class for MSE metric to support revised …
jimthompson5802 Apr 4, 2020
4661cdf
feat: add new constants
jimthompson5802 Apr 4, 2020
626e512
feat: retrofit MAE to support new prediction format
jimthompson5802 Apr 4, 2020
0044692
feat: add TYPE constant
jimthompson5802 Apr 4, 2020
0ffe3c5
feat: adapt to new predictions format
jimthompson5802 Apr 4, 2020
c5cb9dc
feat: add test for feature encoder, decoder and loss function
jimthompson5802 Apr 5, 2020
73ef884
fix: change format of <feature_name>_probabilities.csv from tensors t…
jimthompson5802 Apr 5, 2020
99 changes: 83 additions & 16 deletions ludwig/features/category_feature.py
@@ -27,6 +27,10 @@
from ludwig.features.base_feature import OutputFeature
from ludwig.models.modules.embedding_modules import Embed
from ludwig.models.modules.initializer_modules import get_initializer
from ludwig.models.modules.category_decoders import Regressor
from ludwig.models.modules.category_encoders import CategoricalEmbedEncoder
from ludwig.models.modules.category_encoders import CategoricalSparseEncoder
from ludwig.models.modules.category_encoders import CategoricalPassthroughEncoder
from ludwig.models.modules.loss_modules import mean_confidence_penalty
from ludwig.models.modules.loss_modules import sampled_softmax_cross_entropy
from ludwig.models.modules.loss_modules import weighted_softmax_cross_entropy
@@ -98,8 +102,9 @@ def add_feature_data(


class CategoryInputFeature(CategoryBaseFeature, InputFeature):
    def __init__(self, feature):
        super().__init__(feature)
    def __init__(self, feature, encoder_obj=None):
        CategoryBaseFeature.__init__(self, feature)
        InputFeature.__init__(self)

        self.vocab = []

@@ -112,20 +117,43 @@ def __init__(self, feature):
        self.initializer = None
        self.regularize = True

        _ = self.overwrite_defaults(feature)

        self.embed = Embed(
            vocab=self.vocab,
            embedding_size=self.embedding_size,
            representation=self.representation,
            embeddings_trainable=self.embeddings_trainable,
            pretrained_embeddings=self.pretrained_embeddings,
            embeddings_on_cpu=self.embeddings_on_cpu,
            dropout=self.dropout,
            initializer=self.initializer,
            regularize=self.regularize
        )

        # _ = self.overwrite_defaults(feature)
        #
        # self.embed = Embed(
        #     vocab=self.vocab,
        #     embedding_size=self.embedding_size,
        #     representation=self.representation,
        #     embeddings_trainable=self.embeddings_trainable,
        #     pretrained_embeddings=self.pretrained_embeddings,
        #     embeddings_on_cpu=self.embeddings_on_cpu,
        #     dropout=self.dropout,
        #     initializer=self.initializer,
        #     regularize=self.regularize
        # )

        self.encoder = self.representation
        encoder_parameters = self.overwrite_defaults(feature)

        if encoder_obj:
            self.encoder_obj = encoder_obj
        else:
            self.encoder_obj = self.initialize_encoder(encoder_parameters)

    def call(self, inputs, training=None, mask=None):
        assert isinstance(inputs, tf.Tensor)
        assert inputs.dtype == tf.int8 or inputs.dtype == tf.int16 or \
            inputs.dtype == tf.int32 or inputs.dtype == tf.float64
        assert len(inputs.shape) == 1

        inputs_exp = inputs[:, tf.newaxis]
        inputs_encoded = self.encoder_obj(
            inputs_exp, training=training, mask=mask
        )

        return inputs_encoded
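
As a quick illustration of the shape handling in call (the values below are assumed for the example, not taken from the PR): a rank-1 batch of category ids is expanded to [batch x 1] before being handed to the encoder object.

import tensorflow as tf

ids = tf.constant([3, 0, 7], dtype=tf.int32)  # rank-1 batch of category ids, shape [3]
ids_exp = ids[:, tf.newaxis]                  # expanded to shape [3, 1] for the encoder
print(ids_exp.shape)                          # (3, 1)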


    @staticmethod
    def update_model_definition_with_metadata(
            input_feature,
@@ -175,18 +203,49 @@ def build_input(
    def populate_defaults(input_feature):
        set_default_value(input_feature, TIED, None)

encoder_registry = {
    'dense': CategoricalEmbedEncoder,
    'sparse': CategoricalSparseEncoder,
    'passthrough': CategoricalPassthroughEncoder,
    'null': CategoricalPassthroughEncoder,
    'none': CategoricalPassthroughEncoder,
    'None': CategoricalPassthroughEncoder,
    None: CategoricalPassthroughEncoder
}


class CategoryOutputFeature(CategoryBaseFeature, OutputFeature):
    def __init__(self, feature):
        super().__init__(feature)
        CategoryBaseFeature.__init__(self, feature)
        OutputFeature.__init__(self, feature)

        self.loss = {'type': SOFTMAX_CROSS_ENTROPY}
        self.num_classes = 0
        self.top_k = 3
        self.initializer = None
        self.regularize = True

        _ = self.overwrite_defaults(feature)
        self.decoder = 'regressor'
        decoder_parameters = self.overwrite_defaults(feature)

        self.decoder_obj = self.initialize_decoder(decoder_parameters)

        self._setup_loss()
        self._setup_metrics()

    def logits(
            self,
            inputs,  # hidden
    ):
        return self.decoder_obj(inputs)

    def _setup_loss(self):
        pass

    def _setup_metrics(self):
        pass

    default_validation_metric = ACCURACY

@@ -714,3 +773,11 @@ def populate_defaults(output_feature):
            'reduce_dependencies': SUM
        }
    )

decoder_registry = {
    'regressor': Regressor,
    'null': Regressor,
    'none': Regressor,
    'None': Regressor,
    None: Regressor
}

Collaborator review comment (on 'regressor': Regressor): classifier
33 changes: 33 additions & 0 deletions ludwig/models/modules/category_decoders.py
@@ -0,0 +1,33 @@
#! /usr/bin/env python
# coding=utf-8
# Copyright (c) 2019 Uber Technologies, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
import logging

import tensorflow as tf
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Layer

logger = logging.getLogger(__name__)


class Regressor(Layer):

    def __init__(self, **kwargs):
        super().__init__()
        self.dense = Dense(4)  # todo add initialization etc. Hard coded for debugging

    def call(self, inputs, **kwargs):
        return tf.squeeze(self.dense(inputs))
Collaborator review comment: is squeeze needed?
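
To the reviewer's question, a quick check (illustrative, not part of the PR): tf.squeeze only removes axes of size 1, so on a [batch x 4] output it is a no-op, but with an output size of 1 (or a batch of 1) it silently drops an axis.

import tensorflow as tf
from tensorflow.keras.layers import Dense

x = tf.zeros([8, 16])                  # [batch=8, hidden=16]
print(tf.squeeze(Dense(4)(x)).shape)   # (8, 4) -- unchanged, no size-1 axis
print(tf.squeeze(Dense(1)(x)).shape)   # (8,)   -- size-1 output axis dropped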

72 changes: 72 additions & 0 deletions ludwig/models/modules/category_encoders.py
@@ -0,0 +1,72 @@
#! /usr/bin/env python
# coding=utf-8
# Copyright (c) 2019 Uber Technologies, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
import logging

import tensorflow as tf
from tensorflow.keras.layers import Layer

logger = logging.getLogger(__name__)


class CategoricalPassthroughEncoder(Layer):

    def __init__(
            self,
            **kwargs
    ):
        super(CategoricalPassthroughEncoder, self).__init__()

    def call(self, inputs, training=None, mask=None):
        """
        :param inputs: The inputs fed into the encoder.
            Shape: [batch x 1], type tf.int32
        """
        return inputs


class CategoricalEmbedEncoder(Layer):

    def __init__(
            self,
            **kwargs
    ):
        super(CategoricalEmbedEncoder, self).__init__()

    def call(self, inputs, training=None, mask=None):
        """
        :param inputs: The inputs fed into the encoder.
            Shape: [batch x 1], type tf.int32
        :param return: embeddings of shape [batch x embed size], type tf.float32
        """
        return tf.cast(inputs, dtype=tf.float32)

Collaborator review comment (on the tf.int32 input type): should probably work for any integer precision
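
Supporting the comment above, tf.cast accepts any integer precision, so the encoder need not pin int32 (a small check, not part of the PR):

import tensorflow as tf

for dtype in (tf.int8, tf.int16, tf.int32, tf.int64):
    ids = tf.constant([[1], [2]], dtype=dtype)       # [batch x 1] category ids
    assert tf.cast(ids, tf.float32).dtype == tf.float32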

class CategoricalSparseEncoder(Layer):

    def __init__(
            self,
            **kwargs
    ):
        super(CategoricalSparseEncoder, self).__init__()

    def call(self, inputs, training=None, mask=None):
        """
        :param inputs: The inputs fed into the encoder.
            Shape: [batch x 1], type tf.int32
        :param return: one-hot encoding, shape [batch x number classes], type tf.int32
        """
        # partial implementation (commit 4ca282e): the one-hot encoding the
        # docstring describes is not applied yet; currently a passthrough
        return inputs
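
The docstring promises a one-hot encoding while the body is still a passthrough; a minimal sketch of the promised behavior (one_hot_encode and its num_classes parameter are hypothetical, not part of this commit):

import tensorflow as tf

def one_hot_encode(inputs, num_classes):
    # [batch x 1] integer ids -> [batch x num_classes] one-hot, tf.int32
    return tf.one_hot(
        tf.squeeze(inputs, axis=-1), depth=num_classes, dtype=tf.int32
    )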