Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

TF2 porting: category feature #667

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
29 commits
Select commit. Hold shift + click to select a range
fc4cabe
feat: start of category feature
jimthompson5802 Mar 28, 2020
0e3013c
feat: add parameter for initializing decoder
jimthompson5802 Mar 28, 2020
bde494e
feat: add loss function and loss metric
jimthompson5802 Mar 29, 2020
21db8e3
feat: partial embedder implementation
jimthompson5802 Mar 29, 2020
9153e8e
feat: complete dense embedding implementation
jimthompson5802 Mar 29, 2020
d88a1b8
refactor: comment out obsolete code. To be deleted
jimthompson5802 Mar 29, 2020
cd78001
feat: add probabilities and predictions to predictions() method
jimthompson5802 Mar 29, 2020
7d414c8
Merge remote-tracking branch 'upstream/tf2_porting' into tf2_categori…
jimthompson5802 Mar 29, 2020
e1626f7
feat: add accuracy metric
jimthompson5802 Mar 29, 2020
553da68
refactor: code clean-up
jimthompson5802 Mar 29, 2020
66464c5
refactor: code clean-up
jimthompson5802 Mar 30, 2020
1c2f270
refactor: change default validation metric
jimthompson5802 Mar 30, 2020
7312589
refactor: add todo
jimthompson5802 Mar 30, 2020
8f581d5
refactor: softmax loss parameter changes
jimthompson5802 Mar 31, 2020
3c70152
Merge remote-tracking branch 'upstream/tf2_porting' into tf2_categori…
jimthompson5802 Apr 1, 2020
4ca282e
feat: add partial implementation of categorical sparse encoder
jimthompson5802 Apr 1, 2020
5fc8bf8
feat: initial setup for sampled softmax cross entropy loss
jimthompson5802 Apr 1, 2020
270a04b
feat: continued sampled softmax cross entropy loss
jimthompson5802 Apr 2, 2020
b7cb2bb
feat: test for handling decoder_last_hidden
jimthompson5802 Apr 3, 2020
0ec7651
feat: initial working sampled softmax cross entropy loss function
jimthompson5802 Apr 3, 2020
cbd6955
feat: category feature working softmax ce loss and sampled softmax ce…
jimthompson5802 Apr 3, 2020
2e467ef
refactor: rename custom loss wrapper class
jimthompson5802 Apr 4, 2020
6d8bb23
refactor: add custom wrapper class for MSE metric to support revised …
jimthompson5802 Apr 4, 2020
4661cdf
feat: add new constants
jimthompson5802 Apr 4, 2020
626e512
feat: retrofit MAE to support new prediction format
jimthompson5802 Apr 4, 2020
0044692
feat: add TYPE constant
jimthompson5802 Apr 4, 2020
0ffe3c5
feat: adapt to new predictions format
jimthompson5802 Apr 4, 2020
c5cb9dc
feat: add test for feature encoder, decoder and loss function
jimthompson5802 Apr 5, 2020
73ef884
fix: change format of <feature_name>_probabilities.csv from tensors t…
jimthompson5802 Apr 5, 2020
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
4 changes: 4 additions & 0 deletions ludwig/constants.py
Expand Up @@ -72,8 +72,12 @@

METRIC = 'metric'
PREDICTION = 'prediction'
LOGITS = 'logits'
FINAL_HIDDEN = 'final_hidden'

SUM = 'sum'
APPEND = 'append'
SEQ_SUM = 'seq_sum'
AVG_EXP = 'avg_exp'

TYPE = 'type'
10 changes: 5 additions & 5 deletions ludwig/data/dataset_synthesyzer.py
Expand Up @@ -25,7 +25,7 @@
import numpy as np
import yaml

from ludwig.constants import VECTOR
from ludwig.constants import VECTOR, TYPE
from ludwig.utils.data_utils import save_csv
from ludwig.utils.h3_util import components_to_h3
from ludwig.utils.misc import get_from_registry
Expand Down Expand Up @@ -96,7 +96,7 @@ def build_feature_parameters(features):
feature_parameters = {}
for feature in features:
fearure_builder_function = get_from_registry(
feature['type'],
feature[TYPE],
parameters_builders_registry
)

Expand Down Expand Up @@ -136,12 +136,12 @@ def generate_datapoint(features):
datapoint = []
for feature in features:
if ('cycle' in feature and feature['cycle'] is True and
feature['type'] in cyclers_registry):
cycler_function = cyclers_registry[feature['type']]
feature[TYPE] in cyclers_registry):
cycler_function = cyclers_registry[feature[TYPE]]
feature_value = cycler_function(feature)
else:
generator_function = get_from_registry(
feature['type'],
feature[TYPE],
generators_registry
)
feature_value = generator_function(feature)
Expand Down
5 changes: 3 additions & 2 deletions ludwig/data/postprocessing.py
Expand Up @@ -16,6 +16,7 @@
# ==============================================================================
import pandas as pd

from ludwig.constants import TYPE
from ludwig.features.feature_registries import output_type_registry
from ludwig.features.feature_utils import SEQUENCE_TYPES
from ludwig.utils.misc import get_from_registry
Expand All @@ -29,7 +30,7 @@ def postprocess_results(
skip_save_unprocessed_output=False,
):
feature = get_from_registry(
output_feature['type'], output_type_registry
output_feature[TYPE], output_type_registry
)
return feature.postprocess_results(
output_feature,
Expand Down Expand Up @@ -76,7 +77,7 @@ def postprocess_df(
data_for_df = {}
for output_feature in output_features:
output_feature_name = output_feature['name']
output_feature_type = output_feature['type']
output_feature_type = output_feature[TYPE]
output_feature_dict = postprocessed_output[output_feature_name]
for key_val in output_feature_dict.items():
output_subgroup_name, output_type_value = key_val
Expand Down
26 changes: 13 additions & 13 deletions ludwig/data/preprocessing.py
Expand Up @@ -113,17 +113,17 @@ def build_metadata(dataset_df, features, global_preprocessing_parameters):
train_set_metadata = {}
for feature in features:
get_feature_meta = get_from_registry(
feature['type'],
feature[TYPE],
base_type_registry
).get_feature_meta
if 'preprocessing' in feature:
preprocessing_parameters = merge_dict(
global_preprocessing_parameters[feature['type']],
global_preprocessing_parameters[feature[TYPE]],
feature['preprocessing']
)
else:
preprocessing_parameters = global_preprocessing_parameters[
feature['type']
feature[TYPE]
]
train_set_metadata[feature['name']] = get_feature_meta(
dataset_df[feature['name']].astype(str),
Expand All @@ -141,17 +141,17 @@ def build_data(
data_dict = {}
for feature in features:
add_feature_data = get_from_registry(
feature['type'],
feature[TYPE],
base_type_registry
).add_feature_data
if 'preprocessing' in feature:
preprocessing_parameters = merge_dict(
global_preprocessing_parameters[feature['type']],
global_preprocessing_parameters[feature[TYPE]],
feature['preprocessing']
)
else:
preprocessing_parameters = global_preprocessing_parameters[
feature['type']
feature[TYPE]
]
handle_missing_values(
dataset_df,
Expand Down Expand Up @@ -185,7 +185,7 @@ def handle_missing_values(dataset_df, feature, preprocessing_parameters):
dataset_df[feature['name']].value_counts().index[0],
)
elif missing_value_strategy == FILL_WITH_MEAN:
if feature['type'] != NUMERICAL:
if feature[TYPE] != NUMERICAL:
raise ValueError(
'Filling missing values with mean is supported '
'only for numerical types',
Expand Down Expand Up @@ -245,15 +245,15 @@ def load_data(
hdf5_data = h5py.File(hdf5_file_path, 'r')
dataset = {}
for input_feature in input_features:
if input_feature['type'] == TEXT:
if input_feature[TYPE] == TEXT:
text_data_field = text_feature_data_field(input_feature)
dataset[text_data_field] = hdf5_data[text_data_field][()]
else:
dataset[input_feature['name']] = hdf5_data[
input_feature['name']
][()]
for output_feature in output_features:
if output_feature['type'] == TEXT:
if output_feature[TYPE] == TEXT:
dataset[text_feature_data_field(output_feature)] = hdf5_data[
text_feature_data_field(output_feature)
][()]
Expand Down Expand Up @@ -856,7 +856,7 @@ def preprocess_for_prediction(

def replace_text_feature_level(features, datasets):
for feature in features:
if feature['type'] == TEXT:
if feature[TYPE] == TEXT:
for dataset in datasets:
if dataset is not None:
dataset[feature['name']] = dataset[
Expand Down Expand Up @@ -891,15 +891,15 @@ def get_preprocessing_params(model_definition):
for feature in features:
if 'preprocessing' in feature:
local_preprocessing_parameters = merge_dict(
global_preprocessing_parameters[feature['type']],
global_preprocessing_parameters[feature[TYPE]],
feature['preprocessing']
)
else:
local_preprocessing_parameters = global_preprocessing_parameters[
feature['type']
feature[TYPE]
]
merged_preprocessing_params.append(
(feature['name'], feature['type'], local_preprocessing_parameters)
(feature['name'], feature[TYPE], local_preprocessing_parameters)
)

return merged_preprocessing_params
Expand Down
8 changes: 4 additions & 4 deletions ludwig/features/audio_feature.py
Expand Up @@ -87,7 +87,7 @@ def get_feature_meta(column, preprocessing_parameters):

@staticmethod
def _get_feature_dim(audio_feature_dict, sampling_rate_in_hz):
feature_type = audio_feature_dict['type']
feature_type = audio_feature_dict[TYPE]

if feature_type == 'raw':
feature_dim = 1
Expand Down Expand Up @@ -127,7 +127,7 @@ def _read_audio_and_transform_to_feature(filepath, audio_feature_dict,
)
sys.exit(-1)

feature_type = audio_feature_dict['type']
feature_type = audio_feature_dict[TYPE]
audio, sampling_rate_in_hz = soundfile.read(filepath)
AudioBaseFeature._update(audio_stats, audio, sampling_rate_in_hz)

Expand Down Expand Up @@ -236,7 +236,7 @@ def add_feature_data(
raise ValueError(
'audio_feature dictionary has to be present in preprocessing '
'for audio.')
if not 'type' in preprocessing_parameters['audio_feature']:
if not TYPE in preprocessing_parameters['audio_feature']:
raise ValueError(
'type has to be present in audio_feature dictionary '
'for audio.')
Expand Down Expand Up @@ -315,7 +315,7 @@ def _get_max_length_feature(
sampling_rate_in_hz,
audio_length_limit_in_s
):
feature_type = audio_feature_dict['type']
feature_type = audio_feature_dict[TYPE]
audio_length_limit_in_samp = (
audio_length_limit_in_s * sampling_rate_in_hz
)
Expand Down
2 changes: 1 addition & 1 deletion ludwig/features/base_feature.py
Expand Up @@ -141,7 +141,7 @@ def eval_loss(self, targets, predictions):
def update_metrics(self, targets, predictions):
for metric, metric_fn in self.metric_functions.items():
if metric == LOSS:
metric_fn.update_state(targets, predictions['logits'])
metric_fn.update_state(targets, predictions)
else:
metric_fn.update_state(targets, predictions['predictions'])

Expand Down
10 changes: 5 additions & 5 deletions ludwig/features/binary_feature.py
Expand Up @@ -158,7 +158,7 @@ def predictions(
self,
inputs # hidden
):
logits = inputs
logits = inputs[LOGITS]

probabilities = tf.nn.sigmoid(
logits,
Expand All @@ -173,9 +173,9 @@ def predictions(
)

return {
'probabilities': probabilities,
'predictions': predictions,
'logits': inputs
PROBABILITIES: probabilities,
PREDICTIONS: predictions,
LOGITS: logits
}

def _setup_loss(self):
Expand All @@ -198,7 +198,7 @@ def _setup_metrics(self):
# def update_metrics(self, targets, predictions):
# for metric, metric_fn in self.metric_functions.items():
# if metric == LOSS:
# metric_fn.update_state(targets, predictions['logits'])
# metric_fn.update_state(targets, predictions[LOGITS])
# else:
# metric_fn.update_state(targets, predictions['predictions'])

Expand Down