Skip to content

Commit

Permalink
fix duplicate categorical features added from image exif metadata in …
Browse files Browse the repository at this point in the history
…RAI Vision Dashboard (#2483)
  • Loading branch information
imatiach-msft committed Jan 5, 2024
1 parent 2556253 commit 464c9f0
Show file tree
Hide file tree
Showing 2 changed files with 59 additions and 42 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,9 @@ def extract_features(image_dataset: pd.DataFrame,
results = []
dropped_features = feature_metadata.dropped_features \
if feature_metadata else None
if feature_metadata and feature_metadata.categorical_features is None:
if not feature_metadata:
feature_metadata = FeatureMetadata()
if feature_metadata.categorical_features is None:
feature_metadata.categorical_features = []
exif_feature_names = get_all_exif_feature_names(image_dataset)
feature_names = [MEAN_PIXEL_VALUE] + exif_feature_names
Expand Down Expand Up @@ -73,40 +75,8 @@ def extract_features(image_dataset: pd.DataFrame,
row_feature_values = [mean_pixel_value] + \
[None] * len(exif_feature_names)

# append all exif features
if isinstance(image, str):
image_pointer_path = get_image_pointer_from_path(image)
with Image.open(image_pointer_path) as im:
exifdata = im.getexif()
for tag_id in exifdata:
# get the tag name, instead of human unreadable tag id
tag = TAGS.get(tag_id, tag_id)
data = exifdata.get(tag_id)
# decode bytes
if isinstance(data, bytes):
data = data.decode()
if len(data) > MAX_CUSTOM_LEN:
data = data[:MAX_CUSTOM_LEN] + '...'
if isinstance(data, str):
if not feature_metadata:
feature_metadata = FeatureMetadata()
feature_metadata.categorical_features = []
if tag in feature_names:
feature_metadata.categorical_features.append(
str(tag))
tag_index = feature_names.index(tag)
row_feature_values[tag_index] = data
else:
# in theory this should now never happen with
# latest code, but adding this check for safety
if tag not in blacklisted_tags:
blacklisted_tags.add(tag)
warnings.warn(
f'Exif tag {tag} could not be found '
'in the feature names. Ignoring tag '
'from extracted metadata.')
elif isinstance(data, int) or isinstance(data, float):
row_feature_values[feature_names.index(tag)] = data
append_exif_features(image, row_feature_values, feature_names,
blacklisted_tags, feature_metadata)

# append all features other than target column and label
for j in range(start_meta_index, image_dataset.shape[1]):
Expand All @@ -115,3 +85,37 @@ def extract_features(image_dataset: pd.DataFrame,
row_feature_values.append(image_dataset.iloc[i, j])
results.append(row_feature_values)
return results, feature_names


def append_exif_features(image, row_feature_values, feature_names,
                         blacklisted_tags, feature_metadata):
    """Copy the EXIF metadata of one image into its feature row.

    Nothing happens unless ``image`` is a string. For a string, the image
    is opened with PIL and every EXIF tag whose value is text, bytes, an
    int or a float is written into ``row_feature_values`` at the position
    that the tag name occupies in ``feature_names``.

    :param image: The image to inspect. Only ``str`` values are processed;
        the string is resolved through ``get_image_pointer_from_path``
        (presumably a path or URL -- confirm against that helper).
    :param row_feature_values: Feature values of the current row, indexed
        like ``feature_names``. Mutated in place.
    :param feature_names: Ordered list of all feature names.
    :param blacklisted_tags: Set of tag names that were already reported
        as missing from ``feature_names``. Mutated in place so each
        unknown tag only triggers a single warning.
    :param feature_metadata: Object whose ``categorical_features`` list
        receives the name of every text-valued tag exactly once. May be
        ``None``, in which case no categorical bookkeeping is done.
    """
    if not isinstance(image, str):
        return
    image_pointer_path = get_image_pointer_from_path(image)
    with Image.open(image_pointer_path) as im:
        exifdata = im.getexif()
        for tag_id in exifdata:
            # get the tag name, instead of human unreadable tag id
            tag = str(TAGS.get(tag_id, tag_id))
            data = exifdata.get(tag_id)
            # decode bytes and cap the length of the decoded text
            if isinstance(data, bytes):
                data = data.decode()
                if len(data) > MAX_CUSTOM_LEN:
                    data = data[:MAX_CUSTOM_LEN] + '...'
            is_text = isinstance(data, str)
            if not is_text and not isinstance(data, (int, float)):
                # other value types are not turned into features
                continue
            if tag not in feature_names:
                # in theory this should now never happen with
                # latest code, but adding this check for safety;
                # it now also protects numeric tags, which previously
                # raised ValueError from feature_names.index
                if tag not in blacklisted_tags:
                    blacklisted_tags.add(tag)
                    warnings.warn(
                        f'Exif tag {tag} could not be found '
                        'in the feature names. Ignoring tag '
                        'from extracted metadata.')
                continue
            if is_text and feature_metadata is not None:
                # text-valued tags are categorical; register each name
                # once so repeated rows do not create duplicates
                if feature_metadata.categorical_features is None:
                    feature_metadata.categorical_features = []
                if tag not in feature_metadata.categorical_features:
                    feature_metadata.categorical_features.append(tag)
            row_feature_values[feature_names.index(tag)] = data
27 changes: 20 additions & 7 deletions responsibleai_vision/tests/test_feature_extractors.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
load_fridge_object_detection_dataset,
load_imagenet_dataset)

from responsibleai.feature_metadata import FeatureMetadata
from responsibleai_vision.common.constants import (ExtractedFeatures,
ImageColumns, ImageModes)
from responsibleai_vision.utils.feature_extractors import extract_features
Expand All @@ -18,17 +19,23 @@


def validate_extracted_features(extracted_features, feature_names,
                                expected_feature_names, data,
                                feature_metadata=None):
    """Assert that extracted features are consistent with expectations.

    :param extracted_features: Extracted rows; there must be one per
        entry of ``data`` and the first row must have one value per
        feature name.
    :param feature_names: Feature names returned by the extractor. The
        first name must equal the first expected name; the remaining
        names may appear in any order.
    :param expected_feature_names: Names the extractor should produce.
    :param data: The dataset the features were extracted from; only its
        length is used.
    :param feature_metadata: Optional object with a
        ``categorical_features`` list. When given, every categorical
        feature must be a known feature name and must be listed once.
    :raises AssertionError: If any of the checks fails.
    """
    assert len(extracted_features) == len(data)
    assert feature_names[0] == expected_feature_names[0]
    for feature_name in feature_names[1:]:
        assert feature_name in expected_feature_names
    assert len(feature_names) == len(expected_feature_names)
    assert len(extracted_features[0]) == len(feature_names)
    if feature_metadata is not None:
        categorical_features = feature_metadata.categorical_features
        assert len(categorical_features) <= len(feature_names)
        # the length bound alone lets a repeated name slip through, so
        # check uniqueness explicitly
        assert len(set(categorical_features)) == len(categorical_features)
        for categorical_feature in categorical_features:
            assert categorical_feature in feature_names


def extract_dataset_features(data, feature_metadata=None):
    """Call ``extract_features`` on ``data`` with the test defaults.

    The label column constant and the RGB image mode are always used;
    ``feature_metadata`` is forwarded unchanged as a keyword argument.

    :param data: The dataset handed to ``extract_features``.
    :param feature_metadata: Optional metadata object to forward.
    :return: Whatever ``extract_features`` returns.
    """
    label_column = ImageColumns.LABEL
    rgb_mode = ImageModes.RGB
    return extract_features(data, label_column, rgb_mode,
                            feature_metadata=feature_metadata)


class TestFeatureExtractors(object):
Expand All @@ -55,15 +62,21 @@ def test_extract_features_imagenet_metadata(self):

def test_extract_features_flowers_metadata(self):
data = load_flowers_dataset(upscale=False)
extracted_features, feature_names = extract_dataset_features(data)
feature_metadata = FeatureMetadata()
extracted_features, feature_names = extract_dataset_features(
data, feature_metadata=feature_metadata)
expected_feature_names = [MEAN_PIXEL_VALUE]
validate_extracted_features(extracted_features, feature_names,
expected_feature_names, data)
expected_feature_names, data,
feature_metadata)

def test_extract_features_mixed_exif_XPComment_metadata(self):
data = load_fridge_dataset(add_extra_mixed_metadata=True)
extracted_features, feature_names = extract_dataset_features(data)
feature_metadata = FeatureMetadata()
extracted_features, feature_names = extract_dataset_features(
data, feature_metadata=feature_metadata)
expected_feature_names = [MEAN_PIXEL_VALUE, 'XPComment']
expected_feature_names += FRIDGE_METADATA_FEATURES
validate_extracted_features(extracted_features, feature_names,
expected_feature_names, data)
expected_feature_names, data,
feature_metadata)

0 comments on commit 464c9f0

Please sign in to comment.