# Example Data Visualization
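Given an ML Metadata store database, query it for the pipeline's statistics and schema artifacts, then visualize the statistics computed over the raw input data (stored in TFRecord format) with TensorFlow Data Validation (TFDV).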

In [35]:
import glob
import os

import tensorflow as tf
import tensorflow_data_validation as tfdv

from tfx.orchestration import metadata
from tfx.types import standard_artifacts

In [7]:
# Location of the ML Metadata SQLite file: `metadata.db` in the current working
# directory, so the notebook must be started from the directory that holds it.
metadata_path = os.path.join(os.getcwd(), 'metadata.db')
# Connection config handed to `metadata.Metadata(...)` when the store is opened.
metadata_connection_config = metadata.sqlite_metadata_connection_config(metadata_path)

In [13]:
# Open the metadata store only for the duration of the lookups and fetch every
# ExampleStatistics and Schema artifact it has recorded (same call order as
# before: statistics first, then schema).
with metadata.Metadata(metadata_connection_config) as store:
    example_statistics_artifacts, schema_artifacts = (
        store.get_artifacts_by_type(artifact_class.TYPE_NAME)
        for artifact_class in (
            standard_artifacts.ExampleStatistics,
            standard_artifacts.Schema,
        )
    )

In [10]:
# Inspect the fetched ExampleStatistics artifacts; the `uri` field of each one
# is what the following cells use to locate the statistics files on disk.
example_statistics_artifacts

[id: 3
 type_id: 8
 uri: "/Users/zarkopafilis/Desktop/youtube-tfx-pipelines/pipeline_out/chicago_taxi_beam/StatisticsGen/statistics/3"
 properties {
   key: "split_names"
   value {
     string_value: "[\"train\", \"eval\"]"
   }
 }
 custom_properties {
   key: "name"
   value {
     string_value: "statistics"
   }
 }
 custom_properties {
   key: "pipeline_name"
   value {
     string_value: "chicago_taxi_beam"
   }
 }
 custom_properties {
   key: "producer_component"
   value {
     string_value: "StatisticsGen"
   }
 }
 custom_properties {
   key: "state"
   value {
     string_value: "published"
   }
 }
 create_time_since_epoch: 1598017054020
 last_update_time_since_epoch: 1598017073751]

In [24]:
# Take the last ExampleStatistics artifact in the list and derive the per-split
# statistics file paths from its base URI: <uri>/<split>/stats_tfrecord.
latest_statistics_uri = example_statistics_artifacts[-1].uri
statistics_train_uri, statistics_eval_uri = (
    os.path.join(latest_statistics_uri, split_name, 'stats_tfrecord')
    for split_name in ('train', 'eval')
)

statistics_train_uri

'/Users/zarkopafilis/Desktop/youtube-tfx-pipelines/pipeline_out/chicago_taxi_beam/StatisticsGen/statistics/3/train/stats_tfrecord'

In [15]:
# Pick the last Schema artifact ([-1]), mirroring how the statistics artifact
# is selected from `example_statistics_artifacts`. Indexing with [0] here would
# pair the first schema with the last statistics whenever the store holds
# several artifacts of each type. With a single Schema artifact the result is
# unchanged.
# NOTE(review): assumes the store returns artifacts oldest-first - confirm.
schema_uri = os.path.join(schema_artifacts[-1].uri, 'schema.pbtxt')
schema_uri

'/Users/zarkopafilis/Desktop/youtube-tfx-pipelines/pipeline_out/chicago_taxi_beam/SchemaGen/schema/4/schema.pbtxt'

In [59]:
# Load the schema from the `schema.pbtxt` file located above and echo it as the
# cell output for inspection.
schema = tfdv.load_schema_text(schema_uri)
schema

feature {
  name: "payment_type"
  value_count {
    min: 1
    max: 1
  }
  type: BYTES
  domain: "payment_type"
  presence {
    min_fraction: 1.0
    min_count: 1
  }
}
feature {
  name: "company"
  value_count {
    min: 1
    max: 1
  }
  type: BYTES
  domain: "company"
  presence {
    min_count: 1
  }
}
feature {
  name: "dropoff_census_tract"
  value_count {
    min: 1
    max: 1
  }
  type: INT
  presence {
    min_count: 1
  }
}
feature {
  name: "dropoff_community_area"
  value_count {
    min: 1
    max: 1
  }
  type: INT
  presence {
    min_count: 1
  }
}
feature {
  name: "dropoff_latitude"
  value_count {
    min: 1
    max: 1
  }
  type: FLOAT
  presence {
    min_count: 1
  }
}
feature {
  name: "dropoff_longitude"
  value_count {
    min: 1
    max: 1
  }
  type: FLOAT
  presence {
    min_count: 1
  }
}
feature {
  name: "fare"
  value_count {
    min: 1
    max: 1
  }
  type: FLOAT
  presence {
    min_fraction: 1.0
    min_count: 1
  }
}
feature {
  name: "pickup_

In [60]:
# Load the statistics for each split from the `stats_tfrecord` files whose
# paths were built from the ExampleStatistics artifact URI.
stats_train = tfdv.load_statistics(statistics_train_uri)
stats_eval = tfdv.load_statistics(statistics_eval_uri)

In [62]:
# Compare the two splits in a single visualization: train statistics on the
# left-hand side, eval statistics on the right-hand side, labelled accordingly.
tfdv.visualize_statistics(stats_train, stats_eval, lhs_name='train_stats', rhs_name='eval_stats')