# Imports

In [None]:
%pip install tensorflow-data-validation
%pip install -q tensorflow_data_validation[visualization]
%pip install tfx

In [101]:
import sys
import os
import numpy as np
import pandas as pd
import pickle
import tempfile
import tensorflow_data_validation as tfdv
# Raise NumPy's summarization threshold to the largest int so arrays are
# printed in full instead of being elided with "...".
np.set_printoptions(threshold=sys.maxsize)
# Show which TFDV version this kernel has loaded.
print('TFDV version: {}'.format(tfdv.version.__version__))

TFDV version: 1.9.0


# Data Analysis

#### Load and display data

In [None]:
# Relative locations of the input CSV files and of the output directory.
DATA = './data'
OUTPUT = './output'
TRAIN_DATA, TEST_DATA = (
    os.path.join(DATA, csv_name) for csv_name in ('train.csv', 'test.csv')
)

In [100]:
# Both files are semicolon-separated; load each one and render the two frames.
train_df, test_df = (
    pd.read_csv(csv_path, sep=";") for csv_path in (TRAIN_DATA, TEST_DATA)
)
display(train_df, test_df)

Unnamed: 0,ID,TIMESTAMP,WEBSITE,GDS,DEPARTURE,ARRIVAL,ADULTS,CHILDREN,INFANTS,TRAIN,HAUL_TYPE,DISTANCE,DEVICE,TRIP_TYPE,PRODUCT,SMS,EXTRA_BAGGAGE,NO_GDS
0,0,01/July,EDES,1,22/July,25/July,1,0,0,False,DOMESTIC,628844,TABLET,ROUND_TRIP,TRIP,True,False,0
1,1,01/July,EDIT,0,29/July,29/July,1,0,0,False,CONTINENTAL,128143,SMARTPHONE,ONE_WAY,TRIP,False,False,1
2,2,01/July,OPUK,2,29/July,19/August,1,0,0,False,CONTINENTAL,173035,TABLET,ROUND_TRIP,TRIP,True,False,0
3,3,01/July,OPIT,0,24/July,04/August,1,0,0,False,DOMESTIC,652702,SMARTPHONE,MULTI_DESTINATION,TRIP,False,False,2
4,4,01/July,EDES,0,11/August,11/August,1,0,0,False,CONTINENTAL,171785,COMPUTER,ONE_WAY,TRIP,False,False,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49995,49995,02/July,EDUK,1,02/July,02/July,2,0,0,False,CONTINENTAL,103513,COMPUTER,ONE_WAY,TRIP,True,True,0
49996,49996,02/July,EDPT,1,11/August,19/August,2,1,0,False,CONTINENTAL,11522,SMARTPHONE,ROUND_TRIP,TRIP,True,False,0
49997,49997,02/July,GOFR,1,09/September,23/September,1,0,0,False,INTERCONTINENTAL,223654,SMARTPHONE,ROUND_TRIP,TRIP,False,False,0
49998,49998,02/July,EDPT,2,05/July,21/July,1,0,0,False,CONTINENTAL,131248,COMPUTER,ROUND_TRIP,TRIP,False,True,0


Unnamed: 0,ID,TIMESTAMP,WEBSITE,GDS,DEPARTURE,ARRIVAL,ADULTS,CHILDREN,INFANTS,TRAIN,HAUL_TYPE,DISTANCE,DEVICE,TRIP_TYPE,PRODUCT,SMS,NO_GDS
0,0,03/July,EDES,1,22/July,22/July,1,0,0,False,DOMESTIC,342595,COMPUTER,ONE_WAY,TRIP,True,0
1,1,03/July,GOFR,1,05/July,22/August,1,0,0,False,INTERCONTINENTAL,320692,SMARTPHONE,ROUND_TRIP,TRIP,False,0
2,2,03/July,OPGB,1,22/July,12/August,1,0,0,False,INTERCONTINENTAL,660522,COMPUTER,ROUND_TRIP,TRIP,False,0
3,3,03/July,EDUK,0,06/July,08/July,1,0,0,False,CONTINENTAL,130229,SMARTPHONE,ROUND_TRIP,TRIP,False,2
4,4,03/July,OPDE,1,10/September,10/September,1,0,0,False,INTERCONTINENTAL,677078,COMPUTER,ONE_WAY,TRIP,True,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29995,29995,04/July,EDES,0,17/July,17/July,1,0,0,False,CONTINENTAL,14198,COMPUTER,ONE_WAY,TRIP,True,1
29996,29996,04/July,OPFR,1,06/August,06/August,3,1,0,False,DOMESTIC,192141,COMPUTER,ONE_WAY,TRIP,True,0
29997,29997,04/July,EDGB,0,09/July,09/July,1,1,0,False,CONTINENTAL,80468,SMARTPHONE,ONE_WAY,TRIP,False,1
29998,29998,04/July,GOFR,2,24/July,04/August,1,0,0,False,INTERCONTINENTAL,734119,COMPUTER,MULTI_DESTINATION,TRIP,True,1


In [None]:

# Features that preprocessing_fn turns into integer vocabulary ids.
CATEGORICAL_FEATURE_KEYS = [
    'WEBSITE',
    'HAUL_TYPE',
    'DEVICE',
    'TRIP_TYPE',
    'PRODUCT',
    'DEPARTURE',
    'ARRIVAL',
]

# Features that preprocessing_fn rescales to [0, 1].
# 'TIMESTAMP' is deliberately not listed: the loaded data holds strings such as
# '01/July' in that column (and the notebook drops it via DROP_COLS), so it
# cannot be scaled numerically.
# NOTE(review): SMS is displayed as True/False in the frames above; confirm it
# reaches the transform as a numeric tensor.
NUMERIC_FEATURE_KEYS = [
    'GDS',
    'CHILDREN',
    'ADULTS',
    'INFANTS',
    'DISTANCE',
    'SMS',
    'NO_GDS',
]

# Column names handed to the CSV reader, in file order.
# NOTE(review): this matches the columns displayed for test_df; train_df
# additionally shows EXTRA_BAGGAGE between SMS and NO_GDS, which is not listed
# here. Confirm the training file can be parsed with this list.
ORDERED_CSV_COLUMNS = [
    'ID',
    'TIMESTAMP',
    'WEBSITE',
    'GDS',
    'DEPARTURE',
    'ARRIVAL',
    'ADULTS',
    'CHILDREN',
    'INFANTS',
    'TRAIN',
    'HAUL_TYPE',
    'DISTANCE',
    'DEVICE',
    'TRIP_TYPE',
    'PRODUCT',
    'SMS',
    'NO_GDS'
]

# Name of the target column (present in the training frame only).
LABEL_KEY = 'EXTRA_BAGGAGE'

#### Generate Statistics

In [None]:
%%capture
import tensorflow_data_validation as tfdv
print('TFDV version: {}'.format(tfdv.version.__version__))
train_stats = tfdv.generate_statistics_from_dataframe(train_df)
test_stats = tfdv.generate_statistics_from_dataframe(test_df)

In [None]:
# Training statistics on their own ...
tfdv.visualize_statistics(lhs_statistics=train_stats)
# ... then training (left) against test (right).
tfdv.visualize_statistics(train_stats, test_stats)

#### Infer schema and detect anomalies

In [None]:
# Derive a schema from the training statistics and render it.
schema = tfdv.infer_schema(statistics=train_stats)
tfdv.display_schema(schema=schema)

In [None]:
from tensorflow_metadata.proto.v0 import schema_pb2

# Declare a TRAINING and a TESTING environment on the schema. The membership
# checks keep this cell idempotent: re-running it must not append the same
# environment name a second time.
for environment_name in ('TRAINING', 'TESTING'):
    if environment_name not in schema.default_environment:
        schema.default_environment.append(environment_name)

# Exclude the label from the TESTING environment so that its absence from the
# test set is not reported as an anomaly.
label_schema_feature = tfdv.get_feature(schema, LABEL_KEY)
if 'TESTING' not in label_schema_feature.not_in_environment:
    label_schema_feature.not_in_environment.append('TESTING')

# Regenerate the training statistics, this time driven by the schema and with
# the label feature identified.
stats_options = tfdv.StatsOptions(
    schema=schema,
    infer_type_from_schema=True,
    label_feature=LABEL_KEY,
)
train_stats = tfdv.generate_statistics_from_dataframe(
    train_df,
    stats_options=stats_options,
)

# Check the test statistics against the schema in the TESTING environment.
anomalies = tfdv.validate_statistics(test_stats, schema, environment='TESTING')
tfdv.display_anomalies(anomalies)



#### Checking data skew and drift

In [None]:
# Flag WEBSITE when the L-infinity distance between its training and serving
# (here: test) distributions exceeds 0.01.
website_feature = tfdv.get_feature(schema, 'WEBSITE')
website_feature.skew_comparator.infinity_norm.threshold = 0.01

skew_anomalies = tfdv.validate_statistics(
    train_stats,
    schema,
    serving_statistics=test_stats,
)
tfdv.display_anomalies(skew_anomalies)

#### Generate statistics on data slices

In [None]:
from tensorflow_data_validation.utils import slicing_util

# Slice on DEVICE == 'COMPUTER'. The slicer expects, per feature, an iterable
# of values to slice on, so the value is wrapped in a list rather than passed
# as a bare string.
slice_fn = slicing_util.get_feature_value_slicer(features={'DEVICE': ['COMPUTER']})
stats_options = tfdv.StatsOptions(slice_functions=[slice_fn])

# NOTE(review): this rebinds `train_stats` and `stats_options`, so earlier
# cells that are re-run afterwards will see the sliced statistics.
train_stats = tfdv.generate_statistics_from_dataframe(
    train_df,
    stats_options=stats_options,
)

# NOTE(review): sliced statistics hold one dataset per slice; confirm that
# visualize_statistics accepts a multi-dataset list in the installed TFDV.
tfdv.visualize_statistics(train_stats)


# Data preprocessing

In [None]:
# Install (or upgrade) TensorFlow Transform and install pyarrow into the
# kernel's environment.
# NOTE(review): both installs are unpinned and `-U` moves tensorflow-transform
# to its latest release; confirm that stays compatible with the TFDV/TFX
# versions installed at the top of the notebook.
%pip install -U tensorflow-transform
%pip install pyarrow

In [None]:
# Re-import pkg_resources in the running kernel.
# NOTE(review): presumably done so that packages installed by the preceding
# %pip cell are picked up without restarting the kernel; confirm it is still
# needed.
import pkg_resources
import importlib
importlib.reload(pkg_resources)

In [None]:
import tensorflow_transform as tft
import tensorflow_transform.beam as tft_beam
from tensorflow_transform.tf_metadata import dataset_metadata
from tensorflow_transform.tf_metadata import schema_utils
from tfx_bsl.public import tfxio

In [None]:
# Names of temp files
# The two *_FILEBASE values are joined with the working directory in
# transform_data and used as TFRecord output prefixes for the transformed
# train and test data.
TRANSFORMED_TRAIN_DATA_FILEBASE = 'train_transformed'
TRANSFORMED_TEST_DATA_FILEBASE = 'test_transformed'
# NOTE(review): not referenced in the cells shown up to the preprocessing
# section; presumably consumed by a later export step.
EXPORTED_MODEL_DIR = 'exported_model_dir'

In [None]:
DROP_COLS = ['ID', 'TIMESTAMP', 'TRAIN']
# Rebind instead of dropping in place, and tolerate columns that are already
# gone, so re-running this cell does not raise KeyError.
train_df = train_df.drop(columns=DROP_COLS, errors='ignore')
test_df = test_df.drop(columns=DROP_COLS, errors='ignore')
display(train_df)

In [None]:
NUM_OOV_BUCKETS = 1
def preprocessing_fn(inputs):
  """Build the transformed feature dict from the raw input features.

  Every entry of `inputs` is carried over. Entries named in
  NUMERIC_FEATURE_KEYS are replaced by their [0, 1]-scaled version and entries
  named in CATEGORICAL_FEATURE_KEYS by integer vocabulary ids.

  Args:
    inputs: Dict-like mapping from feature name to raw feature tensor.

  Returns:
    A copy of `inputs` with the numeric and categorical entries transformed.
  """
  outputs = inputs.copy()

  # Numeric features: rescale into the range [0, 1].
  outputs.update({
      name: tft.scale_to_0_1(inputs[name]) for name in NUMERIC_FEATURE_KEYS
  })

  # Categorical features: strip the raw string, then replace it with an integer
  # id from a vocabulary stored under the feature's own name. NUM_OOV_BUCKETS
  # is passed through as the number of out-of-vocabulary buckets.
  # NOTE(review): `tft.strings` may not exist on tensorflow_transform;
  # `tf.strings.strip` was possibly intended. Confirm before running.
  outputs.update({
      name: tft.compute_and_apply_vocabulary(
          tft.strings.strip(inputs[name]),
          num_oov_buckets=NUM_OOV_BUCKETS,
          vocab_filename=name)
      for name in CATEGORICAL_FEATURE_KEYS
  })

  return outputs
   

In [None]:
def transform_data(train_data_file, test_data_file, working_dir, delimiter=';'):
  """Transform the data and write out as a TFRecord of Example protos.

  Reads both CSV files through a CSV TFXIO, analyzes and transforms the
  training data with `preprocessing_fn`, applies the resulting transform
  function to the test data, and writes the transformed examples together with
  the transform function to `working_dir`.

  Args:
    train_data_file: File containing training data; its first line is skipped
      as a header.
    test_data_file: File containing test data; its first line is skipped as a
      header.
    working_dir: Directory to write transformed data and metadata to
    delimiter: Field separator of both CSV files. Defaults to ';', the
      separator this notebook passes to pandas when loading the same files.
  """
  # Imported locally because the notebook's import cell does not bring this
  # encoder into scope; without it the encode steps below raise NameError.
  from tfx_bsl.coders.example_coder import RecordBatchToExamples

  # The "with" block will create a pipeline, and run that pipeline at the exit
  # of the block.
  # NOTE(review): `tft.beam` is tensorflow_transform.beam; `Pipeline`,
  # `FlatMapTuple` and `io.WriteToTFRecord` look like apache_beam names
  # (`beam.Pipeline`, ...). Confirm they resolve; if not, apache_beam must be
  # imported and used here instead.
  with tft.beam.Pipeline() as pipeline:
    with tft_beam.Context(temp_dir=tempfile.mkdtemp()):
      # Create a TFXIO to read the training CSV with the schema. All columns
      # are listed in order since the schema doesn't specify the order of
      # columns in the csv. The caller-supplied path is used (not a global),
      # the header row is skipped and the file's delimiter is honoured.
      # NOTE(review): the training frame displayed earlier also carries the
      # label column, which ORDERED_CSV_COLUMNS does not list. Confirm the
      # column list matches the training file.
      train_csv_tfxio = tfxio.CsvTFXIO(
          file_pattern=train_data_file,
          skip_header_lines=1,
          delimiter=delimiter,
          telemetry_descriptors=[],
          column_names=ORDERED_CSV_COLUMNS,
          schema=schema)

      # Read in raw data and convert using CSV TFXIO.
      raw_data = (
          pipeline |
          'ReadTrainCsv' >> train_csv_tfxio.BeamSource())

      # Combine data and schema into a dataset tuple.  Note that we already used
      # the schema to read the CSV data, but we also need it to interpret
      # raw_data.
      cfg = train_csv_tfxio.TensorAdapterConfig()
      raw_dataset = (raw_data, cfg)

      # The TFXIO output format is chosen for improved performance.
      transformed_dataset, transform_fn = (
          raw_dataset | tft_beam.AnalyzeAndTransformDataset(
              preprocessing_fn, output_record_batches=True))

      # Transformed metadata is not necessary for encoding.
      transformed_data, _ = transformed_dataset

      # Extract transformed RecordBatches, encode and write them to the given
      # directory.
      # TODO(b/223384488): Switch to `RecordBatchToExamplesEncoder`.
      _ = (
          transformed_data
          | 'EncodeTrainData' >>
          tft.beam.FlatMapTuple(lambda batch, _: RecordBatchToExamples(batch))
          | 'WriteTrainData' >> tft.beam.io.WriteToTFRecord(
              os.path.join(working_dir, TRANSFORMED_TRAIN_DATA_FILEBASE)))

      # Now apply the transform function to the test data, read with the same
      # header handling and delimiter as the training data.
      test_csv_tfxio = tfxio.CsvTFXIO(
          file_pattern=test_data_file,
          skip_header_lines=1,
          delimiter=delimiter,
          telemetry_descriptors=[],
          column_names=ORDERED_CSV_COLUMNS,
          schema=schema)
      raw_test_data = (
          pipeline
          | 'ReadTestCsv' >> test_csv_tfxio.BeamSource())

      raw_test_dataset = (raw_test_data, test_csv_tfxio.TensorAdapterConfig())

      # The TFXIO output format is chosen for improved performance.
      transformed_test_dataset = (
          (raw_test_dataset, transform_fn)
          | tft_beam.TransformDataset(output_record_batches=True))

      # Transformed metadata is not necessary for encoding.
      transformed_test_data, _ = transformed_test_dataset

      # Extract transformed RecordBatches, encode and write them to the given
      # directory.
      _ = (
          transformed_test_data
          | 'EncodeTestData' >>
          tft.beam.FlatMapTuple(lambda batch, _: RecordBatchToExamples(batch))
          | 'WriteTestData' >> tft.beam.io.WriteToTFRecord(
              os.path.join(working_dir, TRANSFORMED_TEST_DATA_FILEBASE)))

      # Will write a SavedModel and metadata to working_dir, which can then
      # be read by the tft.TFTransformOutput class.
      _ = (
          transform_fn
          | 'WriteTransformFn' >> tft_beam.WriteTransformFn(working_dir))

# Neural network training