# Data Ingestion

## Setup

### Install the packages

In [1]:
!pip install -U tensorflow==2.4.1 tfx==0.27.0 pandas==1.2.3

Requirement already up-to-date: tensorflow==2.4.1 in /usr/local/lib/python3.7/dist-packages (2.4.1)
Requirement already up-to-date: tfx==0.27.0 in /usr/local/lib/python3.7/dist-packages (0.27.0)
Requirement already up-to-date: pandas==1.2.3 in /usr/local/lib/python3.7/dist-packages (1.2.3)


### Import the packages

In [2]:
import logging
import os
import sys

import pandas as pd
import tensorflow as tf
from tfx.components import CsvExampleGen
from tfx.orchestration.experimental.interactive.interactive_context import InteractiveContext
from tfx.proto import example_gen_pb2

## The dataset

### Download the dataset

In [3]:
!mkdir data

In [4]:
import logging
import pandas as pd

# Initial dataset source
DATASET_URL = "http://bit.ly/building-ml-pipelines-dataset"

# Initial local dataset location
LOCAL_FILE_NAME = "data/consumer_complaints_with_narrative.csv"


def download_dataset(url=DATASET_URL):
    """download_dataset downloads the remote dataset to a local path

    Keyword Arguments:
        url {string} --
            complete url path to the csv data source (default: {DATASET_URL})
        local_path {string} --
            initial local file location (default: {LOCAL_FILE_NAME})
    Returns:
        None
    """
    df = pd.read_csv(url, index_col=0)
    df.to_csv(LOCAL_FILE_NAME)
    logging.info("Download completed.")


download_dataset()

### Load the dataset

In [5]:
df = pd.read_csv(LOCAL_FILE_NAME)
df.head()

Unnamed: 0,product,sub_product,issue,sub_issue,consumer_complaint_narrative,company,state,zip_code,company_response,timely_response,consumer_disputed
0,Debt collection,I do not know,Disclosure verification of debt,Right to dispute notice not received,I was denied employment because of a judgment ...,Encore Capital Group,NY,113XX,Closed with explanation,Yes,0
1,Credit reporting,,Improper use of my credit report,Report improperly shared by CRC,I have a credit card through XXXX XXXX and XXX...,Experian,IL,606XX,Closed with non-monetary relief,Yes,0
2,Debt collection,I do not know,Cont'd attempts collect debt not owed,Debt is not mine,Almost daily phone calls from Stellar Recovery...,Stellar Recovery Inc.,MI,480XX,Closed with explanation,Yes,1
3,Mortgage,Conventional fixed mortgage,"Loan servicing, payments, escrow account",,I submitted my monthly mortgage payment to Pri...,Primary Residential Mortgage,CT,066XX,Closed with monetary relief,Yes,0
4,Student loan,Non-federal student loan,Dealing with my lender or servicer,Received bad information about my loan,I contacted America Education Services in XX/X...,AES/PHEAA,FL,321XX,Closed with explanation,Yes,1


## Experiments

### First ingestion

In [6]:
context = InteractiveContext(pipeline_root='tfx')



In [7]:
data_dir = 'data'
example_gen = CsvExampleGen(input_base=data_dir)
context.run(example_gen)





0,1
.execution_id,1
.component,"function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } CsvExampleGen at 0x7f2c9a122e10.inputs{}.outputs['examples'] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Channel of type 'Examples' (1 artifact) at 0x7f2c9a122fd0.type_nameExamples._artifacts[0] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Examples' (uri: tfx/CsvExampleGen/examples/1) at 0x7f2d105df350.type<class 'tfx.types.standard_artifacts.Examples'>.uritfx/CsvExampleGen/examples/1.span0.split_names[""train"", ""eval""].version0.exec_properties['input_base']data['input_config']{  ""splits"": [  {  ""name"": ""single_split"",  ""pattern"": ""*""  }  ] }['output_config']{  ""split_config"": {  ""splits"": [  {  ""hash_buckets"": 2,  ""name"": ""train""  },  {  ""hash_buckets"": 1,  ""name"": ""eval""  }  ]  } }['output_data_format']6['custom_config']None['range_config']None['span']0['version']None['input_fingerprint']split:single_split,num_files:1,total_bytes:78956236,xor_checksum:1616487535,sum_checksum:1616487535"
.component.inputs,{}
.component.outputs,"['examples'] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Channel of type 'Examples' (1 artifact) at 0x7f2c9a122fd0.type_nameExamples._artifacts[0] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Examples' (uri: tfx/CsvExampleGen/examples/1) at 0x7f2d105df350.type<class 'tfx.types.standard_artifacts.Examples'>.uritfx/CsvExampleGen/examples/1.span0.split_names[""train"", ""eval""].version0"

0,1
.inputs,{}
.outputs,"['examples'] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Channel of type 'Examples' (1 artifact) at 0x7f2c9a122fd0.type_nameExamples._artifacts[0] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Examples' (uri: tfx/CsvExampleGen/examples/1) at 0x7f2d105df350.type<class 'tfx.types.standard_artifacts.Examples'>.uritfx/CsvExampleGen/examples/1.span0.split_names[""train"", ""eval""].version0"
.exec_properties,"['input_base']data['input_config']{  ""splits"": [  {  ""name"": ""single_split"",  ""pattern"": ""*""  }  ] }['output_config']{  ""split_config"": {  ""splits"": [  {  ""hash_buckets"": 2,  ""name"": ""train""  },  {  ""hash_buckets"": 1,  ""name"": ""eval""  }  ]  } }['output_data_format']6['custom_config']None['range_config']None['span']0['version']None['input_fingerprint']split:single_split,num_files:1,total_bytes:78956236,xor_checksum:1616487535,sum_checksum:1616487535"

0,1
['examples'],"function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Channel of type 'Examples' (1 artifact) at 0x7f2c9a122fd0.type_nameExamples._artifacts[0] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Examples' (uri: tfx/CsvExampleGen/examples/1) at 0x7f2d105df350.type<class 'tfx.types.standard_artifacts.Examples'>.uritfx/CsvExampleGen/examples/1.span0.split_names[""train"", ""eval""].version0"

0,1
.type_name,Examples
._artifacts,"[0] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Examples' (uri: tfx/CsvExampleGen/examples/1) at 0x7f2d105df350.type<class 'tfx.types.standard_artifacts.Examples'>.uritfx/CsvExampleGen/examples/1.span0.split_names[""train"", ""eval""].version0"

0,1
[0],"function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Examples' (uri: tfx/CsvExampleGen/examples/1) at 0x7f2d105df350.type<class 'tfx.types.standard_artifacts.Examples'>.uritfx/CsvExampleGen/examples/1.span0.split_names[""train"", ""eval""].version0"

0,1
.type,<class 'tfx.types.standard_artifacts.Examples'>
.uri,tfx/CsvExampleGen/examples/1
.span,0
.split_names,"[""train"", ""eval""]"
.version,0

0,1
['input_base'],data
['input_config'],"{  ""splits"": [  {  ""name"": ""single_split"",  ""pattern"": ""*""  }  ] }"
['output_config'],"{  ""split_config"": {  ""splits"": [  {  ""hash_buckets"": 2,  ""name"": ""train""  },  {  ""hash_buckets"": 1,  ""name"": ""eval""  }  ]  } }"
['output_data_format'],6
['custom_config'],
['range_config'],
['span'],0
['version'],
['input_fingerprint'],"split:single_split,num_files:1,total_bytes:78956236,xor_checksum:1616487535,sum_checksum:1616487535"

0,1
['examples'],"function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Channel of type 'Examples' (1 artifact) at 0x7f2c9a122fd0.type_nameExamples._artifacts[0] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Examples' (uri: tfx/CsvExampleGen/examples/1) at 0x7f2d105df350.type<class 'tfx.types.standard_artifacts.Examples'>.uritfx/CsvExampleGen/examples/1.span0.split_names[""train"", ""eval""].version0"

0,1
.type_name,Examples
._artifacts,"[0] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Examples' (uri: tfx/CsvExampleGen/examples/1) at 0x7f2d105df350.type<class 'tfx.types.standard_artifacts.Examples'>.uritfx/CsvExampleGen/examples/1.span0.split_names[""train"", ""eval""].version0"

0,1
[0],"function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Examples' (uri: tfx/CsvExampleGen/examples/1) at 0x7f2d105df350.type<class 'tfx.types.standard_artifacts.Examples'>.uritfx/CsvExampleGen/examples/1.span0.split_names[""train"", ""eval""].version0"

0,1
.type,<class 'tfx.types.standard_artifacts.Examples'>
.uri,tfx/CsvExampleGen/examples/1
.span,0
.split_names,"[""train"", ""eval""]"
.version,0


### Split the dataset into subsets

In [8]:
output = example_gen_pb2.Output(
    split_config=example_gen_pb2.SplitConfig(splits=[
        example_gen_pb2.SplitConfig.Split(name='train', hash_buckets=6),
        example_gen_pb2.SplitConfig.Split(name='eval', hash_buckets=2),
        example_gen_pb2.SplitConfig.Split(name='test', hash_buckets=2)
    ]))

example_gen = CsvExampleGen(input_base=data_dir, output_config=output)

context.run(example_gen)

0,1
.execution_id,2
.component,"function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } CsvExampleGen at 0x7f2c9848d5d0.inputs{}.outputs['examples'] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Channel of type 'Examples' (1 artifact) at 0x7f2c9848ded0.type_nameExamples._artifacts[0] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Examples' (uri: tfx/CsvExampleGen/examples/2) at 0x7f2c9a197dd0.type<class 'tfx.types.standard_artifacts.Examples'>.uritfx/CsvExampleGen/examples/2.span0.split_names[""train"", ""eval"", ""test""].version0.exec_properties['input_base']data['input_config']{  ""splits"": [  {  ""name"": ""single_split"",  ""pattern"": ""*""  }  ] }['output_config']{  ""split_config"": {  ""splits"": [  {  ""hash_buckets"": 6,  ""name"": ""train""  },  {  ""hash_buckets"": 2,  ""name"": ""eval""  },  {  ""hash_buckets"": 2,  ""name"": ""test""  }  ]  } }['output_data_format']6['custom_config']None['range_config']None['span']0['version']None['input_fingerprint']split:single_split,num_files:1,total_bytes:78956236,xor_checksum:1616487535,sum_checksum:1616487535"
.component.inputs,{}
.component.outputs,"['examples'] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Channel of type 'Examples' (1 artifact) at 0x7f2c9848ded0.type_nameExamples._artifacts[0] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Examples' (uri: tfx/CsvExampleGen/examples/2) at 0x7f2c9a197dd0.type<class 'tfx.types.standard_artifacts.Examples'>.uritfx/CsvExampleGen/examples/2.span0.split_names[""train"", ""eval"", ""test""].version0"

0,1
.inputs,{}
.outputs,"['examples'] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Channel of type 'Examples' (1 artifact) at 0x7f2c9848ded0.type_nameExamples._artifacts[0] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Examples' (uri: tfx/CsvExampleGen/examples/2) at 0x7f2c9a197dd0.type<class 'tfx.types.standard_artifacts.Examples'>.uritfx/CsvExampleGen/examples/2.span0.split_names[""train"", ""eval"", ""test""].version0"
.exec_properties,"['input_base']data['input_config']{  ""splits"": [  {  ""name"": ""single_split"",  ""pattern"": ""*""  }  ] }['output_config']{  ""split_config"": {  ""splits"": [  {  ""hash_buckets"": 6,  ""name"": ""train""  },  {  ""hash_buckets"": 2,  ""name"": ""eval""  },  {  ""hash_buckets"": 2,  ""name"": ""test""  }  ]  } }['output_data_format']6['custom_config']None['range_config']None['span']0['version']None['input_fingerprint']split:single_split,num_files:1,total_bytes:78956236,xor_checksum:1616487535,sum_checksum:1616487535"

0,1
['examples'],"function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Channel of type 'Examples' (1 artifact) at 0x7f2c9848ded0.type_nameExamples._artifacts[0] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Examples' (uri: tfx/CsvExampleGen/examples/2) at 0x7f2c9a197dd0.type<class 'tfx.types.standard_artifacts.Examples'>.uritfx/CsvExampleGen/examples/2.span0.split_names[""train"", ""eval"", ""test""].version0"

0,1
.type_name,Examples
._artifacts,"[0] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Examples' (uri: tfx/CsvExampleGen/examples/2) at 0x7f2c9a197dd0.type<class 'tfx.types.standard_artifacts.Examples'>.uritfx/CsvExampleGen/examples/2.span0.split_names[""train"", ""eval"", ""test""].version0"

0,1
[0],"function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Examples' (uri: tfx/CsvExampleGen/examples/2) at 0x7f2c9a197dd0.type<class 'tfx.types.standard_artifacts.Examples'>.uritfx/CsvExampleGen/examples/2.span0.split_names[""train"", ""eval"", ""test""].version0"

0,1
.type,<class 'tfx.types.standard_artifacts.Examples'>
.uri,tfx/CsvExampleGen/examples/2
.span,0
.split_names,"[""train"", ""eval"", ""test""]"
.version,0

0,1
['input_base'],data
['input_config'],"{  ""splits"": [  {  ""name"": ""single_split"",  ""pattern"": ""*""  }  ] }"
['output_config'],"{  ""split_config"": {  ""splits"": [  {  ""hash_buckets"": 6,  ""name"": ""train""  },  {  ""hash_buckets"": 2,  ""name"": ""eval""  },  {  ""hash_buckets"": 2,  ""name"": ""test""  }  ]  } }"
['output_data_format'],6
['custom_config'],
['range_config'],
['span'],0
['version'],
['input_fingerprint'],"split:single_split,num_files:1,total_bytes:78956236,xor_checksum:1616487535,sum_checksum:1616487535"

0,1
['examples'],"function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Channel of type 'Examples' (1 artifact) at 0x7f2c9848ded0.type_nameExamples._artifacts[0] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Examples' (uri: tfx/CsvExampleGen/examples/2) at 0x7f2c9a197dd0.type<class 'tfx.types.standard_artifacts.Examples'>.uritfx/CsvExampleGen/examples/2.span0.split_names[""train"", ""eval"", ""test""].version0"

0,1
.type_name,Examples
._artifacts,"[0] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Examples' (uri: tfx/CsvExampleGen/examples/2) at 0x7f2c9a197dd0.type<class 'tfx.types.standard_artifacts.Examples'>.uritfx/CsvExampleGen/examples/2.span0.split_names[""train"", ""eval"", ""test""].version0"

0,1
[0],"function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Examples' (uri: tfx/CsvExampleGen/examples/2) at 0x7f2c9a197dd0.type<class 'tfx.types.standard_artifacts.Examples'>.uritfx/CsvExampleGen/examples/2.span0.split_names[""train"", ""eval"", ""test""].version0"

0,1
.type,<class 'tfx.types.standard_artifacts.Examples'>
.uri,tfx/CsvExampleGen/examples/2
.span,0
.split_names,"[""train"", ""eval"", ""test""]"
.version,0


### Preserve existing splits

In [9]:
!mkdir -p data/train
!mkdir -p data/eval
!mkdir -p data/test

In [10]:
df.iloc[:20000].to_csv('data/train/20k-consumer-complaints-training.csv', index=False)
df.iloc[20000:24000].to_csv('data/eval/4k-consumer-complaints-eval.csv', index=False)
df.iloc[24000:26000].to_csv('data/test/2k-consumer-complaints-test.csv', index=False)

In [11]:
input = example_gen_pb2.Input(splits=[
    example_gen_pb2.Input.Split(name='train', pattern='train/*'),
    example_gen_pb2.Input.Split(name='eval', pattern='eval/*'),
    example_gen_pb2.Input.Split(name='test', pattern='test/*')
])

example_gen = CsvExampleGen(input_base=data_dir, input_config=input)
context.run(example_gen)

0,1
.execution_id,3
.component,"function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } CsvExampleGen at 0x7f2c98960e90.inputs{}.outputs['examples'] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Channel of type 'Examples' (1 artifact) at 0x7f2c98960850.type_nameExamples._artifacts[0] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Examples' (uri: tfx/CsvExampleGen/examples/3) at 0x7f2c97e64590.type<class 'tfx.types.standard_artifacts.Examples'>.uritfx/CsvExampleGen/examples/3.span0.split_names[""train"", ""eval"", ""test""].version0.exec_properties['input_base']data['input_config']{  ""splits"": [  {  ""name"": ""train"",  ""pattern"": ""train/*""  },  {  ""name"": ""eval"",  ""pattern"": ""eval/*""  },  {  ""name"": ""test"",  ""pattern"": ""test/*""  }  ] }['output_config']{}['output_data_format']6['custom_config']None['range_config']None['span']0['version']None['input_fingerprint']split:train,num_files:1,total_bytes:23608185,xor_checksum:1616487600,sum_checksum:1616487600 split:eval,num_files:1,total_bytes:4759360,xor_checksum:1616487601,sum_checksum:1616487601 split:test,num_files:1,total_bytes:2428892,xor_checksum:1616487601,sum_checksum:1616487601"
.component.inputs,{}
.component.outputs,"['examples'] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Channel of type 'Examples' (1 artifact) at 0x7f2c98960850.type_nameExamples._artifacts[0] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Examples' (uri: tfx/CsvExampleGen/examples/3) at 0x7f2c97e64590.type<class 'tfx.types.standard_artifacts.Examples'>.uritfx/CsvExampleGen/examples/3.span0.split_names[""train"", ""eval"", ""test""].version0"

0,1
.inputs,{}
.outputs,"['examples'] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Channel of type 'Examples' (1 artifact) at 0x7f2c98960850.type_nameExamples._artifacts[0] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Examples' (uri: tfx/CsvExampleGen/examples/3) at 0x7f2c97e64590.type<class 'tfx.types.standard_artifacts.Examples'>.uritfx/CsvExampleGen/examples/3.span0.split_names[""train"", ""eval"", ""test""].version0"
.exec_properties,"['input_base']data['input_config']{  ""splits"": [  {  ""name"": ""train"",  ""pattern"": ""train/*""  },  {  ""name"": ""eval"",  ""pattern"": ""eval/*""  },  {  ""name"": ""test"",  ""pattern"": ""test/*""  }  ] }['output_config']{}['output_data_format']6['custom_config']None['range_config']None['span']0['version']None['input_fingerprint']split:train,num_files:1,total_bytes:23608185,xor_checksum:1616487600,sum_checksum:1616487600 split:eval,num_files:1,total_bytes:4759360,xor_checksum:1616487601,sum_checksum:1616487601 split:test,num_files:1,total_bytes:2428892,xor_checksum:1616487601,sum_checksum:1616487601"

0,1
['examples'],"function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Channel of type 'Examples' (1 artifact) at 0x7f2c98960850.type_nameExamples._artifacts[0] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Examples' (uri: tfx/CsvExampleGen/examples/3) at 0x7f2c97e64590.type<class 'tfx.types.standard_artifacts.Examples'>.uritfx/CsvExampleGen/examples/3.span0.split_names[""train"", ""eval"", ""test""].version0"

0,1
.type_name,Examples
._artifacts,"[0] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Examples' (uri: tfx/CsvExampleGen/examples/3) at 0x7f2c97e64590.type<class 'tfx.types.standard_artifacts.Examples'>.uritfx/CsvExampleGen/examples/3.span0.split_names[""train"", ""eval"", ""test""].version0"

0,1
[0],"function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Examples' (uri: tfx/CsvExampleGen/examples/3) at 0x7f2c97e64590.type<class 'tfx.types.standard_artifacts.Examples'>.uritfx/CsvExampleGen/examples/3.span0.split_names[""train"", ""eval"", ""test""].version0"

0,1
.type,<class 'tfx.types.standard_artifacts.Examples'>
.uri,tfx/CsvExampleGen/examples/3
.span,0
.split_names,"[""train"", ""eval"", ""test""]"
.version,0

0,1
['input_base'],data
['input_config'],"{  ""splits"": [  {  ""name"": ""train"",  ""pattern"": ""train/*""  },  {  ""name"": ""eval"",  ""pattern"": ""eval/*""  },  {  ""name"": ""test"",  ""pattern"": ""test/*""  }  ] }"
['output_config'],{}
['output_data_format'],6
['custom_config'],
['range_config'],
['span'],0
['version'],
['input_fingerprint'],"split:train,num_files:1,total_bytes:23608185,xor_checksum:1616487600,sum_checksum:1616487600 split:eval,num_files:1,total_bytes:4759360,xor_checksum:1616487601,sum_checksum:1616487601 split:test,num_files:1,total_bytes:2428892,xor_checksum:1616487601,sum_checksum:1616487601"

0,1
['examples'],"function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Channel of type 'Examples' (1 artifact) at 0x7f2c98960850.type_nameExamples._artifacts[0] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Examples' (uri: tfx/CsvExampleGen/examples/3) at 0x7f2c97e64590.type<class 'tfx.types.standard_artifacts.Examples'>.uritfx/CsvExampleGen/examples/3.span0.split_names[""train"", ""eval"", ""test""].version0"

0,1
.type_name,Examples
._artifacts,"[0] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Examples' (uri: tfx/CsvExampleGen/examples/3) at 0x7f2c97e64590.type<class 'tfx.types.standard_artifacts.Examples'>.uritfx/CsvExampleGen/examples/3.span0.split_names[""train"", ""eval"", ""test""].version0"

0,1
[0],"function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Examples' (uri: tfx/CsvExampleGen/examples/3) at 0x7f2c97e64590.type<class 'tfx.types.standard_artifacts.Examples'>.uritfx/CsvExampleGen/examples/3.span0.split_names[""train"", ""eval"", ""test""].version0"

0,1
.type,<class 'tfx.types.standard_artifacts.Examples'>
.uri,tfx/CsvExampleGen/examples/3
.span,0
.split_names,"[""train"", ""eval"", ""test""]"
.version,0


### Spanning Datasets

In [12]:
!mkdir -p data/export-0
!mkdir -p data/export-1
!mkdir -p data/export-2

In [13]:
df.iloc[:20000].to_csv('data/export-0/20k-consumer-complaints.csv', index=False)
df.iloc[24000].to_csv('data/export-1/24k-consumer-complaints.csv', index=False)
df.iloc[26000].to_csv('data/export-2/26k-consumer-complaints.csv', index=False)

In [14]:
input = example_gen_pb2.Input(splits=[
    example_gen_pb2.Input.Split(pattern='export-{SPAN}/*')
])

example_gen = CsvExampleGen(input_base=data_dir, input_config=input)
context.run(example_gen)

0,1
.execution_id,4
.component,"function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } CsvExampleGen at 0x7f2c98960bd0.inputs{}.outputs['examples'] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Channel of type 'Examples' (1 artifact) at 0x7f2c98960b10.type_nameExamples._artifacts[0] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Examples' (uri: tfx/CsvExampleGen/examples/4) at 0x7f2c97e67ed0.type<class 'tfx.types.standard_artifacts.Examples'>.uritfx/CsvExampleGen/examples/4.span0.split_names[""train"", ""eval""].version0.exec_properties['input_base']data['input_config']{  ""splits"": [  {  ""pattern"": ""export-2/*""  }  ] }['output_config']{  ""split_config"": {  ""splits"": [  {  ""hash_buckets"": 2,  ""name"": ""train""  },  {  ""hash_buckets"": 1,  ""name"": ""eval""  }  ]  } }['output_data_format']6['custom_config']None['range_config']None['span']2['version']None['input_fingerprint']split:,num_files:1,total_bytes:412,xor_checksum:1616487615,sum_checksum:1616487615"
.component.inputs,{}
.component.outputs,"['examples'] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Channel of type 'Examples' (1 artifact) at 0x7f2c98960b10.type_nameExamples._artifacts[0] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Examples' (uri: tfx/CsvExampleGen/examples/4) at 0x7f2c97e67ed0.type<class 'tfx.types.standard_artifacts.Examples'>.uritfx/CsvExampleGen/examples/4.span0.split_names[""train"", ""eval""].version0"

0,1
.inputs,{}
.outputs,"['examples'] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Channel of type 'Examples' (1 artifact) at 0x7f2c98960b10.type_nameExamples._artifacts[0] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Examples' (uri: tfx/CsvExampleGen/examples/4) at 0x7f2c97e67ed0.type<class 'tfx.types.standard_artifacts.Examples'>.uritfx/CsvExampleGen/examples/4.span0.split_names[""train"", ""eval""].version0"
.exec_properties,"['input_base']data['input_config']{  ""splits"": [  {  ""pattern"": ""export-2/*""  }  ] }['output_config']{  ""split_config"": {  ""splits"": [  {  ""hash_buckets"": 2,  ""name"": ""train""  },  {  ""hash_buckets"": 1,  ""name"": ""eval""  }  ]  } }['output_data_format']6['custom_config']None['range_config']None['span']2['version']None['input_fingerprint']split:,num_files:1,total_bytes:412,xor_checksum:1616487615,sum_checksum:1616487615"

0,1
['examples'],"function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Channel of type 'Examples' (1 artifact) at 0x7f2c98960b10.type_nameExamples._artifacts[0] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Examples' (uri: tfx/CsvExampleGen/examples/4) at 0x7f2c97e67ed0.type<class 'tfx.types.standard_artifacts.Examples'>.uritfx/CsvExampleGen/examples/4.span0.split_names[""train"", ""eval""].version0"

0,1
.type_name,Examples
._artifacts,"[0] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Examples' (uri: tfx/CsvExampleGen/examples/4) at 0x7f2c97e67ed0.type<class 'tfx.types.standard_artifacts.Examples'>.uritfx/CsvExampleGen/examples/4.span0.split_names[""train"", ""eval""].version0"

0,1
[0],"function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Examples' (uri: tfx/CsvExampleGen/examples/4) at 0x7f2c97e67ed0.type<class 'tfx.types.standard_artifacts.Examples'>.uritfx/CsvExampleGen/examples/4.span0.split_names[""train"", ""eval""].version0"

0,1
.type,<class 'tfx.types.standard_artifacts.Examples'>
.uri,tfx/CsvExampleGen/examples/4
.span,0
.split_names,"[""train"", ""eval""]"
.version,0

0,1
['input_base'],data
['input_config'],"{  ""splits"": [  {  ""pattern"": ""export-2/*""  }  ] }"
['output_config'],"{  ""split_config"": {  ""splits"": [  {  ""hash_buckets"": 2,  ""name"": ""train""  },  {  ""hash_buckets"": 1,  ""name"": ""eval""  }  ]  } }"
['output_data_format'],6
['custom_config'],
['range_config'],
['span'],2
['version'],
['input_fingerprint'],"split:,num_files:1,total_bytes:412,xor_checksum:1616487615,sum_checksum:1616487615"

0,1
['examples'],"function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Channel of type 'Examples' (1 artifact) at 0x7f2c98960b10.type_nameExamples._artifacts[0] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Examples' (uri: tfx/CsvExampleGen/examples/4) at 0x7f2c97e67ed0.type<class 'tfx.types.standard_artifacts.Examples'>.uritfx/CsvExampleGen/examples/4.span0.split_names[""train"", ""eval""].version0"

0,1
.type_name,Examples
._artifacts,"[0] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Examples' (uri: tfx/CsvExampleGen/examples/4) at 0x7f2c97e67ed0.type<class 'tfx.types.standard_artifacts.Examples'>.uritfx/CsvExampleGen/examples/4.span0.split_names[""train"", ""eval""].version0"

0,1
[0],"function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Examples' (uri: tfx/CsvExampleGen/examples/4) at 0x7f2c97e67ed0.type<class 'tfx.types.standard_artifacts.Examples'>.uritfx/CsvExampleGen/examples/4.span0.split_names[""train"", ""eval""].version0"

0,1
.type,<class 'tfx.types.standard_artifacts.Examples'>
.uri,tfx/CsvExampleGen/examples/4
.span,0
.split_names,"[""train"", ""eval""]"
.version,0
