## Prepare Data Pipeline by using tf.data 

The tf.data.Dataset API supports writing descriptive and efficient input pipelines. 

Dataset usage follows a common pattern:
  * Create a source dataset from your input data.
  * Apply dataset transformations to preprocess the data.
  * Iterate over the dataset and process the elements.

Iteration happens in a streaming fashion, so the full dataset does not need to fit into memory.

<b>Pandas Category Type</b> 
One of the main use cases for categorical data types is more efficient memory usage.

In [1]:
import pathlib 

data_dir = pathlib.Path("../datasets/big_ds/img-001/")

In [2]:
import pandas as pd 
data = pd.read_csv("../datasets/attribute_set/list_attr_img.txt", delim_whitespace=True, names= ['paths'] + list(range(1000)))

In [3]:
data.head() 

Unnamed: 0,paths,0,1,2,3,4,5,6,7,8,...,990,991,992,993,994,995,996,997,998,999
0,img/Sheer_Pleated-Front_Blouse/img_00000001.jpg,-1,-1,-1,-1,-1,-1,-1,-1,-1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
1,img/Sheer_Pleated-Front_Blouse/img_00000002.jpg,-1,-1,-1,-1,-1,-1,-1,-1,-1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
2,img/Sheer_Pleated-Front_Blouse/img_00000003.jpg,-1,-1,-1,-1,-1,-1,-1,-1,-1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
3,img/Sheer_Pleated-Front_Blouse/img_00000004.jpg,-1,-1,-1,-1,-1,-1,-1,-1,-1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
4,img/Sheer_Pleated-Front_Blouse/img_00000005.jpg,-1,-1,-1,-1,-1,-1,-1,-1,-1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1


In [4]:
cols = pd.read_fwf("../datasets/attribute_set/list_attr_cloth.txt", delim_whitespace=True) 
cols.head() 

Unnamed: 0,attribute_name attribute_type
0,a-line 3
1,abstract 1
2,abstract chevron 1
3,abstract chevron print 1
4,abstract diamond 1


In [5]:
rows = [] 
for i, row in cols.iterrows():
    atts = str(row[0]).split()  
    ttype = atts[-1] 
    attr = " ".join(atts[:-1]) 
    rows.append([attr, ttype]) 
    
rows[:5], len(rows)

([['a-line', '3'],
  ['abstract', '1'],
  ['abstract chevron', '1'],
  ['abstract chevron print', '1'],
  ['abstract diamond', '1']],
 1000)

In [6]:
attributes = pd.DataFrame(
    rows, 
    columns=str(cols.columns.to_numpy()[0]).split('  ')
)

attributes

Unnamed: 0,attribute_name,attribute_type
0,a-line,3
1,abstract,1
2,abstract chevron,1
3,abstract chevron print,1
4,abstract diamond,1
...,...,...
995,zip-pocket,4
996,zip-up,4
997,zipped,4
998,zipper,4


In [None]:
attributes.to_csv("../datasets/attribute_set/custom_attr_definitions.csv")

In [7]:
data.columns = ["paths"] + list(attributes.attribute_name.values)
data.head() 

Unnamed: 0,paths,a-line,abstract,abstract chevron,abstract chevron print,abstract diamond,abstract floral,abstract floral print,abstract geo,abstract geo print,...,zeppelin,zig,zigzag,zip,zip-front,zip-pocket,zip-up,zipped,zipper,zippered
0,img/Sheer_Pleated-Front_Blouse/img_00000001.jpg,-1,-1,-1,-1,-1,-1,-1,-1,-1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
1,img/Sheer_Pleated-Front_Blouse/img_00000002.jpg,-1,-1,-1,-1,-1,-1,-1,-1,-1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
2,img/Sheer_Pleated-Front_Blouse/img_00000003.jpg,-1,-1,-1,-1,-1,-1,-1,-1,-1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
3,img/Sheer_Pleated-Front_Blouse/img_00000004.jpg,-1,-1,-1,-1,-1,-1,-1,-1,-1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
4,img/Sheer_Pleated-Front_Blouse/img_00000005.jpg,-1,-1,-1,-1,-1,-1,-1,-1,-1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1


In [13]:
variables = set()
for col in data.columns[1:]: 
    [variables.add(item) for item in data[col].unique()]
    
variables

AttributeError: 'DataFrame' object has no attribute 'unique'

In [14]:
data.info() 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 289222 entries, 0 to 289221
Columns: 1001 entries, paths to zippered
dtypes: int64(1000), object(1)
memory usage: 2.2+ GB


In [22]:
for col in range(1, 1001): 
    data.iloc[:, col] = data.iloc[:, col].astype('category').cat.codes.values
    if col % 100 == 0: 
        print(col)

100
200
300
400
500
600
700
800
900
1000


In [23]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 289222 entries, 0 to 289221
Columns: 1001 entries, paths to zippered
dtypes: int8(1000), object(1)
memory usage: 278.0+ MB


In [25]:
data.head() 

Unnamed: 0,paths,a-line,abstract,abstract chevron,abstract chevron print,abstract diamond,abstract floral,abstract floral print,abstract geo,abstract geo print,...,zeppelin,zig,zigzag,zip,zip-front,zip-pocket,zip-up,zipped,zipper,zippered
0,img/Sheer_Pleated-Front_Blouse/img_00000001.jpg,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,img/Sheer_Pleated-Front_Blouse/img_00000002.jpg,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,img/Sheer_Pleated-Front_Blouse/img_00000003.jpg,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,img/Sheer_Pleated-Front_Blouse/img_00000004.jpg,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,img/Sheer_Pleated-Front_Blouse/img_00000005.jpg,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# Save the dataset with helpers 

In [24]:
def to_csv(dataframe: pd.DataFrame, path:str): 
    """
        It save the dataframe into the folder which path points. 
        Also, it will add the dtypes as a record as a last row. 
    """ 
    dataframe.loc[-1] = dataframe.dtypes 
    dataframe.index = dataframe.index + 1 
    dataframe.sort_index(inplace=True) 
    dataframe.to_csv(path, index=False) 

def read_csv_with_dtypes(path:str):
    """
        CSV file has to have its dtypes at the last row. 
    """
    dtypes = pd.read_csv(path, nrows=1).iloc[0].to_dict() 
    return pd.read_csv(path, dtype=dtypes, skiprows=[1])  
    

In [26]:
to_csv(data, "../datasets/attribute_set/custom_attr.csv")