# Preprocessing with TensorFlow pipelines

In [28]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers
from sklearn.model_selection import train_test_split

In [2]:
# Load the autos CSV from the relative path data/autos.csv. The file is decoded
# as Windows-1252 (cp1252); index_col=None means no column is used as the index.
data = pd.read_csv('data/autos.csv', encoding='cp1252', index_col=None)

In [30]:
# Split the raw frame with a fixed seed so the partition is reproducible.
# NOTE(review): test_size=0.7 puts 70% of the rows in `test`, leaving only 30%
# for `train` -- confirm this ratio is intended.
train, test = train_test_split(data, random_state=666, test_size=0.7)

# Replace missing values in the training split with the literal string "NA".
# Only `train` is filled in this cell; `test` is left as returned by the split.
train = train.fillna(value='NA')

In [101]:
# Per-feature mean and standard deviation of the numerical columns, computed
# on the training split only. np.std is called with its default arguments.
numerical_features = ['yearOfRegistration', 'powerPS', 'kilometer']

# One record per numerical feature; the column order is fixed explicitly below.
normalization_rows = [
    {
        'feature': column,
        'mean': np.mean(train[column]),
        'std': np.std(train[column]),
    }
    for column in numerical_features
]
numerical_features_normalization = pd.DataFrame(
    normalization_rows, columns=['feature', 'mean', 'std']
)

numerical_features_normalization

Unnamed: 0,feature,mean,std
0,yearOfRegistration,2004.536785,93.669427
1,powerPS,114.732231,175.738757
2,kilometer,125575.956863,40153.150424


In [5]:
batch_size = 5 # A small batch size is used for demonstration purposes
# NOTE(review): df_to_dataset is not defined in the cells shown here -- confirm
# it is defined (or imported) before this cell runs on a fresh kernel.
train_ds = df_to_dataset(train, batch_size=batch_size)

In [91]:
def set_numerical_feature(name):
    """Create a numeric feature column for the input feature called `name`.

    Args:
        name: Key of the feature, passed straight to
            `tf.feature_column.numeric_column`.

    Returns:
        The object returned by `tf.feature_column.numeric_column(name)`.
        No normalization is configured on the column.
    """
    return tf.feature_column.numeric_column(name)
    
    
    
def set_one_hot_feature(name, data):
    """Create an indicator column for the categorical feature `name`.

    Args:
        name: Key of the feature; also used to select `data[name]`.
        data: Table-like object whose `data[name].unique().tolist()` yields the
            vocabulary for the column.

    Returns:
        The result of `tf.feature_column.indicator_column` applied to a
        vocabulary-list categorical column for `name`.
    """
    # The vocabulary is whatever distinct values the supplied data contains.
    vocabulary = data[name].unique().tolist()
    categorical = tf.feature_column.categorical_column_with_vocabulary_list(name, vocabulary)
    return tf.feature_column.indicator_column(categorical)

def set_embedding_feature(name, data, dims):
    """Create an embedding column for the categorical feature `name`.

    Args:
        name: Key of the feature; also used to select `data[name]`.
        data: Table-like object whose `data[name].unique().tolist()` yields the
            vocabulary for the column.
        dims: Passed as `dimension` to `tf.feature_column.embedding_column`.

    Returns:
        The result of `tf.feature_column.embedding_column` applied to a
        vocabulary-list categorical column for `name`.
    """
    # The vocabulary is whatever distinct values the supplied data contains.
    vocabulary = data[name].unique().tolist()
    categorical = tf.feature_column.categorical_column_with_vocabulary_list(name, vocabulary)
    return tf.feature_column.embedding_column(categorical, dimension=dims)

In [92]:
def feature_columns(data, dataset):
    """Assemble the model's feature columns into a `DenseFeatures` layer.

    Three numeric columns come first, followed by the categorical columns in a
    fixed order; each categorical column is either wrapped as an indicator
    column or as an embedding column of a given size.

    Args:
        data: Frame handed to `set_one_hot_feature` / `set_embedding_feature`
            to derive each categorical vocabulary.
        dataset: Accepted for interface compatibility; not used in the body.

    Returns:
        `tf.keras.layers.DenseFeatures` built from the assembled column list.
    """
    numeric_names = ('yearOfRegistration', 'powerPS', 'kilometer')

    # (column name, embedding size); a size of None selects the one-hot helper.
    categorical_plan = (
        ('abtest', None),
        ('vehicleType', 4),
        ('gearbox', None),
        ('model', 8),
        ('fuelType', None),
        ('brand', 6),
        ('notRepairedDamage', None),
        ('postalCode', 10),
    )

    columns = [set_numerical_feature(column) for column in numeric_names]

    for column, embedding_size in categorical_plan:
        if embedding_size is None:
            columns.append(set_one_hot_feature(column, data))
        else:
            columns.append(set_embedding_feature(column, data, embedding_size))

    return tf.keras.layers.DenseFeatures(columns)

In [93]:
# Build the DenseFeatures layer; categorical vocabularies are derived from
# `train`. The second argument (train_ds) is not used by feature_columns.
f_layer = feature_columns(train, train_ds)

In [58]:
# Rebind batch_size and train_ds using a batch size of 150.
# NOTE(review): df_to_dataset is not defined in the cells shown here -- confirm
# it is available before this cell runs on a fresh kernel.
batch_size = 150

train_ds = df_to_dataset(train, batch_size=batch_size)

In [64]:
# Inspect the distinct registration years present in the training split.
# NOTE(review): the recorded output contains implausible years (e.g. 1000,
# 9999); these feed into the mean/std computed for this column -- consider
# filtering or clipping before normalizing.
train.yearOfRegistration.unique()

array([2009, 2005, 2010, 2011, 2004, 1999, 2007, 2001, 1996, 2013, 1998,
       2006, 1995, 1990, 1988, 1994, 1997, 2000, 2003, 1992, 2018, 2016,
       2008, 2017, 2012, 2002, 1991, 1980, 2015, 1933, 1977, 2014, 1974,
       1993, 1989, 1979, 1973, 1966, 1986, 1985, 1970, 1963, 1987, 1975,
       1959, 1951, 1976, 1969, 1983, 1000, 1978, 1982, 1001, 1972, 1984,
       1964, 8000, 1981, 1910, 6000, 1971, 1967, 1956, 1200, 1965, 1960,
       4800, 1968, 4000, 1957, 1955, 1937, 1931, 2222, 1961, 1962, 9999,
       1949, 2019, 1915, 1958, 1950, 1253, 1953, 1946, 1952, 1954, 1941,
       1936, 1935, 3000, 5000, 1800, 9000, 1939, 1111, 1948, 9450, 1938,
       1234, 2290, 7500, 1925, 3700, 1930, 5911, 1934, 1943, 2200, 1929,
       1600, 3200, 8500, 1940, 1919], dtype=int64)