# Create model architecture

In [1]:
from tensorflow.keras import layers
from tensorflow import keras, dtypes
from tensorflow.data import Dataset
from tensorflow import feature_column
from preprocess_tf import preprocessing_fn

In [2]:
# A utility method to create a tf.data dataset from a Pandas Dataframe
def df_to_dataset(dataframe, target_name, shuffle=True, batch_size=100):
    """Build a batched tf.data dataset of (features, target) pairs from a DataFrame.

    Args:
        dataframe: Pandas DataFrame holding the feature columns and the target
            column. It is copied first, so the caller's frame is not modified.
        target_name: Name of the column to split off as the target.
        shuffle: When True, shuffle with a buffer as large as the frame before
            batching.
        batch_size: Number of rows per batch.

    Returns:
        A dataset whose elements are ``(dict of column name -> values, targets)``
        batches of at most ``batch_size`` rows.
    """
    features = dataframe.copy()
    targets = features.pop(target_name)
    ds = Dataset.from_tensor_slices((dict(features), targets))
    if shuffle:
        ds = ds.shuffle(buffer_size=len(features))
    # Batch regardless of `shuffle`: previously batching only happened inside the
    # shuffle branch, so shuffle=False returned an unbatched dataset and silently
    # ignored batch_size.
    ds = ds.batch(batch_size)
    return ds

In [3]:
def set_numerical_feature(name):
    """Return a numeric feature column for ``name``, declared with dtype float64."""
    return feature_column.numeric_column(name, dtype=dtypes.float64)
    
def set_one_hot_feature(name, data):
    """Return an indicator (one-hot) feature column for the categorical column ``name``.

    The vocabulary is the list of unique values found in ``data[name]``.
    """
    vocabulary = data[name].unique().tolist()
    categorical = feature_column.categorical_column_with_vocabulary_list(name, vocabulary)
    return feature_column.indicator_column(categorical)

def set_embedding_feature(name, data, dims):
    """Return an embedding feature column of size ``dims`` for the categorical column ``name``.

    The vocabulary is the list of unique values found in ``data[name]``.
    """
    vocabulary = data[name].unique().tolist()
    categorical = feature_column.categorical_column_with_vocabulary_list(name, vocabulary)
    return feature_column.embedding_column(categorical, dimension=dims)

In [4]:
def feature_columns(data, dataset):
    """Assemble the DenseFeatures layer that encodes the model inputs.

    Args:
        data: Frame whose columns provide the vocabularies for the categorical
            features.
        dataset: Not used by this function; kept in the signature for callers.

    Returns:
        A ``layers.DenseFeatures`` built from three numeric columns followed by
        the one-hot and embedding columns listed below.
    """
    numeric_names = ('yearOfRegistration', 'powerPS', 'kilometer')

    # (column name, embedding size); None selects a one-hot encoding instead.
    categorical_specs = (
        ('abtest', None),
        ('vehicleType', 4),
        ('gearbox', None),
        ('model', 8),
        ('fuelType', None),
        ('brand', 6),
        ('notRepairedDamage', None),
        ('postalCode', 10),
    )

    columns = [set_numerical_feature(column_name) for column_name in numeric_names]

    for column_name, embedding_dims in categorical_specs:
        if embedding_dims is None:
            columns.append(set_one_hot_feature(column_name, data))
        else:
            columns.append(set_embedding_feature(column_name, data, embedding_dims))

    return layers.DenseFeatures(columns)

In [54]:
# Preprocess the training and validation CSVs; both calls use the same
# NA-encoding file and the same normalization-parameter file.
train = preprocessing_fn(
    path='data/train.csv',
    na_encoding='data/gearbox_powerps_na.csv',
    norm_params='data/numerical_features_normalization.csv',
)
val = preprocessing_fn(
    path='data/val.csv',
    na_encoding='data/gearbox_powerps_na.csv',
    norm_params='data/numerical_features_normalization.csv',
)

In [8]:
# Inspect the column dtypes of the preprocessed training frame.
train.dtypes

dateCrawled             object
name                    object
seller                  object
offerType               object
price                    int64
abtest                  object
vehicleType             object
yearOfRegistration     float32
gearbox                 object
powerPS                float32
model                   object
kilometer              float32
monthOfRegistration      int64
fuelType                object
brand                   object
notRepairedDamage       object
dateCreated             object
nrOfPictures             int64
postalCode               int64
lastSeen                object
dtype: object

In [55]:
# Wrap both frames as tf.data datasets with 'price' as the target.
# The validation set is shuffled and batched the same way as the training set.
train_ds = df_to_dataset(
    dataframe=train,
    target_name='price',
    shuffle=True,
    batch_size=1000,
)
val_ds = df_to_dataset(
    dataframe=val,
    target_name='price',
    shuffle=True,
    batch_size=1000,
)

In [56]:
# Build the input layer; category vocabularies come from the training frame.
feature_layer = feature_columns(data=train, dataset=train_ds)

In [None]:

# x = layers.Dense(100, activation='relu')(feature_layer)

# x = layers.Dense(100, activation='relu')(x)

# output = layers.Dense(1, activation='relu')(x)

# model = keras.Model(inputs=feature_layer, outputs=output, name="my_model")

In [73]:
dp_rate = 0.5  # dropout rate; only referenced by the disabled Dropout layers noted below
lr = 0.001     # learning rate passed to Adam
hu = 150       # NOTE(review): not referenced in the visible cells; confirm before removing

# Widths of the hidden Dense layers, ordered from input to output.
hidden_units = [60, 120, 120, 240, 240, 120, 120, 120]

# Every hidden Dense layer is preceded by BatchNormalization.
# A Dropout(rate=dp_rate) after each Dense layer is currently disabled.
hidden_stack = []
for units in hidden_units:
    hidden_stack.append(layers.BatchNormalization())
    hidden_stack.append(layers.Dense(units, activation='relu'))

# NOTE(review): the ReLU on the single output unit keeps predictions >= 0, but
# its gradient is zero whenever the pre-activation is negative; consider a
# linear output if training stalls.
output_layer = layers.Dense(1, activation='relu')

model = keras.Sequential([feature_layer] + hidden_stack + [output_layer])

# Compile Keras model
model.compile(
    loss='mean_absolute_error',
    optimizer=keras.optimizers.Adam(learning_rate=lr),
)

# Keep the History returned by fit so the per-epoch losses stay available via
# history.history; assigning it also stops the cell from echoing the object repr.
history = model.fit(
    train_ds,
    validation_data=val_ds,
    epochs=15,
)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<tensorflow.python.keras.callbacks.History at 0x27c19a106a0>

In [48]:
# Summary statistics of the training target.
train['price'].describe()

count    2.057070e+05
mean     6.887453e+03
std      1.074045e+05
min      0.000000e+00
25%      1.150000e+03
50%      2.950000e+03
75%      7.200000e+03
max      1.400050e+07
Name: price, dtype: float64

In [49]:
# Summary statistics of the validation target.
val['price'].describe()

count    5.201400e+04
mean     1.255307e+04
std      7.173585e+05
min      0.000000e+00
25%      1.150000e+03
50%      2.950000e+03
75%      7.250000e+03
max      1.000000e+08
Name: price, dtype: float64