# Create model arquitecture

In [1]:
from tensorflow.keras import layers
from tensorflow import keras, dtypes
from tensorflow.data import Dataset
from tensorflow import feature_column
import tensorflow as tf

from preprocess_tf import preprocessing_fn

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

import tensorflow_docs as tfdocs
import tensorflow_docs.plots
import tensorflow_docs.modeling

import random
import os

%load_ext autoreload
%autoreload 2


In [2]:
# Seed value
# Apparently you may use different seed values at each stage
seed_value= 0

# 1. Set `PYTHONHASHSEED` environment variable at a fixed value

os.environ['PYTHONHASHSEED']=str(seed_value)

# 2. Set `python` built-in pseudo-random generator at a fixed value

random.seed(seed_value)

# 3. Set `numpy` pseudo-random generator at a fixed value
np.random.seed(seed_value)

# 4. Set the `tensorflow` pseudo-random generator at a fixed value
tf.random.set_seed(seed_value)


tf.keras.backend.set_floatx('float32')

In [3]:
# A utility method to create a tf.data dataset from a Pandas Dataframe
def df_to_dataset(dataframe, target_name, shuffle=True, batch_size=100):
    dataframe = dataframe.copy()
    targets = dataframe.pop(target_name)
    ds = Dataset.from_tensor_slices((dict(dataframe), targets))
    if shuffle:
        ds = ds.shuffle(buffer_size=len(dataframe))
    ds = ds.batch(batch_size)
    return ds

In [4]:
def r2_score(y_true, y_pred):
    ss_res = np.sum((y_true - y_pred)**2)
    ss_tot = np.sum((y_true - np.mean(y_true))**2)
    r2 = 1 - ss_res/ss_tot
    return r2

In [5]:
def set_numerical_feature(name):
    
    numerical_feature = feature_column.numeric_column(name, dtype=dtypes.float32)
    
    return numerical_feature
    
def set_one_hot_feature(name, data):
    one_hot_feature = feature_column.categorical_column_with_vocabulary_list(name, data[name].unique().tolist())
    one_hot_feature = feature_column.indicator_column(one_hot_feature)
    
    return one_hot_feature

def set_embedding_feature(name, data, dims):
    embedding_feature = feature_column.categorical_column_with_vocabulary_list(name, data[name].unique().tolist())
    embedding_feature = feature_column.embedding_column(embedding_feature, dimension=dims)
    
    return embedding_feature

In [6]:
def feature_columns(data, dataset):
    
    feature_columns = []

    # numeric cols
    for header in ['yearOfRegistration', 'powerPS', 'kilometer']:
        feature_columns.append(set_numerical_feature(header))
        
    feature_columns.append(set_one_hot_feature('abtest', data))
    
    feature_columns.append(set_one_hot_feature('vehicleType', data))
    
    feature_columns.append(set_one_hot_feature('gearbox', data))
    
    feature_columns.append(set_one_hot_feature('fuelType', data))
    
    feature_columns.append(set_one_hot_feature('brand', data))

    feature_columns.append(set_one_hot_feature('cluster_model', data))
    
    feature_columns.append(set_one_hot_feature('notRepairedDamage', data))
    
    
    feature_layer = layers.DenseFeatures(feature_columns)
    
    return(feature_layer)

In [7]:
train = preprocessing_fn(
    path='data/train.csv',
    norm_params='data/numerical_features_normalization.csv',
    model_clusters='data/model_clusters.csv'
)
val = preprocessing_fn(
    path='data/val.csv',
    norm_params='data/numerical_features_normalization.csv',
    model_clusters='data/model_clusters.csv',
)
test = preprocessing_fn(
    path='data/test.csv',
    norm_params='data/numerical_features_normalization.csv',
    model_clusters='data/model_clusters.csv'
)

train = train.dropna()
val = val.dropna()
test = test.dropna()

print(train.shape)
print(val.shape)
print(test.shape)

(88266, 11)
(22093, 11)
(47219, 11)


In [8]:
train.dtypes

price                   int64
abtest                 object
vehicleType            object
yearOfRegistration    float32
gearbox                object
powerPS               float32
kilometer             float32
fuelType               object
brand                  object
notRepairedDamage      object
cluster_model           int32
dtype: object

In [9]:
train_ds = df_to_dataset(dataframe=train, target_name='price', shuffle=True, batch_size=512)
val_ds = df_to_dataset(dataframe=val, target_name='price', shuffle=True, batch_size=512)
test_ds = df_to_dataset(dataframe=test, target_name='price', shuffle=True, batch_size=512)

In [9]:
feature_layer = feature_columns(train, train_ds)

In [11]:
dp_rate = 0.1
lr = 0.001
hu = 60
momentum = 0.9

model = keras.Sequential([
    feature_layer,
    layers.Dense(hu),
    layers.Activation('relu'),
    layers.Dropout(rate=dp_rate),
    layers.Dense(hu),
    layers.Activation('relu'),
    layers.Dropout(rate=dp_rate),
    layers.Dense(hu),
    layers.Activation('relu'),
    layers.Dropout(rate=dp_rate),
    layers.Dense(hu),
    layers.Activation('relu'),
    layers.Dropout(rate=dp_rate),
    layers.Dense(hu),
    layers.Activation('relu'),
    layers.Dropout(rate=dp_rate),
    layers.Dense(hu),
    layers.Activation('relu'),
    layers.Dense(1, activation='relu')
])

# Compile Keras model
model.compile(
    loss='mean_absolute_error',
    metrics=['mean_squared_error'],
    optimizer=keras.optimizers.Adam(learning_rate=lr))

# fit model
history = model.fit(
    train_ds,
    validation_data=val_ds,
    epochs=100,
    callbacks=[tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=10)]
)

# get results
hist = pd.DataFrame(history.history)
hist['epoch'] = history.epoch

plt.plot(hist.epoch, hist.loss)
plt.plot(hist.epoch, hist.val_loss)
plt.show()

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100


AttributeError: 'DataFrame' object has no attribute 'mean_absolute_error'

In [12]:
hist.tail(10)

Unnamed: 0,loss,mean_squared_error,val_loss,val_mean_squared_error,epoch
27,1019.054932,3669375.5,970.935486,3556235.75,27
28,1024.600098,3745526.25,988.630676,3594246.5,28
29,1015.863159,3700812.0,948.659241,3383191.5,29
30,1017.293701,3668143.5,989.330566,3569449.0,30
31,1016.506042,3802411.5,961.512512,3421793.25,31
32,1013.033081,3696856.0,964.881348,3471097.25,32
33,1009.373413,3628546.75,982.363037,3504029.0,33
34,1010.750061,3632364.25,947.265564,3366392.25,34
35,1010.270508,3624976.25,947.038086,3320341.0,35
36,1007.091858,3681255.25,990.807251,3582824.0,36


In [13]:
tf.keras.models.save_model(model, "cars_model")

Instructions for updating:
If using Keras pass *_constraint arguments to layers.
INFO:tensorflow:Assets written to: cars_model\assets


In [6]:
model = tf.keras.models.load_model("cars_model")


In [19]:
a = df_to_dataset(dataframe=val, target_name='price', shuffle=False, batch_size=1000)

preds = model.predict(a)

In [24]:
val

Unnamed: 0,price,abtest,vehicleType,yearOfRegistration,gearbox,powerPS,kilometer,fuelType,brand,notRepairedDamage,cluster_model,price_pred,abs_error,error
0,8900,control,coupe,-0.840987,manuell,0.478913,-0.074132,benzin,volkswagen,nein,2,8994.614258,94.614258,94.614258
1,10000,test,suv,-1.132601,automatik,0.332571,-0.730286,diesel,volkswagen,nein,3,16805.447266,6805.447266,6805.447266
2,15888,test,coupe,-1.132601,automatik,0.449644,-2.305057,benzin,peugeot,nein,8,13141.212891,2746.787109,-2746.787109
3,4400,control,limousine,-0.403566,manuell,0.332571,0.582023,diesel,volkswagen,nein,8,5267.276367,867.276367,867.276367
4,6700,test,bus,-0.403566,manuell,0.076473,0.582023,diesel,volkswagen,nein,5,4840.722168,1859.277832,-1859.277832
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23250,4650,control,kleinwagen,-0.840987,manuell,-0.143040,-0.074132,diesel,opel,nein,6,4597.784668,52.215332,-52.215332
23251,5550,control,_NA,-2.007443,manuell,0.113058,0.582023,diesel,volkswagen,nein,2,5832.893066,282.893066,282.893066
23252,5390,control,bus,-0.403566,manuell,0.266717,-3.223674,diesel,volkswagen,nein,2,9429.758789,4039.758789,4039.758789
23253,2000,test,bus,0.762890,manuell,-0.033283,0.582023,diesel,ford,nein,1,1239.589600,760.410400,-760.410400


In [21]:
# Validation results
print('VALIDATION RESULTS\n')
val_pred = model.predict(df_to_dataset(dataframe=val, target_name='price', shuffle=False, batch_size=1000)).flatten()

val['price_pred'] = val_pred
val['abs_error'] = abs(val.price_pred - val.price)
val['error'] = val.price_pred - val.price

# super baseline model
print('baseline abs error: ' + str(np.mean(np.abs(val.price - train.price.mean()))))

# my model
print('model abs error: ' + str(np.mean(np.abs(val.price - val_pred))))

# r squared
print('model r2: ' + str(r2_score(val.price, val.price_pred)))

VALIDATION RESULTS

baseline abs error: 3515.769520968634
model abs error: 990.8072053169171
model r2: 0.8660594642701392


In [22]:
# Test results
print('TEST RESULTS\n')
test_pred = model.predict(df_to_dataset(dataframe=test, target_name='price', shuffle=False, batch_size=1000)).flatten()

test['price_pred'] = test_pred
test['abs_error'] = abs(test.price_pred - test.price)
test['error'] = test.price_pred - test.price

worst_predictions = test.copy().loc[test.abs_error >= test.abs_error.quantile(0.9)]

# super baseline model
print('baseline abs error: ' + str(np.mean(np.abs(test.price - train.price.mean()))))

# my model
print('model abs error: ' + str(np.mean(np.abs(test.price - test_pred))))

# r squared
print('model r2: ' + str(r2_score(test.price, test.price_pred)))

TEST RESULTS

baseline abs error: 3519.359949713082
model abs error: 985.9640685379809
model r2: 0.867237633519853


In [None]:
plt.scatter(test.price, test_pred, alpha=0.2)
plt.xlim([0, test.price.max()])
plt.ylim([0, test.price.max()])
plt.xlabel('True')
plt.ylabel('Pred')
plt.show()

In [None]:
plt.scatter(val.price, val_pred, alpha=0.2)
plt.xlim([0, val.price.max()])
plt.ylim([0, val.price.max()])
plt.xlabel('True')
plt.ylabel('Pred')
plt.show()

In [None]:
plt.hist(val.error, bins=40)
plt.show()

In [None]:
plt.hist(worst_predictions.price, bins=40, alpha=0.5, density=True, label='worst')
plt.hist(val.price, bins=40, alpha=0.5, density=True, label='total')
plt.legend()
plt.show()

In [None]:
plt.scatter(val.price, val.abs_error, alpha=0.2)
plt.show()

In [None]:
plt.figure(figsize=(20,5))
plt.hist(val.price_pred, bins=80, alpha=0.5, density=True)
plt.hist(val.price, bins=80, alpha=0.5, density=True)
plt.show()

In [None]:
test.price.max()

In [None]:
plt.figure(figsize=(20,5))
plt.hist(test.price_pred, bins=80, alpha=0.5, density=True)
plt.hist(test.price, bins=80, alpha=0.5, density=True)
plt.show()