### Tuning to find the best loss function for the autos dataset

Loss functions compared in the grid search: `mean_squared_error`, `mean_absolute_error`, `mean_absolute_percentage_error`, `mean_squared_logarithmic_error` and `squared_hinge`.

**Result:** The best loss function is mean_squared_error

In [1]:
import pandas as pd
import tensorflow as tf
import sklearn

In [2]:
# Record the library versions this notebook was executed with
pd.__version__, tf.__version__, sklearn.__version__

('2.2.2', '2.10.0', '1.5.1')

In [3]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from scikeras.wrappers import KerasRegressor
from sklearn.model_selection import GridSearchCV
from tensorflow.keras import backend as k
from tensorflow.keras.losses import KLDivergence

In [4]:
# Load the raw listings. The file is decoded as ISO-8859-1 (Latin-1) instead of
# pandas' default UTF-8 -- presumably because of German characters in the text
# columns; confirm against the data source.
dataset = pd.read_csv('../data/autos/autos.csv', encoding='ISO-8859-1')

In [5]:
# Data cleaning

# Remove five columns that are not used further in this notebook, in one call
dataset = dataset.drop(
    columns=['dateCrawled', 'dateCreated', 'nrOfPictures', 'postalCode', 'lastSeen']
)

# Count how many times each distinct listing name occurs
dataset['name'].value_counts()

name
Ford_Fiesta                                              657
BMW_318i                                                 627
Opel_Corsa                                               622
Volkswagen_Golf_1.4                                      603
BMW_316i                                                 523
                                                        ... 
Audi_A4_Avant_Klima_Gruene_Plakette_TÜV_&AU_NEU_XENON      1
Renault_clio_in_gold_450VB_!!                              1
Fiat_Doblo_1.6_Multijet                                    1
Renault_Laguna_1                                           1
BMW_M135i_vollausgestattet_NP_52.720____Euro               1
Name: count, Length: 233531, dtype: int64

In [6]:
# The values in 'name' are highly variable, so the column is dropped;
# the 'brand' column is kept because it has much less variation.
dataset = dataset.drop(columns='name')

In [7]:
# Distribution of the 'seller' column, to judge whether it is informative
dataset['seller'].value_counts()

seller
privat        371525
gewerblich         3
Name: count, dtype: int64

In [8]:
# 'seller' is almost constant (heavily imbalanced), so it is removed
dataset = dataset.drop(columns='seller')

In [9]:
# Distribution of the 'offerType' column, to judge whether it is informative
dataset['offerType'].value_counts()

offerType
Angebot    371516
Gesuch         12
Name: count, dtype: int64

In [10]:
# 'offerType' is almost constant (heavily imbalanced), so it is removed
dataset = dataset.drop(columns='offerType')

In [11]:
# Inspect the listings priced at 10 or less before filtering them out
dataset.loc[dataset['price'] <=  10]

Unnamed: 0,price,abtest,vehicleType,yearOfRegistration,gearbox,powerPS,model,kilometer,monthOfRegistration,fuelType,brand,notRepairedDamage
7,0,test,limousine,1980,manuell,50,andere,40000,7,benzin,volkswagen,nein
40,0,test,,1990,,0,corsa,150000,1,benzin,opel,
60,1,control,suv,1994,manuell,286,,150000,11,,sonstige_autos,
91,1,control,limousine,1995,manuell,113,e_klasse,150000,4,diesel,mercedes_benz,nein
115,0,test,,2017,manuell,0,golf,5000,12,benzin,volkswagen,
...,...,...,...,...,...,...,...,...,...,...,...,...
371356,0,control,,2000,manuell,65,corsa,150000,0,,opel,ja
371392,0,test,kleinwagen,2002,manuell,60,fiesta,150000,3,benzin,ford,
371402,0,control,kleinwagen,1999,manuell,53,swift,150000,3,benzin,suzuki,
371431,0,control,kleinwagen,1999,manuell,37,arosa,150000,7,benzin,seat,ja


In [12]:
# Mean price, computed here before any price-based filtering is applied
dataset['price'].mean()

17295.14186548524

In [13]:
# Keep only the rows whose price is strictly greater than 10
dataset = dataset.loc[dataset['price'] > 10]
dataset.shape

(359410, 12)

In [14]:
# Inspect the listings priced above 350000 before filtering them out
dataset.loc[dataset['price'] > 350000]

Unnamed: 0,price,abtest,vehicleType,yearOfRegistration,gearbox,powerPS,model,kilometer,monthOfRegistration,fuelType,brand,notRepairedDamage
1846,579000,control,coupe,1980,manuell,277,andere,20000,12,benzin,bmw,nein
10649,420000,control,coupe,2004,manuell,483,911,50000,4,benzin,porsche,nein
14663,11111111,control,coupe,2003,manuell,64,polo,150000,2,benzin,volkswagen,
16889,1000000,control,kombi,1998,,0,mondeo,150000,0,benzin,ford,ja
20143,1250000,test,coupe,2016,manuell,500,911,5000,3,benzin,porsche,nein
...,...,...,...,...,...,...,...,...,...,...,...,...
364171,3890000,test,coupe,2006,,799,,5000,7,,sonstige_autos,nein
365461,599000,control,coupe,1980,manuell,377,andere,5000,3,benzin,bmw,nein
366653,99999999,control,cabrio,1996,manuell,192,3er,150000,0,,bmw,
366861,3895000,test,coupe,2006,,799,,5000,4,benzin,sonstige_autos,nein


In [15]:
# Keep only the rows whose price is strictly below 350000.
# Note that a row priced at exactly 350000 is removed by this filter as well.
dataset = dataset.loc[dataset['price'] < 350000]
dataset.shape

(359291, 12)

In [16]:
# Missing-data handling: rows that have no vehicle type
dataset.loc[dataset['vehicleType'].isnull()]

Unnamed: 0,price,abtest,vehicleType,yearOfRegistration,gearbox,powerPS,model,kilometer,monthOfRegistration,fuelType,brand,notRepairedDamage
0,480,test,,1993,manuell,0,golf,150000,0,benzin,volkswagen,
16,300,test,,2016,,60,polo,150000,0,benzin,volkswagen,
22,2900,test,,2018,manuell,90,meriva,150000,5,benzin,opel,nein
26,5555,control,,2017,manuell,125,c4,125000,4,,citroen,nein
31,899,control,,2016,manuell,60,clio,150000,6,benzin,renault,
...,...,...,...,...,...,...,...,...,...,...,...,...
371495,180,control,,1995,,0,,125000,3,benzin,opel,
371504,2600,control,,2005,automatik,0,c_klasse,150000,9,,mercedes_benz,
371509,1900,test,,2000,manuell,110,,150000,7,,volkswagen,nein
371519,5250,control,,2016,automatik,150,159,150000,12,,alfa_romeo,nein


In [17]:
# Strategy for missing values. Numeric columns: mean. Categorical columns: mode

# NOTE: a cell renders only its last expression, so the result of this
# value_counts() call is computed but never displayed.
dataset['vehicleType'].value_counts()
# Most frequent vehicle type
dataset['vehicleType'].mode()


0    limousine
Name: vehicleType, dtype: object

In [18]:
# Rows that have no gearbox value
dataset.loc[dataset['gearbox'].isnull()]

Unnamed: 0,price,abtest,vehicleType,yearOfRegistration,gearbox,powerPS,model,kilometer,monthOfRegistration,fuelType,brand,notRepairedDamage
15,450,test,kleinwagen,1910,,0,ka,5000,0,benzin,ford,
16,300,test,,2016,,60,polo,150000,0,benzin,volkswagen,
32,245,test,limousine,1994,,0,golf,150000,2,benzin,volkswagen,nein
37,1500,test,,2016,,0,kangoo,150000,1,diesel,renault,nein
70,1200,test,coupe,2001,,0,astra,150000,0,,opel,
...,...,...,...,...,...,...,...,...,...,...,...,...
371443,3300,control,kombi,2006,,0,touran,150000,7,diesel,volkswagen,
371460,3500,control,,1995,,0,polo,150000,0,,volkswagen,
371486,350,control,kleinwagen,1996,,65,punto,150000,0,,fiat,
371495,180,control,,1995,,0,,125000,3,benzin,opel,


In [19]:
# Frequency of each gearbox category (missing values are not counted)
dataset['gearbox'].value_counts()

gearbox
manuell      266547
automatik     75508
Name: count, dtype: int64

In [20]:
# Most frequent gearbox category
dataset['gearbox'].mode()

0    manuell
Name: gearbox, dtype: object

In [21]:
# Rows that have no model value
dataset.loc[dataset['model'].isnull()]

Unnamed: 0,price,abtest,vehicleType,yearOfRegistration,gearbox,powerPS,model,kilometer,monthOfRegistration,fuelType,brand,notRepairedDamage
1,18300,test,coupe,2011,manuell,190,,125000,5,diesel,audi,ja
83,350,control,kleinwagen,1997,manuell,54,,150000,3,,fiat,ja
139,1450,control,limousine,1992,manuell,136,,150000,0,,audi,nein
156,6799,control,kleinwagen,2009,,60,,20000,5,benzin,volkswagen,nein
165,500,control,kleinwagen,1999,manuell,0,,150000,0,benzin,renault,nein
...,...,...,...,...,...,...,...,...,...,...,...,...
371399,560,control,kleinwagen,2001,automatik,170,,90000,0,benzin,fiat,ja
371476,9400,control,kombi,2007,manuell,200,,150000,4,diesel,sonstige_autos,ja
371495,180,control,,1995,,0,,125000,3,benzin,opel,
371509,1900,test,,2000,manuell,110,,150000,7,,volkswagen,nein


In [22]:
# Frequency of each model category (missing values are not counted)
dataset['model'].value_counts()

model
golf               28989
andere             25560
3er                19905
polo               12604
corsa              12149
                   ...  
serie_2                8
rangerover             6
serie_3                3
serie_1                1
discovery_sport        1
Name: count, Length: 251, dtype: int64

In [23]:
# Most frequent model category
dataset['model'].mode()

0    golf
Name: model, dtype: object

In [24]:
# Rows that have no fuel type
dataset.loc[dataset['fuelType'].isnull()]

Unnamed: 0,price,abtest,vehicleType,yearOfRegistration,gearbox,powerPS,model,kilometer,monthOfRegistration,fuelType,brand,notRepairedDamage
9,999,test,kleinwagen,1998,manuell,101,golf,150000,0,,volkswagen,
13,2500,control,kombi,2004,manuell,131,passat,150000,2,,volkswagen,nein
26,5555,control,,2017,manuell,125,c4,125000,4,,citroen,nein
36,1600,control,andere,1991,manuell,75,kadett,70000,0,,opel,
41,7500,control,limousine,2002,automatik,306,e_klasse,150000,4,,mercedes_benz,
...,...,...,...,...,...,...,...,...,...,...,...,...
371496,3850,test,cabrio,2006,manuell,108,2_reihe,125000,2,,peugeot,nein
371504,2600,control,,2005,automatik,0,c_klasse,150000,9,,mercedes_benz,
371509,1900,test,,2000,manuell,110,,150000,7,,volkswagen,nein
371519,5250,control,,2016,automatik,150,159,150000,12,,alfa_romeo,nein


In [25]:
# Frequency of each fuel type; the first entry is the most frequent one
dataset['fuelType'].value_counts()

fuelType
benzin     217582
diesel     106002
lpg          5222
cng           557
hybrid        271
andere        165
elektro       101
Name: count, dtype: int64

In [26]:
# Rows that have no notRepairedDamage value
dataset.loc[dataset['notRepairedDamage'].isnull()]

Unnamed: 0,price,abtest,vehicleType,yearOfRegistration,gearbox,powerPS,model,kilometer,monthOfRegistration,fuelType,brand,notRepairedDamage
0,480,test,,1993,manuell,0,golf,150000,0,benzin,volkswagen,
2,9800,test,suv,2004,automatik,163,grand,125000,8,diesel,jeep,
8,14500,control,bus,2014,manuell,125,c_max,30000,8,benzin,ford,
9,999,test,kleinwagen,1998,manuell,101,golf,150000,0,,volkswagen,
12,999,control,kombi,1995,manuell,115,passat,150000,11,benzin,volkswagen,
...,...,...,...,...,...,...,...,...,...,...,...,...
371507,5999,test,kombi,2005,manuell,140,a4,150000,4,diesel,audi,
371514,999,control,cabrio,2000,manuell,95,megane,150000,4,benzin,renault,
371515,1690,test,kombi,2004,manuell,55,fabia,150000,4,benzin,skoda,
371523,2200,test,,2005,,0,,20000,1,,sonstige_autos,


In [27]:
# Frequency of each notRepairedDamage category; the first entry is the most frequent one
dataset['notRepairedDamage'].value_counts()

notRepairedDamage
nein    259301
ja       34004
Name: count, dtype: int64

In [28]:
# Fill value for each categorical column that has missing entries:
# the most frequent category of that column
values = dict(
    vehicleType='limousine',
    gearbox='manuell',
    model='golf',
    fuelType='benzin',
    notRepairedDamage='nein',
)

values

{'vehicleType': 'limousine',
 'gearbox': 'manuell',
 'model': 'golf',
 'fuelType': 'benzin',
 'notRepairedDamage': 'nein'}

In [29]:
# Replace missing entries column by column, using the mapping in `values`
dataset = dataset.fillna(values)

In [30]:
# Count the missing values left in each column after filling
dataset.isnull().sum()

price                  0
abtest                 0
vehicleType            0
yearOfRegistration     0
gearbox                0
powerPS                0
model                  0
kilometer              0
monthOfRegistration    0
fuelType               0
brand                  0
notRepairedDamage      0
dtype: int64

In [31]:
# Column order after cleaning; the positional slicing and the encoder's
# column indices further down depend on this order
dataset.columns

Index(['price', 'abtest', 'vehicleType', 'yearOfRegistration', 'gearbox',
       'powerPS', 'model', 'kilometer', 'monthOfRegistration', 'fuelType',
       'brand', 'notRepairedDamage'],
      dtype='object')

In [32]:
# Feature matrix: columns at positions 1..11 (everything after the first column),
# converted to a NumPy array
X = dataset.iloc[:, 1:12].to_numpy()

X.shape, type(X)

((359291, 11), numpy.ndarray)

In [33]:
# Preview the feature matrix before it is encoded
X

array([['test', 'limousine', 1993, ..., 'benzin', 'volkswagen', 'nein'],
       ['test', 'coupe', 2011, ..., 'diesel', 'audi', 'ja'],
       ['test', 'suv', 2004, ..., 'diesel', 'jeep', 'nein'],
       ...,
       ['test', 'bus', 1996, ..., 'diesel', 'volkswagen', 'nein'],
       ['test', 'kombi', 2002, ..., 'diesel', 'volkswagen', 'nein'],
       ['control', 'limousine', 2013, ..., 'benzin', 'bmw', 'nein']],
      dtype=object)

In [34]:
# Target vector: the first column of the cleaned frame, as a NumPy array
y = dataset.iloc[:, 0].to_numpy()

y

array([  480, 18300,  9800, ...,  9200,  3400, 28990], dtype=int64)

In [35]:
# Preprocessing

# One-hot encode the columns of X at the listed positions and pass every
# other column through unchanged.
onehotencoder = ColumnTransformer(
    transformers=[
        ("OneHot", OneHotEncoder(), [0, 1, 3, 5, 8, 9, 10]),
    ],
    remainder='passthrough',
)

In [36]:
# Fit the encoder on the whole feature matrix and rebind X to the encoded result.
# NOTE: because X is overwritten, this cell is not idempotent -- running it a
# second time would feed the already-encoded matrix back into the encoder.
X = onehotencoder.fit_transform(X)


In [37]:
# Compare the shape of the cleaned frame with the shape of the encoded feature matrix
dataset.shape, X.shape

((359291, 12), (359291, 316))

In [38]:
# Initial choice for the number of units in each hidden layer

# Rule of thumb used here: (number of inputs + number of outputs) / 2

(316 + 1)/2

158.5

In [47]:
# building the neural network
def create_net(loss, meta=None):
    """Build and compile the fully connected regression network.

    Parameters
    ----------
    loss : str or callable
        Loss handed to ``compile``. In this notebook it is supplied by the
        grid search through the ``model__loss`` parameter.
    meta : dict, optional
        Fit-time metadata that scikeras passes to a build function declaring a
        ``meta`` argument. When it contains ``n_features_in_`` the input layer
        is sized from the training data; otherwise the width falls back to 316,
        the value that was previously hard-coded.

    Returns
    -------
    tensorflow.keras.Sequential
        Compiled model with two ReLU hidden layers and a single linear output
        unit, optimised with Adam and reporting mean absolute error.
    """
    # Reset Keras' global state before each build; the grid search builds
    # a fresh model for every candidate/fold.
    k.clear_session()

    # Size the network from the data instead of a literal, so a change in the
    # preprocessing (different number of encoded columns) does not break fit().
    n_features = (meta or {}).get("n_features_in_", 316)
    # Rule of thumb (inputs + outputs) / 2, rounded down: 316 inputs -> 158 units.
    hidden_units = (n_features + 1) // 2

    regressor = Sequential([
        tf.keras.layers.InputLayer(input_shape=(n_features, )),
        tf.keras.layers.Dense(units=hidden_units, activation='relu'),
        tf.keras.layers.Dense(units=hidden_units, activation='relu'),
        tf.keras.layers.Dense(units=1, activation='linear')
    ])
    regressor.compile(optimizer='adam', loss=loss, metrics=['mean_absolute_error'])
    return regressor

# scikit-learn compatible wrapper; `model__*` parameters are routed to create_net
neural_network = KerasRegressor(model=create_net)

In [48]:
# Search grid: batch size and epoch count are fixed, only the loss varies.
# The `model__` prefix routes the value to the `loss` argument of create_net.
params = dict(
    batch_size=[20],
    epochs=[25],
    model__loss=[
        'mean_squared_error',
        'mean_absolute_error',
        'mean_absolute_percentage_error',
        'mean_squared_logarithmic_error',
        'squared_hinge',
    ],
)

In [49]:
# Exhaustive search over `params` with 3-fold cross-validation. No `scoring` is
# passed, so candidates are ranked by the estimator's own `score` method.
grid_search = GridSearchCV(estimator=neural_network, param_grid=params, cv=3)

In [50]:
# Run the search: 5 candidate losses x 3 folds, followed by one refit of the
# best candidate (16 training runs of 25 epochs each in the log below).
grid_search = grid_search.fit(X,y)

Epoch 1/25




Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25
Epoch 1/25




Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25
Epoch 1/25




Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25
Epoch 1/25




Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25
Epoch 1/25




Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25
Epoch 1/25




Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25
Epoch 1/25




Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25
Epoch 1/25




Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25
Epoch 1/25




Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25
Epoch 1/25




Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25
Epoch 1/25




Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25
Epoch 1/25




Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25
Epoch 1/25




Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25
Epoch 1/25




Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25
Epoch 1/25




Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25
Epoch 1/25




Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


In [51]:
# Parameter combination that obtained the best cross-validated score
grid_search.best_params_

{'batch_size': 20, 'epochs': 25, 'model__loss': 'mean_squared_error'}