## Single Output Regression - Autos Dataset

Dataset: https://www.kaggle.com/datasets/vfsousas/autos

In [1]:
import pandas as pd
import tensorflow as tf
import sklearn
import scikeras

In [2]:
# Display the versions of the main libraries so the run can be reproduced later.
pd.__version__, tf.__version__, sklearn.__version__, scikeras.__version__

('2.2.2', '2.17.0', '1.5.1', '0.13.0')

In [3]:
import time
from scikeras.wrappers import KerasRegressor
from tensorflow.keras import backend as k
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn import metrics

In [4]:
# Wall-clock start timestamp (seconds since the epoch).
# The elapsed-time cell near the end of the notebook subtracts this value,
# so the measurement covers data loading and preprocessing as well as training.
beginning = time.time()
beginning

1725751584.1105912

In [5]:
# Load the raw listings; the file is read with the ISO-8859-1 (Latin-1) codec.
data = pd.read_csv(
    '../data/autos/autos.csv',
    encoding='ISO-8859-1',
)

In [6]:
# Remove the columns that are not used as predictors, in a single call.
data = data.drop(
    columns=[
        'dateCrawled',
        'dateCreated',
        'nrOfPictures',
        'postalCode',
        'lastSeen',
        'name',
        'seller',
        'offerType',
    ]
)

In [7]:
# Keep only rows whose price lies strictly between 10 and 350000.
data = data.loc[(data.price > 10) & (data.price < 350000)]

In [8]:
# Per-column replacement used for missing entries in the categorical columns.
values = dict(
    vehicleType='limousine',
    gearbox='manuell',
    model='golf',
    fuelType='benzin',
    notRepairedDamage='nein',
)
data = data.fillna(value=values)

In [9]:
# Column 0 is the regression target (presumably the price - confirm against
# the CSV column order); columns 1 through 11 are the predictor variables.
y = data.iloc[:, 0].to_numpy()
X = data.iloc[:, 1:12].to_numpy()

In [10]:
# Quick visual check of the predictor matrix and the target vector.
X, y

(array([['test', 'limousine', 1993, ..., 'benzin', 'volkswagen', 'nein'],
        ['test', 'coupe', 2011, ..., 'diesel', 'audi', 'ja'],
        ['test', 'suv', 2004, ..., 'diesel', 'jeep', 'nein'],
        ...,
        ['test', 'bus', 1996, ..., 'diesel', 'volkswagen', 'nein'],
        ['test', 'kombi', 2002, ..., 'diesel', 'volkswagen', 'nein'],
        ['control', 'limousine', 2013, ..., 'benzin', 'bmw', 'nein']],
       dtype=object),
 array([  480, 18300,  9800, ...,  9200,  3400, 28990], dtype=int64))

In [11]:
# Preprocessing: one-hot encode the columns at the listed positions of X;
# every other column is passed through unchanged.
onehotencoder = ColumnTransformer(
    transformers=[
        ("OneHot", OneHotEncoder(), [0, 1, 3, 5, 8, 9, 10]),
    ],
    remainder='passthrough',
)
# .toarray() turns the transformer output into a dense array for Keras.
X = onehotencoder.fit_transform(X).toarray()
X.shape

(359291, 316)

In [12]:
def create_net(meta=None):
    """Build and compile the feed-forward network used for the regression.

    The network has two hidden ReLU layers of 158 units each and a single
    linear output unit. It is compiled with the Adam optimizer and mean
    absolute error as both the loss and the reported metric.

    Parameters
    ----------
    meta : dict, optional
        Metadata dictionary that SciKeras passes to model-building functions
        declaring a ``meta`` parameter. When it contains ``n_features_in_``,
        that value sizes the input layer, so the network follows the real
        width of the encoded feature matrix. When ``meta`` is missing (for
        example when the function is called directly), the input width falls
        back to 316, the value that was previously hard-coded.

    Returns
    -------
    tensorflow.keras.models.Sequential
        The compiled model.
    """
    # Reset Keras' global state before building a fresh model, so state from
    # a previously built network does not accumulate across repeated calls.
    k.clear_session()

    # Prefer the feature count reported by SciKeras over a hard-coded width.
    n_features = (meta or {}).get("n_features_in_", 316)

    regressor = Sequential([
        tf.keras.layers.InputLayer(shape=(n_features,)),
        tf.keras.layers.Dense(units=158, activation='relu'),
        tf.keras.layers.Dense(units=158, activation='relu'),
        tf.keras.layers.Dense(units=1, activation='linear'),
    ])
    regressor.compile(loss='mean_absolute_error', optimizer='adam', metrics=['mean_absolute_error'])
    return regressor

In [13]:
# Scikit-learn compatible wrapper: create_net is called to build the model
# each time the estimator is fitted.
regressor = KerasRegressor(
    model=create_net,
    epochs=100,
    batch_size=300,
)

In [14]:
# 10-fold cross-validation is used here (scikit-learn's own default is 5-fold).
# Lower cv to 5 if the run takes too long: one full training is done per fold.
# Scores come back negated because the scorer is 'neg_mean_absolute_error'.
results = cross_val_score(
    regressor,
    X,
    y,
    cv=10,
    scoring='neg_mean_absolute_error',
)


Epoch 1/100
[1m1078/1078[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - loss: 4252.4507 - mean_absolute_error: 4252.4507
Epoch 2/100
[1m1078/1078[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - loss: 3441.0093 - mean_absolute_error: 3441.0093
Epoch 3/100
[1m1078/1078[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - loss: 3154.0522 - mean_absolute_error: 3154.0522
Epoch 4/100
[1m1078/1078[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - loss: 2967.4231 - mean_absolute_error: 2967.4231
Epoch 5/100
[1m1078/1078[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - loss: 2832.7673 - mean_absolute_error: 2832.7673
Epoch 6/100
[1m1078/1078[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - loss: 2797.9976 - mean_absolute_error: 2797.9976
Epoch 7/100
[1m1078/1078[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - loss: 2742.6665 - mean_absolute_error: 2742.6665
Epoch 8/100
[1m107

In [15]:
# Wall-clock timestamp taken once cross-validation has finished.
end = time.time()

In [16]:
# Total elapsed time in hours (seconds -> minutes -> hours).
(end - beginning)/60/60

0.6288488935761981

In [17]:
# The scorer returns negated errors; abs() shows the per-fold mean absolute error.
abs(results)

array([2211.40251582, 2234.24774379, 2308.25940783, 2228.08597161,
       2210.57956837, 2304.7180538 , 2375.96356162, 2199.24266188,
       2148.29467708, 2153.50511786])

In [18]:
# Mean absolute error averaged over the folds, and its spread across folds.
# The standard deviation is never negative, so it needs no abs().
abs(results.mean()), results.std()

(2237.4299279656875, 68.32174804004968)