In [1]:
import os
import json
import numpy as np
import pandas as pd
from torch import load
from torch import FloatTensor
from machine_learning.DNN.model import DNN
from machine_learning.DNN.data_pipepline import CarData
from torch.utils.data import DataLoader

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Directory that holds the cleaned parquet splits.
data_dir = './data'

# data params
label_col = 'price'
# Feature columns handed to CarData.load_from_files for normalisation.
norm_feats = ['year', 'odometer', 'cylinders', 'lat', 'long']
label_norm = 'standard'

# training params
# NOTE(review): val_size / test_size are not referenced in the visible cells;
# they are kept in case other cells rely on them.
batch_size = 32
val_size = 0.1
test_size = 0.1

# data files
train_features_file = os.path.join(data_dir, 'cleaned_data_train.parquet')
train_label_file = os.path.join(data_dir, 'cleaned_price_train.parquet')
val_features_file = os.path.join(data_dir, 'cleaned_data_val.parquet')
val_label_file = os.path.join(data_dir, 'cleaned_price_val.parquet')
test_features_file = os.path.join(data_dir, 'cleaned_data_test.parquet')
test_label_file = os.path.join(data_dir, 'cleaned_price_test.parquet')

data = CarData()
# Pass the configured `label_norm` instead of repeating the literal, so the
# setting above and the value actually used cannot drift apart.
data.load_from_files(train_features_file, train_label_file, val_features_file,
                     val_label_file, test_features_file, test_label_file,
                     norm_feats, label_norm=label_norm)

# Only evaluation loaders are built in this notebook. They are not shuffled so
# that batches come out in the same order on every run.
val_data = DataLoader(data.val_data, batch_size=batch_size, shuffle=False)
test_data = DataLoader(data.test_data, batch_size=batch_size, shuffle=False)

In [3]:
# model params
# The input width is the length of the first element of the first item in the
# validation dataset, so it follows whatever feature encoding the data uses.
input_dim = len(data.val_data.dataset[0][0])
hidden_dims = [512, 256, 256, 128]
output_dim = 1

# training params
# In the visible cells learning_rate and l2_reg are only used to build the
# checkpoint directory name below; n_epochs is not referenced.
# NOTE(review): presumably these must match the values used by the training
# run that wrote the checkpoint - confirm against the training script.
learning_rate = 0.001
n_epochs = 100
l2_reg = 1e-5

# Checkpoint directory; the name encodes the hidden-layer sizes, learning rate
# and L2 value, e.g. ".../arc_512_256_256_128_lr_0.001_l2_1e-05".
model_save_dir = f"./machine_learning/DNN/models/arc_{'_'.join([str(i) for i in hidden_dims])}_lr_{learning_rate}_l2_{l2_reg}"

In [4]:
# Rebuild the network with the configured layer sizes, then restore its weights
# from the 'model_parameters' file inside model_save_dir.
model = DNN(hidden_dims=hidden_dims, input_dim=input_dim, output_dim=output_dim)
# map_location='cpu' lets a checkpoint saved on a GPU load on a CPU-only
# machine; the inference cells below run on CPU (FloatTensor / .numpy()).
model.load_state_dict(load(os.path.join(model_save_dir, 'model_parameters'), map_location='cpu'))

<All keys matched successfully>

In [5]:
# Short aliases for the validation dataset and the mean/std values that the
# inference cells use to scale features and to un-scale predicted labels.
dataset = data.val_data.dataset
feature_means, feature_stds = data.feature_means, data.feature_stds
label_mean, label_std = data.label_mean, data.label_std

In [8]:
# Factor applied to the de-normalised model output before it is printed as euro.
# NOTE(review): presumably a USD->EUR rate fixed when the notebook was written -
# confirm and update as needed.
PRICE_TO_EURO = 0.94

# One hand-written listing to price. Categorical fields may be None (or left
# out entirely) when unknown; every one-hot column of that field is then 0.
input_json = {'year': 2012,
              'manufacturer': 'toyota',
              'condition': None,
              'cylinders': 6.0,
              'fuel': 'gas',
              'odometer': 74000,
              'title_status': 'clean',
              'transmission': 'manual',
              'drive': 'fwd',
              'size': 'compact',
              'type': 'sedan',
              'paint_color': 'grey',
              'lat': 40.73,
              'long': -73.93}

# Read both JSON files from the configured data directory rather than a
# second hard-coded copy of the path.
# data_filters is loaded but not used in this cell.
with open(os.path.join(data_dir, 'metadata.json'), 'r') as f:
    data_filters = json.load(f)

with open(os.path.join(data_dir, 'column_names.json'), 'r') as f:
    column_names = json.load(f)

# Build the model input in the column order given by column_names['features'].
# NOTE(review): feature_means / feature_stds are indexed by the column's
# position in that mapping, which assumes the numeric columns come first and in
# the same order as the statistics - confirm against CarData.
input_X = {}
for i, col in enumerate(column_names['features']):
    if column_names['features'][col] == 'numeric':
        # A missing numeric field is a genuine error, so let KeyError surface.
        input_X[col] = (input_json[col] - feature_means[i]) / feature_stds[i]
    else:
        # One-hot column names are '<field>_<value>'; split at the last
        # underscore because field names themselves may contain underscores.
        categorical_column, _sep, categorical_value = col.rpartition('_')
        # .get() treats an omitted field like an explicit None instead of raising.
        is_active = input_json.get(categorical_column) == categorical_value
        input_X[col] = 1.0 if is_active else 0.0

model.eval()
y_pred = model(FloatTensor(np.array(list(input_X.values())))).detach().numpy()[0]

# Undo the label normalisation, then apply the euro factor.
print('Predicted price:', ((y_pred * label_std) + label_mean) * PRICE_TO_EURO, 'euro')


Predicted price: 10208.61207640134 euro


In [11]:
# Column metadata exposed by the data object.
numeric_columns = data.numeric_features
onehot_columns = data.onehot_features
categorical_cols = data.categorical_features

# Pick one item of `dataset` at random (no seed is set in the visible cells,
# so each run may show a different sample).
sample_idx = np.random.randint(0, len(dataset))
X = dataset[sample_idx][0]
y_true = dataset[sample_idx][1]

# Predict in eval mode and map the output back through label_std / label_mean.
model.eval()
y_pred = (model(FloatTensor(X)).detach().numpy() * label_std) + label_mean

# Undo the feature scaling on the first five entries of X.
numeric_vals = (X[:5] * feature_stds) + feature_means

# The remaining entries are matched against onehot_columns: keep the columns
# whose value is exactly 1 and split each name at its last underscore into
# (field, value).
hot_positions = np.where(X[5:] == 1)[0]
hot_columns = list(np.array(onehot_columns)[hot_positions])
categorical_vals = {}
for hot_col in hot_columns:
    field_name, _sep, field_value = hot_col.rpartition('_')
    categorical_vals[field_name] = field_value

# Map the true label back to the original scale the same way as the prediction.
y_true = (y_true * label_std) + label_mean

# Collect everything in one dict: numeric fields first, then every categorical
# field (None when no one-hot column of that field was active), then prices.
sample_data = dict(zip(numeric_columns, numeric_vals))
for col in categorical_cols:
    sample_data[col] = categorical_vals.get(col)
sample_data.update({'predicted price': int(y_pred[0]), 'true price': int(y_true[0])})

# Show the sample as an aligned two-column listing.
print(pd.Series(sample_data).to_string())

year                  2019.0
odometer              6200.0
cylinders                6.0
lat                38.255234
long              -77.497557
size                    None
manufacturer            ford
transmission       automatic
paint_color              red
drive                    fwd
condition               None
fuel                     gas
title_status           clean
type                    None
predicted price        21925
true price             18450
