### Import libraries

In [1]:
# Standard-library modules used by the cells below.
import sys
import os
import gc
import warnings
from functools import partial
# Suppress every warning for the rest of the session.
warnings.filterwarnings("ignore")

In [2]:
# Make the project's script directory importable.
home = os.path.expanduser("~")
scripts_dir = f"{home}/Documents/projects/CarPriceRegression/Machine_Learning/scripts/"
# Guard the append so re-running this cell does not stack duplicate
# entries on sys.path.
if scripts_dir not in sys.path:
    sys.path.append(scripts_dir)

In [3]:
# Star-import the project modules found via the sys.path entry added earlier.
# NOTE(review): `pd`, `np` and `tfk` are used later without an explicit
# import, so they presumably enter the namespace through these star-imports — confirm.
from DataSetUp import *
from NNCarPrice import *
# Load the autoreload extension in mode 2 so edits to imported modules are
# picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [4]:
# Cap GPU usage through the project helper.
# NOTE(review): the unit of the argument (presumably GB of GPU memory) is not visible here — confirm.
NNCarPrice.set_gpu_limit(6)

### Import data for a regular NN without categorical embedding and split it into train, dev, and test sets

In [5]:
# Read the processed car data, then separate the predictors from the
# "price" target column.
data = pd.read_csv(f"{home}/Documents/projects/CarPriceRegression/Processed_Data/car_onehot_data.csv")
features = data.drop(columns="price")
y = data["price"]

In [6]:
# Wrap the predictors and target in the project's DataSetUp helper, which
# is used below for splitting and for building datasets.
data_setup = DataSetUp(features,y)

In [7]:
# Split into train / dev / test partitions.
# NOTE(review): the positional 2020 and 0.1 are presumably the test-split seed
# and test fraction (mirroring dev_seed / dev_size) — confirm against DataSetUp.data_split.
X_train,X_dev,X_test,y_train,y_dev,y_test = data_setup.data_split(2020,0.1,dev_set=True,dev_seed=1988,dev_size=0.11)

### Garbage collect

In [8]:
# Run a garbage-collection pass, then flush IPython's output cache.
gc.collect()
%reset -f out

Flushing output cache (0 entries)


### Set up the base parameters for a base regression model

In [9]:
# Base hyper-parameters used when building and training the models below.
BATCH_SIZE = 256
# Number of input columns. `features` is already `data` without "price",
# so there is no need to drop the column (and copy the frame) again just
# to count the remaining columns.
input_size = features.shape[1]
MAX_EPOCH = 500
# Learning rate and (presumably L2-regularisation) strength passed to make_model.
lr = 1e-3
l2 = 0
# Widths of the hidden layers of the base model.
SIZES = [512,512]
METRICS = ["MAPE","MAE"]

### Set up a base regression model

In [10]:
# Build the base network from the hyper-parameters defined above.
base_model = NNCarPrice.make_model(SIZES,input_size,METRICS,l2,lr)

In [11]:
# Print the layer-by-layer architecture and parameter counts.
base_model.summary()

Model: "NN_regressor"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 512)               112640    
_________________________________________________________________
dense_1 (Dense)              (None, 512)               262656    
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 513       
Total params: 375,809
Trainable params: 375,809
Non-trainable params: 0
_________________________________________________________________


### Define early stopping

In [12]:
# Stop training once the validation loss has gone 10 epochs without
# improving, and roll the model back to the weights of its best epoch.
EARLYSTOP = tfk.callbacks.EarlyStopping(
    monitor="val_loss",
    mode="min",
    patience=10,
    verbose=1,
    restore_best_weights=True,
)

### Wrap the base model in an `NNCarPrice` trainer

In [13]:
# Bundle the base model with the batch size, the epoch cap and the
# early-stopping callback.
CarPrice_NN_base = NNCarPrice(base_model,BATCH_SIZE,MAX_EPOCH,[EARLYSTOP])

### Make tensorflow dataset

* Note: the features in this dataset are on fairly similar scales, so there is no need to normalize them to help gradient descent

In [14]:
# Build the train and dev datasets through the DataSetUp helper, passing
# the shared batch size.
train_dataset = data_setup.make_tensor_dataset(X_train,y_train,BATCH_SIZE)
dev_dataset = data_setup.make_tensor_dataset(X_dev,y_dev,BATCH_SIZE)

In [16]:
# Train on the train/dev datasets and report metrics for the training split.
# NOTE(review): the 4th positional argument (True) is presumably `retrain`,
# as later calls pass retrain=False by keyword; V=0 presumably controls
# training verbosity — confirm against NNCarPrice.regression_metrics.
CarPrice_NN_base.regression_metrics(X_train.values,y_train,"train",True,train_dataset,dev_dataset,V=0)

Restoring model weights from the end of the best epoch.
Epoch 00237: early stopping


Unnamed: 0,r2_score,rmse,price_diff_abs_max
train,0.945751,2293.66622,971.644504


In [None]:
# Report the same metrics on the held-out test split without retraining.
CarPrice_NN_base.regression_metrics(X_test.values,y_test,"test",retrain=False)

In [None]:
# Pull the trained model out of the wrapper.
# NOTE(review): this reads an underscore-prefixed (private) attribute.
base_model_trained = CarPrice_NN_base._trained_model

In [None]:
# Persist the trained base model under the given .h5 file name.
NNCarPrice.save_model(base_model_trained,"base_nn_regression_mdl.h5")

In [None]:
# Reload the saved base model, replacing the in-memory reference.
base_model_trained = NNCarPrice.load_model("base_nn_regression_mdl.h5")

In [None]:
# Hand the reloaded model to the wrapper so later calls use it.
CarPrice_NN_base.reset_trained_model(base_model_trained)

### Check price vs predicted value

In [None]:
# Plot predicted price against actual price over the full dataset, without retraining.
CarPrice_NN_base.plot_pred_price(features,y,retrain=False)

In [None]:
# Build a table of price differences for the full dataset.
# NOTE(review): the table's columns and sort order are defined in NNCarPrice.price_diff — not visible here.
price_table = CarPrice_NN_base.price_diff(features,y)

In [None]:
# Show the first 25 rows of the price-difference table.
price_table.head(25)

In [None]:
# Show the last 25 rows of the price-difference table.
price_table.tail(25)

In [None]:
# Compute the feature-importance table for the base model and plot it.
feature_table = CarPrice_NN_base.linear_feature_importance(features,plot=True)

### Pretty decent results, and the errors appear to be normally distributed

## Try a bigger model 

In [None]:
# Wider and deeper network: four hidden layers, with the l2 argument raised
# from 0 to 1e-4 and the same learning rate as the base model.
# NOTE(review): 2056 and 2014 may be typos for 2048 and 1024 — confirm.
bigger_model = NNCarPrice.make_model([2056,2014,512,512],input_size,METRICS,1e-4,lr)

In [None]:
# Print the architecture and parameter counts of the bigger model.
bigger_model.summary()

In [None]:
# Wrap the bigger model with the same batch size, epoch cap and early-stopping callback.
CarPrice_NN_bigger = NNCarPrice(bigger_model,BATCH_SIZE,MAX_EPOCH,[EARLYSTOP])

In [None]:
# Train the bigger model on the train/dev datasets and report training-split metrics.
# NOTE(review): 4th positional True is presumably `retrain` — confirm.
CarPrice_NN_bigger.regression_metrics(X_train.values,y_train,"train",True,train_dataset,dev_dataset,V=0)

In [None]:
# Report metrics on the test split.
# NOTE(review): the positional False is presumably `retrain=False`, as in the base-model call — confirm.
CarPrice_NN_bigger.regression_metrics(X_test.values,y_test,"test",False)

In [None]:
# Pull the trained model out of the wrapper.
# NOTE(review): this reads an underscore-prefixed (private) attribute.
bigger_model_tuned = CarPrice_NN_bigger._trained_model

In [None]:
# Persist the trained bigger model under the given .h5 file name.
NNCarPrice.save_model(bigger_model_tuned,"bigger_model_nn.h5")

In [None]:
# Reload the saved model and hand it to the wrapper so the following cells
# use the persisted model. The base and bigger_2 models are handled the same
# way; without the second call the reloaded model is never used.
bigger_model_tuned = NNCarPrice.load_model("bigger_model_nn.h5")
CarPrice_NN_bigger.reset_trained_model(bigger_model_tuned)

In [None]:
# Plot predicted price against actual price over the full dataset, without retraining.
CarPrice_NN_bigger.plot_pred_price(features,y,retrain=False)

### Pretty decent results, and the model is not overfitting the training data either

In [None]:
# Same architecture and l2 argument as the bigger model, but with the last
# argument (the learning rate) lowered from lr to 5e-5.
# NOTE(review): 2056 and 2014 may be typos for 2048 and 1024 — confirm.
bigger_model_2 = NNCarPrice.make_model([2056,2014,512,512],input_size,METRICS,1e-4,5e-5)

In [None]:
# Print the architecture and parameter counts of the second bigger model.
bigger_model_2.summary()

In [None]:
# Wrap the second bigger model with the same batch size, epoch cap and early-stopping callback.
CarPrice_NN_bigger_2 = NNCarPrice(bigger_model_2,BATCH_SIZE,MAX_EPOCH,[EARLYSTOP])

In [None]:
# Train on the train/dev datasets and report training-split metrics.
# NOTE(review): the trailing positional 0 is presumably the same argument the
# earlier calls pass as V=0 — confirm against NNCarPrice.regression_metrics.
CarPrice_NN_bigger_2.regression_metrics(X_train.values,y_train,"train",True,train_dataset,dev_dataset,0)

In [None]:
# Report metrics on the test split.
# NOTE(review): the positional False is presumably `retrain=False` — confirm.
CarPrice_NN_bigger_2.regression_metrics(X_test.values,y_test,"test",False)

In [None]:
# Rebind `bigger_model_2` to the trained model held by the wrapper.
# NOTE(review): this reads an underscore-prefixed (private) attribute and reuses the
# name of the model built earlier.
bigger_model_2 = CarPrice_NN_bigger_2._trained_model

In [None]:
# Persist the trained model under the given .h5 file name.
NNCarPrice.save_model(bigger_model_2,"bigger_model_2.h5")

In [None]:
# Reload the saved model, replacing the in-memory reference.
bigger_model_2 = NNCarPrice.load_model("bigger_model_2.h5")

In [None]:
# Hand the reloaded model to the wrapper so later calls use it.
CarPrice_NN_bigger_2.reset_trained_model(bigger_model_2)

In [None]:
# Plot predicted price against actual price over the full dataset.
# NOTE(review): the positional False is presumably `retrain=False`, as the
# earlier plot_pred_price calls pass it by keyword — confirm.
CarPrice_NN_bigger_2.plot_pred_price(features,y,False)

### Since we have limited data, a bigger NN didn't seem to improve the results

------

### Looking at special cases to see why the model has a hard time predicting certain car prices

In [None]:
# Build the price-difference table for the full dataset using the second bigger model.
price_table_final = CarPrice_NN_bigger_2.price_diff(features,y)

In [None]:
# Show the first 20 rows of the price-difference table.
price_table_final.head(20)

### Most of these cars are older models. The top cases have an unusual combination of low mileage and an old model year

In [None]:
# Compute and plot the feature-importance table for the second bigger model.
# NOTE(review): this rebinds `feature_table`, discarding the base-model table.
feature_table = CarPrice_NN_bigger_2.linear_feature_importance(features,plot=True)

### Let's tune learning rate

In [None]:
# Learning-rate candidates one decade apart: 0.1**0.5, 0.1**1.5, ..., 0.1**6.5.
params = 0.1 ** np.arange(0.5,7)

In [None]:
# Pre-bind the first four make_model arguments (layer sizes, input size,
# metrics and the l2 value), leaving the fifth — the learning rate in the
# earlier make_model calls — to be supplied by the search.
partial_model = partial(NNCarPrice.make_model,[2056,2014,512,512],input_size,METRICS,1e-4)

In [None]:
# Run the search over the coarse learning-rate candidates.
# NOTE(review): the meaning of the trailing positional 1 is not visible here — confirm
# against NNCarPrice.param_search.
CarPrice_NN_bigger_2.param_search(params,partial_model,train_dataset,dev_dataset,1)

### It appears a learning rate between $10^{-2}$ and $10^{-3}$ would work well

In [None]:
# Finer learning-rate grid, listed from largest (0.05) to smallest (0.0005).
params_2 = [
    0.05, 0.025, 0.01,
    0.005, 0.0025, 0.001,
    0.0005,
]

In [None]:
# Repeat the search over the finer learning-rate grid.
# NOTE(review): the meaning of the trailing positional 1 is not visible here — confirm.
CarPrice_NN_bigger_2.param_search(params_2,partial_model,train_dataset,dev_dataset,1)

### It appears a learning rate of 0.001 works best, which is the value we already use (`lr = 1e-3`)

### The NN models mostly have car model types as their top and bottom features


____