In [1]:
# import necessary libraries
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import cross_val_score
import joblib
import warnings
# Silence every warning for the rest of the session: no category, message or module
# filter is given, so this applies to all warnings, including any raised by the
# libraries imported above.
warnings.filterwarnings("ignore")

In [2]:
# Make sure the working folders used by this notebook exist in the current directory.
# exist_ok=True lets os.makedirs skip folders that are already present, so no separate
# os.path.exists check is needed and there is no gap between "check" and "create".
# If one of these names exists but is a regular file, os.makedirs raises FileExistsError
# instead of silently continuing.
# NOTE(review): the folders are created relative to the current working directory, while
# later cells read and write under "../UsedCarsPricePrediction/..."; the two only coincide
# when the notebook is run from inside the UsedCarsPricePrediction folder — confirm.
for folder_name in ('datasets', 'evaluation_results_json', 'models_joblib', 'training_evaluation_py'):
    os.makedirs(folder_name, exist_ok=True)

In [3]:
# Inputs for the preprocessing step: the raw CSV, the target column and the feature columns.
input_data_path = '../UsedCarsPricePrediction/datasets/used_cars.csv'
target = 'price'
numeric_features = ['Kilometers_Driven', 'Mileage', 'Engine', 'Power', 'Seats']
# The spelling "catergorical" is kept because the next cell passes this exact name to preprocess.
catergorical_features = ['Segment']

# Every split file is written to the same datasets folder.
datasets_dir = "../UsedCarsPricePrediction/datasets"

# Training split (features / labels)
train_features_output_path = os.path.join(datasets_dir, "train_features.csv")
train_labels_output_path = os.path.join(datasets_dir, "train_labels.csv")

# Validation split (features / labels)
val_features_output_path = os.path.join(datasets_dir, "val_features.csv")
val_labels_output_path = os.path.join(datasets_dir, "val_labels.csv")

# Test split (features / labels)
test_features_output_path = os.path.join(datasets_dir, "test_features.csv")
test_labels_output_path = os.path.join(datasets_dir, "test_labels.csv")


In [4]:
# preprocess is defined in main.py in the current directory.
from main import preprocess

# Arguments are passed positionally, in this order: raw data path, target column,
# numeric and categorical feature lists, then the features/labels output paths for
# the train, validation and test splits.
preprocess(
    input_data_path,
    target,
    numeric_features,
    catergorical_features,
    train_features_output_path,
    train_labels_output_path,
    val_features_output_path,
    val_labels_output_path,
    test_features_output_path,
    test_labels_output_path,
)

In [5]:
# The train, validation and test CSVs all live in the same datasets folder, so the three
# directory names are bound to one and the same string.
training_data_directory = validation_data_directory = test_data_directory = (
    "../UsedCarsPricePrediction/datasets"
)

# Feature / label CSV paths for the training split
train_features_data = os.path.join(training_data_directory, "train_features.csv")
train_labels_data = os.path.join(training_data_directory, "train_labels.csv")

# Feature / label CSV paths for the validation split
val_features_data = os.path.join(validation_data_directory, "val_features.csv")
val_labels_data = os.path.join(validation_data_directory, "val_labels.csv")

# Feature / label CSV paths for the test split
test_features_data = os.path.join(test_data_directory, "test_features.csv")
test_labels_data = os.path.join(test_data_directory, "test_labels.csv")

In [6]:
# Linear regression: training_lr (from main.py in the current directory) is given the
# train/validation feature and label CSV paths and the file path to save the model to.
from main import training_lr

# Named "directory", but this holds the full path of the .joblib file.
model_output_directory = os.path.join(
    "../UsedCarsPricePrediction/models_joblib",
    "model_lr.joblib",
)
training_lr(
    train_features_data,
    train_labels_data,
    val_features_data,
    val_labels_data,
    model_output_directory,
)

RMSE: 19.3793577173483;
Training Accuracy: 0.7648688620886591
Validation Accuracy: 0.7636768057374047
Saving model to ../UsedCarsPricePrediction/models_joblib\model_lr.joblib


In [7]:
# Evaluate the linear regression model stored at model_path on the test split.
# Plain string literal: the path has no placeholders, so no f-string prefix is needed.
model_path = "../UsedCarsPricePrediction/models_joblib/model_lr.joblib"
print("Loading test input data")

# Test-split CSVs, plus the JSON path handed to evaluation_lr as its output location.
test_data_directory = "../UsedCarsPricePrediction/datasets"
test_features_data = os.path.join(test_data_directory, "test_features.csv")
test_labels_data = os.path.join(test_data_directory, "test_labels.csv")
evaluation_output_path = os.path.join(
    "../UsedCarsPricePrediction/evaluation_results_json",
    "evaluation_lr.json",
)

# evaluation_lr is defined in main.py in the current directory.
from main import evaluation_lr

evaluation_lr(model_path, test_features_data, test_labels_data, evaluation_output_path)

Loading test input data
RMSE: 11.82246417300771;
R2: 0.8312221874592235;


In [8]:
# Random forest regressor: training_rfr (from main.py in the current directory) is given the
# train/validation feature and label CSV paths and the file path to save the model to.
from main import training_rfr

# Named "directory", but this holds the full path of the .joblib file.
model_output_directory = os.path.join(
    "../UsedCarsPricePrediction/models_joblib",
    "model_rfr.joblib",
)
training_rfr(
    train_features_data,
    train_labels_data,
    val_features_data,
    val_labels_data,
    model_output_directory,
)

RMSE: 22.13644716861574;
Training Accuracy: 0.9828503759331686
Validation Accuracy: 0.6916503725956862
Saving model to ../UsedCarsPricePrediction/models_joblib\model_rfr.joblib


In [9]:
# Evaluate the random forest regressor stored at model_path on the test split.
# Plain string literal: the path has no placeholders, so no f-string prefix is needed.
model_path = "../UsedCarsPricePrediction/models_joblib/model_rfr.joblib"
print("Loading test input data")

# Test-split CSVs, plus the JSON path handed to evaluation_rfr as its output location.
test_data_directory = "../UsedCarsPricePrediction/datasets"
test_features_data = os.path.join(test_data_directory, "test_features.csv")
test_labels_data = os.path.join(test_data_directory, "test_labels.csv")
evaluation_output_path = os.path.join(
    "../UsedCarsPricePrediction/evaluation_results_json",
    "evaluation_rfr.json",
)

# evaluation_rfr is defined in main.py in the current directory.
from main import evaluation_rfr

evaluation_rfr(model_path, test_features_data, test_labels_data, evaluation_output_path)

Loading test input data
RMSE: 7.397773727131262;
R2: 0.9339152231492407;


In [10]:
# model_abr: training_abr (from main.py in the current directory) is given the
# train/validation feature and label CSV paths and the file path to save the model to.
from main import training_abr

# Named "directory", but this holds the full path of the .joblib file.
model_output_directory = os.path.join(
    "../UsedCarsPricePrediction/models_joblib",
    "model_abr.joblib",
)
training_abr(
    train_features_data,
    train_labels_data,
    val_features_data,
    val_labels_data,
    model_output_directory,
)

RMSE: 22.535073087722964;
Training Accuracy: 0.9543936924491306
Validation Accuracy: 0.6804450631181782
Saving model to ../UsedCarsPricePrediction/models_joblib\model_abr.joblib


In [11]:
# Evaluate model_abr, stored at model_path, on the test split.
# Plain string literal: the path has no placeholders, so no f-string prefix is needed.
model_path = "../UsedCarsPricePrediction/models_joblib/model_abr.joblib"
print("Loading test input data")

# Test-split CSVs, plus the JSON path handed to evaluation_abr as its output location.
test_data_directory = "../UsedCarsPricePrediction/datasets"
test_features_data = os.path.join(test_data_directory, "test_features.csv")
test_labels_data = os.path.join(test_data_directory, "test_labels.csv")
evaluation_output_path = os.path.join(
    "../UsedCarsPricePrediction/evaluation_results_json",
    "evaluation_abr.json",
)

# evaluation_abr is defined in main.py in the current directory.
from main import evaluation_abr

evaluation_abr(model_path, test_features_data, test_labels_data, evaluation_output_path)

Loading test input data
RMSE: 6.995229383919679;
R2: 0.940911459747701;


In [12]:
# model_br: training_br (from main.py in the current directory) is given the
# train/validation feature and label CSV paths and the file path to save the model to.
from main import training_br

# Named "directory", but this holds the full path of the .joblib file.
model_output_directory = os.path.join(
    "../UsedCarsPricePrediction/models_joblib",
    "model_br.joblib",
)
training_br(
    train_features_data,
    train_labels_data,
    val_features_data,
    val_labels_data,
    model_output_directory,
)

RMSE: 22.325611764919042;
Training Accuracy: 0.9830871633658333
Validation Accuracy: 0.6863579195038911
Saving model to ../UsedCarsPricePrediction/models_joblib\model_br.joblib


In [13]:
# Evaluate model_br, stored at model_path, on the test split.
# Plain string literal: the path has no placeholders, so no f-string prefix is needed.
model_path = "../UsedCarsPricePrediction/models_joblib/model_br.joblib"
print("Loading test input data")

# Test-split CSVs, plus the JSON path handed to evaluation_br as its output location.
test_data_directory = "../UsedCarsPricePrediction/datasets"
test_features_data = os.path.join(test_data_directory, "test_features.csv")
test_labels_data = os.path.join(test_data_directory, "test_labels.csv")
evaluation_output_path = os.path.join(
    "../UsedCarsPricePrediction/evaluation_results_json",
    "evaluation_br.json",
)

# evaluation_br is defined in main.py in the current directory.
from main import evaluation_br

evaluation_br(model_path, test_features_data, test_labels_data, evaluation_output_path)

Loading test input data
RMSE: 7.404164510716247;
R2: 0.9338009952895394;


In [14]:
# model_dtr: training_dtr (from main.py in the current directory) is given the
# train/validation feature and label CSV paths and the file path to save the model to.
from main import training_dtr

# Named "directory", but this holds the full path of the .joblib file.
model_output_directory = os.path.join(
    "../UsedCarsPricePrediction/models_joblib",
    "model_dtr.joblib",
)
training_dtr(
    train_features_data,
    train_labels_data,
    val_features_data,
    val_labels_data,
    model_output_directory,
)

RMSE: 22.568621936080856;
Training Accuracy: 1.0
Validation Accuracy: 0.6794928869045613
Saving model to ../UsedCarsPricePrediction/models_joblib\model_dtr.joblib


In [15]:
# Evaluate model_dtr, stored at model_path, on the test split.
# Plain string literal: the path has no placeholders, so no f-string prefix is needed.
model_path = "../UsedCarsPricePrediction/models_joblib/model_dtr.joblib"
print("Loading test input data")

# Test-split CSVs, plus the JSON path handed to evaluation_dtr as its output location.
test_data_directory = "../UsedCarsPricePrediction/datasets"
test_features_data = os.path.join(test_data_directory, "test_features.csv")
test_labels_data = os.path.join(test_data_directory, "test_labels.csv")
evaluation_output_path = os.path.join(
    "../UsedCarsPricePrediction/evaluation_results_json",
    "evaluation_dtr.json",
)

# evaluation_dtr is defined in main.py in the current directory.
from main import evaluation_dtr

evaluation_dtr(model_path, test_features_data, test_labels_data, evaluation_output_path)

Loading test input data
RMSE: 8.613734962256501;
R2: 0.9104052936975868;


In [16]:
# model_knn: training_knn (from main.py in the current directory) is given the
# train/validation feature and label CSV paths and the file path to save the model to.
from main import training_knn

# Named "directory", but this holds the full path of the .joblib file.
model_output_directory = os.path.join(
    "../UsedCarsPricePrediction/models_joblib",
    "model_knn.joblib",
)
training_knn(
    train_features_data,
    train_labels_data,
    val_features_data,
    val_labels_data,
    model_output_directory,
)

RMSE: 24.10221509075981;
Training Accuracy: 0.8259962155986728
Validation Accuracy: 0.6344544467595274
Saving model to ../UsedCarsPricePrediction/models_joblib\model_knn.joblib


In [17]:
# Evaluate model_knn, stored at model_path, on the test split.
# Plain string literal: the path has no placeholders, so no f-string prefix is needed.
model_path = "../UsedCarsPricePrediction/models_joblib/model_knn.joblib"
print("Loading test input data")

# Test-split CSVs, plus the JSON path handed to evaluation_knn as its output location.
test_data_directory = "../UsedCarsPricePrediction/datasets"
test_features_data = os.path.join(test_data_directory, "test_features.csv")
test_labels_data = os.path.join(test_data_directory, "test_labels.csv")
evaluation_output_path = os.path.join(
    "../UsedCarsPricePrediction/evaluation_results_json",
    "evaluation_knn.json",
)

# evaluation_knn is defined in main.py in the current directory.
from main import evaluation_knn

evaluation_knn(model_path, test_features_data, test_labels_data, evaluation_output_path)

Loading test input data
RMSE: 10.353159522580537;
R2: 0.870566959842025;


In [18]:
# model_lr_reg_lasso: training_lr_reg_lasso (from main.py in the current directory) is given
# the train/validation feature and label CSV paths and the file path to save the model to.
from main import training_lr_reg_lasso

# Named "directory", but this holds the full path of the .joblib file.
model_output_directory = os.path.join(
    "../UsedCarsPricePrediction/models_joblib",
    "model_lr_reg_lasso.joblib",
)
training_lr_reg_lasso(
    train_features_data,
    train_labels_data,
    val_features_data,
    val_labels_data,
    model_output_directory,
)

RMSE: 19.965552413964623;
Training Accuracy: 0.7577516310631337
Validation Accuracy: 0.7491637777684657
Saving model to ../UsedCarsPricePrediction/models_joblib\model_lr_reg_lasso.joblib


In [19]:
# Evaluate model_lr_reg_lasso, stored at model_path, on the test split.
# Plain string literal: the path has no placeholders, so no f-string prefix is needed.
model_path = "../UsedCarsPricePrediction/models_joblib/model_lr_reg_lasso.joblib"
print("Loading test input data")

# Test-split CSVs, plus the JSON path handed to evaluation_lr_reg_lasso as its output location.
test_data_directory = "../UsedCarsPricePrediction/datasets"
test_features_data = os.path.join(test_data_directory, "test_features.csv")
test_labels_data = os.path.join(test_data_directory, "test_labels.csv")
evaluation_output_path = os.path.join(
    "../UsedCarsPricePrediction/evaluation_results_json",
    "evaluation_lr_reg_lasso.json",
)

# evaluation_lr_reg_lasso is defined in main.py in the current directory.
from main import evaluation_lr_reg_lasso

evaluation_lr_reg_lasso(model_path, test_features_data, test_labels_data, evaluation_output_path)

Loading test input data
RMSE: 11.88477035597751;
R2: 0.8294385302936675;


In [20]:
# model_lr_reg_ridge: training_lr_reg_ridge (from main.py in the current directory) is given
# the train/validation feature and label CSV paths and the file path to save the model to.
from main import training_lr_reg_ridge

# Named "directory", but this holds the full path of the .joblib file.
model_output_directory = os.path.join(
    "../UsedCarsPricePrediction/models_joblib",
    "model_lr_reg_ridge.joblib",
)
training_lr_reg_ridge(
    train_features_data,
    train_labels_data,
    val_features_data,
    val_labels_data,
    model_output_directory,
)

RMSE: 19.380907317179734;
Training Accuracy: 0.764793226938298
Validation Accuracy: 0.7636390107775805
Saving model to ../UsedCarsPricePrediction/models_joblib\model_lr_reg_ridge.joblib


In [21]:
# Evaluate model_lr_reg_ridge, stored at model_path, on the test split.
# Plain string literal: the path has no placeholders, so no f-string prefix is needed.
model_path = "../UsedCarsPricePrediction/models_joblib/model_lr_reg_ridge.joblib"
print("Loading test input data")

# Test-split CSVs, plus the JSON path handed to evaluation_lr_reg_ridge as its output location.
test_data_directory = "../UsedCarsPricePrediction/datasets"
test_features_data = os.path.join(test_data_directory, "test_features.csv")
test_labels_data = os.path.join(test_data_directory, "test_labels.csv")
evaluation_output_path = os.path.join(
    "../UsedCarsPricePrediction/evaluation_results_json",
    "evaluation_lr_reg_ridge.json",
)

# evaluation_lr_reg_ridge is defined in main.py in the current directory.
from main import evaluation_lr_reg_ridge

evaluation_lr_reg_ridge(model_path, test_features_data, test_labels_data, evaluation_output_path)

Loading test input data
RMSE: 11.803644404211763;
R2: 0.8317591028087684;


In [22]:
# model_lr: training_lr (from main.py in the current directory) is given the
# train/validation feature and label CSV paths and the file path to save the model to.
# NOTE(review): an earlier cell of this notebook already calls training_lr with the same
# inputs and the same model_lr.joblib output path; this repeat looks redundant — confirm
# before removing.
from main import training_lr

# Named "directory", but this holds the full path of the .joblib file.
model_output_directory = os.path.join(
    "../UsedCarsPricePrediction/models_joblib",
    "model_lr.joblib",
)
training_lr(
    train_features_data,
    train_labels_data,
    val_features_data,
    val_labels_data,
    model_output_directory,
)

RMSE: 19.3793577173483;
Training Accuracy: 0.7648688620886591
Validation Accuracy: 0.7636768057374047
Saving model to ../UsedCarsPricePrediction/models_joblib\model_lr.joblib


In [23]:
# Evaluate model_lr, stored at model_path, on the test split.
# NOTE(review): an earlier cell of this notebook already runs evaluation_lr with the same
# model, test files and evaluation_lr.json output path; this repeat looks redundant —
# confirm before removing.
# Plain string literal: the path has no placeholders, so no f-string prefix is needed.
model_path = "../UsedCarsPricePrediction/models_joblib/model_lr.joblib"
print("Loading test input data")

# Test-split CSVs, plus the JSON path handed to evaluation_lr as its output location.
test_data_directory = "../UsedCarsPricePrediction/datasets"
test_features_data = os.path.join(test_data_directory, "test_features.csv")
test_labels_data = os.path.join(test_data_directory, "test_labels.csv")
evaluation_output_path = os.path.join(
    "../UsedCarsPricePrediction/evaluation_results_json",
    "evaluation_lr.json",
)

# evaluation_lr is defined in main.py in the current directory.
from main import evaluation_lr

evaluation_lr(model_path, test_features_data, test_labels_data, evaluation_output_path)

Loading test input data
RMSE: 11.82246417300771;
R2: 0.8312221874592235;


In [24]:
# model_rfr: training_rfr (from main.py in the current directory) is given the
# train/validation feature and label CSV paths and the file path to save the model to.
# NOTE(review): an earlier cell of this notebook already calls training_rfr with the same
# inputs and the same model_rfr.joblib output path; this repeat looks redundant — confirm
# before removing.
from main import training_rfr

# Named "directory", but this holds the full path of the .joblib file.
model_output_directory = os.path.join(
    "../UsedCarsPricePrediction/models_joblib",
    "model_rfr.joblib",
)
training_rfr(
    train_features_data,
    train_labels_data,
    val_features_data,
    val_labels_data,
    model_output_directory,
)

RMSE: 22.13644716861574;
Training Accuracy: 0.9828503759331686
Validation Accuracy: 0.6916503725956862
Saving model to ../UsedCarsPricePrediction/models_joblib\model_rfr.joblib


In [25]:
# Evaluate model_rfr, stored at model_path, on the test split.
# NOTE(review): an earlier cell of this notebook already runs evaluation_rfr with the same
# model, test files and evaluation_rfr.json output path; this repeat looks redundant —
# confirm before removing.
# Plain string literal: the path has no placeholders, so no f-string prefix is needed.
model_path = "../UsedCarsPricePrediction/models_joblib/model_rfr.joblib"
print("Loading test input data")

# Test-split CSVs, plus the JSON path handed to evaluation_rfr as its output location.
test_data_directory = "../UsedCarsPricePrediction/datasets"
test_features_data = os.path.join(test_data_directory, "test_features.csv")
test_labels_data = os.path.join(test_data_directory, "test_labels.csv")
evaluation_output_path = os.path.join(
    "../UsedCarsPricePrediction/evaluation_results_json",
    "evaluation_rfr.json",
)

# evaluation_rfr is defined in main.py in the current directory.
from main import evaluation_rfr

evaluation_rfr(model_path, test_features_data, test_labels_data, evaluation_output_path)

Loading test input data
RMSE: 7.397773727131262;
R2: 0.9339152231492407;


In [26]:
# model_svr: training_svr (from main.py in the current directory) is given the
# train/validation feature and label CSV paths and the file path to save the model to.
from main import training_svr

# Named "directory", but this holds the full path of the .joblib file.
model_output_directory = os.path.join(
    "../UsedCarsPricePrediction/models_joblib",
    "model_svr.joblib",
)
training_svr(
    train_features_data,
    train_labels_data,
    val_features_data,
    val_labels_data,
    model_output_directory,
)

RMSE: 30.072622511341212;
Training Accuracy: 0.8333191237936458
Validation Accuracy: 0.4309240983329129
Saving model to ../UsedCarsPricePrediction/models_joblib\model_svr.joblib


In [27]:
# Evaluate model_svr, stored at model_path, on the test split.
# Plain string literal: the path has no placeholders, so no f-string prefix is needed.
model_path = "../UsedCarsPricePrediction/models_joblib/model_svr.joblib"
print("Loading test input data")

# Test-split CSVs, plus the JSON path handed to evaluation_svr as its output location.
test_data_directory = "../UsedCarsPricePrediction/datasets"
test_features_data = os.path.join(test_data_directory, "test_features.csv")
test_labels_data = os.path.join(test_data_directory, "test_labels.csv")
evaluation_output_path = os.path.join(
    "../UsedCarsPricePrediction/evaluation_results_json",
    "evaluation_svr.json",
)

# evaluation_svr is defined in main.py in the current directory.
from main import evaluation_svr

evaluation_svr(model_path, test_features_data, test_labels_data, evaluation_output_path)

Loading test input data
RMSE: 12.686073013767935;
R2: 0.8056637787334887;


In [28]:
# model_xgbr: training_xgbr (from main.py in the current directory) is given the
# train/validation feature and label CSV paths and the file path to save the model to.
from main import training_xgbr

# Named "directory", but this holds the full path of the .joblib file.
model_output_directory = os.path.join(
    "../UsedCarsPricePrediction/models_joblib",
    "model_xgbr.joblib",
)
training_xgbr(
    train_features_data,
    train_labels_data,
    val_features_data,
    val_labels_data,
    model_output_directory,
)

RMSE: 23.07856287869712;
Training Accuracy: 0.9999999902077813
Validation Accuracy: 0.6648454555772869
Saving model to ../UsedCarsPricePrediction/models_joblib\model_xgbr.joblib


In [29]:
# Evaluate model_xgbr, stored at model_path, on the test split.
# Plain string literal: the path has no placeholders, so no f-string prefix is needed.
model_path = "../UsedCarsPricePrediction/models_joblib/model_xgbr.joblib"
print("Loading test input data")

# Test-split CSVs, plus the JSON path handed to evaluation_xgbr as its output location.
test_data_directory = "../UsedCarsPricePrediction/datasets"
test_features_data = os.path.join(test_data_directory, "test_features.csv")
test_labels_data = os.path.join(test_data_directory, "test_labels.csv")
evaluation_output_path = os.path.join(
    "../UsedCarsPricePrediction/evaluation_results_json",
    "evaluation_xgbr.json",
)

# evaluation_xgbr is defined in main.py in the current directory.
from main import evaluation_xgbr

evaluation_xgbr(model_path, test_features_data, test_labels_data, evaluation_output_path)

Loading test input data
RMSE: 8.575832266542381;
R2: 0.9111920394058313;
