In [1]:
# Notebook-level imports. The scripts written to disk with %%writefile below
# declare their own imports, so these names only serve code that runs inside
# this kernel.
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import cross_val_score
import joblib
import warnings
# Silence every warning raised inside this kernel. Scripts launched with
# `!python` run in a separate process and are not covered by this filter.
warnings.filterwarnings("ignore")

In [2]:
# Directories used by the rest of the notebook:
#   train_src_dir  - generated scripts plus the train/validation CSVs
#   test_src_dir   - the test CSVs
#   model_dir      - fitted estimators saved by the model_*.py scripts
#   evaluation_dir - JSON metric reports written by the evaluation_*.py scripts
train_src_dir = '../UsedCarsPricePrediction/train'
test_src_dir = '../UsedCarsPricePrediction/test'
model_dir = '../UsedCarsPricePrediction/model'
evaluation_dir = '../UsedCarsPricePrediction/evaluation'

# Create each directory independently of the others. makedirs(exist_ok=True)
# is safe to re-run and also creates a missing parent directory, whereas the
# previous os.mkdir calls skipped the test directory whenever the train
# directory already existed and never created the model/evaluation folders.
for directory in (train_src_dir, test_src_dir, model_dir, evaluation_dir):
    os.makedirs(directory, exist_ok=True)

In [3]:
%%writefile {train_src_dir}/pre_process.py
import os
import pandas as pd

from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import make_column_transformer

from sklearn.model_selection import train_test_split


# input_data_path for used_cars.csv from local machine 
input_data_path = '../UsedCarsPricePrediction/train/used_cars.csv'
used_cars = pd.read_csv(input_data_path)

target = 'price'
numeric_features = ['Kilometers_Driven', 'Mileage', 'Engine','Power','Seats']
categorical_features = ['Segment']

# X for used_cars.csv from local machine 
X = used_cars.drop(columns=[target])
y = used_cars[target]

# split the data into train and test sets
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y,
                                                test_size=0.2,
                                                random_state=42)

# split the train data into train and validation sets
Xtrain, Xval, ytrain, yval = train_test_split(Xtrain, ytrain,
                                              test_size=0.2,
                                              random_state=42)

# create a preprocessor object to preprocess the data
preprocessor = make_column_transformer(
    (StandardScaler(), numeric_features),
    (OneHotEncoder(handle_unknown='ignore', 
                   sparse=False), categorical_features)
)

transformed_Xtrain = preprocessor.fit_transform(Xtrain)
transformed_Xval = preprocessor.transform(Xval)
transformed_Xtest = preprocessor.transform(Xtest)

# train_features_output_path for used_cars.csv from local machine
train_features_output_path = os.path.join("../UsedCarsPricePrediction/train", "train_features.csv")
# train_labels_output_path for used_cars.csv from local machine
train_labels_output_path = os.path.join("../UsedCarsPricePrediction/train", "train_labels.csv")

# validation set for used_cars.csv from local machine
val_features_output_path = os.path.join("../UsedCarsPricePrediction/train", "val_features.csv")
val_labels_output_path = os.path.join("../UsedCarsPricePrediction/train", "val_labels.csv")

# test_features_output_path for used_cars.csv from local machine
test_features_output_path = os.path.join("../UsedCarsPricePrediction/test", "test_features.csv")
# test_labels_output_path for used_cars.csv from local machine
test_labels_output_path = os.path.join("../UsedCarsPricePrediction/test", "test_labels.csv")

# save the validation set to csv file 
pd.DataFrame(transformed_Xval).to_csv(val_features_output_path, 
                                       header=False, index=False)
pd.DataFrame(transformed_Xtrain).to_csv(train_features_output_path, 
                                        header=False, index=False)
pd.DataFrame(transformed_Xtest).to_csv(test_features_output_path, 
                                       header=False, index=False)

ytrain.to_csv(train_labels_output_path, header=False, index=False)
yval.to_csv(val_labels_output_path, header=False, index=False)
ytest.to_csv(test_labels_output_path, header=False, index=False)

Overwriting ../UsedCarsPricePrediction/train/pre_process.py


In [4]:
# Run the generated pre_process.py in a separate Python process; it reads
# used_cars.csv and writes the train/validation/test feature and label CSVs.
!python {train_src_dir}/pre_process.py

In [5]:
%%writefile {train_src_dir}/model_lr.py
import os
import joblib

import pandas as pd

from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error


# training_data_directory for used_cars.csv from local machine
training_data_directory = "../UsedCarsPricePrediction/train"
# validation_data_directory for used_cars.csv from local machine
validation_data_directory = "../UsedCarsPricePrediction/train"
# test_data_directory for used_cars.csv from local machine
test_data_directory = "../UsedCarsPricePrediction/test"

train_features_data = os.path.join(training_data_directory, "train_features.csv") # this
train_labels_data = os.path.join(training_data_directory, "train_labels.csv")

val_features_data = os.path.join(validation_data_directory, "val_features.csv")
val_labels_data = os.path.join(validation_data_directory, "val_labels.csv")

test_features_data = os.path.join(test_data_directory, "test_features.csv")
test_labels_data = os.path.join(test_data_directory, "test_labels.csv")

X_train = pd.read_csv(train_features_data, header=None)
y_train = pd.read_csv(train_labels_data, header=None)

model_lr = LinearRegression()

model_lr.fit(X_train, y_train)

# X_test = pd.read_csv(test_features_data, header=None)
X_val = pd.read_csv(val_features_data, header=None)
y_val = pd.read_csv(val_labels_data, header=None)

y_pred_val = model_lr.predict(X_val)

# print(f"RMSE: {mean_squared_error(y_test, y_pred, squared=False)};")
print(f"RMSE: {mean_squared_error(y_val, y_pred_val, squared=False)};")
# print accuracy score on the training and validation sets
print(f"Training Accuracy: {model_lr.score(X_train, y_train)}")
print(f"Validation Accuracy: {model_lr.score(X_val, y_val)}")

# model_output_directory for used_cars.csv from local machine
model_output_directory = os.path.join("../UsedCarsPricePrediction/model", "model_lr.joblib")

print(f"Saving model to {model_output_directory}")
joblib.dump(model_lr, model_output_directory)

Overwriting ../UsedCarsPricePrediction/train/model_lr.py


In [6]:
# Train the linear-regression model by running model_lr.py in a separate
# Python process; it prints validation RMSE and the model scores, then saves the model.
!python {train_src_dir}/model_lr.py

RMSE: 19.3793577173483;
Training Accuracy: 0.7648688620886591
Validation Accuracy: 0.7636768057374047
Saving model to ../UsedCarsPricePrediction/model\model_lr.joblib


In [7]:
%%writefile {train_src_dir}/evaluation_lr.py
import os
import json
import joblib

import pandas as pd

from sklearn.metrics import mean_squared_error, r2_score

# model_path for used_cars.csv from local machine
model_path = f"../UsedCarsPricePrediction/model/model_lr.joblib"
model = joblib.load(model_path)

print("Loading test input data")
# test_data_directory for used_cars.csv from local machine
test_data_directory = "../UsedCarsPricePrediction/test"
test_features_data = os.path.join(test_data_directory, "test_features.csv")
test_labels_data = os.path.join(test_data_directory, "test_labels.csv")

X_test = pd.read_csv(test_features_data, header=None)
y_test = pd.read_csv(test_labels_data, header=None)

y_pred = model.predict(X_test)

print(f"RMSE: {mean_squared_error(y_test, y_pred, squared=False)};")
print(f"R2: {r2_score(y_test, y_pred)};")

report_dict = {
        "regression_metrics": {
                "mse": {
                        "value": mean_squared_error(y_test, y_pred)
                },
                "rmse": {
                        "value": mean_squared_error(y_test, y_pred, squared=False)
                },
                "r2": {
                        "value": r2_score(y_test, y_pred)
                }
        }
}

# evaluation_output_path for used_cars.csv from local machine
evaluation_output_path = os.path.join("../UsedCarsPricePrediction/evaluation", "evaluation_lr.json")

with open(evaluation_output_path, "w") as f:
      f.write(json.dumps(report_dict))

Overwriting ../UsedCarsPricePrediction/train/evaluation_lr.py


In [8]:
# Run evaluation_lr.py in a separate Python process; it prints the test-set
# RMSE and R2 and writes evaluation_lr.json.
!python {train_src_dir}/evaluation_lr.py

Loading test input data
RMSE: 11.82246417300771;
R2: 0.8312221874592235;


In [11]:
%%writefile {train_src_dir}/model_dtr.py
import os
import joblib

import pandas as pd

from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error 


# training_data_directory for used_cars.csv from local machine
training_data_directory = "../UsedCarsPricePrediction/train"
# validation_data_directory for used_cars.csv from local machine
validation_data_directory = "../UsedCarsPricePrediction/train"
# test_data_directory for used_cars.csv from local machine
test_data_directory = "../UsedCarsPricePrediction/test"

train_features_data = os.path.join(training_data_directory, "train_features.csv") # this 
train_labels_data = os.path.join(training_data_directory, "train_labels.csv")

val_features_data = os.path.join(validation_data_directory, "val_features.csv")
val_labels_data = os.path.join(validation_data_directory, "val_labels.csv")

test_features_data = os.path.join(test_data_directory, "test_features.csv")
test_labels_data = os.path.join(test_data_directory, "test_labels.csv")

X_train = pd.read_csv(train_features_data, header=None)
y_train = pd.read_csv(train_labels_data, header=None)

model_dt = DecisionTreeRegressor()

model_dt.fit(X_train, y_train)

# X_test = pd.read_csv(test_features_data, header=None)
X_val = pd.read_csv(val_features_data, header=None)
y_val = pd.read_csv(val_labels_data, header=None)

y_pred_val = model_dt.predict(X_val)

# print(f"RMSE: {mean_squared_error(y_test, y_pred, squared=False)};")
print(f"RMSE: {mean_squared_error(y_val, y_pred_val, squared=False)};")
# print accuracy score on the training and validation sets 
print(f"Training Accuracy: {model_dt.score(X_train, y_train)}")
print(f"Validation Accuracy: {model_dt.score(X_val, y_val)}")

# model_output_directory for used_cars.csv from local machine
model_output_directory = os.path.join("../UsedCarsPricePrediction/model", "model_dtr.joblib")

print(f"Saving model to {model_output_directory}")
joblib.dump(model_dt, model_output_directory)

Overwriting ../UsedCarsPricePrediction/train/model_dtr.py


In [12]:
# Train the decision-tree regressor by running model_dtr.py in a separate
# Python process; it prints validation RMSE and the model scores, then saves the model.
!python {train_src_dir}/model_dtr.py

RMSE: 22.525434859842328;
Training Accuracy: 1.0
Validation Accuracy: 0.6807183513072823
Saving model to ../UsedCarsPricePrediction/model\model_dtr.joblib


In [15]:
%%writefile {train_src_dir}/evaluation_dtr.py
import os
import json
import joblib
import tarfile

import pandas as pd

from sklearn.metrics import mean_squared_error, r2_score

# model_path for used_cars.csv from local machine
model_path = f"../UsedCarsPricePrediction/model/model_dtr.joblib"
model = joblib.load(model_path)

print("Loading test input data")
# test_data_directory for used_cars.csv from local machine
test_data_directory = "../UsedCarsPricePrediction/test"
test_features_data = os.path.join(test_data_directory, "test_features.csv")
test_labels_data = os.path.join(test_data_directory, "test_labels.csv")

X_test = pd.read_csv(test_features_data, header=None)
y_test = pd.read_csv(test_labels_data, header=None)

y_pred = model.predict(X_test)

print(f"RMSE: {mean_squared_error(y_test, y_pred, squared=False)};")
print(f"R2: {r2_score(y_test, y_pred)};")

report_dict = {
        "regression_metrics": {
                "mse": {
                        "value": mean_squared_error(y_test, y_pred)
                },
                "rmse": {
                        "value": mean_squared_error(y_test, y_pred, squared=False)
                },
                "r2": {
                        "value": r2_score(y_test, y_pred)
                }
        }
}

# evaluation_output_path for used_cars.csv from local machine
evaluation_output_path = os.path.join("../UsedCarsPricePrediction/evaluation", "evaluation_dtr.json")

with open(evaluation_output_path, "w") as f:
      f.write(json.dumps(report_dict))

Overwriting ../UsedCarsPricePrediction/train/evaluation_dtr.py


In [16]:
# Run evaluation_dtr.py in a separate Python process; it prints the test-set
# RMSE and R2 and writes evaluation_dtr.json.
!python {train_src_dir}/evaluation_dtr.py

Loading test input data
RMSE: 8.741994516413287;
R2: 0.9077172771444003;


In [17]:
%%writefile {train_src_dir}/model_lr_reg.py
import os
import joblib

import pandas as pd

from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error


# training_data_directory for used_cars.csv from local machine
training_data_directory = "../UsedCarsPricePrediction/train"
# validation_data_directory for used_cars.csv from local machine
validation_data_directory = "../UsedCarsPricePrediction/train"
# test_data_directory for used_cars.csv from local machine
test_data_directory = "../UsedCarsPricePrediction/test"

train_features_data = os.path.join(training_data_directory, "train_features.csv") # this
train_labels_data = os.path.join(training_data_directory, "train_labels.csv")

val_features_data = os.path.join(validation_data_directory, "val_features.csv")
val_labels_data = os.path.join(validation_data_directory, "val_labels.csv")

test_features_data = os.path.join(test_data_directory, "test_features.csv")
test_labels_data = os.path.join(test_data_directory, "test_labels.csv")

X_train = pd.read_csv(train_features_data, header=None)
y_train = pd.read_csv(train_labels_data, header=None)

model_lr_reg = Ridge(alpha=0.5)

model_lr_reg.fit(X_train, y_train)

# X_test = pd.read_csv(test_features_data, header=None)
X_val = pd.read_csv(val_features_data, header=None)
y_val = pd.read_csv(val_labels_data, header=None)

y_pred_val = model_lr_reg.predict(X_val)

# print(f"RMSE: {mean_squared_error(y_test, y_pred, squared=False)};")
print(f"RMSE: {mean_squared_error(y_val, y_pred_val, squared=False)};")
# print accuracy score on the training and validation sets
print(f"Training Accuracy: {model_lr_reg.score(X_train, y_train)}")
print(f"Validation Accuracy: {model_lr_reg.score(X_val, y_val)}")

# model_output_directory for used_cars.csv from local machine
model_output_directory = os.path.join("../UsedCarsPricePrediction/model", "model_lr_reg.joblib")

print(f"Saving model to {model_output_directory}")
joblib.dump(model_lr_reg, model_output_directory)

Overwriting ../UsedCarsPricePrediction/train/model_lr_reg.py


In [18]:
# Train the Ridge model by running model_lr_reg.py in a separate Python
# process; it prints validation RMSE and the model scores, then saves the model.
!python {train_src_dir}/model_lr_reg.py

RMSE: 19.380907317179734;
Training Accuracy: 0.764793226938298
Validation Accuracy: 0.7636390107775805
Saving model to ../UsedCarsPricePrediction/model\model_lr_reg.joblib


In [19]:
%%writefile {train_src_dir}/evaluation_lr_reg.py
import os
import json
import joblib

import pandas as pd

from sklearn.metrics import mean_squared_error, r2_score

# model_path for used_cars.csv from local machine
model_path = f"../UsedCarsPricePrediction/model/model_lr_reg.joblib"
model = joblib.load(model_path)

print("Loading test input data")
# test_data_directory for used_cars.csv from local machine
test_data_directory = "../UsedCarsPricePrediction/test"
test_features_data = os.path.join(test_data_directory, "test_features.csv")
test_labels_data = os.path.join(test_data_directory, "test_labels.csv")

X_test = pd.read_csv(test_features_data, header=None)
y_test = pd.read_csv(test_labels_data, header=None)

y_pred = model.predict(X_test)

print(f"RMSE: {mean_squared_error(y_test, y_pred, squared=False)};")
print(f"R2: {r2_score(y_test, y_pred)};")
report_dict = {
        "regression_metrics": {
                "mse": {
                        "value": mean_squared_error(y_test, y_pred)
                },
                "rmse": {
                        "value": mean_squared_error(y_test, y_pred, squared=False)
                },
                "r2": {
                        "value": r2_score(y_test, y_pred)
                }
        }
}

# evaluation_output_path for used_cars.csv from local machine
evaluation_output_path = os.path.join("../UsedCarsPricePrediction/evaluation", "evaluation_lr_reg.json")

with open(evaluation_output_path, "w") as f:
      f.write(json.dumps(report_dict))

Writing ../UsedCarsPricePrediction/train/evaluation_lr_reg.py


In [20]:
# Run evaluation_lr_reg.py in a separate Python process; it prints the
# test-set RMSE and R2 and writes evaluation_lr_reg.json.
!python {train_src_dir}/evaluation_lr_reg.py

Loading test input data
RMSE: 11.803644404211763;
R2: 0.8317591028087684;


In [21]:
%%writefile {train_src_dir}/model_lr_reg_lasso.py
import os
import joblib

import pandas as pd

from sklearn.linear_model import Lasso
from sklearn.metrics import mean_squared_error


# training_data_directory for used_cars.csv from local machine
training_data_directory = "../UsedCarsPricePrediction/train"
# validation_data_directory for used_cars.csv from local machine
validation_data_directory = "../UsedCarsPricePrediction/train"
# test_data_directory for used_cars.csv from local machine
test_data_directory = "../UsedCarsPricePrediction/test"

train_features_data = os.path.join(training_data_directory, "train_features.csv") # this
train_labels_data = os.path.join(training_data_directory, "train_labels.csv")

val_features_data = os.path.join(validation_data_directory, "val_features.csv")
val_labels_data = os.path.join(validation_data_directory, "val_labels.csv")

test_features_data = os.path.join(test_data_directory, "test_features.csv")
test_labels_data = os.path.join(test_data_directory, "test_labels.csv")

X_train = pd.read_csv(train_features_data, header=None)
y_train = pd.read_csv(train_labels_data, header=None)

model_lr_reg_lasso = Lasso(alpha=0.5)

model_lr_reg_lasso.fit(X_train, y_train)

# X_test = pd.read_csv(test_features_data, header=None)
X_val = pd.read_csv(val_features_data, header=None)
y_val = pd.read_csv(val_labels_data, header=None)

y_pred_val = model_lr_reg_lasso.predict(X_val)

# print(f"RMSE: {mean_squared_error(y_test, y_pred, squared=False)};")
print(f"RMSE: {mean_squared_error(y_val, y_pred_val, squared=False)};")
# print accuracy score on the training and validation sets
print(f"Training Accuracy: {model_lr_reg_lasso.score(X_train, y_train)}")
print(f"Validation Accuracy: {model_lr_reg_lasso.score(X_val, y_val)}")

# model_output_directory for used_cars.csv from local machine
model_output_directory = os.path.join("../UsedCarsPricePrediction/model", "model_lr_reg_lasso.joblib")

print(f"Saving model to {model_output_directory}")
joblib.dump(model_lr_reg_lasso, model_output_directory)

Overwriting ../UsedCarsPricePrediction/train/model_lr_reg_lasso.py


In [22]:
# Train the Lasso model by running model_lr_reg_lasso.py in a separate Python
# process; it prints validation RMSE and the model scores, then saves the model.
!python {train_src_dir}/model_lr_reg_lasso.py

RMSE: 19.965552413964623;
Training Accuracy: 0.7577516310631337
Validation Accuracy: 0.7491637777684657
Saving model to ../UsedCarsPricePrediction/model\model_lr_reg_lasso.joblib


In [23]:
%%writefile {train_src_dir}/evaluation_lr_reg_lasso.py
import os
import json
import joblib

import pandas as pd

from sklearn.metrics import mean_squared_error, r2_score

# model_path for used_cars.csv from local machine
model_path = f"../UsedCarsPricePrediction/model/model_lr_reg_lasso.joblib"
model = joblib.load(model_path)

print("Loading test input data")
# test_data_directory for used_cars.csv from local machine
test_data_directory = "../UsedCarsPricePrediction/test"
test_features_data = os.path.join(test_data_directory, "test_features.csv")
test_labels_data = os.path.join(test_data_directory, "test_labels.csv")

X_test = pd.read_csv(test_features_data, header=None)
y_test = pd.read_csv(test_labels_data, header=None)

y_pred = model.predict(X_test)

print(f"RMSE: {mean_squared_error(y_test, y_pred, squared=False)};")
print(f"R2: {r2_score(y_test, y_pred)};")

report_dict = {
        "regression_metrics": {
                "mse": {
                        "value": mean_squared_error(y_test, y_pred)
                },
                "rmse": {
                        "value": mean_squared_error(y_test, y_pred, squared=False)
                },
                "r2": {
                        "value": r2_score(y_test, y_pred)
                }
        }
}

# evaluation_output_path for used_cars.csv from local machine
evaluation_output_path = os.path.join("../UsedCarsPricePrediction/evaluation", "evaluation_lr_reg_lasso.json")

with open(evaluation_output_path, "w") as f:
      f.write(json.dumps(report_dict))

Writing ../UsedCarsPricePrediction/train/evaluation_lr_reg_lasso.py


In [24]:
# Run evaluation_lr_reg_lasso.py in a separate Python process; it prints the
# test-set RMSE and R2 and writes evaluation_lr_reg_lasso.json.
!python {train_src_dir}/evaluation_lr_reg_lasso.py

Loading test input data
RMSE: 11.88477035597751;
R2: 0.8294385302936675;


In [25]:
%%writefile {train_src_dir}/model_knn.py
import os
import joblib

import pandas as pd

from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error


# training_data_directory for used_cars.csv from local machine
training_data_directory = "../UsedCarsPricePrediction/train"
# validation_data_directory for used_cars.csv from local machine
validation_data_directory = "../UsedCarsPricePrediction/train"
# test_data_directory for used_cars.csv from local machine
test_data_directory = "../UsedCarsPricePrediction/test"

train_features_data = os.path.join(training_data_directory, "train_features.csv") # this
train_labels_data = os.path.join(training_data_directory, "train_labels.csv")

val_features_data = os.path.join(validation_data_directory, "val_features.csv")
val_labels_data = os.path.join(validation_data_directory, "val_labels.csv")

test_features_data = os.path.join(test_data_directory, "test_features.csv")
test_labels_data = os.path.join(test_data_directory, "test_labels.csv")

X_train = pd.read_csv(train_features_data, header=None)
y_train = pd.read_csv(train_labels_data, header=None)

model_knn = KNeighborsRegressor(n_neighbors=5)

model_knn.fit(X_train, y_train)

# X_test = pd.read_csv(test_features_data, header=None)
X_val = pd.read_csv(val_features_data, header=None)
y_val = pd.read_csv(val_labels_data, header=None)

y_pred_val = model_knn.predict(X_val)

# print(f"RMSE: {mean_squared_error(y_test, y_pred, squared=False)};")
print(f"RMSE: {mean_squared_error(y_val, y_pred_val, squared=False)};")
# print accuracy score on the training and validation sets
print(f"Training Accuracy: {model_knn.score(X_train, y_train)}")
print(f"Validation Accuracy: {model_knn.score(X_val, y_val)}")

# model_output_directory for used_cars.csv from local machine
model_output_directory = os.path.join("../UsedCarsPricePrediction/model", "model_knn.joblib")

print(f"Saving model to {model_output_directory}")
joblib.dump(model_knn, model_output_directory)

Overwriting ../UsedCarsPricePrediction/train/model_knn.py


In [26]:
# Train the k-nearest-neighbours regressor by running model_knn.py in a separate
# Python process; it prints validation RMSE and the model scores, then saves the model.
!python {train_src_dir}/model_knn.py

RMSE: 24.10221509075981;
Training Accuracy: 0.8259962155986728
Validation Accuracy: 0.6344544467595274
Saving model to ../UsedCarsPricePrediction/model\model_knn.joblib


In [27]:
%%writefile {train_src_dir}/evaluation_knn.py
import os
import json
import joblib

import pandas as pd

from sklearn.metrics import mean_squared_error, r2_score

# model_path for used_cars.csv from local machine
model_path = f"../UsedCarsPricePrediction/model/model_knn.joblib"
model = joblib.load(model_path)

print("Loading test input data")
# test_data_directory for used_cars.csv from local machine
test_data_directory = "../UsedCarsPricePrediction/test"
test_features_data = os.path.join(test_data_directory, "test_features.csv")
test_labels_data = os.path.join(test_data_directory, "test_labels.csv")

X_test = pd.read_csv(test_features_data, header=None)
y_test = pd.read_csv(test_labels_data, header=None)

y_pred = model.predict(X_test)

print(f"RMSE: {mean_squared_error(y_test, y_pred, squared=False)};")
print(f"R2: {r2_score(y_test, y_pred)};")

report_dict = {
        "regression_metrics": {
                "mse": {
                        "value": mean_squared_error(y_test, y_pred)
                },
                "rmse": {
                        "value": mean_squared_error(y_test, y_pred, squared=False)
                },
                "r2": {
                        "value": r2_score(y_test, y_pred)
                }
        }
}

# evaluation_output_path for used_cars.csv from local machine
evaluation_output_path = os.path.join("../UsedCarsPricePrediction/evaluation", "evaluation_knn.json")

with open(evaluation_output_path, "w") as f:
      f.write(json.dumps(report_dict))

Writing ../UsedCarsPricePrediction/train/evaluation_knn.py


In [28]:
# Run evaluation_knn.py in a separate Python process; it prints the test-set
# RMSE and R2 and writes evaluation_knn.json.
!python {train_src_dir}/evaluation_knn.py

Loading test input data
RMSE: 10.353159522580537;
R2: 0.870566959842025;


In [29]:
%%writefile {train_src_dir}/model_rfr.py
import os
import joblib

import pandas as pd

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error


# training_data_directory for used_cars.csv from local machine
training_data_directory = "../UsedCarsPricePrediction/train"
# validation_data_directory for used_cars.csv from local machine
validation_data_directory = "../UsedCarsPricePrediction/train"
# test_data_directory for used_cars.csv from local machine
test_data_directory = "../UsedCarsPricePrediction/test"

train_features_data = os.path.join(training_data_directory, "train_features.csv") # this
train_labels_data = os.path.join(training_data_directory, "train_labels.csv")

val_features_data = os.path.join(validation_data_directory, "val_features.csv")
val_labels_data = os.path.join(validation_data_directory, "val_labels.csv")

test_features_data = os.path.join(test_data_directory, "test_features.csv")
test_labels_data = os.path.join(test_data_directory, "test_labels.csv")

X_train = pd.read_csv(train_features_data, header=None)
y_train = pd.read_csv(train_labels_data, header=None)

model_rfr = RandomForestRegressor(n_estimators=100, max_depth=10, random_state=42)

model_rfr.fit(X_train, y_train)

# X_test = pd.read_csv(test_features_data, header=None)
X_val = pd.read_csv(val_features_data, header=None)
y_val = pd.read_csv(val_labels_data, header=None)

y_pred_val = model_rfr.predict(X_val)

# print(f"RMSE: {mean_squared_error(y_test, y_pred, squared=False)};")
print(f"RMSE: {mean_squared_error(y_val, y_pred_val, squared=False)};")
# print accuracy score on the training and validation sets
print(f"Training Accuracy: {model_rfr.score(X_train, y_train)}")
print(f"Validation Accuracy: {model_rfr.score(X_val, y_val)}")

# model_output_directory for used_cars.csv from local machine
model_output_directory = os.path.join("../UsedCarsPricePrediction/model", "model_rfr.joblib")

print(f"Saving model to {model_output_directory}")
joblib.dump(model_rfr, model_output_directory)

Overwriting ../UsedCarsPricePrediction/train/model_rfr.py


In [30]:
# Train the random-forest regressor by running model_rfr.py in a separate
# Python process; it prints validation RMSE and the model scores, then saves the model.
!python {train_src_dir}/model_rfr.py

RMSE: 22.13644716861574;
Training Accuracy: 0.9828503759331686
Validation Accuracy: 0.6916503725956862
Saving model to ../UsedCarsPricePrediction/model\model_rfr.joblib


  model_rfr.fit(X_train, y_train)


In [31]:
%%writefile {train_src_dir}/evaluation_rfr.py
import os
import json
import joblib

import pandas as pd

from sklearn.metrics import mean_squared_error, r2_score

# model_path for used_cars.csv from local machine
model_path = f"../UsedCarsPricePrediction/model/model_rfr.joblib"
model = joblib.load(model_path)

print("Loading test input data")
# test_data_directory for used_cars.csv from local machine
test_data_directory = "../UsedCarsPricePrediction/test"
test_features_data = os.path.join(test_data_directory, "test_features.csv")
test_labels_data = os.path.join(test_data_directory, "test_labels.csv")

X_test = pd.read_csv(test_features_data, header=None)
y_test = pd.read_csv(test_labels_data, header=None)

y_pred = model.predict(X_test)

print(f"RMSE: {mean_squared_error(y_test, y_pred, squared=False)};")
print(f"R2: {r2_score(y_test, y_pred)};")

report_dict = {
        "regression_metrics": {
                "mse": {
                        "value": mean_squared_error(y_test, y_pred)
                },
                "rmse": {
                        "value": mean_squared_error(y_test, y_pred, squared=False)
                },
                "r2": {
                        "value": r2_score(y_test, y_pred)
                }
        }
}

# evaluation_output_path for used_cars.csv from local machine
evaluation_output_path = os.path.join("../UsedCarsPricePrediction/evaluation", "evaluation_rfr.json")

with open(evaluation_output_path, "w") as f:
      f.write(json.dumps(report_dict))

Writing ../UsedCarsPricePrediction/train/evaluation_rfr.py


In [32]:
# Run evaluation_rfr.py in a separate Python process; it prints the test-set
# RMSE and R2 and writes evaluation_rfr.json.
!python {train_src_dir}/evaluation_rfr.py

Loading test input data
RMSE: 7.397773727131262;
R2: 0.9339152231492407;


In [33]:
%%writefile {train_src_dir}/model_br.py
import os
import joblib

import pandas as pd

from sklearn.ensemble import BaggingRegressor
from sklearn.metrics import mean_squared_error


# training_data_directory for used_cars.csv from local machine
training_data_directory = "../UsedCarsPricePrediction/train"
# validation_data_directory for used_cars.csv from local machine
validation_data_directory = "../UsedCarsPricePrediction/train"
# test_data_directory for used_cars.csv from local machine
test_data_directory = "../UsedCarsPricePrediction/test"

train_features_data = os.path.join(training_data_directory, "train_features.csv") # this
train_labels_data = os.path.join(training_data_directory, "train_labels.csv")

val_features_data = os.path.join(validation_data_directory, "val_features.csv")
val_labels_data = os.path.join(validation_data_directory, "val_labels.csv")

test_features_data = os.path.join(test_data_directory, "test_features.csv")
test_labels_data = os.path.join(test_data_directory, "test_labels.csv")

X_train = pd.read_csv(train_features_data, header=None)
y_train = pd.read_csv(train_labels_data, header=None)

model_br = BaggingRegressor(n_estimators=100, random_state=42)

model_br.fit(X_train, y_train)

# X_test = pd.read_csv(test_features_data, header=None)
X_val = pd.read_csv(val_features_data, header=None)
y_val = pd.read_csv(val_labels_data, header=None)

y_pred_val = model_br.predict(X_val)

# print(f"RMSE: {mean_squared_error(y_test, y_pred, squared=False)};")
print(f"RMSE: {mean_squared_error(y_val, y_pred_val, squared=False)};")
# print accuracy score on the training and validation sets
print(f"Training Accuracy: {model_br.score(X_train, y_train)}")
print(f"Validation Accuracy: {model_br.score(X_val, y_val)}")

# model_output_directory for used_cars.csv from local machine
model_output_directory = os.path.join("../UsedCarsPricePrediction/model", "model_br.joblib")

print(f"Saving model to {model_output_directory}")
joblib.dump(model_br, model_output_directory)

Overwriting ../UsedCarsPricePrediction/train/model_br.py


In [34]:
# Train the bagging regressor by running model_br.py in a separate Python
# process; it prints validation RMSE and the model scores, then saves the model.
!python {train_src_dir}/model_br.py

RMSE: 22.325611764919042;
Training Accuracy: 0.9830871633658333
Validation Accuracy: 0.6863579195038911
Saving model to ../UsedCarsPricePrediction/model\model_br.joblib


  return column_or_1d(y, warn=True)


In [35]:
%%writefile {train_src_dir}/evaluation_br.py
import os
import json
import joblib

import pandas as pd

from sklearn.metrics import mean_squared_error, r2_score

# Evaluate the persisted bagging model (model_br.joblib) on the test split and
# store MSE / RMSE / R2 in a JSON report.
# All paths below are relative to the working directory the script is run from.

# model_path for used_cars.csv from local machine
model_path = "../UsedCarsPricePrediction/model/model_br.joblib"
# NOTE(review): joblib.load unpickles the file; only load artifacts you produced.
model = joblib.load(model_path)

print("Loading test input data")
# test_data_directory for used_cars.csv from local machine
test_data_directory = "../UsedCarsPricePrediction/test"
test_features_data = os.path.join(test_data_directory, "test_features.csv")
test_labels_data = os.path.join(test_data_directory, "test_labels.csv")

# The CSV files are read without a header row.
X_test = pd.read_csv(test_features_data, header=None)
y_test = pd.read_csv(test_labels_data, header=None)

y_pred = model.predict(X_test)

# Compute every metric exactly once. RMSE is derived from MSE instead of using
# `squared=False`, which is deprecated in recent scikit-learn releases.
mse = float(mean_squared_error(y_test, y_pred))
rmse = mse ** 0.5
r2 = float(r2_score(y_test, y_pred))

print(f"RMSE: {rmse};")
print(f"R2: {r2};")

report_dict = {
    "regression_metrics": {
        "mse": {"value": mse},
        "rmse": {"value": rmse},
        "r2": {"value": r2},
    }
}

# evaluation_output_path for used_cars.csv from local machine
evaluation_output_path = os.path.join("../UsedCarsPricePrediction/evaluation", "evaluation_br.json")

# Make sure the report directory exists so opening the file cannot fail on a fresh checkout.
os.makedirs(os.path.dirname(evaluation_output_path), exist_ok=True)

with open(evaluation_output_path, "w") as f:
    json.dump(report_dict, f)

Writing ../UsedCarsPricePrediction/train/evaluation_br.py


In [36]:
# Run the generated evaluation_br.py script through the shell; it prints the
# test-set RMSE and R2 shown in this cell's output.
# NOTE(review): `python` is resolved by the shell and may not be the kernel's
# interpreter — confirm the environment before relying on the results.
!python {train_src_dir}/evaluation_br.py

Loading test input data
RMSE: 7.404164510716247;
R2: 0.9338009952895394;


In [37]:
%%writefile {train_src_dir}/model_abr.py
import os
import joblib

import pandas as pd

from sklearn.ensemble import AdaBoostRegressor
from sklearn.metrics import mean_squared_error


# Train an AdaBoostRegressor on the pre-processed training split, report its
# fit on the training and validation splits, and persist it with joblib.
# All paths below are relative to the working directory the script is run from.

# training_data_directory for used_cars.csv from local machine
training_data_directory = "../UsedCarsPricePrediction/train"
# validation_data_directory for used_cars.csv from local machine
validation_data_directory = "../UsedCarsPricePrediction/train"

train_features_data = os.path.join(training_data_directory, "train_features.csv")
train_labels_data = os.path.join(training_data_directory, "train_labels.csv")

val_features_data = os.path.join(validation_data_directory, "val_features.csv")
val_labels_data = os.path.join(validation_data_directory, "val_labels.csv")


# The CSV files are read without a header row.
X_train = pd.read_csv(train_features_data, header=None)
# Take the label column as a 1-D Series: passing the (n, 1) DataFrame makes
# scikit-learn emit a DataConversionWarning from column_or_1d during fit.
y_train = pd.read_csv(train_labels_data, header=None).iloc[:, 0]

model_abr = AdaBoostRegressor(n_estimators=100, random_state=42)

model_abr.fit(X_train, y_train)

X_val = pd.read_csv(val_features_data, header=None)
y_val = pd.read_csv(val_labels_data, header=None).iloc[:, 0]

y_pred_val = model_abr.predict(X_val)

# RMSE is derived from MSE instead of using `squared=False`, which is
# deprecated in recent scikit-learn releases.
rmse_val = mean_squared_error(y_val, y_pred_val) ** 0.5
print(f"RMSE: {rmse_val};")
# For a scikit-learn regressor, score() returns the coefficient of
# determination (R^2), not a classification accuracy; the printed labels are
# left unchanged so existing logs stay comparable.
print(f"Training Accuracy: {model_abr.score(X_train, y_train)}")
print(f"Validation Accuracy: {model_abr.score(X_val, y_val)}")

# model_output_directory for used_cars.csv from local machine
# (despite the name, this is the full path of the model file)
model_output_directory = os.path.join("../UsedCarsPricePrediction/model", "model_abr.joblib")

# Make sure the target directory exists so joblib.dump cannot fail on a fresh checkout.
os.makedirs(os.path.dirname(model_output_directory), exist_ok=True)

print(f"Saving model to {model_output_directory}")
joblib.dump(model_abr, model_output_directory)

Overwriting ../UsedCarsPricePrediction/train/model_abr.py


In [38]:
# Run the generated model_abr.py script through the shell; the metrics it prints
# show up as this cell's output.
# NOTE(review): `python` is resolved by the shell and may not be the kernel's
# interpreter — confirm the environment before relying on the results.
!python {train_src_dir}/model_abr.py

RMSE: 22.535073087722964;
Training Accuracy: 0.9543936924491306
Validation Accuracy: 0.6804450631181782
Saving model to ../UsedCarsPricePrediction/model\model_abr.joblib


  y = column_or_1d(y, warn=True)


In [39]:
%%writefile {train_src_dir}/evaluation_abr.py
import os
import json
import joblib

import pandas as pd

from sklearn.metrics import mean_squared_error, r2_score

# Evaluate the persisted AdaBoost model (model_abr.joblib) on the test split
# and store MSE / RMSE / R2 in a JSON report.
# All paths below are relative to the working directory the script is run from.

# model_path for used_cars.csv from local machine
model_path = "../UsedCarsPricePrediction/model/model_abr.joblib"
# NOTE(review): joblib.load unpickles the file; only load artifacts you produced.
model = joblib.load(model_path)

print("Loading test input data")
# test_data_directory for used_cars.csv from local machine
test_data_directory = "../UsedCarsPricePrediction/test"
test_features_data = os.path.join(test_data_directory, "test_features.csv")
test_labels_data = os.path.join(test_data_directory, "test_labels.csv")


# The CSV files are read without a header row.
X_test = pd.read_csv(test_features_data, header=None)
y_test = pd.read_csv(test_labels_data, header=None)

y_pred = model.predict(X_test)

# Compute every metric exactly once. RMSE is derived from MSE instead of using
# `squared=False`, which is deprecated in recent scikit-learn releases.
mse = float(mean_squared_error(y_test, y_pred))
rmse = mse ** 0.5
r2 = float(r2_score(y_test, y_pred))

print(f"RMSE: {rmse};")
print(f"R2: {r2};")

report_dict = {
    "regression_metrics": {
        "mse": {"value": mse},
        "rmse": {"value": rmse},
        "r2": {"value": r2},
    }
}

# evaluation_output_path for used_cars.csv from local machine
evaluation_output_path = os.path.join("../UsedCarsPricePrediction/evaluation", "evaluation_abr.json")

# Make sure the report directory exists so opening the file cannot fail on a fresh checkout.
os.makedirs(os.path.dirname(evaluation_output_path), exist_ok=True)

with open(evaluation_output_path, "w") as f:
    json.dump(report_dict, f)

Writing ../UsedCarsPricePrediction/train/evaluation_abr.py


In [40]:
# Run the generated evaluation_abr.py script through the shell; it prints the
# test-set RMSE and R2 shown in this cell's output.
# NOTE(review): `python` is resolved by the shell and may not be the kernel's
# interpreter — confirm the environment before relying on the results.
!python {train_src_dir}/evaluation_abr.py

Loading test input data
RMSE: 6.995229383919679;
R2: 0.940911459747701;


In [41]:
%%writefile {train_src_dir}/model_xgbr.py
import os
import joblib

import pandas as pd

from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error


# Train an XGBRegressor on the pre-processed training split, report its fit on
# the training and validation splits, and persist it with joblib.
# All paths below are relative to the working directory the script is run from.

# training_data_directory for used_cars.csv from local machine
training_data_directory = "../UsedCarsPricePrediction/train"
# validation_data_directory for used_cars.csv from local machine
validation_data_directory = "../UsedCarsPricePrediction/train"

train_features_data = os.path.join(training_data_directory, "train_features.csv")
train_labels_data = os.path.join(training_data_directory, "train_labels.csv")

val_features_data = os.path.join(validation_data_directory, "val_features.csv")
val_labels_data = os.path.join(validation_data_directory, "val_labels.csv")


# The CSV files are read without a header row.
X_train = pd.read_csv(train_features_data, header=None)
y_train = pd.read_csv(train_labels_data, header=None)

model_xgbr = XGBRegressor(n_estimators=100, random_state=42)

model_xgbr.fit(X_train, y_train)

X_val = pd.read_csv(val_features_data, header=None)
y_val = pd.read_csv(val_labels_data, header=None)

y_pred_val = model_xgbr.predict(X_val)

# RMSE is derived from MSE instead of using `squared=False`, which is
# deprecated in recent scikit-learn releases.
rmse_val = mean_squared_error(y_val, y_pred_val) ** 0.5
print(f"RMSE: {rmse_val};")
# score() here is a regression score (R^2 via the scikit-learn regressor API),
# not a classification accuracy; the printed labels are left unchanged so
# existing logs stay comparable.
print(f"Training Accuracy: {model_xgbr.score(X_train, y_train)}")
print(f"Validation Accuracy: {model_xgbr.score(X_val, y_val)}")

# model_output_directory for used_cars.csv from local machine
# (despite the name, this is the full path of the model file)
model_output_directory = os.path.join("../UsedCarsPricePrediction/model", "model_xgbr.joblib")

# Make sure the target directory exists so joblib.dump cannot fail on a fresh checkout.
os.makedirs(os.path.dirname(model_output_directory), exist_ok=True)

print(f"Saving model to {model_output_directory}")
joblib.dump(model_xgbr, model_output_directory)

Overwriting ../UsedCarsPricePrediction/train/model_xgbr.py


In [43]:
# Run the generated model_xgbr.py script through the shell; the metrics it prints
# show up as this cell's output.
# NOTE(review): `python` is resolved by the shell and may not be the kernel's
# interpreter — confirm the environment before relying on the results.
!python {train_src_dir}/model_xgbr.py

RMSE: 23.07856287869712;
Training Accuracy: 0.9999999902077813
Validation Accuracy: 0.6648454555772869
Saving model to ../UsedCarsPricePrediction/model\model_xgbr.joblib


In [44]:
%%writefile {train_src_dir}/evaluation_xgbr.py
import os
import json
import joblib

import pandas as pd

from sklearn.metrics import mean_squared_error, r2_score

# Evaluate the persisted XGBoost model (model_xgbr.joblib) on the test split
# and store MSE / RMSE / R2 in a JSON report.
# All paths below are relative to the working directory the script is run from.

# model_path for used_cars.csv from local machine
model_path = "../UsedCarsPricePrediction/model/model_xgbr.joblib"
# NOTE(review): joblib.load unpickles the file; only load artifacts you produced.
model = joblib.load(model_path)

print("Loading test input data")
# test_data_directory for used_cars.csv from local machine
test_data_directory = "../UsedCarsPricePrediction/test"
test_features_data = os.path.join(test_data_directory, "test_features.csv")
test_labels_data = os.path.join(test_data_directory, "test_labels.csv")


# The CSV files are read without a header row.
X_test = pd.read_csv(test_features_data, header=None)
y_test = pd.read_csv(test_labels_data, header=None)

y_pred = model.predict(X_test)

# Compute every metric exactly once. RMSE is derived from MSE instead of using
# `squared=False`, which is deprecated in recent scikit-learn releases.
mse = float(mean_squared_error(y_test, y_pred))
rmse = mse ** 0.5
r2 = float(r2_score(y_test, y_pred))

print(f"RMSE: {rmse};")
print(f"R2: {r2};")

report_dict = {
    "regression_metrics": {
        "mse": {"value": mse},
        "rmse": {"value": rmse},
        "r2": {"value": r2},
    }
}

# evaluation_output_path for used_cars.csv from local machine
evaluation_output_path = os.path.join("../UsedCarsPricePrediction/evaluation", "evaluation_xgbr.json")

# Make sure the report directory exists so opening the file cannot fail on a fresh checkout.
os.makedirs(os.path.dirname(evaluation_output_path), exist_ok=True)

with open(evaluation_output_path, "w") as f:
    json.dump(report_dict, f)

Writing ../UsedCarsPricePrediction/train/evaluation_xgbr.py


In [45]:
# Run the generated evaluation_xgbr.py script through the shell; it prints the
# test-set RMSE and R2 shown in this cell's output.
# NOTE(review): `python` is resolved by the shell and may not be the kernel's
# interpreter — confirm the environment before relying on the results.
!python {train_src_dir}/evaluation_xgbr.py

Loading test input data
RMSE: 8.575832266542381;
R2: 0.9111920394058313;
