In [1]:
# import necessary libraries
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import cross_val_score
import pickle
import joblib
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Directories that hold the training and test artefacts written by later cells.
train_src_dir = '../UsedCarsPricePrediction/train'
test_src_dir = '../UsedCarsPricePrediction/test'
# Create each directory independently: the test directory must not depend on
# whether the train directory already exists, and makedirs also creates the
# missing parent directory instead of failing like os.mkdir would.
for _directory in (train_src_dir, test_src_dir):
    os.makedirs(_directory, exist_ok=True)

In [3]:
%%writefile {train_src_dir}/pre_process1.py
import os
import pandas as pd

from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import make_column_transformer

from sklearn.model_selection import train_test_split


# Raw used-cars dataset on the local machine.
input_data_path = '../UsedCarsPricePrediction/train/used_cars.csv'
used_cars = pd.read_csv(input_data_path)

target = 'price'
numeric_features = ['Kilometers_Driven', 'Mileage', 'Engine','Power','Seats']
categorical_features = ['Segment']

# Features are every column except the target.
X = used_cars.drop(columns=[target])
y = used_cars[target]

# Hold out 20% of the rows as the test set ...
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y,
                                                test_size=0.2,
                                                random_state=42)

# ... then hold out 20% of the remaining rows as the validation set.
Xtrain, Xval, ytrain, yval = train_test_split(Xtrain, ytrain,
                                              test_size=0.2,
                                              random_state=42)

# OneHotEncoder's dense-output flag is `sparse_output` in scikit-learn >= 1.2;
# the old `sparse` spelling was removed in 1.4. An unknown keyword raises
# TypeError at construction, so try the new name and fall back to the old one.
try:
    one_hot_encoder = OneHotEncoder(handle_unknown='ignore',
                                    sparse_output=False)
except TypeError:
    one_hot_encoder = OneHotEncoder(handle_unknown='ignore',
                                    sparse=False)

# Scale the numeric columns and one-hot encode the categorical ones; columns
# listed in neither group are dropped (make_column_transformer's default).
preprocessor = make_column_transformer(
    (StandardScaler(), numeric_features),
    (one_hot_encoder, categorical_features)
)

# Fit on the training split only so validation/test statistics do not leak in.
transformed_Xtrain = preprocessor.fit_transform(Xtrain)
transformed_Xval = preprocessor.transform(Xval)
transformed_Xtest = preprocessor.transform(Xtest)

# Output locations; create them so the script does not rely on an earlier
# notebook cell having made the directories.
train_output_directory = "../UsedCarsPricePrediction/train"
test_output_directory = "../UsedCarsPricePrediction/test"
for output_directory in (train_output_directory, test_output_directory):
    os.makedirs(output_directory, exist_ok=True)

train_features_output_path = os.path.join(train_output_directory, "train_features.csv")
train_labels_output_path = os.path.join(train_output_directory, "train_labels.csv")

# The validation split is stored next to the training split.
val_features_output_path = os.path.join(train_output_directory, "val_features.csv")
val_labels_output_path = os.path.join(train_output_directory, "val_labels.csv")

test_features_output_path = os.path.join(test_output_directory, "test_features.csv")
test_labels_output_path = os.path.join(test_output_directory, "test_labels.csv")

# Features are written without header or index: the training scripts read
# them back with header=None.
pd.DataFrame(transformed_Xval).to_csv(val_features_output_path,
                                       header=False, index=False)
pd.DataFrame(transformed_Xtrain).to_csv(train_features_output_path,
                                        header=False, index=False)
pd.DataFrame(transformed_Xtest).to_csv(test_features_output_path,
                                       header=False, index=False)

# Labels use the same headerless, index-free layout.
ytrain.to_csv(train_labels_output_path, header=False, index=False)
yval.to_csv(val_labels_output_path, header=False, index=False)
ytest.to_csv(test_labels_output_path, header=False, index=False)

Overwriting ../UsedCarsPricePrediction/train/pre_process1.py


In [6]:
%%writefile {train_src_dir}/model_dtr.py
import os
import joblib

import pandas as pd

from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error 


# Directories written by pre_process1.py; the validation split lives next to
# the training split.
training_data_directory = "../UsedCarsPricePrediction/train"
validation_data_directory = "../UsedCarsPricePrediction/train"
test_data_directory = "../UsedCarsPricePrediction/test"

train_features_data = os.path.join(training_data_directory, "train_features.csv")
train_labels_data = os.path.join(training_data_directory, "train_labels.csv")

val_features_data = os.path.join(validation_data_directory, "val_features.csv")
val_labels_data = os.path.join(validation_data_directory, "val_labels.csv")

# The test split is not read here; it is left for the evaluation step.
test_features_data = os.path.join(test_data_directory, "test_features.csv")
test_labels_data = os.path.join(test_data_directory, "test_labels.csv")

# The CSVs are headerless. Labels are a single column, so take it as a 1-D
# Series rather than passing an (n, 1) frame to the estimator.
X_train = pd.read_csv(train_features_data, header=None)
y_train = pd.read_csv(train_labels_data, header=None).iloc[:, 0]

# Seed the tree so repeated runs give the same model and scores.
model_dt = DecisionTreeRegressor(random_state=42)

model_dt.fit(X_train, y_train)

X_val = pd.read_csv(val_features_data, header=None)
y_val = pd.read_csv(val_labels_data, header=None).iloc[:, 0]

y_pred_val = model_dt.predict(X_val)

# RMSE is the square root of MSE; computing it directly avoids the
# `squared=False` keyword, which newer scikit-learn releases deprecate.
rmse_val = mean_squared_error(y_val, y_pred_val) ** 0.5
print(f"RMSE: {rmse_val};")
# A regressor's score() is the coefficient of determination (R2), not accuracy.
print(f"Training R2: {model_dt.score(X_train, y_train)}")
print(f"Validation R2: {model_dt.score(X_val, y_val)}")

# Despite its name, this is the path of the model file, not a directory.
model_output_directory = os.path.join("../UsedCarsPricePrediction/model", "model.joblib")
# joblib.dump does not create missing parent directories.
os.makedirs(os.path.dirname(model_output_directory), exist_ok=True)

print(f"Saving model to {model_output_directory}")
joblib.dump(model_dt, model_output_directory)

RMSE: 22.568580326833807;
Training Accuracy: 1.0
Validation Accuracy: 0.6794940687265967
Saving model to ../UsedCarsPricePrediction/model\model.joblib


['../UsedCarsPricePrediction/model\\model.joblib']

In [7]:
# %%writefile {train_src_dir}/evaluation_dtr.py
import os
import json
import joblib
import tarfile

import pandas as pd

from sklearn.metrics import mean_squared_error, r2_score

# Model file written by the training script.
model_path = "../UsedCarsPricePrediction/model/model.joblib"
model = joblib.load(model_path)

print("Loading test input data")
# Headerless test split written by pre_process1.py.
test_data_directory = "../UsedCarsPricePrediction/test"
test_features_data = os.path.join(test_data_directory, "test_features.csv")
test_labels_data = os.path.join(test_data_directory, "test_labels.csv")

X_test = pd.read_csv(test_features_data, header=None)
# Labels are a single column; use it as a 1-D Series.
y_test = pd.read_csv(test_labels_data, header=None).iloc[:, 0]

y_pred = model.predict(X_test)

# Compute each metric once and reuse it for both the printout and the report.
# RMSE is derived from MSE rather than via the deprecated `squared=False`.
mse = mean_squared_error(y_test, y_pred)
rmse = mse ** 0.5
r2 = r2_score(y_test, y_pred)

print(f"RMSE: {rmse};")

report_dict = {
    "regression_metrics": {
        "mse": {"value": float(mse)},
        "rmse": {"value": float(rmse)},
        "r2": {"value": float(r2)},
    }
}

evaluation_output_path = os.path.join("../UsedCarsPricePrediction/evaluation", "evaluation.json")
# open(..., "w") fails if the directory is missing, so create it first.
os.makedirs(os.path.dirname(evaluation_output_path), exist_ok=True)

with open(evaluation_output_path, "w") as f:
    json.dump(report_dict, f)

Loading test input data
RMSE: 8.688475017228281;


In [None]:
%%writefile {train_src_dir}/model_lr.py
import os
import joblib

import pandas as pd

from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error


# Directories written by pre_process1.py; the validation split lives next to
# the training split.
training_data_directory = "../UsedCarsPricePrediction/train"
validation_data_directory = "../UsedCarsPricePrediction/train"
test_data_directory = "../UsedCarsPricePrediction/test"

train_features_data = os.path.join(training_data_directory, "train_features.csv")
train_labels_data = os.path.join(training_data_directory, "train_labels.csv")

val_features_data = os.path.join(validation_data_directory, "val_features.csv")
val_labels_data = os.path.join(validation_data_directory, "val_labels.csv")

# The test split is not read here; it is left for the evaluation step.
test_features_data = os.path.join(test_data_directory, "test_features.csv")
test_labels_data = os.path.join(test_data_directory, "test_labels.csv")

# The CSVs are headerless. Labels are a single column, so take it as a 1-D
# Series rather than passing an (n, 1) frame to the estimator.
X_train = pd.read_csv(train_features_data, header=None)
y_train = pd.read_csv(train_labels_data, header=None).iloc[:, 0]

model_lr = LinearRegression()

model_lr.fit(X_train, y_train)

X_val = pd.read_csv(val_features_data, header=None)
y_val = pd.read_csv(val_labels_data, header=None).iloc[:, 0]

y_pred_val = model_lr.predict(X_val)

# RMSE is the square root of MSE; computing it directly avoids the
# `squared=False` keyword, which newer scikit-learn releases deprecate.
rmse_val = mean_squared_error(y_val, y_pred_val) ** 0.5
print(f"RMSE: {rmse_val};")
# A regressor's score() is the coefficient of determination (R2), not accuracy.
print(f"Training R2: {model_lr.score(X_train, y_train)}")
print(f"Validation R2: {model_lr.score(X_val, y_val)}")

# Despite its name, this is the path of the model file, not a directory.
# NOTE(review): this is the same file model_dtr.py writes, so whichever script
# runs last decides which model the evaluation step loads.
model_output_directory = os.path.join("../UsedCarsPricePrediction/model", "model.joblib")
# joblib.dump does not create missing parent directories.
os.makedirs(os.path.dirname(model_output_directory), exist_ok=True)

print(f"Saving model to {model_output_directory}")
joblib.dump(model_lr, model_output_directory)


In [None]:
%%writefile {train_src_dir}/pre_process.py
import os
import argparse # 
import pandas as pd
# import azureml.core
import numpy as np
# import mlflow
# import mlflow.sklearn
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
# from azureml.core import Workspace

def main():
    """Label-encode and standardise a CSV and write the result to disk.

    Command-line arguments:
        --data:   path of the input CSV (required).
        --output: directory that receives ``used_cars_processed.csv``
                  (required; created if missing).
        --target: name of the label column, default ``price``. It is left
                  unscaled so downstream error metrics stay in the label's
                  original units.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--data", type=str, required=True,
                        help="path to input data")
    parser.add_argument("--output", type=str, required=True,
                        help="path to output data")
    parser.add_argument("--target", type=str, default="price",
                        help="label column that must not be scaled")
    args = parser.parse_args()

    ###################
    #<prepare the data>
    ###################

    print("input data:", args.data)

    data = pd.read_csv(args.data)

    ###################
    #<processing>
    ###################

    # Column groups are decided from the dtypes before any encoding, so the
    # label-encoded columns below are not standardised afterwards.
    categorical_columns = data.select_dtypes(include=['object']).columns
    numerical_columns = data.select_dtypes(include=['float64', 'int64']).columns
    # Keep the label out of the scaler; only feature columns are standardised.
    columns_to_scale = [col for col in numerical_columns if col != args.target]

    # Each categorical column gets its own freshly fitted encoder.
    for col in categorical_columns:
        data[col] = LabelEncoder().fit_transform(data[col])

    # NOTE(review): the scaler is fitted on the whole file, before the
    # training scripts split it, so test rows influence the scaling statistics.
    if columns_to_scale:
        data[columns_to_scale] = StandardScaler().fit_transform(data[columns_to_scale])

    # to_csv does not create missing directories.
    os.makedirs(args.output, exist_ok=True)
    processed_data_path = os.path.join(args.output, 'used_cars_processed.csv')
    data.to_csv(processed_data_path, index=False)
    print("processed data is exported to", processed_data_path)

if __name__ == "__main__":
    main()

In [None]:
# Run pre_process.py on the raw used_cars.csv; --output points at train_src_dir,
# so the processed CSV is written into that directory.
!python {train_src_dir}/pre_process.py --data {train_src_dir}/used_cars.csv --output {train_src_dir}

In [None]:
%%writefile {train_src_dir}/model_gbr.py

# import mlflow
import argparse

import pandas as pd
import numpy as np
import joblib

from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer

from sklearn.model_selection import train_test_split

from sklearn.ensemble import GradientBoostingRegressor

from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# mlflow.start_run()

def main():
    """Train a gradient-boosting regressor and report train/validation metrics.

    Command-line arguments:
        --data:          path of the CSV to train on; must contain ``price``.
        --n_estimators:  number of boosting stages (default 100).
        --learning_rate: shrinkage applied to each stage (default 0.1).

    The fitted pipeline is saved to ``outputs/model_gbr.pkl`` relative to the
    current working directory.
    """
    # os is only needed here and is not among the script's top-level imports.
    import os

    parser = argparse.ArgumentParser()
    parser.add_argument("--data", type=str, help="path to train data")
    parser.add_argument("--n_estimators", required=False, default=100, type=int)
    parser.add_argument("--learning_rate", required=False, default=0.1, type=float)

    args = parser.parse_args()

    df = pd.read_csv(args.data)

    target = 'price'

    # Every column except the target is used as a feature.
    X = df.drop([target], axis=1)
    y = df[target]

    # Hold out 20% as a test partition; it is deliberately not used here.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    # Split the remaining rows 80/20 into training and validation sets.
    X_train, X_val, y_train, y_val = train_test_split(
        X_train, y_train, test_size=0.2, random_state=42
    )

    model_gbr = GradientBoostingRegressor(
        n_estimators=args.n_estimators,
        learning_rate=args.learning_rate,
        random_state=42,  # keep repeated runs comparable
    )

    model_pipeline = make_pipeline(model_gbr)

    model_pipeline.fit(X_train, y_train)

    # Predict once per split and derive every metric from those predictions.
    train_pred = model_pipeline.predict(X_train)
    val_pred = model_pipeline.predict(X_val)

    train_mse = mean_squared_error(y_train, train_pred)
    val_mse = mean_squared_error(y_val, val_pred)

    # A regressor's score() is R2, so it is reported as R2 rather than as a
    # separate "accuracy" line.
    print("training R2 score:", r2_score(y_train, train_pred))
    print("training MAE:", mean_absolute_error(y_train, train_pred))
    print("training MSE:", train_mse)
    print("training RMSE:", np.sqrt(train_mse))

    print("validation R2 score:", r2_score(y_val, val_pred))
    print("validation MAE:", mean_absolute_error(y_val, val_pred))
    print("validation MSE:", val_mse)
    print("validation RMSE:", np.sqrt(val_mse))

    # joblib.dump does not create missing directories.
    model_output_path = 'outputs/model_gbr.pkl'
    os.makedirs(os.path.dirname(model_output_path), exist_ok=True)
    joblib.dump(model_pipeline, model_output_path)
    print("saved model to", model_output_path)

if __name__ == '__main__':
    main()

In [None]:
# Train the gradient-boosting script with 500 estimators and a learning rate of 0.01.
# model_gbr.py saves the fitted pipeline to outputs/model_gbr.pkl (relative to the
# working directory), not to train_src_dir.
!python {train_src_dir}/model_gbr.py --data {train_src_dir}/used_cars_processed.csv --n_estimators 500 --learning_rate 0.01

In [None]:
# Train again with 100 estimators and a learning rate of 0.1 (the script's defaults).
# This run writes to the same outputs/model_gbr.pkl, replacing the previous model.
!python {train_src_dir}/model_gbr.py --data {train_src_dir}/used_cars_processed.csv --n_estimators 100 --learning_rate 0.1

In [None]:
# The metric functions are not imported by the notebook's first cell, so
# import them here to keep this cell runnable on a fresh kernel.
from sklearn.metrics import mean_absolute_error, mean_squared_error

# Load the pipeline saved by model_gbr.py.
model_output_path = 'outputs/model_gbr.pkl'
model_gbr = joblib.load(model_output_path)

# NOTE(review): no cell visible here creates used_cars_test.csv; it must have
# the same columns and preprocessing as the data the model was trained on.
test_data_path = '../UsedCarsPricePrediction/test/used_cars_test.csv'
df_test = pd.read_csv(test_data_path)

# Separate the target from the features.
target = 'price'
X_test = df_test.drop([target], axis=1)
y_test = df_test[target]

# Predict once and derive every metric from the same predictions.
y_pred = model_gbr.predict(X_test)

test_r2 = model_gbr.score(X_test, y_test)
test_mae = mean_absolute_error(y_test, y_pred)
test_mse = mean_squared_error(y_test, y_pred)
test_rmse = np.sqrt(test_mse)

print("test R2 score:", test_r2)
print("test MAE:", test_mae)
print("test MSE:", test_mse)
print("test RMSE:", test_rmse)

# One-row summary table of the test metrics.
eval_df = pd.DataFrame(
    {
        'R2_score_test': [test_r2],
        'MAE_test': [test_mae],
        'MSE_test': [test_mse],
        'RMSE_test': [test_rmse],
    }
)

# Last expression of the cell: rendered as the cell's output.
eval_df


In [None]:
%%writefile {train_src_dir}/model_lr.py

# import mlflow
import argparse

import pandas as pd

from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer

from sklearn.model_selection import train_test_split

from sklearn.linear_model import LinearRegression

# mlflow.start_run()

def main():
    """Fit a LinearRegression on the given CSV and print R2 for each split.

    Command-line arguments:
        --data: path of the CSV to train on; must contain a ``price`` column.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--data", type=str, help="path to train data")

    args = parser.parse_args()

    df = pd.read_csv(args.data)

    target = 'price'

    # Every column except the target is used as a feature.
    X = df.drop([target], axis=1)
    y = df[target]

    # Hold out 20% as the test set ...
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    # ... then split the remaining rows 80/20 into training and validation.
    X_train, X_val, y_train, y_val = train_test_split(
        X_train, y_train, test_size=0.2, random_state=42
    )

    model_lr = LinearRegression()

    model_pipeline = make_pipeline(model_lr)

    model_pipeline.fit(X_train, y_train)

    # A regressor's score() is R2, so each split is scored once and reported
    # as R2 rather than also being printed under an "accuracy" label.
    splits = (
        ("training", X_train, y_train),
        ("validation", X_val, y_val),
        ("test", X_test, y_test),
    )
    for split_name, features, labels in splits:
        print(f"{split_name} R2 score:", model_pipeline.score(features, labels))

if __name__ == '__main__':
    main()

In [None]:
# Run model_lr.py on the processed data. The script fits a LinearRegression and
# prints its training, validation and test scores; it does not save a model file.
!python {train_src_dir}/model_lr.py --data {train_src_dir}/used_cars_processed.csv

In [None]:
%%writefile {train_src_dir}/model_dt.py

# import mlflow
import argparse

import pandas as pd

from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer

from sklearn.model_selection import train_test_split

from sklearn.tree import DecisionTreeRegressor

# mlflow.start_run()

def main():
    """Fit a DecisionTreeRegressor on the given CSV and print R2 for each split.

    Command-line arguments:
        --data: path of the CSV to train on; must contain a ``price`` column.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--data", type=str, help="path to train data")

    args = parser.parse_args()

    df = pd.read_csv(args.data)

    target = 'price'

    # Every column except the target is used as a feature.
    X = df.drop([target], axis=1)
    y = df[target]

    # Hold out 20% as the test set ...
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    # ... then split the remaining rows 80/20 into training and validation.
    X_train, X_val, y_train, y_val = train_test_split(
        X_train, y_train, test_size=0.2, random_state=42
    )

    # Seed the tree so repeated runs give the same model and scores.
    model_dt = DecisionTreeRegressor(random_state=42)

    model_pipeline = make_pipeline(model_dt)

    model_pipeline.fit(X_train, y_train)

    # A regressor's score() is R2, so each split is scored once and reported
    # as R2 rather than also being printed under an "accuracy" label.
    splits = (
        ("training", X_train, y_train),
        ("validation", X_val, y_val),
        ("test", X_test, y_test),
    )
    for split_name, features, labels in splits:
        print(f"{split_name} R2 score:", model_pipeline.score(features, labels))

if __name__ == '__main__':
    main()

In [None]:
# Run model_dt.py on the processed data. The script fits a DecisionTreeRegressor and
# prints its training, validation and test scores; it does not save a model file.
!python {train_src_dir}/model_dt.py --data {train_src_dir}/used_cars_processed.csv

In [None]:
%%writefile {train_src_dir}/model_rf.py

# import mlflow
import argparse

import pandas as pd

from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer

from sklearn.model_selection import train_test_split

from sklearn.ensemble import RandomForestRegressor

# mlflow.start_run()

def main():
    """Fit a RandomForestRegressor on the given CSV and print R2 for each split.

    Command-line arguments:
        --data: path of the CSV to train on; must contain a ``price`` column.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--data", type=str, help="path to train data")

    args = parser.parse_args()

    df = pd.read_csv(args.data)

    target = 'price'

    # Every column except the target is used as a feature.
    X = df.drop([target], axis=1)
    y = df[target]

    # Hold out 20% as the test set ...
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    # ... then split the remaining rows 80/20 into training and validation.
    X_train, X_val, y_train, y_val = train_test_split(
        X_train, y_train, test_size=0.2, random_state=42
    )

    # Seed the forest so repeated runs give the same model and scores.
    model_rf = RandomForestRegressor(random_state=42)

    model_pipeline = make_pipeline(model_rf)

    model_pipeline.fit(X_train, y_train)

    # A regressor's score() is R2, so each split is scored once and reported
    # as R2 rather than also being printed under an "accuracy" label.
    splits = (
        ("training", X_train, y_train),
        ("validation", X_val, y_val),
        ("test", X_test, y_test),
    )
    for split_name, features, labels in splits:
        print(f"{split_name} R2 score:", model_pipeline.score(features, labels))

if __name__ == '__main__':
    main()

In [None]:
# Run model_rf.py on the processed data. The script fits a RandomForestRegressor and
# prints its training, validation and test scores; it does not save a model file.
!python {train_src_dir}/model_rf.py --data {train_src_dir}/used_cars_processed.csv

In [None]:
%%writefile {train_src_dir}/model_nn.py

# import mlflow
import argparse

import pandas as pd

from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer

from sklearn.model_selection import train_test_split

from sklearn.neural_network import MLPRegressor

# mlflow.start_run()

def main():
    """Fit an MLPRegressor on the given CSV and print R2 for each split.

    Command-line arguments:
        --data: path of the CSV to train on; must contain a ``price`` column.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--data", type=str, help="path to train data")

    args = parser.parse_args()

    df = pd.read_csv(args.data)

    target = 'price'

    # Every column except the target is used as a feature.
    X = df.drop([target], axis=1)
    y = df[target]

    # Hold out 20% as the test set ...
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    # ... then split the remaining rows 80/20 into training and validation.
    X_train, X_val, y_train, y_val = train_test_split(
        X_train, y_train, test_size=0.2, random_state=42
    )

    # Seed the network (weight initialisation and shuffling) so repeated
    # runs give the same model and scores.
    model_nn = MLPRegressor(random_state=42)

    model_pipeline = make_pipeline(model_nn)

    model_pipeline.fit(X_train, y_train)

    # A regressor's score() is R2, so each split is scored once and reported
    # as R2 rather than also being printed under an "accuracy" label.
    splits = (
        ("training", X_train, y_train),
        ("validation", X_val, y_val),
        ("test", X_test, y_test),
    )
    for split_name, features, labels in splits:
        print(f"{split_name} R2 score:", model_pipeline.score(features, labels))

if __name__ == '__main__':
    main()

In [None]:
# Run model_nn.py on the processed data. The script fits an MLPRegressor and
# prints its training, validation and test scores; it does not save a model file.
!python {train_src_dir}/model_nn.py --data {train_src_dir}/used_cars_processed.csv

In [None]:
%%writefile {train_src_dir}/model_svr.py

# import mlflow
import argparse

import pandas as pd

from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer

from sklearn.model_selection import train_test_split

from sklearn.svm import SVR

# mlflow.start_run()

def main():
    """Fit an SVR on the given CSV and print R2 for each split.

    Command-line arguments:
        --data: path of the CSV to train on; must contain a ``price`` column.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--data", type=str, help="path to train data")

    args = parser.parse_args()

    df = pd.read_csv(args.data)

    target = 'price'

    # Every column except the target is used as a feature.
    X = df.drop([target], axis=1)
    y = df[target]

    # Hold out 20% as the test set ...
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    # ... then split the remaining rows 80/20 into training and validation.
    X_train, X_val, y_train, y_val = train_test_split(
        X_train, y_train, test_size=0.2, random_state=42
    )

    model_svr = SVR()

    model_pipeline = make_pipeline(model_svr)

    model_pipeline.fit(X_train, y_train)

    # A regressor's score() is R2, so each split is scored once and reported
    # as R2 rather than also being printed under an "accuracy" label.
    splits = (
        ("training", X_train, y_train),
        ("validation", X_val, y_val),
        ("test", X_test, y_test),
    )
    for split_name, features, labels in splits:
        print(f"{split_name} R2 score:", model_pipeline.score(features, labels))

if __name__ == '__main__':
    main()

In [None]:
# Run model_svr.py on the processed data. The script fits an SVR and
# prints its training, validation and test scores; it does not save a model file.
!python {train_src_dir}/model_svr.py --data {train_src_dir}/used_cars_processed.csv

In [None]:
%%writefile {train_src_dir}/model_knn.py

# import mlflow
import argparse

import pandas as pd

from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer

from sklearn.model_selection import train_test_split

from sklearn.neighbors import KNeighborsRegressor

# mlflow.start_run()

def main():
    """Fit a KNeighborsRegressor on the given CSV and print R2 for each split.

    Command-line arguments:
        --data: path of the CSV to train on; must contain a ``price`` column.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--data", type=str, help="path to train data")

    args = parser.parse_args()

    df = pd.read_csv(args.data)

    target = 'price'

    # Every column except the target is used as a feature.
    X = df.drop([target], axis=1)
    y = df[target]

    # Hold out 20% as the test set ...
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    # ... then split the remaining rows 80/20 into training and validation.
    X_train, X_val, y_train, y_val = train_test_split(
        X_train, y_train, test_size=0.2, random_state=42
    )

    model_knn = KNeighborsRegressor()

    model_pipeline = make_pipeline(model_knn)

    model_pipeline.fit(X_train, y_train)

    # A regressor's score() is R2, so each split is scored once and reported
    # as R2 rather than also being printed under an "accuracy" label.
    splits = (
        ("training", X_train, y_train),
        ("validation", X_val, y_val),
        ("test", X_test, y_test),
    )
    for split_name, features, labels in splits:
        print(f"{split_name} R2 score:", model_pipeline.score(features, labels))

if __name__ == '__main__':
    main()

In [None]:
# Run model_knn.py on the processed data. The script fits a KNeighborsRegressor and
# prints its training, validation and test scores; it does not save a model file.
!python {train_src_dir}/model_knn.py --data {train_src_dir}/used_cars_processed.csv

In [None]:
%%writefile {train_src_dir}/model_bayesian.py

# import mlflow
import argparse

import pandas as pd

from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer

from sklearn.model_selection import train_test_split
from sklearn.linear_model import BayesianRidge

# mlflow.start_run()

def main():
    """Fit a BayesianRidge model on the given CSV and print R2 for each split.

    Command-line arguments:
        --data: path of the CSV to train on; must contain a ``price`` column.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--data", type=str, help="path to train data")

    args = parser.parse_args()

    df = pd.read_csv(args.data)

    target = 'price'

    # Every column except the target is used as a feature.
    X = df.drop([target], axis=1)
    y = df[target]

    # Hold out 20% as the test set ...
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    # ... then split the remaining rows 80/20 into training and validation.
    X_train, X_val, y_train, y_val = train_test_split(
        X_train, y_train, test_size=0.2, random_state=42
    )

    model_bayesian = BayesianRidge()

    model_pipeline = make_pipeline(model_bayesian)

    model_pipeline.fit(X_train, y_train)

    # A regressor's score() is R2, so each split is scored once and reported
    # as R2 rather than also being printed under an "accuracy" label.
    splits = (
        ("training", X_train, y_train),
        ("validation", X_val, y_val),
        ("test", X_test, y_test),
    )
    for split_name, features, labels in splits:
        print(f"{split_name} R2 score:", model_pipeline.score(features, labels))

if __name__ == '__main__':
    main()

In [None]:
# Run model_bayesian.py on the processed data. The script fits a BayesianRidge model and
# prints its training, validation and test scores; it does not save a model file.
!python {train_src_dir}/model_bayesian.py --data {train_src_dir}/used_cars_processed.csv

In [None]:

# different regression models 
# 1. Linear Regression
# 2. Decision Tree Regression
# 3. Random Forest Regression
# 4. Gradient Boosting Regression
# 5. Neural Network Regression
# 6. Support Vector Regression
# 7. K-Nearest Neighbors Regression
# 8. Ridge Regression
# 9. Lasso Regression
# 10. ElasticNet Regression
# 11. Bayesian Regression
# 12. Huber Regression
# 13. TheilSen Regression
# 14. RANSAC Regression
# 15. Poisson Regression
# 16. Gamma Regression
# 17. Tweedie Regression
# 18. Passive Aggressive Regression
# 19. Orthogonal Matching Pursuit Regression
# 20. Bayesian Ridge Regression
# 21. ARD Regression
# 22. SGD Regression
# 23. Extra Trees Regression
# 24. Multi Task ElasticNet Regression
# 25. Multi Task Lasso Regression
# 26. Multi Task LassoCV Regression
# 27. Multi Task Ridge Regression
# 28. Radius Neighbors Regression
# 29. Isotonic Regression


In [None]:
# Sweep n_estimators x learning_rate by invoking the training script once per combination (9 runs).
# NOTE(review): no cell visible in this notebook writes main.py; the --n_estimators and
# --learning_rate flags match model_gbr.py's command line — confirm which script is intended.
for n_estimators in [100, 200, 300]:
    for learning_rate in [0.1, 0.01, 0.001]:
        !python {train_src_dir}/main.py --data {train_src_dir}/used_cars_processed.csv --n_estimators {n_estimators} --learning_rate {learning_rate}

In [None]:
%%writefile {train_src_dir}/pre_process.py
import os
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
def main():
    """Label-encode and standardise a CSV and write the result to disk.

    Command-line arguments:
        --data:   path of the input CSV (required).
        --output: directory that receives ``processed_data.csv``
                  (required; created if missing).
        --target: name of the label column, default ``price``. It is left
                  unscaled so downstream error metrics stay in the label's
                  original units.
    """
    # argparse is not among this script's top-level imports, so import it
    # where the arguments are parsed.
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("--data", type=str, required=True,
                        help="path to input data")
    parser.add_argument("--output", type=str, required=True,
                        help="path to output data")
    parser.add_argument("--target", type=str, default="price",
                        help="label column that must not be scaled")
    args = parser.parse_args()

    data = pd.read_csv(args.data)

    ###################
    #<processing>
    ###################

    # Column groups are decided from the dtypes before any encoding, so the
    # label-encoded columns below are not standardised afterwards.
    categorical_columns = data.select_dtypes(include=['object']).columns
    numerical_columns = data.select_dtypes(include=['float64', 'int64']).columns
    # Keep the label out of the scaler; only feature columns are standardised.
    columns_to_scale = [col for col in numerical_columns if col != args.target]

    # Each categorical column gets its own freshly fitted encoder.
    for col in categorical_columns:
        data[col] = LabelEncoder().fit_transform(data[col])

    # NOTE(review): the scaler is fitted on the whole file, before any
    # train/test split, so held-out rows influence the scaling statistics.
    if columns_to_scale:
        data[columns_to_scale] = StandardScaler().fit_transform(data[columns_to_scale])

    # to_csv does not create missing directories.
    os.makedirs(args.output, exist_ok=True)
    data.to_csv(os.path.join(args.output, 'processed_data.csv'), index=False)

if __name__ == "__main__":
    main()