In [1]:
# =============================================================================
# PREDICTING PRICE OF PRE-OWNED CARS 
# =============================================================================

import pandas as pd
import numpy as np
import seaborn as sns
import pickle
import matplotlib.pyplot as plt
import argparse
import os

In [2]:

from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor

from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

In [3]:
!python -V

Python 3.9.12


In [4]:
def dump_pickle(obj, filename):
    """Pickle ``obj`` into the file at ``filename``.

    The file is opened in binary write mode, so an existing file at that
    path is overwritten. Returns the result of ``pickle.dump`` (``None``).
    """
    with open(filename, "wb") as sink:
        outcome = pickle.dump(obj, sink)
    return outcome

In [5]:
def read_Dataframe(path_to_df, test_size=0.3, random_state=3):
    """Load the car listings CSV, clean it and return a train/test split.

    Processing steps, in order:

    1. Drop the ``name``, ``dateCrawled``, ``dateCreated``, ``postalCode``
       and ``lastSeen`` columns, then drop duplicated rows (first kept).
    2. Keep only rows with ``yearOfRegistration`` in [1950, 2018],
       ``price`` in [100, 150000] and ``powerPS`` in [10, 500].
    3. Derive ``Age`` as ``(2018 - yearOfRegistration) +
       monthOfRegistration / 12``, rounded to 2 decimals.
    4. Drop ``yearOfRegistration``, ``monthOfRegistration``, ``seller``,
       ``offerType`` and ``abtest``, then drop rows with any missing value.
    5. Split into train/test and take the natural log of the prices.

    Parameters
    ----------
    path_to_df : str or path-like
        Location of the CSV file, passed to ``pd.read_csv``.
    test_size : float, default 0.3
        Forwarded to ``train_test_split``.
    random_state : int, default 3
        Forwarded to ``train_test_split``.

    Returns
    -------
    X_train, X_test : pd.DataFrame
        Feature frames (every remaining column except ``price``).
    y_train, y_test : np.ndarray
        Natural log of the corresponding prices.
    """
    listing_metadata = ['name', 'dateCrawled', 'dateCreated', 'postalCode', 'lastSeen']
    cars = (
        pd.read_csv(path_to_df)
        .drop(columns=listing_metadata)
        .drop_duplicates(keep='first')
    )

    plausible = (
        (cars.yearOfRegistration <= 2018)
        & (cars.yearOfRegistration >= 1950)
        & (cars.price >= 100)
        & (cars.price <= 150000)
        & (cars.powerPS >= 10)
        & (cars.powerPS <= 500)
    )
    # .copy() makes the filtered frame independent of the unfiltered one, so
    # adding the Age column below does not raise SettingWithCopyWarning.
    cars = cars[plausible].copy()

    # Age combines the year difference with the registration month expressed
    # as a fraction of a year.
    cars['Age'] = (
        (2018 - cars['yearOfRegistration']) + cars['monthOfRegistration'] / 12
    ).round(2)

    no_longer_needed = [
        'yearOfRegistration', 'monthOfRegistration',
        'seller', 'offerType', 'abtest',
    ]
    cars = cars.drop(columns=no_longer_needed).dropna(axis=0)

    features = cars.drop(columns=['price'])
    prices = cars['price']
    X_train, X_test, y_train, y_test = train_test_split(
        features, prices, test_size=test_size, random_state=random_state
    )

    # Models are trained on log prices.
    return X_train, X_test, np.log(y_train.values), np.log(y_test.values)

In [6]:
def preprocess(df: pd.DataFrame, dv: DictVectorizer, fit_dv: bool = False):
    """Encode the model features of ``df`` with a DictVectorizer.

    The categorical columns are cast to ``str``, combined with the numerical
    columns and converted to one dict per row before being handed to ``dv``.
    The input frame is left untouched: the cast is applied to a new frame
    rather than written back into ``df``.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain the categorical and numerical columns listed below;
        other columns are ignored.
    dv : DictVectorizer
        Vectorizer used for the encoding.
    fit_dv : bool, default False
        When True ``dv.fit_transform`` is used (training data); otherwise
        ``dv.transform`` is used with the already fitted vectorizer.

    Returns
    -------
    (X, dv)
        The encoded feature matrix and the vectorizer that produced it.
    """
    categorical = ['vehicleType', 'gearbox', 'model', 'fuelType', 'brand', 'notRepairedDamage']
    numerical = ['powerPS', 'kilometer', 'Age']

    # astype() returns a new frame, so the caller's DataFrame is not mutated
    # (and no SettingWithCopyWarning is raised for train_test_split slices).
    features = df[categorical + numerical].astype({column: str for column in categorical})
    dicts = features.to_dict(orient='records')

    X = dv.fit_transform(dicts) if fit_dv else dv.transform(dicts)
    return X, dv

In [5]:
def run(raw_data_path: str, dest_path: str, dataset: str = "cars_sampled.csv"):
    """Build the train/test artifacts from the raw CSV and pickle them.

    Reads ``dataset`` from ``raw_data_path``, encodes both splits (the
    DictVectorizer is fitted on the training split only) and writes
    ``dv.pkl``, ``train.pkl`` and ``test.pkl`` into ``dest_path``, creating
    that folder when it does not exist yet.
    """
    csv_path = os.path.join(raw_data_path, dataset)
    X_train, X_test, y_train, y_test = read_Dataframe(csv_path)

    vectorizer = DictVectorizer()
    X_train, vectorizer = preprocess(X_train, vectorizer, fit_dv=True)
    X_test, _ = preprocess(X_test, vectorizer, fit_dv=False)

    # Make sure the output folder is there before writing into it.
    os.makedirs(dest_path, exist_ok=True)

    # Fitted vectorizer first, then the (features, target) pair of each split.
    artifacts = {
        "dv.pkl": vectorizer,
        "train.pkl": (X_train, y_train),
        "test.pkl": (X_test, y_test),
    }
    for file_name, payload in artifacts.items():
        dump_pickle(payload, os.path.join(dest_path, file_name))
    
    
    
    

In [None]:
if __name__ == '__main__':
    # Command-line entry point: encode the raw car listings and pickle the
    # resulting vectorizer and train/test sets.
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--raw_data_path",
        help="the location where the raw pre-owned cars data was saved"
    )
    parser.add_argument(
        "--dest_path",
        help="the location where the resulting files will be saved."
    )
    # parse_known_args() ignores arguments it does not know about. A Jupyter
    # kernel is started with extra argv entries (e.g. ``-f <connection file>``)
    # that would make parse_args() exit with an error.
    args, _unrecognized = parser.parse_known_args()

    if args.raw_data_path is None or args.dest_path is None:
        # Without both paths run() would fail inside os.path.join/os.makedirs
        # with a TypeError, so say what is missing instead.
        print("Skipping run(): pass both --raw_data_path and --dest_path "
              "to build the pickled datasets.")
    else:
        run(args.raw_data_path, args.dest_path)

In [None]:
# Load 'cars_sampled.csv' from the working directory, clean it and split it
# into train/test feature frames and log-price target arrays.
X_train, X_test, y_train, y_test = read_Dataframe('cars_sampled.csv')

In [None]:
# Fit the DictVectorizer on the training rows only, then reuse the fitted
# vectorizer to encode the test rows.
# NOTE: X_train / X_test are rebound from DataFrames to encoded matrices here,
# so re-run the loading cell before running this cell a second time.
dv = DictVectorizer()
X_train, dv = preprocess(X_train, dv, fit_dv=True)
X_test, _ = preprocess(X_test, dv, fit_dv=False)

In [None]:
#new
# categorical = ['vehicleType', 'gearbox','model', 'fuelType', 'brand', 'notRepairedDamage']
# numerical = ['powerPS','kilometer','Age']

# X_train[categorical] = X_train[categorical].astype(str)

In [None]:

# dv = DictVectorizer()
# train_dicts = X_train[categorical + numerical].to_dict(orient='records')
# X_train = dv.fit_transform(train_dicts)


# test_dicts = X_test[categorical + numerical].to_dict(orient='records')
# X_test = dv.transform(test_dicts)


# # target = 'price'
# y_train = np.log(y_train.values)
# y_test = np.log(y_test.values)

# lr = LinearRegression()
# lr.fit(X_train, y_train)

# y_pred = lr.predict(X_train)

# mean_squared_error(y_train, y_pred, squared=False)

In [None]:
# Linear regression baseline, fitted with an intercept term
lgr = LinearRegression(fit_intercept=True)
model_lin1 = lgr.fit(X_train, y_train)  # fit() hands back the estimator itself

# Predictions for the held-out rows
cars_predictions_lin1 = lgr.predict(X_test)

# Error on the test split: MSE first, RMSE as its square root
lin_mse1 = mean_squared_error(y_test, cars_predictions_lin1)
lin_rmse1 = np.sqrt(lin_mse1)
print(lin_rmse1)

# Coefficient of determination on the test and on the training split
r2_lin_test1 = model_lin1.score(X_test, y_test)
r2_lin_train1 = model_lin1.score(X_train, y_train)
print(r2_lin_test1, r2_lin_train1)

# Residual diagnostics: residuals against predictions, without a fitted line
residuals1 = y_test - cars_predictions_lin1
sns.regplot(
    x=cars_predictions_lin1,
    y=residuals1,
    scatter=True,
    fit_reg=False,
)

In [None]:
# Compare the distribution of the linear model's predictions with the actual
# test targets (both are log prices).
# sns.distplot is deprecated since seaborn 0.11; histplot with a KDE overlay
# and density normalisation is the documented replacement.
sns.histplot(cars_predictions_lin1, kde=True, stat="density", label='prediction')
sns.histplot(y_test, kde=True, stat="density", label='actual')

plt.legend()

In [None]:
# =============================================================================
# RANDOM FOREST WITH OMITTED DATA
# =============================================================================

# Model parameters
# max_features=1.0 lets every split consider all features, which is what the
# former 'auto' alias meant for regressors. 'auto' was deprecated in
# scikit-learn 1.1 and removed in later releases, while 1.0 is accepted by
# both old and new versions.
rf = RandomForestRegressor(n_estimators=100, max_features=1.0,
                           max_depth=100, min_samples_split=10,
                           min_samples_leaf=4, random_state=1)

# Model
model_rf1 = rf.fit(X_train, y_train)

# Predicting model on test set
cars_predictions_rf1 = rf.predict(X_test)

# Computing MSE and RMSE
rf_mse1 = mean_squared_error(y_test, cars_predictions_rf1)
rf_rmse1 = np.sqrt(rf_mse1)
print(rf_rmse1)

# R squared value
r2_rf_test1 = model_rf1.score(X_test, y_test)
r2_rf_train1 = model_rf1.score(X_train, y_train)
print(r2_rf_test1, r2_rf_train1)


In [None]:
import xgboost as xgb
import mlflow

from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from hyperopt.pyll import scope



In [None]:
# Wrap the encoded splits in DMatrix objects for xgb.train.
# NOTE(review): `valid` is built from the test split (X_test / y_test); this
# notebook does not create a separate validation set.
train = xgb.DMatrix(X_train, label=y_train)
valid = xgb.DMatrix(X_test, label=y_test)

In [None]:
# Track runs in a SQLite database file (mlflow.db) and group them under the
# "my-final-project" experiment.
mlflow.set_tracking_uri("sqlite:///mlflow.db")
mlflow.set_experiment("my-final-project")

In [None]:
def objective(params):
    """Train one XGBoost model with ``params`` and report its RMSE.

    Runs inside its own MLflow run: the run is tagged ``model=xgboost``, the
    parameters are logged, the booster is trained on the module-level
    ``train`` DMatrix (up to 100 rounds, early stopping after 50 rounds
    without improvement on ``valid``) and the RMSE of its predictions for
    ``valid`` against ``y_test`` is logged as the ``rmse`` metric.

    Parameters
    ----------
    params : dict
        XGBoost training parameters, passed to ``xgb.train`` unchanged.

    Returns
    -------
    dict
        ``{'loss': rmse, 'status': STATUS_OK}`` in the format hyperopt's
        ``fmin`` expects from an objective function.
    """
    with mlflow.start_run():
        mlflow.set_tag("model", "xgboost")
        mlflow.log_params(params)
        booster = xgb.train(
            params=params,
            dtrain=train,
            num_boost_round=100,
            evals=[(valid, 'validation')],
            early_stopping_rounds=50
        )
        y_pred = booster.predict(valid)
        # RMSE as sqrt(MSE), like the other model cells. The `squared=False`
        # keyword of mean_squared_error is deprecated in recent scikit-learn
        # releases and removed afterwards.
        rmse = np.sqrt(mean_squared_error(y_test, y_pred))
        mlflow.log_metric("rmse", rmse)

    return {'loss': rmse, 'status': STATUS_OK}

In [None]:
# Hyperparameter ranges explored by hyperopt. 'reg:squarederror' is the
# current name of the squared-error regression objective; the older
# 'reg:linear' alias is deprecated in XGBoost.
search_space = {
    'max_depth': scope.int(hp.quniform('max_depth', 4, 100, 1)),
    'learning_rate': hp.loguniform('learning_rate', -3, 0),
    'reg_alpha': hp.loguniform('reg_alpha', -5, -1),
    'reg_lambda': hp.loguniform('reg_lambda', -6, -1),
    'min_child_weight': hp.loguniform('min_child_weight', -1, 3),
    'objective': 'reg:squarederror',
    'seed': 42
}

# Fixed parameter set kept for reference (presumably the best result of an
# earlier search). It contains no hyperopt expressions, so handing it to
# fmin would train the same model on every evaluation.
best_search_result = {
    'max_depth': 11,
    'learning_rate': 0.11504139773734708,
    'reg_alpha': 0.03143119240248877,
    'reg_lambda': 0.0058914904219020325,
    'min_child_weight': 9.242463709505468,
    'objective': 'reg:squarederror',
    'seed': 42
}

# Search over `search_space` so that TPE actually has parameters to tune.
best_result = fmin(
    fn=objective,
    space=search_space,
    algo=tpe.suggest,
    max_evals=50,
    trials=Trials()
)

In [None]:
# Persist the fitted linear regression estimator next to the notebook.
with open('./lin_reg.bin', 'wb') as f_out:
    pickle.dump( lgr, f_out)


In [None]:
# Persist the fitted random forest estimator next to the notebook.
with open('./random_forest.bin', 'wb') as f_out:
    pickle.dump( rf, f_out)