## Installing required libraries

- Library for categorical encoding
- weight and biases library

In [None]:
!pip install category_encoders

Collecting category_encoders
  Downloading category_encoders-2.2.2-py2.py3-none-any.whl (80 kB)
[?25l[K     |████                            | 10 kB 23.9 MB/s eta 0:00:01[K     |████████▏                       | 20 kB 28.4 MB/s eta 0:00:01[K     |████████████▏                   | 30 kB 26.1 MB/s eta 0:00:01[K     |████████████████▎               | 40 kB 18.7 MB/s eta 0:00:01[K     |████████████████████▎           | 51 kB 5.6 MB/s eta 0:00:01[K     |████████████████████████▍       | 61 kB 5.9 MB/s eta 0:00:01[K     |████████████████████████████▍   | 71 kB 5.5 MB/s eta 0:00:01[K     |████████████████████████████████| 80 kB 3.8 MB/s 
Installing collected packages: category-encoders
Successfully installed category-encoders-2.2.2


In [None]:
!pip install wandb

Collecting wandb
  Downloading wandb-0.12.4-py2.py3-none-any.whl (1.7 MB)
[K     |████████████████████████████████| 1.7 MB 5.4 MB/s 
[?25hCollecting subprocess32>=3.5.3
  Downloading subprocess32-3.5.4.tar.gz (97 kB)
[K     |████████████████████████████████| 97 kB 6.4 MB/s 
Collecting GitPython>=1.0.0
  Downloading GitPython-3.1.24-py3-none-any.whl (180 kB)
[K     |████████████████████████████████| 180 kB 45.7 MB/s 
Collecting pathtools
  Downloading pathtools-0.1.2.tar.gz (11 kB)
Collecting docker-pycreds>=0.4.0
  Downloading docker_pycreds-0.4.0-py2.py3-none-any.whl (9.0 kB)
Collecting configparser>=3.8.1
  Downloading configparser-5.0.2-py3-none-any.whl (19 kB)
Collecting sentry-sdk>=1.0.0
  Downloading sentry_sdk-1.4.3-py2.py3-none-any.whl (139 kB)
[K     |████████████████████████████████| 139 kB 50.4 MB/s 
[?25hCollecting shortuuid>=0.5.0
  Downloading shortuuid-1.0.1-py3-none-any.whl (7.5 kB)
Collecting yaspin>=1.0.0
  Downloading yaspin-2.1.0-py3-none-any.whl (18 kB)
Colle

## Loading the dataset: Used Car Price Prediction

In [None]:
import pandas as pd
import numpy as np
from category_encoders import OneHotEncoder, TargetEncoder
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import r2_score, mean_squared_error
import xgboost as xgb
import wandb
import os

  import pandas.util.testing as tm


In [None]:
cars_df = pd.read_csv( "https://drive.google.com/uc?export=download&id=1V_VBbyjGj6vvD0A90S5Lk0DG90djz28B" )

In [None]:
cars_df.head(5)

Unnamed: 0,Name,Location,Year,Kilometers_Driven,Fuel_Type,Transmission,Owner_Type,Mileage,Engine,Power,Seats,New_Price,Price,mileage_new,engine_new,power_new,age,make,model,KM_Driven
0,Honda Jazz V,Chennai,2011,46000,Petrol,Manual,First,18.2 kmpl,1199 CC,88.7 bhp,5.0,8.61 Lakh,4.5,18.2,1199,88.7,9,honda,jazz,46
1,Maruti Ertiga VDI,Chennai,2012,87000,Diesel,Manual,First,20.77 kmpl,1248 CC,88.76 bhp,7.0,,6.0,20.77,1248,88.76,8,maruti,ertiga,87
2,Nissan Micra Diesel XV,Jaipur,2013,86999,Diesel,Manual,First,23.08 kmpl,1461 CC,63.1 bhp,5.0,,3.5,23.08,1461,63.1,7,nissan,micra,86
3,Tata Indica Vista Quadrajet LS,Chennai,2012,65932,Diesel,Manual,Second,22.3 kmpl,1248 CC,74 bhp,5.0,,1.95,22.3,1248,74.0,8,tata,indica,65
4,Maruti Swift VDI BSIV,Jaipur,2015,64424,Diesel,Manual,First,25.2 kmpl,1248 CC,74 bhp,5.0,,5.6,25.2,1248,74.0,5,maruti,swift,64


In [None]:
x_columns = ['KM_Driven', 'Fuel_Type', 'age',
              'Transmission', 'Owner_Type', 'Seats',
              'make', 'mileage_new', 'engine_new', 'model',
              'power_new', 'Location']
## model of the car is not included in the model

In [None]:
cars_df.shape

(3092, 20)

In [None]:
cars_df = cars_df[x_columns + ['Price']].dropna()

In [None]:
cars_df.shape

(3091, 13)

## Identifying numerical and categorical features

In [None]:
cat_features = ['Fuel_Type',
                'Transmission', 'Owner_Type', 'model',
                'make', 'Location']

In [None]:
num_features = list(set(x_columns) - set(cat_features))

## Utility method for preparing the data

- Splitting the dataset
- Encoding Catgorical Variables

In [None]:
def prepare_data(X,
                 y,
#                 num_scaler = None,
#                 num_imputer = None,
#                 cat_imputer = None,
                 cat_encoder,
                 num_features,
                 cat_features,
                 train_size = 0.8,
                 seed = 100):
  
  # Split the dataset into train and test split
  x_train, x_test, y_train, y_test = train_test_split(X,
                                                      y,
                                                      train_size = train_size,
                                                      random_state = seed)
  
  
  # encoder the categorical features
  cat_encoder.fit(X=x_train, y=y_train)
  x_train_encoded = cat_encoder.transform(x_train)
  x_test_encoded = cat_encoder.transform(x_test)

  return x_train_encoded, x_test_encoded, y_train, y_test
  

In [None]:
#For removing an environment variable wrongly set by os.environ()
#print(os.environ["WANDB_MODE"])
#os.environ.pop("WANDB_MODE")

## Initilializing Weights and Biases

In [None]:
os.environ["WANDB_API_KEY"] = "8985894d5778b95c7acb681fd408dffe742c67c3"

In [None]:
#!wandb online

In [None]:
def run_model_experiment(model,
                         x_train,
                         y_train,
                         x_test,
                         y_test,
                         name,
                         config = None,
                         tags = None):
  
  model.fit(x_train, y_train)

  wandb.init(project='usedcar', config=config, tags = tags)
  wandb.run.name = name
  rmse = np.sqrt(mean_squared_error(y_test, model.predict(x_test)))
  r2 = model.score(x_test, y_test)

  wandb.log( {"rmse" : rmse, 
              "r2": r2} )
  
  wandb.sklearn.plot_regressor(model, 
                               x_train, 
                               x_test, 
                               y_train, 
                               y_test,  
                               model_name=name)
  
  wandb.Artifact(name, 
                 type = 'model',
                 description = config)

  wandb.save()
  wandb.finish()

  return model

## Baseline Model: Linear Regression

In [None]:
ohe_encoder = OneHotEncoder(cols = cat_features,
                            use_cat_names = True)

x_train, x_test, y_train, y_test = prepare_data(cars_df[x_columns],
                                                cars_df.Price,
                                                cat_encoder = ohe_encoder,
                                                num_features = num_features,
                                                cat_features = cat_features)

dtree_model = DecisionTreeRegressor(max_depth = 10)  

dtree_model = run_model_experiment(dtree_model,
                                   x_train,
                                   y_train,
                                   x_test,
                                   y_test,
                                   name = 'DecisionTreeWithOHE',
                                   tags = ['Decision Tree', 'baseline', 'OheEncoder'])

  elif pd.api.types.is_categorical(cols):
[34m[1mwandb[0m: Currently logged in as: [33mmanaranjan[0m (use `wandb login --relogin` to force relogin)


[34m[1mwandb[0m: 
[34m[1mwandb[0m: Plotting DecisionTreeWithOHE.
[34m[1mwandb[0m: Logged summary metrics.
[34m[1mwandb[0m: Logged learning curve.
  result = getattr(ufunc, method)(*inputs, **kwargs)
[34m[1mwandb[0m: Logged outlier candidates.
[34m[1mwandb[0m: Logged residuals.


VBox(children=(Label(value=' 0.13MB of 0.13MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
r2,▁
rmse,▁

0,1
r2,0.79775
rmse,0.94576


## Running Multiple Experiments

### Random Forest

In [None]:
ohe_encoder = OneHotEncoder(cols = cat_features, 
                            use_cat_names = True)

x_train, x_test, y_train, y_test = prepare_data(cars_df[x_columns],
                                                cars_df.Price,
                                                cat_encoder = ohe_encoder,
                                                num_features = num_features,
                                                cat_features = cat_features)

In [None]:
params = { "n_estimators": 100,
           "max_depth": 10,
           "max_features": .2,
           "max_samples": 0.4 }

In [None]:
rf_model = RandomForestRegressor(**params)

In [None]:
rf_model

RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=10, max_features=0.2, max_leaf_nodes=None,
                      max_samples=0.4, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=100, n_jobs=None, oob_score=False,
                      random_state=None, verbose=0, warm_start=False)

In [None]:
rf_model = run_model_experiment(rf_model,
                                x_train,
                                y_train,
                                x_test,
                                y_test,
                                config = params,
                                name = 'RFwithOHE',
                                tags = ['RF', 'OheEncoding'])

[34m[1mwandb[0m: 
[34m[1mwandb[0m: Plotting RFwithOHE.
[34m[1mwandb[0m: Logged summary metrics.
[34m[1mwandb[0m: Logged learning curve.
  result = getattr(ufunc, method)(*inputs, **kwargs)
[34m[1mwandb[0m: Logged outlier candidates.
[34m[1mwandb[0m: Logged residuals.


VBox(children=(Label(value=' 0.14MB of 0.15MB uploaded (0.00MB deduped)\r'), FloatProgress(value=0.88681571900…

0,1
r2,▁
rmse,▁

0,1
r2,0.83415
rmse,0.85642


### RandomForest with Target Encoding

In [None]:
target_encoder = TargetEncoder(cols = cat_features)

x_train, x_test, y_train, y_test = prepare_data(cars_df[x_columns],
                                                cars_df.Price,
                                                cat_encoder = target_encoder,
                                                num_features = num_features,
                                                cat_features = cat_features)

params = { "n_estimators": 100,
           "max_depth": 10,
           "max_features": .2,
           "max_samples": 0.4 }

rf_model = RandomForestRegressor(**params)  

rf_model = run_model_experiment(rf_model,
                                x_train,
                                y_train,
                                x_test,
                                y_test,
                                config = params,
                                name = 'RFwithTargetEncoding',
                                tags = ['RF', 'TargetEncoding'])

[34m[1mwandb[0m: 
[34m[1mwandb[0m: Plotting RFwithTargetEncoding.
[34m[1mwandb[0m: Logged summary metrics.
[34m[1mwandb[0m: Logged learning curve.
[34m[1mwandb[0m: Logged outlier candidates.
[34m[1mwandb[0m: Logged residuals.


VBox(children=(Label(value=' 0.14MB of 0.14MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
r2,▁
rmse,▁

0,1
r2,0.86741
rmse,0.76576


### Gradient Boosted Trees

In [None]:
target_encoder = TargetEncoder(cols = cat_features)

x_train, x_test, y_train, y_test = prepare_data(cars_df[x_columns],
                                                cars_df.Price,
                                                cat_encoder = target_encoder,
                                                num_features = num_features,
                                                cat_features = cat_features)

params = { "n_estimators": 100,
           "max_depth": 4 }

gbm_model = GradientBoostingRegressor(**params)  

gbm_model = run_model_experiment(gbm_model,
                                 x_train,
                                 y_train,
                                 x_test,
                                 y_test,
                                 config = params,
                                 name = 'GBMwithTargetEncoding',
                                 tags = ['GBM', 'TargetEncoding'])

[34m[1mwandb[0m: 
[34m[1mwandb[0m: Plotting GBMwithTargetEncoding.
[34m[1mwandb[0m: Logged summary metrics.
[34m[1mwandb[0m: Logged learning curve.
[34m[1mwandb[0m: Logged outlier candidates.
[34m[1mwandb[0m: Logged residuals.


VBox(children=(Label(value=' 0.13MB of 0.15MB uploaded (0.00MB deduped)\r'), FloatProgress(value=0.88527079998…

0,1
r2,▁
rmse,▁

0,1
r2,0.91134
rmse,0.62618


### Grid Search and Best Model Tracking

In [None]:
target_encoder = TargetEncoder(cols = cat_features)

x_train, x_test, y_train, y_test = prepare_data(cars_df[x_columns],
                                                cars_df.Price,
                                                cat_encoder = target_encoder,
                                                num_features = num_features,
                                                cat_features = cat_features)

params = { "n_estimators": [100, 200, 300],
           "max_depth": [4, 6] }

gbm_cv = GridSearchCV(GradientBoostingRegressor(),
                      param_grid = params,
                      cv = 5,
                      scoring = 'neg_mean_squared_error')  

gbm_cv.fit(x_train, y_train)         

GridSearchCV(cv=10, error_score=nan,
             estimator=GradientBoostingRegressor(alpha=0.9, ccp_alpha=0.0,
                                                 criterion='friedman_mse',
                                                 init=None, learning_rate=0.1,
                                                 loss='ls', max_depth=3,
                                                 max_features=None,
                                                 max_leaf_nodes=None,
                                                 min_impurity_decrease=0.0,
                                                 min_impurity_split=None,
                                                 min_samples_leaf=1,
                                                 min_samples_split=2,
                                                 min_weight_fraction_leaf=0.0,
                                                 n_estimators=100,
                                                 n_iter_...,
                           

In [None]:
gbm_cv.best_params_

{'loss': 'huber', 'max_depth': 4, 'n_estimators': 300}

In [None]:
gbm_model = GradientBoostingRegressor(**gbm_cv.best_params_)  

gbm_model = run_model_experiment(gbm_model,
                                 x_train,
                                 y_train,
                                 x_test,
                                 y_test,
                                 config = gbm_cv.best_params_,
                                 name = 'GBMwithTargetEncoding',
                                 tags = ['GBM', 'TargetEncoding'])

[34m[1mwandb[0m: 
[34m[1mwandb[0m: Plotting GBMwithTargetEncoding.
[34m[1mwandb[0m: Logged summary metrics.
[34m[1mwandb[0m: Logged learning curve.
[34m[1mwandb[0m: Logged outlier candidates.
[34m[1mwandb[0m: Logged residuals.


VBox(children=(Label(value=' 0.14MB of 0.14MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
r2,▁
rmse,▁

0,1
r2,0.90722
rmse,0.64058


### XGB Model

In [None]:
import xgboost as xgb

In [None]:
target_encoder = TargetEncoder(cols = cat_features)

x_train, x_test, y_train, y_test = prepare_data(cars_df[x_columns],
                                                cars_df.Price,
                                                cat_encoder = target_encoder,
                                                num_features = num_features,
                                                cat_features = cat_features)

params = { "n_estimators": 200,
           "max_depth": 6,
           "objective": 'reg:squarederror' }

xgb_regressor = xgb.XGBRegressor(**params)

wandb.init(project='usedcar', 
           config=params, 
           tags = ['XGB', 'TargetEncoding'])

xgb_regressor.fit(x_train, 
                  y_train, 
                  callbacks=[wandb.xgboost.wandb_callback()])

wandb.run.name = "XGBWithTargetEncoding"

rmse = np.sqrt(mean_squared_error(y_test, 
                                  xgb_regressor.predict(x_test)))

r2 = xgb_regressor.score(x_test, y_test)

wandb.log( {"rmse" : rmse, 
            "r2": r2} )
  
wandb.save()
wandb.finish()



VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
r2,▁
rmse,▁

0,1
r2,0.9142
rmse,0.616


## Get Experiment Details

In [None]:
api = wandb.Api()

all_runs = api.runs("manaranjan/usedcar", order="+summary_metrics.rmse")

for run in all_runs:
  print(f"Model Name: {run.name} and RMSE {run.summary.get('rmse')}")
  print(run.config)

Model Name: XGBWithTargetEncoding and RMSE 0.6160012713560885
{'max_depth': 6, 'objective': 'reg:squarederror', 'n_estimators': 200}
Model Name: GBMwithTargetEncoding and RMSE 0.626183385535465
{'max_depth': 4, 'n_estimators': 100}
Model Name: GBMwithTargetEncoding and RMSE 0.6405800756270669
{'loss': 'huber', 'max_depth': 4, 'n_estimators': 300}
Model Name: RFwithTargetEncoding and RMSE 0.7657624697958263
{'max_depth': 10, 'max_samples': 0.4, 'max_features': 0.2, 'n_estimators': 100}
Model Name: RFwithOHE and RMSE 0.8564247778732033
{'max_depth': 10, 'max_samples': 0.4, 'max_features': 0.2, 'n_estimators': 100}
Model Name: DecisionTreeWithOHE and RMSE 0.9457617713462364
{}
