#### Installing weight and biases library

In [2]:
!pip install wandb

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


## Loading the dataset: Used Car Price Prediction

In [3]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import r2_score, mean_squared_error
import wandb
import os

In [4]:
cars_df = pd.read_csv( "https://drive.google.com/uc?export=download&id=1V_VBbyjGj6vvD0A90S5Lk0DG90djz28B" )

In [5]:
cars_df.head(5)

Unnamed: 0,Name,Location,Year,Kilometers_Driven,Fuel_Type,Transmission,Owner_Type,Mileage,Engine,Power,Seats,New_Price,Price,mileage_new,engine_new,power_new,age,make,model,KM_Driven
0,Honda Jazz V,Chennai,2011,46000,Petrol,Manual,First,18.2 kmpl,1199 CC,88.7 bhp,5.0,8.61 Lakh,4.5,18.2,1199,88.7,9,honda,jazz,46
1,Maruti Ertiga VDI,Chennai,2012,87000,Diesel,Manual,First,20.77 kmpl,1248 CC,88.76 bhp,7.0,,6.0,20.77,1248,88.76,8,maruti,ertiga,87
2,Nissan Micra Diesel XV,Jaipur,2013,86999,Diesel,Manual,First,23.08 kmpl,1461 CC,63.1 bhp,5.0,,3.5,23.08,1461,63.1,7,nissan,micra,86
3,Tata Indica Vista Quadrajet LS,Chennai,2012,65932,Diesel,Manual,Second,22.3 kmpl,1248 CC,74 bhp,5.0,,1.95,22.3,1248,74.0,8,tata,indica,65
4,Maruti Swift VDI BSIV,Jaipur,2015,64424,Diesel,Manual,First,25.2 kmpl,1248 CC,74 bhp,5.0,,5.6,25.2,1248,74.0,5,maruti,swift,64


In [6]:
x_columns = ['KM_Driven', 'Fuel_Type', 'age',
             'Transmission', 'Owner_Type', 'Seats',
             'make', 'mileage_new', 'engine_new', 'model',
             'power_new', 'Location']
## model of the car is not included in the model

In [7]:
cars_df.shape

(3092, 20)

In [8]:
cars_df = cars_df[x_columns + ['Price']].dropna()

In [9]:
cars_df.shape

(3091, 13)

## Identifying numerical and categorical features

In [10]:
cat_features = ['Fuel_Type',
                'Transmission', 'Owner_Type', 'model',
                'make', 'Location']

In [11]:
num_features = list(set(x_columns) - set(cat_features))

## Utility method for preparing the data

- Splitting the dataset
- Encoding Catgorical Variables

In [12]:
X = cars_df[x_columns]
y = cars_df.Price

In [13]:
# Split the dataset into train and test split
x_train, x_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    train_size = 0.8,
                                                    random_state = 100)

### Creating ML Pipeline

In [14]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [15]:
ohe_encoder = OneHotEncoder(handle_unknown='ignore')
scaler = StandardScaler()

## Creating the imputer for columns that have missing values
imputed_num_vars = ['Seats']
non_imputed_num_vars = list(set(num_features) - set(imputed_num_vars))
mean_imputer = SimpleImputer(strategy='mean')


## Pipeline for the applying imputation and then scaling
imputed_num_transformer = Pipeline( steps = [  
        ('imputation', mean_imputer),
        ('scaler', scaler)])

non_imputed_num_transformer = Pipeline( steps = [('scaler', scaler)])


## Pipeline for OHE encoding the categorical columns
cat_transformer = Pipeline( steps = [('ohencoder', ohe_encoder)])

## The complete pipeline for applying the required transformatinons to the respective columns
preprocessor = ColumnTransformer(
    transformers=[  
        ('num_imputed', imputed_num_transformer, imputed_num_vars),
        ('num_not_imputed', non_imputed_num_transformer, non_imputed_num_vars),
        ('catvars', cat_transformer, cat_features)])

## Initilializing Weights and Biases

In [16]:
os.environ["WANDB_API_KEY"] = "dc797f600c763f09dc0ffc8637c8bcdf5bc1294b"

## Baseline Model: Linear Regression

In [17]:
linear_reg = LinearRegression()  

linear_model = Pipeline(steps=[('preprocessor', preprocessor),
                               ('linear_model', linear_reg)])
## Pipeline for the applying imputation and then scaling

linear_model.fit(x_train, y_train)

wandb.init(project='mlops_usedcar', config=None, tags = ['Linear Model', 'baseline', 'OHE Encoding'])
wandb.run.name = "LinearModel"
rmse = np.sqrt(mean_squared_error(y_test, linear_model.predict(x_test)))
r2 = linear_model.score(x_test, y_test)

wandb.log( {"rmse" : rmse, 
            "r2": r2} )
  
wandb.sklearn.plot_regressor(linear_model, 
                             x_train, 
                             x_test, 
                             y_train, 
                             y_test,  
                             model_name="LinearModel")
  
wandb.Artifact("LinearModel", 
               type = 'model',
               description = None)

wandb.save()
wandb.finish()

ERROR:wandb.jupyter:Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mawesomestats[0m. Use [1m`wandb login --relogin`[0m to force relogin


[34m[1mwandb[0m: 
[34m[1mwandb[0m: Plotting LinearModel.
[34m[1mwandb[0m: [32m[41mERROR[0m X contains values that are not numbers. Please vectorize, label encode or one hot encode X and call the plotting function again.
[34m[1mwandb[0m: [32m[41mERROR[0m X_test contains values that are not numbers. Please vectorize, label encode or one hot encode X_test and call the plotting function again.
[34m[1mwandb[0m: Logged summary metrics.
[34m[1mwandb[0m: [32m[41mERROR[0m X contains values that are not numbers. Please vectorize, label encode or one hot encode X and call the plotting function again.
[34m[1mwandb[0m: Logged learning curve.
[34m[1mwandb[0m: [32m[41mERROR[0m X contains values that are not numbers. Please vectorize, label encode or one hot encode X and call the plotting function again.
[34m[1mwandb[0m: Logged outlier candidates.
[34m[1mwandb[0m: [32m[41mERROR[0m X contains values that are not numbers. Please vectorize, label encode or one

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
r2,▁
rmse,▁

0,1
r2,0.8791
rmse,0.73121


In [18]:
params = {"max_depth": 10}

dtree = DecisionTreeRegressor(**params)  

dtree_model = Pipeline(steps=[('preprocessor', preprocessor),
                               ('linear_model', dtree)])


dtree_model.fit(x_train, y_train)

wandb.init(project='mlops_usedcar', config=params, tags = ['Decision Tree', 'OHE Encoding'])
wandb.run.name = "DecisionTree"
rmse = np.sqrt(mean_squared_error(y_test, dtree_model.predict(x_test)))
r2 = dtree_model.score(x_test, y_test)

wandb.log( {"rmse" : rmse, 
            "r2": r2} )
  
wandb.sklearn.plot_regressor(dtree_model, 
                             x_train, 
                             x_test, 
                             y_train, 
                             y_test,  
                             model_name="DecisionTree")
  
wandb.Artifact("DecisionTree", 
               type = 'model',
               description = params)

wandb.save()
wandb.finish()

[34m[1mwandb[0m: 
[34m[1mwandb[0m: Plotting DecisionTree.
[34m[1mwandb[0m: [32m[41mERROR[0m X contains values that are not numbers. Please vectorize, label encode or one hot encode X and call the plotting function again.
[34m[1mwandb[0m: [32m[41mERROR[0m X_test contains values that are not numbers. Please vectorize, label encode or one hot encode X_test and call the plotting function again.
[34m[1mwandb[0m: Logged summary metrics.
[34m[1mwandb[0m: [32m[41mERROR[0m X contains values that are not numbers. Please vectorize, label encode or one hot encode X and call the plotting function again.
[34m[1mwandb[0m: Logged learning curve.
[34m[1mwandb[0m: [32m[41mERROR[0m X contains values that are not numbers. Please vectorize, label encode or one hot encode X and call the plotting function again.
[34m[1mwandb[0m: Logged outlier candidates.
[34m[1mwandb[0m: [32m[41mERROR[0m X contains values that are not numbers. Please vectorize, label encode or on

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
r2,▁
rmse,▁

0,1
r2,0.7978
rmse,0.94564


## Get Experiment Details

In [19]:
api = wandb.Api()

all_runs = api.runs("awesomestats/mlops_usedcar", order="+summary_metrics.rmse")

for run in all_runs:
  print(f"Model Name: {run.name} and R2 {run.summary.get('r2')}")
  print(run.config)

Model Name: LinearModel and R2 0.8791047274738495
{}
Model Name: DecisionTree and R2 0.7978008514988881
{'max_depth': 10}


### Storing the model into a file

In [20]:
from joblib import dump

MODEL_DIR = "./carsmodel"

os.mkdir(MODEL_DIR)
dump(linear_model, MODEL_DIR + "/" + 'cars.pkl')

['./carsmodel/cars.pkl']

### Logging the model artifact in the tracking tools (weights and Biases)

In [21]:
wandb.init(project='mlops_usedcar', 
           config=None, 
           tags = ['Final Model'])
wandb.run.name = "FinalModel"

In [22]:
model_artifact = wandb.Artifact("Linear_Model_UsedCar",
                                type = 'model',
                                description = 'Linear Model for used car price prediction')

In [23]:
model_artifact.add_dir(MODEL_DIR)

[34m[1mwandb[0m: Adding directory to artifact (./carsmodel)... Done. 0.1s


In [24]:
wandb.run.log_artifact(model_artifact)

<wandb.sdk.wandb_artifacts.Artifact at 0x7fd7bb241510>

In [25]:
wandb.save()
wandb.finish()

VBox(children=(Label(value='0.009 MB of 0.009 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

In [None]:
import sklearn
sklearn.__version__