## Installing required libraries

- ONNX libraries (`onnx`, `skl2onnx`, `onnxruntime`, `onnxmltools`)
- Weights & Biases library (`wandb`)

In [1]:
!pip install wandb
!pip install onnx
!pip install skl2onnx
!pip install onnxruntime
!pip install onnxmltools

Collecting wandb
  Downloading wandb-0.12.4-py2.py3-none-any.whl (1.7 MB)
[?25l[K     |▏                               | 10 kB 20.9 MB/s eta 0:00:01[K     |▍                               | 20 kB 27.3 MB/s eta 0:00:01[K     |▋                               | 30 kB 21.5 MB/s eta 0:00:01[K     |▉                               | 40 kB 17.6 MB/s eta 0:00:01[K     |█                               | 51 kB 10.3 MB/s eta 0:00:01[K     |█▏                              | 61 kB 10.4 MB/s eta 0:00:01[K     |█▍                              | 71 kB 9.7 MB/s eta 0:00:01[K     |█▋                              | 81 kB 10.7 MB/s eta 0:00:01[K     |█▉                              | 92 kB 8.4 MB/s eta 0:00:01[K     |██                              | 102 kB 9.2 MB/s eta 0:00:01[K     |██▏                             | 112 kB 9.2 MB/s eta 0:00:01[K     |██▍                             | 122 kB 9.2 MB/s eta 0:00:01[K     |██▋                             | 133 kB 9.2 MB/s eta 0:00:0

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from xgboost import XGBRegressor
from onnx.tools.net_drawer import GetPydotGraph, GetOpNodeProducer
import onnxruntime as rt
from skl2onnx import convert_sklearn, __version__, update_registered_converter
from skl2onnx.common.data_types import FloatTensorType, StringTensorType
from skl2onnx.common.data_types import Int64TensorType
from skl2onnx.common.shape_calculator import calculate_linear_regressor_output_shapes
from onnxmltools.convert.xgboost.operator_converters.XGBoost import convert_xgboost  # noqa
import pprint
import os
import wandb

## Loading the dataset: Used Car Price Prediction

In [3]:
# Direct-download link of the used-car price CSV.
CARS_CSV_URL = "https://drive.google.com/uc?export=download&id=1V_VBbyjGj6vvD0A90S5Lk0DG90djz28B"
cars_df = pd.read_csv(CARS_CSV_URL)

In [4]:
# Preview the first five rows (five is the default for head()).
cars_df.head()

Unnamed: 0,Name,Location,Year,Kilometers_Driven,Fuel_Type,Transmission,Owner_Type,Mileage,Engine,Power,Seats,New_Price,Price,mileage_new,engine_new,power_new,age,make,model,KM_Driven
0,Honda Jazz V,Chennai,2011,46000,Petrol,Manual,First,18.2 kmpl,1199 CC,88.7 bhp,5.0,8.61 Lakh,4.5,18.2,1199,88.7,9,honda,jazz,46
1,Maruti Ertiga VDI,Chennai,2012,87000,Diesel,Manual,First,20.77 kmpl,1248 CC,88.76 bhp,7.0,,6.0,20.77,1248,88.76,8,maruti,ertiga,87
2,Nissan Micra Diesel XV,Jaipur,2013,86999,Diesel,Manual,First,23.08 kmpl,1461 CC,63.1 bhp,5.0,,3.5,23.08,1461,63.1,7,nissan,micra,86
3,Tata Indica Vista Quadrajet LS,Chennai,2012,65932,Diesel,Manual,Second,22.3 kmpl,1248 CC,74 bhp,5.0,,1.95,22.3,1248,74.0,8,tata,indica,65
4,Maruti Swift VDI BSIV,Jaipur,2015,64424,Diesel,Manual,First,25.2 kmpl,1248 CC,74 bhp,5.0,,5.6,25.2,1248,74.0,5,maruti,swift,64


In [5]:
# Predictor columns fed to the model (the target, 'Price', is added separately).
x_columns = ['KM_Driven', 'Fuel_Type', 'age',
              'Transmission', 'Owner_Type', 'Seats',
              'make', 'mileage_new', 'engine_new', 'model',
              'power_new', 'Location']
# NOTE: 'model' (the car model name) IS part of this list, so it is used as a feature.

In [6]:
# (rows, columns) of the frame as loaded from the CSV.
cars_df.shape

(3092, 20)

In [7]:
# Keep only the predictors plus the target, then drop every row that has a missing value.
model_columns = [*x_columns, 'Price']
cars_df = cars_df[model_columns].dropna()

In [8]:
# (rows, columns) after column selection and dropna().
cars_df.shape

(3091, 13)

## Identifying numerical and categorical features

In [9]:
# Categorical predictors; every other entry of x_columns is treated as numeric.
cat_features = [
    'Fuel_Type',
    'Transmission',
    'Owner_Type',
    'model',
    'make',
    'Location',
]

In [10]:
# Numeric predictors = every predictor that is not categorical.
# A comprehension over x_columns is used instead of a set difference because set
# iteration order for strings changes between kernel sessions, which would
# shuffle the numeric column order seen by the pipeline from run to run.
num_features = [col for col in x_columns if col not in cat_features]

## Split the dataset

In [11]:
# 80/20 train/test split; the fixed random_state makes the split repeatable.
features, target = cars_df[x_columns], cars_df.Price
x_train, x_test, y_train, y_test = train_test_split(
    features, target, train_size=0.8, random_state=100)

## Creating the pipeline for the deployment

http://onnx.ai/sklearn-onnx/auto_examples/plot_complex_pipeline.html#example-complex-pipeline

http://onnx.ai/sklearn-onnx/auto_examples/plot_pipeline_xgboost.html

In [12]:
# Numeric columns are standardised; categorical columns are one-hot encoded,
# with handle_unknown='ignore' set on the encoder.
numeric_transformer = Pipeline([('scaler', StandardScaler())])
categorical_transformer = Pipeline(
    [('onehot', OneHotEncoder(handle_unknown='ignore'))])

preprocessor = ColumnTransformer([
    ('num', numeric_transformer, num_features),
    ('cat', categorical_transformer, cat_features),
])

# Hyper-parameters are kept in a dict because the same dict is later passed
# to wandb.init as the run config.
params = dict(n_estimators=400,
              max_depth=4,
              objective='reg:squarederror')
xgb_regressor = XGBRegressor(**params)

# Full pipeline: preprocessing followed by the XGBoost regressor.
reg = Pipeline([('preprocessor', preprocessor),
                ('regressor', xgb_regressor)])
reg.fit(x_train, y_train)

# Root-mean-squared error on the held-out split.
test_predictions = reg.predict(x_test)
rmse = np.sqrt(mean_squared_error(y_test, test_predictions))

In [13]:
# Show the RMSE measured on the test split.
print(rmse)

0.6187140821394929


## Creating the ONNX Graph

In [14]:
def convert_dataframe_schema(df, drop=None):
    """Describe the columns of ``df`` as a list of ONNX input declarations.

    Every column becomes one ``(column_name, tensor_type)`` pair whose tensor
    shape is ``[None, 1]``.  Columns with any numeric dtype (not only
    ``int64``/``float64``) are declared as ``FloatTensorType``; every other
    column is declared as ``StringTensorType``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose column names and dtypes define the schema.
    drop : container of str, optional
        Column names to leave out of the schema.

    Returns
    -------
    list of (str, tensor type)
        One entry per kept column, in the column order of ``df``.
    """
    inputs = []
    for name, dtype in zip(df.columns, df.dtypes):
        if drop is not None and name in drop:
            continue
        # Integers are declared as floats as well, so all numeric inputs
        # share a single tensor type.
        if pd.api.types.is_numeric_dtype(dtype):
            tensor_type = FloatTensorType([None, 1])
        else:
            tensor_type = StringTensorType([None, 1])
        inputs.append((name, tensor_type))
    return inputs


# Build the ONNX input declarations from the training frame's columns.
inputs = convert_dataframe_schema(x_train)

# Pretty-print the (column name, tensor type) pairs for inspection.
pprint.pprint(inputs)

[('KM_Driven', FloatTensorType(shape=[None, 1])),
 ('Fuel_Type', StringTensorType(shape=[None, 1])),
 ('age', FloatTensorType(shape=[None, 1])),
 ('Transmission', StringTensorType(shape=[None, 1])),
 ('Owner_Type', StringTensorType(shape=[None, 1])),
 ('Seats', FloatTensorType(shape=[None, 1])),
 ('make', StringTensorType(shape=[None, 1])),
 ('mileage_new', FloatTensorType(shape=[None, 1])),
 ('engine_new', FloatTensorType(shape=[None, 1])),
 ('model', StringTensorType(shape=[None, 1])),
 ('power_new', FloatTensorType(shape=[None, 1])),
 ('Location', StringTensorType(shape=[None, 1]))]


- `update_registered_converter` registers (or updates) the converter for a model class — here `XGBRegressor` — so that the model can be converted when it is used as a step inside a scikit-learn pipeline.

In [15]:
# Register the onnxmltools XGBoost converter for XGBRegressor under the alias
# 'XGBoostXGBRegressor', using the linear-regressor output-shape calculator.
update_registered_converter(
    XGBRegressor, 'XGBoostXGBRegressor',
    calculate_linear_regressor_output_shapes, convert_xgboost)


## Converting the pipeline to ONNX format

In [16]:
# Convert the fitted pipeline into an ONNX model named 'pipeline_xgboost',
# using the per-column input declarations built earlier.
model_onnx = convert_sklearn(reg, name='pipeline_xgboost', initial_types=inputs)

In [17]:
# Optional manual cleanup, left disabled: removes the model directory.
# NOTE: MODEL_DIR is only defined in a later cell, so run that cell first
# before enabling the line below.
#!rm -r {MODEL_DIR}

## Storing the pipeline in a file

In [18]:
# Directory (relative to the working directory) that holds the exported model.
MODEL_DIR = "./usedcarsprod"
# File name of the serialised ONNX pipeline inside MODEL_DIR.
MODEL_FILE_NAME = "usedcar_xgboost.onnx"

In [19]:
# Create the output directory; exist_ok=True keeps the cell re-runnable
# (os.mkdir raises FileExistsError when the directory is already there).
os.makedirs(MODEL_DIR, exist_ok=True)

# Write the serialised ONNX model to disk in binary mode.
model_path = os.path.join(MODEL_DIR, MODEL_FILE_NAME)
with open(model_path, "wb") as f:
    f.write(model_onnx.SerializeToString())

In [20]:
# List the working directory to confirm that the model folder was created.
!ls -al

total 20
drwxr-xr-x 1 root root 4096 Oct  9 07:21 .
drwxr-xr-x 1 root root 4096 Oct  9 07:15 ..
drwxr-xr-x 4 root root 4096 Sep 30 17:11 .config
drwxr-xr-x 1 root root 4096 Sep 30 17:12 sample_data
drwxr-xr-x 2 root root 4096 Oct  9 07:21 usedcarsprod


### Uploading the pipeline to Weights & Biases for tracking

https://docs.wandb.ai/guides/artifacts/model-versioning#version-and-store-reliably

In [21]:
# SECURITY: never store an API key in the notebook itself.  A key that was ever
# saved here in plain text must be treated as leaked and revoked in the W&B
# account settings.
# The key is taken from the environment when it is already set; otherwise it is
# requested interactively without echoing it to the output.
from getpass import getpass

if not os.environ.get("WANDB_API_KEY"):
    os.environ["WANDB_API_KEY"] = getpass("W&B API key: ")

In [22]:
# Start the W&B run.  The run name is passed at creation time instead of being
# patched afterwards, and the hyper-parameter dict is stored as the run config.
# The encoding tag says 'OneHotEncoding' because the pipeline's categorical
# step is a OneHotEncoder; no target encoding is used in this notebook.
run = wandb.init(project='usedcarprod',
                 name="XGBProdModel",
                 config=params,
                 tags=['XGB', 'OneHotEncoding'])

[34m[1mwandb[0m: Currently logged in as: [33mmanaranjan[0m (use `wandb login --relogin` to force relogin)


In [23]:
# Record the test-set RMSE as a metric of the current run.
wandb.log( {"rmse" : rmse} )

In [24]:
# Create a 'model' artifact that will carry the exported ONNX files.
# NOTE(review): the name reads "UserCar"; "UsedCar" may have been intended.
# Left unchanged because the name is a runtime string — confirm before renaming.
model_artifact = wandb.Artifact("XGBoost_UserCar",
                                type = 'model',
                                description = 'XGBoost Model for used car price prediction')

In [25]:
# Attach the contents of the model directory to the artifact.
model_artifact.add_dir(MODEL_DIR)

[34m[1mwandb[0m: Adding directory to artifact (./usedcarsprod)... Done. 0.1s


In [26]:
# Log the artifact on the active run.
wandb.run.log_artifact(model_artifact)

<wandb.sdk.wandb_artifacts.Artifact at 0x7fe53d50ced0>

In [27]:
# Close the run.  The earlier argument-less wandb.save() call was dropped:
# without a file pattern it is a deprecated no-op.
wandb.finish()



VBox(children=(Label(value=' 0.28MB of 0.28MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
rmse,▁

0,1
rmse,0.61871


### References:

- https://docs.wandb.ai/guides/artifacts/model-versioning#version-and-store-reliably

In [None]:
# Scratch check (this cell has no execution count): ask skl2onnx for the
# latest ONNX opset version it reports as tested.
import skl2onnx 

skl2onnx.get_latest_tested_opset_version()

15