## Building the model and storing it as a pickle file

In [62]:
# Not required now
#!pip install -U scikit-learn==0.24

Requirement already up-to-date: scikit-learn==0.24 in /anaconda/envs/azureml_py36/lib/python3.6/site-packages (0.24.0)


In [63]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import GradientBoostingRegressor, AdaBoostRegressor
import pprint
import os

In [64]:
# Display the installed xgboost version.
# NOTE(review): no cell shown in this section uses xgboost -- the regressor
# built further down is scikit-learn's GradientBoostingRegressor. Confirm
# whether this import is still needed.
import xgboost
xgboost.__version__

'0.90'

In [65]:
# Display the scikit-learn version; the same value is passed as
# model_framework_version when the model is registered at the end.
import sklearn
sklearn.__version__

'0.24.0'

## Loading the dataset: Used Car Price Prediction

In [66]:
import os

from azureml.core import Workspace, Dataset

# Workspace coordinates. Each value can be overridden through an environment
# variable so the notebook can target a different workspace without editing
# code; the defaults are the values this notebook was originally written for.
subscription_id = os.environ.get('AML_SUBSCRIPTION_ID',
                                 'fbb085eb-3abd-4384-93aa-608f78277ad2')
resource_group = os.environ.get('AML_RESOURCE_GROUP', 'MLOpsDemo')
workspace_name = os.environ.get('AML_WORKSPACE_NAME', 'MLOPS2021')

workspace = Workspace(subscription_id, resource_group, workspace_name)

# Fetch the dataset registered under the name 'usedcar_dataset' and load it
# into a pandas DataFrame.
dataset = Dataset.get_by_name(workspace, name='usedcar_dataset')
cars_df = dataset.to_pandas_dataframe()

In [67]:
# Preview the first five rows (five is the default for head()).
cars_df.head()

Unnamed: 0,Location,Fuel_Type,Transmission,Owner_Type,Seats,Price,mileage_new,engine_new,power_new,age,make,model,KM_Driven
0,Chennai,Petrol,Manual,First,5.0,4.5,18.2,1199,88.7,9,honda,jazz,46
1,Chennai,Diesel,Manual,First,7.0,6.0,20.77,1248,88.76,8,maruti,ertiga,87
2,Jaipur,Diesel,Manual,First,5.0,3.5,23.08,1461,63.1,7,nissan,micra,86
3,Chennai,Diesel,Manual,Second,5.0,1.95,22.3,1248,74.0,8,tata,indica,65
4,Jaipur,Diesel,Manual,First,5.0,5.6,25.2,1248,74.0,5,maruti,swift,64


In [68]:
# Input features used to predict Price. Note that 'model' (the car model name)
# IS part of this list; it is handled as a categorical feature further down.
x_columns = [
    'KM_Driven',
    'Fuel_Type',
    'age',
    'Transmission',
    'Owner_Type',
    'Seats',
    'make',
    'mileage_new',
    'engine_new',
    'model',
    'power_new',
    'Location',
]

In [69]:
# (rows, columns) of the dataset as loaded, before any rows are dropped.
cars_df.shape

(3092, 13)

In [70]:
# Keep only the input features plus the target column, then discard every row
# that has a missing value in any of them.
cars_df = cars_df[[*x_columns, 'Price']].dropna()

In [71]:
# (rows, columns) after selecting the modelling columns and dropping rows with
# missing values.
cars_df.shape

(3091, 13)

## Identifying numerical and categorical features

In [72]:
# Columns that are one-hot encoded by the preprocessing pipeline.
cat_features = [
    'Fuel_Type',
    'Transmission',
    'Owner_Type',
    'model',
    'make',
    'Location',
]

In [73]:
# Numeric features are every input column that is not categorical.
# A comprehension is used instead of a set difference because set iteration
# order for strings changes between Python processes (hash randomisation);
# keeping the order of x_columns makes the feature order fed to the pipeline
# identical on every run.
num_features = [col for col in x_columns if col not in cat_features]

## Split the dataset

In [74]:
# Use 80% of the rows for training and hold out the rest for evaluation; the
# fixed random_state keeps the split identical across runs.
x_train, x_test, y_train, y_test = train_test_split(
    cars_df[x_columns],
    cars_df['Price'],
    train_size=0.8,
    random_state=100,
)

## Creating the pipeline for the deployment

In [79]:
# Preprocessing: standardise the numeric columns and one-hot encode the
# categorical ones. handle_unknown='ignore' lets the encoder accept category
# values at prediction time that were not present in the training data.
numeric_transformer = Pipeline(steps=[('scaler', StandardScaler())])

categorical_transformer = Pipeline(steps=[('onehot',
                                           OneHotEncoder(handle_unknown='ignore'))])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, num_features),
        ('cat', categorical_transformer, cat_features),
    ])

# random_state is set so that re-running the notebook produces the same fitted
# model (the train/test split above is already seeded with the same value).
# Without it the regressor is seeded differently on every run.
params = { "n_estimators": 400,
           "max_depth": 4,
           "random_state": 100 }

# NOTE: despite its name, this is scikit-learn's GradientBoostingRegressor,
# not an XGBoost model. The variable name is kept unchanged for compatibility.
xgb_regressor = GradientBoostingRegressor(**params)

# Full deployable pipeline: preprocessing followed by the regressor.
reg = Pipeline(steps=[('preprocessor', preprocessor),
                      ('regressor', xgb_regressor)])

reg.fit(x_train,
        y_train)

# Root-mean-squared error on the held-out test set.
rmse = np.sqrt(mean_squared_error(y_test,
                                  reg.predict(x_test)))

In [80]:
# Show the first row of the test features (positional selection) as an example
# of the input the fitted pipeline expects.
x_test.iloc[:1]

Unnamed: 0,KM_Driven,Fuel_Type,age,Transmission,Owner_Type,Seats,make,mileage_new,engine_new,model,power_new,Location
1588,75,Diesel,9,Manual,Second,5.0,maruti,21.1,1248,ritz,73.9,Mumbai


In [81]:
# Report the RMSE computed on the held-out test set (x_test / y_test).
print(rmse)

0.6454254301417076


## Creating the Pickle File and Storing It

In [88]:
!rm carmodel.pkl

In [89]:
from joblib import dump

In [91]:
# Serialise the fitted pipeline (preprocessor + regressor) to carmodel.pkl
# using joblib.
dump(reg, "carmodel.pkl")

['carmodel.pkl']

In [92]:
# List the working directory to confirm that carmodel.pkl was written.
!ls -al 

total 1250
drwxrwxrwx 2 root root       0 Nov 12 05:30  .
drwxrwxrwx 2 root root       0 Nov 12 05:30  ..
-rwxrwxrwx 1 root root     315 Nov 12 06:57  .amlignore
-rwxrwxrwx 1 root root     315 Nov 12 06:57  .amlignore.amltmp
drwxrwxrwx 2 root root       0 Nov 12 06:59  .config
drwxrwxrwx 2 root root       0 Nov 12 06:57  .ipynb_aml_checkpoints
-rwxrwxrwx 1 root root   42114 Nov 13 06:58 'Azure Model Used Car Sklearn.ipynb'
-rwxrwxrwx 1 root root   18800 Nov 12 09:39 'Azure Model Used Car.ipynb'
-rwxrwxrwx 1 root root   43808 Nov 12 11:02 'Debug Notebook.ipynb'
-rwxrwxrwx 1 root root    7669 Nov 13 06:54 'Usedcar Model Deploy.ipynb'
-rwxrwxrwx 1 root root   42114 Nov 13 06:58 'azure model used car sklearn.ipynb.amltmp'
-rwxrwxrwx 1 root root   18800 Nov 12 09:39 'azure model used car.ipynb.amltmp'
-rwxrwxrwx 1 root root 1050292 Nov 13 07:00  carmodel.pkl
-rwxrwxrwx 1 root root   43808 Nov 12 11:02 'debug notebook.ipynb.amltmp'
drwxrwxrwx 2 root root       0 Nov 12 09:07  

In [93]:
import sklearn

from azureml.core import Model
from azureml.core.workspace import Workspace
from azureml.core.resource_configuration import ResourceConfiguration

ws = Workspace.from_config()

# Upload the pickled pipeline and register it in the workspace.
# The description names the algorithm actually used: the pipeline wraps
# scikit-learn's GradientBoostingRegressor (not XGBoost), which also matches
# the SCIKITLEARN framework declared below.
model = Model.register(workspace=ws,
                       model_name='usedcarprice_gbm',                # Name of the registered model in your workspace.
                       model_path='carmodel.pkl',  # Local file to upload and register as a model.
                       model_framework=Model.Framework.SCIKITLEARN,  # Framework used to create the model.
                       model_framework_version=sklearn.__version__,  # Version of scikit-learn used to create the model.
                       description='Gradient Boosting Regression model to predict the price of a used car.',
                       tags={'area': 'usedcar', 'type': 'regression'})

Registering model usedcarprice_gbm
