## Building the model and storing it as a pickle file

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import GradientBoostingRegressor, AdaBoostRegressor
import pprint
import os

In [2]:
# Import xgboost and show its installed version as the cell output.
import xgboost
xgboost.__version__

'0.90'

In [3]:
# Import scikit-learn and show its installed version as the cell output.
import sklearn
sklearn.__version__

'0.22.2.post1'

## Loading the dataset: Used Car Price Prediction

In [4]:
from azureml.core import Workspace, Dataset

# Azure ML workspace coordinates. Each value can be overridden through an
# environment variable so the notebook can target a different workspace
# without editing account identifiers in the source; the literals are only
# the defaults used when the variable is unset or empty.
# (`os` is imported in the notebook's first cell.)
subscription_id = (os.environ.get('AML_SUBSCRIPTION_ID')
                   or 'fbb085eb-3abd-4384-93aa-608f78277ad2')
resource_group = os.environ.get('AML_RESOURCE_GROUP') or 'MLOpsDemo'
workspace_name = os.environ.get('AML_WORKSPACE_NAME') or 'MLOPS2021'

workspace = Workspace(subscription_id, resource_group, workspace_name)

# Look up the registered dataset by name and load it into a pandas DataFrame.
dataset = Dataset.get_by_name(workspace, name='usedcar_dataset')
cars_df = dataset.to_pandas_dataframe()

In [5]:
# Preview the first five rows of the loaded frame.
cars_df.head(n=5)

Unnamed: 0,Location,Fuel_Type,Transmission,Owner_Type,Seats,Price,mileage_new,engine_new,power_new,age,make,model,KM_Driven
0,Chennai,Petrol,Manual,First,5.0,4.5,18.2,1199,88.7,9,honda,jazz,46
1,Chennai,Diesel,Manual,First,7.0,6.0,20.77,1248,88.76,8,maruti,ertiga,87
2,Jaipur,Diesel,Manual,First,5.0,3.5,23.08,1461,63.1,7,nissan,micra,86
3,Chennai,Diesel,Manual,Second,5.0,1.95,22.3,1248,74.0,8,tata,indica,65
4,Jaipur,Diesel,Manual,First,5.0,5.6,25.2,1248,74.0,5,maruti,swift,64


In [6]:
# Feature columns used as model inputs, in selection order.
# The car's 'model' column is included here (and in cat_features),
# so it is used as a feature.
x_columns = [
    'KM_Driven',
    'Fuel_Type',
    'age',
    'Transmission',
    'Owner_Type',
    'Seats',
    'make',
    'mileage_new',
    'engine_new',
    'model',
    'power_new',
    'Location',
]

In [7]:
# (rows, columns) of the frame as loaded, before any rows are dropped.
cars_df.shape

(3092, 13)

In [8]:
# Keep only the feature columns plus the 'Price' target, then discard every
# row that has a missing value in any of those columns.
cars_df = cars_df.loc[:, [*x_columns, 'Price']].dropna(axis=0, how='any')

In [9]:
# (rows, columns) after column selection and dropping incomplete rows.
cars_df.shape

(3091, 13)

## Identifying numerical and categorical features

In [10]:
# Columns treated as categorical features.
cat_features = [
    'Fuel_Type',
    'Transmission',
    'Owner_Type',
    'model',
    'make',
    'Location',
]

In [11]:
# Numerical features are the feature columns that are not categorical.
# Filtering x_columns directly (instead of taking a set difference) keeps the
# original column order, so the list is identical on every kernel restart;
# the order of a set of strings can change between Python processes.
num_features = [col for col in x_columns if col not in cat_features]

## Splitting the dataset

In [12]:
# train_test_split receives a single frame here, so it returns two row subsets
# of that frame: the 80% training rows and the remaining held-out rows.
# Both subsets still contain the feature columns and the 'Price' target.
train_df, test_df = train_test_split(cars_df[x_columns + ['Price']],
                                     train_size=0.8,
                                     random_state=100)

# Backward-compatible aliases for the names used elsewhere in the notebook.
# Despite its name, `y_train` is the held-out split, not a label vector.
x_train, y_train = train_df, test_df

In [13]:
import logging

# Keyword settings that are unpacked into AutoMLConfig when it is constructed.
automl_settings = dict(
    iteration_timeout_minutes=10,
    experiment_timeout_hours=0.3,
    enable_early_stopping=True,
    primary_metric='r2_score',
    featurization='auto',
    verbosity=logging.INFO,
    n_cross_validations=5,
)

## Creating the pipeline for the deployment

In [14]:
from azureml.train.automl import AutoMLConfig

# Regression AutoML configuration. `x_train` is the training split and still
# contains the 'Price' column, which is named here as the label; the shared
# settings dict is unpacked as additional keyword arguments.
automl_config = AutoMLConfig(
    task='regression',
    debug_log='automated_ml_errors.log',
    training_data=x_train,
    label_column_name="Price",
    **automl_settings,
)

In [15]:
from azureml.core.experiment import Experiment

# Create (or attach to) the named experiment in the workspace, then submit the
# AutoML configuration and stream its progress into the cell output.
experiment = Experiment(workspace=workspace,
                        name="AutoML_UsedCarPricePrediction")
local_run = experiment.submit(automl_config, show_output=True)

No run_configuration provided, running on local with default configuration
Running in the active local environment.


Experiment,Id,Type,Status,Details Page,Docs Page
AutoML_UsedCarPricePrediction,AutoML_7afd3256-3672-462b-a844-788c02150c9e,automl,Preparing,Link to Azure Machine Learning studio,Link to Documentation


Current status: DatasetEvaluation. Gathering dataset statistics.
Current status: FeaturesGeneration. Generating features for the dataset.
Current status: DatasetFeaturization. Beginning to fit featurizers and featurize the dataset.
Current status: DatasetFeaturizationCompleted. Completed fit featurizers and featurizing the dataset.
Current status: DatasetCrossValidationSplit. Generating individually featurized CV splits.

****************************************************************************************************
DATA GUARDRAILS: 

TYPE:         Missing feature values imputation
STATUS:       PASSED
DESCRIPTION:  No feature missing values were detected in the training data.
              Learn more about missing value imputation: https://aka.ms/AutomatedMLFeaturization

****************************************************************************************************

TYPE:         High cardinality feature detection
STATUS:       PASSED
DESCRIPTION:  Your inputs were analyzed

In [16]:
from azureml.widgets import RunDetails

# Display the Azure ML RunDetails widget for the submitted run.
run_details_widget = RunDetails(local_run)
run_details_widget.show()

_AutoMLWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', 's…

_UserRunWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', '…

INFO:interpret_community.common.explanation_utils:Using default datastore for uploads
INFO:interpret_community.common.explanation_utils:Using default datastore for uploads
INFO:interpret_community.common.explanation_utils:Using default datastore for uploads
INFO:interpret_community.common.explanation_utils:Using default datastore for uploads
INFO:interpret_community.common.explanation_utils:Using default datastore for uploads
INFO:interpret_community.common.explanation_utils:Using default datastore for uploads
INFO:interpret_community.common.explanation_utils:Using default datastore for uploads
INFO:interpret_community.common.explanation_utils:Using default datastore for uploads
INFO:interpret_community.common.explanation_utils:Using default datastore for uploads
INFO:interpret_community.common.explanation_utils:Using default datastore for uploads
INFO:interpret_community.common.explanation_utils:Using default datastore for uploads
INFO:interpret_community.common.explanation_utils:Usin

In [17]:
# Unpack the (run, model) pair returned by the AutoML run and print each
# object on its own line.
best_run, fitted_model = local_run.get_output()
print(best_run, fitted_model, sep='\n')

Run(Experiment: AutoML_UsedCarPricePrediction,
Id: AutoML_7afd3256-3672-462b-a844-788c02150c9e_31,
Type: None,
Status: Completed)
RegressionPipeline(pipeline=Pipeline(memory=None,
                                     steps=[('datatransformer',
                                             DataTransformer(enable_dnn=False, enable_feature_sweeping=True, feature_sweeping_config={}, feature_sweeping_timeout=86400, featurization_config=None, force_text_dnn=False, is_cross_validation=True, is_onnx_compatible=False, observer=None, task='regression', working_dir='/mnt/batch/ta...
), random_state=0, reg_alpha=0.7291666666666667, reg_lambda=1.3541666666666667, subsample=0.5, tree_method='auto'))], verbose=False))], meta_learner=ElasticNetCV(alphas=None, copy_X=True, cv=None, eps=0.001, fit_intercept=True, l1_ratio=0.5, max_iter=1000, n_alphas=100, n_jobs=None, normalize=False, positive=False, precompute='auto', random_state=None, selection='cyclic', tol=0.0001, verbose=0), training_cv_folds=5))],