## Building and storing the file as pickle file

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from xgboost import XGBRegressor

In [2]:
import xgboost
xgboost.__version__

'0.90'

In [3]:
import sklearn
sklearn.__version__

'1.0.2'

## Loading the dataset: Used Car Price Prediction

In [4]:
cars_df = pd.read_csv( "https://drive.google.com/uc?export=download&id=1V_VBbyjGj6vvD0A90S5Lk0DG90djz28B" )

In [5]:
cars_df.head(5)

Unnamed: 0,Name,Location,Year,Kilometers_Driven,Fuel_Type,Transmission,Owner_Type,Mileage,Engine,Power,Seats,New_Price,Price,mileage_new,engine_new,power_new,age,make,model,KM_Driven
0,Honda Jazz V,Chennai,2011,46000,Petrol,Manual,First,18.2 kmpl,1199 CC,88.7 bhp,5.0,8.61 Lakh,4.5,18.2,1199,88.7,9,honda,jazz,46
1,Maruti Ertiga VDI,Chennai,2012,87000,Diesel,Manual,First,20.77 kmpl,1248 CC,88.76 bhp,7.0,,6.0,20.77,1248,88.76,8,maruti,ertiga,87
2,Nissan Micra Diesel XV,Jaipur,2013,86999,Diesel,Manual,First,23.08 kmpl,1461 CC,63.1 bhp,5.0,,3.5,23.08,1461,63.1,7,nissan,micra,86
3,Tata Indica Vista Quadrajet LS,Chennai,2012,65932,Diesel,Manual,Second,22.3 kmpl,1248 CC,74 bhp,5.0,,1.95,22.3,1248,74.0,8,tata,indica,65
4,Maruti Swift VDI BSIV,Jaipur,2015,64424,Diesel,Manual,First,25.2 kmpl,1248 CC,74 bhp,5.0,,5.6,25.2,1248,74.0,5,maruti,swift,64


In [6]:
cars_df_dev, cars_df_prod = train_test_split(cars_df,
                                             train_size = 0.8,
                                             random_state = 42)

In [7]:
cars_df_prod = cars_df_prod.reset_index()
cars_df_dev = cars_df_dev.reset_index()

In [8]:
x_columns = ['KM_Driven', 'Fuel_Type', 'age',
              'Transmission', 'Owner_Type', 'Seats',
              'make', 'mileage_new', 'engine_new', 
              'power_new', 'Location']
## model of the car is not included in the model

In [9]:
cars_df.shape

(3092, 20)

In [10]:
cars_df = cars_df[x_columns + ['Price']].dropna()

In [11]:
cars_df.shape

(3091, 12)

## Identifying numerical and categorical features

In [12]:
cat_features = ['Fuel_Type',
                'Transmission', 'Owner_Type', 
                'make', 'Location']

In [13]:
num_features = list(set(x_columns) - set(cat_features))

## Split the dataset

In [14]:
x_train, x_test, y_train, y_test = train_test_split(cars_df_dev[x_columns],
                                                    cars_df_dev.Price,
                                                    train_size = 0.8,
                                                    random_state = 100)

In [15]:
x_train.shape

(1978, 11)

## Creating the pipeline for the deployment

In [16]:
numeric_transformer = Pipeline(steps=[('scaler', StandardScaler())])

categorical_transformer = OneHotEncoder(handle_unknown='ignore')

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, num_features),                  
        ('cat', categorical_transformer, cat_features),
    ])

params = { "n_estimators": 400,
           "max_depth": 4,
           "objective": 'reg:squarederror' }

xgb_regressor = XGBRegressor(**params)

reg = Pipeline(steps=[('preprocessor', preprocessor),
                      ('regressor', xgb_regressor)])           

reg.fit(x_train, 
        y_train)

rmse = np.sqrt(mean_squared_error(y_test, 
                                  reg.predict(x_test)))

In [17]:
print(rmse)

0.6908830920584765


In [18]:
!pip install evidently



In [19]:
!pip install statsmodels --upgrade 



## Predicting on Production Set

In [20]:
cars_df_prod['prediction'] = reg.predict(cars_df_prod[x_columns])

### Retriving the dummy variables from the pipeline

In [21]:
new_cat_features = list(reg['preprocessor'].transformers_[1][1].get_feature_names())



In [22]:
new_cat_features

['x0_Diesel',
 'x0_Petrol',
 'x1_Automatic',
 'x1_Manual',
 'x2_First',
 'x2_Fourth & Above',
 'x2_Second',
 'x2_Third',
 'x3_chevrolet',
 'x3_datsun',
 'x3_fiat',
 'x3_ford',
 'x3_honda',
 'x3_hyundai',
 'x3_mahindra',
 'x3_maruti',
 'x3_mitsubishi',
 'x3_nissan',
 'x3_renault',
 'x3_skoda',
 'x3_tata',
 'x3_toyota',
 'x3_volkswagen',
 'x4_Ahmedabad',
 'x4_Bangalore',
 'x4_Chennai',
 'x4_Coimbatore',
 'x4_Delhi',
 'x4_Hyderabad',
 'x4_Jaipur',
 'x4_Kochi',
 'x4_Kolkata',
 'x4_Mumbai',
 'x4_Pune']

In [23]:
len(num_features + new_cat_features)

40

## Prepare Production Data

We need to use unscaed numerical data and new categorical (dummy) variables

In [24]:
cars_df_prod_x_cols = reg.named_steps['preprocessor'].transform(cars_df_prod[x_columns])
cars_df_prod_x_cols_df = pd.DataFrame( cars_df_prod_x_cols.toarray(), columns  = num_features + new_cat_features)
cars_df_prod_x_cols_df_final = pd.concat([cars_df_prod[num_features], cars_df_prod_x_cols_df[new_cat_features]], axis = 1)
cars_df_prod_x_cols_df_final['prediction'] = reg.predict(cars_df_prod[x_columns])
cars_df_prod_x_cols_df_final

Unnamed: 0,Seats,power_new,engine_new,KM_Driven,mileage_new,age,x0_Diesel,x0_Petrol,x1_Automatic,x1_Manual,...,x4_Chennai,x4_Coimbatore,x4_Delhi,x4_Hyderabad,x4_Jaipur,x4_Kochi,x4_Kolkata,x4_Mumbai,x4_Pune,prediction
0,5.0,98.6,1396,55,15.00,8,0.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,3.563105
1,5.0,55.2,814,38,21.10,5,0.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,2.665395
2,5.0,86.8,1198,22,18.50,5,0.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,4.280173
3,5.0,55.2,814,45,21.10,8,0.0,1.0,0.0,1.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.618295
4,5.0,88.7,1199,3,19.00,2,0.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,8.667166
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
614,5.0,74.0,1198,22,16.47,6,0.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,3.447949
615,5.0,74.0,1248,88,22.90,7,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,4.497047
616,5.0,78.9,1197,59,20.36,8,0.0,1.0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,2.545256
617,5.0,78.9,1197,70,20.36,10,0.0,1.0,0.0,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,2.934929


## Preparing the training data

In [25]:
x_train_new = x_train.reset_index()
x_train_x_cols = reg.named_steps['preprocessor'].transform(x_train_new)
x_train_x_cols_df = pd.DataFrame( x_train_x_cols.toarray(), columns  = num_features + new_cat_features)
x_train_x_cols_df_final = pd.concat([x_train_new[num_features], x_train_x_cols_df[new_cat_features]], axis = 1)
x_train_x_cols_df_final['prediction'] = reg.predict(x_train[x_columns])
x_train_x_cols_df_final['Price'] = np.array(y_train)
x_train_x_cols_df_final

Unnamed: 0,Seats,power_new,engine_new,KM_Driven,mileage_new,age,x0_Diesel,x0_Petrol,x1_Automatic,x1_Manual,...,x4_Coimbatore,x4_Delhi,x4_Hyderabad,x4_Jaipur,x4_Kochi,x4_Kolkata,x4_Mumbai,x4_Pune,prediction,Price
0,5.0,74.00,1248,25,28.40,3,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,6.427374,5.99
1,5.0,80.00,1197,58,17.00,8,0.0,1.0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,4.149638,4.00
2,5.0,67.10,998,73,24.07,5,0.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,3.177943,3.30
3,5.0,86.80,1196,21,18.16,3,0.0,1.0,0.0,1.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.403126,6.42
4,5.0,83.80,1461,55,19.87,6,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.100860,4.75
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1973,5.0,74.00,1248,37,26.59,5,1.0,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,5.084397,4.85
1974,5.0,67.04,1364,179,23.59,7,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.958296,3.80
1975,5.0,67.00,1172,72,15.70,10,0.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.264123,2.25
1976,5.0,90.00,1396,77,23.00,4,1.0,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,7.162492,6.75


# Creating Data Drift Report

In [26]:
from evidently.dashboard import Dashboard
from evidently.tabs import DataDriftTab, NumTargetDriftTab

from evidently.model_profile import Profile
from evidently.profile_sections import DataDriftProfileSection

  defaults = yaml.load(f)


In [27]:
from evidently.pipeline.column_mapping import ColumnMapping

column_mapping = ColumnMapping()

column_mapping.target = 'Price' #'y' is the name of the column with the target function
column_mapping.prediction = 'prediction' #'pred' is the name of the column(s) with model predictions
column_mapping.id = None #there is no ID column in the dataset
column_mapping.datetime = None #'date' is the name of the column with datetime 

column_mapping.numerical_features = num_features #list of numerical features
column_mapping.categorical_features = new_cat_features #list of categorical features

In [28]:
column_mapping

ColumnMapping(target='Price', prediction='prediction', datetime=None, id=None, numerical_features=['Seats', 'power_new', 'engine_new', 'KM_Driven', 'mileage_new', 'age'], categorical_features=['x0_Diesel', 'x0_Petrol', 'x1_Automatic', 'x1_Manual', 'x2_First', 'x2_Fourth & Above', 'x2_Second', 'x2_Third', 'x3_chevrolet', 'x3_datsun', 'x3_fiat', 'x3_ford', 'x3_honda', 'x3_hyundai', 'x3_mahindra', 'x3_maruti', 'x3_mitsubishi', 'x3_nissan', 'x3_renault', 'x3_skoda', 'x3_tata', 'x3_toyota', 'x3_volkswagen', 'x4_Ahmedabad', 'x4_Bangalore', 'x4_Chennai', 'x4_Coimbatore', 'x4_Delhi', 'x4_Hyderabad', 'x4_Jaipur', 'x4_Kochi', 'x4_Kolkata', 'x4_Mumbai', 'x4_Pune'], datetime_features=None, target_names=None, task=None)

In [30]:
cars_data_drift_dashboard = Dashboard(tabs=[DataDriftTab()])
cars_data_drift_dashboard.calculate(x_train_x_cols_df_final, 
                                    cars_df_prod_x_cols_df_final, 
                                    column_mapping = column_mapping)

In [31]:
cars_data_drift_dashboard.save("newreports/cars_datadriftV1.html")

In [32]:
cars_data_drift_profile = Profile(sections=[DataDriftProfileSection()])
cars_data_drift_profile.calculate(x_train_x_cols_df_final, 
                                  cars_df_prod_x_cols_df_final, 
                                  column_mapping = column_mapping)
cars_data_drift_profile.json()

'{"data_drift": {"name": "data_drift", "datetime": "2022-03-11 17:01:43.897289", "data": {"utility_columns": {"date": null, "id": null, "target": "Price", "prediction": "prediction"}, "cat_feature_names": ["x0_Diesel", "x0_Petrol", "x1_Automatic", "x1_Manual", "x2_First", "x2_Fourth & Above", "x2_Second", "x2_Third", "x3_chevrolet", "x3_datsun", "x3_fiat", "x3_ford", "x3_honda", "x3_hyundai", "x3_mahindra", "x3_maruti", "x3_mitsubishi", "x3_nissan", "x3_renault", "x3_skoda", "x3_tata", "x3_toyota", "x3_volkswagen", "x4_Ahmedabad", "x4_Bangalore", "x4_Chennai", "x4_Coimbatore", "x4_Delhi", "x4_Hyderabad", "x4_Jaipur", "x4_Kochi", "x4_Kolkata", "x4_Mumbai", "x4_Pune"], "num_feature_names": ["Seats", "power_new", "engine_new", "KM_Driven", "mileage_new", "age"], "datetime_feature_names": [], "target_names": null, "options": {"confidence": 0.95, "drift_share": 0.5, "nbinsx": 10, "xbins": null}, "metrics": {"n_features": 40, "n_drifted_features": 1, "share_drifted_features": 0.025, "dataset

## Generating Regression Performance Report

This report is generated after the actual ground truth is obtained i.e. after the actual sales of the cars in production environment.

In [37]:
cars_df_prod_x_cols_df_final['Price'] = cars_df_prod['Price']

In [38]:
from evidently.tabs import RegressionPerformanceTab

In [39]:
dashboard = Dashboard(tabs=[RegressionPerformanceTab()])

In [40]:
dashboard.calculate(x_train_x_cols_df_final, 
                    cars_df_prod_x_cols_df_final, 
                    column_mapping=column_mapping)

In [41]:
dashboard.save("newreports/cars_dataPerormanceV1.html")