### Install alibi_detect library

In [None]:
import numpy as np
# Show the NumPy version available in this kernel.
# NOTE(review): presumably a compatibility check ahead of installing alibi — confirm.
np.__version__

In [1]:
!pip install alibi alibi_detect

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [3]:
import alibi
from alibi_detect.cd import ChiSquareDrift, TabularDrift
from alibi_detect.saving import save_detector, load_detector

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sn

In [4]:
# Load the used-cars dataset directly from its Google Drive download link.
cars_df = pd.read_csv(
    "https://drive.google.com/uc?export=download&id=10ABViLN4Q7vgIlLvepCduU4B3C6BneJR"
)

In [5]:
# Summarise the frame: column names, non-null counts and dtypes.
cars_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1038 entries, 0 to 1037
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Location      1038 non-null   object 
 1   Fuel_Type     1038 non-null   object 
 2   Transmission  1038 non-null   object 
 3   Owner_Type    1038 non-null   object 
 4   Seats         1037 non-null   float64
 5   Price         1038 non-null   float64
 6   age           1038 non-null   int64  
 7   KM_Driven     1038 non-null   int64  
 8   make          1038 non-null   object 
 9   mileage       1038 non-null   float64
 10  engine        1038 non-null   int64  
 11  power         1038 non-null   float64
dtypes: float64(4), int64(3), object(5)
memory usage: 97.4+ KB


In [7]:
# Column names in frame order; positions in this list are used later as
# feature indices.
x_features = cars_df.columns.tolist()

In [8]:
# Display the feature names so their list positions can be read off.
x_features

['Location',
 'Fuel_Type',
 'Transmission',
 'Owner_Type',
 'Seats',
 'Price',
 'age',
 'KM_Driven',
 'make',
 'mileage',
 'engine',
 'power']

#### Specify the indices of the columns that are categorical features

In [9]:
# Positions within x_features of the categorical (object-dtype) columns:
# 0 = Location, 1 = Fuel_Type, 2 = Transmission, 3 = Owner_Type, 8 = make.
# These must be kept in sync with the column order of cars_df.
cat_vars = [0, 1, 2, 3, 8]

In [10]:
# X holds every column listed in x_features; y is the Price column.
# NOTE(review): x_features contains 'Price', so the target is also part of X
# and gets monitored for drift alongside the inputs — confirm this is intended.
X = cars_df[x_features]
y = cars_df["Price"]

### Split the dataset into two sets

**Note**: In this example, the data is split to create train and production datasets. This is done only for the lab session. In the real world, the production data will come from the inference system.

In [11]:
from sklearn.model_selection import train_test_split

In [12]:
# Keep 90% of the rows as the training (reference) set and treat the remaining
# rows as simulated production data; the fixed random_state makes the split
# repeatable.
X_train, X_prod, y_train, y_prod = train_test_split(
    X, y, train_size=0.9, random_state=23
)

In [13]:
# Map each categorical column index to None.
# NOTE(review): per the alibi-detect docs, a None value asks the detector to
# infer the categories from the reference data — confirm for the installed version.
categories_per_feature = dict.fromkeys(cat_vars)

In [14]:
# Display the mapping passed to the drift detector.
categories_per_feature

{0: None, 1: None, 2: None, 3: None, 8: None}

### Measure the drift

In [15]:
# Build the drift detector with the training rows as the reference data.
# Columns listed in categories_per_feature are treated as categorical; the
# rest as numerical. p_val is the significance threshold handed to the detector.
cd = TabularDrift(
    X_train.to_numpy(),
    p_val=0.05,
    categories_per_feature=categories_per_feature,
)

In [16]:
# Directory the detector is written to; change it to suit your environment.
filepath = 'carsdrift'
save_detector(cd, filepath, legacy=True)

In [17]:
# Reload the detector from disk, replacing the in-memory instance, so the
# following cells run against the saved artefact.
cd = load_detector(filepath)

In [18]:
# Run the drift tests on the production rows. The result is a dict whose
# 'data' entry holds per-feature 'distance' and 'p_val' values (read below).
preds = cd.predict(X_prod.to_numpy())

### Printing the test results

- KS test for the numerical features
- chi-squared test for the categorical features

In [19]:
# Print one line per monitored column: which test was applied, its statistic
# and its p-value. Columns whose index is a key of categories_per_feature are
# labelled Chi2; every other column is labelled K-S.
for f in range(cd.n_features):
    # Dict membership checks the keys directly, so there is no need to build
    # a list of keys on every iteration.
    stat = 'Chi2' if f in categories_per_feature else 'K-S'
    fname = x_features[f]
    stat_val, p_val = preds['data']['distance'][f], preds['data']['p_val'][f]
    print(f'{fname} -- {stat} {stat_val:.3f} -- p-value {p_val:.3f}')

Location -- Chi2 8.221 -- p-value 0.607
Fuel_Type -- Chi2 4.102 -- p-value 0.043
Transmission -- Chi2 0.639 -- p-value 0.424
Owner_Type -- Chi2 11.013 -- p-value 0.012
Seats -- K-S 0.018 -- p-value 1.000
Price -- K-S 0.084 -- p-value 0.495
age -- K-S 0.058 -- p-value 0.894
KM_Driven -- K-S 0.131 -- p-value 0.072
make -- Chi2 13.928 -- p-value 0.455
mileage -- K-S 0.114 -- p-value 0.158
engine -- K-S 0.167 -- p-value 0.009
power -- K-S 0.150 -- p-value 0.026


### Checking the distribution of Owner_Type in training and production data

In [20]:
# Frequency of each Owner_Type category in the training (reference) rows.
X_train["Owner_Type"].value_counts()

First     783
Second    127
Third      24
Name: Owner_Type, dtype: int64

In [21]:
# Frequency of each Owner_Type category in the production rows, for comparison
# with the training counts.
X_prod["Owner_Type"].value_counts()

First             84
Second            18
Fourth & Above     1
Third              1
Name: Owner_Type, dtype: int64