# Train Test Feature Drift

In [1]:
import numpy as np
import pandas as pd

np.random.seed(42)

train_data = np.concatenate([np.random.randn(1000,2), np.random.choice(a=['apple', 'orange', 'banana'], p=[0.5, 0.3, 0.2], size=(1000, 2))], axis=1)
test_data = np.concatenate([np.random.randn(1000,2), np.random.choice(a=['apple', 'orange', 'banana'], p=[0.5, 0.3, 0.2], size=(1000, 2))], axis=1)

df_train = pd.DataFrame(train_data, columns=['numeric_without_drift', 'numeric_with_drift', 'categorical_without_drift', 'categorical_with_drift'])
df_test = pd.DataFrame(test_data, columns=df_train.columns)

df_train = df_train.astype({'numeric_without_drift': 'float', 'numeric_with_drift': 'float'})
df_test = df_test.astype({'numeric_without_drift': 'float', 'numeric_with_drift': 'float'})

In [2]:
df_train

Unnamed: 0,numeric_without_drift,numeric_with_drift,categorical_without_drift,categorical_with_drift
0,0.496714,-0.138264,apple,apple
1,0.647689,1.523030,apple,apple
2,-0.234153,-0.234137,banana,banana
3,1.579213,0.767435,apple,banana
4,-0.469474,0.542560,orange,apple
...,...,...,...,...
995,0.800410,0.754291,apple,apple
996,1.188913,0.708304,apple,orange
997,0.351448,1.070150,apple,apple
998,-0.026521,-0.881875,apple,banana


In [3]:
# Now, we insert a synthetic drift into 2 columns in the dataset
df_test['numeric_with_drift'] = df_test['numeric_with_drift'].astype('float') + abs(np.random.randn(1000)) + np.arange(0, 1, 0.001) * 4
df_test['categorical_with_drift'] = np.random.choice(a=['apple', 'orange', 'banana', 'lemon'], p=[0.5, 0.25, 0.15, 0.1], size=(1000, 1))

In [4]:
# Train a model
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OrdinalEncoder
from sklearn.tree import DecisionTreeClassifier

from deepchecks.tabular import Dataset

In [5]:
model = Pipeline([
    ('handle_cat', ColumnTransformer(
        transformers=[
            ('num', 'passthrough',
             ['numeric_with_drift', 'numeric_without_drift']),
            ('cat',
             Pipeline([
                 ('encode', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)),
             ]),
             ['categorical_with_drift', 'categorical_without_drift'])
        ]
    )),
    ('model', DecisionTreeClassifier(random_state=0, max_depth=2))]
)

In [6]:
label = np.random.randint(0, 2, size=(df_train.shape[0],))
cat_features = ['categorical_without_drift', 'categorical_with_drift']
df_train['target'] = label
train_dataset = Dataset(df_train, label='target', cat_features=cat_features)

model.fit(train_dataset.data[train_dataset.features], label)

label = np.random.randint(0, 2, size=(df_test.shape[0],))
df_test['target'] = label
test_dataset = Dataset(df_test, label='target', cat_features=cat_features)

In [7]:
# Run the check
from deepchecks.tabular.checks import TrainTestFeatureDrift

check = TrainTestFeatureDrift()
result = check.run(train_dataset=train_dataset, test_dataset=test_dataset, model=model)
result.show()

deepchecks - INFO - Calculating permutation feature importance. Expected to finish in 2 seconds


VBox(children=(HTML(value='<h4><b>Train Test Feature Drift</b></h4>'), HTML(value='<p>    Calculate drift betw…

In [8]:
result.value

OrderedDict([('numeric_without_drift',
              {'Drift score': 0.019594833552359095,
               'Method': "Earth Mover's Distance",
               'Importance': 0.6911764705882353}),
             ('numeric_with_drift',
              {'Drift score': 0.3430867349314306,
               'Method': "Earth Mover's Distance",
               'Importance': 0.3088235294117647}),
             ('categorical_without_drift',
              {'Drift score': 0.005136700975462043,
               'Method': "Cramer's V",
               'Importance': 0.0}),
             ('categorical_with_drift',
              {'Drift score': 0.22862322289807285,
               'Method': "Cramer's V",
               'Importance': 0.0})])

In [None]:
# result.to_json()

In [10]:
# Define a condition
check_cond = check.add_condition_drift_score_less_than(max_allowed_categorical_score=0.2,
                                                       max_allowed_numeric_score=0.1)

In [15]:
result = check_cond.run(train_dataset=train_dataset, test_dataset=test_dataset)
result.show(show_additional_outputs=False)

VBox(children=(HTML(value='<h4><b>Train Test Feature Drift</b></h4>'), HTML(value='<p>    Calculate drift betw…

In [16]:
# Get an aggregate value
check = TrainTestFeatureDrift(aggregation_method='weighted')
result = check.run(train_dataset=train_dataset, test_dataset=test_dataset, model=model)

deepchecks - INFO - Calculating permutation feature importance. Expected to finish in 1 seconds


{'Weighted Drift Score': 0.11949674427236648}

In [17]:
result.reduce_output()

{'Weighted Drift Score': 0.11949674427236648}

# Train Test Label Drift

In [20]:
import pprint

import numpy as np
import pandas as pd

from deepchecks.tabular import Dataset
from deepchecks.tabular.checks import TrainTestLabelDrift

In [21]:
# Generate data:
# --------------

np.random.seed(42)

train_data = np.concatenate([np.random.randn(1000,2), np.random.choice(a=[1,0], p=[0.5, 0.5], size=(1000, 1))], axis=1)
#Create test_data with drift in label:
test_data = np.concatenate([np.random.randn(1000,2), np.random.choice(a=[1,0], p=[0.35, 0.65], size=(1000, 1))], axis=1)

df_train = pd.DataFrame(train_data, columns=['col1', 'col2', 'target'])
df_test = pd.DataFrame(test_data, columns=['col1', 'col2', 'target'])

train_dataset = Dataset(df_train, label='target')
test_dataset = Dataset(df_test, label='target')



In [22]:
df_train.head()

Unnamed: 0,col1,col2,target
0,0.496714,-0.138264,1.0
1,0.647689,1.52303,1.0
2,-0.234153,-0.234137,1.0
3,1.579213,0.767435,1.0
4,-0.469474,0.54256,0.0


In [23]:
check = TrainTestLabelDrift()
result = check.run(train_dataset=train_dataset, test_dataset=test_dataset)
result

VBox(children=(HTML(value='<h4><b>Train Test Label Drift</b></h4>'), HTML(value='<p>    Calculate label drift …

In [24]:
# Run Check on a Regression Label
# Generate data:
# --------------

train_data = np.concatenate([np.random.randn(1000,2), np.random.randn(1000, 1)], axis=1)
test_data = np.concatenate([np.random.randn(1000,2), np.random.randn(1000, 1)], axis=1)

df_train = pd.DataFrame(train_data, columns=['col1', 'col2', 'target'])
df_test = pd.DataFrame(test_data, columns=['col1', 'col2', 'target'])
#Create drift in test:
df_test['target'] = df_test['target'].astype('float') + abs(np.random.randn(1000)) + np.arange(0, 1, 0.001) * 4

train_dataset = Dataset(df_train, label='target')
test_dataset = Dataset(df_test, label='target')



In [25]:
df_train.head()

Unnamed: 0,col1,col2,target
0,-1.348177,0.015682,-0.56193
1,1.900268,1.023057,-1.428905
2,1.643542,0.391128,1.189226
3,1.16751,-0.723965,0.667245
4,-0.64434,-2.73914,-0.028218


In [26]:
check = TrainTestLabelDrift()
result = check.run(train_dataset=train_dataset, test_dataset=test_dataset)
result

VBox(children=(HTML(value='<h4><b>Train Test Label Drift</b></h4>'), HTML(value='<p>    Calculate label drift …

In [27]:
# Add a condition
check_cond = TrainTestLabelDrift().add_condition_drift_score_less_than()
check_cond.run(train_dataset=train_dataset, test_dataset=test_dataset)

VBox(children=(HTML(value='<h4><b>Train Test Label Drift</b></h4>'), HTML(value='<p>    Calculate label drift …

# Multivariate Drift

In [28]:
from urllib.request import urlopen

import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder

from deepchecks.tabular import Dataset
from deepchecks.tabular.datasets.classification import adult

In [31]:
label_name = 'income'
train_ds, test_ds = adult.load_data()
encoder = LabelEncoder()
train_ds.data[label_name] = encoder.fit_transform(train_ds.data[label_name])
test_ds.data[label_name] = encoder.transform(test_ds.data[label_name])

In [33]:
train_ds.data.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,0
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,0
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,0
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,0
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,0


In [34]:
from deepchecks.tabular.checks import MultivariateDrift

check = MultivariateDrift()
check.run(train_dataset=train_ds, test_dataset=test_ds)

VBox(children=(HTML(value='<h4><b>Multivariate Drift</b></h4>'), HTML(value='<p>    Calculate drift between th…

In [35]:
# Introduce drift to dataset
sample_size = 10000
random_seed = 0

train_drifted_df = pd.concat([train_ds.data.sample(min(sample_size, train_ds.n_samples) - 5000, random_state=random_seed),
                             train_ds.data[train_ds.data['sex'] == ' Female'].sample(5000, random_state=random_seed)])
test_drifted_df = test_ds.data.sample(min(sample_size, test_ds.n_samples), random_state=random_seed)

train_drifted_ds = Dataset(train_drifted_df, label=label_name, cat_features=train_ds.cat_features)
test_drifted_ds = Dataset(test_drifted_df, label=label_name, cat_features=test_ds.cat_features)

In [36]:
check = MultivariateDrift()
check.run(train_dataset=train_drifted_ds, test_dataset=test_drifted_ds)

VBox(children=(HTML(value='<h4><b>Multivariate Drift</b></h4>'), HTML(value='<p>    Calculate drift between th…

In [37]:
# Define a Condition
check = MultivariateDrift()
check.add_condition_overall_drift_value_less_than(0.1)
check.run(train_dataset=train_drifted_ds, test_dataset=test_drifted_ds)

VBox(children=(HTML(value='<h4><b>Multivariate Drift</b></h4>'), HTML(value='<p>    Calculate drift between th…

# Train Test Prediction Drift

In [38]:
from sklearn.preprocessing import LabelEncoder

from deepchecks.tabular.checks import TrainTestPredictionDrift
from deepchecks.tabular.datasets.classification import adult

In [39]:
label_name = 'income'
train_ds, test_ds = adult.load_data()

In [40]:
# Introducing drift:
test_ds.data['education-num'] = 13
test_ds.data['education'] = ' Bachelors'

In [41]:
# Build Model
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OrdinalEncoder

In [42]:
numeric_transformer = SimpleImputer()
categorical_transformer = Pipeline(
    steps=[("imputer", SimpleImputer(strategy="most_frequent")), ("encoder", OrdinalEncoder())]
)

train_ds.features
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, train_ds.numerical_features),
        ("cat", categorical_transformer, train_ds.cat_features),
    ]
)

model = Pipeline(steps=[("preprocessing", preprocessor), ("model", RandomForestClassifier(max_depth=5, n_jobs=-1))])
model = model.fit(train_ds.data[train_ds.features], train_ds.data[train_ds.label_name])

In [43]:
check = TrainTestPredictionDrift()
result = check.run(train_dataset=train_ds, test_dataset=test_ds, model=model)
result

VBox(children=(HTML(value='<h4><b>Train Test Prediction Drift</b></h4>'), HTML(value='<p>    Calculate predict…

In [44]:
# The prediction drift check can also calculate drift on the predicted classes rather than 
# the probabilities. This is the default behavior for multiclass tasks. To force this behavior 
# for binary tasks, set the drift_mode parameter to prediction.
check = TrainTestPredictionDrift(drift_mode='prediction')
result = check.run(train_dataset=train_ds, test_dataset=test_ds, model=model)
result

VBox(children=(HTML(value='<h4><b>Train Test Prediction Drift</b></h4>'), HTML(value='<p>    Calculate predict…