# ActiveDetect Example

In this example, we use the HR promotion dataset to explore how to predict errors within a dataset with the multiple error modules available in ActiveDetect.

This subclass uses the error detection modules presented in Sanjay Krishnan et al.'s activedetect repo and paper: [BoostClean: Automated Error Detection and Repair for Machine Learning](https://arxiv.org/abs/1711.01299).

In [1]:
import sys
sys.path.append('../../../notebooks')

import pandas as pd
import numpy as np
from raimitigations.datadiagnostics import *
from download import download_datasets


In [2]:
# Download the example datasets into the shared datasets folder.
data_dir = '../../../datasets/'
download_datasets(data_dir)

# Load the HR promotion training split and keep only the first 10,000 rows.
train_csv_path = data_dir + 'hr_promotion/train.csv'
dataset = pd.read_csv(train_csv_path).head(10000)

dataset

Unnamed: 0,employee_id,department,region,education,gender,recruitment_channel,no_of_trainings,age,previous_year_rating,length_of_service,KPIs_met >80%,awards_won?,avg_training_score,is_promoted
0,65438,Sales & Marketing,region_7,Master's & above,f,sourcing,1,35,5.0,8,1,0,49,0
1,65141,Operations,region_22,Bachelor's,m,other,1,30,5.0,4,0,0,60,0
2,7513,Sales & Marketing,region_19,Bachelor's,m,sourcing,1,34,3.0,7,0,0,50,0
3,2542,Sales & Marketing,region_23,Bachelor's,m,other,2,39,1.0,10,0,0,50,0
4,48945,Technology,region_26,Bachelor's,m,other,1,45,3.0,2,0,0,73,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,14934,Procurement,region_13,Master's & above,f,other,1,37,4.0,7,1,0,71,0
9996,22040,Sales & Marketing,region_33,Master's & above,m,sourcing,1,39,3.0,7,0,0,48,0
9997,14188,Finance,region_13,Master's & above,f,sourcing,1,33,4.0,4,1,0,58,0
9998,73566,Operations,region_28,Master's & above,m,other,1,32,4.0,4,1,0,57,1


## Error Modules
First, let's explore each error module individually:

### QuantitativeErrorModule

In [3]:
# Detector restricted to a single error module (the quantitative one),
# created with `col_predict=None` and fitted on the full dataset.
quantitative_modules = [QuantitativeErrorModule()]
active_detector_1 = ActiveDetect(
    df=dataset,
    error_modules=quantitative_modules,
    col_predict=None,
    verbose=False,
)
active_detector_1.fit()

<raimitigations.datadiagnostics.active_detect.ActiveDetect at 0x7f3c0a91f430>

Take a look at the automatically assigned types for each column:

In [4]:
# One type label per dataset column (14 entries here); the labels appear to
# follow the column order of `dataset`.
active_detector_1.types

['numerical',
 'categorical',
 'categorical',
 'categorical',
 'categorical',
 'categorical',
 'numerical',
 'numerical',
 'numerical',
 'numerical',
 'numerical',
 'numerical',
 'numerical',
 'numerical']

Call `predict()` to detect errors in the dataset. Here we also print the erroneous values found in each column, so that we can later verify that the error matrix matches these values:

In [5]:
# Keep the result so it can be cross-checked against the dataset further down.
# The displayed cells are 1; the verification below looks for flagged cells
# as -1.
error_matrix = active_detector_1.predict(dataset)
error_matrix

employee_id
error_vals:  []
no_of_trainings
error_vals:  [4.0, 5.0, 6.0, 7.0, 8.0, 10.0]
age
error_vals:  []
previous_year_rating
error_vals:  []
length_of_service
error_vals:  [32.0, 33.0, 34.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, 31.0]
KPIs_met >80%
error_vals:  []
awards_won?
error_vals:  [1.0]
avg_training_score
error_vals:  []
is_promoted
error_vals:  []


array([[1., 1., 1., ..., 1., 1., 1.],
       [1., 1., 1., ..., 1., 1., 1.],
       [1., 1., 1., ..., 1., 1., 1.],
       ...,
       [1., 1., 1., ..., 1., 1., 1.],
       [1., 1., 1., ..., 1., 1., 1.],
       [1., 1., 1., ..., 1., 1., 1.]])

Verify that the erroneous values and their indices match between the dataset and the error matrix:

In [6]:
# Row labels whose `no_of_trainings` is one of the values reported as errors.
flagged_training_counts = [4, 5, 6, 7, 8, 10]
is_flagged = dataset['no_of_trainings'].isin(flagged_training_counts)
dataset.loc[is_flagged].index

Int64Index([ 117,  119,  220,  312,  326,  399,  548,  601,  614,  622,
            ...
            9179, 9277, 9394, 9434, 9515, 9540, 9695, 9862, 9983, 9991],
           dtype='int64', length=114)

In [7]:
# Look up the column position by name instead of hard-coding it, so the check
# keeps pointing at `no_of_trainings` even if the column order changes.
trainings_col = dataset.columns.get_loc('no_of_trainings')
# Flagged cells are marked with -1 in the error matrix; list their row positions.
np.where(error_matrix[:, trainings_col] == -1)[0]

array([ 117,  119,  220,  312,  326,  399,  548,  601,  614,  622,  682,
        778,  946,  972, 1095, 1297, 1532, 1552, 1629, 1884, 1988, 2276,
       2388, 2447, 2534, 2555, 2689, 2881, 3188, 3365, 3477, 3550, 3582,
       3587, 3604, 3620, 3719, 3844, 3846, 3911, 3948, 3979, 4099, 4154,
       4192, 4227, 4401, 4423, 4505, 4575, 4900, 4916, 5021, 5062, 5185,
       5193, 5294, 5371, 5429, 5523, 5573, 5663, 5775, 5801, 5883, 6146,
       6207, 6265, 6291, 6316, 6394, 6469, 6495, 6538, 6552, 6601, 6736,
       6749, 6860, 6953, 6989, 6999, 7018, 7126, 7191, 7285, 7374, 7520,
       7659, 7719, 7847, 8058, 8153, 8261, 8393, 8535, 8587, 8754, 8771,
       8810, 8812, 8901, 8925, 9059, 9179, 9277, 9394, 9434, 9515, 9540,
       9695, 9862, 9983, 9991])

We can also retrieve the error matrix of each error module individually:

In [8]:
# The module is selected by its class name. The resulting matrix has one row
# per record and one column per dataset column ((10000, 14) here).
matrix = active_detector_1.get_error_module_matrix('QuantitativeErrorModule')
matrix.shape

(10000, 14)

### PuncErrorModule

In [9]:
# Same setup as before, but with the punctuation error module as the only
# detector.
punctuation_modules = [PuncErrorModule()]
active_detector_2 = ActiveDetect(
    df=dataset,
    error_modules=punctuation_modules,
    col_predict=None,
    verbose=False,
)
active_detector_2.fit()

<raimitigations.datadiagnostics.active_detect.ActiveDetect at 0x7f3c0a704d30>

In [10]:
# The printed output lists a set of columns with the values flagged in each
# (none for this module), followed by the returned error matrix.
active_detector_2.predict(dataset)

department
[]
region
[]
education
[]
gender
[]
recruitment_channel
[]


array([[1., 1., 1., ..., 1., 1., 1.],
       [1., 1., 1., ..., 1., 1., 1.],
       [1., 1., 1., ..., 1., 1., 1.],
       ...,
       [1., 1., 1., ..., 1., 1., 1.],
       [1., 1., 1., ..., 1., 1., 1.],
       [1., 1., 1., ..., 1., 1., 1.]])

### SemanticErrorModule

In [11]:
# Detector that uses only the semantic error module.
semantic_modules = [SemanticErrorModule()]
active_detector_3 = ActiveDetect(
    df=dataset,
    error_modules=semantic_modules,
    col_predict=None,
    verbose=False,
)
active_detector_3.fit()

<raimitigations.datadiagnostics.active_detect.ActiveDetect at 0x7f3c0a91f5b0>

In [12]:
# The printed output lists a set of columns with the values flagged in each
# (none for this module), followed by the returned error matrix.
active_detector_3.predict(dataset)

department
[]
region
[]
education
[]
gender
[]
recruitment_channel
[]


array([[1., 1., 1., ..., 1., 1., 1.],
       [1., 1., 1., ..., 1., 1., 1.],
       [1., 1., 1., ..., 1., 1., 1.],
       ...,
       [1., 1., 1., ..., 1., 1., 1.],
       [1., 1., 1., ..., 1., 1., 1.],
       [1., 1., 1., ..., 1., 1., 1.]])

### DistributionErrorModule

In [13]:
# Detector that uses only the distribution error module. Fitting and
# predicting happen in the same cell, so the cell displays the predict() result.
distribution_modules = [DistributionErrorModule()]
active_detector_4 = ActiveDetect(
    df=dataset,
    error_modules=distribution_modules,
    col_predict=None,
    verbose=False,
)
active_detector_4.fit()
active_detector_4.predict(dataset)

employee_id
[]
department
[]
region
['region_2']
education
[]
gender
[]
recruitment_channel
[]
no_of_trainings
[]
age
[]
previous_year_rating
[]
length_of_service
[]
KPIs_met >80%
[]
awards_won?
[]
avg_training_score
[]
is_promoted
[]


array([[1., 1., 1., ..., 1., 1., 1.],
       [1., 1., 1., ..., 1., 1., 1.],
       [1., 1., 1., ..., 1., 1., 1.],
       ...,
       [1., 1., 1., ..., 1., 1., 1.],
       [1., 1., 1., ..., 1., 1., 1.],
       [1., 1., 1., ..., 1., 1., 1.]])

### StringSimilarityErrorModule

In [14]:
# Detector that uses only the string-similarity error module.
active_detector_5 = ActiveDetect(
    df=dataset,
    col_predict=None,
    error_modules=[StringSimilarityErrorModule()],
    verbose=False,
)
active_detector_5.fit()
# Re-tag `department` as 'string' after fitting; the column is located by
# name rather than by a hard-coded position. With this override the module
# reports on `department` only (presumably it examines 'string' columns only).
department_idx = dataset.columns.get_loc('department')
active_detector_5.types[department_idx] = 'string'
active_detector_5.predict(dataset)


department
[]


array([[1., 1., 1., ..., 1., 1., 1.],
       [1., 1., 1., ..., 1., 1., 1.],
       [1., 1., 1., ..., 1., 1., 1.],
       ...,
       [1., 1., 1., ..., 1., 1., 1.],
       [1., 1., 1., ..., 1., 1., 1.],
       [1., 1., 1., ..., 1., 1., 1.]])

### CharSimilarityErrorModule

In [15]:
# Detector that uses only the character-similarity error module.
active_detector_6 = ActiveDetect(
    df=dataset,
    col_predict=None,
    error_modules=[CharSimilarityErrorModule()],
    verbose=False,
)
active_detector_6.fit()
# Re-tag `department` as 'string' after fitting; the column is located by
# name rather than by a hard-coded position. With this override the module
# reports on `department` only (presumably it examines 'string' columns only).
department_idx = dataset.columns.get_loc('department')
active_detector_6.types[department_idx] = 'string'
active_detector_6.predict(dataset)

department
['HR']


array([[1., 1., 1., ..., 1., 1., 1.],
       [1., 1., 1., ..., 1., 1., 1.],
       [1., 1., 1., ..., 1., 1., 1.],
       ...,
       [1., 1., 1., ..., 1., 1., 1.],
       [1., 1., 1., ..., 1., 1., 1.],
       [1., 1., 1., ..., 1., 1., 1.]])

In [17]:
# transform() yields a new frame; the two `unique()` checks that follow show
# 'HR' still present in `dataset` but replaced by NaN in `new_df`.
new_df = active_detector_6.transform(dataset)

In [19]:
# Distinct department values in the original data (includes 'HR').
dataset['department'].unique()

array(['Sales & Marketing', 'Operations', 'Technology', 'Analytics',
       'R&D', 'Procurement', 'Finance', 'HR', 'Legal'], dtype=object)

In [18]:
# Distinct department values after transform(): the flagged value 'HR' now
# shows up as NaN.
new_df['department'].unique()

array(['Sales & Marketing', 'Operations', 'Technology', 'Analytics',
       'R&D', 'Procurement', 'Finance', nan, 'Legal'], dtype=object)