### Examples on how to use MetaDataDiagnostic class

In [1]:
import sys
sys.path.append('../../../notebooks')
from raimitigations.datadiagnostics import *
from download import download_datasets

import pandas as pd
import random
import string
import numpy as np

In [2]:
data_dir = '../../../datasets/'
download_datasets(data_dir)
dataset = pd.read_csv(data_dir + 'hr_promotion/train.csv')
dataset = dataset[:10000].drop('employee_id', axis=1)

dataset

Unnamed: 0,department,region,education,gender,recruitment_channel,no_of_trainings,age,previous_year_rating,length_of_service,KPIs_met >80%,awards_won?,avg_training_score,is_promoted
0,Sales & Marketing,region_7,Master's & above,f,sourcing,1,35,5.0,8,1,0,49,0
1,Operations,region_22,Bachelor's,m,other,1,30,5.0,4,0,0,60,0
2,Sales & Marketing,region_19,Bachelor's,m,sourcing,1,34,3.0,7,0,0,50,0
3,Sales & Marketing,region_23,Bachelor's,m,other,2,39,1.0,10,0,0,50,0
4,Technology,region_26,Bachelor's,m,other,1,45,3.0,2,0,0,73,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,Procurement,region_13,Master's & above,f,other,1,37,4.0,7,1,0,71,0
9996,Sales & Marketing,region_33,Master's & above,m,sourcing,1,39,3.0,7,0,0,48,0
9997,Finance,region_13,Master's & above,f,sourcing,1,33,4.0,4,1,0,58,0
9998,Operations,region_28,Master's & above,m,other,1,32,4.0,4,1,0,57,1


#### ActiveDetect only

#### Dataset Edit
Datasets tend to naturally have errors in the data and this dataset is no different. However, in addition to existing errors, for the sake of this tutorial, we will add some synthetic errors to help highlight functionalities offered by `ActiveDetect`.

In [3]:
random.seed(42)
np.random.seed(42)
def get_random_indices(df: pd.DataFrame, size: int) -> list:
    indices = df.index.to_list()
    return random.sample(indices, size)

Add quantitative errors to `"avg_training_score"` column:

In [4]:
rand_indices = get_random_indices(dataset, 6)
dataset.loc[rand_indices, "avg_training_score"] = dataset.loc[rand_indices, "avg_training_score"] * 1000

Add punctuation errors to `"gender"` column:

In [5]:
rand_indices = get_random_indices(dataset, 5)
dataset.loc[rand_indices, "gender"] = ['','.',',','  ','. ']

Add semantic errors to `"education"` column:

In [6]:
rand_indices = get_random_indices(dataset, 2)
dataset.loc[rand_indices, "education"] = ["not an education status 1", "not an education status 2"]

Add distribution errors to `"region"` column:

In [7]:
dataset["region"].value_counts()

region_2     2231
region_22    1183
region_7      886
region_15     509
region_13     488
region_26     427
region_31     342
region_4      320
region_27     311
region_16     286
region_28     236
region_11     216
region_23     214
region_29     189
region_32     184
region_19     158
region_14     155
region_17     151
region_20     144
region_5      139
region_25     134
region_6      134
region_1      124
region_30     117
region_8      115
region_10     113
region_24      84
region_12      82
region_9       78
region_21      65
region_3       64
region_34      58
region_33      56
region_18       7
Name: region, dtype: int64

In [8]:
rand_indices = get_random_indices(dataset, 5000)
dataset.loc[rand_indices, "region"] = "region_x"

Add a synthetic string column `"X"` (including synthetic errors) to evaluate string and character similarity errors:

In [9]:
random.seed(100)
dataset["X"] = ""
dataset["X"] = [''.join(random.choice(string.ascii_lowercase) for i in range(5)) for i in range(dataset.shape[0])]
rand_indices = get_random_indices(dataset, 2)
dataset.loc[rand_indices, "X"] = [str(random.randint(0, 1000)), "?????****?????"]

Add missing values to `"region"` column

In [10]:
rand_indices = get_random_indices(dataset, 5)
dataset.loc[rand_indices, "region"] = np.nan

### EXAMPLES

In [11]:
## mode = "column"
active_detector_1 = ActiveDetect(
    col_predict=None,
    mode="column",
    error_modules=[QuantitativeErrorModule()],
    verbose=False,
)
## mode = "column"
active_detector_2 = ActiveDetect(
    col_predict=None,
    error_modules=[PuncErrorModule()],
    verbose=False,
)

##### mode="column"; type=absolute: returns a mapping of every column index to a list of erroneous rows in that column

In [12]:
metadata_obj_1 = MetaDataDiagnostic(
    df = dataset,
    detectors = [active_detector_1, active_detector_2],
    mode = "column",
    type="aggregate")
metadata_obj_1.get_diagnostics()

{'Quantitative Errors': {0: 0,
  1: 0,
  2: 0,
  3: 0,
  4: 0,
  5: 0,
  6: 0,
  7: 0,
  8: 0,
  9: 0,
  10: 0,
  11: 0,
  12: 0,
  13: 0},
 'Punctuation Errors': {0: 0,
  1: 0,
  2: 0,
  3: 0,
  4: 0,
  5: 0,
  6: 0,
  7: 0,
  8: 0,
  9: 0,
  10: 0,
  11: 0,
  12: 0,
  13: 0}}

##### mode="row"; type=aggregate: appends ratio of erroneous rows (for all columns)

In [13]:
metadata_obj_2 = MetaDataDiagnostic(
    df = dataset,
    detectors = [active_detector_1, active_detector_2],
    mode = "row",
    type="aggregate")
metadata_obj_2.get_diagnostics()

{'Quantitative Errors': 0, 'Punctuation Errors': 0}

##### mode="column"; type=absolute: returns a mapping of every column index to a list of erroneous rows in that column

In [14]:
metadata_obj_3 = MetaDataDiagnostic(
    df = dataset,
    detectors = [active_detector_1, active_detector_2],
    mode = "column",
    type="absolute")
metadata_obj_3.get_diagnostics()

{'Quantitative Errors': {0: [],
  1: [],
  2: [],
  3: [],
  4: [],
  5: [117,
   119,
   220,
   312,
   326,
   399,
   548,
   601,
   614,
   622,
   682,
   778,
   946,
   972,
   1095,
   1297,
   1532,
   1552,
   1629,
   1884,
   1988,
   2276,
   2388,
   2447,
   2534,
   2555,
   2689,
   2881,
   3188,
   3365,
   3477,
   3550,
   3582,
   3587,
   3604,
   3620,
   3719,
   3844,
   3846,
   3911,
   3948,
   3979,
   4099,
   4154,
   4192,
   4227,
   4401,
   4423,
   4505,
   4575,
   4900,
   4916,
   5021,
   5062,
   5185,
   5193,
   5294,
   5371,
   5429,
   5523,
   5573,
   5663,
   5775,
   5801,
   5883,
   6146,
   6207,
   6265,
   6291,
   6316,
   6394,
   6469,
   6495,
   6538,
   6552,
   6601,
   6736,
   6749,
   6860,
   6953,
   6989,
   6999,
   7018,
   7126,
   7191,
   7285,
   7374,
   7520,
   7659,
   7719,
   7847,
   8058,
   8153,
   8261,
   8393,
   8535,
   8587,
   8754,
   8771,
   8810,
   8812,
   8901,
   8925,
   9059,
   9179

##### mode="row"; type=absolute: appends a list of erroneous rows (for all columns)

In [15]:
metadata_obj_4 = MetaDataDiagnostic(
    df = dataset,
    detectors = [active_detector_1, active_detector_2],
    mode = "row",
    type="absolute")
metadata_obj_4.get_diagnostics()

{'Quantitative Errors': [42,
  66,
  117,
  119,
  122,
  138,
  140,
  200,
  208,
  220,
  230,
  239,
  250,
  268,
  312,
  314,
  317,
  326,
  335,
  399,
  409,
  422,
  426,
  435,
  445,
  449,
  515,
  548,
  574,
  580,
  590,
  601,
  613,
  614,
  622,
  639,
  680,
  682,
  703,
  712,
  725,
  735,
  776,
  778,
  810,
  893,
  946,
  972,
  994,
  1009,
  1091,
  1095,
  1104,
  1109,
  1112,
  1113,
  1147,
  1195,
  1204,
  1226,
  1249,
  1250,
  1255,
  1268,
  1297,
  1354,
  1375,
  1443,
  1466,
  1482,
  1510,
  1532,
  1552,
  1600,
  1629,
  1642,
  1666,
  1675,
  1750,
  1786,
  1824,
  1884,
  1936,
  1962,
  1979,
  1988,
  2004,
  2056,
  2071,
  2128,
  2131,
  2139,
  2156,
  2195,
  2212,
  2234,
  2276,
  2286,
  2296,
  2314,
  2343,
  2348,
  2364,
  2388,
  2391,
  2395,
  2409,
  2422,
  2447,
  2454,
  2477,
  2478,
  2534,
  2555,
  2582,
  2589,
  2610,
  2616,
  2645,
  2649,
  2678,
  2689,
  2718,
  2843,
  2873,
  2881,
  2901,
  2913,
  29

##### mode="full-data"; type=na: ignores type and returns the count of all erroneous values in full data

In [16]:
metadata_obj_5 = MetaDataDiagnostic(
    df = dataset,
    detectors = [active_detector_1, active_detector_2],
    mode = "full-data")
metadata_obj_5.get_diagnostics()

{'Quantitative Errors': 475, 'Punctuation Errors': 6}

#### IsolationForestDetect Only

In [17]:
data_dir = '../../../datasets/'
download_datasets(data_dir)
dataset2 = pd.read_csv(data_dir + 'hr_promotion/train.csv')
dataset2 = dataset2[:10000].drop('employee_id', axis=1)
dataset2 = dataset2.dropna(axis=0).reset_index(drop=True)

#### Dataset Edit
Datasets tend to naturally have errors in them and this dataset is no different. However, in addition to existing errors, for the sake of this tutorial, we will inject some synthetic outliers to help highlight functionalities offered by `IsolationForestDetect`.

In [18]:
random.seed(42)
np.random.seed(42)
def get_random_indices(df: pd.DataFrame, size: int) -> list:
    indices = df.index.to_list()
    return random.sample(indices, size)

In [19]:
numerical_columns = list(dataset2.select_dtypes(include=['number']).columns)
categorical_columns = list(set(list(dataset2)) - set(numerical_columns))


In [20]:
synthetic_outliers={}
def append_synthetic_outliers(synthetic_outliers_dict, column, outlier):
    if column not in synthetic_outliers_dict:
        synthetic_outliers_dict[column] = set()
    synthetic_outliers_dict[column].add(outlier)
    return synthetic_outliers_dict

In [21]:
# injecting random values
rand_indices = get_random_indices(dataset2, 10)
for index in rand_indices:
    column = np.random.choice(numerical_columns)
    mean = dataset2[column].mean()
    std_dev = dataset2[column].std()
    outlier = np.random.choice([mean - 4 * std_dev, mean + 4 * std_dev])
    dataset2.at[index, column] = outlier
    append_synthetic_outliers(synthetic_outliers, column, outlier)
    print("column: ", column, "; outlier: ", outlier)

column:  avg_training_score ; outlier:  117.47971917329522
column:  KPIs_met >80% ; outlier:  -1.5597340269076336
column:  previous_year_rating ; outlier:  8.365703973833686
column:  KPIs_met >80% ; outlier:  -1.5615605053124915
column:  avg_training_score ; outlier:  117.5312748043361
column:  previous_year_rating ; outlier:  -1.6581673717621652
column:  previous_year_rating ; outlier:  -1.6629606123281913
column:  is_promoted ; outlier:  -1.0617178912195158
column:  length_of_service ; outlier:  23.266773848505803
column:  is_promoted ; outlier:  -1.0628726336695264


In [22]:
# injecting extreme values
rand_indices = get_random_indices(dataset2, 10)
for index in rand_indices:
    column = np.random.choice(numerical_columns)
    extreme_value = 100 * dataset2.at[index, column]
    dataset2.at[index, column] = extreme_value
    append_synthetic_outliers(synthetic_outliers, column, extreme_value)
    print("column: ", column, "; outlier: ", extreme_value)


column:  awards_won? ; outlier:  0
column:  KPIs_met >80% ; outlier:  0.0
column:  age ; outlier:  3800
column:  is_promoted ; outlier:  100.0
column:  length_of_service ; outlier:  300.0
column:  awards_won? ; outlier:  0
column:  awards_won? ; outlier:  0
column:  age ; outlier:  3100
column:  is_promoted ; outlier:  100.0
column:  length_of_service ; outlier:  400.0


In [23]:
# injecting inverted values
rand_indices = get_random_indices(dataset2, 10)
for index in rand_indices:
    column = np.random.choice(numerical_columns)
    inverted_value = -1 * dataset2.at[index, column]
    dataset2.at[index, column] = inverted_value
    append_synthetic_outliers(synthetic_outliers, column, inverted_value)
    print("column: ", column, "; outlier: ", inverted_value)

column:  KPIs_met >80% ; outlier:  -0.0
column:  no_of_trainings ; outlier:  -1
column:  length_of_service ; outlier:  -11.0
column:  age ; outlier:  -36
column:  awards_won? ; outlier:  0
column:  KPIs_met >80% ; outlier:  -1.0
column:  length_of_service ; outlier:  -16.0
column:  no_of_trainings ; outlier:  -1
column:  no_of_trainings ; outlier:  -3
column:  previous_year_rating ; outlier:  -3.0


In [24]:
rand_indices = get_random_indices(dataset2, 4)
education_outliers = ["not an education status 1", "not an education status 2", "not an education status 3", "not an education status 4"]
synthetic_outliers["education"] = education_outliers
dataset2.loc[rand_indices, "education"] = education_outliers

### EXAMPLES

In [25]:
from sklearn.ensemble import IsolationForest
import warnings
warnings.filterwarnings('ignore')

sklearn_obj = IsolationForest(random_state=100, contamination=0.15, verbose=False)
isf_detector_1 = IsolationForestDetect(
    col_predict=None,
    sklearn_obj=sklearn_obj,
    enable_encoder=True,
    verbose=False,)

##### mode="column"; type=absolute: returns a mapping of every column index to a list of erroneous rows in that column

In [26]:
metadata_obj_1 = MetaDataDiagnostic(
    df=dataset2,
    detectors=[isf_detector_1],
    mode="column",
    type="aggregate")
metadata_obj_1.get_diagnostics()

{'IsolationForest Outliers': {0: 0,
  1: 0,
  2: 0,
  3: 0,
  4: 0,
  5: 0,
  6: 0,
  7: 0,
  8: 0,
  9: 0,
  10: 0,
  11: 0,
  12: 0}}

##### mode="row"; type=aggregate: appends ratio of erroneous rows (for all columns)

In [27]:
metadata_obj_1 = MetaDataDiagnostic(
    df=dataset2,
    detectors=[isf_detector_1],
    mode="row",
    type="aggregate")
metadata_obj_1.get_diagnostics()

{'IsolationForest Outliers': 0}

##### mode="column"; type=absolute: returns a mapping of every column index to a list of erroneous rows in that column

In [28]:
metadata_obj_1 = MetaDataDiagnostic(
    df=dataset2,
    detectors=[isf_detector_1],
    mode="column",
    type="absolute")
metadata_obj_1.get_diagnostics()

{'IsolationForest Outliers': {0: [13,
   20,
   21,
   27,
   37,
   44,
   47,
   50,
   55,
   71,
   73,
   77,
   82,
   115,
   120,
   127,
   133,
   134,
   146,
   178,
   181,
   199,
   223,
   230,
   236,
   247,
   253,
   258,
   264,
   272,
   275,
   280,
   287,
   300,
   303,
   315,
   334,
   357,
   361,
   364,
   369,
   375,
   383,
   422,
   427,
   429,
   436,
   451,
   454,
   457,
   470,
   488,
   491,
   492,
   497,
   504,
   516,
   518,
   521,
   529,
   556,
   567,
   587,
   591,
   605,
   625,
   629,
   630,
   631,
   639,
   643,
   646,
   649,
   654,
   656,
   659,
   667,
   681,
   684,
   691,
   696,
   712,
   714,
   726,
   729,
   748,
   750,
   765,
   783,
   819,
   830,
   832,
   839,
   859,
   880,
   883,
   911,
   925,
   939,
   940,
   944,
   949,
   957,
   958,
   966,
   970,
   971,
   975,
   992,
   1009,
   1013,
   1018,
   1035,
   1037,
   1051,
   1054,
   1075,
   1081,
   1083,
   1084,
   1097,
  

##### mode="row"; type=absolute: appends a list of erroneous rows (for all columns)

In [29]:
metadata_obj_1 = MetaDataDiagnostic(
    df=dataset2,
    detectors=[isf_detector_1],
    mode="row",
    type="absolute")
metadata_obj_1.get_diagnostics()

{'IsolationForest Outliers': [0,
  10,
  37,
  43,
  52,
  58,
  60,
  64,
  72,
  74,
  82,
  95,
  98,
  101,
  103,
  104,
  112,
  116,
  118,
  128,
  129,
  131,
  167,
  174,
  181,
  184,
  201,
  203,
  208,
  212,
  217,
  227,
  234,
  235,
  252,
  256,
  260,
  271,
  276,
  277,
  279,
  283,
  285,
  287,
  290,
  295,
  303,
  308,
  309,
  310,
  315,
  334,
  341,
  353,
  356,
  365,
  369,
  370,
  373,
  376,
  378,
  380,
  384,
  386,
  387,
  389,
  393,
  395,
  399,
  402,
  418,
  434,
  435,
  440,
  441,
  442,
  443,
  448,
  454,
  465,
  466,
  470,
  481,
  489,
  492,
  496,
  502,
  505,
  507,
  514,
  518,
  522,
  525,
  532,
  537,
  541,
  545,
  547,
  563,
  565,
  572,
  591,
  594,
  598,
  604,
  613,
  615,
  622,
  624,
  626,
  634,
  643,
  654,
  669,
  681,
  682,
  684,
  686,
  710,
  720,
  723,
  724,
  731,
  736,
  740,
  751,
  754,
  761,
  768,
  779,
  797,
  820,
  826,
  840,
  844,
  869,
  883,
  889,
  896,
  907,
  939,

##### mode="full-data"; type=na: ignores type and returns the count of all erroneous values in full data

In [30]:
metadata_obj_1 = MetaDataDiagnostic(
    df=dataset2,
    detectors=[isf_detector_1],
    mode="full-data")
metadata_obj_1.get_diagnostics()

{'IsolationForest Outliers': 8490}

#### CorrelatedFeaturesDetect Only

In [31]:
from raimitigations.datadiagnostics import CorrelatedFeaturesDetect
from raimitigations.dataprocessing import CorrelatedFeatures
cor_feat_obj_1 = CorrelatedFeatures(save_json=False)
correlated_features_detect_1 = CorrelatedFeaturesDetect(
    col_predict=None,
    correlatedfeatures_object = cor_feat_obj_1,
    save_json= False,
    verbose=False,
)

In [32]:
metadata_obj_1 = MetaDataDiagnostic(
    df=dataset,
    detectors=[correlated_features_detect_1],
    mode="column",
    type="absolute",
    label_col='is_promoted')
metadata_obj_1.get_diagnostics() #TODO, debug module

No correlations detected. Nothing to be done here.


KeyError: 'ERROR: Column: is_promoted seen at fit time, but not present in dataframe.'

### Combine Modules

In [33]:
metadata_obj_1 = MetaDataDiagnostic(
    df=dataset2,
    detectors=[active_detector_1, active_detector_2, isf_detector_1],
    mode="column",
    type="aggregate",)
metadata_obj_1.get_diagnostics()

{'Quantitative Errors': {0: 0,
  1: 0,
  2: 0,
  3: 0,
  4: 0,
  5: 0,
  6: 0,
  7: 0,
  8: 0,
  9: 0,
  10: 0,
  11: 0,
  12: 0},
 'Punctuation Errors': {0: 0,
  1: 0,
  2: 0,
  3: 0,
  4: 0,
  5: 0,
  6: 0,
  7: 0,
  8: 0,
  9: 0,
  10: 0,
  11: 0,
  12: 0},
 'IsolationForest Outliers': {0: 0,
  1: 0,
  2: 0,
  3: 0,
  4: 0,
  5: 0,
  6: 0,
  7: 0,
  8: 0,
  9: 0,
  10: 0,
  11: 0,
  12: 0}}

### Other functionalities

#### Get categorical data info

##### type = absolute: lists categorical columns

In [34]:
metadata_obj_1 = MetaDataDiagnostic(
    df=dataset2,
    type="absolute",)
metadata_obj_1.get_num_categorical_features()

{'Categorical Features': ['department',
  'region',
  'education',
  'gender',
  'recruitment_channel']}

##### type = aggregate: count of categorical columns

In [35]:
metadata_obj_1 = MetaDataDiagnostic(
    df=dataset2,
    type="aggregate",)
metadata_obj_1.get_num_categorical_features()

{'Categorical Features': 5}