In [125]:
import pandas as pd
import numpy as np
import random
import string
import json
import os

from enum import Enum
from functools import partial

In [63]:
%config Completer.use_jedi = False

In [64]:
def make_random_string():
    """Return a 10-character string of random uppercase ASCII letters and digits."""
    picked = []
    for _ in range(10):
        # One independent draw per character, using the global `random` state.
        picked.append(random.choice(string.ascii_uppercase + string.digits))
    return ''.join(picked)

In [90]:
class AnomalyFunctions:
    """Collection of DataFrame corruptions used to synthesise anomalous datasets.

    Every public function takes a DataFrame and a list of target column names
    and returns a modified copy; the frame passed in is never mutated.
    Functions that operate on a single column use ``columns[0]``.
    """

    @staticmethod
    def _where_upcasting(series: pd.Series, keep, replacement) -> pd.Series:
        """Keep ``series`` where ``keep`` is True and use ``replacement`` elsewhere.

        ``Series.where`` is used instead of item assignment because it widens the
        dtype when the replacement does not fit (e.g. a string into a numeric
        column). If the column's dtype refuses the value outright, the column is
        converted to ``object`` first.
        """
        try:
            return series.where(keep, replacement)
        except (TypeError, ValueError):
            return series.astype(object).where(keep, replacement)

    @staticmethod
    def _inject_value(df: pd.DataFrame, column, value, share) -> pd.DataFrame:
        """Return a copy of ``df`` with ``value`` written into a random share of ``column``.

        ``int(len(df) * share)`` distinct rows (clamped to ``[0, len(df)]``) are
        picked with the global ``np.random`` state.
        """
        out = df.copy()
        n_rows = len(out)
        n_hit = min(n_rows, max(0, int(n_rows * share)))
        if n_hit == 0:
            return out

        keep = np.ones(n_rows, dtype=bool)
        # replace=False so exactly n_hit different rows are overwritten.
        keep[np.random.choice(n_rows, size=n_hit, replace=False)] = False
        out[column] = AnomalyFunctions._where_upcasting(out[column], keep, value)
        return out

    @staticmethod
    def columns_swap(df: pd.DataFrame, columns: list) -> pd.DataFrame:
        """Return a copy of ``df`` with the contents of ``columns[0]`` and ``columns[1]`` exchanged."""
        columnA, columnB = columns[0], columns[1]

        df = df.copy()
        df[columnA], df[columnB] = df[columnB].copy(), df[columnA].copy()
        return df

    @staticmethod
    def columns_partial_swap(df: pd.DataFrame, columns: list) -> pd.DataFrame:
        """Exchange two columns for the second half of the rows only.

        Rows at positions ``len(df) // 2`` and later get the values of the other
        column; earlier rows are left untouched.
        """
        columnA, columnB = columns[0], columns[1]

        df = df.copy()
        first, second = df[columnA], df[columnB]
        keep = np.arange(len(df)) < len(df) // 2

        # Build both results before assigning so neither reads a half-swapped frame.
        # .to_numpy() makes the replacement positional (no index alignment).
        new_first = AnomalyFunctions._where_upcasting(first, keep, second.to_numpy())
        new_second = AnomalyFunctions._where_upcasting(second, keep, first.to_numpy())
        df[columnA] = new_first
        df[columnB] = new_second
        return df

    @staticmethod
    def duplicates_influx(df: pd.DataFrame, columns: list, share=1/2) -> pd.DataFrame:
        """Overwrite a random ``share`` of ``columns[0]`` with that column's first value."""
        columnA = columns[0]

        if len(df) == 0:
            # No first value to duplicate.
            return df.copy()
        return AnomalyFunctions._inject_value(df, columnA, df[columnA].iat[0], share)

    @staticmethod
    def nan_influx(df: pd.DataFrame, columns: list, share=1/2) -> pd.DataFrame:
        """Overwrite a random ``share`` of ``columns[0]`` with missing values (NaN)."""
        columnA = columns[0]
        return AnomalyFunctions._inject_value(df, columnA, np.nan, share)

    @staticmethod
    def random_influx(df: pd.DataFrame, columns: list, share=1/2) -> pd.DataFrame:
        """Overwrite a random ``share`` of ``columns[0]`` with one random string.

        The string comes from a single ``make_random_string()`` call, so every
        affected row receives the same value.
        """
        columnA = columns[0]
        return AnomalyFunctions._inject_value(df, columnA, make_random_string(), share)

    @staticmethod
    def numeric_variance_change(df: pd.DataFrame, columns: list, coef=2) -> pd.DataFrame:
        """Multiply every value of ``columns[0]`` by ``coef``.

        Note that plain scaling changes the column mean by the same factor as
        its spread.
        """
        columnA = columns[0]

        df = df.copy()
        df[columnA] = df[columnA] * coef
        return df

    @staticmethod
    def numeric_mean_change(df: pd.DataFrame, columns: list, coef=0.5) -> pd.DataFrame:
        """Shift ``columns[0]`` so its mean becomes ``coef`` times the original mean.

        Every value moves by the same offset, so differences between rows are
        unchanged.
        """
        columnA = columns[0]

        avg = df[columnA].mean()

        df = df.copy()
        df[columnA] = (df[columnA] - avg) + avg * coef
        return df

    @staticmethod
    def categorical_new_category_influx(df: pd.DataFrame, columns: list, share=1/2) -> pd.DataFrame:
        """Overwrite a random ``share`` of ``columns[0]`` with a previously unseen category.

        The new category is the concatenation of the string forms of the
        existing non-missing categories; underscores are appended if that text
        is empty or already exists as a category.
        """
        columnA = columns[0]

        existing = df[columnA].dropna().unique()
        taken = {str(val) for val in existing}
        # str() so numeric categories do not break the join.
        newVal = ''.join(str(val) for val in existing)
        while not newVal or newVal in taken:
            newVal += '_'

        return AnomalyFunctions._inject_value(df, columnA, newVal, share)

    @staticmethod
    def categorical_category_miss(df: pd.DataFrame, columns: list) -> pd.DataFrame:
        """Remove one category from ``columns[0]`` by relabelling it as another.

        The first non-missing category (in order of appearance) is replaced by
        the last one. Columns with fewer than two categories are returned
        unchanged.
        """
        columnA = columns[0]

        unique_vals = df[columnA].dropna().unique()

        df = df.copy()
        if len(unique_vals) < 2:
            return df

        miss_val = unique_vals[0]
        fill_val = unique_vals[-1]
        # Single .loc assignment: chained indexing may write to a temporary copy.
        df.loc[df[columnA] == miss_val, columnA] = fill_val
        return df

    @staticmethod
    def categorical_distribution_changed(df: pd.DataFrame, columns: list, random_seed=42) -> pd.DataFrame:
        """Resample ``columns[0]`` from its own categories with new probabilities.

        Columns with one to four distinct values use fixed probability vectors;
        larger ones draw the probabilities from a Dirichlet distribution with
        parameters ``1..k``. A private generator seeded with ``random_seed``
        makes the result repeatable without touching the global numpy state.
        """
        columnA = columns[0]

        df = df.copy()
        unique_vals = np.asarray(df[columnA].unique())
        n_categories = len(unique_vals)
        if n_categories == 0:
            # Nothing to sample from (empty frame).
            return df

        fixed_probas = {
            1: [1],
            2: [0.2, 0.8],
            3: [0.2, 0.5, 0.3],
            4: [0.2, 0.2, 0.2, 0.4],
        }

        rng = np.random.default_rng(random_seed)
        sample_probas = fixed_probas.get(n_categories)
        if sample_probas is None:
            sample_probas = rng.dirichlet(np.arange(1, n_categories + 1))

        df[columnA] = rng.choice(unique_vals, size=len(df), p=sample_probas)
        return df

In [103]:
# Column name -> column type mapping for the BankChurners dataset.
# Opened in a `with` block so the file handle is closed once parsing finishes.
with open('./Datasets/Real/BankChurners/metadata.json', 'r') as metadata_file:
    metadata = json.load(metadata_file)

In [108]:
metadata

{'CLIENTNUM': 'numeric',
 'Attrition_Flag': 'categorical',
 'Customer_Age': 'numeric',
 'Gender': 'categorical',
 'Dependent_count': 'numeric',
 'Education_Level': 'categorical',
 'Marital_Status': 'categorical',
 'Income_Category': 'categorical',
 'Card_Category': 'categorical',
 'Months_on_book': 'numeric',
 'Total_Relationship_Count': 'numeric',
 'Months_Inactive_12_mon': 'numeric',
 'Contacts_Count_12_mon': 'numeric',
 'Credit_Limit': 'numeric',
 'Total_Revolving_Bal': 'numeric',
 'Avg_Open_To_Buy': 'numeric',
 'Total_Amt_Chng_Q4_Q1': 'numeric',
 'Total_Trans_Amt': 'numeric',
 'Total_Ct_Chng_Q4_Q1': 'numeric',
 'Avg_Utilization_Ratio': 'numeric'}

In [121]:
# Scratch helper: draw 8 distinct column names at random (replace=False).
# No seed is set, so every run prints a different selection; the saved output
# below matches group 15 of ANOMALY_COLUMNS.
print(np.random.choice(list(metadata.keys()), 8, replace=False).tolist())

['Total_Relationship_Count', 'CLIENTNUM', 'Income_Category', 'Dependent_count', 'Card_Category', 'Attrition_Flag', 'Gender', 'Credit_Limit']


# GENERATE GROUP-LEVEL ANOMALY DATASETS

In [127]:
# Per-group column lists, keyed by anomaly group id (1-15). The generation loop
# further down appends ANOMALY_COLUMNS[key] to `labels` for each dataset built
# from group `key`, and the same lists are repeated as comments above the
# matching ANOMALY_TRANSFORMS entries.
# NOTE(review): group 2 lists 'Customer_Age' twice, and groups 1-3 have fewer
# than the 8 columns used by the other groups - confirm this is intended.
ANOMALY_COLUMNS = {
    1: ['Attrition_Flag', 'Gender', "Education_Level", "Marital_Status", 'Credit_Limit', 'Total_Revolving_Bal', 'Total_Trans_Amt'],
    2: ['Avg_Utilization_Ratio', 'Card_Category', 'Customer_Age', 'Avg_Open_To_Buy', 'Customer_Age', 'Total_Trans_Amt'],
    3: ['Avg_Utilization_Ratio', 'Card_Category', 'Customer_Age', 'Avg_Open_To_Buy', 'Total_Revolving_Bal', 'Total_Trans_Amt'],
    4: ['Avg_Utilization_Ratio', 'Total_Revolving_Bal', 'Attrition_Flag', 'Dependent_count', 'Avg_Open_To_Buy', 'Total_Amt_Chng_Q4_Q1', 'Total_Ct_Chng_Q4_Q1', 'Months_on_book'],
    5: ['Customer_Age', 'Total_Relationship_Count', 'Education_Level', 'Total_Amt_Chng_Q4_Q1', 'Marital_Status', 'Total_Revolving_Bal', 'Avg_Utilization_Ratio', 'Total_Ct_Chng_Q4_Q1'],
    6: ['CLIENTNUM', 'Gender', 'Contacts_Count_12_mon', 'Dependent_count', 'Total_Relationship_Count', 'Credit_Limit', 'Avg_Utilization_Ratio', 'Total_Revolving_Bal'],
    7: ['Total_Amt_Chng_Q4_Q1', 'CLIENTNUM', 'Gender', 'Months_on_book', 'Avg_Utilization_Ratio', 'Income_Category', 'Dependent_count', 'Card_Category'],
    8: ['CLIENTNUM', 'Total_Revolving_Bal', 'Dependent_count', 'Education_Level', 'Contacts_Count_12_mon', 'Gender', 'Avg_Utilization_Ratio', 'Customer_Age'],
    9: ['Credit_Limit', 'Marital_Status', 'Contacts_Count_12_mon', 'Customer_Age', 'Total_Relationship_Count', 'Months_on_book', 'Total_Amt_Chng_Q4_Q1', 'Total_Trans_Amt'],
    10: ['Months_Inactive_12_mon', 'Total_Amt_Chng_Q4_Q1', 'Dependent_count', 'Avg_Utilization_Ratio', 'Gender', 'Total_Revolving_Bal', 'Total_Trans_Amt', 'Months_on_book'],
    11: ['Credit_Limit', 'Gender', 'Education_Level', 'Avg_Utilization_Ratio', 'Months_on_book', 'Total_Ct_Chng_Q4_Q1', 'Customer_Age', 'Total_Amt_Chng_Q4_Q1'],
    12: ['Avg_Utilization_Ratio', 'Avg_Open_To_Buy', 'Total_Relationship_Count', 'Income_Category', 'Total_Revolving_Bal', 'Attrition_Flag', 'CLIENTNUM', 'Months_Inactive_12_mon'],
    13: ['Credit_Limit', 'Customer_Age', 'Total_Relationship_Count', 'Total_Ct_Chng_Q4_Q1', 'Months_Inactive_12_mon', 'Marital_Status', 'Gender', 'Months_on_book'],
    14: ['Months_Inactive_12_mon', 'Total_Relationship_Count', 'Credit_Limit', 'Gender', 'Contacts_Count_12_mon', 'Total_Trans_Amt', 'Dependent_count', 'Total_Ct_Chng_Q4_Q1'],
    15: ['Total_Relationship_Count', 'CLIENTNUM', 'Income_Category', 'Dependent_count', 'Card_Category', 'Attrition_Flag', 'Gender', 'Credit_Limit']
}

# Per-group corruption recipes, keyed by the same group ids as ANOMALY_COLUMNS.
# Each entry is a functools.partial of an AnomalyFunctions function with
# `columns` (and, for some numeric changes, `coef`) already bound, so only the
# DataFrame argument remains to be supplied. The comment above each group
# repeats that group's ANOMALY_COLUMNS list.
# NOTE(review): groups 2 and 3 pass 'Total_Trans_Amt' to
# categorical_category_miss although the metadata lists that column as
# numeric - confirm this is intended.
ANOMALY_TRANSFORMS = {
    # ['Attrition_Flag', 'Gender', "Education_Level", "Marital_Status", 'Credit_Limit', 'Total_Revolving_Bal', 'Total_Trans_Amt']
    1: [
        (partial(AnomalyFunctions.categorical_category_miss, columns=["Attrition_Flag"])),
        (partial(AnomalyFunctions.categorical_category_miss, columns=["Gender"])),
        (partial(AnomalyFunctions.columns_swap, columns=["Education_Level", "Marital_Status"])),
        (partial(AnomalyFunctions.nan_influx, columns=["Credit_Limit"])),
        (partial(AnomalyFunctions.numeric_mean_change, columns=["Total_Revolving_Bal"])),
        (partial(AnomalyFunctions.numeric_variance_change, columns=["Total_Trans_Amt"])),
    ],
    # ['Avg_Utilization_Ratio', 'Card_Category', 'Customer_Age', 'Avg_Open_To_Buy', 'Customer_Age', 'Total_Trans_Amt']
    2: [
        (partial(AnomalyFunctions.numeric_mean_change, columns=["Avg_Utilization_Ratio"], coef=1.5)),
        (partial(AnomalyFunctions.categorical_category_miss, columns=["Card_Category"])),
        (partial(AnomalyFunctions.numeric_variance_change, columns=["Customer_Age"], coef=0.5)),
        (partial(AnomalyFunctions.random_influx, columns=["Avg_Open_To_Buy"])),
        (partial(AnomalyFunctions.duplicates_influx, columns=["Customer_Age"])),
        (partial(AnomalyFunctions.categorical_category_miss, columns=["Total_Trans_Amt"])),
    ],
    # ['Avg_Utilization_Ratio', 'Card_Category', 'Customer_Age', 'Avg_Open_To_Buy', 'Total_Revolving_Bal', 'Total_Trans_Amt']
    3: [
        (partial(AnomalyFunctions.numeric_mean_change, columns=["Avg_Utilization_Ratio"], coef=1.5)),
        (partial(AnomalyFunctions.categorical_category_miss, columns=["Card_Category"])),
        (partial(AnomalyFunctions.numeric_variance_change, columns=["Customer_Age"], coef=0.5)),
        (partial(AnomalyFunctions.random_influx, columns=["Avg_Open_To_Buy"])),
        (partial(AnomalyFunctions.duplicates_influx, columns=["Total_Revolving_Bal"])),
        (partial(AnomalyFunctions.categorical_category_miss, columns=["Total_Trans_Amt"])),
    ], 
    # ['Avg_Utilization_Ratio', 'Total_Revolving_Bal', 'Attrition_Flag', 'Dependent_count', 'Avg_Open_To_Buy', 'Total_Amt_Chng_Q4_Q1', 'Total_Ct_Chng_Q4_Q1', 'Months_on_book']
    4: [
        (partial(AnomalyFunctions.nan_influx, columns=["Avg_Utilization_Ratio"])),
        (partial(AnomalyFunctions.random_influx, columns=["Total_Revolving_Bal"])),
        (partial(AnomalyFunctions.categorical_new_category_influx, columns=["Attrition_Flag"])),
        (partial(AnomalyFunctions.columns_partial_swap, columns=["Dependent_count", "Avg_Open_To_Buy"])),
        (partial(AnomalyFunctions.columns_swap, columns=["Total_Amt_Chng_Q4_Q1", "Total_Ct_Chng_Q4_Q1"])),
        (partial(AnomalyFunctions.numeric_mean_change, columns=["Months_on_book"], coef=1.2)),
    ],
    # ['Customer_Age', 'Total_Relationship_Count', 'Education_Level', 'Total_Amt_Chng_Q4_Q1', 'Marital_Status', 'Total_Revolving_Bal', 'Avg_Utilization_Ratio', 'Total_Ct_Chng_Q4_Q1']
    5: [
        (partial(AnomalyFunctions.columns_swap, columns=["Customer_Age", "Total_Relationship_Count"])),
        (partial(AnomalyFunctions.categorical_new_category_influx, columns=["Education_Level"])),
        (partial(AnomalyFunctions.columns_partial_swap, columns=["Total_Amt_Chng_Q4_Q1", "Marital_Status"])),
        (partial(AnomalyFunctions.numeric_variance_change, columns=["Total_Revolving_Bal"], coef=1.2)),
        (partial(AnomalyFunctions.random_influx, columns=["Avg_Utilization_Ratio"])),
        (partial(AnomalyFunctions.numeric_variance_change, columns=["Total_Ct_Chng_Q4_Q1"], coef=0.8))
    ],
    # ['CLIENTNUM', 'Gender', 'Contacts_Count_12_mon', 'Dependent_count', 'Total_Relationship_Count', 'Credit_Limit', 'Avg_Utilization_Ratio', 'Total_Revolving_Bal']
    6: [
        (partial(AnomalyFunctions.columns_partial_swap, columns=['CLIENTNUM', 'Gender'])),
        (partial(AnomalyFunctions.nan_influx, columns=['Contacts_Count_12_mon'])),
        (partial(AnomalyFunctions.columns_partial_swap, columns=['Dependent_count', 'Total_Relationship_Count'])),
        (partial(AnomalyFunctions.random_influx, columns=['Credit_Limit'])),
        (partial(AnomalyFunctions.numeric_variance_change, columns=['Avg_Utilization_Ratio'], coef=2.0)),
        (partial(AnomalyFunctions.numeric_variance_change, columns=['Total_Revolving_Bal'], coef=1.2))
    ],
    # ['Total_Amt_Chng_Q4_Q1', 'CLIENTNUM', 'Gender', 'Months_on_book', 'Avg_Utilization_Ratio', 'Income_Category', 'Dependent_count', 'Card_Category']
    7: [
        (partial(AnomalyFunctions.numeric_mean_change, columns=['Total_Amt_Chng_Q4_Q1'], coef=0.3)),
        (partial(AnomalyFunctions.random_influx, columns=['CLIENTNUM'])),
        (partial(AnomalyFunctions.categorical_distribution_changed, columns=['Gender'])),
        (partial(AnomalyFunctions.columns_swap, columns=['Months_on_book', 'Avg_Utilization_Ratio'])),
        (partial(AnomalyFunctions.columns_swap, columns=['Income_Category', 'Dependent_count'])),
        (partial(AnomalyFunctions.duplicates_influx, columns=['Card_Category']))
    ],
    # ['CLIENTNUM', 'Total_Revolving_Bal', 'Dependent_count', 'Education_Level', 'Contacts_Count_12_mon', 'Gender', 'Avg_Utilization_Ratio', 'Customer_Age']
    8: [
        (partial(AnomalyFunctions.duplicates_influx, columns=['CLIENTNUM'])),
        (partial(AnomalyFunctions.nan_influx, columns=['Total_Revolving_Bal'])),
        (partial(AnomalyFunctions.columns_partial_swap, columns=['Dependent_count', 'Education_Level'])),
        (partial(AnomalyFunctions.columns_swap, columns=['Contacts_Count_12_mon', 'Gender'])),
        (partial(AnomalyFunctions.numeric_variance_change, columns=['Avg_Utilization_Ratio'], coef=0.5)),
        (partial(AnomalyFunctions.numeric_mean_change, columns=['Customer_Age'], coef=0.9))
    ],
    # ['Credit_Limit', 'Marital_Status', 'Contacts_Count_12_mon', 'Customer_Age', 'Total_Relationship_Count', 'Months_on_book', 'Total_Amt_Chng_Q4_Q1', 'Total_Trans_Amt']
    9: [
        (partial(AnomalyFunctions.numeric_mean_change, columns=['Credit_Limit'])),
        (partial(AnomalyFunctions.nan_influx, columns=['Marital_Status'])),
        (partial(AnomalyFunctions.columns_swap, columns=['Contacts_Count_12_mon', 'Customer_Age'])),
        (partial(AnomalyFunctions.columns_partial_swap, columns=['Total_Relationship_Count', 'Months_on_book'])),
        (partial(AnomalyFunctions.numeric_mean_change, columns=['Total_Amt_Chng_Q4_Q1'])),
        (partial(AnomalyFunctions.nan_influx, columns=['Total_Trans_Amt']))
    ],
    # ['Months_Inactive_12_mon', 'Total_Amt_Chng_Q4_Q1', 'Dependent_count', 'Avg_Utilization_Ratio', 'Gender', 'Total_Revolving_Bal', 'Total_Trans_Amt', 'Months_on_book']
    10: [
        (partial(AnomalyFunctions.columns_swap, columns=['Months_Inactive_12_mon', 'Total_Amt_Chng_Q4_Q1'])),
        (partial(AnomalyFunctions.numeric_mean_change, columns=['Dependent_count'], coef=0.7)),
        (partial(AnomalyFunctions.numeric_mean_change, columns=['Avg_Utilization_Ratio'], coef=0.9)),
        (partial(AnomalyFunctions.categorical_distribution_changed, columns=['Gender'])),
        (partial(AnomalyFunctions.columns_swap, columns=['Total_Revolving_Bal', 'Total_Trans_Amt'])),
        (partial(AnomalyFunctions.numeric_mean_change, columns=['Months_on_book'], coef=1.4))
    ],
    # ['Credit_Limit', 'Gender', 'Education_Level', 'Avg_Utilization_Ratio', 'Months_on_book', 'Total_Ct_Chng_Q4_Q1', 'Customer_Age', 'Total_Amt_Chng_Q4_Q1']
    11: [
        (partial(AnomalyFunctions.random_influx, columns=['Credit_Limit'])),
        (partial(AnomalyFunctions.columns_partial_swap, columns=['Gender', 'Education_Level'])),
        (partial(AnomalyFunctions.nan_influx, columns=['Avg_Utilization_Ratio'])),
        (partial(AnomalyFunctions.columns_partial_swap, columns=['Months_on_book', 'Total_Ct_Chng_Q4_Q1'])),
        (partial(AnomalyFunctions.random_influx, columns=['Customer_Age'])),
        (partial(AnomalyFunctions.nan_influx, columns=['Total_Amt_Chng_Q4_Q1']))
    ], 
    # ['Avg_Utilization_Ratio', 'Avg_Open_To_Buy', 'Total_Relationship_Count', 'Income_Category', 'Total_Revolving_Bal', 'Attrition_Flag', 'CLIENTNUM', 'Months_Inactive_12_mon']
    12: [
        (partial(AnomalyFunctions.columns_partial_swap, columns=['Avg_Utilization_Ratio', 'Avg_Open_To_Buy'])),
        (partial(AnomalyFunctions.numeric_variance_change, columns=['Total_Relationship_Count'], coef=1.2)),
        (partial(AnomalyFunctions.nan_influx, columns=['Income_Category'])),
        (partial(AnomalyFunctions.random_influx, columns=['Total_Revolving_Bal'])),
        (partial(AnomalyFunctions.columns_partial_swap, columns=['Attrition_Flag', 'CLIENTNUM'])),
        (partial(AnomalyFunctions.numeric_variance_change, columns=['Months_Inactive_12_mon'], coef=2.5))
    ], 
    # ['Credit_Limit', 'Customer_Age', 'Total_Relationship_Count', 'Total_Ct_Chng_Q4_Q1', 'Months_Inactive_12_mon', 'Marital_Status', 'Gender', 'Months_on_book']
    13: [
        (partial(AnomalyFunctions.nan_influx, columns=['Credit_Limit'])),
        (partial(AnomalyFunctions.random_influx, columns=['Customer_Age'])),
        (partial(AnomalyFunctions.columns_partial_swap, columns=['Total_Relationship_Count', 'Total_Ct_Chng_Q4_Q1'])),
        (partial(AnomalyFunctions.random_influx, columns=['Months_Inactive_12_mon'])),
        (partial(AnomalyFunctions.columns_swap, columns=['Marital_Status', 'Gender'])),
        (partial(AnomalyFunctions.numeric_mean_change, columns=['Months_on_book'], coef=1.1))
    ],
    # ['Months_Inactive_12_mon', 'Total_Relationship_Count', 'Credit_Limit', 'Gender', 'Contacts_Count_12_mon', 'Total_Trans_Amt', 'Dependent_count', 'Total_Ct_Chng_Q4_Q1']
    14: [
        (partial(AnomalyFunctions.numeric_mean_change, columns=['Months_Inactive_12_mon'], coef=0.7)),
        (partial(AnomalyFunctions.columns_swap, columns=['Total_Relationship_Count', 'Credit_Limit'])),
        (partial(AnomalyFunctions.categorical_new_category_influx, columns=['Gender'])),
        (partial(AnomalyFunctions.numeric_variance_change, columns=['Contacts_Count_12_mon'], coef=2.3)),
        (partial(AnomalyFunctions.columns_swap, columns=['Total_Trans_Amt', 'Dependent_count'])),
        (partial(AnomalyFunctions.nan_influx, columns=['Total_Ct_Chng_Q4_Q1']))
    ],
    # ['Total_Relationship_Count', 'CLIENTNUM', 'Income_Category', 'Dependent_count', 'Card_Category', 'Attrition_Flag', 'Gender', 'Credit_Limit']
    15: [
        (partial(AnomalyFunctions.numeric_variance_change, columns=['Total_Relationship_Count'])),
        (partial(AnomalyFunctions.columns_swap, columns=['CLIENTNUM', 'Income_Category'])),
        (partial(AnomalyFunctions.nan_influx, columns=['Dependent_count'])),
        (partial(AnomalyFunctions.categorical_distribution_changed, columns=['Card_Category'])),
        (partial(AnomalyFunctions.categorical_distribution_changed, columns=['Attrition_Flag'])),
        (partial(AnomalyFunctions.columns_swap, columns=['Gender', 'Credit_Limit']))
    ]
}

In [138]:
# Directory scanned for the input .csv files that are loaded into dirty_dataframes.
DIRTY_BASE_PATH = "./Datasets/Real/BankChurners/DirtyBase"
# Directory the group-level anomaly datasets (dataset_<n>.csv) are written to.
DIRTY_GROUP_PATH = "./Datasets/Real/BankChurners/DirtyGroup"

In [139]:
# Load every .csv file found directly under DIRTY_BASE_PATH.
# os.listdir returns entries in arbitrary order, so the listing is sorted to keep
# the order of dirty_dataframes - and anything numbered from it - stable across runs.
dirty_dataframes = [
    pd.read_csv(os.path.join(DIRTY_BASE_PATH, fname))
    for fname in sorted(os.listdir(DIRTY_BASE_PATH))
    if fname.endswith(".csv")
]

In [132]:
# Build one corrupted dataset per (base frame, anomaly group) pair.
# labels[n] holds the column list of the group used for dataset_<n>.csv.
dataset_count = 0
labels = []

os.makedirs(DIRTY_GROUP_PATH, exist_ok=True)

for base_frame in dirty_dataframes:
    for key, anomalies in ANOMALY_TRANSFORMS.items():
        # Start every group from the untouched base frame so corruptions from
        # one group do not leak into the next.
        group_frame = base_frame.copy()
        for anomaly in anomalies:
            group_frame = anomaly(group_frame)
        labels.append(ANOMALY_COLUMNS[key])

        group_frame.to_csv(f'{DIRTY_GROUP_PATH}/dataset_{dataset_count}.csv', index=False)
        dataset_count += 1
        
    

# GENERATE ANOMALY-LEVEL DATASET

In [134]:
# (name, applicable column type, function) triples. The type tag is 'any',
# 'numeric' or 'categorical'. The numeric entries are generated per coefficient,
# variance change first and mean change second, with the coefficient appended
# to the name.
ANOMALY_WITH_TYPES = [
    *(
        (influx_name, 'any', getattr(AnomalyFunctions, influx_name))
        for influx_name in ('duplicates_influx', 'nan_influx', 'random_influx')
    ),
    *(
        (f'{numeric_name}_{coef}', 'numeric', partial(getattr(AnomalyFunctions, numeric_name), coef=coef))
        for coef in (0.7, 1.3, 0.9)
        for numeric_name in ('numeric_variance_change', 'numeric_mean_change')
    ),
    *(
        (categorical_name, 'categorical', getattr(AnomalyFunctions, categorical_name))
        for categorical_name in (
            'categorical_new_category_influx',
            'categorical_category_miss',
            'categorical_distribution_changed',
        )
    ),
]

# GENERATE COLUMN-LEVEL DATASET