In [1]:
from scipy import stats
from scipy.stats import entropy
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import json
import os
import pickle

In [2]:
# Turn off Jedi for the IPython tab completer.
%config Completer.use_jedi = False

In [3]:
# Print small NumPy floats in fixed-point form rather than scientific notation.
np.set_printoptions(suppress=True)

In [15]:
def is_number_repl_isdigit(s):
    """Return True if ``s`` can be treated as a number.

    Non-string inputs (ints, floats, NaN, None, ...) are always reported as
    numbers, so only string cells are actually inspected.

    A string counts as a number when it consists of an optional leading sign,
    ASCII digits, and at most one decimal point, e.g. ``"12"``, ``"-3.5"``,
    ``".5"`` or ``"5."``.

    Parameters
    ----------
    s : object
        A single cell value.

    Returns
    -------
    bool
        True when the value is a non-string or a numeric-looking string.
    """
    if not isinstance(s, str):
        return True

    # Accept one leading sign so that negative values such as "-5" are
    # recognised as numbers.
    unsigned = s[1:] if s[:1] in ('+', '-') else s

    # Drop a single decimal point; anything left over must be plain digits.
    digits = unsigned.replace('.', '', 1)

    # str.isdigit() also accepts characters such as '²' that float() rejects,
    # so restrict the check to ASCII digits explicitly.
    return bool(digits) and all('0' <= ch <= '9' for ch in digits)

    
def get_num_feature_stats(vals: pd.Series):
    """Summarise a numeric column as a fixed-length list of 14 features.

    Each value is screened with ``is_number_repl_isdigit``; the values that
    pass are cast to float and NaNs are dropped before statistics are taken.

    Parameters
    ----------
    vals : pd.Series
        Raw column values (may mix numbers, strings and missing values).

    Returns
    -------
    list
        ``[q25, q50, q75, q95, n_duplicates, nobs, min, max, mean, std,
        skewness, kurtosis, share_numeric, share_nan]``.

        ``share_numeric`` is the fraction of all entries accepted by
        ``is_number_repl_isdigit``; ``share_nan`` is the fraction of all
        entries that are NaN after the float cast. When no usable numeric
        value remains, the first twelve entries are ``-1`` and the two shares
        are still reported.
    """
    n_total = vals.shape[0]
    is_digit_values = vals.apply(is_number_repl_isdigit)

    vals_digit = vals[is_digit_values].astype('float')
    is_nan = vals_digit.isna()
    vals_digit_not_nan = vals_digit[~is_nan]

    # Both branches must report the same two trailing features; previously the
    # fallback branch returned the share of NON-numeric values in the slot
    # where the main branch returns the share of numeric values.
    share_numeric = is_digit_values.sum() / n_total
    share_nan = is_nan.sum() / n_total

    if vals_digit_not_nan.empty:
        return [-1] * 12 + [share_numeric, share_nan]

    quantiles = np.quantile(vals_digit_not_nan, [0.25, 0.5, 0.75, 0.95])
    descr = stats.describe(vals_digit_not_nan)
    # Number of values that repeat an earlier value.
    n_duplicates = len(vals_digit_not_nan) - len(vals_digit_not_nan.drop_duplicates())

    return quantiles.tolist() + [
        n_duplicates,
        descr.nobs,
        descr.minmax[0],
        descr.minmax[1],
        descr.mean,
        np.sqrt(descr.variance),  # sample standard deviation
        descr.skewness,
        descr.kurtosis,
        share_numeric,
        share_nan,
    ]

In [16]:
def get_cat_feature_stats(vals: pd.Series):
    """Summarise a categorical column as a fixed-length list of 9 features.

    The statistics are computed on the log of the relative frequency of each
    distinct non-missing value.

    Parameters
    ----------
    vals : pd.Series
        Raw column values; missing values are detected with ``isna``.

    Returns
    -------
    list
        ``[q25, q50, q75, q95, entropy, n_categories, min_log_p, max_log_p,
        share_nan]`` where the quantiles, min and max are taken over the
        log-probabilities of the distinct values. When the column has no
        non-missing value, ``[-1, -1, -1, -1, -1, -1, -1, -1, 1]`` is returned.
    """
    is_nan = vals.isna()
    vals_not_nan = vals[~is_nan]

    if vals_not_nan.empty:
        # The original code built this list but never returned it, so callers
        # received None for all-missing columns.
        return [-1, -1, -1, -1, -1, -1, -1, -1, 1]

    vals_counts = vals_not_nan.value_counts()
    vals_probas = vals_counts / vals_counts.sum()
    entr = entropy(vals_probas.values)
    vals_probas_log = np.log(vals_probas)

    # TODO: add log odds
    # TODO: add count - the number of unique attributes

    quantiles = np.quantile(vals_probas_log, [0.25, 0.5, 0.75, 0.95])

    # Only the observation count and the extremes are needed, so they are
    # computed directly; scipy's describe() would also compute variance,
    # skewness and kurtosis and emit RuntimeWarnings for a single category.
    return quantiles.tolist() + [
        entr,
        len(vals_probas_log),
        vals_probas_log.min(),
        vals_probas_log.max(),
        is_nan.sum() / vals.shape[0],
    ]

# LOAD DIRTY AND CLEAN DATA

In [6]:
# Directories whose *.csv files are read into dirty_dataframes / clear_dataframes.
DIRTY_BASE_PATH = "./Datasets/Real/BankChurners/DirtyBase"
CLEAR_BASE_PATH = "./Datasets/Real/BankChurners/Clear"

In [7]:
# Mapping of column name -> feature type ('categorical' or 'numeric'); it
# decides which featurizer is applied to each column.
# The context manager closes the file instead of leaking the handle.
with open('./Datasets/Real/BankChurners/metadata.json', 'r') as metadata_file:
    METADATA = json.load(metadata_file)

In [8]:
# Read every CSV of the dirty base directory, in sorted file-name order.
dirty_dataframes = []
for fname in sorted(os.listdir(DIRTY_BASE_PATH)):
    if not fname.endswith(".csv"):
        continue
    dirty_dataframes.append(pd.read_csv(f'{DIRTY_BASE_PATH}/{fname}'))

In [9]:
# Read every CSV of the clean directory, in sorted file-name order.
clear_dataframes = []
for fname in sorted(os.listdir(CLEAR_BASE_PATH)):
    if not fname.endswith(".csv"):
        continue
    clear_dataframes.append(pd.read_csv(f'{CLEAR_BASE_PATH}/{fname}'))

# MAKE GROUP-LEVEL FEATURIZED DATASET

In [17]:
# Input directory: one sub-directory per anomaly, each with a labels.json and CSV files.
DIRTY_GROUP_PATH = "./Datasets/Real/BankChurners/DirtyGroup"
# Output directory for the pickled group-level features (DirtyGroup.pickle).
DIRTY_GROUP_FEATURIZED_PATH = "./Datasets/Featurized/BankChurners/DirtyGroup"

In [18]:
# Both dicts map column name -> list of (feature_vector, label) tuples.
# dataset_train is filled from the clean dataframes (label is always False);
# dataset_control is filled from the DirtyGroup CSVs (label is `column in labels`).
dataset_train = {}
dataset_control = {}

In [19]:
# Featurize every column of every CSV found under each anomaly directory.
# Each entry appended to dataset_control[column] is
# (feature_vector, column in labels).
# NOTE: entries are appended to the existing dataset_control, so re-running
# this cell without re-creating the dict first duplicates them.
# Directory listings are sorted (like the other directory scans in this
# notebook) so the order of the collected entries is reproducible.
for anomaly_name in sorted(os.listdir(DIRTY_GROUP_PATH)):
    # Skip hidden entries (names starting with '.').
    if anomaly_name.startswith('.'):
        continue

    anomaly_path = DIRTY_GROUP_PATH + f"/{anomaly_name}"
    # The context manager closes the labels file instead of leaking the handle.
    with open(f"{anomaly_path}/labels.json", "r") as labels_file:
        labels = json.load(labels_file)

    for dataset_name in sorted(os.listdir(anomaly_path)):
        if not dataset_name.endswith(".csv"):
            continue
        df = pd.read_csv(f"{anomaly_path}/{dataset_name}")

        for column, metadata in METADATA.items():
            if metadata == 'categorical':
                column_feature = get_cat_feature_stats(df[column])
            elif metadata == 'numeric':
                column_feature = get_num_feature_stats(df[column])
            else:
                raise Exception(f"Unsupported type of columns: {metadata}")

            dataset_control.setdefault(column, list()).append(
                (column_feature, column in labels)
            )

In [20]:
# Featurize every METADATA column of every clean dataframe.
# Clean data is always stored with the label False.
for df in clear_dataframes:
    for column, metadata in METADATA.items():
        column_raw = df[column]

        if metadata == 'numeric':
            column_feature = get_num_feature_stats(pd.Series(column_raw))
        elif metadata == 'categorical':
            column_feature = get_cat_feature_stats(pd.Series(column_raw))
        else:
            raise Exception("unknown column", column)

        dataset_train.setdefault(column, list()).append((column_feature, False))

In [21]:
os.makedirs(DIRTY_GROUP_FEATURIZED_PATH, exist_ok=True)

# Store both splits in a single pickle. The context manager guarantees the
# file is flushed and closed once the dump finishes.
with open(f"{DIRTY_GROUP_FEATURIZED_PATH}/DirtyGroup.pickle", "wb") as pickle_file:
    pickle.dump(
        {'train': dataset_train, 'control': dataset_control},
        pickle_file,
        protocol=pickle.HIGHEST_PROTOCOL
    )

# MAKE ANOMALY-LEVEL FEATURIZED DATASET

In [151]:
# Input/output locations for the anomaly-level dataset.
# NOTE(review): neither constant is used in the visible part of this notebook.
DIRTY_SINGLE_ANOMALY_PATH = "./Datasets/Real/BankChurners/DirtySingleAnomaly"
DIRTY_SINGLE_ANOMALY_FEATURIZED_PATH = "./Datasets/Featurized/BankChurners/DirtySingleAnomaly"

# MAKE COLUMN-LEVEL FEATURIZED DATASET

In [152]:
# Input directory: one sub-directory per column, each holding arrays read with np.load.
DIRTY_SINGLE_COLUMN_PATH = "./Datasets/Real/BankChurners/DirtySingleColumn"
# Output directory for DirtySingleColumn.pickle.
# NOTE(review): unlike the DirtyGroup output path, this one has no
# dataset-specific sub-directory - confirm this is intended.
DIRTY_SINGLE_COLUMN_FEATURIZED_PATH = "./Datasets/Featurized/BankChurners"

In [153]:
# Featurize every stored array of every column directory.
# data_column[<column>] holds three lists; only 'anomaly' is filled here.
data_column = {}

for column_dir in os.listdir(DIRTY_SINGLE_COLUMN_PATH):
    # Hidden entries (names starting with '.') are skipped.
    if column_dir.startswith('.'):
        continue

    column_path = f"{DIRTY_SINGLE_COLUMN_PATH}/{column_dir}"
    data_column[column_dir] = {'train': [], 'anomaly': [], 'control': []}

    for defaced_column in sorted(os.listdir(column_path)):
        # NOTE(review): allow_pickle=True lets np.load unpickle arbitrary
        # objects, so this must only be pointed at trusted files.
        column_raw = np.load(f"{column_path}/{defaced_column}", allow_pickle=True)

        column_type = METADATA[column_dir]
        if column_type == 'numeric':
            column_feature = get_num_feature_stats(pd.Series(column_raw))
        elif column_type == 'categorical':
            column_feature = get_cat_feature_stats(pd.Series(column_raw))
        else:
            raise Exception("unknown column", column_dir)

        data_column[column_dir]['anomaly'].append(column_feature)

  **kwargs)
  ret = ret.dtype.type(ret / rcount)


In [154]:
# Featurize the tracked columns of each dirty base dataframe; the results
# are collected in the 'control' list of every column.
for df in dirty_dataframes:
    for column in data_column:
        column_raw = df[column]

        column_type = METADATA[column]
        if column_type not in ('categorical', 'numeric'):
            raise Exception("unknown column", column)

        if column_type == 'categorical':
            featurize = get_cat_feature_stats
        else:
            featurize = get_num_feature_stats
        column_feature = featurize(pd.Series(column_raw))

        data_column[column]['control'].append(column_feature)

In [155]:
# Featurize the tracked columns of each clean dataframe; the results are
# collected in the 'train' list of every column.
for df in clear_dataframes:
    for column in data_column:
        column_raw = df[column]

        column_type = METADATA[column]
        if column_type not in ('categorical', 'numeric'):
            raise Exception("unknown column", column)

        if column_type == 'categorical':
            featurize = get_cat_feature_stats
        else:
            featurize = get_num_feature_stats
        column_feature = featurize(pd.Series(column_raw))

        data_column[column]['train'].append(column_feature)

In [134]:
os.makedirs(DIRTY_SINGLE_COLUMN_FEATURIZED_PATH, exist_ok=True)

# Store the per-column feature lists. The context manager guarantees the
# file is flushed and closed once the dump finishes.
with open(f"{DIRTY_SINGLE_COLUMN_FEATURIZED_PATH}/DirtySingleColumn.pickle", "wb") as pickle_file:
    pickle.dump(data_column, pickle_file, protocol=pickle.HIGHEST_PROTOCOL)