In [1]:
from scipy import stats
from scipy.stats import entropy
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
# Print floating-point numbers in fixed-point notation instead of scientific notation.
np.set_printoptions(suppress=True)

In [5]:
def is_number_repl_isdigit(s):
    """Return True if ``s`` can be treated as a finite number.

    Non-string inputs return True unchanged, so values that are already
    numeric (or missing) are left for the caller to cast and inspect.
    Strings are accepted when ``float()`` parses them to a finite value,
    which covers signed values ("-1.5"), exponents ("1e5") and surrounding
    whitespace.

    The earlier ``replace('.', '', 1).isdigit()`` test rejected every signed
    number and accepted characters such as "²" that ``str.isdigit()``
    recognises but ``float()`` cannot convert.
    """
    if not isinstance(s, str):
        return True
    try:
        parsed = float(s)
    except ValueError:
        return False
    # "nan" and "inf" parse successfully but are not usable numeric values.
    return parsed == parsed and abs(parsed) != float("inf")

    
def get_num_feature_stats(vals: pd.Series):
    """Summarise the numeric content of a column as a fixed-length feature list.

    Values are kept when ``is_number_repl_isdigit`` accepts them, cast to
    float, and NaNs are dropped before the statistics are computed.

    Args:
        vals: column values to featurize.

    Returns:
        A list of 14 entries, always in this order:
            0-3    quantiles 0.25, 0.5, 0.75, 0.95
            4      number of duplicated values
            5      number of observations
            6-7    minimum, maximum
            8      mean
            9      standard deviation (square root of ``stats.describe`` variance)
            10-11  skewness, kurtosis
            12     share of values accepted as numeric
            13     share of values that are NaN after the float cast
        When there is no usable numeric value, entries 0-11 are -1.
    """
    n_stat_slots = 12  # 4 quantiles + 8 descriptive statistics
    total = vals.shape[0]
    if total == 0:
        # Nothing to measure; returning early avoids dividing by zero below.
        return [-1] * n_stat_slots + [0.0, 0.0]

    is_digit_values = vals.apply(is_number_repl_isdigit)
    vals_digit = vals[is_digit_values].astype('float')

    is_nan = vals_digit.isna()
    vals_digit_not_nan = vals_digit[~is_nan]

    digit_ratio = is_digit_values.sum() / total
    nan_ratio = is_nan.sum() / total

    if vals_digit_not_nan.empty:
        # The fallback previously held only 11 placeholders (13 entries in
        # total) and stored the non-numeric share in slot 12; it now has the
        # same length and the same slot meaning as the regular result.
        return [-1] * n_stat_slots + [digit_ratio, nan_ratio]

    quantiles = np.quantile(vals_digit_not_nan, [0.25, 0.5, 0.75, 0.95])
    descr = stats.describe(vals_digit_not_nan)

    return quantiles.tolist() + [
        len(vals_digit_not_nan) - len(vals_digit_not_nan.drop_duplicates()),
        descr.nobs,
        descr.minmax[0],
        descr.minmax[1],
        descr.mean,
        np.sqrt(descr.variance),
        descr.skewness,
        descr.kurtosis,
        digit_ratio,
        nan_ratio,
    ]

In [6]:
def get_cat_feature_stats(vals: pd.Series):
    """Summarise the value-frequency distribution of a column.

    Frequencies come from ``vals.value_counts()`` and are normalised to
    probabilities; most statistics are taken over their natural logarithms.

    Args:
        vals: column values to featurize.

    Returns:
        A list of 8 entries, always in this order:
            0-3  quantiles 0.25, 0.5, 0.75, 0.95 of the log-probabilities
            4    entropy of the probabilities (``scipy.stats.entropy``)
            5    number of entries returned by ``value_counts()``
            6-7  minimum and maximum log-probability
        When ``value_counts()`` is empty (an empty or all-missing column) the
        result is eight -1 placeholders, the same sentinel value
        ``get_num_feature_stats`` uses, instead of an exception.
    """
    vals_counts = vals.value_counts()
    if vals_counts.empty:
        # The statistics below are undefined (and used to raise) without data.
        return [-1] * 8

    vals_probas = vals_counts / vals_counts.sum()
    entr = entropy(vals_probas.values)
    vals_probas_log = np.log(vals_probas)

    # TODO: add log odds
    # TODO: add count - the number of unique attributes

    quantiles = np.quantile(vals_probas_log, [0.25, 0.5, 0.75, 0.95])

    # Only the size and the extremes are needed, so they are read directly
    # rather than through stats.describe, which would also compute variance,
    # skewness and kurtosis (and warn on single-category columns).
    return quantiles.tolist() + [
        entr,
        len(vals_probas_log),
        vals_probas_log.min(),
        vals_probas_log.max(),
    ]

# MAKE GROUP-LEVEL FEATURIZED DATASET

# MAKE ANOMALY-LEVEL FEATURIZED DATASET

In [11]:
# Directory paths for the BankChurners "DirtySingleAnomaly" data, relative to the working directory.
# NOTE(review): presumably the first is the source data and the second the featurized output —
# confirm against the cells that read/write them.
DIRTY_SINGLE_ANOMALY_PATH = "./Datasets/Real/BankChurners/DirtySingleAnomaly"
DIRTY_SINGLE_ANOMALY_FEATURIZED_PATH = "./Datasets/Featurized/BankChurners/DirtySingleAnomaly"

# MAKE COLUMN-LEVEL FEATURIZED DATASET