**категоризация и бинаризация признаков**

Евгений Борисов esborisov@sevsu.ru

In [1]:
import numpy as np
from numpy import random as rng
from matplotlib import pyplot as plt
import pandas as pd

In [2]:
# Show three decimal places in both pandas and NumPy output.
pd.set_option('display.float_format', '{:,.3f}'.format)
np.set_printoptions(precision=3)

In [3]:
class Categorizer:
    """Assign ordinal category codes to a numeric column using thresholds.

    The thresholds (``scale``) are either supplied to the constructor or
    computed by :meth:`fit` as percentiles of a training column.  The code of
    a value is the 1-based position of the last threshold it strictly
    exceeds, or 0 when it exceeds none.  For an ascending scale this is simply
    the number of thresholds lying below the value.
    """

    # Percentiles used when the caller does not provide any.
    _DEFAULT_PERCENTILES = (10, 25, 50, 75, 95)

    def __init__(self, percentiles=None, scale=None):
        """Create a categorizer.

        Args:
            percentiles: sequence of percentiles (0..100) used by ``fit``.
                ``None`` or an empty sequence selects 10, 25, 50, 75, 95.
            scale: optional ready-made 1-D numpy array of thresholds; when
                given, ``transform`` can be used without calling ``fit``.

        Raises:
            TypeError: if ``scale`` is given but is not a numpy array.
            ValueError: if ``scale`` is not one-dimensional.
        """
        if percentiles is None or len(percentiles) == 0:
            # A fresh list per instance, so instances never share state.
            percentiles = list(self._DEFAULT_PERCENTILES)
        self._percentiles = percentiles

        if scale is None:
            scale = np.array([])
        else:
            if not isinstance(scale, np.ndarray):
                raise TypeError('scale is not numpy array')
            if scale.ndim != 1:
                raise ValueError('scale is not 1d numpy array')
        self._set_scale(scale)

    @property
    def scale(self):
        """1-D array of thresholds (empty until fitted or supplied)."""
        return self._scale

    @property
    def percentiles(self):
        """Percentiles that ``fit`` uses to compute the thresholds."""
        return self._percentiles

    def _set_scale(self, scale):
        """Store the thresholds together with their 1-based positions."""
        self._scale = scale
        self._pos = np.arange(1, len(scale) + 1)

    @staticmethod
    def _check_column(x):
        """Raise unless ``x`` is a numpy array of shape (n, 1)."""
        # Explicit raises instead of ``assert``: asserts vanish under -O.
        if not isinstance(x, np.ndarray):
            raise TypeError('x is not numpy array')
        if x.ndim != 2 or x.shape[1] != 1:
            raise ValueError('column vector needed')

    def fit(self, x):
        """Compute the thresholds as percentiles of the column ``x``.

        Args:
            x: numpy array of shape (n, 1).

        Returns:
            The categorizer itself, so calls can be chained.

        Raises:
            TypeError: if ``x`` is not a numpy array.
            ValueError: if ``x`` is not a column vector.
        """
        self._check_column(x)
        self._set_scale(np.percentile(x, self._percentiles, axis=0).flatten())
        return self

    def transform(self, x):
        """Return the category code of every value in the column ``x``.

        Args:
            x: numpy array of shape (n, 1).

        Returns:
            Integer numpy array of shape (n, 1) with codes in
            ``0..len(scale)``.

        Raises:
            TypeError: if ``x`` is not a numpy array.
            ValueError: if ``x`` is not a column vector, or if no thresholds
                are available yet.
        """
        self._check_column(x)
        if self._scale.size == 0:
            raise ValueError(
                'scale is empty: call fit() or pass a scale to the constructor'
            )
        # (n, 1) > (k,) broadcasts to an (n, k) mask of exceeded thresholds.
        exceeded = x > self._scale
        # Exceeded thresholds keep their position, the others become 0, so
        # the row maximum is the code (0 when nothing is exceeded).
        return (exceeded * self._pos).max(axis=1)[:, np.newaxis]

In [4]:
class Binarizer:
    """One-hot encoder for non-negative integer category codes."""

    @staticmethod
    def transform(cat):
        """One-hot encode an array of category codes.

        Args:
            cat: numpy array of any integer dtype holding codes ``>= 0``.

        Returns:
            ``np.int8`` array of shape ``cat.shape + (cat.max() + 1,)`` in
            which, for every code, the entry at that index is 1 and the rest
            are 0.  An empty input yields an empty result with a zero-length
            last axis.

        Raises:
            TypeError: if ``cat`` is not a numpy array of integer dtype.
            ValueError: if ``cat`` contains negative codes.
        """
        # Explicit raises instead of ``assert``: asserts vanish under -O.
        if not isinstance(cat, np.ndarray):
            raise TypeError('cat is not numpy array')
        # Accept every integer dtype, not only the platform default int.
        if not np.issubdtype(cat.dtype, np.integer):
            raise TypeError('values is not int type')
        if cat.size == 0:
            # max() of an empty array is undefined; there are no classes.
            return np.zeros(cat.shape + (0,), dtype=np.int8)
        if cat.min() < 0:
            # Negative indices would silently wrap around to the last rows.
            raise ValueError('negative category codes are not allowed')
        # Convert to a Python int first so "+ 1" cannot overflow a narrow
        # dtype such as int8.
        n_classes = int(cat.max()) + 1
        return np.eye(n_classes, dtype=np.int8)[cat]

---

In [5]:
# Draw 100 random values from [0, 1) as a single-column array.
x = rng.uniform(low=0.0, high=1.0, size=(100, 1))

In [6]:
# Fit the categorizer: the thresholds are percentiles of x.
cat = Categorizer()
cat.fit(x)

# Put the raw values and their category codes side by side in one table.
df = pd.DataFrame(
    np.concatenate((x, cat.transform(x)), axis=1),
    columns=['x', 'cat'],
).convert_dtypes()

# One-hot encode the category codes; each row stores its vector as a list.
df['bin'] = Binarizer.transform(df['cat'].to_numpy(dtype=int)).tolist()

In [7]:
# Categorization scale: one row of thresholds, one column per percentile.
pd.DataFrame(
    [cat.scale],
    index=['scale'],
    columns=pd.Index(cat.percentiles, name='percentiles'),
)


percentiles,10,25,50,75,95
scale,0.075,0.288,0.55,0.813,0.966


In [8]:
# Show a dozen randomly chosen rows of the table.
df.sample(n=12)

Unnamed: 0,x,cat,bin
75,0.534,2,"[0, 0, 1, 0, 0, 0]"
60,0.004,0,"[1, 0, 0, 0, 0, 0]"
55,0.736,3,"[0, 0, 0, 1, 0, 0]"
81,0.291,2,"[0, 0, 1, 0, 0, 0]"
8,0.649,3,"[0, 0, 0, 1, 0, 0]"
58,0.848,4,"[0, 0, 0, 0, 1, 0]"
90,0.01,0,"[1, 0, 0, 0, 0, 0]"
36,0.613,3,"[0, 0, 0, 1, 0, 0]"
77,0.452,2,"[0, 0, 1, 0, 0, 0]"
76,0.239,1,"[0, 1, 0, 0, 0, 0]"
