**категоризация и бинаризация признаков**

Евгений Борисов esborisov@sevsu.ru

In [1]:
import numpy as np
from numpy import random as rng
from matplotlib import pyplot as plt
import pandas as pd

In [2]:
# Render floats with three decimal places in both NumPy and pandas output.
np.set_printoptions(precision=3)
pd.set_option('display.float_format', '{:,.3f}'.format)

In [3]:
class Categorizer:
    """Map a numeric column to ordinal categories using threshold values.

    The thresholds (the "scale") are either supplied ready-made or estimated
    by :meth:`fit` as the requested percentiles of the data.  A value's
    category is ``k + 1`` where ``k`` is the highest threshold index that the
    value strictly exceeds, or ``0`` when it exceeds none.  With thresholds
    in ascending order this equals the number of thresholds below the value.
    """

    _DEFAULT_PERCENTILES = (10, 25, 50, 75, 95)

    def __init__(self, percentiles=None, scale=None):
        """Create a categorizer.

        Args:
            percentiles: Percentiles used by :meth:`fit` to derive the scale.
                ``None`` or an empty sequence selects 10, 25, 50, 75, 95.
            scale: Optional 1-D numpy array of precomputed thresholds.

        Raises:
            AssertionError: If ``scale`` is given but is not a 1-D numpy array.
        """
        if percentiles is None or len(percentiles) == 0:
            # Fresh list per instance so the defaults are never shared.
            percentiles = list(self._DEFAULT_PERCENTILES)
        self._percentiles = percentiles

        if scale is None:
            self._scale = np.array([])
        else:
            assert isinstance(scale, np.ndarray), 'scale is not numpy array'
            assert scale.ndim == 1, 'scale is not 1d numpy array'
            self._scale = scale
        self._pos = self._positions(len(self._scale))

    @staticmethod
    def _positions(count):
        """Return the row vector ``[[1, 2, ..., count]]`` of category labels."""
        return np.arange(1, count + 1, dtype=np.intp)[np.newaxis, :]

    @property
    def scale(self):
        """Threshold values currently used by :meth:`transform`."""
        return self._scale

    @property
    def percentiles(self):
        """Percentiles from which :meth:`fit` computes the scale."""
        return self._percentiles

    def fit(self, x):
        """Estimate the thresholds as percentiles of ``x``.

        Args:
            x: Numpy array of shape ``(n, 1)``.

        Returns:
            This instance, so calls can be chained.

        Raises:
            AssertionError: If ``x`` is not a numpy column vector.
        """
        assert isinstance(x, np.ndarray), 'x is not numpy array'
        assert x.ndim == 2, 'column vector needed'
        assert x.shape[1] == 1, 'column vector needed'
        self._scale = np.percentile(x, self._percentiles, axis=0).flatten()
        self._pos = self._positions(len(self._scale))
        return self

    def transform(self, x):
        """Assign a category to every row of ``x``.

        Args:
            x: Array of shape ``(n, 1)``.

        Returns:
            Integer array of shape ``(n, 1)`` with values in
            ``0 .. len(scale)``.

        Raises:
            AssertionError: If ``x`` is not a column vector.
            ValueError: If no thresholds are available yet.
        """
        assert len(x.shape) == 2, 'column vector needed'
        assert x.shape[1] == 1, 'column vector needed'
        if len(self._scale) == 0:
            raise ValueError('scale is empty: call fit() or pass scale first')
        # Broadcasting (n, 1) against (m,) gives an (n, m) mask; weighting it
        # by the labels 1..m and taking the row maximum yields the label of
        # the last threshold exceeded, or 0 when none is.
        exceeded = x > self._scale
        return (exceeded * self._pos).max(axis=1)[:, np.newaxis]

In [4]:
class Binarizer:
    """One-hot encode integer category labels."""

    @staticmethod
    def transform(cat):
        """Return the one-hot encoding of ``cat``.

        Args:
            cat: Numpy array of non-negative integer labels.

        Returns:
            ``int8`` array of shape ``cat.shape + (cat.max() + 1,)`` in which
            position ``cat[i]`` of row ``i`` is 1 and the rest are 0.  An
            empty ``cat`` gives an encoding of width 0.

        Raises:
            AssertionError: If ``cat`` is not a numpy array of integer dtype.
            ValueError: If ``cat`` contains a negative label.
        """
        assert isinstance(cat, np.ndarray), 'cat is not numpy array'
        # The np.int alias was removed in NumPy 1.24; accept any integer dtype.
        assert np.issubdtype(cat.dtype, np.integer), 'values is not int type'
        if cat.size and cat.min() < 0:
            # Negative indices would silently wrap around to other categories.
            raise ValueError('negative category label')
        ncat = int(cat.max()) + 1 if cat.size else 0
        # Row k of the identity matrix is the one-hot code of label k.
        return np.eye(ncat, dtype=np.int8)[cat]

In [5]:
# Synthetic feature: 100 values drawn uniformly from [0, 1) as a column vector.
x = rng.uniform(low=0.0, high=1.0, size=(100, 1))

In [6]:
# Fit the categorizer on the sample, then tabulate each value next to its
# category and the one-hot code of that category.
cat = Categorizer().fit(x)
df = pd.DataFrame(np.hstack([x, cat.transform(x)]), columns=['x', 'cat']).convert_dtypes()
# Built-in int replaces the np.int alias, which was removed in NumPy 1.24.
df['bin'] = Binarizer.transform(df['cat'].to_numpy(dtype=int)).tolist()

In [7]:
# Thresholds of the fitted categorizer, shown as a single row keyed by percentile.
pd.DataFrame(
    {'percentiles': cat.percentiles, 'scale': cat.scale}
).set_index('percentiles').transpose()


percentiles,10,25,50,75,95
scale,0.104,0.352,0.501,0.732,0.976


In [8]:
# Show a random selection of 12 rows of the result table.
df.sample(n=12)

Unnamed: 0,x,cat,bin
32,0.533,3,"[0, 0, 0, 1, 0, 0]"
78,0.011,0,"[1, 0, 0, 0, 0, 0]"
79,0.529,3,"[0, 0, 0, 1, 0, 0]"
89,0.022,0,"[1, 0, 0, 0, 0, 0]"
96,0.985,5,"[0, 0, 0, 0, 0, 1]"
55,0.088,0,"[1, 0, 0, 0, 0, 0]"
46,0.149,1,"[0, 1, 0, 0, 0, 0]"
57,0.462,2,"[0, 0, 1, 0, 0, 0]"
72,0.427,2,"[0, 0, 1, 0, 0, 0]"
51,0.117,1,"[0, 1, 0, 0, 0, 0]"
