In [1]:
import pandas as pd
import numpy as np

from tubesml.base import BaseTransformer, self_columns, reset_columns
import tubesml as tml

from sklearn.pipeline import Pipeline

from source import explore as ex
from source import utility as ut

# Show up to 100 columns when a DataFrame is displayed.
# The fully qualified key is required: the short form 'max_columns' is resolved
# by pattern matching and is ambiguous on pandas versions that also define
# 'styler.render.max_columns', where it raises OptionError.
pd.set_option('display.max_columns', 100)

In [2]:
# Read the three input files (training data, test data and the sample
# submission) in one pass; the paths are relative to the notebook's
# working directory.
df_train, df_test, subs = (
    pd.read_csv(csv_path)
    for csv_path in ('data/train.csv', 'data/test.csv', 'data/sample_submission.csv')
)

In [3]:
# Split the training frame with the project helper `ut.make_test`.
# NOTE(review): presumably 0.25 is the hold-out fraction and `strat_feat`
# the column used for stratification -- confirm against source/utility.py.
train_set, test_set = ut.make_test(
    df_train,
    0.25,
    random_state=516,
    strat_feat='cat9',
)

In [39]:
class CatSimp(BaseTransformer):
    """Merge selected levels of the columns cat4, cat6, cat7, cat8 and cat9.

    Every constructor flag enables (truthy) or disables (falsy) the recoding
    of the column that carries the same name:

    * cat7: C -> E, A -> B, F -> G, I -> G
    * cat6: every value different from 'A' becomes 'B'
    * cat8: B -> E, F -> E
    * cat4: D -> A
    * cat9: E -> L, D -> J, C -> L

    For the mapping-based recodings, values that are not listed keep their
    original value. The input frame is never modified; each step works on
    a copy.
    """

    def __init__(self, cat7=True, cat6=True, cat8=True, cat4=True, cat9=True):
        super().__init__()
        self.cat7 = cat7
        self.cat6 = cat6
        self.cat8 = cat8
        self.cat4 = cat4
        self.cat9 = cat9

    @staticmethod
    def _recode(X, column, mapping, enabled):
        """Return a copy of X with `column` recoded through `mapping` when enabled."""
        recoded = X.copy()

        if enabled:
            original = recoded[column]
            # Unmapped values come back as NaN from `map`; restore them
            # from the untouched column.
            recoded[column] = original.map(mapping).fillna(original)

        return recoded

    def cat7_tr(self, X):
        """Return a copy of X with cat7 levels C, A, F, I merged into E, B, G, G."""
        merged_levels = {'C': 'E',
                         'A': 'B',
                         'F': 'G',
                         'I': 'G'}
        return self._recode(X, 'cat7', merged_levels, self.cat7)

    def cat6_tr(self, X):
        """Return a copy of X where every cat6 value other than 'A' is 'B'."""
        recoded = X.copy()

        if self.cat6:
            # Missing values also compare unequal to 'A', so they become 'B' too.
            is_not_a = recoded['cat6'] != 'A'
            recoded.loc[is_not_a, 'cat6'] = 'B'

        return recoded

    def cat8_tr(self, X):
        """Return a copy of X with cat8 levels B and F merged into E."""
        return self._recode(X, 'cat8', {'B': 'E', 'F': 'E'}, self.cat8)

    def cat4_tr(self, X):
        """Return a copy of X with cat4 level D merged into A."""
        return self._recode(X, 'cat4', {'D': 'A'}, self.cat4)

    def cat9_tr(self, X):
        """Return a copy of X with cat9 levels E and C merged into L, and D into J."""
        return self._recode(X, 'cat9', {'E': 'L', 'D': 'J', 'C': 'L'}, self.cat9)

    @self_columns
    def transform(self, X, y=None):
        """Apply the five recodings in the order cat7, cat6, cat8, cat4, cat9.

        `y` is accepted but not used.
        """
        steps = (self.cat7_tr, self.cat6_tr, self.cat8_tr, self.cat4_tr, self.cat9_tr)

        Xtransf = X
        for step in steps:
            Xtransf = step(Xtransf)

        return Xtransf

In [40]:
# Numeric branch: only the dtype selection step.
numeric_pipe = Pipeline(steps=[
    ('fs', tml.DtypeSel('numeric')),
])

# Categorical branch: dtype selection followed by the level merging of CatSimp.
# The dummy-encoding step is currently switched off.
cat_pipe = Pipeline(steps=[
    ('fs', tml.DtypeSel('category')),
    ('simp', CatSimp()),
    # ('dummies', tml.Dummify(match_cols=True)),
])

# Put the two branches side by side, categorical columns first.
processing_pipe = tml.FeatureUnionDf(
    transformer_list=[
        ('cat_pipe', cat_pipe),
        ('num_pipe', numeric_pipe),
    ]
)

# The scaling step is currently switched off.
full_pipe = Pipeline(steps=[
    ('processing', processing_pipe),
    # ('scaler', tml.DfScaler()),
])

In [41]:
# Fit the pipeline on a copy so that train_set itself is left untouched.
tmp = full_pipe.fit_transform(train_set.copy())

In [42]:
# Check the level frequencies of cat9 after the recoding.
tmp['cat9'].value_counts()

F    80461
I    37548
L    31862
H    18569
K    15716
A    10056
G     7807
M     7378
J     5313
O     4630
N     3084
B     2576
Name: cat9, dtype: int64