In [1]:
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
import category_encoders as ce
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer


#Scaling data
from sklearn.preprocessing import RobustScaler, Normalizer

#Regressor
from xgboost import XGBRegressor

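**Note on custom transformers:** scikit-learn's `get_params()` inspects the arguments of `__init__` to determine what the estimator's parameters are, and assumes each one is stored on the instance under an attribute with exactly the same name. Every `__init__` argument of a custom transformer must therefore be assigned to `self.<argument name>`; otherwise `get_params()`, `set_params()` and `clone()` will not see it. Source: https://stackoverflow.com/questions/61394346/why-is-my-sklearn-custom-transformer-not-saving-an-attribute-when-used-in-a-colu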

In [2]:
class CatBoostCustom(BaseEstimator, TransformerMixin):
    '''
    Applies category_encoders' CatBoostEncoder to a given list of categorical
    columns so the encoding can be used as a pipeline step.

    Parameters
    ----------
    cat : list of str
        Names of the categorical columns to encode.

    Attributes
    ----------
    cbe : ce.CatBoostEncoder
        The wrapped encoder. It is rebuilt from ``cat`` on every call to ``fit``.
    '''

    def __init__(self, cat):
        self.cat = cat
        # Kept so that ``cbe`` exists right after construction, as before.
        # ``fit`` replaces it with an encoder built from the current ``cat``.
        self.cbe = ce.CatBoostEncoder(cols=cat, handle_unknown='value')

    def fit(self, X, y):
        '''
        Fits the encoder on the ``cat`` columns of X against target y.
        Returns self.
        '''
        # set_params(cat=...) only updates ``self.cat``; rebuilding here keeps the
        # encoder's column list in sync with the parameter actually in effect.
        self.cbe = ce.CatBoostEncoder(cols=self.cat, handle_unknown='value')
        self.cbe.fit(X[self.cat], y)
        return self

    def transform(self, X, y=None):
        '''
        Returns a copy of X whose ``cat`` columns are replaced by their encoded
        values. The DataFrame passed in is left untouched.
        '''
        encoded = X.copy()
        encoded[self.cat] = self.cbe.transform(X[self.cat])
        return encoded

In [4]:
class CustomImputer(BaseEstimator, TransformerMixin):
    '''
    SimpleImputer wrapper that returns a DataFrame with the original column
    names and row index, so missing values can be imputed before encoding and
    before creating bivariates.

    Parameters
    ----------
    strategy : str, default 'mean'
        Forwarded to SimpleImputer as ``strategy``.
    fill_value : default -1
        Forwarded to SimpleImputer as ``fill_value``.

    Attributes
    ----------
    imp : SimpleImputer
        The wrapped imputer. It is rebuilt from the current parameters on
        every call to ``fit``.
    '''

    def __init__(self, strategy='mean', fill_value=-1):
        self.strategy = strategy
        self.fill_value = fill_value
        # Kept so that ``imp`` exists right after construction, as before.
        # ``fit`` replaces it with an imputer built from the current parameters.
        self.imp = SimpleImputer(strategy=self.strategy, fill_value=self.fill_value)

    def fit(self, X, y=None):
        '''
        Fits the imputer on X. Returns self.
        '''
        # set_params() only updates ``strategy`` / ``fill_value``; rebuilding here
        # makes sure the imputer uses the values actually in effect.
        self.imp = SimpleImputer(strategy=self.strategy, fill_value=self.fill_value)
        self.imp.fit(X, y)
        return self

    def transform(self, X, y=None):
        '''
        Returns a new DataFrame with the imputed values of X, labelled with
        X's columns and X's index.
        '''
        imputed = self.imp.transform(X)
        # Keep the row index as well as the column names: a fresh RangeIndex
        # would no longer line up with the target or with other frames.
        return pd.DataFrame(imputed, columns=X.columns, index=X.index)

In [5]:
class CatBivariates(BaseEstimator, TransformerMixin):
    '''
    Given a dictionary of bivariate feature names as keys, and tuples of two (string) feature names as values,
    creates each bivariate feature by concatenating the two features (cast to str) with a '-' separator.
    By default it drops the original categorical features.

    Parameters
    ----------
    features : dict, optional
        Maps new feature name -> (first feature name, second feature name).
        Defaults to an empty dict.
    drop : bool, default True
        Whether the source features are removed from the output.
    dismissed : list, optional
        List that collects the names of the dropped source features.
        Defaults to an empty list.
    '''
    def __init__(self, features=None, drop=True, dismissed=None):
        # None stands in for "empty" so instances never share one default
        # dict/list object. A value passed by the caller is stored as-is.
        self.features = {} if features is None else features
        self.drop = drop
        self.dismissed = [] if dismissed is None else dismissed

    def fit(self, X, y=None):
        '''Nothing to learn; returns self.'''
        return self

    def transform(self, X, y=None):
        '''
        Returns a copy of X with the bivariate features added and, when
        ``drop`` is set, the source features removed. X itself is not modified.
        '''
        out = X.copy()
        # Ordered set of columns to remove once every feature has been built.
        # Dropping at the end lets several bivariates share a source column.
        pending = {}
        for name, fts in self.features.items():
            out[name] = out[fts[0]].astype(str) + '-' + out[fts[1]].astype(str)
            # A freshly (re)created column must survive unless it is used as a
            # source from this point on.
            pending.pop(name, None)
            pending[fts[0]] = None
            pending[fts[1]] = None
        if self.drop:
            out = out.drop(columns=list(pending))
            # Record each dropped column once, even across repeated calls.
            newly_dropped = [col for col in pending if col not in self.dismissed]
            self.dismissed.extend(newly_dropped)
        return out

    def getDismissed(self):
        '''Returns the list of source features dropped so far.'''
        return self.dismissed

    def getVars(self):
        '''Returns the features dictionary.'''
        return self.features

In [6]:
class NumBivariates(BaseEstimator, TransformerMixin):
    '''
    Given a dictionary of bivariate feature names as keys, and tuples of two numerical feature names as values,
    creates each bivariate feature as the product of both features.
    By default it does not drop the original numerical features.

    Parameters
    ----------
    features : dict, optional
        Maps new feature name -> (first feature name, second feature name).
        Defaults to an empty dict.
    drop : bool, default False
        Whether the source features are removed from the output.
    dismissed : list, optional
        List that collects the names of the dropped source features.
        Defaults to an empty list.
    '''
    def __init__(self, features=None, drop=False, dismissed=None):
        # None stands in for "empty" so instances never share one default
        # dict/list object. A value passed by the caller is stored as-is.
        self.features = {} if features is None else features
        self.drop = drop
        self.dismissed = [] if dismissed is None else dismissed

    def fit(self, X, y=None):
        '''Nothing to learn; returns self.'''
        return self

    def transform(self, X, y=None):
        '''
        Returns a copy of X with the product features added and, when ``drop``
        is set, the source features removed. X itself is not modified.
        '''
        out = X.copy()
        # Ordered set of columns to remove once every feature has been built.
        # Dropping at the end lets several bivariates share a source column.
        pending = {}
        for name, fts in self.features.items():
            out[name] = out[fts[0]] * out[fts[1]]
            # A freshly (re)created column must survive unless it is used as a
            # source from this point on.
            pending.pop(name, None)
            pending[fts[0]] = None
            pending[fts[1]] = None
        if self.drop:
            out = out.drop(columns=list(pending))
            # Record each dropped column once, even across repeated calls.
            newly_dropped = [col for col in pending if col not in self.dismissed]
            self.dismissed.extend(newly_dropped)
        return out

    def getDismissed(self):
        '''Returns the list of source features dropped so far.'''
        return self.dismissed

    def getVars(self):
        '''Returns the features dictionary.'''
        return self.features

In [7]:
class NumBivariatesBool(BaseEstimator, TransformerMixin):
    '''
    Given a dictionary of new feature names as keys, and tuples of one feature name and a lower limit as values,
    creates a boolean variable (cast as int) by comparing the feature values against the limit.
    Ex: ('Num', 15) yields 1 for every value in Num strictly higher than 15, 0 otherwise.
    By default it does not drop the original numerical features.

    Parameters
    ----------
    features : dict, optional
        Maps new feature name -> (source feature name, limit).
        Defaults to an empty dict.
    drop : bool, default False
        Whether the source features are removed from the output.
    dismissed : list, optional
        List that collects the names of the dropped source features.
        Defaults to an empty list.
    '''
    def __init__(self, features=None, drop=False, dismissed=None):
        # None stands in for "empty" so instances never share one default
        # dict/list object. A value passed by the caller is stored as-is.
        self.features = {} if features is None else features
        self.drop = drop
        self.dismissed = [] if dismissed is None else dismissed

    def fit(self, X, y=None):
        '''Nothing to learn; returns self.'''
        return self

    def transform(self, X, y=None):
        '''
        Returns a copy of X with the 0/1 indicator features added and, when
        ``drop`` is set, the source features removed. X itself is not modified.
        '''
        out = X.copy()
        # Ordered set of columns to remove once every feature has been built.
        # Dropping at the end lets several indicators share a source column.
        pending = {}
        for name, fts in self.features.items():
            out[name] = (out[fts[0]] > fts[1]).astype(int)
            # A freshly (re)created column must survive unless it is used as a
            # source from this point on.
            pending.pop(name, None)
            pending[fts[0]] = None
        if self.drop:
            out = out.drop(columns=list(pending))
            # Record each dropped column once, even across repeated calls.
            newly_dropped = [col for col in pending if col not in self.dismissed]
            self.dismissed.extend(newly_dropped)
        return out

    def getDismissed(self):
        '''Returns the list of source features dropped so far.'''
        return self.dismissed

    def getVars(self):
        '''Returns the features dictionary.'''
        return self.features

In [8]:
class LogRtTransformer(BaseEstimator, TransformerMixin):
    '''
    Applies log transform to features in log list,
    cube root transform to features in cbrt list
    and square root transform to features in sqrt list.
    Values are passed to numpy as they are; no validation is performed.

    Parameters
    ----------
    log : list, optional
        Names of the features replaced by ``np.log`` of their values.
    cbrt : list, optional
        Names of the features replaced by ``np.cbrt`` of their values.
    sqrt : list, optional
        Names of the features replaced by ``np.sqrt`` of their values.
    Each list defaults to an empty list.
    '''
    def __init__(self, log=None, cbrt=None, sqrt=None):
        # None stands in for "empty" so instances never share one default list
        # object. A value passed by the caller is stored as-is.
        self.log = [] if log is None else log
        self.cbrt = [] if cbrt is None else cbrt
        self.sqrt = [] if sqrt is None else sqrt

    def fit(self, X, y=None):
        '''Nothing to learn; returns self.'''
        return self

    def transform(self, X, y=None):
        '''
        Returns a copy of X with the listed features transformed, applying
        log first, then cbrt, then sqrt. X itself is not modified.
        '''
        out = X.copy()
        for col in self.log:
            out[col] = np.log(out[col])
        for col in self.cbrt:
            out[col] = np.cbrt(out[col])
        for col in self.sqrt:
            out[col] = np.sqrt(out[col])
        return out

    def getVars(self):
        '''
        Returns the (log, cbrt) lists.
        The sqrt list is not included; read ``self.sqrt`` for it. The 2-tuple
        shape is kept so existing callers that unpack two values keep working.
        '''
        return self.log, self.cbrt
