In [None]:
# Load IPython's autoreload extension so edits to the project modules imported
# below are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

import sys

import category_encoders as ce
import joblib
import pandas as pd

sys.path.append('../')
from utils import read_csv_time_chunks
from steps.base import BaseTransformer

# Directory handed to read_csv_time_chunks when loading the data.
FILES_DIR = "/mnt/ml-team/minerva/talking_data/files/"

# Columns treated as categorical inputs by every encoder in this notebook.
CATEGORICAL_FEATURES = [
    "ip",
    "app",
    "device",
    "os",
    "channel",
]

# Kept as a one-element list: it is used as a column selector (train[TARGET_COLUMN])
# and TargetEncoderNSplits.fit reads the name back through target.columns[0].
TARGET_COLUMN = ["is_attributed"]

In [None]:
# Load one time chunk of the training data (presumably day 8, hour 4 -- the exact
# meaning of `days`/`hours` is defined by utils.read_csv_time_chunks).
train = read_csv_time_chunks(
    FILES_DIR,
    days=[8],
    hours=[4],
)

In [None]:
# Peek at the first rows of the loaded data.
train.head()

In [None]:
# Draw ONE random sample of rows and split it into features and target, so that
# row i of X and row i of y come from the same record.  Sampling the two column
# subsets separately would pick different rows for each and pair features with
# unrelated labels.  The fixed seed makes the sample reproducible on re-run.
train_sample = train.sample(100000, random_state=0)
X = train_sample[CATEGORICAL_FEATURES]
y = train_sample[TARGET_COLUMN]

# Out-of-the-box encoders (`category_encoders`)

In [None]:
%%time
target_encoder = ce.TargetEncoder(cols=CATEGORICAL_FEATURES)
target_encoder.fit(X, y.values.reshape(-1))
X_ = target_encoder.transform(X)

In [None]:
# Inspect the first rows of the TargetEncoder output.
X_.head()

In [None]:
# Library binary encoder applied to the same sampled features; note that this
# overwrites X_ from the previous encoder.
binary_encoder = ce.BinaryEncoder(cols=CATEGORICAL_FEATURES)
binary_encoder.fit(X, y)
X_ = binary_encoder.transform(X)

In [None]:
# Inspect the first rows of the BinaryEncoder output.
X_.head()

In [None]:
from sklearn.model_selection import KFold

class TargetEncoderNSplits(BaseTransformer):
    """Out-of-fold target-mean encoder for categorical columns.

    ``fit`` splits the rows with ``KFold``.  For every fold, the per-category mean
    of the target is computed on the training part and joined onto the held-out
    part.  The out-of-fold values are then averaged per category and stored in
    ``target_means_map`` (one lookup frame per feature column), which
    ``transform`` joins onto the data it is given.
    """

    def __init__(self, n_splits, **kwargs):
        """
        Args:
            n_splits: number of folds passed to ``sklearn.model_selection.KFold``.
            **kwargs: accepted and ignored.
        """
        self.k_folds = KFold(n_splits=n_splits)
        self.target_means_map = {}

    def _target_means_names(self, columns):
        """Return the output column names ``target_mean_<column>`` for ``columns``."""
        return ['target_mean_{}'.format(column) for column in columns]

    def _is_null_names(self, columns):
        """Return the indicator column names ``target_mean_is_nan_<column>`` for ``columns``."""
        return ['target_mean_is_nan_{}'.format(column) for column in columns]

    def fit(self, categorical_features, target, **kwargs):
        """Learn the per-category target means.

        Args:
            categorical_features: DataFrame whose columns are all encoded.
            target: DataFrame; its first column is used as the target.
            **kwargs: accepted and ignored.

        Returns:
            self

        Raises:
            ValueError: if ``categorical_features`` and ``target`` differ in length.
        """
        if len(categorical_features) != len(target):
            raise ValueError('categorical_features has {} rows but target has {}'.format(
                len(categorical_features), len(target)))

        feature_columns, target_column = categorical_features.columns, target.columns[0]
        target_mean_names = self._target_means_names(feature_columns)

        # Folds are selected by position (iloc), so pair features and target by
        # position as well instead of relying on their indexes matching in concat.
        features = categorical_features.reset_index(drop=True)
        labels = target.reset_index(drop=True)

        out_of_fold = []
        for train_index, test_index in self.k_folds.split(labels):
            train = pd.concat([features.iloc[train_index], labels.iloc[train_index]], axis=1)
            X_test = features.iloc[test_index]

            for column, target_mean_name in zip(feature_columns, target_mean_names):
                train_target_means = (train.groupby(column)[target_column]
                                      .mean()
                                      .rename(target_mean_name)
                                      .reset_index())
                # Left join: categories unseen in the training part get NaN.
                X_test = X_test.merge(train_target_means, on=column, how='left')
            out_of_fold.append(X_test)
        out_of_fold = pd.concat(out_of_fold, axis=0)

        # Build a fresh map so a refit does not keep entries from an earlier fit.
        target_means_map = {}
        for column, target_mean_name in zip(feature_columns, target_mean_names):
            target_means_map[column] = out_of_fold.groupby(column)[target_mean_name].mean().reset_index()
        self.target_means_map = target_means_map

        return self

    def transform(self, categorical_features, **kwargs):
        """Join the learned target means onto ``categorical_features``.

        Every column of ``categorical_features`` must have been seen by ``fit``.
        The joins are done with ``DataFrame.merge``, so the returned frames carry
        the index produced by the merge rather than the input's index.

        Args:
            categorical_features: DataFrame of categorical columns.
            **kwargs: accepted and ignored.

        Returns:
            dict with
            ``'numerical_features'``: the ``target_mean_<column>`` columns, missing
            values replaced by 0;
            ``'categorical_features'``: the ``target_mean_is_nan_<column>`` columns,
            1 where the target mean was missing and 0 otherwise.

        Raises:
            KeyError: if a column has no entry in ``target_means_map``.
        """
        columns = categorical_features.columns
        target_mean_names = self._target_means_names(columns)
        is_null_names = self._is_null_names(columns)

        unknown_columns = [column for column in columns if column not in self.target_means_map]
        if unknown_columns:
            raise KeyError('no fitted target means for columns: {}'.format(unknown_columns))

        for column, target_mean_name, is_null_name in zip(columns, target_mean_names, is_null_names):
            categorical_features = categorical_features.merge(self.target_means_map[column],
                                                              on=column,
                                                              how='left')
            categorical_features[is_null_name] = pd.isnull(categorical_features[target_mean_name]).astype(int)
            # Assign the filled column back: an in-place fillna on a column
            # selection does not reliably update the frame.
            categorical_features[target_mean_name] = categorical_features[target_mean_name].fillna(0)

        return {'numerical_features': categorical_features[target_mean_names],
                'categorical_features': categorical_features[is_null_names]}

    def load(self, filepath):
        """Restore ``target_means_map`` from ``filepath`` with joblib and return self."""
        self.target_means_map = joblib.load(filepath)
        return self

    def save(self, filepath):
        """Dump ``target_means_map`` to ``filepath`` with joblib."""
        joblib.dump(self.target_means_map, filepath)

In [None]:
%%time
target_encoder = TargetEncoderNSplits(n_splits=10)
target_encoder.fit_transform(categorical_features=X, target=y)
X_ = target_encoder.transform(categorical_features=X)

In [None]:
# Summary statistics of the target-mean columns returned under 'numerical_features'.
X_['numerical_features'].describe()