In [1]:
import numpy as np
import pandas as pd

In [2]:
class TargetEncoder:
    """Out-of-fold count / target-statistics encoder for categorical features.

    For ``F`` input columns and ``C`` target classes every row is mapped to
    ``(2 * C + 1) * F`` numeric columns laid out as:

    * ``[0, F)``: how often the row's category value occurs;
    * ``[(c + 1) * F, (c + 2) * F)``: how often it occurs together with class ``c``;
    * ``[(C + c + 1) * F, (C + c + 2) * F)``: the smoothed ratio
      ``(successes_c + alpha * prior_c) / (count + alpha)``.

    ``fit_transform`` encodes every training row with statistics computed on the
    other folds only; ``transform`` uses statistics of the whole training set.
    """

    def __init__(self, alpha=1.0, priors=None, n_folds=3, random_state=317):
        """
        Parameters
        ----------
        alpha : float
            Smoothing strength applied to the per-class ratios.
        priors : indexable by class id, optional
            Fixed class priors. When omitted they are estimated from ``y`` on
            every call to ``fit_transform``.
        n_folds : int
            Number of stratified folds used for the out-of-fold encoding.
        random_state : int
            Seed of the shuffled stratified split.
        """
        from sklearn.model_selection import StratifiedKFold

        self._alpha = alpha

        # Remember whether the caller fixed the priors: estimated priors must be
        # refreshed on every fit instead of sticking after the first one.
        self._priors_given = priors is not None
        self._priors = priors.copy() if self._priors_given else None

        self._kfold = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=random_state)

    @staticmethod
    def _encode_column(classes, values, feature_idx):
        """Return the position of each entry of ``values`` in sorted ``classes``."""
        values = np.asarray(values)
        if classes.shape[0] == 0:
            if values.shape[0]:
                raise ValueError("feature %d has no known values" % feature_idx)
            return np.zeros(0, dtype=np.int64)

        codes = np.searchsorted(classes, values)
        # searchsorted yields len(classes) for values past the end; clip so the
        # look-up below stays in range and still flags them as unseen.
        safe = np.minimum(codes, classes.shape[0] - 1)
        unseen = classes[safe] != values
        if np.any(unseen):
            raise ValueError("feature %d contains previously unseen values: %s"
                             % (feature_idx, np.unique(values[unseen])))
        return safe

    def _column_indices(self, X):
        """Map ``X`` to an integer matrix of per-feature slots in the flat count arrays."""
        if X.ndim != 2 or X.shape[1] != len(self._classes_list):
            raise ValueError("X must be 2-D with %d columns" % len(self._classes_list))

        cols = np.empty(X.shape, dtype=np.int64)
        for j, classes in enumerate(self._classes_list):
            cols[:, j] = self._offsets[j] + self._encode_column(classes, X[:, j], j)
        return cols

    @staticmethod
    def _count(cols, y_codes, n_classes, n_slots):
        """Count occurrences per slot, overall and split by target class."""
        counts = np.bincount(cols.ravel(), minlength=n_slots)
        successes = np.zeros((n_classes, n_slots), dtype=np.int64)
        for class_id in range(n_classes):
            successes[class_id] = np.bincount(cols[y_codes == class_id].ravel(), minlength=n_slots)
        return counts, successes

    def _build_features(self, cols, counts, successes, scale=1.0):
        """Assemble the output columns for the rows described by ``cols``."""
        n_features = cols.shape[1]
        n_classes = successes.shape[0]
        out = np.zeros((cols.shape[0], (2 * n_classes + 1) * n_features))

        row_counts = scale * counts[cols]
        out[:, :n_features] = row_counts

        for class_id in range(n_classes):
            row_successes = scale * successes[class_id][cols]

            start = (class_id + 1) * n_features
            out[:, start:start + n_features] = row_successes

            start = (n_classes + class_id + 1) * n_features
            out[:, start:start + n_features] = \
                (row_successes + self._alpha * self._priors[class_id]) / (row_counts + self._alpha)

        return out

    def fit_transform(self, X, y, features_values_list):
        """Fit on ``(X, y)`` and return the out-of-fold encoding of ``X``.

        Parameters
        ----------
        X : array of shape (n_samples, n_features)
            Categorical feature values.
        y : array of shape (n_samples,)
            Class labels.
        features_values_list : sequence of array-likes
            For every column of ``X`` the full set of values it may take.

        Returns
        -------
        ndarray of shape (n_samples, (2 * n_classes + 1) * n_features)

        Raises
        ------
        ValueError
            On inconsistent shapes or values missing from ``features_values_list``.
        """
        X = np.asarray(X)
        y = np.asarray(y).ravel()
        if X.ndim != 2:
            raise ValueError("X must be a 2-D array of shape (n_samples, n_features)")
        if len(features_values_list) != X.shape[1]:
            raise ValueError("features_values_list must contain one entry per column of X")
        if y.shape[0] != X.shape[0]:
            raise ValueError("X and y must contain the same number of rows")

        # Each feature owns a contiguous range of slots, one per known value.
        self._classes_list = [np.unique(np.asarray(values)) for values in features_values_list]
        sizes = [classes.shape[0] for classes in self._classes_list]
        self._offsets = np.concatenate(([0], np.cumsum(sizes))).astype(np.int64)
        n_slots = int(self._offsets[-1])
        cols = self._column_indices(X)

        y_codes = np.unique(y, return_inverse=True)[1].ravel()
        n_classes = int(y_codes.max()) + 1 if y_codes.size else 0
        self._y_unique = np.arange(n_classes)

        if not self._priors_given:
            self._priors = np.bincount(y_codes, minlength=n_classes) / y_codes.shape[0]

        n_samples, n_features = X.shape
        X_res = np.zeros((n_samples, (2 * n_classes + 1) * n_features))

        for train_idxs, test_idxs in self._kfold.split(X, y_codes):
            counts, successes = self._count(cols[train_idxs], y_codes[train_idxs], n_classes, n_slots)
            # Rescale fold statistics to the size of the full training set so
            # they are comparable with what transform() produces.
            scale = n_samples / train_idxs.shape[0]
            X_res[test_idxs] = self._build_features(cols[test_idxs], counts, successes, scale)

        # Statistics for unseen data come from the whole training set exactly
        # once; summing the per-fold training counts would count every row
        # (n_folds - 1) times.
        self._global_counts_array, self._global_successes_matrix = \
            self._count(cols, y_codes, n_classes, n_slots)

        return X_res

    def transform(self, X):
        """Encode ``X`` with the statistics of the full training set.

        Raises ``ValueError`` if ``X`` contains a value that was not listed in
        ``features_values_list`` at fit time.
        """
        cols = self._column_indices(np.asarray(X))
        return self._build_features(cols, self._global_counts_array, self._global_successes_matrix)

In [3]:
class TargetEncoderCV:
    """Cross-validate a classifier trained on target-encoded categorical features."""

    def __init__(self, scorer, cv_n_folds=3, cv_random_state=317, alpha=1.0, priors=None, n_folds=3,
                 random_state=317):
        """
        Parameters
        ----------
        scorer : callable
            Called as ``scorer(y_true=..., y_score=...)`` on every validation fold.
        cv_n_folds, cv_random_state
            Configuration of the outer stratified split used for validation.
        alpha, priors, n_folds, random_state
            Forwarded unchanged to ``TargetEncoder``.
        """
        from sklearn.model_selection import StratifiedKFold

        self._scorer = scorer
        self._cv_kfold = StratifiedKFold(n_splits=cv_n_folds, shuffle=True, random_state=cv_random_state)

        # Unfitted template; every CV fold works on its own copy.
        self._te = TargetEncoder(alpha=alpha, priors=priors, n_folds=n_folds, random_state=random_state)

    def cv_routine_smooth(self, clf, X, y, features_values_list, clf_params):
        """Return ``(mean, std)`` of the scorer over the outer CV folds.

        Parameters
        ----------
        clf : estimator
            Must provide ``set_params``, ``fit`` and ``predict_proba``; the
            score is computed from column 1 of ``predict_proba``.
        X, y : arrays
            Categorical features and class labels, indexable by integer arrays.
        features_values_list : sequence of array-likes
            Passed through to the encoder's ``fit_transform``.
        clf_params : dict or None
            Parameters applied to ``clf`` before training.
        """
        import copy

        clf.set_params(**(clf_params or {}))

        cv_res = []

        for train_idxs, test_idxs in self._cv_kfold.split(X, y):
            # A fresh encoder per fold: nothing fitted on one fold (such as
            # estimated class priors) may carry over into the next one.
            encoder = copy.deepcopy(self._te)

            X_train = encoder.fit_transform(X[train_idxs], y[train_idxs], features_values_list)
            X_test = encoder.transform(X[test_idxs])

            clf.fit(X_train, y[train_idxs])

            cv_res.append(self._scorer(y_true=y[test_idxs], y_score=clf.predict_proba(X_test)[:, 1]))

        return np.mean(cv_res), np.std(cv_res)

In [4]:
# Read the data set: ACTION is parsed as int64, every other listed column is
# kept as a string so that it is treated as a categorical label.
data = pd.read_csv(
    'amazon.csv',
    dtype=dict(
        ACTION=np.int64,
        RESOURCE=str,
        MGR_ID=str,
        ROLE_ROLLUP_1=str,
        ROLE_ROLLUP_2=str,
        ROLE_DEPTNAME=str,
        ROLE_TITLE=str,
        ROLE_FAMILY_DESC=str,
        ROLE_FAMILY=str,
        ROLE_CODE=str,
    ),
)

In [5]:
# Add one crossed feature "<a>.<b>" for every unordered pair of the original
# columns after the first; column 0 is left out of the crossing.
col_names = data.columns
feature_names = list(col_names[1:])
for i, first in enumerate(feature_names):
    for second in feature_names[i + 1:]:
        crossed_name = first + '.' + second
        data[crossed_name] = data[first] + '.' + data[second]

In [6]:
# Only numeric columns have a meaningful mean; restricting explicitly keeps the
# call working on pandas versions that no longer drop string columns silently.
data.mean(numeric_only=True)

ACTION    0.94211
dtype: float64

In [7]:
# Sanity check of the frame size after the pairwise crossed columns were added.
data.shape

(32769, 46)

In [8]:
# For every column except the first, collect the sorted distinct values it takes.
features_values_list = [
    np.unique(data.iloc[:, column_pos])
    for column_pos in range(1, data.shape[1])
]

In [9]:
from sklearn.ensemble import RandomForestClassifier as RFC
from sklearn.metrics import roc_auc_score

te_cv = TargetEncoderCV(scorer=roc_auc_score, alpha=0.001, n_folds=2)
# Column 0 is passed as the target and the remaining columns as features.
# `.values` replaces `.as_matrix()`, which newer pandas releases no longer provide.
# The call returns the (mean, std) of the per-fold scores.
te_cv.cv_routine_smooth(RFC(n_estimators=200), data.iloc[:, 1:].values, data.iloc[:, 0].values,
                        features_values_list, {'n_estimators': 200, 'n_jobs': 4, 'random_state': 317})

(0.86686758472026215, 0.0050686411353453352)