In [26]:
import pandas as pd
import numpy as np
import pytictoc as ptt

In [18]:
!pip install joblib

Collecting joblib
  Downloading joblib-0.11-py2.py3-none-any.whl (176kB)
    100% |████████████████████████████████| 184kB 4.5MB/s eta 0:00:01
Installing collected packages: joblib
Successfully installed joblib-0.11
You are using pip version 9.0.1, however version 9.0.3 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.


In [19]:
# Author: Kamaldinov Ildar (kamildraf@gmail.com)
# MIT License
import numpy as np

def gini(y, **kwargs):
    """Gini-style impurity ``p * (1 - p)`` of a binary target vector.

    Parameters
    ----------
    y : array-like
        Target values; ``p`` is computed as ``sum(y) / len(y)``.
    **kwargs
        Accepted and ignored, so the function can be called with the same
        keyword arguments as ``entropy`` (e.g. ``smooth``).

    Returns
    -------
    The product of the positive share and its complement.
    """
    positive_share = np.sum(y) / len(y)
    negative_share = 1 - positive_share
    return positive_share * negative_share


def entropy(y, smooth=0, **kwargs):
    """Binary entropy of a 0/1 target vector (natural logarithm).

    Computes ``-p*log(p + smooth) - (1 - p)*log(1 - p + smooth)`` where
    ``p = sum(y) / len(y)``. Both class terms are included so the impurity is
    symmetric in the two classes.

    Parameters
    ----------
    y : array-like
        Binary target values.
    smooth : float, default 0
        Constant added inside each logarithm.
    **kwargs
        Accepted and ignored, for signature parity with ``gini``.

    Returns
    -------
    The entropy value; ``0`` for a pure vector when ``smooth`` is ``0``.
    """
    prob = np.sum(y) / len(y)
    total = 0.0
    for share in (prob, 1 - prob):
        # A class with zero share contributes nothing (q*log(q) -> 0 as q -> 0).
        # Skipping it avoids evaluating 0 * log(0), which is nan when smooth == 0.
        if share > 0:
            total -= share * np.log(share + smooth)
    return total


class OneFeatureTree(object):
    """Decision tree over a single feature whose leaves hold WoE values.

    The tree is grown by recursive binary splits of one feature vector against
    a binary target. Every leaf stores the smoothed weight of evidence
    ``log((n_pos + smooth_woe) / (n_neg + smooth_woe))`` of the training
    samples that reached it.

    Parameters
    ----------
    criterion : {'gini', 'entropy'} or callable
        Impurity used to choose thresholds. A callable is invoked as
        ``criterion(y_part, smooth=smooth_entropy)``.
    min_samples_leaf : int, default 2
        A node is split only if it holds more than this many samples.
    smooth_woe : float, default 0.001
        Constant added to both class counts in the WoE ratio.
    min_samples_class : int, default 1
        A node is split only if both its positive and negative counts are at
        least this value.
    max_depth : int or None, default None
        Nodes at this depth are never split; ``None`` disables the limit.
    smooth_entropy : float, default 0.001
        Passed to the criterion as ``smooth``.
    dtype : numpy dtype, default np.float32
        Type that ``fit`` casts ``x`` and ``y`` to, and type of the array
        returned by ``transform``.
    """

    def __init__(self,
                 criterion,
                 min_samples_leaf=2,
                 smooth_woe=0.001,
                 min_samples_class=1,
                 max_depth=None,
                 smooth_entropy=0.001,
                 dtype=np.float32):
        self._criterion = criterion
        self._max_depth = max_depth
        self._min_samples_leaf = min_samples_leaf
        self._min_samples_class = min_samples_class
        self._smooth_woe = smooth_woe
        self._dtype = dtype
        self._smooth_entropy = smooth_entropy

        # Nested dict: 'type' (0 = internal, 1 = leaf), 'thresh', children
        # under keys 0 (left) and 1 (right), and 'woe' on leaves.
        self._tree = {}

    def _split_vector(self, x, y, value):
        """Partition ``x`` and ``y`` into the parts with ``x < value`` and the rest."""
        left_ind = x < value
        right_ind = np.logical_not(left_ind)
        return x[left_ind], x[right_ind], y[left_ind], y[right_ind]

    def _calc_woe(self, y, smooth_woe):
        """Return ``log((n_pos + smooth_woe) / (n_neg + smooth_woe))`` for ``y``."""
        n_pos = np.sum(y)
        n_neg = np.float32(len(y)) - n_pos
        woe = np.log((n_pos + smooth_woe) / (n_neg + smooth_woe))
        return woe

    def _get_splitter(self):
        """Resolve the configured criterion to the impurity function to call."""
        if self._criterion == 'gini':
            return gini
        if self._criterion == 'entropy':
            return entropy
        assert callable(self._criterion), \
            "criterion must be 'gini', 'entropy' or a callable"
        # A user-supplied callable is used as-is. Previously it was only
        # asserted and never bound, so the split crashed on an unbound name.
        return self._criterion

    def _split(self, x, y):
        """Pick the threshold with the lowest size-weighted child impurity.

        Candidate thresholds are the midpoints between consecutive distinct
        values of ``x``; the caller guarantees at least two distinct values.
        """
        splitter = self._get_splitter()

        n_obs = len(y)
        # Order targets by feature value so each candidate split is a prefix.
        y = y[np.argsort(x)]

        values, counts = np.unique(x, return_counts=True)

        impurities = np.zeros(len(values) - 1)
        # cumsum(counts)[:-1] gives the left-child size for every boundary
        # between distinct values; ties therefore never straddle a split.
        for ind, n_left in enumerate(np.cumsum(counts)[:-1]):
            left_imp = splitter(y[:n_left], smooth=self._smooth_entropy)
            right_imp = splitter(y[n_left:], smooth=self._smooth_entropy)
            impurities[ind] = (
                (left_imp * n_left + right_imp * (n_obs - n_left)) / n_obs)
        thresh_ind = np.argmin(impurities)
        threshold = np.mean(values[[thresh_ind, thresh_ind + 1]])
        return threshold

    def _fit_node(self, x, y,
                  depth, node):
        """Recursively grow the subtree rooted at ``node`` (filled in place)."""
        min_samples = (len(y) > self._min_samples_leaf)
        uniq_x = len(np.unique(x)) > 1
        n_pos = np.sum(y)
        n_neg = len(y) - n_pos
        min_class = np.all(np.array([n_pos, n_neg]) >= self._min_samples_class)
        max_depth = ((depth < self._max_depth)
                     if self._max_depth is not None else True)

        if (min_samples and min_class and max_depth and uniq_x):
            # zero node type for non-terminal nodes
            node['type'] = 0

            threshold = self._split(x, y)
            left_x, right_x, left_y, right_y = self._split_vector(
                x, y, threshold)

            # 0 -- left_child, 1 -- right child
            node[0] = {}
            node[1] = {}
            node['thresh'] = threshold
            self._fit_node(left_x, left_y,
                           depth + 1,
                           node[0])
            self._fit_node(right_x, right_y,
                           depth + 1,
                           node[1])
        else:
            node['type'] = 1
            node['woe'] = self._calc_woe(y, self._smooth_woe)
        return self

    def fit(self, x, y):
        """Grow the tree from feature vector ``x`` and binary target ``y``.

        Both inputs are cast to ``dtype``. Returns ``self``.
        """
        x = np.array(x, dtype=self._dtype)
        y = np.array(y, dtype=self._dtype)

        # Start from an empty tree so a refit never keeps nodes of a previous fit.
        self._tree = {}
        self._fit_node(x, y, depth=0, node=self._tree)
        return self

    def _transform_node(self, x, node):
        """Route a single value ``x`` down from ``node`` and return the leaf WoE."""
        if node['type'] == 0:
            if x < node['thresh']:
                return self._transform_node(x, node[0])
            else:
                return self._transform_node(x, node[1])
        return node['woe']

    def transform(self, x):
        """Map every value of ``x`` to the WoE of the leaf it falls into.

        Returns an array of ``dtype`` shaped like ``x``. If the tree has not
        been fitted, the string ``"Not trained yet"`` is returned instead
        (kept for backward compatibility).
        """
        if len(self._tree) == 0:
            return "Not trained yet"
        # Positional indexing below must not be confused with label-based
        # lookup (e.g. a pandas Series with a non-default index).
        x = np.asarray(x)
        transformed = np.zeros_like(x, dtype=self._dtype)
        for ind in range(len(x)):
            transformed[ind] = self._transform_node(x[ind], self._tree)
        return transformed

    def fit_transform(self, x, y):
        """Fit on ``(x, y)`` and return the WoE encoding of ``x``."""
        self.fit(x, y)
        return self.transform(x)

# Author: Kamaldinov Ildar (kamildraf@gmail.com)
# MIT License
import numpy as np
from joblib import Parallel, delayed


class WoeTree(object):
    """WoE encoder that fits one ``OneFeatureTree`` per column of ``X``.

    Parameters
    ----------
    criterion, max_depth, min_samples_leaf, min_samples_class, smooth_woe
        Forwarded to every per-feature tree. Each may be a single value
        (shared by all features) or a ``list`` with one entry per feature.
    n_jobs : int, default 1
        Passed to ``joblib.Parallel`` for both fitting and transforming.
    dtype : numpy dtype, default np.float32
        Forwarded to every per-feature tree.
    """

    def __init__(self,
                 criterion,
                 max_depth=None,
                 min_samples_leaf=2,
                 min_samples_class=1,
                 smooth_woe=0.001,
                 n_jobs=1,
                 dtype=np.float32):
        self._criterion = criterion
        self._max_depth = max_depth
        self._min_samples_leaf = min_samples_leaf
        self._min_samples_class = min_samples_class
        self._smooth_woe = smooth_woe
        self._n_jobs = n_jobs
        self._dtype = dtype

        # One fitted OneFeatureTree per column, filled by fit().
        self._trees = []

    def _to_arglist(self, arg, shape):
        """Return ``arg`` if it is a list, else ``arg`` repeated ``shape`` times."""
        if isinstance(arg, list):
            return arg
        return [arg] * shape

    def _fit(self, feature, X, y):
        """Fit the tree of column ``feature`` on ``X[:, feature]`` against ``y``.

        ``X`` and ``y`` are explicit parameters; the previous version read
        them from undefined global names.
        """
        return self._trees[feature].fit(X[:, feature], y)

    def fit(self, X, y):
        """Fit one tree per column of the 2-D array ``X``. Returns ``self``."""
        # Accept anything numpy can view as a 2-D array; column slicing below
        # requires ndarray-style indexing.
        X = np.asarray(X)
        n_features = X.shape[1]
        criterion = self._to_arglist(self._criterion, n_features)
        max_depth = self._to_arglist(self._max_depth, n_features)
        min_samples_leaf = self._to_arglist(self._min_samples_leaf, n_features)
        min_samples_class = self._to_arglist(
            self._min_samples_class, n_features)
        smooth_woe = self._to_arglist(self._smooth_woe, n_features)

        # Build a fresh list on every call so that refitting does not keep
        # (and reuse) trees appended by an earlier fit.
        trees = [
            OneFeatureTree(
                criterion=criterion[feature],
                max_depth=max_depth[feature],
                min_samples_leaf=min_samples_leaf[feature],
                min_samples_class=min_samples_class[feature],
                smooth_woe=smooth_woe[feature],
                dtype=self._dtype,
            )
            for feature in range(n_features)
        ]

        # OneFeatureTree.fit returns the fitted tree, so the results of the
        # (possibly out-of-process) jobs are stored as the fitted trees.
        self._trees = (Parallel(n_jobs=self._n_jobs)
            (delayed(tree.fit)(X[:, feature], y)
                for feature, tree in enumerate(trees)))
        return self

    def transform(self, X):
        """Encode every column of ``X`` with its tree.

        Returns an array with one row per sample and one column per feature.
        """
        X = np.asarray(X)
        transformed = (Parallel(n_jobs=self._n_jobs)
            (delayed(self._trees[ind].transform)(X[:, ind])
                for ind in range(X.shape[1])))
        return np.array(transformed).T

    def fit_transform(self, X, y):
        """Fit on ``(X, y)`` and return the WoE encoding of ``X``."""
        self.fit(X, y)
        # transform() takes only X; passing y as well raised TypeError.
        return self.transform(X)

    # Original (misspelled) public name, kept so existing callers keep working.
    fit_transfrom = fit_transform


In [7]:
# Load the four CSV inputs; in each file the first column is used as the index.
raw_train, raw_test, train, test = [
    pd.read_csv(csv_path, index_col=0)
    for csv_path in (
        "../../data/coms_sep/train_no_cnts.csv",
        "../../data/coms_sep/test_no_cnts.csv",
        "../../data/coms_sep/train.csv",
        "../../data/coms_sep/test.csv",
    )
]

KeyboardInterrupt: 

In [20]:
# Columns listed as categorical features.
feats_cat = [
    'cat_new_ip',
    'cat_new_prov',
    'op_type',
    'relative',
    'cdf_s_127',
    'cdf_s_135',
    'cdf_s_130',
    'cdf_s_129',
    'cdf_s_134',
    'cdf_s_133',
    'know_recip_card_age',
    'one_region',
]

# Columns listed as numeric features.
# NOTE(review): 'cumulative_sum_total' and 'data_i_120' each appear twice, so
# those columns are selected (and encoded) twice downstream -- confirm whether
# other column names were intended.
feats_num = [
    'amount',
    'client_age',
    'age_diff',
    'cumulative_sum_total',
    'cumulative_sum_total',
    'data_i_120',
    'know_recip_power',
    'data_i_120',
    'recip_card_age',
    'krp_pow2',
    'log_amount',
]

# Full feature list: categorical first, then numeric.
feats = feats_cat + feats_num

In [22]:
# Sanity check on the feature count (names duplicated in feats_num are counted twice).
len(feats)

23

In [23]:
# Encoder configuration: entropy-based splits, trees at most 4 levels deep,
# 10 joblib workers.
woe_tree = WoeTree(
    criterion='entropy',
    max_depth=4,
    n_jobs=10,
)

In [24]:
# Fit one tree per feature column of the training frame against the 'label' column.
woe_tree.fit(
    raw_train[feats].values,
    raw_train['label'].values,
)

<__main__.WoeTree at 0x7fb5a2e0aa58>

In [31]:
# Encode both data sets with the fitted trees, timing each pass separately.
tt = ptt.TicToc()
tt.tic()
woe_train = woe_tree.transform(raw_train.loc[:, feats].values)
tt.toc()
tt.tic()
# The test encoding must come from raw_test: it is later concatenated with
# raw_test's auxiliary columns, so encoding raw_train here would pair test
# rows with training-set values.
woe_test = woe_tree.transform(raw_test.loc[:, feats].values)
tt.toc()

Elapsed time is 30.724023 seconds.
Elapsed time is 30.540150 seconds.


In [74]:
# Every column of raw_train that is not a model feature, in original column order.
aux_cols = [col for col in raw_train.columns if col not in feats]

In [63]:
# Training output: the non-feature columns side by side with the WoE-encoded
# features. reset_index() gives the left frame a fresh positional index, which
# is what the freshly built right frame has as well.
new_train = pd.concat(
    [
        raw_train.reset_index().loc[:, aux_cols],
        pd.DataFrame(woe_train, columns=feats),
    ],
    axis=1,
)

In [77]:
# Drop the columns that are not carried over to the test output (presumably
# because raw_test does not have them -- confirm). Rebuilding the list instead
# of calling .remove() keeps this cell idempotent: re-running it no longer
# raises ValueError once the names are already gone.
aux_cols = [col for col in aux_cols if col not in ('label', 'short_date')]

In [78]:
# Test output: the remaining non-feature columns of raw_test side by side with
# the WoE-encoded features.
new_test = pd.concat(
    [
        raw_test.reset_index().loc[:, aux_cols],
        pd.DataFrame(woe_test, columns=feats),
    ],
    axis=1,
)

In [81]:
# Persist both encoded frames as CSV next to the input data.
new_train.to_csv(path_or_buf="../../data/coms_sep/train_woe.csv")
new_test.to_csv(path_or_buf="../../data/coms_sep/test_woe.csv")