```
Feature Interaction among categorical variables.
```

* Binary Encoding of the categorical variables

In [52]:
%matplotlib inline

import numpy as np
import pandas as pd
import os, sys

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.cross_validation import KFold, train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

import xgboost as xgb

import warnings
warnings.filterwarnings('ignore')

# Project root, resolved under the current user's home directory.
basepath = os.path.expanduser('~/Desktop/src/AllState_Claims_Severity/')
# Make the project's `src` directory importable from this notebook.
sys.path.append(os.path.join(basepath, 'src'))

# Seed NumPy's global RNG; the encoders below call np.random.shuffle when assigning codes.
np.random.seed(2016)

In [2]:
# Load the raw train / test CSV files from <basepath>/data/raw/.
train = pd.read_csv(os.path.join(basepath, 'data/raw/train.csv'))
test  = pd.read_csv(os.path.join(basepath, 'data/raw/test.csv'))

In [3]:
# concatenate train and test row-wise; each frame keeps its own index labels
# (ignore_index is not set), so labels may repeat in `data`.
data = pd.concat((train, test))

In [4]:
# Model the target in the log domain; predictions are mapped back with np.exp before scoring.
y = np.log(train.loss) # log domain

**Encode categorical variables**

<p>Some Helper Functions</p>

In [7]:
def convert_input(X):
    """
    Coerce the supported input containers to a pandas DataFrame.

    Parameters
    ----------

    X: pandas DataFrame, list, numpy array or numpy scalar
        A DataFrame is handed back untouched (same object); a list is
        first turned into a numpy array and then wrapped.

    Returns
    -------

    A Dataframe

    Raises
    ------

    ValueError
        If X is none of the supported types.
    """

    # Already a frame: nothing to do.
    if isinstance(X, pd.DataFrame):
        return X

    # Lists go through numpy so nested lists become a 2-D block.
    if isinstance(X, list):
        return pd.DataFrame(np.array(X))

    if isinstance(X, (np.generic, np.ndarray)):
        return pd.DataFrame(X)

    raise ValueError('Unexpected input type: %s' % (str(type(X))))

def get_obj_cols(df):
    """
    List the columns of a frame whose dtype is `object`.

    Parameters
    ---------

    df: Dataframe

    Returns
    -------

    Column labels (a pandas Index) of the features of type `object`
    """

    object_only = df.select_dtypes(include=['object'])
    return object_only.columns

In [12]:
class OrdinalEncoder(BaseEstimator, TransformerMixin):
    """Ordinal encoding uses a single column of integers to represent the classes. An optional mapping can be passed
    in, in this case we use the knowledge that there is some true order to the classes themselves. Otherwise, the classes
    are assumed to have no true order and integers are selected at random.

    Values that are missing, or that do not appear in the mapping (e.g. categories first seen at transform time),
    are encoded as -1 when ``impute_missing`` is True.

    Parameters
    ----------
    verbose: int
        integer indicating verbosity of output. 0 for none.
    cols: list
        a list of columns to encode, if None, all string columns will be encoded
    drop_invariant: bool
        boolean for whether or not to drop columns with 0 variance
    return_df: bool
        boolean for whether to return a pandas DataFrame from transform (otherwise it will be a numpy array)
    mapping: list of dict
        optional; one ``{'col': <column>, 'mapping': [(class, label), ...]}`` entry per column to encode.
        After ``fit`` this attribute holds the mapping actually in use (supplied or learned).
    impute_missing: bool
        will impute missing categories with -1.

    Example
    -------
    >>> from category_encoders import OrdinalEncoder
    >>> from sklearn.datasets import fetch_20newsgroups_vectorized
    >>> bunch = fetch_20newsgroups_vectorized(subset="all")
    >>> X, y = bunch.data, bunch.target
    >>> enc = OrdinalEncoder(return_df=False).fit(X, y)
    >>> numeric_dataset = enc.transform(X)

    References
    ----------
    .. [1] Contrast Coding Systems for categorical variables.  UCLA: Statistical Consulting Group. from
    http://www.ats.ucla.edu/stat/r/library/contrast_coding.
    .. [2] Gregory Carey (2003). Coding Categorical Variables, from
    http://psych.colorado.edu/~carey/Courses/PSYC5741/handouts/Coding%20Categorical%20Variables%202006-03-03.pdf

    """

    def __init__(self, verbose=0, mapping=None, cols=None, drop_invariant=False, return_df=True, impute_missing=True):
        self.return_df = return_df
        self.drop_invariant = drop_invariant
        self.drop_cols = []
        self.verbose = verbose
        self.cols = cols
        self.mapping = mapping
        self.impute_missing = impute_missing
        self._dim = None
        # True once `self.mapping` holds a mapping learned by fit() rather than one supplied by the caller,
        # so that a later fit() learns afresh instead of silently reusing (or losing) the old mapping.
        self._mapping_learned = False

    def fit(self, X, y=None, **kwargs):
        """Fit encoder according to X and y.
        Parameters
        ----------
        X : array-like, shape = [n_samples, n_features]
            Training vectors, where n_samples is the number of samples
            and n_features is the number of features.
        y : array-like, shape = [n_samples]
            Target values (unused).
        Returns
        -------
        self : encoder
            Returns self.
        """

        # first check the type
        X = convert_input(X)

        self._dim = X.shape[1]

        # if columns aren't passed, just use every string column
        # (stored as a plain list: a pandas Index has no unambiguous truth value)
        if self.cols is None:
            self.cols = list(get_obj_cols(X))

        # A caller-supplied mapping is honoured; a mapping left over from a previous fit is re-learned.
        supplied = None if getattr(self, '_mapping_learned', False) else self.mapping
        X_encoded, categories = self.ordinal_encoding(
            X, mapping=supplied, cols=self.cols, impute_missing=self.impute_missing
        )
        self._mapping_learned = supplied is None
        self.mapping = categories

        # drop all output columns with 0 variance.
        # The encoded frame is used directly (not transform()) so this also works when return_df is False.
        if self.drop_invariant:
            # threshold 10e-5 == 1e-4
            self.drop_cols = [x for x in X_encoded.columns.values if X_encoded[x].var() <= 10e-5]

        return self

    def transform(self, X):
        """Perform the transformation to new categorical data.
        Will use the mapping (if available) and the column list (if available, otherwise every column) to encode the
        data ordinally.
        Parameters
        ----------
        X : array-like, shape = [n_samples, n_features]
        Returns
        -------
        p : array, shape = [n_samples, n_numeric + N]
            Transformed values with encoding applied.
        Raises
        ------
        ValueError
            If the encoder has not been fitted, or X has a different number of columns than the data seen in fit.
        """

        if self._dim is None:
            raise ValueError('Must train encoder before it can be used to transform data.')

        # first check the type
        X = convert_input(X)

        # then make sure that it is the right size
        if X.shape[1] != self._dim:
            raise ValueError('Unexpected input dimension %d, expected %d' % (X.shape[1], self._dim, ))

        # nothing to encode: hand the frame back as-is
        if self.cols is None or len(self.cols) == 0:
            return X

        X, _ = self.ordinal_encoding(X, mapping=self.mapping, cols=self.cols, impute_missing=self.impute_missing)

        if self.drop_invariant:
            # X is already a private copy, so a non-inplace drop is enough
            X = X.drop(self.drop_cols, axis=1)

        if self.return_df:
            return X
        else:
            return X.values

    @staticmethod
    def _encode_column(series, pairs, impute_missing):
        """Map one column to integer codes using ``pairs`` = [(class, label), ...]."""

        # Null classes are left out of the lookup so that missing values fall through to the -1 imputation.
        lookup = {cat: int(code) for cat, code in pairs if not pd.isnull(cat)}

        # A single dictionary lookup per value: values are never re-matched after being replaced, so class
        # labels that look like integer codes (e.g. '0', '1') cannot collide, and unknown values become NaN.
        encoded = series.map(lookup)

        if impute_missing:
            encoded = encoded.fillna(-1)

        # raises ValueError if NaN is still present (impute_missing=False with missing / unknown values)
        return encoded.astype(int)

    @staticmethod
    def ordinal_encoding(X_in, mapping=None, cols=None, impute_missing=True):
        """
        Ordinal encoding uses a single column of integers to represent the classes. An optional mapping can be passed
        in, in this case we use the knowledge that there is some true order to the classes themselves. Otherwise, the classes
        are assumed to have no true order and integers are selected at random.

        Parameters
        ----------
        X_in : DataFrame
            Input data; it is copied, never modified.
        mapping : list of dict, optional
            ``{'col': ..., 'mapping': [(class, label), ...]}`` entries. When given, only these columns are encoded.
        cols : list, optional
            Columns to encode when no mapping is given (default: every column).
        impute_missing : bool
            Encode missing / unknown values as -1.

        Returns
        -------
        (DataFrame, list of dict)
            The encoded copy and the mapping that was applied (the supplied one, or the one just learned).
        """

        X = X_in.copy(deep=True)

        if cols is None:
            cols = X.columns.values

        if mapping is not None:
            # Return the mapping that was applied so that fit() keeps a caller-supplied mapping.
            mapping_out = mapping
            for switch in mapping:
                col = switch.get('col')
                X[col] = OrdinalEncoder._encode_column(X[col], switch.get('mapping'), impute_missing)
        else:
            mapping_out = []
            for col in cols:
                # Codes follow a shuffle of the distinct values (driven by NumPy's global RNG).
                categories = list(set(X[col].values))
                np.random.shuffle(categories)

                pairs = [(val, idx) for idx, val in enumerate(categories)]
                X[col] = OrdinalEncoder._encode_column(X[col], pairs, impute_missing)

                mapping_out.append({'col': col, 'mapping': pairs})

        return X, mapping_out

In [13]:
class BinaryEncoder(BaseEstimator, TransformerMixin):
    """Binary encoding: each categorical column is first ordinal-encoded, then the integer code is written in
    base 2 with one output column per binary digit (``<col>_0`` is the most significant digit).

    The number of digits per column is fixed when the encoder is fitted, so ``transform`` yields the same
    output columns for any later data set.

    Parameters
    ----------
    verbose: int
        integer indicating verbosity of output. 0 for none.
    cols: list
        a list of columns to encode, if None, all string columns will be encoded
    drop_invariant: bool
        boolean for whether or not to drop columns with 0 variance
    return_df: bool
        boolean for whether to return a pandas DataFrame from transform (otherwise it will be a numpy array)
    Example
    -------
    >>> from category_encoders import BinaryEncoder
    >>> from sklearn.datasets import fetch_20newsgroups_vectorized
    >>> bunch = fetch_20newsgroups_vectorized(subset="all")
    >>> X, y = bunch.data, bunch.target
    >>> enc = BinaryEncoder(return_df=False).fit(X, y)
    >>> numeric_dataset = enc.transform(X)
    """
    def __init__(self, verbose=0, cols=None, drop_invariant=False, return_df=True):
        self.return_df = return_df
        self.drop_invariant = drop_invariant
        self.drop_cols = []
        self.verbose = verbose
        self.cols = cols
        self.ordinal_encoder = None
        self._dim = None
        # column -> number of binary digits, filled in by fit()
        self._digits = {}

    def fit(self, X, y=None, **kwargs):
        """Fit encoder according to X and y.
        Parameters
        ----------
        X : array-like, shape = [n_samples, n_features]
            Training vectors, where n_samples is the number of samples
            and n_features is the number of features.
        y : array-like, shape = [n_samples]
            Target values (unused).
        Returns
        -------
        self : encoder
            Returns self.
        """

        # if the input dataset isn't already a dataframe, convert it to one (using default column names)
        # first check the type
        X = convert_input(X)

        self._dim = X.shape[1]

        # if columns aren't passed, just use every string column
        # (stored as a plain list: a pandas Index has no unambiguous truth value)
        if self.cols is None:
            self.cols = list(get_obj_cols(X))

        # train an ordinal pre-encoder
        self.ordinal_encoder = OrdinalEncoder(verbose=self.verbose, cols=self.cols)
        self.ordinal_encoder = self.ordinal_encoder.fit(X)

        # Fix the bit width per column from the fitted classes. Codes run from 0 to n_classes - 1, so
        # ceil(log2(n_classes)) digits are always enough, whatever subset of classes a later frame contains.
        self._digits = {
            switch.get('col'): self._n_digits(len(switch.get('mapping')))
            for switch in self.ordinal_encoder.mapping
        }

        # drop all output columns with 0 variance.
        # Built from the DataFrame pipeline directly (not transform()) so this also works when return_df is False.
        if self.drop_invariant:
            X_temp = self.binary(self.ordinal_encoder.transform(X), cols=self.cols)
            # threshold 10e-5 == 1e-4
            self.drop_cols = [x for x in X_temp.columns.values if X_temp[x].var() <= 10e-5]

        return self

    def transform(self, X):
        """Perform the transformation to new categorical data.
        Parameters
        ----------
        X : array-like, shape = [n_samples, n_features]
        Returns
        -------
        p : array, shape = [n_samples, n_numeric + N]
            Transformed values with encoding applied.
        Raises
        ------
        ValueError
            If the encoder has not been fitted, or X has a different number of columns than the data seen in fit.
        """

        if self._dim is None:
            raise ValueError('Must train encoder before it can be used to transform data.')

        # first check the type
        X = convert_input(X)

        # then make sure that it is the right size
        if X.shape[1] != self._dim:
            raise ValueError('Unexpected input dimension %d, expected %d' % (X.shape[1], self._dim, ))

        # nothing to encode: hand the frame back as-is
        if self.cols is None or len(self.cols) == 0:
            return X

        X = self.ordinal_encoder.transform(X)

        X = self.binary(X, cols=self.cols)

        if self.drop_invariant:
            # X is already a private copy, so a non-inplace drop is enough
            X = X.drop(self.drop_cols, axis=1)

        if self.return_df:
            return X
        else:
            return X.values

    @staticmethod
    def _n_digits(n_classes):
        """Number of binary digits needed for codes 0 .. n_classes - 1 (0 when there is at most one class)."""

        if n_classes <= 1:
            return 0
        return int(np.ceil(np.log2(n_classes)))

    def binary(self, X_in, cols=None):
        """
        Binary encoding encodes the integers as binary code with one column per digit.

        Parameters
        ----------
        X_in : DataFrame
            Ordinal-encoded data; it is copied, never modified.
        cols : list, optional
            Columns to expand (default: every column). All other columns are passed through after the digit columns.

        Returns
        -------
        DataFrame
            ``<col>_<i>`` digit columns followed by the pass-through columns.
        """

        X = X_in.copy(deep=True)

        if cols is None:
            cols = X.columns.values
            pass_thru = []
        else:
            pass_thru = [col for col in X.columns.values if col not in cols]

        fitted_digits = getattr(self, '_digits', None) or {}

        bin_cols = []
        for col in cols:
            # Use the width fixed at fit time; only when the column was never fitted fall back to
            # the number of distinct values present in this frame.
            digits = fitted_digits.get(col)
            if digits is None:
                digits = self._n_digits(len(X[col].unique()))

            # map the ordinal column into a list of these digits, of length digits
            X[col] = X[col].map(lambda x: self.col_transform(x, digits))

            for dig in range(digits):
                name = str(col) + '_%d' % (dig, )
                X[name] = X[col].map(lambda r: int(r[dig]) if r is not None else None)
                bin_cols.append(name)

        # keeps only the digit columns and the pass-through columns; the intermediate list columns are dropped
        X = X.reindex(columns=bin_cols + pass_thru)

        return X

    @staticmethod
    def col_transform(col, digits):
        """
        The lambda body to transform the column values: an integer code becomes its list of binary digits,
        left-padded with zeros to ``digits`` entries; None or a negative code (missing value) becomes None.
        """

        if col is None or float(col) < 0.0:
            return None
        else:

            col = list("{0:b}".format(int(col)))
            if len(col) == digits:
                return col
            else:
                return [0 for _ in range(digits - len(col))] + col

In [38]:
# Columns whose name contains 'cat' are treated as the categorical features.
categorical_columns = [col for col in train.columns if 'cat' in col]
binary = BinaryEncoder(cols=categorical_columns, return_df=False)
enc = binary.fit(train[categorical_columns], y)

# Binary-encode the training categoricals. `X` is consumed by the next cell; without this
# assignment the notebook only runs when `X` happens to be left over in the kernel.
X = enc.transform(train[categorical_columns])

In [62]:
# Columns whose name contains 'cont' are treated as the continuous features.
cont_columns = [col for col in train.columns if 'cont' in col]
# Column-wise join of `X` (presumably the binary-encoded categoricals) with the continuous features.
X_ = pd.concat( (pd.DataFrame(X), train[cont_columns] ), axis=1)

In [63]:
# (n_rows, n_features) of the assembled feature matrix
X_.shape

(188318, 243)

**Split into training and test sets**

In [66]:
# Hold out 20% of the row positions; the split is done on positions so features and target stay aligned.
itrain, itest = train_test_split(range(len(train)), test_size=.2, random_state=12121)

X_train = X_.values[itrain]
X_test  = X_.values[itest]

y_train = y.values[itrain]
y_test  = y.values[itest]

In [68]:
# 3-fold shuffled cross-validation over the training split (sklearn.cross_validation-style KFold:
# constructed from the number of samples and iterated directly for (train, test) index pairs).
kf = KFold(len(X_train), n_folds=3, shuffle=True, random_state=1231)

for index, (itr, ite) in enumerate(kf):
    print('Fold: {}'.format(index))
    
    Xtr = X_train[itr]
    ytr = y_train[itr]
    
    Xte = X_train[ite]
    yte = y_train[ite]
    
#     est = RandomForestRegressor(n_jobs=-1, random_state=1231831)
    est = xgb.XGBRegressor(n_estimators=325, gamma=0.5290, min_child_weight=4.2922, subsample=0.99, colsample_bytree=0.3, max_depth=7, seed=12313)
    est.fit(Xtr, ytr)
    
    yhat = est.predict(Xte)
    
    # The model is fit on log(loss); undo the log on both sides so MAE is reported on the original scale.
    print('MAE score: {}'.format(mean_absolute_error(np.exp(yte), np.exp(yhat))))

Fold: 0
MAE score: 1167.3550020405903
Fold: 1
MAE score: 1152.3667856184145
Fold: 2
MAE score: 1161.5399681130748


In [70]:
# est = RandomForestRegressor(n_jobs=-1, random_state=1231319)
# Refit on the full training split with the same hyper-parameters as the CV loop, then score on the hold-out split.
est = xgb.XGBRegressor(n_estimators=325, gamma=0.5290, min_child_weight=4.2922, subsample=0.99, colsample_bytree=0.3, max_depth=7, seed=12313)
est.fit(X_train, y_train)
pred = est.predict(X_test)

# Targets and predictions are in the log domain; np.exp maps both back before computing MAE.
print('Mean Absolute Error: {0}'.format(mean_absolute_error(np.exp(y_test), np.exp(pred))))

Mean Absolute Error: 1137.9178118975565


In [None]:
# Fit the encoder that was just created. The original call re-fitted `binary`, which discarded the
# train-fitted encoding and left `binary_test` unused. No target is passed: `y` has one entry per
# training row, and the encoder's fit() does not use it.
# NOTE(review): codes are assigned by a random shuffle at fit time, so an encoder fitted on the test
# frame alone will generally not reproduce the train encoding - confirm this is intended before
# feeding these features to the model trained above.
binary_test = BinaryEncoder(cols=categorical_columns, return_df=False)
enc = binary_test.fit(test[categorical_columns])