The idea here is convert binary columns (many) to only one categorical column (this can be more easily used with tree algoritms). Think about something that revert one-hot-encode to categorical.

In [1]:
# read data
import numpy as np
import pandas as pd
train=pd.read_csv('../input/train.csv')
test =pd.read_csv('../input/test.csv')

In [2]:
# select binary features
bin_cols = [col for col in train.columns if '_bin' in col]
# just to test with non binary features...
cat_cols = [col for col in train.columns if '_cat' in col]

In [3]:
import warnings
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin

class BinToCat(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass
    def fit(self, X, y=None, **kwargs):
        cols=X.columns
        if(len(cols)>64):
            warnings.warn("Caution, more than 64 bin columns, 2**64 can overflow int64")
        for i in cols:
            unique_vals=X[i].unique()
            if(len(unique_vals)>2):
                raise Exception("Column "+i+" have more than 2 values, is it binary? values: "+str(unique_vals))
            if not (0 in unique_vals and 1 in unique_vals):
                raise Exception("Column "+i+" have values different from 0/1, is it binary? values: "+str(unique_vals))
        self.scale=np.array([1<<i for i in range(np.shape(X)[1])])
        
    def transform(self, X):
        return np.sum(self.scale*X,axis=1)
        
        

In [4]:
a=BinToCat()
a.fit(train[bin_cols])
t=train[0:3]
print('scale',a.scale)
print('bin    :',t[bin_cols])
print('bin2cat:',a.transform(t[bin_cols]))

scale [    1     2     4     8    16    32    64   128   256   512  1024  2048
  4096  8192 16384 32768 65536]
bin    :    ps_ind_06_bin  ps_ind_07_bin  ps_ind_08_bin  ps_ind_09_bin  ps_ind_10_bin  \
0              0              1              0              0              0   
1              0              0              1              0              0   
2              0              0              1              0              0   

   ps_ind_11_bin  ps_ind_12_bin  ps_ind_13_bin  ps_ind_16_bin  ps_ind_17_bin  \
0              0              0              0              0              1   
1              0              0              0              0              0   
2              0              0              0              1              0   

   ps_ind_18_bin  ps_calc_15_bin  ps_calc_16_bin  ps_calc_17_bin  \
0              0               0               1               1   
1              1               0               1               1   
2              0               0  

In [5]:
train['bins']=a.transform(train[bin_cols])
test['bins'] =a.transform(test[bin_cols])
print('unique length: ',len(train['bins'].unique()))
print(train['bins'])

unique length:  2520
0          78338
1          46084
2          45316
3            257
4          49409
5         119048
6          37122
7          43265
8          69892
9          37890
10        110852
11         20738
12         73988
13         41218
14         33800
15         46082
16          8452
17         12545
18         16641
19         45313
20         37889
21         33025
22         65794
23         20738
24         29697
25         90369
26         94468
27         20744
28         12545
29          4353
           ...  
595182     49409
595183     45313
595184     66561
595185     20740
595186     24834
595187     78088
595188     12545
595189     53249
595190      8449
595191     90628
595192    110850
595193     20738
595194     12290
595195    103426
595196    105473
595197     57346
595198     12296
595199     41220
595200      8449
595201     55560
595202     45314
595203       514
595204     37122
595205     10498
595206     12552
595207    110856
595208    

In [6]:
train.to_csv('train.withoutbin.csv',index=False)
test.to_csv('test.withoutbin.csv',index=False)

test with categorical values - this should raise an error

In [7]:
a=BinToCat()
a.fit(train[cat_cols])


Exception: Column ps_ind_02_cat have more than 2 values, is it binary? values: [ 2  1  4  3 -1]

test with many columns (more than 64) - this should generate a warning

In [8]:
a=BinToCat()
a.fit(train)


Exception: Column id have more than 2 values, is it binary? values: [      7       9      13 ..., 1488017 1488021 1488027]