In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
import scipy
from sklearn.linear_model import LogisticRegression

In [2]:
# Read the raw table and discard the `id` column in one step.
df = pd.read_csv('../Data/cat.csv').drop(columns='id')

# Detach the label column: `dtarget` keeps it, `df` retains only the features.
dtarget = df.pop('target')

df.head()

Unnamed: 0,bin_0,bin_1,bin_2,bin_3,bin_4,nom_0,nom_1,nom_2,nom_3,nom_4,...,nom_8,nom_9,ord_0,ord_1,ord_2,ord_3,ord_4,ord_5,day,month
0,0,0,0,T,Y,Green,Triangle,Snake,Finland,Bassoon,...,c389000ab,2f4cb3d51,2,Grandmaster,Cold,h,D,kr,2,2
1,0,1,0,T,Y,Green,Trapezoid,Hamster,Russia,Piano,...,4cd920251,f83c56c21,1,Grandmaster,Hot,a,A,bF,7,8
2,0,0,0,F,Y,Blue,Trapezoid,Lion,Russia,Theremin,...,de9c9f684,ae6800dd0,1,Expert,Lava Hot,h,R,Jc,7,2
3,0,1,0,F,Y,Red,Trapezoid,Snake,Canada,Oboe,...,4ade6ab69,8270f0d71,1,Grandmaster,Boiling Hot,i,D,kW,2,1
4,0,0,0,F,N,Red,Trapezoid,Lion,Canada,Oboe,...,cb43ab175,b164b72a7,1,Grandmaster,Freezing,a,R,qP,7,8


In [3]:
# One shared lookup for both letter-coded flags: T/F and Y/N become 1/0.
map_bin_3_4 = {'T': 1, 'F': 0, 'Y': 1, 'N': 0}

# Remove bin_0, then recode bin_3 and bin_4 in place of their letter values.
df = df.drop(columns='bin_0').assign(
    bin_3=lambda frame: frame['bin_3'].map(map_bin_3_4),
    bin_4=lambda frame: frame['bin_4'].map(map_bin_3_4),
)

In [7]:
# Break the two-character ord_5 code into one column per character, then
# discard the combined column.
df = (
    df.assign(
        ord_5_1=df['ord_5'].str.get(0),
        ord_5_2=df['ord_5'].str.get(1),
    )
    .drop(columns='ord_5')
)

In [10]:
# ord_1 labels listed in the order used for their integer ranks.
ord_1_values = ['Novice', 'Contributor', 'Expert', 'Master', 'Grandmaster']


def map_ord_1(x):
    """Return the position of label `x` in `ord_1_values`.

    Raises ValueError (from `list.index`) when `x` is not a listed label.
    """
    return ord_1_values.index(x)


# Replace every ord_1 label with its rank.
df['ord_1'] = df['ord_1'].map(map_ord_1)

In [11]:
# ord_2 labels listed in the order used for their integer ranks.
ord_2_values = ['Freezing', 'Cold', 'Warm', 'Hot', 'Boiling Hot', 'Lava Hot']


def map_ord_2(x):
    """Return the position of label `x` in `ord_2_values`.

    Raises ValueError (from `list.index`) when `x` is not a listed label.
    """
    return ord_2_values.index(x)


# Replace every ord_2 label with its rank.
df['ord_2'] = df['ord_2'].map(map_ord_2)

In [15]:
import string


def map_to_ascii_index(x):
    """Return the position of `x` within `string.ascii_letters`.

    `ascii_letters` lists the lowercase alphabet first and the uppercase
    alphabet second, so 'a' maps to 0 and 'A' maps to 26.
    """
    return string.ascii_letters.index(x)


# Only these three letter-coded columns are converted here; 'ord_5_2' is not
# in the list, so it keeps its original character values.
for column in ('ord_3', 'ord_4', 'ord_5_1'):
    df[column] = df[column].map(map_to_ascii_index)

In [16]:
from sklearn.base import BaseEstimator, TransformerMixin


# This implementation supports numeric features only
class ThermometerEncoder(BaseEstimator, TransformerMixin):
    """Thermometer (unary) encoder for non-negative integer ordinal columns.

    For a column ``c`` whose largest level seen in ``fit`` is ``m``, a level
    ``v`` is expanded into ``m`` 0/1 indicator columns ``c_0 .. c_{m-1}``,
    where ``c_j`` is 1 exactly when ``v > j``.  Level 0 therefore encodes as
    all zeros and level ``m`` as all ones.

    Parameters
    ----------
    cols : list of column labels, optional
        Columns to encode.  ``None`` (the default) encodes every column of
        the frame passed to ``fit``.
    drop_invariant : bool, default True
        Drop indicator columns that were constant on the data passed to
        ``fit``.  The decision is taken once, in ``fit``, so every later
        ``transform`` call returns the same set of columns regardless of
        which rows it is given.

    Attributes
    ----------
    bars : dict
        Column label -> ``(m + 1, m)`` integer lookup table whose row ``v``
        is the code of level ``v``.
    invariant_cols_ : list of str
        Names of the indicator columns found constant during ``fit``.
    """

    def __init__(self, cols=None, drop_invariant=True):
        self.cols = cols
        self.drop_invariant = drop_invariant

    def get_params(self, deep=True):
        """Return the constructor parameters as a dict (``deep`` is unused)."""
        return {'cols': self.cols, 'drop_invariant': self.drop_invariant}

    def set_params(self, **parameters):
        """Set each given name as an attribute and return ``self``."""
        for parameter, value in parameters.items():
            setattr(self, parameter, value)
        return self

    @staticmethod
    def _indicator_names(c, width):
        """Build the output column names ``c_0 .. c_{width-1}``."""
        return [c + '_' + str(k) for k in range(width)]

    @staticmethod
    def _check_non_negative(c, lowest):
        """Reject negative levels, which numpy would index from the end."""
        if lowest < 0:
            raise ValueError(
                "ThermometerEncoder: column %r contains negative level %r; "
                "levels must be non-negative integers" % (c, lowest)
            )

    def fit(self, X, y=None):
        """Learn the lookup table and the invariant indicators per column.

        Parameters
        ----------
        X : pandas.DataFrame
            Frame holding the columns to encode.
        y : ignored

        Returns
        -------
        self

        Raises
        ------
        ValueError
            If a column to encode contains a negative level.
        """
        cols = list(X.columns) if self.cols is None else self.cols
        self.bars = {}
        self.invariant_cols_ = []
        for c in cols:
            lowest = X[c].min()
            self._check_non_negative(c, lowest)
            k = np.arange(X[c].max() + 1)
            # Row v, column j is 1 iff j < v.
            bars = (k[:-1] < k.reshape(-1, 1)).astype(int)
            self.bars[c] = bars
            # Indicator j is 1 iff level > j.  The largest level sets every
            # indicator, so indicator j is constant on the fit data exactly
            # when the smallest level sets it too, i.e. when j < lowest.
            names = self._indicator_names(c, bars.shape[1])
            self.invariant_cols_.extend(names[:int(lowest)])
        return self

    def transform(self, X, y=None):
        """Encode the fitted columns of ``X``.

        Parameters
        ----------
        X : pandas.DataFrame
            Frame holding the columns that were fitted.
        y : ignored

        Returns
        -------
        pandas.DataFrame
            Indicator columns only, indexed like ``X``.  When
            ``drop_invariant`` is true, the indicators found constant in
            ``fit`` are removed.
        """
        cols = list(self.bars) if self.cols is None else self.cols
        blocks = [self.transform_one(X, c) for c in cols]
        if blocks:
            # Every block carries X.index, so a positional side-by-side
            # concat keeps one output row per input row even when the index
            # holds duplicate labels (an index join would multiply them).
            out = pd.concat(blocks, axis=1)
        else:
            out = pd.DataFrame(index=X.index)

        if self.drop_invariant:
            out = out.drop(columns=self.invariant_cols_, errors='ignore')

        return out

    def transform_one(self, X, c):
        """Encode the single column ``c`` of ``X``.

        Returns a DataFrame indexed like ``X`` with one indicator column per
        threshold learned for ``c``.

        Raises
        ------
        ValueError
            If the column contains a negative level.
        IndexError
            Raised by numpy when a level exceeds the largest one seen in
            ``fit``.
        """
        bars = self.bars[c]
        self._check_non_negative(c, X[c].min())
        levels = X[c].to_numpy()
        return pd.DataFrame(
            bars[levels],
            index=X.index,
            columns=self._indicator_names(c, bars.shape[1]),
        )

In [18]:
# Fit the encoder on ord_1 only and expand that column into its
# thermometer indicator columns in a single step.
thermometer = ThermometerEncoder(cols=['ord_1'])
thermometer_traintest = thermometer.fit_transform(X=df)

In [19]:
# Display the frame returned by the thermometer encoder for ord_1.
thermometer_traintest

Unnamed: 0,ord_1_0,ord_1_1,ord_1_2,ord_1_3
0,1,1,1,1
1,1,1,1,1
2,1,1,0,0
3,1,1,1,1
4,1,1,1,1
...,...,...,...,...
299995,1,0,0,0
299996,0,0,0,0
299997,0,0,0,0
299998,1,1,1,0


In [20]:
# Display the integer-coded ord_1 column, for comparison with its
# thermometer-encoded indicator columns.
df['ord_1']

0         4
1         4
2         2
3         4
4         4
         ..
299995    1
299996    0
299997    0
299998    3
299999    1
Name: ord_1, Length: 300000, dtype: int64

In [13]:
# NOTE(review): scratch check of the letter -> index mapping. Its execution
# count (13) is lower than that of the cell that overwrites ord_3 with
# integers (15), yet it sits after that cell in the file. Run top to bottom,
# map_to_ascii_index would receive integers instead of letters here —
# confirm whether this cell should move above the encoding cell or be removed.
df['ord_3'].apply(map_to_ascii_index)

0          7
1          0
2          7
3          8
4          0
          ..
299995    10
299996     7
299997    14
299998     7
299999     8
Name: ord_3, Length: 300000, dtype: int64

In [14]:
# NOTE(review): the saved output of this cell shows raw letters with dtype
# object, and its execution count (14) precedes the encoding cell (15), so
# the output appears to predate the ord_3 encoding — re-run in file order to
# refresh it.
df['ord_3']

0         h
1         a
2         h
3         i
4         a
         ..
299995    k
299996    h
299997    o
299998    h
299999    i
Name: ord_3, Length: 300000, dtype: object