In [4]:
import os

# Move up to the project root so that the `pipelines` package and the
# `data/` files used by the cells below resolve. The guard keeps this cell
# idempotent: without it, every re-run climbs one directory further and the
# later imports / relative paths stop working.
if not os.path.isdir('pipelines'):
    os.chdir('..')

In [35]:
import pandas as pd
from sklearn.base import TransformerMixin
from pipelines.category_imputer import CategoryImputer
from pipelines.category_merger import CategoryMerger
from sklearn.preprocessing import OneHotEncoder
from collections import defaultdict

In [11]:
# Read the training and validation splits, in that order, from the data folder.
df_train, df_val = (
    pd.read_csv(csv_path)
    for csv_path in ('data/train.csv', 'data/val.csv')
)

In [12]:
# Names of the categorical columns handled in this notebook.
cat_columns = [
    'EASEMENT',
    'BLDGCL',
    'TAXCLASS',
    'EXMPTCL',
    'EXCD2',
    'EXT',
]

In [13]:
# Build both preprocessing steps, then fit each one on the training split.
# The merger's fit stays the last expression so the cell still displays it.
category_imputer, category_merger = CategoryImputer(), CategoryMerger()
category_imputer.fit(df_train)
category_merger.fit(df_train)

<pipelines.category_merger.CategoryMerger at 0x7f579c4c9640>

In [14]:
# Run the fitted steps in pipeline order (imputer first, then merger),
# applying each one to the training split and then the validation split.
for fitted_step in (category_imputer, category_merger):
    df_train = fitted_step.transform(df_train)
    df_val = fitted_step.transform(df_val)

In [53]:
class CategoryEncoder(TransformerMixin):
    """Encode the categorical columns of the frame as extra feature columns.

    ``EXMPTCL`` is replaced by the code produced by ``map_exemption``. The
    columns listed in ``one_hot_cols`` are one-hot encoded and the resulting
    indicator columns are appended to the frame; the source columns are kept.
    """

    def fit(self, X, y=None, **kwargs):
        """Learn the one-hot categories from ``X``.

        Parameters
        ----------
        X : pandas.DataFrame
            Frame containing every column in ``one_hot_cols``.
        y : ignored
        **kwargs : ignored

        Returns
        -------
        CategoryEncoder
            ``self``, so the call can be chained.
        """
        self.one_hot_cols = ['BLDGCL', 'EXT', 'EASEMENT', 'EXCD2']
        # scikit-learn renamed `sparse` to `sparse_output` (1.2) and later
        # removed the old name; try the current spelling first and fall back
        # for older releases that reject the unknown keyword.
        try:
            self.one_hot_transformer = OneHotEncoder(sparse_output=False)
        except TypeError:
            self.one_hot_transformer = OneHotEncoder(sparse=False)
        # Values are cast to str so every category is compared as text.
        self.one_hot_transformer.fit(X[self.one_hot_cols].astype('str'))
        return self

    def map_exemption(self, ex):
        """Map one ``EXMPTCL`` value to its code.

        ``'No'`` maps to ``0``. Any other value is reduced to its second
        character, returned as an ``int`` when that character is a decimal
        digit so the column is not a mix of ints and one-character strings.
        A non-digit second character is returned unchanged.

        NOTE(review): presumably the values look like ``'X1'``..``'X9'`` --
        confirm against the data.
        """
        if ex == 'No':
            return 0
        level = ex[1]
        return int(level) if level.isdecimal() else level

    def transform(self, X, y=None, **kwargs):
        """Return a copy of ``X`` with encoded ``EXMPTCL`` and one-hot columns.

        Parameters
        ----------
        X : pandas.DataFrame
            Frame containing ``EXMPTCL`` and every column in ``one_hot_cols``.
        y : ignored
        **kwargs : ignored

        Returns
        -------
        pandas.DataFrame
            All columns of ``X`` (with ``EXMPTCL`` mapped) followed by one
            indicator column per learned category, on the same index as ``X``.
        """
        X = X.copy()  # never mutate the caller's frame
        X['EXMPTCL'] = X['EXMPTCL'].apply(self.map_exemption)
        one_hot_transformed = self.one_hot_transformer.transform(
            X[self.one_hot_cols].astype('str')
        )
        one_hot_df = pd.DataFrame(
            one_hot_transformed,
            # `get_feature_names` was removed from scikit-learn; the `_out`
            # variant yields the same "<column>_<category>" names.
            columns=self.one_hot_transformer.get_feature_names_out(
                self.one_hot_cols
            ),
            # Reuse X's index: concat aligns on index labels, so a fresh
            # RangeIndex would misalign rows (and add NaN rows) whenever X
            # is not indexed 0..n-1.
            index=X.index,
        )
        return pd.concat([X, one_hot_df], axis='columns')

In [54]:
# Fit the encoder on the training split (fit returns the encoder itself),
# then display the encoded validation split as the cell's output.
encoder = CategoryEncoder().fit(df_train)
encoder.transform(df_val)



Unnamed: 0,BBLE,BORO,BLOCK,LOT,EASEMENT,OWNER,BLDGCL,TAXCLASS,LTFRONT,LTDEPTH,...,EXCD2_1017.0,EXCD2_1019.0,EXCD2_1101.0,EXCD2_1200.0,EXCD2_1920.0,EXCD2_1986.0,EXCD2_5112.0,EXCD2_5129.0,EXCD2_5130.0,EXCD2_No
0,1002181115,1,218,1115,No,"BRODY BATEMAN, INGRID",Public use,2,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,1001901520,1,190,1520,No,LIM LAUREEN,Public use,2,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,3007691115,3,769,1115,No,"ZHAN, WEI MIN",Public use,2,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,5024001172,5,2400,1172,No,FOXWOOD SQUARE LTD,Public use,2,3,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,3008761028,3,876,1028,No,FLORAFAUNA INDUSTRIES,Public use,1A,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1787,1001510020,1,151,20,No,D. E. A. BUILDING COR,Industrial,4,55,96,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1788,5015970016,5,1597,16,No,CARANNANTE CARMINE,Family,1,60,115,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1789,4089370032,4,8937,32,No,ISSA LIKA SYBE LIKA,Family,1,40,100,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1790,1000163094,1,16,3094,No,"HERDTER, MARK",Public use,2,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
