In [84]:
from sklearn.pipeline import make_pipeline
from sklearn.base import TransformerMixin
from sklearn.pipeline import Pipeline
from pandas.io.json import json_normalize
import json

from sklearn import preprocessing

import pandas as pd
import numpy as np

# To display all columns when printing
pd.set_option('display.max_columns', None)

In [65]:
class InconsistencyTransformer(TransformerMixin):
    def __init__(self, column, source, target):
        self.column = column
        self.source = source
        self.target = target
        
    def fit(self, df, y=None, **fit_params):
        return self
    
    def transform(self, df, **transform_params):
        sample = df[self.column] == self.source
        df.loc[sample, self.column] = self.target
        return df

In [64]:
class ColumnFixer(TransformerMixin):
    def __init__(self, column, operation):
        self.column = column
        self.operation = operation
        
    def fit(self, df, y=None, **fit_params):
        return self
    
    def transform(self, data, **transform_params):
        data[self.column] = data[self.column].apply(self.operation)
        return data

In [63]:
class JSONTransformer(TransformerMixin):
    def __init__(self, column):
        self.column = column
        
    def fit(self, df, y=None, **fit_params):
        return self
    
    def transform(self, data, **transform_params):
        jsons = pd.DataFrame(data[self.column])
        jsons[self.column] = jsons[self.column].apply(lambda x : x.replace('\'', '\"'))
        jsons = jsons[self.column].apply(json.loads).values.tolist() 
        medical_info = pd.DataFrame(jsons)
        
        data = pd.concat([data, pd.DataFrame(medical_info)], axis=1)
        data = data.drop(columns=[self.column])
        
        return data

In [95]:
def featurize(features):
  transformations = [
                            ('TSH', preprocessing.Normalizer())
  ]

  return DataFrameMapper(filter(lambda x: x[0] in df.columns, transformations))

pipeline = Pipeline([
    ('featurize', featurize(features)), 
    ('forest', RandomForestClassifier())
])

# Code start here

In [None]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [103]:
ppl = Pipeline([
            ('transform JSON', JSONTransformer('medical_info')),
    
            ('transform FALSE to f', InconsistencyTransformer('on thyroxine', 'FALSE', 'f')),
            ('transform TRUE to t', InconsistencyTransformer('on thyroxine', 'TRUE', 't')),
            ('transform F to f', InconsistencyTransformer('on thyroxine', 'F', 'f')),
            ('transform T to t', InconsistencyTransformer('on thyroxine', 'T', 't')),
    
            ('transform ?? to NaN', InconsistencyTransformer('query hypothyroid', '??', '?')),
            ('transform nan to ?', InconsistencyTransformer('query hypothyroid', 'nan', '?')),
    
            ('fix workclass " ?" to "?"', InconsistencyTransformer('workclass', ' ?', '?')),
            ('fix occupation " ?" to "?"', InconsistencyTransformer('occupation', ' ?', '?')),
            ('fix native-country " ?" to "?"', InconsistencyTransformer('native-country', ' ?', '?')),
    
            ('fix the .|num in class', ColumnFixer('class', lambda x : x.split('.')[0])),
            ('transform - to _ in relationship', ColumnFixer('relationship', lambda x : x.replace('-','_'))),
            ('lower case the referral source', ColumnFixer('referral source', lambda x : x.lower())),
            ('TSH', preprocessing.Normalizer())
      ])

model = ppl.fit(train)

train_t = model.transform(train)

ValueError: could not convert string to float: 'f'

In [100]:
train_t['TSH'].describe()

count    2516.000000
mean        4.672150
std        21.449453
min         0.005000
25%         0.440000
50%         1.400000
75%         2.600000
max       478.000000
Name: TSH, dtype: float64