In [1]:
from itertools import takewhile, starmap
from functools import partial, reduce
 
def compose_two(g, f):
    """Chain two callables: the result calls ``f`` first and feeds its value to ``g``.

    ``compose_two(g, f)(*args, **kwargs) == g(f(*args, **kwargs))``; only ``f``
    receives the caller's positional and keyword arguments.
    """
    def composed(*args, **kwargs):
        inner_result = f(*args, **kwargs)
        return g(inner_result)

    return composed
 
def compose(*funcs):
    """Compose any number of functions in mathematical (right-to-left) order.

    ``compose(f, g, h)(x) == f(g(h(x)))``: the functions are written left to
    right as in ``f . g . h``, but the LAST one is applied first. Only that
    last function receives the caller's ``*args``/``**kwargs``; every other
    function receives the single value returned by its right-hand neighbour.

    With a single function, that function itself is returned. With no
    functions at all, the identity function is returned (the previous
    implementation raised ``TypeError`` from ``reduce`` on an empty sequence).
    """
    if not funcs:
        # Identity is the neutral element of function composition.
        return lambda x: x
    return reduce(compose_two, funcs)
 
def transform_args(func, transformer):
    """Wrap ``func`` so that its positional arguments are pre-processed.

    The wrapper collects its positional arguments into one tuple, passes that
    tuple to ``transformer`` and unpacks whatever iterable comes back into
    ``func``. Keyword arguments are not accepted by the wrapper.
    """
    def wrapper(*args):
        rewritten = transformer(args)
        return func(*rewritten)

    return wrapper
 
# composed_partials((f, a), (g, b), ...) turns every tuple into partial(f, a),
# partial(g, b), ... and composes the resulting callables.
composed_partials = transform_args(
    func=compose,
    transformer=partial(starmap, partial),
)

# pipe reverses the steps before composing them, so the steps run in the
# order in which they are listed.
pipe = transform_args(func=composed_partials, transformer=reversed)

# Example pipeline: keep leading values below 7, keep those below 2, then
# multiply each remaining value by 4.
pipe_style = pipe(
    (takewhile, lambda value: value < 7),
    (filter, lambda value: value < 2),
    (map, lambda value: 4 * value),
)

In [2]:
# Materialise the lazy pipeline into a list so the cell displays its values.
[*pipe_style(range(100))]

[0, 4]

In [3]:
from sklearn.preprocessing import Normalizer, LabelBinarizer,RobustScaler,StandardScaler, OneHotEncoder

In [4]:
import numpy as np
import pandas as pd

In [5]:
# Load the train and test sets. The context managers close the file handles
# once pandas has read them; the bare open(...) calls used before left both
# handles open.
with open("./data/train.json", "r") as train_file:
    df_train = pd.read_json(train_file)
with open("./data/test.json", "r") as test_file:
    df_test = pd.read_json(test_file)

In [6]:
from multiprocessing import Pool, cpu_count

In [7]:
# One (train, test, columns, transformer) tuple per feature, in the argument
# order of transformation(); every feature gets its own RobustScaler instance.
# "latitude" / "longitude" (no transformer) and "interest_level"
# (LabelBinarizer) are deliberately left out for now.
selection = [
    (df_train, df_test, [column], RobustScaler())
    for column in ("bathrooms", "bedrooms", "price")
]

In [97]:
def transformation(train,test,sCol,Transformer=None):
    """Fit ``Transformer`` on ``train[sCol]`` and apply it to both frames.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Input frames. Neither is modified.
    sCol : label or list of labels
        Column selection, used as ``df[sCol]``.
    Transformer : object, optional
        Anything exposing ``fit_transform`` and ``transform``. It is fitted on
        ``train`` only and then applied to ``test``.

    Returns
    -------
    (trn, tst) : tuple of pandas.DataFrame
        New frames that contain only the generated feature columns and keep
        the index of ``train`` / ``test``. When ``Transformer`` is None the
        original ``train`` and ``test`` objects are returned untouched (the
        whole frames, not only ``sCol``).

    Feature names
    -------------
    * ``<label>_<active_features_[i]>`` when the transformer has an
      ``active_features_`` attribute,
    * ``<label>_<classes_[i]>`` when it has ``classes_`` with an entry at
      position ``i``,
    * ``<label>`` for a single output column,
    * the matching entry of ``sCol`` when there is one output per input column,
    * ``<label>_<i>`` otherwise,

    where ``<label>`` is ``sCol[0]`` for a list and ``sCol`` itself otherwise.
    """
    if Transformer is None:
        return train, test

    label = sCol[0] if isinstance(sCol, list) else sCol

    def _feature_names(n_features):
        """Return one distinct column name per transformer output."""
        # Attribute lookup instead of an isinstance check on the encoder class,
        # so a transformer lacking the attribute falls through to the other
        # naming rules instead of raising.
        # NOTE(review): active_features_ is presumably only set by older
        # scikit-learn OneHotEncoder versions -- confirm.
        active = getattr(Transformer, "active_features_", None)
        classes = getattr(Transformer, "classes_", None)
        names = []
        # The position is tracked explicitly: the previous counter was never
        # incremented, so every output received the name of position 0.
        for position in range(n_features):
            if active is not None:
                # str() because the entries need not be strings.
                names.append(str(label) + '_' + str(active[position]))
            elif classes is not None and position < len(classes):
                # NOTE(review): a transformer may emit fewer columns than it
                # has classes; position 0 is then named after classes_[0], as
                # before -- confirm that this is the intended label.
                names.append(str(label) + '_' + str(classes[position]))
            elif n_features == 1:
                names.append(label)
            elif isinstance(sCol, list) and n_features == len(sCol):
                # One output per input column: keep the input column names.
                names.append(sCol[position])
            else:
                names.append(str(label) + '_' + str(position))
        return names

    def _trans(df, apply):
        """Run ``apply`` on ``df[sCol]`` and wrap the result in a DataFrame."""
        result = apply(df[sCol])
        if hasattr(result, "toarray"):
            # Sparse results are densified so they can be split per column.
            result = result.toarray()
        values = np.asarray(result)
        if values.ndim == 1:
            # A 1-D result is treated as a single feature column.
            values = values.reshape(-1, 1)
        columns = values.T
        names = _feature_names(len(columns))
        # Building the frame on df.index keeps the original index without
        # copying (or writing into) the caller's frame.
        return pd.DataFrame(dict(zip(names, columns)), index=df.index)

    # Fit on the training data only, then reuse the fitted state for test.
    trn = _trans(train, Transformer.fit_transform)
    tst = _trans(test, Transformer.transform)

    # We only send back what we need
    return (trn, tst)

In [98]:
# Scale "price" and "bathrooms" separately; each call returns a
# (train, test) pair. The LabelBinarizer run on "interest_level" stays
# disabled in this cell.
p = transformation(
    train=df_train, test=df_test, sCol=["price"], Transformer=RobustScaler()
)
ba = transformation(
    train=df_train, test=df_test, sCol=["bathrooms"], Transformer=RobustScaler()
)

In [11]:
# Create an empty DataFrame and display it.
result = pd.DataFrame()
result

In [99]:
# NOTE(review): only three positional arguments are passed, so
# ["interest_level"] binds to `test`, the LabelBinarizer binds to `sCol` and
# `Transformer` keeps its default None -- transformation() then returns its
# first two arguments untouched instead of binarizing anything.
# The intended call presumably also passes df_test; confirm that df_test has
# an "interest_level" column before changing it.
d, t = transformation(df_train,["interest_level"],LabelBinarizer())

In [100]:
# Display both values unpacked from the transformation() call above.
d, t

(        bathrooms  bedrooms                       building_id  \
 10            1.5         3  53a5b119ba8f7b61d4e010512e0dfc85   
 10000         1.0         2  c5c8a357cba207596b04d1afd1e4f130   
 100004        1.0         1  c3ba40552e2120b0acfc3cb5730bb2aa   
 100007        1.0         1  28d9ad350afeaab8027513a3e52ac8d5   
 100013        1.0         4                                 0   
 100014        2.0         4  38a913e46c94a7f46ddf19b756a9640c   
 100016        1.0         2  3ba49a93260ca5df92fde024cb4ca61f   
 100020        2.0         1  0372927bcb6a0949613ef5bf893bbac7   
 100026        1.0         1  a7efbeb58190aa267b4a9121cd0c88c0   
 100027        2.0         4                                 0   
 100030        1.0         0                                 0   
 10004         1.0         1                                 0   
 100044        1.0         2  67c9b420da4a365bc26a6cd0ef4a5320   
 100048        2.0         2                                 0   
 10005    

In [146]:
def vconcat(*list_of_df):
    """Concatenate the given DataFrames side by side (column-wise)."""
    return pd.concat(list(list_of_df), axis=1)


def zip_with(f, list_of_tuple):
    """Transpose ``list_of_tuple`` and call ``f`` once per transposed group.

    ``zip_with(f, [(a1, b1), (a2, b2)]) == [f(a1, a2), f(b1, b2)]``.
    """
    return [f(*group) for group in zip(*list_of_tuple)]


# The helpers are defined before the pool is created and run in this process.
# Worker processes look up a pickled function by name, so a function defined
# after the workers were started (as vconcat used to be) is unknown to them.
# Only the per-feature transformations are farmed out; concatenating the two
# result groups is a single cheap step that does not need the pool.
with Pool(cpu_count()) as mp:
    # One (train, test) pair of transformed frames per entry of `selection`.
    ltuples_train_test = mp.starmap(transformation, selection)

# Group all train frames and all test frames, then join each group column-wise.
train, test = zip_with(vconcat, ltuples_train_test)
    

In [None]:
# Prepend the shared train/test frames to every (label, transformer) pair.
# NOTE(review): `test2` is not defined in the visible part of the notebook --
# confirm it exists before running this cell.
[(df_train,df_test,label,transfo) for (label,transfo) in test2]

In [None]:
    # Tuple layout here is (columns, transformer, train, test), one fresh
    # RobustScaler per feature.
    # NOTE(review): this differs from the (train, test, sCol, Transformer)
    # parameter order of transformation() -- confirm before feeding it to
    # starmap.
    test3 = [
        ([column], RobustScaler(), df_train, df_test)
        for column in ("bathrooms", "bedrooms", "price")
        # "latitude" and "longitude" (no transformer) are left out for now
    ]

In [None]:
# Display the list of tuples stored in test3.
test3