**Pandas pipelines in preprocessing**  

Inspired by this video: https://www.youtube.com/watch?v=yXGCKqo5cEY

In [121]:
# Import libraries (standard library first, then third-party)
import pickle
import warnings

import numpy as np
import pandas as pd

# IPython.display is the public import location for display/HTML; importing
# them from IPython.core.display has been deprecated since IPython 7.14.
from IPython.display import display, HTML

# Widen the notebook container to 80% of the browser window.
display(HTML("<style>.container { width:80% !important; }</style>"))
# Shared seed value (not referenced by the cells visible in this section).
random_state = 17
# NOTE(review): this silences every warning category, including pandas'
# chained-assignment warnings — consider narrowing the filter.
warnings.filterwarnings("ignore")

Import dataset

In [122]:
# Data source: direct-download link to the CSV file
url = 'https://docs.google.com/uc?export=download&id=1k21iUIrz0NjfiLE_j-oBQm1bNu3wASX6'
# Read the CSV over the network; the first CSV column becomes the DataFrame index
data = pd.read_csv(url, index_col=0)
# Save a pickled copy of the raw dataset in the current working directory
with open("house_prices_df.pkl", "wb") as f:
    pickle.dump(data, f)

Define pandas pipeline elements

In [123]:
def start_pipeline(dataf):
    """Return a copy of ``dataf`` to serve as the first step of a pipeline.

    Later steps modify the frame they receive, so starting from a copy keeps
    the caller's original DataFrame free of side effects.
    """
    working_copy = dataf.copy()
    return working_copy

In [124]:
def clean_dataset(dataf):
    """Remove every space character from the column names of ``dataf``.

    The columns are renamed on ``dataf`` itself (no copy is made) and the same
    object is returned, so the function can be used as a ``.pipe`` step.
    Non-string labels (for example integer column names) are left unchanged
    instead of raising ``AttributeError``.
    """
    dataf.columns = [
        label.replace(" ", "") if isinstance(label, str) else label
        for label in dataf.columns
    ]
    return dataf

In [125]:
# Select numerical and categorical columns 
# Used in pandas pipeline
def select_num_cols(dataf):
    """Return the names of the int/float columns of ``dataf``.

    Parameters
    ----------
    dataf : pandas.DataFrame
        Frame whose column dtypes are inspected.

    Returns
    -------
    list
        Column labels with an int or float dtype, in their original order.
    """
    # Inspect the frame that was passed in rather than the notebook-global
    # `data`, so the result always matches the caller's current columns.
    numerical_columns = dataf.select_dtypes(
        include=["int", "float"]).columns.to_list()
    return numerical_columns
    
def select_cat_cols(dataf):
    """Return the names of the columns of ``dataf`` that are not int/float.

    Parameters
    ----------
    dataf : pandas.DataFrame
        Frame whose column dtypes are inspected.

    Returns
    -------
    list
        Labels of every column whose dtype is neither int nor float, in
        their original order.
    """
    # Inspect the frame that was passed in rather than the notebook-global
    # `data`, so the result always matches the caller's current columns.
    categorical_columns = dataf.select_dtypes(
        exclude=["int", "float"]).columns.to_list()
    return categorical_columns

In [126]:
def transform_target(dataf):
    """Replace the last numeric column of ``dataf`` with its natural logarithm.

    The column to transform is the final entry returned by
    ``select_num_cols``. The column is overwritten on ``dataf`` itself and the
    same frame is returned so the function can be chained with ``.pipe``.
    """
    target_name = select_num_cols(dataf)[-1]
    logged = np.log(dataf[target_name])
    dataf[target_name] = logged
    return dataf

In [127]:
def remove_outliers(dataf, on=False, n_sigma=3):
    """Optionally drop rows whose target value is far from the target mean.

    The target is the last column returned by ``select_num_cols``. A row is
    dropped when its target value lies more than ``n_sigma`` population
    standard deviations below or above the mean of the target column.

    Parameters
    ----------
    dataf : pandas.DataFrame
        Frame to filter.
    on : bool, default False
        When False the frame is returned unchanged (same object).
    n_sigma : float, default 3
        Half-width of the accepted band, in standard deviations.

    Returns
    -------
    pandas.DataFrame
        ``dataf`` itself when ``on`` is False; otherwise a new frame without
        the outlier rows and with a fresh 0..n-1 index (the previous index is
        discarded).
    """
    if not on:
        return dataf

    target = dataf[select_num_cols(dataf)[-1]]
    center = target.mean()
    # ddof=0 keeps the population standard deviation that np.std computes.
    spread = target.std(ddof=0)
    is_outlier = ((target < (center - n_sigma * spread))
                  | (target > (center + n_sigma * spread)))
    # Filter with a boolean mask instead of mutating the column Series and
    # re-selecting rows by label: nothing derived from `dataf` is modified in
    # place, and a non-unique index cannot duplicate rows.
    return dataf.loc[~is_outlier].reset_index(drop=True)

In [128]:
# Pandas pipeline: each step receives the frame returned by the previous one
df = (
    data
    .pipe(start_pipeline)    # work on a copy of the raw frame
    .pipe(clean_dataset)     # strip spaces from the column names
    .pipe(transform_target)  # log-transform the last numeric column
    .pipe(remove_outliers)   # pass-through here: `on` defaults to False
)

In [129]:
# Show the first ten rows of the processed frame
df.head(10)

Unnamed: 0_level_0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,...,0,,,,0,2,2008,WD,Normal,12.247694
2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,...,0,,,,0,5,2007,WD,Normal,12.109011
3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,...,0,,,,0,9,2008,WD,Normal,12.317167
4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,...,0,,,,0,2,2006,WD,Abnorml,11.849398
5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,...,0,,,,0,12,2008,WD,Normal,12.429216
6,50,RL,85.0,14115,Pave,,IR1,Lvl,AllPub,Inside,...,0,,MnPrv,Shed,700,10,2009,WD,Normal,11.8706
7,20,RL,75.0,10084,Pave,,Reg,Lvl,AllPub,Inside,...,0,,,,0,8,2007,WD,Normal,12.634603
8,60,RL,,10382,Pave,,IR1,Lvl,AllPub,Corner,...,0,,,Shed,350,11,2009,WD,Normal,12.206073
9,50,RM,51.0,6120,Pave,,Reg,Lvl,AllPub,Inside,...,0,,,,0,4,2008,WD,Abnorml,11.77452
10,190,RL,50.0,7420,Pave,,Reg,Lvl,AllPub,Corner,...,0,,,,0,1,2008,WD,Normal,11.67844
