In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import KFold

from sklearn.pipeline import Pipeline

from sklearn.linear_model import Lasso
from sklearn.ensemble import RandomForestRegressor

from sklearn.base import BaseEstimator, TransformerMixin

from sklearn.metrics import mean_squared_error, mean_absolute_error

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import sys
sys.path.append("..")
from source.clean import general_cleaner, drop_columns
from source.transf_category import recode_cat, make_ordinal
#from source.transf_numeric import tr_numeric
import source.transf_univ as dfp
import source.utility as ut
import source.report as rp

In [2]:
# Load the training data, add the log1p-transformed target, drop the rows with
# GrLivArea >= 4500 and remove the raw SalePrice column.
df_train = (
    pd.read_csv('../data/train.csv')
    .assign(Target=lambda frame: np.log1p(frame['SalePrice']))
    .loc[lambda frame: frame['GrLivArea'] < 4500]
    .reset_index(drop=True)
    .drop(columns='SalePrice')
)

# Hold out 20% of the rows; the split is delegated to the project helper,
# which receives 'Neighborhood' as its stratification feature.
train_set, test_set = ut.make_test(
    df_train,
    test_size=0.2,
    random_state=654,
    strat_feat='Neighborhood',
)

# Separate the target from the features in both partitions.
y = train_set.pop('Target').copy()
y_test = test_set.pop('Target')

# Shuffled 5-fold splitter reused by the cross-validation below.
folds = KFold(n_splits=5, shuffle=True, random_state=541)

In [11]:
class tr_numeric(BaseEstimator, TransformerMixin):
    """Feature-engineering transformer for the numeric columns of the housing data.

    ``transform`` always applies ``log1p`` to 'GrLivArea', '1stFlrSF' and
    'LotArea' and always drops 'HalfBath', 'BsmtHalfBath' and 'BsmtFullBath'.
    The remaining steps are switched on or off by the constructor flags.

    Parameters
    ----------
    SF_room : bool, default True
        Add 'sf_per_room' = GrLivArea / TotRmsAbvGrd.
    bedroom : bool, default True
        Add 'bedroom_prop' = BedroomAbvGr / TotRmsAbvGrd and drop 'BedroomAbvGr'.
    bath : bool, default True
        Add 'total_bath' (full baths + 0.5 * half baths) and drop 'FullBath'.
    lot : bool, default True
        Add 'lot_prop' = LotArea / GrLivArea.
    service : bool, default True
        Add 'service_area' = TotalBsmtSF + GarageArea and drop both inputs.

    Notes
    -----
    The input must be a pandas DataFrame containing the columns named above.
    Because skew removal runs first in ``transform``, the ratios built from
    'GrLivArea' and 'LotArea' use the log1p-transformed values.
    """

    def __init__(self, SF_room=True, bedroom=True, bath=True, lot=True, service=True):
        self.columns = []  # useful to well behave with FeatureUnion
        self.SF_room = SF_room
        self.bedroom = bedroom
        self.bath = bath
        self.lot = lot
        self.service = service

    def fit(self, X, y=None):
        """Nothing is learned from the data; return the transformer itself."""
        return self

    def remove_skew(self, X, column):
        """Overwrite ``column`` of ``X`` with its ``log1p`` and return ``X``.

        This helper modifies the frame it receives.
        """
        X[column] = np.log1p(X[column])
        return X

    def SF_per_room(self, X):
        """Add 'sf_per_room' (GrLivArea / TotRmsAbvGrd) when ``SF_room`` is set."""
        if self.SF_room:
            X['sf_per_room'] = X['GrLivArea'] / X['TotRmsAbvGrd']
        return X

    def bedroom_prop(self, X):
        """Add 'bedroom_prop' and drop 'BedroomAbvGr' when ``bedroom`` is set."""
        if self.bedroom:
            X['bedroom_prop'] = X['BedroomAbvGr'] / X['TotRmsAbvGrd']
            del X['BedroomAbvGr']  # the new feature makes it redundant and it is not important
        return X

    def total_bath(self, X):
        """Optionally add 'total_bath', then drop the half/basement bath columns.

        When ``bath`` is set, every column whose name contains 'FullBath'
        counts as 1 and every column whose name contains 'HalfBath' counts
        as 0.5; 'FullBath' is then dropped.  'HalfBath', 'BsmtHalfBath' and
        'BsmtFullBath' are dropped regardless of the flag.
        """
        if self.bath:
            full_cols = [col for col in X.columns if 'FullBath' in col]
            half_cols = [col for col in X.columns if 'HalfBath' in col]
            X['total_bath'] = X[full_cols].sum(axis=1) + 0.5 * X[half_cols].sum(axis=1)
            del X['FullBath']  # redundant

        # Dropped unconditionally: not useful on their own.
        del X['HalfBath']
        del X['BsmtHalfBath']
        del X['BsmtFullBath']
        return X

    def lot_prop(self, X):
        """Add 'lot_prop' (LotArea / GrLivArea) when ``lot`` is set."""
        if self.lot:
            X['lot_prop'] = X['LotArea'] / X['GrLivArea']
        return X

    def service_area(self, X):
        """Add 'service_area' and drop its two inputs when ``service`` is set."""
        if self.service:
            X['service_area'] = X['TotalBsmtSF'] + X['GarageArea']
            del X['TotalBsmtSF']
            del X['GarageArea']
        return X

    def square_year(self, X):
        """Replace 'YearBuilt' with its square (not called by ``transform``)."""
        X['YearBuilt'] = X['YearBuilt']**2
        return X

    def transform(self, X, y=None):
        """Return an engineered copy of ``X``; the input frame is left untouched.

        The resulting column index is stored in ``self.columns`` so that
        ``get_features_name`` can report it.
        """
        # The helpers below assign and delete columns in place.  Working on a
        # copy keeps the caller's frame intact, so transforming the same frame
        # twice no longer double-applies log1p or fails on deleted columns.
        X = X.copy()

        for col in ['GrLivArea', '1stFlrSF', 'LotArea']:
            X = self.remove_skew(X, col)

        X = self.SF_per_room(X)
        X = self.bedroom_prop(X)
        X = self.total_bath(X)
        X = self.lot_prop(X)
        X = self.service_area(X)
        # square_year is intentionally not applied here.

        self.columns = X.columns
        return X

    def get_features_name(self):
        """Return the columns produced by the last ``transform`` call ([] before any)."""
        return self.columns

In [12]:
# Numeric branch: select numeric columns, median-impute, then engineer
# features with the lot / bedroom / sf-per-room additions switched off.
numeric_lasso = Pipeline(steps=[
    ('fs', dfp.feat_sel('numeric')),
    ('imp', dfp.df_imputer(strategy='median')),
    ('transf', tr_numeric(lot=False, bedroom=False, SF_room=False)),
])

# Categorical branch: select categorical columns, impute with the most
# frequent value, then apply ordinal encoding, recoding and dummies.
cat_lasso = Pipeline(steps=[
    ('fs', dfp.feat_sel('category')),
    ('imp', dfp.df_imputer(strategy='most_frequent')),
    ('ord', make_ordinal(
        ['BsmtQual', 'KitchenQual', 'ExterQual', 'HeatingQC'],
        extra_cols=['BsmtExposure', 'BsmtCond', 'ExterCond'],
        include_extra='include',
    )),
    ('recode', recode_cat()),
    ('dummies', dfp.dummify(drop_first=True)),
])

# Combine both branches (categorical first, numeric second).
processing_lasso = dfp.FeatureUnion_df(
    transformer_list=[
        ('cat', cat_lasso),
        ('num', numeric_lasso),
    ],
)

# Full model: cleaning -> feature processing -> scaling -> column dropping -> Lasso.
lasso_pipe = Pipeline(steps=[
    ('gen_cl', general_cleaner()),
    ('proc', processing_lasso),
    ('scaler', dfp.df_scaler(method='standard')),
    ('dropper', drop_columns(lasso=True)),
    ('lasso', Lasso(alpha=0.001, tol=0.005)),
])

In [13]:
# Cross-validate lasso_pipe on the training features and target using the
# `folds` splitter.
# NOTE(review): ut.cv_score is a project helper that is not visible here; judging
# by the names it presumably returns out-of-fold predictions and, because
# imp_coef=True, a per-feature coefficient summary - confirm in source/utility.
lasso_oof, coefs = ut.cv_score(train_set, y, folds, lasso_pipe, imp_coef=True)

# Show the coefficient table as the cell output.
coefs

Unnamed: 0_level_0,mean,std
feat,Unnamed: 1_level_1,Unnamed: 2_level_1
OverallQual,0.083286,0.003093
service_area,0.078932,0.002313
2ndFlrSF,0.049664,0.006035
1stFlrSF,0.049538,0.006489
GrLivArea,0.048768,0.008201
Neighborhood,0.044442,0.003186
OverallCond,0.043416,0.002327
YearBuilt,0.031246,0.004268
LotArea,0.02909,0.002211
BsmtUnfSF,-0.02504,0.000229


In [6]:
# Base: scores for the baseline pipeline configuration.
# RMSE is computed on the scale of y (log1p of SalePrice); MAE is computed
# after expm1 on both arrays, i.e. back on the original price scale.
print(f'RMSE: {round(np.sqrt(mean_squared_error(y, lasso_oof)), 4)}')
print(f'MAE: {round(mean_absolute_error(np.expm1(y), np.expm1(lasso_oof)), 4)}')

# NOTE(review): rp.plot_predictions is a project helper; presumably it plots
# predictions against the true target - confirm in source/report.
rp.plot_predictions(train_set, y, lasso_oof)

RMSE: 0.1173
MAE: 14797.6254


In [20]:
# Experiment: "neigh in dummies" - presumably Neighborhood encoded as dummy
# variables (TODO confirm; the pipeline change itself is not recorded in this cell).
# The scoring code is the same as for the baseline: RMSE on the log1p scale of y,
# MAE after expm1 on the original price scale. The printed numbers belong to
# the run made with that configuration.
print(f'RMSE: {round(np.sqrt(mean_squared_error(y, lasso_oof)), 4)}')
print(f'MAE: {round(mean_absolute_error(np.expm1(y), np.expm1(lasso_oof)), 4)}')

rp.plot_predictions(train_set, y, lasso_oof)

RMSE: 0.1406
MAE: 18943.4766


In [6]:
# Experiment: "no neigh" - presumably the Neighborhood feature removed
# (TODO confirm; the pipeline change itself is not recorded in this cell).
# RMSE is on the log1p scale of y; MAE is after expm1, on the original price scale.
print(f'RMSE: {round(np.sqrt(mean_squared_error(y, lasso_oof)), 4)}')
print(f'MAE: {round(mean_absolute_error(np.expm1(y), np.expm1(lasso_oof)), 4)}')

rp.plot_predictions(train_set, y, lasso_oof)

RMSE: 0.1191
MAE: 14894.7199


In [6]:
# Experiment: "with ext1st" - presumably the Exterior1st feature included
# (TODO confirm; the pipeline change itself is not recorded in this cell).
# RMSE is on the log1p scale of y; MAE is after expm1, on the original price scale.
print(f'RMSE: {round(np.sqrt(mean_squared_error(y, lasso_oof)), 4)}')
print(f'MAE: {round(mean_absolute_error(np.expm1(y), np.expm1(lasso_oof)), 4)}')

rp.plot_predictions(train_set, y, lasso_oof)

RMSE: 0.1451
MAE: 19686.0463


In [6]:
# Experiment: "with ext1st, only metal" - presumably Exterior1st reduced to a
# metal / non-metal indicator (TODO confirm; the change is not recorded in this cell).
# RMSE is on the log1p scale of y; MAE is after expm1, on the original price scale.
print(f'RMSE: {round(np.sqrt(mean_squared_error(y, lasso_oof)), 4)}')
print(f'MAE: {round(mean_absolute_error(np.expm1(y), np.expm1(lasso_oof)), 4)}')

rp.plot_predictions(train_set, y, lasso_oof)

RMSE: 0.1375
MAE: 18817.7917


In [6]:
# Experiment: "with more smoothing".
# NOTE(review): which step was smoothed is not visible in this cell - document
# the parameter that was changed.
# RMSE is on the log1p scale of y; MAE is after expm1, on the original price scale.
print(f'RMSE: {round(np.sqrt(mean_squared_error(y, lasso_oof)), 4)}')
print(f'MAE: {round(mean_absolute_error(np.expm1(y), np.expm1(lasso_oof)), 4)}')

rp.plot_predictions(train_set, y, lasso_oof)

RMSE: 0.1176
MAE: 14863.9977


In [10]:
# Experiment: "with squared year" - presumably run with tr_numeric.square_year
# enabled in transform (that call is currently commented out; TODO confirm).
# RMSE is on the log1p scale of y; MAE is after expm1, on the original price scale.
print(f'RMSE: {round(np.sqrt(mean_squared_error(y, lasso_oof)), 4)}')
print(f'MAE: {round(mean_absolute_error(np.expm1(y), np.expm1(lasso_oof)), 4)}')

rp.plot_predictions(train_set, y, lasso_oof)

RMSE: 0.1173
MAE: 14796.4925


In [14]:
# Experiment: "removing skew after" - presumably the log1p skew removal was
# moved after the other engineered features (TODO confirm; the change is not
# recorded in this cell).
# RMSE is on the log1p scale of y; MAE is after expm1, on the original price scale.
print(f'RMSE: {round(np.sqrt(mean_squared_error(y, lasso_oof)), 4)}')
print(f'MAE: {round(mean_absolute_error(np.expm1(y), np.expm1(lasso_oof)), 4)}')

rp.plot_predictions(train_set, y, lasso_oof)

RMSE: 0.1173
MAE: 14797.6254
