<a href="https://colab.research.google.com/github/maschere/aml/blob/main/ratemycrib_pipeline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# download house price csv
!wget "https://github.com/maschere/public-jupyter/blob/master/data/ames%20housing/ames.csv?raw=true" -O ames.csv

In [None]:
# read data
import pandas as pd
import numpy as np
# Parse the CSV once and split it by row position: the first N_TRAIN rows are
# used for training / grid search, every remaining row is held out for testing.
N_TRAIN = 2000
ames = pd.read_csv("ames.csv")
train = ames.iloc[:N_TRAIN]

#set aside test
# The hold-out starts exactly at N_TRAIN (not N_TRAIN + 1), so no row falls
# between the two splits. .copy() decouples it from the full frame.
dat_test = ames.iloc[N_TRAIN:].copy()

#get target
salePrice = train.Sale_Price.values
# drop() returns a new frame, so the features are separated from the target
# without mutating a slice in place (avoids SettingWithCopyWarning and keeps
# the cell safe to re-run).
dat = train.drop(columns=['Sale_Price'])

dat.describe()

In [None]:
!pip install mlxtend -U

In [None]:
# Seed handed to estimators via `random_state` (e.g. the random forest below)
# so that repeated runs of the notebook give the same results.
RANDOM_SEED = 124
from mlxtend.regressor import StackingCVRegressor
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.svm import LinearSVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_squared_error

# select models
# Candidate regressors for the stack. Every estimator that draws random numbers
# gets the shared seed so results are repeatable between runs.
ridge = Ridge()
lasso = Lasso()
rf = RandomForestRegressor(random_state=RANDOM_SEED)
# LinearSVR's solver is randomised, so it is seeded like the forest.
svm = LinearSVR(dual=True, random_state=RANDOM_SEED)

#create pipelines
##get col types
# Partition the feature columns by dtype: object columns are treated as
# categorical, int64/float64 columns as numerical; other dtypes are left out.
column_dtypes = dat.dtypes
categorical_cols = [name for name, kind in column_dtypes.items() if kind == 'object']
numerical_cols = [name for name, kind in column_dtypes.items()
                  if kind == 'int64' or kind == 'float64']

##define preprocessing
# numerical branch: impute with SimpleImputer's default strategy, then standardise
numerical_steps = [
    ('imputer', SimpleImputer()),
    ('scaler', StandardScaler()),
]
numerical_transformer = Pipeline(steps=numerical_steps)

# categorical branch: impute with the most frequent level, then one-hot encode;
# handle_unknown='ignore' lets levels unseen during fit pass through transform
categorical_steps = [
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
]
categorical_transformer = Pipeline(steps=categorical_steps)

# route each group of columns through its own branch
preprocessor = ColumnTransformer(transformers=[
    ('num', numerical_transformer, numerical_cols),
    ('cat', categorical_transformer, categorical_cols),
])

#create stack
# Ridge and the random forest are the base regressors; Lasso is the
# meta-regressor fitted on top of them.
# StackingCVRegressor shuffles its internal cross-validation folds by default,
# so it is seeded to make the stacked model (and the grid-search scores built
# on it) reproducible.
stack = StackingCVRegressor(regressors=(ridge, rf),
                            meta_regressor=lasso,
                            random_state=RANDOM_SEED)
#put model(stack) and preprocess into pipeline
# Keeping the preprocessing inside the pipeline means it is re-fitted on the
# training part of every CV split of the outer grid search.
pipeline = Pipeline(steps=
                   [('pre', preprocessor),
                   ('model', stack)])

# do cross-val grid search
# Parameter names are nested with double underscores:
# <pipeline step>__<stack member>__<parameter>.
search_space = {
    #'model__linearsvr__C': [0.5, 1.0, 2.0],
    'model__ridge__alpha': [0.5],
    'model__randomforestregressor__n_estimators': [10, 100, 200],
    'model__meta_regressor__alpha': [2.0],
}
grid = GridSearchCV(
    estimator=pipeline,
    param_grid=search_space,
    scoring='neg_root_mean_squared_error',
    cv=3,
    refit=True,
    n_jobs=1,
    verbose=4,
)
#list all possible params to tune in the pipeline
pipeline.get_params().keys()

In [None]:
# run the fit
grid.fit(dat, salePrice)
# best_score_ is reported under the 'neg_root_mean_squared_error' scoring, i.e.
# it is a negated RMSE: values closer to zero are better.
print("Best: %f using %s" % (grid.best_score_, grid.best_params_))