# Authenticate to Kaggle

In [1]:
!mkdir ~/.kaggle

In [6]:
# Copy the API token file (kaggle.json in the working directory) into ~/.kaggle.
!cp kaggle.json ~/.kaggle/kaggle.json

In [7]:
# Restrict the token file to owner read/write only (mode 600).
!chmod 600 ~/.kaggle/kaggle.json

In [None]:
# Diagnostic only: list the packages installed in this environment.
!pip list

In [10]:
# Download the competition archive with the Kaggle CLI (uses the token installed above).
!kaggle competitions download -c house-prices-advanced-regression-techniques

Downloading house-prices-advanced-regression-techniques.zip to /content
  0% 0.00/199k [00:00<?, ?B/s]
100% 199k/199k [00:00<00:00, 66.4MB/s]


In [11]:
!unzip house-prices-advanced-regression-techniques.zip

Archive:  house-prices-advanced-regression-techniques.zip
  inflating: data_description.txt    
  inflating: sample_submission.csv   
  inflating: test.csv                
  inflating: train.csv               


# Pipeline Practice

In [45]:
import pandas as pd
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor

In [14]:
# Load the training split that was extracted from the competition archive.
df = pd.read_csv('train.csv')

In [None]:
# Preview the first rows of the raw training frame.
df.head()

In [27]:
# Work with a small subset of columns for now so NaN handling stays trivial:
# any row with a missing value in one of these columns is simply dropped.
select_df = df[
    [
        'MSSubClass',
        'MSZoning',
        'LotFrontage',
        'LotArea',
        'Street',
        'LotShape',
        'LandContour',
        'Utilities',
        'MiscVal',
        'MoSold',
        'YrSold',
        'SaleType',
        'SalePrice',
    ]
].dropna()

In [26]:
# Split into target and features. get_dummies one-hot encodes the string
# columns so the whole feature matrix is numeric.
y = select_df['SalePrice']
X = pd.get_dummies(select_df.drop(columns='SalePrice'))

In [29]:
# Two-step pipeline: standardise every feature, then fit a random forest.
# NOTE(review): no random_state is passed, so the fitted forest (and the
# predictions shown below) will differ from run to run.
pipeline = make_pipeline(
    StandardScaler(),
    RandomForestRegressor(),
)

In [30]:
# Fit the scaler and the forest in one call on the encoded features.
pipeline.fit(X, y)

Pipeline(steps=[('standardscaler', StandardScaler()),
                ('randomforestregressor', RandomForestRegressor())])

In [31]:
# In-sample predictions: X is the same data the pipeline was fitted on.
pipeline.predict(X)

array([203543.5 , 169576.5 , 214726.  , ..., 225193.58, 143516.  ,
       152326.  ])

# Save the Pipeline

In [32]:
import pickle

In [34]:
# Persist the fitted pipeline (scaler + forest together) to disk.
with open('pipelinemodel.pkl', mode='wb') as model_file:
    pickle.dump(pipeline, model_file)

In [35]:
# Read the pipeline back to confirm the save/load round trip.
# pickle.load can execute arbitrary code, so only load files you wrote yourself.
with open('pipelinemodel.pkl', mode='rb') as model_file:
    reloaded_model = pickle.load(model_file)

In [36]:
# Display the reloaded object to check that both steps survived pickling.
reloaded_model

Pipeline(steps=[('standardscaler', StandardScaler()),
                ('randomforestregressor', RandomForestRegressor())])

In [None]:
# Predict through the whole reloaded pipeline so X passes through the fitted
# StandardScaler first. Calling the forest directly via steps[1][1] would feed
# it unscaled features, although it was trained on scaled ones.
reloaded_model.predict(X)

# Using the Pipeline Class 

In [48]:
# With the Pipeline class the steps are given as explicit (name, estimator)
# pairs, so the step names are chosen here.
custom_pipeline = Pipeline(
    steps=[
        ('scaling', StandardScaler()),
        ('rfmodel', RandomForestRegressor()),
    ]
)

In [50]:
# Show the pipeline with its hand-picked step names.
custom_pipeline

Pipeline(steps=[('scaling', StandardScaler()),
                ('rfmodel', RandomForestRegressor())])

In [49]:
# With the make_pipeline helper (a function, not a class): step names are not
# passed in; they are generated from the lowercased estimator class names.
make_pipeline_model = make_pipeline(StandardScaler(), RandomForestRegressor())

In [51]:
# Show the pipeline with its auto-generated step names, for comparison.
make_pipeline_model

Pipeline(steps=[('standardscaler', StandardScaler()),
                ('randomforestregressor', RandomForestRegressor())])

# Column Transformers

In [53]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

In [55]:
# List the object-dtype (string) columns; these are the ones that need encoding.
select_df.select_dtypes('object').columns

Index(['MSZoning', 'Street', 'LotShape', 'LandContour', 'Utilities',
       'SaleType'],
      dtype='object')

In [85]:
# Numeric features: every non-object column except the SalePrice target.
numeric_features = (
    select_df
    .drop(columns='SalePrice')
    .select_dtypes(exclude='object')
    .columns
)
# Numeric columns are only standardised.
numeric_pipeline = Pipeline(steps=[('scaler', StandardScaler())])

In [86]:
# Categorical features: the object-dtype columns of the selected frame.
categorical_features = select_df.select_dtypes(include='object').columns
# Categorical columns are one-hot encoded.
# NOTE(review): with the default settings the encoder raises on categories it
# did not see during fit — confirm that is acceptable before scoring new data.
categorical_pipeline = Pipeline(steps=[('onehot', OneHotEncoder())])

In [87]:
# Route each column group to its own preprocessing pipeline.
transformer = ColumnTransformer(
    transformers=[
        ('numeric_preprocessing', numeric_pipeline, numeric_features),
        ('categorical_preprocessing', categorical_pipeline, categorical_features),
    ]
)

In [88]:
# Full model: column-wise preprocessing followed by the random forest.
# NOTE(review): the final step is named 'randforestclassifier' but holds a
# RandomForestRegressor; the name is kept as-is because it is part of the
# pipeline's parameter names.
ml_pipeline = Pipeline(
    steps=[
        ('all_column_preprocessing', transformer),
        ('randforestclassifier', RandomForestRegressor()),
    ]
)

In [89]:
# Rebind X and y to the raw (un-encoded) columns: the encoding now happens
# inside ml_pipeline, so X here is no longer the get_dummies frame used earlier.
y = select_df['SalePrice']
X = select_df.drop(columns='SalePrice')

In [90]:
# Fit the preprocessing steps and the forest together on the raw columns.
ml_pipeline.fit(X, y)

Pipeline(steps=[('all_column_preprocessing',
                 ColumnTransformer(transformers=[('numeric_preprocessing',
                                                  Pipeline(steps=[('scaler',
                                                                   StandardScaler())]),
                                                  Index(['MSSubClass', 'LotFrontage', 'LotArea', 'MiscVal', 'MoSold', 'YrSold'], dtype='object')),
                                                 ('categorical_preprocessing',
                                                  Pipeline(steps=[('onehot',
                                                                   OneHotEncoder())]),
                                                  Index(['MSZoning', 'Street', 'LotShape', 'LandContour', 'Utilities',
       'SaleType'],
      dtype='object'))])),
                ('randforestclassifier', RandomForestRegressor())])

In [91]:
# In-sample predictions: X is the same data ml_pipeline was fitted on.
ml_pipeline.predict(X)

array([204506.  , 165339.64, 210766.72, ..., 232544.5 , 144572.75,
       155401.5 ])

In [92]:
# Persist the fitted preprocessing + model pipeline to disk.
with open('columntransformermodel.pkl', mode='wb') as model_file:
    pickle.dump(ml_pipeline, model_file)

In [93]:
# Read the pipeline back to confirm the save/load round trip.
# pickle.load can execute arbitrary code, so only load files you wrote yourself.
with open('columntransformermodel.pkl', mode='rb') as model_file:
    reloaded_ml_pipeline = pickle.load(model_file)

In [94]:
# Display the reloaded pipeline to check that all steps survived pickling.
reloaded_ml_pipeline

Pipeline(steps=[('all_column_preprocessing',
                 ColumnTransformer(transformers=[('numeric_preprocessing',
                                                  Pipeline(steps=[('scaler',
                                                                   StandardScaler())]),
                                                  Index(['MSSubClass', 'LotFrontage', 'LotArea', 'MiscVal', 'MoSold', 'YrSold'], dtype='object')),
                                                 ('categorical_preprocessing',
                                                  Pipeline(steps=[('onehot',
                                                                   OneHotEncoder())]),
                                                  Index(['MSZoning', 'Street', 'LotShape', 'LandContour', 'Utilities',
       'SaleType'],
      dtype='object'))])),
                ('randforestclassifier', RandomForestRegressor())])