# Pipeline is the boss

In [1]:
# Imports for a pipeline that preprocesses the data and then fits a model
import os
from datetime import datetime

import numpy as np
import pandas as pd
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.impute import SimpleImputer
from sklearn.linear_model import Lasso
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, RobustScaler, OneHotEncoder


# Import CSV

In [2]:
# Load the listings CSV into a DataFrame, using its 'id' column as the index.
path = '../../../data/new-york-city-airbnb-open-data/'
listings_csv = os.path.join(path, 'selected_columns_listings.csv')
listings = pd.read_csv(
    listings_csv,
    index_col='id',
)

# Clean up Price
Price cleaning currently happens outside the pipeline; it needs to be moved inside.

In [9]:
#function
def fix_currency(row):
    """Strip currency formatting from a single price value.

    Removes thousands separators (',') and dollar signs ('$') from a string
    such as '$1,234.00', giving '1234.00'. The result is still a string; the
    caller is responsible for converting it to a number.

    Non-string values (for example NaN for a missing price, or a price that
    has already been converted to float) are returned unchanged instead of
    raising AttributeError on the missing ``.replace`` method.

    Parameters
    ----------
    row : str or any
        One cell value from the price column.

    Returns
    -------
    str or any
        The cleaned string, or ``row`` itself when it is not a string.
    """
    if not isinstance(row, str):
        return row
    return row.replace(',', '').replace('$', '')

In [10]:
column = 'price'
# Only parse while the column still holds raw text such as '$1,234.00'.
# Once it has been converted to a numeric dtype there is nothing left to
# clean, so re-running this cell is a no-op rather than an error.
if not pd.api.types.is_numeric_dtype(listings[column]):
    listings[column] = listings[column].apply(fix_currency).astype(float)


# Transformer
Optimus Prime

In [40]:
# Numeric columns fed to the model.
numeric_features = [
    'bathrooms',
    'bedrooms',
    'beds',
    'review_scores_rating',
    'review_scores_accuracy',
    'review_scores_cleanliness',
    'review_scores_checkin',
    'review_scores_communication',
    'review_scores_location',
    'review_scores_value',
    'reviews_per_month',
]

# Numeric branch: fill missing values with the column median, then scale.
numeric_transformer = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', RobustScaler()),
    ]
)

# Categorical columns fed to the model.
categorical_features = ['neighbourhood_group_cleansed']

# Categorical branch: replace missing values with the literal 'missing',
# then one-hot encode; categories unseen during fit are ignored.
categorical_transformer = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
        ('onehot', OneHotEncoder(handle_unknown='ignore')),
    ]
)

# Route each group of columns through its own branch.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features),
    ]
)

# Test Train Split


In [41]:
# Model inputs are the numeric columns followed by the categorical ones.
feature_list = [*numeric_features, *categorical_features]
features = listings[feature_list]
target = listings['price']

# Hold out 20% of the rows for evaluation; the seed is fixed for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.2,
    random_state=0,
)

# Pipeline is King

In [42]:
# Each pipeline gets its own clone of `preprocessor`. With the default
# memory=None, Pipeline.fit fits its steps in place, so handing the same
# object to both pipelines would make them share one fitted transformer:
# fitting pl_Gboost would silently refit the preprocessor inside pl_Lasso.
# NOTE(review): `preprocessor` itself now stays unfitted; use
# pl_Lasso.named_steps['preprocessor'] if a fitted transformer is needed.
pl_Lasso = Pipeline(steps=[
    ('preprocessor', clone(preprocessor)),
    ('regressor', Lasso(alpha=0.5)),
])

pl_Gboost = Pipeline(steps=[
    ('preprocessor', clone(preprocessor)),
    ('regressor', GradientBoostingRegressor(random_state=0)),
])


In [43]:
# Fit the Lasso pipeline (preprocessing + regressor) on the training split.
pl_Lasso.fit(X=X_train, y=y_train)

Pipeline(memory=None,
         steps=[('preprocessor',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('num',
                                                  Pipeline(memory=None,
                                                           steps=[('imputer',
                                                                   SimpleImputer(add_indicator=False,
                                                                                 copy=True,
                                                                                 fill_value=None,
                                                                                 missing_values=nan,
                                                                                 strategy='median',
                                                             

In [44]:
# Score of the fitted Lasso pipeline on the training split.
pl_Lasso.score(X=X_train, y=y_train)

0.05511276454487668

In [27]:
# Mean predicted price on the held-out split. The first .mean() already
# reduces the prediction array to a scalar, so a second one is redundant.
pl_Lasso.predict(X_test).mean()

166.39338810022923

In [28]:
# Score of the fitted Lasso pipeline on the held-out split.
pl_Lasso.score(X=X_test, y=y_test)

0.03375158530852662

In [29]:
# Fit the gradient-boosting pipeline, then report its score on the training
# split (fit returns the pipeline itself, so the calls can be chained).
pl_Gboost.fit(X_train, y_train).score(X_train, y_train)


0.08406031867363362