# Pipeline is the boss

In [43]:
# Create a pipeline that standardizes the data then creates a model
import os
from datetime import datetime
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, RobustScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import Lasso
from sklearn.ensemble import GradientBoostingRegressor


# Import CSV

In [44]:
# Load the selected-columns listings CSV into a DataFrame indexed by the 'id' column.
path = '../../../data/new-york-city-airbnb-open-data/'
listings_csv = os.path.join(path, 'selected_columns_listings.csv')
listings = pd.read_csv(
    listings_csv,
    index_col='id',
)

# Convert Data Type

In [45]:
#all dollars fields
#function
def fix_currency(row):
    """Return the string ``row`` with every ',' and '$' character removed.

    The result is still a string (for example '$1,250.00' becomes '1250.00');
    converting it to a number is left to the caller.
    """
    for symbol in (',', '$'):
        row = row.replace(symbol, '')
    return row

# Strip the currency formatting from every dollar-valued column and cast the
# cleaned text to float. Only non-null entries are converted; the assignment is
# index-aligned, so rows that were null end up as NaN.
for colname in ('extra_people', 'cleaning_fee', 'security_deposit', 'price'):
    filt = listings[colname].notna()
    listings[colname] = listings[colname][filt].apply(fix_currency).astype(float)


In [46]:
def convert_bool(row):
    """Return the string ``row`` with each 'f' turned into '0' and each 't' into '1'.

    All occurrences are substituted and every other character is kept as is.
    The return value is a string, not a number.
    """
    return row.translate(str.maketrans({'f': '0', 't': '1'}))

# Recode the 't'/'f' flag columns as 1/0. Only non-null values are converted;
# the index-aligned assignment leaves the rows that were null as NaN.
Boolean_columns = [
    'host_is_superhost',
    'is_location_exact',
    'instant_bookable',
    'host_identity_verified',
]

for column in Boolean_columns:
    filt = listings[column].notna()
    listings[column] = listings.loc[filt, column].map(convert_bool).astype(int)


In [47]:
# Convert host_since to a datetime first, then to a number (the date's
# ordinal); only non-null dates are converted.
# NOTE(review): running this cell a second time would feed the ordinals back
# into pd.to_datetime -- confirm the cell is only executed once per kernel.
listings['host_since'] = pd.to_datetime(listings['host_since'])

filt = listings['host_since'].notna()
listings['host_since'] = listings.loc[filt, 'host_since'].map(lambda stamp: stamp.toordinal())


# Feature Engineering

In [48]:
#turn cleaning fee to yes and no column
def cleaning_fee_yes_no(row):
    """Flag whether ``row`` carries a positive cleaning fee.

    Returns the string '1' when ``row['cleaning_fee']`` is greater than zero
    and the string '0' in every other case (a NaN fee compares False, so it
    also yields '0').
    """
    return '1' if row['cleaning_fee'] > 0.00 else '0'

# Build the flag column row by row, then show the mean price of each group.
listings['cleaning_fee_yes_no'] = listings.apply(cleaning_fee_yes_no, axis=1)

print(listings.groupby('cleaning_fee_yes_no')[['price']].mean())


price
cleaning_fee_yes_no            
0                    202.574334
1                    153.643631


In [49]:
#turn security deposit to yes and no column
def security_deposit_yes_no(row):
    """Flag whether ``row`` carries a positive security deposit.

    Returns the string '1' when ``row['security_deposit']`` is greater than
    zero and the string '0' in every other case (a NaN deposit compares
    False, so it also yields '0').
    """
    return '1' if row['security_deposit'] > 0.00 else '0'

# Build the flag column row by row, then show the mean price of each group.
listings['security_deposit_yes_no'] = listings.apply(security_deposit_yes_no, axis=1)

print(listings.groupby('security_deposit_yes_no')[['price']].mean())

price
security_deposit_yes_no            
0                        162.646346
1                        171.818497


In [50]:
#turn extra people fee to yes and no
def extra_people_yes_no(row):
    """Flag whether ``row`` charges a positive extra-people fee.

    Returns the string '1' when ``row['extra_people']`` is greater than zero
    and the string '0' in every other case (a NaN fee compares False, so it
    also yields '0').
    """
    return '1' if row['extra_people'] > 0.00 else '0'

# Build the flag column row by row, then show the mean price of each group.
listings['extra_people_yes_no'] = listings.apply(extra_people_yes_no, axis=1)

print(listings.groupby('extra_people_yes_no')[['price']].mean())

price
extra_people_yes_no            
0                    183.485233
1                    146.856568


In [51]:
# amenities_count: the number of ',' characters in the raw amenities string.
# NOTE(review): this counts separators rather than items -- presumably one
# less than the number of amenities for a non-empty list; confirm that the
# off-by-one is acceptable for the model.
listings['amenities_count'] = listings['amenities'].str.count(',')

# Show the five largest counts as a quick sanity check.
largest_counts = listings['amenities_count'].sort_values(ascending=False).head()
print(largest_counts)

# The raw text column is not needed once the count exists.
listings = listings.drop(columns='amenities')

id
21980723    71
4471513     66
41552433    65
37499093    63
37494649    62
Name: amenities_count, dtype: int64


In [52]:
# host_response_rate

def convert_string_to_int(row):
    """Return the string '1' when ``row`` equals '100%', otherwise the string '0'.

    Any value that is not exactly '100%' (including NaN) maps to '0'.
    """
    return '1' if row == '100%' else '0'

# 1.0 when the host response rate is exactly '100%', 0.0 otherwise.
listings['host_response_rate_calc'] = (
    listings['host_response_rate']
    .map(convert_string_to_int)
    .astype(float)
)

# The raw column is no longer needed once the flag exists.
listings = listings.drop(columns='host_response_rate')

# Check the result: number of rows per flag value.
print(listings.groupby('host_response_rate_calc')['host_response_rate_calc'].size())

host_response_rate_calc
0.0    28313
1.0    22483
Name: host_response_rate_calc, dtype: int64


In [53]:
# host_acceptance_rate
# Flag listings whose host acceptance rate is exactly '100%' (1.0); every
# other value, including a missing one, becomes 0.0.
#
# The comparison is made directly on the column rather than through
# convert_string_to_int: that name is redefined with different rules in
# several other cells of this notebook, so calling it here would make the
# result depend on which of those cells was executed last.
listings['host_acceptance_rate_calc'] = listings['host_acceptance_rate'].eq('100%').astype(float)

# drop original column
listings = listings.drop(columns='host_acceptance_rate')

# check result: number of rows per flag value
print(listings.groupby('host_acceptance_rate_calc')['host_acceptance_rate_calc'].size())

host_acceptance_rate_calc
0.0    39286
1.0    11510
Name: host_acceptance_rate_calc, dtype: int64


In [67]:
# host_response_time

def convert_string_to_int(row):
    """Return the string '1' when ``row`` equals 'within an hour', otherwise '0'.

    Any value that is not exactly 'within an hour' (including NaN) maps to '0'.
    """
    return '1' if row == 'within an hour' else '0'

# 1.0 when the host response time is exactly 'within an hour', 0.0 otherwise.
listings['host_response_time_one_hour'] = (
    listings['host_response_time']
    .map(convert_string_to_int)
    .astype(float)
)

# The raw column is no longer needed once the flag exists.
listings = listings.drop(columns='host_response_time')

# Check the result: number of rows per flag value.
print(listings.groupby('host_response_time_one_hour')['host_response_time_one_hour'].size())

host_response_time_one_hour
0.0    30351
1.0    20445
Name: host_response_time_one_hour, dtype: int64


In [54]:
#property_type: after modeling, loft seemed to affect price, but in the data loft prices range from 28 to 900 dollars.
# I am going to split property type into two classes: apartment or not
#listings.groupby('property_type').agg({'property_type': 'size', 'price':'max'}).sort_values(by='price',ascending=False)

def convert_string_to_int(row):
    """Return the string '1' when ``row`` equals 'Apartment', otherwise '0'.

    Any value that is not exactly 'Apartment' (including NaN) maps to '0'.
    """
    return '1' if row == 'Apartment' else '0'

# 1.0 when the property type is exactly 'Apartment', 0.0 otherwise.
listings['apt_yes_no'] = (
    listings['property_type']
    .map(convert_string_to_int)
    .astype(float)
)

# The raw column is no longer needed once the flag exists.
listings = listings.drop(columns='property_type')

# Check the result: number of rows per flag value.
print(listings.groupby('apt_yes_no')['apt_yes_no'].size())


apt_yes_no
0.0    11159
1.0    39637
Name: apt_yes_no, dtype: int64


In [55]:
#bed_type --- may not be needed, let's see

def convert_string_to_int(row):
    """Return the string '1' when ``row`` equals 'Real Bed', otherwise '0'.

    Any value that is not exactly 'Real Bed' (including NaN) maps to '0'.
    """
    return '1' if row == 'Real Bed' else '0'

# 1.0 when the bed type is exactly 'Real Bed', 0.0 otherwise.
listings['real_bed_yes_no'] = (
    listings['bed_type']
    .map(convert_string_to_int)
    .astype(float)
)

# The raw column is no longer needed once the flag exists.
listings = listings.drop(columns='bed_type')

# Check the result: number of rows per flag value.
print(listings.groupby('real_bed_yes_no')['real_bed_yes_no'].size())

real_bed_yes_no
0.0      713
1.0    50083
Name: real_bed_yes_no, dtype: int64


In [56]:
#cancellation_policy -- let's do 3 classes (flexible, moderate, and strict)

def convert_string_to_int(row):
    """Collapse the detailed strict cancellation policies into plain 'strict'.

    The substrings 'strict_14_with_grace_period', 'super_strict_30' and
    'super_strict_60' are each replaced by 'strict', in that order; any other
    text in ``row`` is returned unchanged. The result is a string.
    """
    for detailed_policy in ('strict_14_with_grace_period', 'super_strict_30', 'super_strict_60'):
        row = row.replace(detailed_policy, 'strict')
    return row

# New column with the strict variants collapsed into a single 'strict' class.
listings['cancellation_policy_calc'] = listings['cancellation_policy'].map(convert_string_to_int)

# The raw column is no longer needed once the grouped one exists.
listings = listings.drop(columns='cancellation_policy')

# Check the result: number of rows per policy class.
print(listings.groupby('cancellation_policy_calc')['cancellation_policy_calc'].size())

cancellation_policy_calc
flexible    15461
moderate    11871
strict      23464
Name: cancellation_policy_calc, dtype: int64


In [57]:
#host_listings_count

def convert_string_to_int(row):
    """Return 0 when ``row`` is below 1, otherwise 1.

    The comparison ``row < 1`` is False for NaN, so a NaN input yields 1.
    Unlike the string-returning helpers, this one returns integers.
    """
    return 0 if row < 1 else 1

# 0.0 when host_listings_count is below 1, 1.0 otherwise.
# NOTE(review): with this threshold the flag is 1 for every host that has at
# least one listing (and for NaN counts, since NaN < 1 is False), although
# the name 'multiple_listings' suggests "more than one" -- confirm the
# intended cut-off.
listings['multiple_listings'] = (
    listings['host_listings_count']
    .map(convert_string_to_int)
    .astype(float)
)

# The raw column is no longer needed once the flag exists.
listings = listings.drop(columns='host_listings_count')

# Check the result: number of rows per flag value.
print(listings.groupby('multiple_listings')['multiple_listings'].size())


multiple_listings
0.0     4706
1.0    46090
Name: multiple_listings, dtype: int64


# Transformer
Optimus Prime

In [70]:
# Columns sent through the numeric branch of the preprocessor. The order is
# kept as-is because it determines the column order of the transformed matrix.
numeric_features = [
    'bathrooms', 'bedrooms', 'beds',
    'number_of_reviews', 'number_of_reviews_ltm',
    'review_scores_rating', 'review_scores_accuracy', 'review_scores_cleanliness',
    'review_scores_checkin', 'review_scores_communication', 'review_scores_location',
    'review_scores_value', 'reviews_per_month',
    'amenities_count', 'host_response_time_one_hour',
    'host_response_rate_calc', 'host_acceptance_rate_calc',
    'apt_yes_no', 'real_bed_yes_no', 'multiple_listings',
    'host_is_superhost', 'is_location_exact', 'instant_bookable', 'host_identity_verified',
    'host_since', 'accommodates', 'guests_included',
    'cleaning_fee_yes_no', 'security_deposit_yes_no', 'extra_people_yes_no',
]

# Numeric branch: replace missing values with zero, then scale with RobustScaler.
# The fill value is the number 0 rather than the string '0'. SimpleImputer
# refuses a string constant when the selected columns are purely numeric, so
# the string form presumably only worked because the string-typed *_yes_no
# flags make this block object-typed; a numeric 0 is accepted either way and
# produces the same scaled values.
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value=0)),  # alternative: strategy='median'
    ('scaler', RobustScaler())])

# Categorical branch: label missing values as 'missing', then one-hot encode.
# handle_unknown='ignore' keeps transform() from failing on categories that
# were not present when the encoder was fitted.
categorical_features = ['neighbourhood_group_cleansed', 'room_type', 'cancellation_policy_calc']
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

# Route each group of columns to its branch; columns not listed are dropped
# (the ColumnTransformer default).
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
        ])


# Test Train Split


In [71]:
# Model inputs: the numeric columns followed by the categorical ones.
feature_list = [*numeric_features, *categorical_features]
features = listings.loc[:, feature_list]

# Prediction target.
target = listings['price']

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.2,
    random_state=0,
)

# Model

In [72]:
# Two pipelines that share the preprocessing step and differ in the regressor.
# NOTE(review): both pipelines hold the very same `preprocessor` object, so
# fitting one presumably re-fits the transformer used by the other -- confirm
# that sharing it is intended.
pl_Lasso = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('regressor', Lasso(alpha=0.5)),
    ]
)

pl_Gboost = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('regressor', GradientBoostingRegressor(random_state=0)),
    ]
)


In [79]:
# Fit the Lasso pipeline, then print three numbers in this order: the score on
# the training split, the mean prediction on the test split, and the score on
# the test split.
pl_Lasso.fit(X_train, y_train)

lasso_train_score = pl_Lasso.score(X_train, y_train)
print(lasso_train_score)

lasso_mean_prediction = pl_Lasso.predict(X_test).mean()
print(lasso_mean_prediction)

lasso_test_score = pl_Lasso.score(X_test, y_test)
print(lasso_test_score)


0.08224451082007711
165.79585112066226
0.06949921367727208


In [77]:
# Fit the gradient-boosting pipeline once. fit() returns the pipeline itself,
# so printing pl_Gboost afterwards shows the same description without paying
# for a second training run.
pl_Gboost.fit(X_train, y_train)
print(pl_Gboost)

# Mean prediction on the test split.
print(pl_Gboost.predict(X_test).mean())

# Score on the training split, followed by the score on the held-out test
# split so the two can be compared the same way as for the Lasso pipeline.
print(pl_Gboost.score(X_train, y_train))
print(pl_Gboost.score(X_test, y_test))


Pipeline(memory=None,
         steps=[('preprocessor',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('num',
                                                  Pipeline(memory=None,
                                                           steps=[('imputer',
                                                                   SimpleImputer(add_indicator=False,
                                                                                 copy=True,
                                                                                 fill_value='0',
                                                                                 missing_values=nan,
                                                                                 strategy='constant',
                                                            