# Pipeline is the boss

In [63]:
# Create a pipeline that standardizes the data then creates a model
import os
from datetime import datetime
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, RobustScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import Lasso
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
import eli5

# Import CSV

In [64]:
#read data, create listings dataframe
path = '../data/austin-airbnb/'
listings_csv = os.path.join(path,'listings.csv')
#print(listings_csv)
listings = pd.read_csv(listings_csv, index_col = 'id')

  interactivity=interactivity, compiler=compiler, result=result)


# Convert Data Type

In [65]:
#all dollars fields
#function
def fix_currency(row):
    row = row.replace(',', '')
    row = row.replace('$', '')
    return row

# #update rows and convert to boolean, only non null values
Currency_columns = ['extra_people', 'cleaning_fee', 'security_deposit', 'price']

for column in Currency_columns:
    filt = listings[column].notna()
    listings[column] = listings[column][filt].apply(lambda col: fix_currency(col)).astype(float)



In [66]:
def convert_bool(row):
    row = row.replace('f', '0')
    row = row.replace('t', '1')
    return row

# #update rows and convert to boolean, only non null values
Boolean_columns = ['host_is_superhost', 'is_location_exact', 'instant_bookable', 'host_identity_verified']

for column in Boolean_columns:
    filt = listings[column].notna()
    listings[column] = listings[column][filt].apply(lambda col: convert_bool(col)).astype(int)


In [67]:
#conver to date first then number, only do for non null
listings['host_since'] = pd.to_datetime(listings['host_since'])

filt = listings['host_since'].notna()
listings['host_since'] = listings['host_since'][filt].apply(lambda x: x.toordinal())


# Filter Rows
Based on previous analyisis: 
 1. Records with reviews within one year
 2. Rmove all hotels property type
 3. Limit price range (0-$800)

In [68]:
#count how many records got reviews last 1 year
scraped_date = datetime.strptime('2020-03-14',"%Y-%m-%d")
listings['last_review_days_ago'] = (scraped_date - pd.to_datetime(listings['last_review'])).dt.days
filt = listings['last_review_days_ago'] <= 365
listings['property_type'][filt].count()
listings = listings[filt]

In [69]:
#dropping records with certain property type, mostly hotels
filt = (~listings['property_type'].isin (['Aparthotel','Bed and breakfast','Boutique hotel',' Hostel', 'Hotel', 'Resort', 'Serviced apartment']))

listings = listings[filt]

In [70]:
#limit price
filt = (listings['price'] > 0) & (listings['price'] < 500)
listings = listings[filt]


In [71]:
listings.shape

(6094, 106)

# Feature Engineering

In [72]:
#turn cleaning fee to yes and no column
def cleaning_fee_yes_no (row):
    if row['cleaning_fee'] > 0.00:
        return '1'
    else:
        return '0'

listings['cleaning_fee_yes_no'] = listings.apply(cleaning_fee_yes_no, axis=1)

print(listings.groupby('cleaning_fee_yes_no').agg({'price':'mean'}))

                          price
cleaning_fee_yes_no            
0                    110.985135
1                    147.105902


In [73]:
#turn security deposit to yes and no column
def security_deposit_yes_no (row):
    if row['security_deposit'] > 0.00:
        return '1'
    else:
        return '0'

listings['security_deposit_yes_no'] = listings.apply(security_deposit_yes_no, axis=1)

print(listings.groupby('security_deposit_yes_no').agg({'price':'mean'}))

                              price
security_deposit_yes_no            
0                        127.146404
1                        158.854995


In [74]:
#turn extra people fee to yes and no
def extra_people_yes_no (row):
    if row['extra_people'] > 0.00:
        return '1'
    else:
        return '0'

listings['extra_people_yes_no'] = listings.apply(extra_people_yes_no, axis=1)

print(listings.groupby('extra_people_yes_no').agg({'price':'mean'}))

                          price
extra_people_yes_no            
0                    143.938462
1                    141.545747


In [75]:
#amenities_count

listings['amenities_count'] = listings['amenities'].str.count(',')
print (listings['amenities_count'].sort_values(ascending=False).head())

# drop original column
listings.drop(columns='amenities', inplace = True)

id
5372834     97
19011922    89
17458846    84
14993882    83
19328783    83
Name: amenities_count, dtype: int64


In [76]:
# host_response_rate

def convert_string_to_int(row):
    if row == '100%':
        row = '1'
    else: row = '0'
    return row

listings['host_response_rate_calc'] = listings['host_response_rate'].apply(lambda col: convert_string_to_int(col)).astype(float)

# drop original column
listings.drop(columns='host_response_rate', inplace = True)

#check result
print(listings.groupby('host_response_rate_calc')['host_response_rate_calc'].size())

host_response_rate_calc
0.0    1607
1.0    4487
Name: host_response_rate_calc, dtype: int64


In [77]:
#host_acceptance_rate
#needs to use function that conver 100% to 1 and the rest to 0 (from previous cell)

listings['host_acceptance_rate_calc'] = listings['host_acceptance_rate'].apply(lambda col: convert_string_to_int(col)).astype(float)

# drop original column
listings.drop(columns='host_acceptance_rate', inplace = True)

#check result
print(listings.groupby('host_acceptance_rate_calc')['host_acceptance_rate_calc'].size())

host_acceptance_rate_calc
0.0    3502
1.0    2592
Name: host_acceptance_rate_calc, dtype: int64


In [78]:
# host_response_time

def convert_string_to_int(row):
    if row == 'within an hour':
        row = '1'
    else: row = '0'
    return row

listings['host_response_time_one_hour'] = listings['host_response_time'].apply(lambda col: convert_string_to_int(col)).astype(float)

# drop original column
listings.drop(columns='host_response_time', inplace = True)

#check result
print(listings.groupby('host_response_time_one_hour')['host_response_time_one_hour'].size())

host_response_time_one_hour
0.0    1621
1.0    4473
Name: host_response_time_one_hour, dtype: int64


In [79]:
#property_type, after modeling, loft seems to affect price. But looking at the data loft price is between 28 and 900 dollars.
# I am going to split propety type into two, apt or non
#listings.groupby('property_type').agg({'property_type': 'size', 'price':'max'}).sort_values(by='price',ascending=False)

def convert_string_to_int(row):
    if row == 'Apartment':
        row = '1'
    else: row = '0'
    return row

listings['apt_yes_no'] = listings['property_type'].apply(lambda col: convert_string_to_int(col)).astype(float)

# drop original column
listings.drop(columns='property_type', inplace = True)

#check result
print(listings.groupby('apt_yes_no')['apt_yes_no'].size())


apt_yes_no
0.0    4826
1.0    1268
Name: apt_yes_no, dtype: int64


In [80]:
#bed_type --- may not be needed, let's see

def convert_string_to_int(row):
    if row == 'Real Bed':
        row = '1'
    else: row = '0'
    return row

listings['real_bed_yes_no'] = listings['bed_type'].apply(lambda col: convert_string_to_int(col)).astype(float)

# drop original column
listings.drop(columns='bed_type', inplace = True)

#check result
print(listings.groupby('real_bed_yes_no')['real_bed_yes_no'].size())

real_bed_yes_no
0.0      42
1.0    6052
Name: real_bed_yes_no, dtype: int64


In [81]:
#cancellation_policy -- let's do 3 classes (flexible, moderate, and strict)

def convert_string_to_int(row):
    row = row.replace('strict_14_with_grace_period', 'strict')
    row = row.replace('super_strict_30', 'strict')
    row = row.replace('super_strict_60', 'strict')
    return row

#new column
listings['cancellation_policy_calc'] = listings['cancellation_policy'].apply(lambda col: convert_string_to_int(col))

# drop original column
listings.drop(columns='cancellation_policy', inplace = True)

#check result
print(listings.groupby('cancellation_policy_calc')['cancellation_policy_calc'].size())

cancellation_policy_calc
flexible    1277
moderate    2035
strict      2782
Name: cancellation_policy_calc, dtype: int64


In [82]:
#host_listings_count

def convert_string_to_int(row):
    if row < 1:
        row = 0
    else: row = 1
    return row

listings['multiple_listings'] = listings['host_listings_count'].apply(lambda col: convert_string_to_int(col)).astype(float)

# drop original column
listings.drop(columns='host_listings_count', inplace = True)

#check result
print(listings.groupby('multiple_listings')['multiple_listings'].size())


multiple_listings
0.0     503
1.0    5591
Name: multiple_listings, dtype: int64


# Transformer
Optimus Prime

In [83]:
numeric_features = ['bathrooms', 'bedrooms', 'beds', 'review_scores_accuracy', 'review_scores_cleanliness', 'review_scores_checkin', 'review_scores_communication', 'review_scores_location', 'review_scores_value', 'reviews_per_month', 'amenities_count', 'host_response_time_one_hour', 'host_response_rate_calc','host_acceptance_rate_calc','apt_yes_no','real_bed_yes_no','multiple_listings', 'host_is_superhost', 'is_location_exact', 'instant_bookable', 'host_identity_verified', 'host_since', 'accommodates', 'guests_included', 'cleaning_fee_yes_no','security_deposit_yes_no','extra_people_yes_no']

numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value=0)), # strategy='median' 
    ('scaler', StandardScaler())
    ])

#other city doesn't have 'neighbourhood_group_cleansed' so using 'host_neighbourhood' instead
categorical_features = ['host_neighbourhood', 'room_type', 'cancellation_policy_calc']
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
        ])


# Test Train Split


In [84]:
feature_list = numeric_features + categorical_features
features = listings[feature_list]

target = listings['price']

X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=0)

# Running Model(s)

In [85]:
pl_Lasso = Pipeline(steps=[('preprocessor', preprocessor),
                    ('regressor', Lasso(alpha=0.5))
                    ])

pl_Gboost = Pipeline(steps=[('preprocessor', preprocessor),
                    ('regressor', GradientBoostingRegressor(random_state=0))
                    ])

pl_RandomForest = Pipeline(steps=[('preprocessor', preprocessor),
                    ('regressor', RandomForestRegressor(n_estimators=100, random_state=None, min_samples_split=100))
                    ])


In [86]:
pl_Lasso.fit(X_train, y_train)


Pipeline(memory=None,
         steps=[('preprocessor',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('num',
                                                  Pipeline(memory=None,
                                                           steps=[('imputer',
                                                                   SimpleImputer(add_indicator=False,
                                                                                 copy=True,
                                                                                 fill_value=0,
                                                                                 missing_values=nan,
                                                                                 strategy='constant',
                                                              

In [87]:
print(pl_Lasso.score(X_train, y_train))
print(pl_Lasso.predict(X_test).mean().mean())
print(pl_Lasso.score(X_test, y_test))


0.5196622734496227
140.67118175345428
0.518480797190376


In [88]:
pl_Gboost.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('preprocessor',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('num',
                                                  Pipeline(memory=None,
                                                           steps=[('imputer',
                                                                   SimpleImputer(add_indicator=False,
                                                                                 copy=True,
                                                                                 fill_value=0,
                                                                                 missing_values=nan,
                                                                                 strategy='constant',
                                                              

In [89]:
print(pl_Gboost.score(X_train, y_train))
print(pl_Gboost.predict(X_test).mean().mean())
print(pl_Gboost.score(X_test, y_test))


0.6344006581988668
140.31925079160666
0.5592227614667493


In [90]:
#get feature importance
onehot_columns = list(pl_Gboost.named_steps['preprocessor'].named_transformers_['cat'].named_steps['onehot'].get_feature_names(input_features=categorical_features))
numeric_features_list = list(numeric_features)
numeric_features_list.extend(onehot_columns)

eli5.explain_weights(pl_Gboost.named_steps['regressor'], top=50, feature_names=numeric_features_list)

Weight,Feature
0.3962  ± 0.3637,bedrooms
0.1365  ± 0.3025,bathrooms
0.1148  ± 0.2393,accommodates
0.1141  ± 0.1183,room_type_Entire home/apt
0.0672  ± 0.2173,reviews_per_month
0.0284  ± 0.1331,host_neighbourhood_Downtown
0.0168  ± 0.2081,host_since
0.0159  ± 0.1721,guests_included
0.0133  ± 0.1284,host_neighbourhood_Santa Clara
0.0106  ± 0.1953,amenities_count


In [91]:
pl_RandomForest.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('preprocessor',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('num',
                                                  Pipeline(memory=None,
                                                           steps=[('imputer',
                                                                   SimpleImputer(add_indicator=False,
                                                                                 copy=True,
                                                                                 fill_value=0,
                                                                                 missing_values=nan,
                                                                                 strategy='constant',
                                                              

In [92]:
print(pl_RandomForest.score(X_train, y_train))
print(pl_RandomForest.predict(X_test).mean().mean())
print(pl_RandomForest.score(X_test, y_test))

0.6311204837382776
140.90993937663956
0.5151906366294889
