In [None]:
# import the library
%matplotlib inline

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# sklearn :: utils
from sklearn.model_selection import train_test_split

# sklearn :: models
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor

# sklearn :: evaluation metrics
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error

sns.set_style('whitegrid')

## Import Raw Data

In [None]:
# Read the training CSV into `df`; the cell's final expression previews the first rows.
train_csv_path = 'AirBnB - Regression/train.csv'
df = pd.read_csv(train_csv_path)
df.columns
df.head()

## Clean Up Dataset

In [None]:
# thumbnail_url is dropped from the frame; every other column is kept
df = df.drop(columns=['thumbnail_url'])

In [None]:
# host_response_rate: remove the '%' symbol from the text values, then cast the column to float
response_rate_text = df['host_response_rate'].str.replace('%', '')
df['host_response_rate'] = response_rate_text.astype(float)

## Missing Data

In [None]:
# count the missing values in each column and print the per-column totals
null_counts = df.isnull().sum()
print(null_counts)

In [None]:
# host_has_profile_pic and host_identity_verified: label missing values with the placeholder 'N/A'
for flag_column in ('host_has_profile_pic', 'host_identity_verified'):
    df[flag_column] = df[flag_column].fillna('N/A')

In [None]:
# option 1: remove the listings that don't have a first/last review date
# option 2: replace the missing date values with feature mean value


In [None]:
# host_response_rate, host_since: fill in NaNs with the feature mean, since some listings with reviews have a NaN host_response_rate
# a decent number of listings have no host_response_rate value

In [None]:
# those without a review date have 0 for number_of_reviews and nan review_scores_rating

## Feature Engineering
One-hot encode the following categorical features:
    - property_type
    - room_type
    - bed_type
    - cancellation_policy
    - cleaning_fee
    - city
    - host_has_profile_pic
    - host_identity_verified
    - instant_bookable
    - zipcode

In [None]:
# one-hot encode features

def _flag_label(value):
    """Map one raw flag value to the label 'true', 'false' or 'na'.

    The comparison is done on the lower-cased string form of the value, so
    booleans, 't'/'f' style strings and numeric 1/0 are all recognised.
    Anything else (including NaN and the 'N/A' placeholder) becomes 'na'.
    """
    token = str(value).strip().lower()
    if token in ('t', 'true', 'y', 'yes', '1', '1.0'):
        return 'true'
    if token in ('f', 'false', 'n', 'no', '0', '0.0'):
        return 'false'
    return 'na'


def _flag_dummies(series, stem, labels=('false', 'true', 'na')):
    """One-hot encode a flag column into columns named ``<stem>_<label>``.

    pd.get_dummies orders its output columns by sorted category value, so
    renaming them positionally (as this cell used to do) attaches the wrong
    label whenever the sort order differs from the assumed one - e.g. the
    'N/A' placeholder sorts before lower-case 'f'/'t' - and raises when the
    number of distinct values differs from the number of names. Here every
    column name is derived from the value it encodes instead.

    Parameters
    ----------
    series : pd.Series
        Raw flag column. NOTE(review): the exact raw encoding of the flags is
        not visible in this notebook; see _flag_label for the accepted tokens.
    stem : str
        Prefix for the generated column names.
    labels : tuple of str
        Labels to emit, in column order. A label with no matching rows still
        gets an (all-zero) column; rows whose label is not listed get zeros
        in every column.

    Returns
    -------
    pd.DataFrame
        Indicator columns aligned to ``series.index``.
    """
    normalised = pd.Categorical(series.map(_flag_label), categories=list(labels))
    return pd.get_dummies(pd.Series(normalised, index=series.index), prefix=stem)


# multi-valued categoricals: one indicator column per distinct value
df_proptype = pd.get_dummies(df['property_type'])
df_roomtype = pd.get_dummies(df['room_type'])
df_bedtype = pd.get_dummies(df['bed_type'])
df_cancel = pd.get_dummies(df['cancellation_policy'])
df_city = pd.get_dummies(df['city'])
df_zipcode = pd.get_dummies(df['zipcode'])

# flag columns: fixed, value-derived names that later cells refer to explicitly
df_cleanfee = _flag_dummies(df['cleaning_fee'], 'cleaning_fee', labels=('false', 'true'))
df_hostpic = _flag_dummies(df['host_has_profile_pic'], 'host_pic')
df_hostid = _flag_dummies(df['host_identity_verified'], 'host_id')
df_instant = _flag_dummies(df['instant_bookable'], 'instant')

# a single concat keeps the original column order without copying the frame ten times
df = pd.concat(
    [df, df_proptype, df_roomtype, df_bedtype, df_cancel, df_cleanfee,
     df_city, df_hostpic, df_hostid, df_instant, df_zipcode],
    axis=1,
)

df.head()

In [None]:
# convert dates to number of days_ago
import dateutil.parser
from datetime import datetime

def ndays(date, today='14/5/2019'): # pass in timestamp
    """Return the number of whole days between ``date`` and ``today``.

    Parameters
    ----------
    date : str, timestamp or missing value
        The date to measure from. Its string form is parsed with
        ``pd.to_datetime``; any time-of-day component is ignored.
    today : str
        Reference date in day/month/year form (``%d/%m/%Y``).

    Returns
    -------
    int or float
        ``today - date`` in days, or ``np.nan`` when ``date`` is missing.

    The previous implementation formatted the parsed date as ``YYYY-MM-DD``
    and then re-parsed that string with ``%d/%m/%Y``, which raised
    ``ValueError`` for every input; the dates are now subtracted directly.
    """
    if pd.isnull(date):
        # listings without a date cannot be converted; keep them missing
        return np.nan
    date_format = "%d/%m/%Y"
    reference_day = datetime.strptime(today, date_format).date()
    parsed_day = pd.to_datetime(str(date)).date()
    return (reference_day - parsed_day).days

# Series.apply returns a new Series, so the result has to be assigned back;
# otherwise the date columns stay unconverted even though they are used as
# model features further down.
# NOTE: this overwrites the raw dates, so run this cell once per fresh load of df.
for date_column in ('first_review', 'host_since', 'last_review'):
    df[date_column] = df[date_column].apply(ndays)

## Models

In [None]:
# import Regressors
import sklearn.linear_model # has LinearRegression, LogisticRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor

In [None]:
# functions to run models

def model_training(model_name, model, X_train, y_train):
    """Fit ``model`` on the training data and hand the same object back.

    ``model_name`` is accepted but not used by this function.
    """
    estimator = model
    estimator.fit(X_train, y_train)
    return estimator
    
def model_prediction(model, X_test):
    """Return whatever ``model.predict`` produces for ``X_test``."""
    return model.predict(X_test)

def model_evaluation(model_name, y_test, y_pred):
    """Print MAE and RMSE for a set of predictions and plot predicted vs. true.

    Parameters
    ----------
    model_name : str
        Printed before the metrics and used as the plot title.
    y_test : array-like
        True target values.
    y_pred : array-like
        Predicted target values, in the same order as ``y_test``.

    The axis limits and the dashed reference line (predicted == true) are
    derived from the data. They were previously fixed at 0-5,000,000, which
    squeezes every point into the origin for a target on the scale of
    ``log_price``.
    """
    print(model_name)
    print('MAE', mean_absolute_error(y_test, y_pred))
    print('RMSE', np.sqrt(mean_squared_error(y_test, y_pred)))

    # flatten so that single-column frames and 1-D arrays are handled alike
    actual = np.asarray(y_test).ravel()
    predicted = np.asarray(y_pred).ravel()

    lower = min(actual.min(), predicted.min())
    upper = max(actual.max(), predicted.max())
    # small margin so extreme points are not drawn on the frame;
    # fall back to 1.0 when all values are identical
    margin = 0.05 * (upper - lower) or 1.0
    axis_limits = [lower - margin, upper + margin]

    plt.scatter(actual, predicted, alpha=0.3)
    plt.plot(axis_limits, axis_limits, '--r', alpha=0.3, label='Line1')
    plt.title(model_name)
    plt.xlabel('True Value')
    plt.ylabel('Predict Value')
    plt.xlim(axis_limits)
    plt.ylim(axis_limits)
    plt.show()
    print('')

def run_experiment(model_name, model, X_train, y_train, X_test, y_test=None):
    """Train ``model``, predict on ``X_test`` and report the evaluation.

    Parameters
    ----------
    model_name : str
        Label passed through to training and evaluation.
    model : estimator
        Object exposing ``fit`` and ``predict``.
    X_train, y_train : array-like
        Training features and targets.
    X_test : array-like
        Features to predict on.
    y_test : array-like, optional
        True targets for ``X_test``. When omitted, the notebook-level
        ``y_test`` variable is used, which is what this function silently
        relied on before the parameter existed. Pass it explicitly to avoid
        depending on hidden kernel state.

    Raises
    ------
    NameError
        If ``y_test`` is not given and no notebook-level ``y_test`` exists.
    """
    if y_test is None:
        try:
            y_test = globals()['y_test']
        except KeyError:
            raise NameError("name 'y_test' is not defined; pass y_test explicitly") from None
    train_model = model_training(model_name, model, X_train, y_train)
    predictions = model_prediction(train_model, X_test)
    model_evaluation(model_name, y_test, predictions)

In [None]:
# Feature columns: numeric listing attributes plus the one-hot encoded categoricals.
# The expression is wrapped in parentheses so it can span several lines; a line
# ending in a bare `+` is a SyntaxError.
# NOTE(review): 'accomodates' may be a misspelling of the dataset column
# ('accommodates'?) - confirm against df.columns before selecting with this list.
X_columns = (
    ['accomodates', 'bathrooms', 'first_review', 'host_response_rate', 'host_since', 'last_review',
     'number_of_reviews', 'review_scores_rating', 'bedrooms', 'beds']
    + list(df_proptype.columns)
    + list(df_roomtype.columns)
    + list(df_bedtype.columns)
    + list(df_cancel.columns)
    + list(df_cleanfee.columns)
    + list(df_city.columns)
    + list(df_zipcode.columns)
    # the *_na indicator of each three-way flag is left out of the feature list
    + ['host_pic_false', 'host_pic_true', 'host_id_false', 'host_id_true', 'instant_false', 'instant_true']
)
y_column = ['log_price']

In [None]:
# normalizing columns to be on same scale as log_price