# Purpose

To answer the following question:
* Which LA neighbourhood has the most listings?
* What is the most popular neighbourhood, according to reviews?
* What is the best time of year to visit LA?
* Can we predict the price of a new listing?

# Setup


## Library import
We import all the required Python libraries

In [0]:
# Data manipulation
import pandas as pd
import numpy as np

# Options for pandas
# from IPython.core.display import display, HTML
# pd.options.display.max_columns = None
# pd.options.display.max_rows = None
# display(HTML("<style>.container { width:85% !important; }</style>"))
# pd.options.display.float_format =   {:,}'.format

# Visualizations
import plotly.express as px
import plotly.graph_objects as go

# ML
from sklearn.preprocessing import MultiLabelBinarizer, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, BaggingRegressor, GradientBoostingRegressor
from sklearn.model_selection import learning_curve, GridSearchCV, train_test_split
from sklearn.metrics import confusion_matrix

# Others
from tqdm.notebook import tqdm


## Helper functions



In [0]:
# Define helper functions for eda and data cleansing

def clean_date(date_col):
    '''Format datetime columns'''
    return date_col.astype(np.datetime64)

def clean_boolean(boolean_col):
    '''Change raw boolean columns to 0/1'''
    return boolean_col.replace('t',1).replace('f',0)

In [0]:
def cv_optimize(clf, parameters, X, y, n_jobs=1, n_folds=5, score_func=None):
    if score_func:
        gs = GridSearchCV(clf, param_grid=parameters, cv=n_folds, n_jobs=n_jobs, scoring=score_func)
    else:
        gs = GridSearchCV(clf, param_grid=parameters, n_jobs=n_jobs, cv=n_folds)
    gs.fit(X, y)
    print("BEST", gs.best_params_, gs.best_score_, gs.cv_results_)
    best = gs.best_estimator_
    return best

# Data import
We retrieve all the required data for the analysis.

In [4]:
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [5]:
path = '/content/drive/My Drive/Colab Notebooks/ML_01_Los Angeles travel exploration/Data'

neighbourhoods = pd.read_csv(path + '/neighbourhoods.csv')
listings = pd.read_csv(path + '/listings.csv')
detailed_listings = pd.read_csv(path + '/detailed_listings.csv')
reviews = pd.read_csv(path + '/reviews.csv')


Columns (61,62) have mixed types.Specify dtype option on import or set low_memory=False.



#EDA

In [6]:
neighbourhoods.describe(include='all')

Unnamed: 0,neighbourhood_group,neighbourhood
count,270,270
unique,3,270
top,City of Los Angeles,Irwindale
freq,114,1


In [7]:
listings.describe(include='all').T

Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
id,37048,,,,25193000.0,12780600.0,109.0,15184900.0,26295700.0,36848500.0,43383800.0
name,37046,36324.0,"Artist Community for Creatives, Month to Month!",21.0,,,,,,,
host_id,37048,,,,92541200.0,95376600.0,521.0,14145600.0,51154500.0,153049000.0,345481000.0
host_name,37043,7924.0,David,349.0,,,,,,,
neighbourhood_group,37048,3.0,City of Los Angeles,21579.0,,,,,,,
neighbourhood,37048,264.0,Hollywood,1876.0,,,,,,,
latitude,37048,,,,34.0504,0.121134,33.3388,33.9984,34.0615,34.1042,34.8112
longitude,37048,,,,-118.318,0.165569,-118.934,-118.422,-118.343,-118.251,-117.653
room_type,37048,4.0,Entire home/apt,23305.0,,,,,,,
price,37048,,,,227.916,685.161,0.0,69.0,109.0,185.0,25000.0


In [8]:
detailed_listings.describe(include='all').T

Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
id,37048,,,,2.5193e+07,1.27806e+07,109,1.51849e+07,2.62957e+07,3.68485e+07,4.33838e+07
listing_url,37048,37048,https://www.airbnb.com/rooms/17227056,1,,,,,,,
scrape_id,37048,,,,2.02005e+13,10.965,2.02005e+13,2.02005e+13,2.02005e+13,2.02005e+13,2.02005e+13
last_scraped,37048,3,2020-05-09,20173,,,,,,,
name,37046,36324,"Artist Community for Creatives, Month to Month!",21,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...
calculated_host_listings_count,37048,,,,8.12551,23.0307,1,1,2,5,206
calculated_host_listings_count_entire_homes,37048,,,,6.20811,22.6284,0,0,1,2,206
calculated_host_listings_count_private_rooms,37048,,,,1.32533,3.61096,0,0,0,1,57
calculated_host_listings_count_shared_rooms,37048,,,,0.516141,3.88839,0,0,0,0,66


In [9]:
reviews.describe(include='all')

Unnamed: 0,listing_id,date
count,1304141.0,1304141
unique,,3644
top,,2019-11-11
freq,,2747
mean,15828310.0,
std,10876780.0,
min,109.0,
25%,6121930.0,
50%,15315180.0,
75%,23234490.0,


In [10]:
listings.head()

Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365
0,109,Amazing bright elegant condo park front *UPGRA...,521,Paolo,Other Cities,Culver City,33.98209,-118.38494,Entire home/apt,122,30,2,2016-05-15,0.02,1,38
1,344,Family perfect;Pool;Near Studios!,767,Melissa,Other Cities,Burbank,34.16562,-118.33458,Entire home/apt,168,2,8,2019-10-19,0.17,1,97
2,2708,Mirrored Mini-Suite with Fireplace - W. Hollywood,3008,Chas.,City of Los Angeles,Hollywood,34.09768,-118.34602,Private room,79,30,24,2020-03-17,0.33,2,281
3,2732,Zen Life at the Beach,3041,Yoga Priestess,Other Cities,Santa Monica,34.00475,-118.48127,Private room,155,1,21,2019-12-27,0.19,2,365
4,2864,* Beautiful Master Suite/Jacuzzi Tub/*,3207,Bernadine,Other Cities,Bellflower,33.87619,-118.11397,Entire home/apt,80,14,0,,,1,0


In [11]:
detailed_listings.head()

Unnamed: 0,id,listing_url,scrape_id,last_scraped,name,summary,space,description,experiences_offered,neighborhood_overview,notes,transit,access,interaction,house_rules,thumbnail_url,medium_url,picture_url,xl_picture_url,host_id,host_url,host_name,host_since,host_location,host_about,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_thumbnail_url,host_picture_url,host_neighbourhood,host_listings_count,host_total_listings_count,host_verifications,host_has_profile_pic,host_identity_verified,street,neighbourhood,neighbourhood_cleansed,...,extra_people,minimum_nights,maximum_nights,minimum_minimum_nights,maximum_minimum_nights,minimum_maximum_nights,maximum_maximum_nights,minimum_nights_avg_ntm,maximum_nights_avg_ntm,calendar_updated,has_availability,availability_30,availability_60,availability_90,availability_365,calendar_last_scraped,number_of_reviews,number_of_reviews_ltm,first_review,last_review,review_scores_rating,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,requires_license,license,jurisdiction_names,instant_bookable,is_business_travel_ready,cancellation_policy,require_guest_profile_picture,require_guest_phone_verification,calculated_host_listings_count,calculated_host_listings_count_entire_homes,calculated_host_listings_count_private_rooms,calculated_host_listings_count_shared_rooms,reviews_per_month
0,109,https://www.airbnb.com/rooms/109,20200508043406,2020-05-09,Amazing bright elegant condo park front *UPGRA...,"*** Unit upgraded with new bamboo flooring, br...","*** Unit upgraded with new bamboo flooring, br...","*** Unit upgraded with new bamboo flooring, br...",none,,,,,,Camelot NEW RESIDENTS’ GENERAL INFORMATION F...,,,https://a0.muscache.com/im/pictures/4321499/1d...,,521,https://www.airbnb.com/users/show/521,Paolo,2008-06-27,"San Francisco, California, United States",Search for me on the Internet with the keyword...,within a day,100%,0%,f,https://a0.muscache.com/im/users/521/profile_p...,https://a0.muscache.com/im/users/521/profile_p...,Culver City,1.0,1.0,"['email', 'phone', 'facebook', 'reviews', 'kba']",t,t,"Culver City, CA, United States",Culver City,Culver City,...,$25.00,30,730,30,30,730,730,30.0,730.0,3 months ago,t,0,0,0,38,2020-05-09,2,0,2011-08-15,2016-05-15,80.0,10.0,10.0,6.0,8.0,10.0,8.0,f,,"{""Culver City"","" CA""}",f,f,strict_14_with_grace_period,t,f,1,1,0,0,0.02
1,344,https://www.airbnb.com/rooms/344,20200508043406,2020-05-08,Family perfect;Pool;Near Studios!,This home is perfect for families; aspiring ch...,"Cheerful & comfortable; near studios, amusemen...",This home is perfect for families; aspiring ch...,none,Quiet-yet-close to all the fun in LA! Hollywoo...,"One dog may be on premises, friendly and cared...",Short drive to subway and elevated trains runn...,"Pool, patio and self-contained main house all ...",Host and caretaker may be available throughout...,Host asks that guests refrain from partying lo...,,,https://a0.muscache.com/im/pictures/cc4b724d-d...,,767,https://www.airbnb.com/users/show/767,Melissa,2008-07-11,"Burbank, California, United States","Single mother, CEO and Owner of an internation...",within a day,60%,33%,f,https://a0.muscache.com/im/users/767/profile_p...,https://a0.muscache.com/im/users/767/profile_p...,Burbank,1.0,1.0,"['email', 'phone', 'reviews', 'jumio', 'kba', ...",t,t,"Burbank, CA, United States",Burbank,Burbank,...,$0.00,2,14,2,2,14,14,2.0,14.0,8 months ago,t,0,1,6,97,2020-05-08,8,2,2016-06-14,2019-10-19,97.0,10.0,10.0,10.0,10.0,10.0,10.0,f,,,t,f,flexible,f,f,1,1,0,0,0.17
2,2708,https://www.airbnb.com/rooms/2708,20200508043406,2020-05-09,Mirrored Mini-Suite with Fireplace - W. Hollywood,Our best memory foam pillows you'll ever sleep...,Flickering fireplace. Blendtec® Designer 625 ...,Our best memory foam pillows you'll ever sleep...,none,We are minutes away from the Mentor Language I...,Blendtec® Designer 625 Blender Bundle with Twi...,There are many buses; bus stops going in every...,"Kitchen with new refrigerator, dishwasher, sto...",I am friendly and available to help you with y...,I just have one rule. The Golden Rule Do unto ...,,,https://a0.muscache.com/im/pictures/40618141/2...,,3008,https://www.airbnb.com/users/show/3008,Chas.,2008-09-16,"Los Angeles, California, United States",Writer.\r\nLiterary Manager.\r\nPhotographer.\...,within a few hours,100%,100%,t,https://a0.muscache.com/im/pictures/user/d17cf...,https://a0.muscache.com/im/pictures/user/d17cf...,Hollywood,2.0,2.0,"['email', 'phone', 'facebook', 'reviews', 'off...",t,t,"Los Angeles, CA, United States",Hollywood,Hollywood,...,$0.00,30,366,1,30,1125,1125,29.5,1125.0,4 months ago,t,7,7,7,281,2020-05-09,24,6,2014-06-09,2020-03-17,97.0,10.0,10.0,10.0,10.0,10.0,10.0,t,,"{""City of Los Angeles"","" CA""}",t,f,strict_14_with_grace_period,f,f,2,0,2,0,0.33
3,2732,https://www.airbnb.com/rooms/2732,20200508043406,2020-05-09,Zen Life at the Beach,,This is a beautiful three story townhouse that...,This is a beautiful three story townhouse that...,none,"This is the best part of Santa Monica. Quiet, ...",,"Walking distance to all transportation: buses,...",,,ABOUT YOU. Friendly travelers or people comin...,,,https://a0.muscache.com/im/pictures/1082974/0f...,,3041,https://www.airbnb.com/users/show/3041,Yoga Priestess,2008-09-17,"Santa Monica, California, United States",I have been teaching yoga and meditation for 3...,within an hour,100%,70%,f,https://a0.muscache.com/im/users/3041/profile_...,https://a0.muscache.com/im/users/3041/profile_...,Santa Monica,2.0,2.0,"['email', 'phone', 'reviews', 'jumio', 'offlin...",t,f,"Santa Monica, CA, United States",Santa Monica,Santa Monica,...,$0.00,1,180,1,1,180,180,1.0,180.0,4 months ago,t,30,60,90,365,2020-05-09,21,3,2011-06-06,2019-12-27,94.0,9.0,9.0,9.0,9.0,10.0,9.0,t,228269.0,"{""Santa Monica"","" Santa Monica"","" CA""}",f,f,strict_14_with_grace_period,f,f,2,1,1,0,0.19
4,2864,https://www.airbnb.com/rooms/2864,20200508043406,2020-05-09,* Beautiful Master Suite/Jacuzzi Tub/*,Centrally located.... Furnished with King Size...,Safe living on a cul de sac in newer neighborh...,Centrally located.... Furnished with King Size...,none,What makes the neighborhood unique is that the...,"If you are doing business travel, this studio ...",Public transportation is a 3 minutes walk to t...,Good access to all things in Los Angeles and O...,I am always available for questions throughout...,No loud music after 10pm. Close front door qu...,,,https://a0.muscache.com/im/pictures/23817858/d...,,3207,https://www.airbnb.com/users/show/3207,Bernadine,2008-09-25,"Bellflower, California, United States","Fair, open, honest and very informative for ne...",within a day,100%,,f,https://a0.muscache.com/im/pictures/user/8b82a...,https://a0.muscache.com/im/pictures/user/8b82a...,Bellflower,1.0,1.0,"['email', 'phone', 'facebook', 'kba']",t,t,"Bellflower, CA, United States",Bellflower,Bellflower,...,$25.00,14,730,14,14,1125,1125,14.0,1125.0,3 weeks ago,t,0,0,0,0,2020-05-09,0,0,,,,,,,,,,f,,,t,f,strict_14_with_grace_period,f,f,1,1,0,0,


# Answering questions

## Which LA neighbourhood has the most listings?

In [12]:
nbh = listings.groupby(by='neighbourhood')['id'].agg(['count', lambda x: x.count()/listings.shape[0]])
nbh.columns = ['Listing count', '% of total']
nbh['% of total'] = nbh['% of total'].apply(lambda x:f'{x:.1%}')
nbh.sort_values(by = '% of total', ascending=False, inplace=True)
nbh.head()

Unnamed: 0_level_0,Listing count,% of total
neighbourhood,Unnamed: 1_level_1,Unnamed: 2_level_1
Hollywood,1876,5.1%
Venice,1868,5.0%
Long Beach,1740,4.7%
Santa Monica,1258,3.4%
Downtown,1267,3.4%


## What is the most popular neighbourhood, according to reviews?

In [13]:
nbhrv = detailed_listings.groupby(by='neighbourhood')['number_of_reviews', 'review_scores_rating'].agg(
    {'number_of_reviews': 'sum',
    'review_scores_rating': 'mean'})
nbhrv.columns = ['Review count', 'Average rating']
nbhrv['Average rating'] = nbhrv['Average rating'].apply(lambda x:f'{x:.2f}')
nbhrv.dropna(inplace=True)
nbhrv.replace(["nan"], np.nan, inplace = True)
nbhrv.sort_values(by = 'Average rating', ascending=False, inplace=True)
nbhrv.head()


Indexing with multiple keys (implicitly converted to a tuple of keys) will be deprecated, use a list instead.



Unnamed: 0_level_0,Review count,Average rating
neighbourhood,Unnamed: 1_level_1,Unnamed: 2_level_1
Hawaiian Gardens,47,99.0
Porter Ranch,186,98.67
Mission Hills,233,98.67
Bell Gardens,37,98.67
Irwindale,265,98.5


## What is the best time of year to visit LA?

In [0]:
reviews['date'] = clean_date(reviews['date'])

In [15]:
reviews.groupby(by=pd.Grouper(key='date', freq='Y')).count()

Unnamed: 0_level_0,listing_id
date,Unnamed: 1_level_1
2009-12-31,22
2010-12-31,389
2011-12-31,1791
2012-12-31,4275
2013-12-31,10319
2014-12-31,25681
2015-12-31,60238
2016-12-31,120729
2017-12-31,196251
2018-12-31,307826


In [0]:
# remove incomplete 1st & last year to ensure all months are treated equal
reviews_trimmed = reviews[(reviews['date'].dt.year>2009)&(reviews['date'].dt.year<2020)]

In [17]:
reviews_trimmed['month']=reviews_trimmed['date'].dt.month
reviews_grp = reviews_trimmed.groupby(by='month')[['listing_id']].count()



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [18]:
fig = px.bar(reviews_grp,
             x=reviews_grp.index,
             y='listing_id'
             )

fig.update_layout(title="No. of reviews per month",
                  xaxis_title="Month",
                  yaxis_title="No. of reviews",
                  )

fig.show()


## Can we predict the price of a new listing?

In [0]:
listing_combined = listings.merge(detailed_listings, on='id',
                 how='outer', suffixes=('', '_y'))
listing_combined.drop(listing_combined.filter(regex='_y$').columns.tolist(),axis=1, inplace=True)
listing_combined.filter(regex='price').columns.tolist()
listing_combined.drop(['weekly_price', 'monthly_price'], axis=1, inplace=True)

In [0]:
# # Create test/train mask
# itrain, itest = train_test_split(range(listing_combined.shape[0]), train_size=0.6)
# mask=np.ones(listing_combined.shape[0], dtype='int')
# mask[itrain]=1
# mask[itest]=0
# mask = (mask==1)

In [0]:
# Choose relevant columns that a new listing may have to prevent data leakage 
object_cols = ['neighbourhood','room_type',
               'property_type']
float_cols = ['bathrooms' ,'bedrooms' ,'beds', 'square_feet',
              'price']
bool_cols = ['requires_license', 'instant_bookable', 'is_business_travel_ready',
             'require_guest_profile_picture', 'require_guest_phone_verification']
int_cols = ['minimum_nights', 'availability_365' ,'accommodates','guests_included']

In [0]:
# Create training data df
df = listing_combined[object_cols+float_cols+bool_cols + int_cols]

In [0]:
# Onehotencode categorical data, drop na values
df = df.join(pd.get_dummies(df[object_cols]))
df.drop(object_cols,axis=1,inplace=True)
df.dropna(axis=0,inplace=True)

In [0]:
# Clean boolean columns
df[bool_cols] = clean_boolean(df[bool_cols])

In [0]:
# Define X, y
y = df.pop('price')
X = df

In [0]:
# Scale numerical columns
sc = StandardScaler()
numeric_cols = X.select_dtypes(['int64','float64']).columns

sc_transformed_cols = pd.DataFrame(sc.fit_transform(X[numeric_cols]), columns=numeric_cols, index = X.index)
X.drop(numeric_cols,axis=1,inplace=True)
X = X.merge(sc_transformed_cols,left_index= True, right_index = True, how='left')

In [0]:
# Split train test df
X_train,X_test,y_train,y_test =train_test_split(X,y)

In [0]:
# Define models
names = ['RandomForestRegressor', 'AdaBoostRegressor', 
          'BaggingRegressor', 'GradientBoostingRegressor']
models = [RandomForestRegressor(), AdaBoostRegressor(), 
          BaggingRegressor(), GradientBoostingRegressor()]

In [134]:
# Fit, train, evaluate models 
scores = {}
for name, model in tqdm(list(zip(names,models))): 
    model.fit(X_train, y_train)
    score = model.score(X_test,y_test)
    scores[name] = score
scores

HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))




{'AdaBoostRegressor': 0.9404818780695479,
 'BaggingRegressor': 0.8785515548123201,
 'GradientBoostingRegressor': 0.9531422390357929,
 'RandomForestRegressor': 0.9366996068212843}

In [144]:
model = GradientBoostingRegressor()
model.fit(X_train, y_train)
Feature_importance = pd.DataFrame(list(zip(X_train.columns, model.feature_importances_)))
Feature_importance.columns = ['Feature', 'Importance']
px.bar(data_frame=Feature_importance.sort_values(by='Importance', ascending=False).head(10),
       x = 'Feature',
       y = 'Importance')

##Parameter tuning

In [148]:
model = GradientBoostingRegressor()
parameters = {'learning_rate':[0.001, 0.01, 0.1, 1], 'n_estimators':[10, 100, 1000],
              'min_samples_split':[2, 3, 4], 'min_samples_leaf':[1, 2, 3], 
              'max_depth':[3]}
X = X_train
y = y_train

optimised_model = cv_optimize(model, parameters, X, y)

BEST {'learning_rate': 1, 'max_depth': 3, 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 1000} 0.574161499839035 {'mean_fit_time': array([0.01070118, 0.07421441, 0.70544586, 0.00942755, 0.07169218,
       0.69569268, 0.00946436, 0.07117729, 0.69989781, 0.00946484,
       0.07025127, 0.69423933, 0.00926437, 0.07035508, 0.69956765,
       0.00934262, 0.07066283, 0.69934359, 0.01038337, 0.07051163,
       0.69053111, 0.00953879, 0.07136135, 0.6885283 , 0.0095026 ,
       0.07530565, 0.69419379, 0.00957789, 0.0726687 , 0.69629936,
       0.00952191, 0.07196155, 0.71205797, 0.009723  , 0.07277999,
       0.70507679, 0.00961499, 0.07280583, 0.72496786, 0.00945978,
       0.07123694, 0.7029882 , 0.00960793, 0.07173977, 0.69576197,
       0.00947604, 0.07283559, 0.69679117, 0.01056371, 0.07130723,
       0.68868608, 0.00960698, 0.07190018, 0.68855224, 0.00953116,
       0.07302361, 0.69993682, 0.00950637, 0.07335033, 0.6945683 ,
       0.00954309, 0.07151461, 0.69206042, 0.0094

In [153]:
new_score = optimised_model.score(X_test,y_test)
new_score, scores['GradientBoostingRegressor']

(0.956670353999435, 0.9531422390357929)

In [152]:
Feature_importance = pd.DataFrame(list(zip(X_train.columns, optimised_model.feature_importances_)))
Feature_importance.columns = ['Feature', 'Importance']
px.bar(data_frame=Feature_importance.sort_values(by='Importance', ascending=False).head(10),
       x = 'Feature',
       y = 'Importance')