# Introduction

# Data Import

In [49]:
# import modules
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import scikitplot as skplt
from geopy.distance import vincenty
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, classification_report
import warnings
warnings.filterwarnings('ignore')

In [50]:
# load listings data
# NOTE(review): absolute, machine-specific location; the notebook only runs where this exact
# file exists. Consider a configurable data directory.
LISTINGS_CSV_PATH = '/Users/mengphilshen/Dropbox/Project/Data_Challenges/Proj_Airbnb/data/Boston/Boston_listings.csv'
Boston_listings0 = pd.read_csv(LISTINGS_CSV_PATH)
# summary of row count, column count, dtypes and memory footprint
Boston_listings0.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6155 entries, 0 to 6154
Columns: 106 entries, id to reviews_per_month
dtypes: float64(25), int64(21), object(60)
memory usage: 5.0+ MB


In [5]:
# list every column name of the raw listings frame
Boston_listings0.columns.tolist()

['id',
 'listing_url',
 'scrape_id',
 'last_scraped',
 'name',
 'summary',
 'space',
 'description',
 'experiences_offered',
 'neighborhood_overview',
 'notes',
 'transit',
 'access',
 'interaction',
 'house_rules',
 'thumbnail_url',
 'medium_url',
 'picture_url',
 'xl_picture_url',
 'host_id',
 'host_url',
 'host_name',
 'host_since',
 'host_location',
 'host_about',
 'host_response_time',
 'host_response_rate',
 'host_acceptance_rate',
 'host_is_superhost',
 'host_thumbnail_url',
 'host_picture_url',
 'host_neighbourhood',
 'host_listings_count',
 'host_total_listings_count',
 'host_verifications',
 'host_has_profile_pic',
 'host_identity_verified',
 'street',
 'neighbourhood',
 'neighbourhood_cleansed',
 'neighbourhood_group_cleansed',
 'city',
 'state',
 'zipcode',
 'market',
 'smart_location',
 'country_code',
 'country',
 'latitude',
 'longitude',
 'is_location_exact',
 'property_type',
 'room_type',
 'accommodates',
 'bathrooms',
 'bedrooms',
 'beds',
 'bed_type',
 'amenities',


# Data Cleaning

## Missing Values

In [6]:
# tabulate missing values per column, as a count and as a percentage of all rows
n_listings = Boston_listings0.shape[0]
missing_data = (
    Boston_listings0.isnull().sum(axis = 0)
    .rename_axis('variable')
    .reset_index(name = 'missing values')
)
missing_data['missing%'] = missing_data['missing values'] / n_listings * 100

# most incomplete columns first; only columns that actually have gaps are displayed
missing_data_srt = missing_data.sort_values('missing%', ascending = False).reset_index(drop = True)
missing_data_srt.loc[missing_data_srt['missing%'] > 0]

Unnamed: 0,variable,missing values,missing%
0,host_acceptance_rate,6155,100.0
1,thumbnail_url,6155,100.0
2,neighbourhood_group_cleansed,6155,100.0
3,jurisdiction_names,6155,100.0
4,license,6155,100.0
5,xl_picture_url,6155,100.0
6,medium_url,6155,100.0
7,square_feet,6057,98.407799
8,weekly_price,5628,91.437855
9,monthly_price,5626,91.405361


## Classify Variables

In [37]:
# identify id and url features
id_url_cols = [
    # identifiers
    'id', 'scrape_id', 'host_id',
    # listing and picture urls
    'listing_url', 'thumbnail_url', 'medium_url', 'picture_url', 'xl_picture_url',
    # host urls
    'host_url', 'host_thumbnail_url', 'host_picture_url',
    # NOTE(review): zipcode is also listed in textual_cols
    'zipcode',
]
Boston_listings0.loc[:, id_url_cols].head()

Unnamed: 0,id,scrape_id,host_id,listing_url,thumbnail_url,medium_url,picture_url,xl_picture_url,host_url,host_thumbnail_url,host_picture_url,zipcode
0,3781,20190209175027,4804,https://www.airbnb.com/rooms/3781,,,https://a0.muscache.com/im/pictures/24670/b2de...,,https://www.airbnb.com/users/show/4804,https://a0.muscache.com/im/users/4804/profile_...,https://a0.muscache.com/im/users/4804/profile_...,2128
1,5506,20190209175027,8229,https://www.airbnb.com/rooms/5506,,,https://a0.muscache.com/im/pictures/1598e8b6-5...,,https://www.airbnb.com/users/show/8229,https://a0.muscache.com/im/users/8229/profile_...,https://a0.muscache.com/im/users/8229/profile_...,2119
2,6695,20190209175027,8229,https://www.airbnb.com/rooms/6695,,,https://a0.muscache.com/im/pictures/38ac4797-e...,,https://www.airbnb.com/users/show/8229,https://a0.muscache.com/im/users/8229/profile_...,https://a0.muscache.com/im/users/8229/profile_...,2119
3,6976,20190209175027,16701,https://www.airbnb.com/rooms/6976,,,https://a0.muscache.com/im/pictures/85bf0653-b...,,https://www.airbnb.com/users/show/16701,https://a0.muscache.com/im/pictures/d9075ba6-7...,https://a0.muscache.com/im/pictures/d9075ba6-7...,2131
4,8789,20190209175027,26988,https://www.airbnb.com/rooms/8789,,,https://a0.muscache.com/im/pictures/32210/7fdd...,,https://www.airbnb.com/users/show/26988,https://a0.muscache.com/im/pictures/user/d7d71...,https://a0.muscache.com/im/pictures/user/d7d71...,2108


In [8]:
# identify datetime features
# (the csv was read without date parsing, so these columns still hold date strings)
datetime_cols = [
    'last_scraped', 'calendar_last_scraped',
    'host_since',
    'first_review', 'last_review',
]
Boston_listings0.loc[:, datetime_cols].head()

Unnamed: 0,last_scraped,calendar_last_scraped,host_since,first_review,last_review
0,2019-02-09,2019-02-09,2008-12-03,2015-07-10,2018-07-02
1,2019-02-09,2019-02-09,2009-02-19,2009-03-21,2018-11-18
2,2019-02-09,2019-02-09,2009-02-19,2009-08-06,2018-11-25
3,2019-02-09,2019-02-09,2009-05-11,2009-07-19,2018-10-23
4,2019-02-09,2019-02-09,2009-07-22,2014-08-12,2018-12-20


In [22]:
# identify numeric features
# NOTE(review): host_acceptance_rate, neighbourhood_group_cleansed, license and jurisdiction_names
# are 100% missing according to the missing-value table; neighbourhood_group_cleansed is also
# listed in textual_cols.
numeric_cols = [
    'host_acceptance_rate', 'host_listings_count', 'host_total_listings_count',
    'neighbourhood_group_cleansed',
    # size of the listing
    'accommodates', 'bathrooms', 'bedrooms', 'beds', 'square_feet', 'guests_included',
    # stay-length limits
    'minimum_nights', 'maximum_nights',
    'minimum_minimum_nights', 'maximum_minimum_nights',
    'minimum_maximum_nights', 'maximum_maximum_nights',
    'minimum_nights_avg_ntm', 'maximum_nights_avg_ntm',
    # availability windows
    'availability_30', 'availability_60', 'availability_90', 'availability_365',
    # reviews
    'number_of_reviews', 'number_of_reviews_ltm',
    'review_scores_rating', 'review_scores_accuracy', 'review_scores_cleanliness',
    'review_scores_checkin', 'review_scores_communication', 'review_scores_location',
    'review_scores_value',
    'license', 'jurisdiction_names',
    'reviews_per_month',
    # host listing counts
    'calculated_host_listings_count',
    'calculated_host_listings_count_entire_homes',
    'calculated_host_listings_count_private_rooms',
    'calculated_host_listings_count_shared_rooms',
]
Boston_listings0.loc[:, numeric_cols].head()

Unnamed: 0,host_acceptance_rate,host_listings_count,host_total_listings_count,neighbourhood_group_cleansed,accommodates,bathrooms,bedrooms,beds,square_feet,guests_included,...,review_scores_communication,review_scores_location,review_scores_value,license,jurisdiction_names,reviews_per_month,calculated_host_listings_count,calculated_host_listings_count_entire_homes,calculated_host_listings_count_private_rooms,calculated_host_listings_count_shared_rooms
0,,4.0,4.0,,2,1.0,1.0,1.0,,2,...,10.0,10.0,10.0,,,0.32,1,1,0,0
1,,6.0,6.0,,2,1.0,1.0,1.0,225.0,1,...,10.0,9.0,10.0,,,0.66,6,6,0,0
2,,6.0,6.0,,4,1.0,1.0,2.0,,2,...,10.0,9.0,9.0,,,0.73,6,6,0,0
3,,1.0,1.0,,2,1.0,1.0,1.0,,1,...,10.0,9.0,10.0,,,0.64,1,0,1,0
4,,11.0,11.0,,2,1.0,1.0,1.0,,1,...,10.0,10.0,9.0,,,0.4,10,10,0,0


In [32]:
# identify textual features
# NOTE(review): neighbourhood_group_cleansed is also listed in numeric_cols and zipcode in id_url_cols
textual_cols = [
    # listing descriptions
    'name', 'summary', 'space', 'description', 'experiences_offered',
    'neighborhood_overview', 'notes', 'transit', 'access', 'interaction', 'house_rules',
    # host details
    'host_name', 'host_location', 'host_about', 'host_response_time',
    'host_neighbourhood', 'host_verifications',
    # location
    'street', 'neighbourhood', 'neighbourhood_cleansed', 'neighbourhood_group_cleansed',
    'city', 'state', 'zipcode', 'market', 'smart_location', 'country_code', 'country',
    # property and booking attributes
    'property_type', 'room_type', 'bed_type', 'amenities',
    'calendar_updated', 'cancellation_policy',
]
Boston_listings0.loc[:, textual_cols].head()

Unnamed: 0,name,summary,space,description,experiences_offered,neighborhood_overview,notes,transit,access,interaction,...,market,smart_location,country_code,country,property_type,room_type,bed_type,amenities,calendar_updated,cancellation_policy
0,HARBORSIDE-Walk to subway,Fully separate apartment in a two apartment bu...,This is a totally separate apartment located o...,Fully separate apartment in a two apartment bu...,none,"Mostly quiet ( no loud music, no crowed sidewa...",Building is on quiet side of airport. Building...,Local subway stop ( Maverick Station on the Bl...,Guests solely occupy the 1 floor apartment wit...,We sometimes travel. Always available via: mob...,...,Boston,"Boston, MA",US,United States,Apartment,Entire home/apt,Real Bed,"{TV,""Cable TV"",Wifi,""Air conditioning"",Kitchen...",4 months ago,super_strict_30
1,**$79 Special ** Private! Minutes to center!,This is a private guest room with private bath...,**THE BEST Value in BOSTON!!*** PRIVATE GUEST ...,This is a private guest room with private bath...,none,"Peacful, Architecturally interesting, historic...","Guest room is on the 3rd floor, Steps are req...","Excellent Access to all of Boston, Walk to Mus...",You get full access to the guest room with pri...,"We give guests privacy, but we are available ...",...,Boston,"Boston, MA",US,United States,Guest suite,Entire home/apt,Real Bed,"{TV,""Cable TV"",Internet,Wifi,""Air conditioning...",today,strict_14_with_grace_period
2,$99 Special!! Home Away! Condo,,** WELCOME *** FULL PRIVATE APARTMENT In a His...,** WELCOME *** FULL PRIVATE APARTMENT In a His...,none,"Peaceful, Architecturally interesting, histori...",Check out some of the other accommodations we ...,"We are a 5 minute walk to the subway, 5 minut...","Full Private apartment. 1 bedroom, kitchen, ...",,...,Boston,"Boston, MA",US,United States,Condominium,Entire home/apt,Real Bed,"{TV,""Cable TV"",Internet,Wifi,""Air conditioning...",today,strict_14_with_grace_period
3,Mexican Folk Art Haven in Boston Residential Area,Come stay with me in Boston's Roslindale neigh...,"This is a well-maintained, two-family house bu...",Come stay with me in Boston's Roslindale neigh...,none,The LOCATION: Roslindale is a safe and diverse...,"There is no smoking allowed, inside the apartm...","PUBLIC TRANSPORTATION: From the house, quick p...","You can have access to your room, the adjacent...",ABOUT ME: I am one of the original Boston AIRB...,...,Boston,"Boston, MA",US,United States,Apartment,Private room,Real Bed,"{TV,""Cable TV"",Wifi,""Air conditioning"",Kitchen...",today,moderate
4,Curved Glass Studio/1bd facing Park,"Bright, 1 bed with curved glass windows facing...",Fully Furnished studio with enclosed bedroom. ...,"Bright, 1 bed with curved glass windows facing...",none,Beacon Hill is a historic neighborhood filled ...,Guests should expect to show government ID to ...,The MBTA site is a great reference for public ...,Guests have access to the full unit,I'm available for questions and/or issues.,...,Boston,"Boston, MA",US,United States,Apartment,Entire home/apt,Real Bed,"{TV,""Cable TV"",Internet,Wifi,""Air conditioning...",yesterday,strict_14_with_grace_period


## Engineer Features

### Convert string monetary values to numeric monetary values

In [10]:
# identify monetary features (stored as '$1,234.00'-style strings in the raw data)
monetary_cols = [
    'price', 'weekly_price', 'monthly_price',
    'security_deposit', 'cleaning_fee', 'extra_people',
]
Boston_listings0.loc[:, monetary_cols].head()

Unnamed: 0,price,weekly_price,monthly_price,security_deposit,cleaning_fee,extra_people
0,$125.00,$750.00,"$2,250.00",$0.00,$75.00,$0.00
1,$145.00,$980.00,"$3,000.00",$0.00,$40.00,$0.00
2,$169.00,,,,$70.00,$8.00
3,$65.00,$395.00,"$1,350.00",$0.00,$0.00,$30.00
4,$99.00,"$1,200.00","$2,900.00","$1,000.00",$250.00,$0.00


In [11]:
# define strPrice_to_numPrice function
def strPrice_to_numPrice(price_string):
    '''
    Converts a USD price from string to numeric format

    Args:
        price_string (string): USD price in string format (e.g., '$123,456.00').
            Missing values (None, NaN, empty or whitespace-only strings) are accepted.

    Returns:
        price_numeric (float): USD price in numeric format (e.g., 123456.00);
            NaN when the input is missing

    Raises:
        ValueError: if the text left after removing '$' and ',' is not a number
    '''

    if price_string is None:
        return float('nan')

    # drop surrounding whitespace and thousands separators before parsing
    price_text = str(price_string).strip().replace(',', '')
    if not price_text:
        return float('nan')

    # keep only what follows the last '$'; str(NaN) is 'nan', which float() parses back to NaN
    price_numeric = float(price_text.rpartition('$')[2])

    # a sign written in front of the currency symbol ('-$5.00') is cut off together with the '$'
    if price_text.startswith('-') and '$' in price_text:
        price_numeric = -price_numeric
    return price_numeric

In [12]:
# build a new frame whose monetary columns are floats; the raw frame is left untouched
Boston_listings1 = Boston_listings0.assign(
    **{money_col: Boston_listings0[money_col].apply(strPrice_to_numPrice)
       for money_col in monetary_cols}
)
Boston_listings1.loc[:, monetary_cols].head()

Unnamed: 0,price,weekly_price,monthly_price,security_deposit,cleaning_fee,extra_people
0,125.0,750.0,2250.0,0.0,75.0,0.0
1,145.0,980.0,3000.0,0.0,40.0,0.0
2,169.0,,,,70.0,8.0
3,65.0,395.0,1350.0,0.0,0.0,30.0
4,99.0,1200.0,2900.0,1000.0,250.0,0.0


### Convert string boolean values to numeric boolean values

In [13]:
# identify boolean features (encoded as 't' / 'f' strings)
boolean_cols = [
    # host flags
    'host_is_superhost', 'host_has_profile_pic', 'host_identity_verified',
    # listing flags
    'is_location_exact', 'has_availability', 'requires_license',
    'instant_bookable', 'is_business_travel_ready',
    # guest requirements
    'require_guest_profile_picture', 'require_guest_phone_verification',
]
Boston_listings1.loc[:, boolean_cols].head()

Unnamed: 0,host_is_superhost,host_has_profile_pic,host_identity_verified,is_location_exact,has_availability,requires_license,instant_bookable,is_business_travel_ready,require_guest_profile_picture,require_guest_phone_verification
0,t,t,f,t,t,f,f,f,f,f
1,t,t,t,t,t,f,t,f,f,f
2,t,t,t,t,t,f,t,f,f,f
3,t,t,t,t,t,f,f,f,t,f
4,f,t,f,t,t,f,f,f,f,f


In [14]:
# define strBoolean_to_numBoolean function
def strBoolean_to_numBoolean(s):
    '''
    Maps a single-letter boolean flag to an integer

    Args:
        s (string): 't', 'f' or any other value (e.g., a missing entry)

    Returns:
        bool_number (int): 1 for 't', 0 for 'f', None for anything else
    '''

    return 0 if s == 'f' else (1 if s == 't' else None)

In [15]:
# build a new frame whose 't' / 'f' flag columns are recoded to 1 / 0
Boston_listings2 = Boston_listings1.assign(
    **{flag_col: Boston_listings1[flag_col].apply(strBoolean_to_numBoolean)
       for flag_col in boolean_cols}
)
Boston_listings2.loc[:, boolean_cols].head()

Unnamed: 0,host_is_superhost,host_has_profile_pic,host_identity_verified,is_location_exact,has_availability,requires_license,instant_bookable,is_business_travel_ready,require_guest_profile_picture,require_guest_phone_verification
0,1.0,1.0,0.0,1,1,0,0,0,0,0
1,1.0,1.0,1.0,1,1,0,1,0,0,0
2,1.0,1.0,1.0,1,1,0,1,0,0,0
3,1.0,1.0,1.0,1,1,0,0,0,1,0
4,0.0,1.0,0.0,1,1,0,0,0,0,0


### Create distance_to_center feature

In [17]:
# specify coordinates of Boston (the same reference point is attached to every listing)
Boston_listings3 = Boston_listings2.assign(lat_center = 42.3584300,
                                           lon_center = -71.0597700)

In [19]:
# vincenty distance, in kilometres, from each listing to the reference point
# NOTE(review): geopy 2.0 removed `vincenty` in favour of `geodesic` — confirm the installed geopy version
Boston_listings3['distance_to_center'] = [
    vincenty((lat, lon), (center_lat, center_lon)).km
    for lat, lon, center_lat, center_lon in zip(Boston_listings3['latitude'],
                                                Boston_listings3['longitude'],
                                                Boston_listings3['lat_center'],
                                                Boston_listings3['lon_center'])
]
Boston_listings3['distance_to_center'].head()

0    2.616808
1    4.338483
2    4.212168
3    9.642258
4    0.251808
Name: distance_to_center, dtype: float64

# Exploratory Data Analysis

## Numeric Variables

In [25]:
# numeric variables kept for the analysis
numVar = [
    # prices and fees
    'price', 'security_deposit', 'cleaning_fee', 'extra_people',
    # engineered location feature
    'distance_to_center',
    # host portfolio
    'host_listings_count', 'host_total_listings_count',
    # size of the listing
    'accommodates', 'bathrooms', 'bedrooms', 'beds', 'guests_included',
    # stay-length limits
    'minimum_nights', 'maximum_nights',
    'minimum_minimum_nights', 'maximum_minimum_nights',
    'minimum_maximum_nights', 'maximum_maximum_nights',
    'minimum_nights_avg_ntm', 'maximum_nights_avg_ntm',
    # availability windows
    'availability_30', 'availability_60', 'availability_90', 'availability_365',
    # reviews
    'number_of_reviews', 'number_of_reviews_ltm',
    'review_scores_rating', 'review_scores_accuracy', 'review_scores_cleanliness',
    'review_scores_checkin', 'review_scores_communication', 'review_scores_location',
    'review_scores_value',
    'reviews_per_month',
    # host listing counts
    'calculated_host_listings_count',
    'calculated_host_listings_count_entire_homes',
    'calculated_host_listings_count_private_rooms',
    'calculated_host_listings_count_shared_rooms',
]

In [26]:
# summary statistics of the numeric variables, one row per variable
Boston_listings3.loc[:, numVar].describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
price,6155.0,206.795126,375.4239,0.0,83.0,150.0,220.0,5000.0
security_deposit,4280.0,196.570561,317.9056,0.0,0.0,100.0,300.0,5000.0
cleaning_fee,5176.0,73.192427,48.81477,0.0,35.0,70.0,100.0,495.0
extra_people,6155.0,11.45199,20.3398,0.0,0.0,0.0,20.0,300.0
distance_to_center,6155.0,3.958259,2.839366,0.086062,1.737084,3.189664,5.86236,14.88431
host_listings_count,6153.0,152.358524,372.0544,0.0,1.0,4.0,38.0,1480.0
host_total_listings_count,6153.0,152.358524,372.0544,0.0,1.0,4.0,38.0,1480.0
accommodates,6155.0,3.407311,2.175956,1.0,2.0,3.0,4.0,16.0
bathrooms,6150.0,1.268049,0.5111175,0.0,1.0,1.0,1.5,6.0
bedrooms,6149.0,1.351114,0.9266306,0.0,1.0,1.0,2.0,8.0


## Boolean Variables

In [47]:
# boolean variables kept for the analysis
booVar = [
    # host flags
    'host_is_superhost', 'host_has_profile_pic', 'host_identity_verified',
    # listing flags
    'is_location_exact', 'instant_bookable',
    # guest requirements
    'require_guest_profile_picture', 'require_guest_phone_verification',
]

In [48]:
# summary statistics of the 0/1 variables (the mean is the share of listings flagged 1)
Boston_listings3.loc[:, booVar].describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
host_is_superhost,6153.0,0.232732,0.422607,0.0,0.0,0.0,0.0,1.0
host_has_profile_pic,6153.0,0.998862,0.033713,0.0,1.0,1.0,1.0,1.0
host_identity_verified,6153.0,0.387453,0.487208,0.0,0.0,0.0,1.0,1.0
is_location_exact,6155.0,0.829569,0.376041,0.0,1.0,1.0,1.0,1.0
instant_bookable,6155.0,0.476848,0.499504,0.0,0.0,0.0,1.0,1.0
require_guest_profile_picture,6155.0,0.019496,0.138273,0.0,0.0,0.0,0.0,1.0
require_guest_phone_verification,6155.0,0.051178,0.220378,0.0,0.0,0.0,0.0,1.0


## Categorical Variables

In [43]:
# categorical variables kept for the analysis
catVar = [
    'host_response_time', 'neighbourhood_cleansed',
    'property_type', 'room_type', 'bed_type',
    'cancellation_policy',
]

In [44]:
# print the frequency table of each categorical variable, preceded by two blank lines and its name
for col in catVar:
    print("\n", col, Boston_listings3[col].value_counts(), sep = "\n")



host_response_time
within an hour        3495
within a few hours     768
within a day           537
a few days or more     117
Name: host_response_time, dtype: int64


neighbourhood_cleansed
Dorchester                 537
Jamaica Plain              514
Back Bay                   494
Downtown                   453
South End                  445
Fenway                     440
Brighton                   358
South Boston               338
Allston                    329
Roxbury                    319
East Boston                311
Beacon Hill                255
North End                  245
Mission Hill               214
Charlestown                156
West End                   154
Chinatown                  139
Roslindale                 117
South Boston Waterfront     76
Mattapan                    75
West Roxbury                70
Hyde Park                   59
Bay Village                 39
Longwood Medical Area       11
Leather District             7
Name: neighbourhood_cleansed, dt

# Appendix

In [95]:
# names of the columns of the raw frame that pandas selects for the 'int' dtype
Boston_listings0.select_dtypes(include=['int']).columns.tolist()

['id',
 'scrape_id',
 'host_id',
 'accommodates',
 'guests_included',
 'minimum_nights',
 'maximum_nights',
 'minimum_minimum_nights',
 'maximum_minimum_nights',
 'minimum_maximum_nights',
 'maximum_maximum_nights',
 'availability_30',
 'availability_60',
 'availability_90',
 'availability_365',
 'number_of_reviews',
 'number_of_reviews_ltm',
 'calculated_host_listings_count',
 'calculated_host_listings_count_entire_homes',
 'calculated_host_listings_count_private_rooms',
 'calculated_host_listings_count_shared_rooms']

In [None]:
# identify uninformative features
uninformative_cols = [
    # 100% missing according to the missing-value table
    'host_acceptance_rate', 'thumbnail_url', 'neighbourhood_group_cleansed',
    'jurisdiction_names', 'license', 'xl_picture_url', 'medium_url',
    # presumably (near-)constant across listings; the value counts in the next cell check this
    'experiences_offered', 'state', 'country_code', 'country',
]

In [None]:
# print the frequency table of each candidate column, followed by two blank lines
for col in uninformative_cols:
    print(Boston_listings0[col].value_counts(), "\n", sep = "\n")