In [1]:
import pandas as pd
import datetime
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import re
from functools import reduce
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, mean_squared_error,classification_report, roc_curve 
%matplotlib inline

In [2]:
# Load the listings CSV (expected in the working directory) into a DataFrame.
listing_df=pd.read_csv('listings.csv')

In [3]:
# Columns removed up front, before any preprocessing is done.
unused_listing_columns = [
    "bathrooms",
    "listing_url",
    "scrape_id",
    "last_scraped",
    "picture_url",
    "host_url",
    "host_name",
    "host_thumbnail_url",
    "host_picture_url",
    "neighbourhood_group_cleansed",
    "calendar_updated",
    "calendar_last_scraped",
]
listing_df = listing_df.drop(columns=unused_listing_columns)

# Listing dataset: data preprocessing

In [4]:
# True if at least one row is an exact duplicate of an earlier row.
listing_df.duplicated().any()

False

In [5]:
# Lift the row display limit (this setting stays in effect for later cells too),
# then print the number of missing values in every column.
pd.set_option('display.max_rows', None)
missing_per_column = listing_df.isnull().sum()
print(missing_per_column)

id                                                 0
name                                               0
description                                       71
neighborhood_overview                           1790
host_id                                            0
host_since                                         1
host_location                                      6
host_about                                      1934
host_response_time                              1129
host_response_rate                              1129
host_acceptance_rate                             837
host_is_superhost                                  1
host_neighbourhood                               362
host_listings_count                                1
host_total_listings_count                          1
host_verifications                                 0
host_has_profile_pic                               1
host_identity_verified                             1
neighbourhood                                 

# HANDLING MISSING VALUES

In [6]:

# Missing-value count and the distinct raw values of host_response_rate.
response_rate_raw = listing_df['host_response_rate']
print(response_rate_raw.isnull().sum())
print(response_rate_raw.unique())

1129
['100%' nan '50%' '56%' '90%' '88%' '87%' '0%' '92%' '93%' '20%' '67%'
 '85%' '60%' '83%' '80%' '79%' '86%' '77%' '89%' '75%' '96%' '25%' '40%'
 '27%' '95%' '33%' '82%' '91%' '94%' '71%' '97%' '17%' '99%' '48%' '78%'
 '51%' '55%' '70%' '10%' '63%' '98%' '64%' '5%' '76%' '57%' '13%' '53%'
 '81%']


In [7]:
# Strip the percent sign, then fill missing rates with the mean of the known ones.
# The final astype(int) truncates the (float) mean to a whole number.
response_rate_text = listing_df['host_response_rate'].str.replace('%', '')
mean_response_rate = response_rate_text.dropna().astype(int).mean()
listing_df['host_response_rate'] = (
    response_rate_text
    .fillna(mean_response_rate)
    .astype(int)
)


In [8]:
# Same treatment as host_response_rate: drop the percent sign and impute the mean.
# The final astype(int) truncates the (float) mean to a whole number.
acceptance_rate_text = listing_df['host_acceptance_rate'].str.replace('%', '')
mean_acceptance_rate = acceptance_rate_text.dropna().astype(int).mean()
listing_df['host_acceptance_rate'] = (
    acceptance_rate_text
    .fillna(mean_acceptance_rate)
    .astype(int)
)

In [9]:
# host_response_time has 1129 missing values. They are not imputed here: each one
# is tagged with the placeholder 4 (a value outside the 0-3 codes assigned next),
# which keeps those rows identifiable.
listing_df['host_response_time'] = (
    listing_df['host_response_time']
    .fillna(4)
    .astype('category')
)

In [10]:
#0 = within an hour, 1 = within a few hours, 2 = within a day, 3 = a few days or more
# (4, the placeholder already used for missing values, is left as it is)
response_time_codes = {
    'within an hour': 0,
    'within a few hours': 1,
    'within a day': 2,
    'a few days or more': 3,
}
# Assign the recoded column back to the frame. Calling replace(..., inplace=True)
# on listing_df['host_response_time'] works on a column selection and does not
# reliably update listing_df (it is a silent no-op under pandas Copy-on-Write).
listing_df['host_response_time'] = (
    listing_df['host_response_time']
    .astype(object)  # leave the categorical dtype so every value can be remapped
    .map(lambda value: response_time_codes.get(value, value))
    .astype('int64')  # later cells compare against 4 and must not one-hot encode this column
)

In [11]:
# review_scores_rating=0 ---> (23 records) the host canceled the reservation before arrival (automated posting)
# review_scores_rating=NULL ---> (1298 records) no ratings posted
# Missing scores are set to 0 for each of the review columns below.
review_score_columns = (
    'review_scores_rating',
    'review_scores_cleanliness',
    'review_scores_checkin',
    'review_scores_communication',
    'review_scores_location',
)
for score_column in review_score_columns:
    listing_df[score_column] = listing_df[score_column].fillna(0)

In [12]:
# Frequency of each distinct bathrooms_text label.
listing_df['bathrooms_text'].value_counts()

1 bath               3133
1 private bath        865
2 baths               731
1 shared bath         649
1.5 baths             282
1.5 shared baths      113
2.5 baths             103
2 shared baths         90
3 baths                89
5 shared baths         64
3.5 baths              41
2.5 shared baths       34
4 baths                32
0 shared baths         25
3 shared baths         25
0 baths                22
4 shared baths         21
10 shared baths        17
5 baths                17
4.5 baths              14
3.5 shared baths        8
Shared half-bath        6
6 shared baths          6
Private half-bath       3
Half-bath               3
10 baths                2
6.5 shared baths        2
4.5 shared baths        2
5.5 baths               1
6 baths                 1
Name: bathrooms_text, dtype: int64

In [13]:
#Rewrite the three "half-bath" spellings in the same "<number> <kind> bath" form as the other labels
half_bath_labels = {
    'Half-bath': '0.5 bath',
    'Shared half-bath': '0.5 shared bath',
    'Private half-bath': '0.5 private bath',
}
for old_label, new_label in half_bath_labels.items():
    listing_df.loc[listing_df['bathrooms_text'] == old_label, 'bathrooms_text'] = new_label

In [14]:
#Rows with no bathrooms_text (12 missing rows) and their index labels
missing_bath_mask = listing_df['bathrooms_text'].isnull()
bathrooms_null_data = listing_df[missing_bath_mask]
bath_null_ids = bathrooms_null_data.index.tolist()
#Of those, the rows that do have a description to search
bath_has_description = bathrooms_null_data['description'].notnull()
bath_desc_ids = bathrooms_null_data.index[bath_has_description].tolist()
#Summary statistics of the column: top is "1 bath" with freq 3133
listing_df['bathrooms_text'].describe()


count       6401
unique        30
top       1 bath
freq        3133
Name: bathrooms_text, dtype: object

In [15]:
#Fill bathrooms from description or mode (1)
def find_baths(index):
    """Derive a ``bathrooms_text`` label from a listing's description.

    The description of the row labelled ``index`` in ``listing_df`` is searched
    for the first "half-bath" and the first "full bath". For each phrase found,
    the character two positions before it is read as the count when it is a
    digit (e.g. the "2" in "2 full baths"); otherwise the count is 1. Half-baths
    contribute 0.5 each.

    Parameters
    ----------
    index : label
        Row label in ``listing_df``. The row's description must be a string
        (the caller only passes rows whose description is not null).

    Returns
    -------
    str
        A label such as "1 bath", "0.5 bath" or "2.5 baths". When neither phrase
        occurs, the mode "1 bath" is returned.
    """
    # Label-based lookup, matching the label-based .at[] write done by the caller.
    item = listing_df.at[index, 'description']

    def _mentions(phrase):
        """Return the count attached to the first occurrence of phrase (0 if absent)."""
        position = item.find(phrase)
        if position == -1:
            return 0
        # Only look two characters back when they exist; a negative index would
        # silently read from the end of the description instead.
        preceding = item[position - 2] if position >= 2 else ''
        # Convert the digit to a number: keeping it as text breaks the arithmetic below.
        return int(preceding) if preceding.isdecimal() else 1

    total = 0.5 * _mentions("half-bath") + _mentions("full bath")

    #no data found, set to mode (1)
    if total == 0:
        return "1 bath"
    # Singular for one bath or less, in line with the "1 bath" / "0.5 bath" labels
    # used elsewhere in the column.
    noun = "bath" if total <= 1 else "baths"
    return "{:g} {}".format(total, noun)
 
#Fill each missing bathrooms_text that has a description with the value parsed from it
for row_label in bath_desc_ids:
    listing_df.at[row_label, 'bathrooms_text'] = find_baths(row_label)

In [16]:
#Rows missing both bathrooms_text and a description get the mode ("1 bath")
from collections import Counter
# Multiset difference: ids with null bathroom data minus those that had a description.
bath_desc_null_ids = list((Counter(bath_null_ids) - Counter(bath_desc_ids)).elements())
listing_df.loc[bath_desc_null_ids, 'bathrooms_text'] = "1 bath"
listing_df['bathrooms_text'].isnull().sum()

0

## Bedrooms and beds: missing data

In [17]:
bedroom_values = listing_df['bedrooms']
# mode = 1 bedroom (freq=3601); this result is not the last expression, so it is not displayed
bedroom_values.value_counts()
# 942 missing records
bedroom_values.isnull().sum()

942

In [18]:
# Rows with no bedrooms value, and the subset of them that has a description.
missing_bedrooms_mask = listing_df['bedrooms'].isnull()
bedrooms_null_data = listing_df[missing_bedrooms_mask]
bedrooms_null_ids = bedrooms_null_data.index.tolist()
bedrooms_has_description = bedrooms_null_data['description'].notnull()
bedrooms_desc_ids = bedrooms_null_data.index[bedrooms_has_description].tolist()
#Fill Bedrooms from description 
def find_bedrooms(index):
    """Derive a bedroom count from a listing's description.

    The description of the row labelled ``index`` in ``listing_df`` is searched
    for the first "bedroom". The character two positions before it is read as
    the count when it is a digit (e.g. the "2" in "2 bedroom"); otherwise the
    count is 1.

    Parameters
    ----------
    index : label
        Row label in ``listing_df``. The row's description must be a string
        (the caller only passes rows whose description is not null).

    Returns
    -------
    float
        The parsed count, or the mode 1.0 when nothing usable is found
        (no "bedroom" in the text, no digit before it, or a digit of 0).
    """
    # Label-based lookup, matching the label-based .at[] write done by the caller.
    item = listing_df.at[index, 'description']
    position = item.find("bedroom")
    if position == -1:
        return 1.0
    # Only look two characters back when they exist; a negative index would
    # silently read from the end of the description instead.
    preceding = item[position - 2] if position >= 2 else ''
    # Return a real number, not the digit character: the value is written into
    # the numeric bedrooms column.
    count = float(preceding) if preceding.isdecimal() else 1.0
    #no data found, set to mode (1)
    return count if count else 1.0

#Fill each missing bedrooms value that has a description with the count parsed from it
for row_label in bedrooms_desc_ids:
    listing_df.at[row_label, 'bedrooms'] = find_bedrooms(row_label)

In [19]:
#Rows missing both bedrooms and a description get the mode (1)
bedrooms_desc_null_ids = list((Counter(bedrooms_null_ids) - Counter(bedrooms_desc_ids)).elements())
listing_df.loc[bedrooms_desc_null_ids, 'bedrooms'] = 1.0
listing_df['bedrooms'].isnull().sum()

0

In [20]:
# beds: the mode is 1 bed and 169 records are missing.
# Collect the rows with no beds value, and the subset of them that has a description.
missing_beds_mask = listing_df['beds'].isnull()
beds_null_data = listing_df[missing_beds_mask]
beds_null_ids = beds_null_data.index.tolist()
beds_has_description = beds_null_data['description'].notnull()
beds_desc_ids = beds_null_data.index[beds_has_description].tolist()

In [21]:
def find_beds(index):
    """Derive a bed count from a listing's description.

    The description of the row labelled ``index`` in ``listing_df`` is searched
    for the first "bed" that is not the start of "bedroom". The character two
    positions before it is read as the count when it is a digit (e.g. the "3"
    in "3 beds"); otherwise the count is 1.

    Parameters
    ----------
    index : label
        Row label in ``listing_df``. The row's description must be a string
        (the caller only passes rows whose description is not null).

    Returns
    -------
    float
        The parsed count, or the mode 1.0 when nothing usable is found
        (no qualifying "bed" in the text, no digit before it, or a digit of 0).
    """
    # Label-based lookup, matching the label-based .at[] write done by the caller.
    item = listing_df.at[index, 'description']
    position = item.find("bed")
    # "bedroom(s)" also contains "bed"; skip those hits so that a bedroom count
    # is not mistaken for a bed count.
    while position != -1 and item.startswith("bedroom", position):
        position = item.find("bed", position + 1)
    if position == -1:
        return 1.0
    # Only look two characters back when they exist; a negative index would
    # silently read from the end of the description instead.
    preceding = item[position - 2] if position >= 2 else ''
    # Return a real number, not the digit character: the value is written into
    # the numeric beds column.
    count = float(preceding) if preceding.isdecimal() else 1.0
    #no data found, set to mode (1)
    return count if count else 1.0

#Fill each missing beds value that has a description with the count parsed from it
for row_label in beds_desc_ids:
    listing_df.at[row_label, 'beds'] = find_beds(row_label)

In [22]:
# Rows missing both beds and a description get the mode (1).
beds_desc_null_ids = list((Counter(beds_null_ids) - Counter(beds_desc_ids)).elements())
listing_df.loc[beds_desc_null_ids, 'beds'] = 1.0
listing_df['beds'].isnull().sum()

0

In [23]:
# Flatten the list-like host_verifications strings into one lower-cased list of
# tokens: brackets, quotes and spaces are removed before splitting on commas.
strip_list_syntax = str.maketrans('', '', '[]"\' ')
host_verifications = [
    token.lower()
    for raw_value in listing_df['host_verifications'].values
    for token in str(raw_value).translate(strip_list_syntax).split(',')
]
print(len(set(host_verifications)))

17


In [24]:
# Tally how often each verification method occurs, then show the tally from most to least frequent.
verif_dict = {}
for method in host_verifications:
    verif_dict.setdefault(method, 0)
    verif_dict[method] += 1
dict(sorted(verif_dict.items(), key=lambda pair: pair[1], reverse=True))

{'phone': 6395,
 'email': 5967,
 'reviews': 4597,
 'government_id': 4125,
 'offline_government_id': 3057,
 'jumio': 2825,
 'selfie': 1978,
 'kba': 1803,
 'identity_manual': 1764,
 'work_email': 1229,
 'facebook': 920,
 'google': 314,
 'manual_offline': 54,
 'manual_online': 38,
 'sent_id': 7,
 'zhima_selfie': 2,
 'none': 1}

In [25]:
# Lift the column display limit so head() shows every column.
pd.set_option('display.max_columns',None)
listing_df.head()

Unnamed: 0,id,name,description,neighborhood_overview,host_id,host_since,host_location,host_about,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_neighbourhood,host_listings_count,host_total_listings_count,host_verifications,host_has_profile_pic,host_identity_verified,neighbourhood,neighbourhood_cleansed,latitude,longitude,property_type,room_type,accommodates,bathrooms_text,bedrooms,beds,amenities,price,minimum_nights,maximum_nights,minimum_minimum_nights,maximum_minimum_nights,minimum_maximum_nights,maximum_maximum_nights,minimum_nights_avg_ntm,maximum_nights_avg_ntm,has_availability,availability_30,availability_60,availability_90,availability_365,number_of_reviews,number_of_reviews_ltm,number_of_reviews_l30d,first_review,last_review,review_scores_rating,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,license,instant_bookable,calculated_host_listings_count,calculated_host_listings_count_entire_homes,calculated_host_listings_count_private_rooms,calculated_host_listings_count_shared_rooms,reviews_per_month
0,958,"Bright, Modern Garden Unit - 1BR/1BTH",Please check local laws re Covid before you re...,Quiet cul de sac in friendly neighborhood<br /...,1169,2008-07-31,"San Francisco, California, United States",We are a family of four that live upstairs. W...,0,100,98,t,Duboce Triangle,1.0,1.0,"['email', 'phone', 'facebook', 'reviews', 'kba']",t,t,"San Francisco, California, United States",Western Addition,37.77028,-122.43317,Entire serviced apartment,Entire home/apt,3,1 bath,1.0,2.0,"[""Wifi"", ""Hot water"", ""Dedicated workspace"", ""...",$172.00,2,30,2.0,2.0,1125.0,1125.0,2.0,1125.0,t,10,17,22,136,315,42,0,2009-07-23,2021-12-06,4.87,4.94,4.95,4.95,4.9,4.98,4.78,City Registration Pending,f,1,1,0,0,2.08
1,5858,Creative Sanctuary,<b>The space</b><br />We live in a large Victo...,I love how our neighborhood feels quiet but is...,8904,2009-03-02,"San Diego County, California, United States",Philip: English transplant to the Bay Area and...,2,100,71,f,Bernal Heights,2.0,2.0,"['email', 'phone', 'reviews', 'kba', 'work_ema...",t,t,"San Francisco, California, United States",Bernal Heights,37.74474,-122.42089,Entire rental unit,Entire home/apt,5,1 bath,2.0,3.0,"[""First aid kit"", ""Fire extinguisher"", ""Long t...",$235.00,30,60,30.0,30.0,60.0,60.0,30.0,60.0,t,30,60,90,365,111,0,0,2009-05-03,2017-08-06,4.88,4.85,4.87,4.89,4.85,4.77,4.68,,f,1,1,0,0,0.72
2,7918,A Friendly Room - UCSF/USF - San Francisco,Nice and good public transportation. 7 minute...,"Shopping old town, restaurants, McDonald, Whol...",21994,2009-06-17,"San Francisco, California, United States",7 minutes walk to UCSF hospital & school campu...,0,100,100,f,Cole Valley,10.0,10.0,"['email', 'phone', 'reviews', 'jumio', 'govern...",t,t,"San Francisco, California, United States",Haight Ashbury,37.76555,-122.45213,Private room in rental unit,Private room,2,4 shared baths,1.0,1.0,"[""Fire extinguisher"", ""Lock on bedroom door"", ...",$56.00,32,60,32.0,32.0,60.0,60.0,32.0,60.0,t,30,60,90,365,19,0,0,2009-08-31,2020-03-06,4.2,3.73,3.87,4.67,4.6,4.73,4.0,,f,9,0,9,0,0.13
3,8142,Friendly Room Apt. Style -UCSF/USF - San Franc...,Nice and good public transportation. 7 minute...,,21994,2009-06-17,"San Francisco, California, United States",7 minutes walk to UCSF hospital & school campu...,0,100,100,f,Cole Valley,10.0,10.0,"['email', 'phone', 'reviews', 'jumio', 'govern...",t,t,,Haight Ashbury,37.76555,-122.45213,Private room in rental unit,Private room,2,4 shared baths,1.0,1.0,"[""Fire extinguisher"", ""Lock on bedroom door"", ...",$56.00,32,90,32.0,32.0,90.0,90.0,32.0,90.0,t,30,60,90,365,8,0,0,2014-09-08,2018-09-12,4.63,4.38,4.38,4.75,4.75,4.63,4.63,,f,9,0,9,0,0.09
4,8339,Historic Alamo Square Victorian,Pls email before booking. <br />Interior featu...,,24215,2009-07-02,"San Francisco, California, United States",I'm an Interior Stylist living in SF. \r\n\r\n...,4,94,0,f,Alamo Square,2.0,2.0,"['email', 'phone', 'reviews', 'kba']",t,t,,Western Addition,37.77564,-122.43642,Entire condominium (condo),Entire home/apt,4,1.5 baths,2.0,2.0,"[""Wifi"", ""Hot water"", ""Dedicated workspace"", ""...",$895.00,7,111,7.0,7.0,111.0,111.0,7.0,111.0,t,30,60,90,365,28,0,0,2009-09-25,2019-06-28,4.87,4.88,5.0,4.94,5.0,4.94,4.75,STR-0000264,f,2,2,0,0,0.19


In [26]:
# derived column - years as a host, measured from host_since up to a fixed reference date
dt = datetime.datetime(2022, 1, 6, 0, 0)
listing_df['host_since'] = pd.to_datetime(listing_df['host_since'])
host_tenure = dt - listing_df['host_since']
listing_df['host_since_years'] = (
    (host_tenure / datetime.timedelta(days=365))
    .round(1)
    .fillna(0)  # 1 missing record
)

In [27]:
# Default used for each column that still has missing values:
# 0 for the numeric columns, 'f' (false) for the t/f flags.
fill_defaults = {
    'reviews_per_month': 0,
    'host_is_superhost': 'f',
    'host_total_listings_count': 0,
    'host_has_profile_pic': 'f',
    'host_identity_verified': 'f',
}
for column_name, default_value in fill_defaults.items():
    listing_df[column_name] = listing_df[column_name].fillna(default_value)

In [28]:
# Count the missing values per column again, after the fills above.
listing_df.isnull().sum()

id                                                 0
name                                               0
description                                       71
neighborhood_overview                           1790
host_id                                            0
host_since                                         1
host_location                                      6
host_about                                      1934
host_response_time                                 0
host_response_rate                                 0
host_acceptance_rate                               0
host_is_superhost                                  0
host_neighbourhood                               362
host_listings_count                                1
host_total_listings_count                          0
host_verifications                                 0
host_has_profile_pic                               0
host_identity_verified                             0
neighbourhood                                 

# calendar dataset

In [29]:
# Load the calendar CSV and give each column a usable dtype.
calendar_df = pd.read_csv('calendar.csv')
calendar_df['listing_id'] = calendar_df['listing_id'].astype('category')
calendar_df['date'] = pd.to_datetime(calendar_df['date'])
calendar_df['available'] = calendar_df['available'].astype('category')
# Prices arrive as text with a dollar sign and thousands separators. regex=False
# makes both replacements literal: '$' is a regex anchor, and the default for
# `regex` differs between pandas versions.
for price_column in ('price', 'adjusted_price'):
    calendar_df[price_column] = (
        calendar_df[price_column]
        .str.replace('$', '', regex=False)
        .str.replace(',', '', regex=False)
        .astype(float)
    )
calendar_df['price_diff'] = calendar_df['price'] - calendar_df['adjusted_price']

  """
  


In [30]:
#create day of week column using "DATE" column
# The vectorised .dt accessor gives the same day names as calling day_name() on
# every timestamp in a Python loop, without iterating over each calendar row.
calendar_df['day_of_week'] = calendar_df['date'].dt.day_name().astype('category')

In [31]:
#Days booked per listing from the calendar, plus a booking_status bucket number
# Cross-tabulate listing_id against the availability flag.
# NOTE(review): the renaming below assumes the two crosstab columns come out in
# the order (not available, available) - confirm against the data.
day_counts = pd.crosstab(calendar_df.listing_id, calendar_df.available)
day_counts.columns = ['days_booked', 'days_not_booked']
day_counts['listing_id'] = day_counts.index
# Keep only the id and the booked-day count, on a fresh 0..n-1 index.
df = pd.DataFrame(day_counts.reset_index(drop=True), columns=['listing_id', 'days_booked'])
# booking_status is the position at which days_booked would be inserted into these edges.
bins = [0, 68, 211, 305, 365]
df['booking_status'] = np.searchsorted(bins, df['days_booked'].values)

In [32]:
# Min / max / mean adjusted price per listing, each as its own two-column frame.
adjusted_by_listing = calendar_df.groupby('listing_id', as_index=False)['adjusted_price']

df1_min_price = adjusted_by_listing.min()
df1_min_price.columns = ['listing_id', 'min_price']

df1_max_price = adjusted_by_listing.max()
df1_max_price.columns = ['listing_id', 'max_price']

df1_avg_price = adjusted_by_listing.mean()
df1_avg_price.columns = ['listing_id', 'mean_price']

In [33]:
#Join the per-listing calendar features into one frame, one merge on listing_id at a time
tables = [df, df1_min_price, df1_max_price, df1_avg_price]
dfs = tables[0]
for feature_table in tables[1:]:
    dfs = pd.merge(dfs, feature_table, on='listing_id')
dfs.head()

Unnamed: 0,listing_id,days_booked,booking_status,min_price,max_price,mean_price
0,958,229,3,170.0,192.0,172.232877
1,5858,0,0,235.0,235.0,235.0
2,7918,0,0,56.0,65.0,62.090411
3,8142,0,0,56.0,65.0,62.090411
4,8339,0,0,895.0,895.0,895.0


In [34]:
# Attach the calendar features to the listings (inner join: listing id == calendar listing_id).
# The duplicate key column from the calendar side is dropped afterwards.
combined_df = pd.merge(
    listing_df,
    dfs,
    how='inner',
    left_on='id',
    right_on='listing_id',
).drop('listing_id', axis=1)
combined_df['mean_price'] = combined_df['mean_price'].round(decimals=1)

In [35]:
# Success_rate = 1 - weighted availability. Each of the four windows
# (days 1-30, 31-60, 61-90, 91-365) carries weight 0.25 and is normalised by its
# length in days (30, 30, 30, 275).
avail_30 = combined_df['availability_30']
avail_60 = combined_df['availability_60']
avail_90 = combined_df['availability_90']
avail_365 = combined_df['availability_365']
weighted_availability = (
    0.25 * avail_30 / 30
    + 0.25 * (avail_60 - avail_30) / 30
    + 0.25 * (avail_90 - avail_60) / 30
    + 0.25 * (avail_365 - avail_90) / 275
)
combined_df['Success_rate'] = 1 - weighted_availability

# model dataset

In [36]:
# Work on a copy so that combined_df is not modified by the steps that follow.
model_data=combined_df.copy()

In [37]:
# 1.0 when the host's location is exactly San Francisco, 0.0 otherwise (missing
# locations included). A single vectorised comparison replaces the row-by-row
# .loc loop and yields the same float column.
model_data['host_in_sfo'] = (
    model_data['host_location'] == 'San Francisco, California, United States'
).astype(float)

In [38]:
# 1.0 when the license text starts with 'STR', 0.0 otherwise. Each value goes
# through str() first, so missing licenses become 'nan' and count as unlicensed.
# One column-wise map replaces the row-by-row .loc loop and yields the same
# float column.
model_data['licensed'] = (
    model_data['license']
    .map(lambda value: str(value).startswith('STR'))
    .astype(float)
)

In [39]:
# Columns removed from the modelling frame at this point.
dropped_model_columns = [
    'id', 'name', 'host_about',
    'calculated_host_listings_count_entire_homes',
    'review_scores_value', 'review_scores_accuracy',
    'description', 'neighborhood_overview',
    'host_id', 'latitude', 'longitude',
    'host_since', 'host_location', 'host_neighbourhood',
    'host_listings_count', 'host_verifications',
    'neighbourhood', 'amenities', 'price',
    'availability_30', 'availability_60', 'availability_90', 'availability_365',
    'first_review', 'last_review', 'license',
    'calculated_host_listings_count',
    'calculated_host_listings_count_private_rooms',
    'calculated_host_listings_count_shared_rooms',
    'days_booked', 'booking_status', 'min_price', 'max_price',
]
model_data = model_data.drop(columns=dropped_model_columns)

In [40]:
#model_data.isnull().sum() #no null values

In [41]:
# (rows, columns) of the modelling frame.
model_data.shape

(6411, 38)

In [47]:
# Separate copy for the host_response_time model; model_data itself is left untouched.
data=model_data.copy()
data.head()

Unnamed: 0,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_total_listings_count,host_has_profile_pic,host_identity_verified,neighbourhood_cleansed,property_type,room_type,accommodates,bathrooms_text,bedrooms,beds,minimum_nights,maximum_nights,minimum_minimum_nights,maximum_minimum_nights,minimum_maximum_nights,maximum_maximum_nights,minimum_nights_avg_ntm,maximum_nights_avg_ntm,has_availability,number_of_reviews,number_of_reviews_ltm,number_of_reviews_l30d,review_scores_rating,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,instant_bookable,reviews_per_month,host_since_years,mean_price,Success_rate,host_in_sfo,licensed
0,0,100,98,t,1.0,t,t,Western Addition,Entire serviced apartment,Entire home/apt,3,1 bath,1.0,2.0,2,30,2.0,2.0,1125.0,1125.0,2.0,1125.0,t,315,42,0,4.87,4.95,4.95,4.9,4.98,f,2.08,13.4,172.2,0.71303,1.0,0.0
1,2,100,71,f,2.0,t,t,Bernal Heights,Entire rental unit,Entire home/apt,5,1 bath,2.0,3.0,30,60,30.0,30.0,60.0,60.0,30.0,60.0,t,111,0,0,4.88,4.87,4.89,4.85,4.77,f,0.72,12.9,235.0,0.0,0.0,0.0
2,0,100,100,f,10.0,t,t,Haight Ashbury,Private room in rental unit,Private room,2,4 shared baths,1.0,1.0,32,60,32.0,32.0,60.0,60.0,32.0,60.0,t,19,0,0,4.2,3.87,4.67,4.6,4.73,f,0.13,12.6,62.1,0.0,1.0,0.0
3,0,100,100,f,10.0,t,t,Haight Ashbury,Private room in rental unit,Private room,2,4 shared baths,1.0,1.0,32,90,32.0,32.0,90.0,90.0,32.0,90.0,t,8,0,0,4.63,4.38,4.75,4.75,4.63,f,0.09,12.6,62.1,0.0,1.0,0.0
4,4,94,0,f,2.0,t,t,Western Addition,Entire condominium (condo),Entire home/apt,4,1.5 baths,2.0,2.0,7,111,7.0,7.0,111.0,111.0,7.0,111.0,t,28,0,0,4.87,5.0,4.94,5.0,4.94,f,0.19,12.5,895.0,0.0,1.0,1.0


In [48]:
# Count the missing values in each column of data.
data.isnull().sum()

host_response_time             0
host_response_rate             0
host_acceptance_rate           0
host_is_superhost              0
host_total_listings_count      0
host_has_profile_pic           0
host_identity_verified         0
neighbourhood_cleansed         0
property_type                  0
room_type                      0
accommodates                   0
bathrooms_text                 0
bedrooms                       0
beds                           0
minimum_nights                 0
maximum_nights                 0
minimum_minimum_nights         0
maximum_minimum_nights         0
minimum_maximum_nights         0
maximum_maximum_nights         0
minimum_nights_avg_ntm         0
maximum_nights_avg_ntm         0
has_availability               0
number_of_reviews              0
number_of_reviews_ltm          0
number_of_reviews_l30d         0
review_scores_rating           0
review_scores_cleanliness      0
review_scores_checkin          0
review_scores_communication    0
review_sco

In [49]:
# Print every column of data together with its positional index.
pd.set_option('display.max_columns', None)
for position, column_name in enumerate(data.columns):
    print(position, column_name)

0 host_response_time
1 host_response_rate
2 host_acceptance_rate
3 host_is_superhost
4 host_total_listings_count
5 host_has_profile_pic
6 host_identity_verified
7 neighbourhood_cleansed
8 property_type
9 room_type
10 accommodates
11 bathrooms_text
12 bedrooms
13 beds
14 minimum_nights
15 maximum_nights
16 minimum_minimum_nights
17 maximum_minimum_nights
18 minimum_maximum_nights
19 maximum_maximum_nights
20 minimum_nights_avg_ntm
21 maximum_nights_avg_ntm
22 has_availability
23 number_of_reviews
24 number_of_reviews_ltm
25 number_of_reviews_l30d
26 review_scores_rating
27 review_scores_cleanliness
28 review_scores_checkin
29 review_scores_communication
30 review_scores_location
31 instant_bookable
32 reviews_per_month
33 host_since_years
34 mean_price
35 Success_rate
36 host_in_sfo
37 licensed


In [50]:
# Make sure host_response_time holds the integer codes 0-3 (4 = originally missing).
response_time_codes = {
    'within an hour': 0,
    'within a few hours': 1,
    'within a day': 2,
    'a few days or more': 3,
}
# Assign the result back to the frame. Calling replace(..., inplace=True) on
# data['host_response_time'] works on a column selection and does not reliably
# update data (it is a silent no-op under pandas Copy-on-Write).
data['host_response_time'] = (
    data['host_response_time']
    .astype(object)  # leave any categorical dtype so every value can be remapped
    .map(lambda value: response_time_codes.get(value, value))
    .astype('int64')  # the column must stay numeric: it is the model target, not a dummy
)

In [51]:
# Keep the target (host_response_time) and the features used to predict it.
# The columns are selected by name, in the same order the positional list
# [0,1,2,3,4,5,6,22,31,33,34,35,36,37] produced. Names do not depend on the
# current column layout, and the cell can be re-run safely.
model_columns = [
    'host_response_time',
    'host_response_rate',
    'host_acceptance_rate',
    'host_is_superhost',
    'host_total_listings_count',
    'host_has_profile_pic',
    'host_identity_verified',
    'has_availability',
    'instant_bookable',
    'host_since_years',
    'mean_price',
    'Success_rate',
    'host_in_sfo',
    'licensed',
]
data = data.loc[:, model_columns]

In [52]:
# Preview the reduced frame.
data.head()

Unnamed: 0,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_total_listings_count,host_has_profile_pic,host_identity_verified,has_availability,instant_bookable,host_since_years,mean_price,Success_rate,host_in_sfo,licensed
0,0,100,98,t,1.0,t,t,t,f,13.4,172.2,0.71303,1.0,0.0
1,2,100,71,f,2.0,t,t,t,f,12.9,235.0,0.0,0.0,0.0
2,0,100,100,f,10.0,t,t,t,f,12.6,62.1,0.0,1.0,0.0
3,0,100,100,f,10.0,t,t,t,f,12.6,62.1,0.0,1.0,0.0
4,4,94,0,f,2.0,t,t,t,f,12.5,895.0,0.0,1.0,1.0


In [53]:
# Number of groups obtained when grouping by host_is_superhost.
len(data.groupby(['host_is_superhost']).size())

2

In [54]:
# Flag columns to turn into 0/1 indicators. They are listed in the same order
# as they appear in data, so the encoded column order is unchanged.
dummy_columns = ['host_is_superhost',
        'host_has_profile_pic',
       'host_identity_verified', 'has_availability',
       'instant_bookable', 'host_in_sfo',
       'licensed']
data = data.astype({column_name: 'category' for column_name in dummy_columns})

# One-hot encode exactly those columns; drop_first drops the first level of each
# flag, leaving one indicator column per flag. The earlier per-column loop was
# removed because its result was always overwritten by this call. Naming the
# columns also keeps the numeric target host_response_time from being encoded.
encoded_data = pd.get_dummies(data, columns=dummy_columns, drop_first=True)

In [55]:
# Rows tagged with the placeholder 4 (response time originally missing) are the
# ones to predict; all other rows are available for training.
needs_prediction = encoded_data['host_response_time'] == 4
test = encoded_data.loc[needs_prediction, :]
train = encoded_data.loc[~needs_prediction, :]

In [56]:
# Distinct host_response_time codes; 4 marks rows whose response time was missing.
encoded_data['host_response_time'].unique()

array([0, 2, 4, 1, 3], dtype=int64)

In [57]:
# Preview the rows with a known response time.
train.head()

Unnamed: 0,host_response_time,host_response_rate,host_acceptance_rate,host_total_listings_count,host_since_years,mean_price,Success_rate,host_is_superhost_t,host_has_profile_pic_t,host_identity_verified_t,has_availability_t,instant_bookable_t,host_in_sfo_1.0,licensed_1.0
0,0,100,98,1.0,13.4,172.2,0.71303,1,1,1,1,0,1,0
1,2,100,71,2.0,12.9,235.0,0.0,0,1,1,1,0,0,0
2,0,100,100,10.0,12.6,62.1,0.0,0,1,1,1,0,1,0
3,0,100,100,10.0,12.6,62.1,0.0,0,1,1,1,0,1,0
6,2,50,100,1.0,12.4,150.0,0.075,0,1,1,1,0,1,1


In [58]:
# Column 0 is the target (host_response_time); columns 1-13 are the features.
feature_positions = list(range(1, 14))
X, Y = train.iloc[:, feature_positions], train.iloc[:, 0]
X_test, Y_test = test.iloc[:, feature_positions], test.iloc[:, 0]

In [59]:
# Hold out 10% of the labelled rows for validation; the seed is fixed for repeatability.
X_train, X_valid, Y_train, Y_valid = train_test_split(
    X,
    Y,
    test_size=0.1,
    random_state=1,
)

In [60]:
# Preview the training features before scaling.
X_train.head()

Unnamed: 0,host_response_rate,host_acceptance_rate,host_total_listings_count,host_since_years,mean_price,Success_rate,host_is_superhost_t,host_has_profile_pic_t,host_identity_verified_t,has_availability_t,instant_bookable_t,host_in_sfo_1.0,licensed_1.0
3010,100,66,2.0,11.3,300.0,0.05,0,1,1,1,0,1,1
5372,100,92,0.0,0.9,251.4,0.333333,0,1,1,1,0,0,0
1589,100,93,19.0,10.1,169.0,0.619091,1,1,1,1,0,0,0
5935,99,99,36.0,2.8,148.1,0.444091,1,1,0,1,0,1,0
3452,100,59,2.0,9.4,107.8,0.417576,0,1,1,1,0,1,1


In [61]:
# Standardise the features. The scaler is fitted on the training split only and
# then applied to all three splits; each result is wrapped back into a DataFrame
# with the original column names (on a fresh 0..n-1 index).
cols = X_train.columns
scaler = StandardScaler().fit(X_train)
X_train, X_valid, X_test = (
    pd.DataFrame(scaler.transform(split), columns=cols)
    for split in (X_train, X_valid, X_test)
)

In [62]:
# Try k = 1..19 and report the validation accuracy obtained with each value.
k = list(range(1, 20))
for i in k:
    knn = KNeighborsClassifier(n_neighbors=i).fit(X_train.values, Y_train.values)
    Y_pred_valid = knn.predict(X_valid.values)
    valid_accuracy = accuracy_score(Y_valid, Y_pred_valid)
    print('Model accuracy score: {0:0.4f}'. format(valid_accuracy), 'for k=',i)

Model accuracy score: 0.7524 for k= 1
Model accuracy score: 0.7788 for k= 2
Model accuracy score: 0.7845 for k= 3
Model accuracy score: 0.7788 for k= 4
Model accuracy score: 0.7826 for k= 5
Model accuracy score: 0.7864 for k= 6
Model accuracy score: 0.7732 for k= 7
Model accuracy score: 0.7750 for k= 8
Model accuracy score: 0.7732 for k= 9
Model accuracy score: 0.7769 for k= 10
Model accuracy score: 0.7694 for k= 11
Model accuracy score: 0.7656 for k= 12
Model accuracy score: 0.7618 for k= 13
Model accuracy score: 0.7599 for k= 14
Model accuracy score: 0.7599 for k= 15
Model accuracy score: 0.7618 for k= 16
Model accuracy score: 0.7599 for k= 17
Model accuracy score: 0.7599 for k= 18
Model accuracy score: 0.7637 for k= 19


In [63]:
# Refit with k=6 and predict a response-time code for the rows where it was missing.
# No accuracy is computed here: Y_test only holds the placeholder 4 for these rows.
knn = KNeighborsClassifier(n_neighbors=6).fit(X_train.values, Y_train.values)
Y_pred_test = knn.predict(X_test.values)

In [65]:
# One predicted host_response_time per row of test, keyed by that row's original
# index label; printed and written to missing_responses.csv.
df = pd.DataFrame({'host_response_time': Y_pred_test}, index=test.index)
print(df)
df.to_csv('missing_responses.csv')

      host_response_time
4                      1
5                      0
13                     0
21                     0
22                     0
25                     0
28                     2
37                     0
40                     0
50                     0
51                     0
52                     1
69                     2
76                     1
102                    0
103                    1
106                    2
118                    2
128                    0
131                    0
143                    0
148                    0
154                    2
162                    0
163                    0
166                    0
174                    1
177                    0
198                    0
203                    0
221                    0
227                    1
232                    0
236                    1
246                    1
248                    1
253                    0
260                    1
261                    0
