In [27]:
import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier


In [15]:
# run data prep file to be able to call dataframes
%run "mm_data_prep.ipynb"

Number of rows in denver is: 5388
<class 'pandas.core.frame.DataFrame'>
Index: 3844 entries, 0 to 5357
Data columns (total 36 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   id                           3844 non-null   int64  
 1   description                  3841 non-null   object 
 2   host_id                      3844 non-null   int64  
 3   host_since                   3844 non-null   object 
 4   host_has_profile_pic         3844 non-null   object 
 5   host_identity_verified       3844 non-null   object 
 6   neighbourhood_cleansed       3844 non-null   object 
 7   latitude                     3844 non-null   float64
 8   longitude                    3844 non-null   float64
 9   property_type                3844 non-null   object 
 10  room_type                    3844 non-null   object 
 11  accommodates                 3844 non-null   int64  
 12  bedrooms                     3844 non-null   fl

In [16]:
# Check to see that I can access dataframe with outliers removed
denver_outliers_removed.head()

Unnamed: 0,id,description,host_id,host_since,host_has_profile_pic,host_identity_verified,neighbourhood_cleansed,latitude,longitude,property_type,room_type,accommodates,bedrooms,beds,amenities,price,has_availability,availability_30,availability_60,availability_90,availability_365,number_of_reviews,number_of_reviews_ltm,number_of_reviews_l30d,review_scores_rating,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,instant_bookable,reviews_per_month,city,bath_number,bath_text
0,360,Enjoy the famous Colorado weather and unplug i...,666,2008-07-08,t,t,Highland,39.766415,-105.002098,Entire guesthouse,Entire home/apt,3,2.0,2.0,"[""Extra pillows and blankets"", ""First aid kit""...",90.0,t,4,27,57,147,179,7,0,4.99,4.99,4.96,5.0,5.0,5.0,4.91,f,2.87,Denver,1.0,bath
1,364,"Modern 1,000 square foot loft in the heart of ...",783,2008-07-11,t,t,Five Points,39.76672,-104.97906,Entire loft,Entire home/apt,3,1.0,1.0,"[""First aid kit"", ""Fire extinguisher"", ""Wifi"",...",179.0,t,23,53,83,358,87,0,0,4.85,4.78,4.81,4.95,4.96,4.65,4.71,f,0.5,Denver,1.5,baths
7,31503,CORONA VIRUS RESPONSIBLE - ESSENTIAL WORKERS W...,135298,2010-05-30,t,t,West Highland,39.76179,-105.02845,Entire guest suite,Entire home/apt,2,1.0,1.0,"[""Radiant heating"", ""Extra pillows and blanket...",103.0,t,12,42,72,347,159,27,3,4.91,4.92,5.0,4.95,4.99,4.88,4.88,f,1.0,Denver,1.0,bath
8,39405,Enjoy our oasis in the city and stay at one of...,666,2008-07-08,t,t,Highland,39.766053,-105.003078,Entire cottage,Entire home/apt,2,1.0,1.0,"[""Extra pillows and blankets"", ""Cooking basics...",136.0,t,6,7,32,114,667,35,3,4.92,4.88,4.93,4.98,4.97,4.85,4.85,f,4.17,Denver,1.0,bath
10,154999,This condo is right in the heart of Downtown D...,745200,2011-06-26,t,t,CBD,39.74439,-104.98927,Entire condo,Entire home/apt,2,1.0,1.0,"[""Extra pillows and blankets"", ""Dishwasher"", ""...",162.0,t,0,0,0,139,11,0,0,4.7,4.4,4.7,4.7,4.9,4.9,4.6,f,0.07,Denver,1.0,bath


In [31]:
# create separate ID dataframe
id2 = denver_outliers_removed[['id']].reset_index(drop=True)
id2.head()

Unnamed: 0,id
0,360
1,364
2,31503
3,39405
4,154999


In [33]:
denver_outliers_removed.columns.to_list()

['id',
 'description',
 'host_id',
 'host_since',
 'host_has_profile_pic',
 'host_identity_verified',
 'neighbourhood_cleansed',
 'latitude',
 'longitude',
 'property_type',
 'room_type',
 'accommodates',
 'bedrooms',
 'beds',
 'amenities',
 'price',
 'has_availability',
 'availability_30',
 'availability_60',
 'availability_90',
 'availability_365',
 'number_of_reviews',
 'number_of_reviews_ltm',
 'number_of_reviews_l30d',
 'review_scores_rating',
 'review_scores_accuracy',
 'review_scores_cleanliness',
 'review_scores_checkin',
 'review_scores_communication',
 'review_scores_location',
 'review_scores_value',
 'instant_bookable',
 'reviews_per_month',
 'city',
 'bath_number',
 'bath_text']

In [38]:
# make list of columns to drop

# Columns to remove

list_columns_to_remove = [
'id',
'description',
'host_id',
'host_since',
'host_has_profile_pic',
'host_identity_verified',
'neighbourhood_cleansed',
'latitude',
'longitude',
#'property_type',
'room_type',
#'accommodates',
#'bedrooms',
#'beds',
'amenities',
#'price',
#'has_availability',
#'availability_30',
#'availability_60',
#'availability_90',
#'availability_365',
#'number_of_reviews',
#'number_of_reviews_ltm',
#'number_of_reviews_l30d',
#'review_scores_rating',
#'review_scores_accuracy',
#'review_scores_cleanliness',
#'review_scores_checkin',
#'review_scores_communication',
#'review_scores_location',
#'review_scores_value',
'instant_bookable',
'reviews_per_month',
'city',
#'bath_number',
'bath_text'
]

# Subset dropped columns

rf_data = denver_outliers_removed.drop(columns=list_columns_to_remove)
rf_data.head()

Unnamed: 0,property_type,accommodates,bedrooms,beds,price,has_availability,availability_30,availability_60,availability_90,availability_365,number_of_reviews,number_of_reviews_ltm,number_of_reviews_l30d,review_scores_rating,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,bath_number
0,Entire guesthouse,3,2.0,2.0,90.0,t,4,27,57,147,179,7,0,4.99,4.99,4.96,5.0,5.0,5.0,4.91,1.0
1,Entire loft,3,1.0,1.0,179.0,t,23,53,83,358,87,0,0,4.85,4.78,4.81,4.95,4.96,4.65,4.71,1.5
7,Entire guest suite,2,1.0,1.0,103.0,t,12,42,72,347,159,27,3,4.91,4.92,5.0,4.95,4.99,4.88,4.88,1.0
8,Entire cottage,2,1.0,1.0,136.0,t,6,7,32,114,667,35,3,4.92,4.88,4.93,4.98,4.97,4.85,4.85,1.0
10,Entire condo,2,1.0,1.0,162.0,t,0,0,0,139,11,0,0,4.7,4.4,4.7,4.7,4.9,4.9,4.6,1.0


In [39]:
# Assign "other" property type to any property types not in top 5
top_property_types = denver_outliers_removed['property_type'].value_counts().nlargest(6).index
top_property_types

Index(['Entire home', 'Entire rental unit', 'Entire condo',
       'Entire guest suite', 'Entire townhouse', 'Entire guesthouse'],
      dtype='object', name='property_type')

In [40]:
# reduce property types to top 6 plus "other"

rf_data.loc[:,'prop_type_reduced'] = rf_data['property_type'].apply(lambda x: x if x in top_property_types else 'Other')
rf_data.head()

Unnamed: 0,property_type,accommodates,bedrooms,beds,price,has_availability,availability_30,availability_60,availability_90,availability_365,number_of_reviews,number_of_reviews_ltm,number_of_reviews_l30d,review_scores_rating,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,bath_number,prop_type_reduced
0,Entire guesthouse,3,2.0,2.0,90.0,t,4,27,57,147,179,7,0,4.99,4.99,4.96,5.0,5.0,5.0,4.91,1.0,Entire guesthouse
1,Entire loft,3,1.0,1.0,179.0,t,23,53,83,358,87,0,0,4.85,4.78,4.81,4.95,4.96,4.65,4.71,1.5,Other
7,Entire guest suite,2,1.0,1.0,103.0,t,12,42,72,347,159,27,3,4.91,4.92,5.0,4.95,4.99,4.88,4.88,1.0,Entire guest suite
8,Entire cottage,2,1.0,1.0,136.0,t,6,7,32,114,667,35,3,4.92,4.88,4.93,4.98,4.97,4.85,4.85,1.0,Other
10,Entire condo,2,1.0,1.0,162.0,t,0,0,0,139,11,0,0,4.7,4.4,4.7,4.7,4.9,4.9,4.6,1.0,Entire condo


In [55]:
encoder = OneHotEncoder()
encoded = encoder.fit_transform(rf_data[['prop_type_reduced']])
feature_names = encoder.get_feature_names_out()
feature_names

array(['prop_type_reduced_Entire condo',
       'prop_type_reduced_Entire guest suite',
       'prop_type_reduced_Entire guesthouse',
       'prop_type_reduced_Entire home',
       'prop_type_reduced_Entire rental unit',
       'prop_type_reduced_Entire townhouse', 'prop_type_reduced_Other'],
      dtype=object)

In [54]:
# convert transformed array to DataFrame

encoded_df = pd.DataFrame(encoded, columns=feature_names)

encoded_df.head()

Unnamed: 0,prop_type_reduced_Entire condo,prop_type_reduced_Entire guest suite,prop_type_reduced_Entire guesthouse,prop_type_reduced_Entire home,prop_type_reduced_Entire rental unit,prop_type_reduced_Entire townhouse,prop_type_reduced_Other
0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,0.0,1.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,1.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# remove property_type column
rf_data = rf_data.drop(columns='property_type')
rf_data.head()

In [17]:
##### random forest no need to encode or scale

In [23]:
# create x and y (drop 30 day booking)

X = denver_outliers_removed.copy()
X.drop('availability_30', axis=1, inplace=True)
y = denver_outliers_removed['availability_30']

In [26]:
# X.head()
y.ravel()

array([ 4, 23, 12, ...,  4, 24,  6], dtype=int64)

In [28]:
# train_test_split on x and y
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)


In [29]:
# create random forest classifier instance

rf_model = RandomForestClassifier(n_estimators=100, random_state=1)

In [30]:
# fit the model

rf_model = rf_model.fit(X_train, y_train)

ValueError: could not convert string to float: 'Relax with the whole family at this peaceful place to stay.<br /><br /><b>Registration number</b><br />2021-BFN-0007911'