In [1]:
import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, classification_report, balanced_accuracy_score, roc_auc_score, accuracy_score


In [2]:
# run data prep file to be able to call dataframes
%run "mm_data_prep.ipynb"

Number of rows in denver is: 5388
<class 'pandas.core.frame.DataFrame'>
Index: 3844 entries, 0 to 5357
Data columns (total 36 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   id                           3844 non-null   int64  
 1   description                  3841 non-null   object 
 2   host_id                      3844 non-null   int64  
 3   host_since                   3844 non-null   object 
 4   host_has_profile_pic         3844 non-null   object 
 5   host_identity_verified       3844 non-null   object 
 6   neighbourhood_cleansed       3844 non-null   object 
 7   latitude                     3844 non-null   float64
 8   longitude                    3844 non-null   float64
 9   property_type                3844 non-null   object 
 10  room_type                    3844 non-null   object 
 11  accommodates                 3844 non-null   int64  
 12  bedrooms                     3844 non-null   fl

In [3]:
# Check to see that I can access dataframe with outliers removed
denver_outliers_removed.head()

Unnamed: 0,id,description,host_id,host_since,host_has_profile_pic,host_identity_verified,neighbourhood_cleansed,latitude,longitude,property_type,room_type,accommodates,bedrooms,beds,amenities,price,has_availability,availability_30,availability_60,availability_90,availability_365,number_of_reviews,number_of_reviews_ltm,number_of_reviews_l30d,review_scores_rating,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,instant_bookable,reviews_per_month,city,bath_number,bath_text
0,360,Enjoy the famous Colorado weather and unplug i...,666,2008-07-08,t,t,Highland,39.766415,-105.002098,Entire guesthouse,Entire home/apt,3,2.0,2.0,"[""Extra pillows and blankets"", ""First aid kit""...",90.0,t,4,27,57,147,179,7,0,4.99,4.99,4.96,5.0,5.0,5.0,4.91,f,2.87,Denver,1.0,bath
1,364,"Modern 1,000 square foot loft in the heart of ...",783,2008-07-11,t,t,Five Points,39.76672,-104.97906,Entire loft,Entire home/apt,3,1.0,1.0,"[""First aid kit"", ""Fire extinguisher"", ""Wifi"",...",179.0,t,23,53,83,358,87,0,0,4.85,4.78,4.81,4.95,4.96,4.65,4.71,f,0.5,Denver,1.5,baths
7,31503,CORONA VIRUS RESPONSIBLE - ESSENTIAL WORKERS W...,135298,2010-05-30,t,t,West Highland,39.76179,-105.02845,Entire guest suite,Entire home/apt,2,1.0,1.0,"[""Radiant heating"", ""Extra pillows and blanket...",103.0,t,12,42,72,347,159,27,3,4.91,4.92,5.0,4.95,4.99,4.88,4.88,f,1.0,Denver,1.0,bath
8,39405,Enjoy our oasis in the city and stay at one of...,666,2008-07-08,t,t,Highland,39.766053,-105.003078,Entire cottage,Entire home/apt,2,1.0,1.0,"[""Extra pillows and blankets"", ""Cooking basics...",136.0,t,6,7,32,114,667,35,3,4.92,4.88,4.93,4.98,4.97,4.85,4.85,f,4.17,Denver,1.0,bath
10,154999,This condo is right in the heart of Downtown D...,745200,2011-06-26,t,t,CBD,39.74439,-104.98927,Entire condo,Entire home/apt,2,1.0,1.0,"[""Extra pillows and blankets"", ""Dishwasher"", ""...",162.0,t,0,0,0,139,11,0,0,4.7,4.4,4.7,4.7,4.9,4.9,4.6,f,0.07,Denver,1.0,bath


In [4]:
# create separate ID dataframe
id2 = denver_outliers_removed[['id']].reset_index(drop=True)
id2.head()

Unnamed: 0,id
0,360
1,364
2,31503
3,39405
4,154999


In [5]:
denver_outliers_removed.columns.to_list()

['id',
 'description',
 'host_id',
 'host_since',
 'host_has_profile_pic',
 'host_identity_verified',
 'neighbourhood_cleansed',
 'latitude',
 'longitude',
 'property_type',
 'room_type',
 'accommodates',
 'bedrooms',
 'beds',
 'amenities',
 'price',
 'has_availability',
 'availability_30',
 'availability_60',
 'availability_90',
 'availability_365',
 'number_of_reviews',
 'number_of_reviews_ltm',
 'number_of_reviews_l30d',
 'review_scores_rating',
 'review_scores_accuracy',
 'review_scores_cleanliness',
 'review_scores_checkin',
 'review_scores_communication',
 'review_scores_location',
 'review_scores_value',
 'instant_bookable',
 'reviews_per_month',
 'city',
 'bath_number',
 'bath_text']

In [6]:
# make list of columns to drop

# Columns to remove

list_columns_to_remove = [
#'id',
'description',
'host_id',
'host_since',
'host_has_profile_pic',
'host_identity_verified',
'neighbourhood_cleansed',
'latitude',
'longitude',
#'property_type',
'room_type',
#'accommodates',
#'bedrooms',
#'beds',
'amenities',
#'price',
#'has_availability',
#'availability_30',
#'availability_60',
#'availability_90',
#'availability_365',
#'number_of_reviews',
#'number_of_reviews_ltm',
#'number_of_reviews_l30d',
#'review_scores_rating',
#'review_scores_accuracy',
#'review_scores_cleanliness',
#'review_scores_checkin',
#'review_scores_communication',
#'review_scores_location',
#'review_scores_value',
'instant_bookable',
'reviews_per_month',
'city',
#'bath_number',
'bath_text'
]

# Subset dropped columns

rf_data = denver_outliers_removed.drop(columns=list_columns_to_remove)
rf_data.head()

Unnamed: 0,id,property_type,accommodates,bedrooms,beds,price,has_availability,availability_30,availability_60,availability_90,availability_365,number_of_reviews,number_of_reviews_ltm,number_of_reviews_l30d,review_scores_rating,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,bath_number
0,360,Entire guesthouse,3,2.0,2.0,90.0,t,4,27,57,147,179,7,0,4.99,4.99,4.96,5.0,5.0,5.0,4.91,1.0
1,364,Entire loft,3,1.0,1.0,179.0,t,23,53,83,358,87,0,0,4.85,4.78,4.81,4.95,4.96,4.65,4.71,1.5
7,31503,Entire guest suite,2,1.0,1.0,103.0,t,12,42,72,347,159,27,3,4.91,4.92,5.0,4.95,4.99,4.88,4.88,1.0
8,39405,Entire cottage,2,1.0,1.0,136.0,t,6,7,32,114,667,35,3,4.92,4.88,4.93,4.98,4.97,4.85,4.85,1.0
10,154999,Entire condo,2,1.0,1.0,162.0,t,0,0,0,139,11,0,0,4.7,4.4,4.7,4.7,4.9,4.9,4.6,1.0


In [7]:
# Assign "other" property type to any property types not in top 5
top_property_types = denver_outliers_removed['property_type'].value_counts().nlargest(6).index
top_property_types

Index(['Entire home', 'Entire rental unit', 'Entire condo',
       'Entire guest suite', 'Entire townhouse', 'Entire guesthouse'],
      dtype='object', name='property_type')

In [8]:
# reduce property types to top 6 plus "other"

rf_data.loc[:,'prop_type_reduced'] = rf_data['property_type'].apply(lambda x: x if x in top_property_types else 'Other')
rf_data.head()

Unnamed: 0,id,property_type,accommodates,bedrooms,beds,price,has_availability,availability_30,availability_60,availability_90,availability_365,number_of_reviews,number_of_reviews_ltm,number_of_reviews_l30d,review_scores_rating,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,bath_number,prop_type_reduced
0,360,Entire guesthouse,3,2.0,2.0,90.0,t,4,27,57,147,179,7,0,4.99,4.99,4.96,5.0,5.0,5.0,4.91,1.0,Entire guesthouse
1,364,Entire loft,3,1.0,1.0,179.0,t,23,53,83,358,87,0,0,4.85,4.78,4.81,4.95,4.96,4.65,4.71,1.5,Other
7,31503,Entire guest suite,2,1.0,1.0,103.0,t,12,42,72,347,159,27,3,4.91,4.92,5.0,4.95,4.99,4.88,4.88,1.0,Entire guest suite
8,39405,Entire cottage,2,1.0,1.0,136.0,t,6,7,32,114,667,35,3,4.92,4.88,4.93,4.98,4.97,4.85,4.85,1.0,Other
10,154999,Entire condo,2,1.0,1.0,162.0,t,0,0,0,139,11,0,0,4.7,4.4,4.7,4.7,4.9,4.9,4.6,1.0,Entire condo


In [9]:
encoder = OneHotEncoder(sparse=False)
encoded = encoder.fit_transform(rf_data[['prop_type_reduced', 'has_availability']])
feature_names = encoder.get_feature_names_out()
feature_names



array(['prop_type_reduced_Entire condo',
       'prop_type_reduced_Entire guest suite',
       'prop_type_reduced_Entire guesthouse',
       'prop_type_reduced_Entire home',
       'prop_type_reduced_Entire rental unit',
       'prop_type_reduced_Entire townhouse', 'prop_type_reduced_Other',
       'has_availability_f', 'has_availability_t'], dtype=object)

In [10]:
encoded

array([[0., 0., 1., ..., 0., 0., 1.],
       [0., 0., 0., ..., 1., 0., 1.],
       [0., 1., 0., ..., 0., 0., 1.],
       ...,
       [0., 0., 0., ..., 0., 0., 1.],
       [0., 0., 0., ..., 1., 0., 1.],
       [0., 1., 0., ..., 0., 0., 1.]])

In [11]:
# convert transformed array to DataFrame

encoded_df = pd.DataFrame(encoded, columns=feature_names)
encoded_df.head()

Unnamed: 0,prop_type_reduced_Entire condo,prop_type_reduced_Entire guest suite,prop_type_reduced_Entire guesthouse,prop_type_reduced_Entire home,prop_type_reduced_Entire rental unit,prop_type_reduced_Entire townhouse,prop_type_reduced_Other,has_availability_f,has_availability_t
0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
1,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
2,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
4,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [12]:
# concatenate ID with the encoded DF to be able to join later with the rf_data
encoded_id_df = pd.concat([id2, encoded_df], axis=1)
encoded_id_df.head()

Unnamed: 0,id,prop_type_reduced_Entire condo,prop_type_reduced_Entire guest suite,prop_type_reduced_Entire guesthouse,prop_type_reduced_Entire home,prop_type_reduced_Entire rental unit,prop_type_reduced_Entire townhouse,prop_type_reduced_Other,has_availability_f,has_availability_t
0,360,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
1,364,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
2,31503,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,39405,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
4,154999,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [13]:
# remove property_type column
rf_data = rf_data.drop(columns=['property_type', 'prop_type_reduced', 'has_availability'])

rf_data.head()

Unnamed: 0,id,accommodates,bedrooms,beds,price,availability_30,availability_60,availability_90,availability_365,number_of_reviews,number_of_reviews_ltm,number_of_reviews_l30d,review_scores_rating,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,bath_number
0,360,3,2.0,2.0,90.0,4,27,57,147,179,7,0,4.99,4.99,4.96,5.0,5.0,5.0,4.91,1.0
1,364,3,1.0,1.0,179.0,23,53,83,358,87,0,0,4.85,4.78,4.81,4.95,4.96,4.65,4.71,1.5
7,31503,2,1.0,1.0,103.0,12,42,72,347,159,27,3,4.91,4.92,5.0,4.95,4.99,4.88,4.88,1.0
8,39405,2,1.0,1.0,136.0,6,7,32,114,667,35,3,4.92,4.88,4.93,4.98,4.97,4.85,4.85,1.0
10,154999,2,1.0,1.0,162.0,0,0,0,139,11,0,0,4.7,4.4,4.7,4.7,4.9,4.9,4.6,1.0


In [14]:
# make sure id2 file and transformed_no_outliers has same row count

rf_rows = len(rf_data)
enc_rows = len(encoded_id_df)
print(f"rf rows {rf_rows}")
print(f"encoded rows {enc_rows}")

rf rows 3565
encoded rows 3565


In [15]:
# Check for nulls in rf_data and encoded_id_df so no issues when join.
null_counts_mm = rf_data.isnull().sum().sort_values(ascending=False)
null_counts_over0_mm = null_counts_mm[null_counts_mm > 0]
null_counts_mm2 = encoded_id_df.isnull().sum().sort_values(ascending=False)
null_counts_over0_mm2 = null_counts_mm2[null_counts_mm2 > 0]
print(null_counts_over0_mm)
print(null_counts_over0_mm2)

Series([], dtype: int64)
Series([], dtype: int64)


In [16]:
# merge encoded dataframe with rest of data by id

random_forest_encoded_df = pd.merge(rf_data, encoded_id_df, on='id', how='inner')
random_forest_encoded_df.head()



Unnamed: 0,id,accommodates,bedrooms,beds,price,availability_30,availability_60,availability_90,availability_365,number_of_reviews,number_of_reviews_ltm,number_of_reviews_l30d,review_scores_rating,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,bath_number,prop_type_reduced_Entire condo,prop_type_reduced_Entire guest suite,prop_type_reduced_Entire guesthouse,prop_type_reduced_Entire home,prop_type_reduced_Entire rental unit,prop_type_reduced_Entire townhouse,prop_type_reduced_Other,has_availability_f,has_availability_t
0,360,3,2.0,2.0,90.0,4,27,57,147,179,7,0,4.99,4.99,4.96,5.0,5.0,5.0,4.91,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
1,364,3,1.0,1.0,179.0,23,53,83,358,87,0,0,4.85,4.78,4.81,4.95,4.96,4.65,4.71,1.5,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
2,31503,2,1.0,1.0,103.0,12,42,72,347,159,27,3,4.91,4.92,5.0,4.95,4.99,4.88,4.88,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,39405,2,1.0,1.0,136.0,6,7,32,114,667,35,3,4.92,4.88,4.93,4.98,4.97,4.85,4.85,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
4,154999,2,1.0,1.0,162.0,0,0,0,139,11,0,0,4.7,4.4,4.7,4.7,4.9,4.9,4.6,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [17]:
# Check for nulls in rf_data and encoded_id_df so no issues when join.
rf_nulls = random_forest_encoded_df.isnull().sum().sort_values(ascending=False)
rf_nulls_over0_mm = rf_nulls[rf_nulls > 0]

print(rf_nulls_over0_mm)


Series([], dtype: int64)


In [18]:
##### random forest no need to encode or scale

In [19]:
# create x and y (drop 30 day booking)

X = random_forest_encoded_df.copy()
X.drop('availability_30', axis=1, inplace=True)
y = random_forest_encoded_df['availability_30']

In [20]:
# X.head()
y.ravel()

array([ 4, 23, 12, ...,  4, 24,  6])

In [21]:
# train_test_split on x and y
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)


In [22]:
# create random forest classifier instance

rf_model = RandomForestClassifier(n_estimators=100, random_state=1)

In [23]:
# fit the model

rf_model = rf_model.fit(X_train, y_train)

In [24]:
# make predictions

y_pred = rf_model.predict(X_test)

In [25]:
# calculate accuracy score

acc_score = accuracy_score(y_test, y_pred)

 # Display results
print(f"Accuracy Score : {acc_score}")

Accuracy Score : 0.4260089686098655


In [26]:
 # Print classification reports
print(f"Classification Report")
print(classification_report(y_test, y_pred))

Classification Report
              precision    recall  f1-score   support

           0       0.68      0.98      0.80       254
           1       0.40      0.06      0.11        31
           2       0.17      0.05      0.07        21
           3       0.11      0.03      0.05        32
           4       0.19      0.26      0.22        31
           5       0.09      0.04      0.06        23
           6       0.14      0.19      0.16        36
           7       0.11      0.11      0.11        28
           8       0.11      0.10      0.10        31
           9       0.00      0.00      0.00        33
          10       0.08      0.06      0.07        33
          11       0.10      0.03      0.05        32
          12       0.08      0.29      0.13        14
          13       0.06      0.05      0.06        19
          14       0.00      0.00      0.00        23
          15       0.19      0.24      0.21        25
          16       0.40      0.33      0.36        18
     

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [27]:
 # Calculate the balanced accuracy score
print(balanced_accuracy_score(y, y_pred))

ValueError: Found input variables with inconsistent numbers of samples: [3565, 892]