In [None]:
# Import dependencies.
import pandas as pd
import hvplot.pandas
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report, mean_squared_error, r2_score
import joblib


In [None]:
# Read in the cleaned CSV file. The category-code columns are forced to the
# generic object dtype so pandas does not parse them as numbers.
# `np.object` was removed in NumPy 1.24; the builtin `object` is the supported
# spelling and selects the same dtype.
df3 = pd.read_csv(
    "../Resources/cleaned_data_sea_final.csv",
    dtype={
        "bathrooms_cat": object,
        "bedrooms_cat": object,
        "zipcode": object,
        "years_in_business_cat": object,
    },
)
# Exclude the first column (presumably a saved index -- confirm against the
# CSV) and preview the dataframe.
df3 = df3.iloc[:, 1:]
df3.head()

In [None]:
# Show a concise summary of the dataframe (columns, non-null counts, dtypes)
# to check which columns were read as object versus numeric.
df3.info()

In [None]:
# Make sure 'zipcode' is held as an object column so that it is treated as a
# categorical label rather than a number.
df3 = df3.assign(zipcode=df3["zipcode"].astype(object))

In [None]:
# Preview all unique bedroom categories in the dataframe; each distinct value
# becomes its own dummy column when the categorical columns are encoded.
df3["bedrooms_cat"].unique()

In [None]:
# Preview all unique bathroom categories in the dataframe.
df3["bathrooms_cat"].unique()

In [None]:
# Remove the 'amenities_count' column and display the resulting dataframe.
df3 = df3.drop("amenities_count", axis=1)
df3

In [None]:
# Re-check the dataframe summary after dropping 'amenities_count'.
df3.info()

In [None]:
# One-hot encode the categorical (object-dtype) columns with get_dummies.
categorical_columns_2 = df3.dtypes[df3.dtypes == "object"].index.tolist()

# These object columns are excluded from the encoding. `.remove` raises a
# ValueError if one of them is not present in the dataframe.
for excluded_name in (
    "listing_url",
    "last_scraped",
    "host_since",
    "expected_annual_occupency",
):
    categorical_columns_2.remove(excluded_name)

categorical_dummies_2 = pd.get_dummies(df3[categorical_columns_2])

# Display the encoded columns.
categorical_dummies_2

In [None]:
# Collect every non-object column of df3.
numeric_columns_2 = df3.dtypes[df3.dtypes != "object"].index.tolist()

# Columns left out of the numeric feature table: identifiers and dates,
# coordinates, quantile helper columns, outlier flags, and raw columns whose
# information is already carried by a categorical counterpart.
excluded_numeric_columns = [
    "listing_url", "host_since", "last_scraped", "latitude", "longitude",
    "0.25_price", "0.5_price", "0.75_price",
    "0.25_price_bedcat", "0.5_price_bedcat", "0.75_price_bedcat",
    "0.25_acc", "0.5_acc", "0.75_acc",
    "0.25_bathroom", "0.5_bathroom", "0.75_bathroom",
    "0.25_revenue", "0.5_revenue", "0.75_revenue",
    "0.25_revenue_acc", "0.5_revenue_acc", "0.75_revenue_acc",
    "outlier", "outlier_2", "outlier_3",
    "bedrooms", "bathrooms_text", "years_in_business",
]

numeric_frame_2 = df3[numeric_columns_2]
keep_mask_2 = ~numeric_frame_2.columns.isin(excluded_numeric_columns)
data_2 = numeric_frame_2.loc[:, keep_mask_2]
data_2

In [None]:
# Place the numeric columns and the one-hot encoded columns side by side.
# Both indexes are reset first so the rows line up by position.
numeric_part_2 = data_2.reset_index(drop=True)
dummies_part_2 = categorical_dummies_2.reset_index(drop=True)
df_dummies_2 = pd.concat([numeric_part_2, dummies_part_2], axis=1)
df_dummies_2

# Random Forest
Target: price

In [None]:
# List every column of the combined dataframe, one name per line.
for column_name in df_dummies_2.columns:
    print(column_name)

In [None]:
# Build the candidate feature table by dropping the target and every column
# derived from price or revenue. Rows containing NaNs are removed in a later
# cell.
target_related_columns = ["revenue", "revenue_per_accommodates", "availability_365", "price"]
price_category_columns = [
    "price_cat_<25th", "price_cat_25th-50th", "price_cat_50th-75th", "price_cat_>75th",
]
revenue_category_columns = [
    "revenue_cat_<25th", "revenue_cat_25th-50th", "revenue_cat_50th-75th", "revenue_cat_>75th",
    "revenue_cat_acc_<25th", "revenue_cat_acc_25th-50th", "revenue_cat_acc_50th-75th", "revenue_cat_acc_>75th",
]
X1 = df_dummies_2.drop(
    columns=target_related_columns + price_category_columns + revenue_category_columns
)

# List the remaining feature columns, one name per line.
for feature_name in X1.columns:
    print(feature_name)


In [None]:
# Modelling table: the features plus the 'price' target from df3, aligned by
# position, with every row that contains a missing value removed.
price_column = df3[["price"]].reset_index(drop=True)
rf_data = pd.concat([X1.reset_index(drop=True), price_column], axis=1).dropna()
rf_data

In [None]:
# Feature matrix: every column of the modelling table except the target.
X = rf_data.drop(columns="price")
X

In [None]:
# Target vector: the listing price as a 1-D NumPy array.
# `Series.ravel()` is deprecated since pandas 2.2; `.to_numpy()` returns the
# same 1-D array without the deprecation warning.
y = rf_data["price"].to_numpy()
y

In [None]:
# Split into train and test sets. The fixed random_state makes the split
# reproducible; no test_size is given, so scikit-learn's default proportions apply.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)

# Random forests do not require feature scaling

In [None]:
# Scaling is intentionally left disabled for the tree-based model.
# NOTE(review): if it is ever re-enabled, fit the scaler on X_train only and
# reuse it to transform X_test; the lines below fit a second scaler on the
# test set.
# #Scaling the data
# X_train_scale=StandardScaler().fit_transform(X_train)
# X_test_scale=StandardScaler().fit_transform(X_test)

In [None]:
# Get the value counts of our y_test dataset, i.e. how many test listings
# share each distinct price.
pd.DataFrame(y_test).value_counts()

In [None]:
# Baseline model: a 500-tree random forest with a fixed seed, trained on the
# training split (`fit` returns the fitted estimator itself).
regressor = RandomForestRegressor(n_estimators=500, random_state=0).fit(X_train, y_train)
# Predict prices for the held-out test split.
prediction = regressor.predict(X_test)

In [None]:
# Evaluate the baseline model on the test split.
baseline_mse = mean_squared_error(y_test, prediction)
print("MSE:", baseline_mse)

baseline_r2 = r2_score(y_test, prediction)
print("R2:", baseline_r2)

# Hyperparameter tuning

In [None]:
# Search space for the hyperparameter tuning process.
param_grid = {
    "bootstrap": [True, False],
    "max_depth": [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, None],
    # 'auto' was deprecated in scikit-learn 1.1 and removed in 1.3. For a
    # RandomForestRegressor it meant "consider all features", which is
    # spelled 1.0 and is accepted by old and new releases alike.
    "max_features": [1.0, "sqrt"],
    "min_samples_leaf": [1, 2, 4],
    "min_samples_split": [2, 5, 10],
    "n_estimators": [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000],
}

In [None]:
# Base estimator for the search. The fixed seed makes every candidate forest,
# and therefore the selected parameters, reproducible between runs.
rf = RandomForestRegressor(random_state=0)
# Randomized search: sample 40 parameter combinations (n_iter), score each
# with 3-fold cross-validation, and use all available cores (n_jobs=-1).
rf_random = RandomizedSearchCV(
    estimator=rf,
    param_distributions=param_grid,
    n_iter=40,
    cv=3,
    verbose=2,
    random_state=42,
    n_jobs=-1,
)
# Fit the random search model on the training split.
rf_random.fit(X_train, y_train)

In [None]:
# Show the parameter combination that scored best in the randomized search.
rf_random.best_params_

In [None]:
# Refit the random forest with the tuned hyperparameters.
# NOTE(review): these values are hard-coded, presumably copied from
# rf_random.best_params_ of an earlier run -- confirm they are still current.
tuned_params = {
    "n_estimators": 1600,
    "min_samples_split": 5,
    "min_samples_leaf": 2,
    "max_features": "sqrt",
    "max_depth": 80,
    "bootstrap": False,
}
regressor = RandomForestRegressor(random_state=0, **tuned_params)
regressor.fit(X_train, y_train)
# Predict prices for the held-out test split with the tuned model.
prediction = regressor.predict(X_test)

In [None]:
# Print the metrics of the tuned model.
# The `squared=False` keyword of mean_squared_error was deprecated in
# scikit-learn 1.4 and later removed; taking the square root of the MSE gives
# the same RMSE on every scikit-learn version.
print("rMSE:", np.sqrt(mean_squared_error(y_test, prediction)))

print("R2:", r2_score(y_test, prediction))

In [None]:
# Put the actual and predicted test prices side by side, one row per listing.
actual = pd.DataFrame(y_test, columns=["Actual: price"])
predicted = pd.DataFrame(prediction, columns=["Predicted: price"])
fit_df_price = pd.concat([actual, predicted], axis=1)
fit_df_price

In [None]:
# Create a scatter plot of predicted vs. actual prices from the above dataframe.
fig1=fit_df_price.hvplot.scatter(x="Actual: price", y="Predicted: price", width=800, height=400)
# Reference line: column 0 (values 0..899) is plotted against itself, giving
# the y = x diagonal on which a perfect prediction would fall.
fig2=pd.DataFrame(range(0,900)).hvplot.line(x=0, y=0, c="red")
# Overlay the scatter and the reference line.
fig3=fig1 * fig2

# Axis labels and font sizes; the title is left empty.
fig3=fig3.options(xlabel='Actual: price',
                  ylabel='Predicted: price',
                  fontsize={'title': 14,'labels': 14,'xticks': 12,'yticks': 12},
                  title='')
fig3

In [None]:
# Export the combined figure to a PNG file in the working directory.
# NOTE(review): PNG export usually needs extra tooling (e.g. selenium and a
# webdriver) -- confirm it is installed in this environment.
hvplot.save(fig3, 'price_prediction.png', fmt='png')

# Save and load the model to test with the test data

Please note that the model file is too large to be pushed to GitHub. Run the save-model cell below first, then load the model to test it.

In [None]:
# Save the fitted (tuned) regressor to disk next to the notebook.
joblib.dump(regressor, "./random_forest.joblib")

In [None]:
# Load the model back from the file written by the save cell.
# joblib files are pickle-based: only load files you created or trust.
model = joblib.load("./random_forest.joblib")

# Input the test data

In [None]:
# A single hand-written listing used to try out the saved model.
# NOTE(review): the category strings (including the leading space in ' 1-5')
# presumably have to match the training categories exactly -- verify.
sample_listing = {
    "accommodates": 6,
    "beds": 2,
    "minimum_nights": 2,
    "maximum_nights": 30,
    "zipcode": "98178.0",
    "room_type": "Entire home/apt",
    "amenities_cat": "moderate",
    "instant_bookable": "t",
    "bedrooms_cat": "2.0",
    "bathrooms_cat": "2",
    "years_in_business_cat": " 1-5",
}
df_test = pd.DataFrame([sample_listing])
df_test

In [None]:
# One-hot encode the object-dtype columns of the test listing.
# Despite its name, `categorical_columns_test` ends up holding the encoded
# dataframe, not a list of column names.
object_mask_test = df_test.dtypes == "object"
categorical_columns_test = pd.get_dummies(df_test.loc[:, object_mask_test])

# Display the encoded columns.
categorical_columns_test

In [None]:
# Keep the non-object columns of the test listing as they are.
numeric_columns_test = [
    name for name, column_dtype in df_test.dtypes.items() if column_dtype != "object"
]
data_test = df_test[numeric_columns_test]
data_test


In [None]:
# Combine the numeric and one-hot encoded parts of the test listing into a
# single one-row feature frame.
test_data = pd.concat([data_test, categorical_columns_test], axis="columns")
test_data

In [None]:
# One-row, all-zero template with the same columns as the training features,
# used to fill in the columns the test listing does not set.
X10 = X1.head(1) * 0
X10

In [None]:
# Align the one-row test frame to the exact feature layout used for training.
# Adding two concatenated rows only worked because both happened to carry
# index label 0, and it silently appended any dummy column the model had never
# seen. Reindexing on the training columns fixes the column order, fills
# unset features with 0, and does not depend on index labels.
unknown_columns = test_data.columns.difference(X1.columns)
if len(unknown_columns) > 0:
    # A column that is not a training feature means a category value does not
    # match the training data (e.g. a typo); fail here with a clear message.
    raise ValueError(
        "Test data has columns not seen during training: "
        + ", ".join(map(str, unknown_columns))
    )
X_Test = test_data.reindex(columns=X1.columns, fill_value=0).fillna(0)
X_Test

In [None]:
# Predict the price of the hand-written test listing with the loaded model.
predictions=model.predict(X_Test)

In [None]:
# Present the predicted price as a labelled one-column dataframe.
predicted_test = pd.DataFrame(predictions, columns=["Predicted: price"])

predicted_test