In [33]:
import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import confusion_matrix, classification_report, balanced_accuracy_score, roc_auc_score, accuracy_score


In [34]:
# Read in data sets - first set has denver data with outliers removed and encoded. Next set has ID and classification groupings.
df = pd.read_csv('1_denver_no_outliers_encoded_not_scaled.csv')

segments = pd.read_csv("1_pca_kmeans_seg.csv")

In [35]:
# segments.head()

rows = len(segments)
rows2 = len(df)
print({rows})
print({rows2})

{3565}
{3565}


In [36]:
# Check for nulls in rf_data and encoded_id_df so no issues when join.
null_counts = df.isnull().sum().sort_values(ascending=False)
null_counts_over0 = null_counts[null_counts > 0]
null_counts2 = segments.isnull().sum().sort_values(ascending=False)
null_counts2_over0 = null_counts2[null_counts2 > 0]
print(null_counts_over0)
print(null_counts2_over0)

Series([], dtype: int64)
Series([], dtype: int64)


In [37]:
# merge encoded dataframe with rest of data by id

rf_regress_df = pd.merge(segments, df, on='id', how='inner')
rf_regress_df.head()

Unnamed: 0,id,customer_segments_pca,accommodates,bedrooms,beds,price,availability_30,availability_60,availability_90,availability_365,number_of_reviews,number_of_reviews_ltm,number_of_reviews_l30d,review_scores_rating,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,bath_number,prop_type_reduced_Entire condo,prop_type_reduced_Entire guest suite,prop_type_reduced_Entire guesthouse,prop_type_reduced_Entire home,prop_type_reduced_Entire rental unit,prop_type_reduced_Entire townhouse,prop_type_reduced_Other,has_availability_f,has_availability_t
0,360,1,3,2.0,2.0,90.0,4,27,57,147,179,7,0,4.99,4.99,4.96,5.0,5.0,5.0,4.91,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
1,364,3,3,1.0,1.0,179.0,23,53,83,358,87,0,0,4.85,4.78,4.81,4.95,4.96,4.65,4.71,1.5,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
2,31503,3,2,1.0,1.0,103.0,12,42,72,347,159,27,3,4.91,4.92,5.0,4.95,4.99,4.88,4.88,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,39405,0,2,1.0,1.0,136.0,6,7,32,114,667,35,3,4.92,4.88,4.93,4.98,4.97,4.85,4.85,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
4,154999,1,2,1.0,1.0,162.0,0,0,0,139,11,0,0,4.7,4.4,4.7,4.7,4.9,4.9,4.6,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [38]:
# Check for nulls in rf_data and encoded_id_df so no issues when join.
rf_nulls = rf_regress_df.isnull().sum().sort_values(ascending=False)
rf_nulls_over0 = rf_nulls[rf_nulls > 0]

print(rf_nulls_over0)

Series([], dtype: int64)


In [39]:
# Set id as index so can retain id through the split and prediction process
rf_regress_df= rf_regress_df.set_index('id')

In [20]:
# check row count of merged dataframe

final_rows = len(rf_regress_df)
print(f"rf rows {final_rows}")

rf rows 3565


Below section builds the random forest regressor model.

We need to use regression because the 2 data points we plan to predict are continuous data (not classification data).

The predictions will be for:
- review_scores_rating
- availability_30



In [22]:
##### random forest no need to encode or scale

In [40]:
rf_regress_df.columns

Index(['customer_segments_pca', 'accommodates', 'bedrooms', 'beds', 'price',
       'availability_30', 'availability_60', 'availability_90',
       'availability_365', 'number_of_reviews', 'number_of_reviews_ltm',
       'number_of_reviews_l30d', 'review_scores_rating',
       'review_scores_accuracy', 'review_scores_cleanliness',
       'review_scores_checkin', 'review_scores_communication',
       'review_scores_location', 'review_scores_value', 'bath_number',
       'prop_type_reduced_Entire condo',
       'prop_type_reduced_Entire guest suite',
       'prop_type_reduced_Entire guesthouse', 'prop_type_reduced_Entire home',
       'prop_type_reduced_Entire rental unit',
       'prop_type_reduced_Entire townhouse', 'prop_type_reduced_Other',
       'has_availability_f', 'has_availability_t'],
      dtype='object')

In [41]:
# create x and y (drop 30 day booking or review scores rating)

X_avail = rf_regress_df.copy()
X_avail.drop('availability_30', axis=1, inplace=True)
X_rating = rf_regress_df.copy()
X_rating.drop('review_scores_rating', axis=1, inplace=True)
X_price = rf_regress_df.copy()
X_price.drop('price', axis=1, inplace=True)

y_avail = rf_regress_df['availability_30']
y_rating = rf_regress_df['review_scores_rating']
y_price = rf_regress_df['price']

In [42]:
# get y sets into array format
y_avail.ravel()
y_rating.ravel()
y_price.ravel()

array([ 90., 179., 103., ..., 128.,  50.,  86.])

In [43]:
# train_test_split on x and y for availability, ratings, price data sets
X_avail_train, X_avail_test, y_avail_train, y_avail_test = train_test_split(X_avail, y_avail, random_state=1)

X_rating_train, X_rating_test, y_rating_train, y_rating_test = train_test_split(X_rating, y_rating, random_state=1)

X_price_train, X_price_test, y_price_train, y_price_test = train_test_split(X_price, y_price, random_state=1)


In [44]:
# create random forest regression models for availability and bookings
rf_regress_avail_model = RandomForestRegressor()

rf_regress_avail_model = rf_regress_avail_model.fit(X_avail_train, y_avail_train)

rf_regress_rating_model = RandomForestRegressor()

rf_regress_rating_model = rf_regress_rating_model.fit(X_rating_train, y_rating_train)

rf_regress_price_model = RandomForestRegressor()

rf_regress_price_model = rf_regress_price_model.fit(X_price_train, y_price_train)

In [45]:
# make predictions

y_pred_regress_avail = rf_regress_avail_model.predict(X_avail_test)
y_pred_regress_rating = rf_regress_rating_model.predict(X_rating_test)
y_pred_regress_price = rf_regress_price_model.predict(X_price_test)

In [46]:
# Calculate Mean Absolute Error
mae_avail = mean_absolute_error(y_avail_test, y_pred_regress_avail)
mae_rating = mean_absolute_error(y_rating_test, y_pred_regress_rating)
mae_price = mean_absolute_error(y_price_test, y_pred_regress_price)
print(f"Mean Absolute Error availability: {mae_avail}")
print(f"Mean Absolute Error rating: {mae_rating}")
print(f"Mean Absolute Error price: {mae_price}")
print("------------------------------------------")

# Calculate Mean Squared Error
mse_avail = mean_squared_error(y_avail_test, y_pred_regress_avail)
mse_rating = mean_squared_error(y_rating_test, y_pred_regress_rating)
mse_price = mean_squared_error(y_price_test, y_pred_regress_price)
print(f"Mean Squared Error availiability: {mse_avail}")
print(f"Mean Squared Error rating: {mse_rating}")
print(f"Mean Squared Error price: {mse_price}")
print("------------------------------------------")

# Calculate Root Mean Squared Error
rmse_avail = np.sqrt(mse_avail)
rmse_rating = np.sqrt(mse_rating)
rmse_price = np.sqrt(mse_price)
print(f"Root Mean Squared Error availability: {rmse_avail}")
print(f"Root Mean Squared Error rating: {rmse_rating}")
print(f"Root Mean Squared Error price: {rmse_price}")
print("------------------------------------------")

# Calculate R-squared
r2_avail = r2_score(y_avail_test, y_pred_regress_avail)
r2_rating = r2_score(y_rating_test, y_pred_regress_rating)
r2_price = r2_score(y_price_test, y_pred_regress_price)
print(f"R-squared availability: {r2_avail}")
print(f"R-squared rating: {r2_rating}")
print(f"R-squared price: {r2_price}")

Mean Absolute Error availability: 1.845291479820628
Mean Absolute Error rating: 0.06323419282511199
Mean Absolute Error price: 41.913274343369636
------------------------------------------
Mean Squared Error availiability: 9.471533183856504
Mean Squared Error rating: 0.01840462350896862
Mean Squared Error price: 3070.229349595383
------------------------------------------
Root Mean Squared Error availability: 3.0775856095089384
Root Mean Squared Error rating: 0.13566364107220705
Root Mean Squared Error price: 55.40965032912032
------------------------------------------
R-squared availability: 0.8859635931997473
R-squared rating: 0.8049707791893899
R-squared price: 0.501057820702572


We are trying to predict continuous data for both the 30 day bookings and the review ratings.  Therefore we have to use a regression model (not a classification model). And our metrics to measure accuracy need to be suited for regression, not classification.

Below is further info on the 4 regression metrics:

mean absolute error: 
- lower outcome is better
- range 0 to infinity
- MAE provides the average absolute difference between the predicted values and the actual values. A MAE of 0 means perfect predictions with no errors. The "goodness" of MAE depends on the scale of the target variable; a small MAE is more desirable, indicating that the model's predictions are close to the actual values.

mean squared error:
- lower outcome is better
- range 0 to infinity
- MSE is similar to MAE but squares the differences before averaging them, which penalizes larger errors more severely. A MSE of 0 indicates perfect predictions. Like MAE, whether an MSE is considered good depends on the scale of the data. Lower MSE values indicate better model performance.

root mean squared error:
- lower outcome is better
- range 0 to infinity
- RMSE is the square root of MSE, bringing the error metric back to the scale of the target variable. This makes it more interpretable than MSE. A lower RMSE indicates better model performance, and an RMSE of 0 indicates perfect predictions.

R-squared:
- higher outcome is better
- range neg. infinity to 1
- R-squared measures the proportion of the variance in the dependent variable that is predictable from the independent variables. A value of 1 indicates that the model perfectly predicts the target variable, while a value of 0 indicates that the model does no better than simply predicting the mean of the target variable. Negative R-squared values can occur when the chosen model fits worse than a horizontal line. Higher values indicate better fit, but the "goodness" of an R-squared value also depends on the context and complexity of the model.