In [1]:
# Import dependencies.
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier, IsolationForest, RandomForestRegressor
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, mean_squared_error, r2_score
import tensorflow as tf

In [2]:
# Read in CSV files (from local drive for analysis purposes).
reviews_ml = pd.read_csv('saved_csvs/reviews_ml.csv')

In [3]:
# Preview dataframe.
reviews_ml

Unnamed: 0,accommodates,price,availability_30,availability_60,availability_90,availability_365,number_of_reviews,review_scores_rating,review_scores_accuracy,review_scores_cleanliness,...,revenue_pp_365,rpp_90_quartile,rpp_90_binary,num_of_revs_cat_1-25,num_of_revs_cat_101-150,num_of_revs_cat_151-200,num_of_revs_cat_201-300,num_of_revs_cat_26-50,num_of_revs_cat_300+,num_of_revs_cat_51-100
0,4,725.0,28,58,88,178,15,4.93,4.93,4.64,...,33893.750000,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1,6,349.0,0,0,0,0,153,4.81,4.82,4.71,...,21230.833333,2.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2,6,225.0,0,0,13,123,92,4.77,4.76,4.56,...,9075.000000,2.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,14,794.0,5,14,19,205,25,4.50,4.54,4.21,...,9074.285714,2.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
4,12,1250.0,14,31,43,261,26,4.72,4.67,4.56,...,10833.333333,2.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10848,4,152.0,5,26,55,63,2,5.00,5.00,5.00,...,11476.000000,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
10849,6,144.0,15,45,75,136,2,5.00,5.00,5.00,...,5496.000000,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
10850,2,157.0,25,55,84,84,1,5.00,5.00,5.00,...,22058.500000,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
10851,2,171.0,21,49,78,78,1,5.00,5.00,5.00,...,24538.500000,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


In [4]:
# Preview columns.
reviews_ml.columns

Index(['accommodates', 'price', 'availability_30', 'availability_60',
       'availability_90', 'availability_365', 'number_of_reviews',
       'review_scores_rating', 'review_scores_accuracy',
       'review_scores_cleanliness', 'review_scores_checkin',
       'review_scores_communication', 'review_scores_location',
       'review_scores_value', 'revenue_30', 'revenue_60', 'revenue_90',
       'revenue_365', 'revenue_pp_30', 'revenue_pp_60', 'revenue_pp_90',
       'revenue_pp_365', 'rpp_90_quartile', 'rpp_90_binary',
       'num_of_revs_cat_1-25', 'num_of_revs_cat_101-150',
       'num_of_revs_cat_151-200', 'num_of_revs_cat_201-300',
       'num_of_revs_cat_26-50', 'num_of_revs_cat_300+',
       'num_of_revs_cat_51-100'],
      dtype='object')

In [5]:
# Take out unnecessary columns for machine learning with 90 day availability.
reviews_ml_90 = reviews_ml.drop(columns=['price', 'number_of_reviews', 'availability_30', 'availability_60', 
                             'availability_90', 'availability_365', 'revenue_30', 'revenue_60', 'revenue_90', 
                             'revenue_365', 'revenue_pp_30', 'revenue_pp_60', 'revenue_pp_90', 'revenue_pp_365', 
                             'rpp_90_quartile'])

reviews_ml_90.head(5)

Unnamed: 0,accommodates,review_scores_rating,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,rpp_90_binary,num_of_revs_cat_1-25,num_of_revs_cat_101-150,num_of_revs_cat_151-200,num_of_revs_cat_201-300,num_of_revs_cat_26-50,num_of_revs_cat_300+,num_of_revs_cat_51-100
0,4,4.93,4.93,4.64,4.79,5.0,5.0,4.71,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1,6,4.81,4.82,4.71,4.99,4.97,4.77,4.8,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2,6,4.77,4.76,4.56,4.91,4.91,4.81,4.76,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,14,4.5,4.54,4.21,4.54,4.54,4.83,4.42,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
4,12,4.72,4.67,4.56,4.92,4.83,4.79,4.58,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


# Binary Revenue per Person Random Forest

In [6]:
# Define features and target sets.
X = reviews_ml_90.drop(columns=['rpp_90_binary'])

y = reviews_ml_90['rpp_90_binary']

In [7]:
# Splitting into Train and Test sets.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [8]:
# Creating StandardScaler instance.
scaler = StandardScaler()

# Fitting StandardScaler.
X_scaler = scaler.fit(X_train)

# Scaling data.
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [9]:
# Create a random forest classifier.
rf_model = RandomForestClassifier(n_estimators=500, random_state=1)

# Fitting the model.
rf_model = rf_model.fit(X_train_scaled, y_train)

# Making predictions using the testing data.
predictions = rf_model.predict(X_test_scaled)

In [10]:
# Calculating the confusion matrix.
matrix = confusion_matrix(y_test, predictions)
matrix_df = pd.DataFrame(
    matrix, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"]
)

# Calculating the accuracy score.
acc_score = accuracy_score(y_test, predictions)

# Displaying results.
print("Confusion Matrix")
display(matrix_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, predictions))

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,784,528
Actual 1,555,847


Accuracy Score : 0.600957995578482
Classification Report
              precision    recall  f1-score   support

         0.0       0.59      0.60      0.59      1312
         1.0       0.62      0.60      0.61      1402

    accuracy                           0.60      2714
   macro avg       0.60      0.60      0.60      2714
weighted avg       0.60      0.60      0.60      2714



In [11]:
# Calculate feature importance.
importances = rf_model.feature_importances_

# Sort the features by their importance.
sorted(zip(rf_model.feature_importances_, X.columns), reverse=True)

[(0.13820604699354747, 'review_scores_location'),
 (0.13610229945254512, 'review_scores_value'),
 (0.12494164773409344, 'review_scores_cleanliness'),
 (0.12247817533418837, 'review_scores_rating'),
 (0.11287556787042781, 'review_scores_accuracy'),
 (0.10218783291222519, 'accommodates'),
 (0.10061715445033444, 'review_scores_checkin'),
 (0.09844773437826662, 'review_scores_communication'),
 (0.012276909404664875, 'num_of_revs_cat_1-25'),
 (0.012021865451384164, 'num_of_revs_cat_26-50'),
 (0.011617476053101837, 'num_of_revs_cat_51-100'),
 (0.008724656706530581, 'num_of_revs_cat_101-150'),
 (0.006893096233091311, 'num_of_revs_cat_201-300'),
 (0.006858097539112454, 'num_of_revs_cat_151-200'),
 (0.005751439486486377, 'num_of_revs_cat_300+')]

# Binary Revenue per Person Random Forest with 'accommodates' binned

In [12]:
# Make a copy of 'reviews_ml_90' dataframe for binning purposes.
reviews_ml_90_acc_bin = reviews_ml_90.copy()

In [13]:
# Preview dataframe.
reviews_ml_90_acc_bin.head()

Unnamed: 0,accommodates,review_scores_rating,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,rpp_90_binary,num_of_revs_cat_1-25,num_of_revs_cat_101-150,num_of_revs_cat_151-200,num_of_revs_cat_201-300,num_of_revs_cat_26-50,num_of_revs_cat_300+,num_of_revs_cat_51-100
0,4,4.93,4.93,4.64,4.79,5.0,5.0,4.71,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1,6,4.81,4.82,4.71,4.99,4.97,4.77,4.8,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2,6,4.77,4.76,4.56,4.91,4.91,4.81,4.76,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,14,4.5,4.54,4.21,4.54,4.54,4.83,4.42,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
4,12,4.72,4.67,4.56,4.92,4.83,4.79,4.58,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [14]:
# Loop through each row and bin them together accordingly.
reviews_ml_90_acc_bin['accommodates_cat'] = reviews_ml_90_acc_bin['accommodates']

for i in reviews_ml_90_acc_bin['accommodates_cat']:

    if 0 < i <= 4:
        reviews_ml_90_acc_bin['accommodates_cat'] = reviews_ml_90_acc_bin['accommodates_cat'].replace(i, '1-4')

    elif 4 < i <= 8:
        reviews_ml_90_acc_bin['accommodates_cat'] = reviews_ml_90_acc_bin['accommodates_cat'].replace(i, '5-8')

    elif 8 < i <= 12:
        reviews_ml_90_acc_bin['accommodates_cat'] = reviews_ml_90_acc_bin['accommodates_cat'].replace(i, '9-12')
        
    else:
        reviews_ml_90_acc_bin['accommodates_cat'] = reviews_ml_90_acc_bin['accommodates_cat'].replace(i, '13-16')
        
reviews_ml_90_acc_bin['accommodates_cat'].value_counts()

accommodates_cat
1-4      6269
5-8      3473
9-12      806
13-16     305
Name: count, dtype: int64

In [15]:
# Drop old 'accommodates' column.
reviews_ml_90_acc_bin = reviews_ml_90_acc_bin.drop(columns=['accommodates'])

In [16]:
# Check data types.
reviews_ml_90_acc_bin.dtypes

review_scores_rating           float64
review_scores_accuracy         float64
review_scores_cleanliness      float64
review_scores_checkin          float64
review_scores_communication    float64
review_scores_location         float64
review_scores_value            float64
rpp_90_binary                  float64
num_of_revs_cat_1-25           float64
num_of_revs_cat_101-150        float64
num_of_revs_cat_151-200        float64
num_of_revs_cat_201-300        float64
num_of_revs_cat_26-50          float64
num_of_revs_cat_300+           float64
num_of_revs_cat_51-100         float64
accommodates_cat                object
dtype: object

In [17]:
# Create categorical variable list.
cat_columns = reviews_ml_90_acc_bin.dtypes[reviews_ml_90_acc_bin.dtypes == "object"].index.tolist()

In [18]:
# Create a OneHotEncoder instance.
enc = OneHotEncoder(sparse=False)

# Fit and transform the OneHotEncoder using the categorical variable list.
encode_df = pd.DataFrame(enc.fit_transform(reviews_ml_90_acc_bin[cat_columns]))

# Add the encoded variable names to the dataframe.
encode_df.columns = enc.get_feature_names_out()
encode_df



Unnamed: 0,accommodates_cat_1-4,accommodates_cat_13-16,accommodates_cat_5-8,accommodates_cat_9-12
0,1.0,0.0,0.0,0.0
1,0.0,0.0,1.0,0.0
2,0.0,0.0,1.0,0.0
3,0.0,1.0,0.0,0.0
4,0.0,0.0,0.0,1.0
...,...,...,...,...
10848,1.0,0.0,0.0,0.0
10849,0.0,0.0,1.0,0.0
10850,1.0,0.0,0.0,0.0
10851,1.0,0.0,0.0,0.0


In [19]:
# Merge one-hot encoded features and drop the originals.
reviews_ml_90_acc_bin = reviews_ml_90_acc_bin.merge(encode_df, left_index=True, right_index=True)
reviews_ml_90_acc_bin = reviews_ml_90_acc_bin.drop(columns=cat_columns)
reviews_ml_90_acc_bin.head(5)

Unnamed: 0,review_scores_rating,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,rpp_90_binary,num_of_revs_cat_1-25,num_of_revs_cat_101-150,num_of_revs_cat_151-200,num_of_revs_cat_201-300,num_of_revs_cat_26-50,num_of_revs_cat_300+,num_of_revs_cat_51-100,accommodates_cat_1-4,accommodates_cat_13-16,accommodates_cat_5-8,accommodates_cat_9-12
0,4.93,4.93,4.64,4.79,5.0,5.0,4.71,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,4.81,4.82,4.71,4.99,4.97,4.77,4.8,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,4.77,4.76,4.56,4.91,4.91,4.81,4.76,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
3,4.5,4.54,4.21,4.54,4.54,4.83,4.42,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,4.72,4.67,4.56,4.92,4.83,4.79,4.58,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0


In [20]:
# Define features and target sets.
X = reviews_ml_90_acc_bin.drop(columns=['rpp_90_binary'])

y= reviews_ml_90_acc_bin['rpp_90_binary']

In [21]:
# Splitting into Train and Test sets.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [22]:
# Creating StandardScaler instance.
scaler = StandardScaler()

# Fitting StandardScaler.
X_scaler = scaler.fit(X_train)

# Scaling data.
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [23]:
# Create a random forest classifier.
rf_model = RandomForestClassifier(n_estimators=500, random_state=1)

# Fitting the model.
rf_model = rf_model.fit(X_train_scaled, y_train)

# Making predictions using the testing data.
predictions = rf_model.predict(X_test_scaled)

In [24]:
# Calculating the confusion matrix.
matrix = confusion_matrix(y_test, predictions)
matrix_df = pd.DataFrame(
    matrix, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"]
)

# Calculating the accuracy score.
acc_score = accuracy_score(y_test, predictions)

# Displaying results.
print("Confusion Matrix")
display(matrix_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, predictions))

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,781,531
Actual 1,569,833


Accuracy Score : 0.5946941783345615
Classification Report
              precision    recall  f1-score   support

         0.0       0.58      0.60      0.59      1312
         1.0       0.61      0.59      0.60      1402

    accuracy                           0.59      2714
   macro avg       0.59      0.59      0.59      2714
weighted avg       0.60      0.59      0.59      2714



In [25]:
# Calculate feature importance.
importances = rf_model.feature_importances_

# Sort the features by their importance.
sorted(zip(rf_model.feature_importances_, X.columns), reverse=True)

[(0.14533122162082596, 'review_scores_location'),
 (0.14502691869485232, 'review_scores_value'),
 (0.13207614456285077, 'review_scores_cleanliness'),
 (0.12933570684946752, 'review_scores_rating'),
 (0.12198579733775763, 'review_scores_accuracy'),
 (0.10750292046356019, 'review_scores_checkin'),
 (0.10460442881687619, 'review_scores_communication'),
 (0.015992412101464865, 'accommodates_cat_1-4'),
 (0.013971129254505017, 'accommodates_cat_5-8'),
 (0.013235343439254468, 'num_of_revs_cat_1-25'),
 (0.013096360766553218, 'num_of_revs_cat_26-50'),
 (0.012804327455990088, 'num_of_revs_cat_51-100'),
 (0.009641595550829853, 'num_of_revs_cat_101-150'),
 (0.009003715321706508, 'accommodates_cat_9-12'),
 (0.0075682270649378, 'num_of_revs_cat_151-200'),
 (0.007522978144142733, 'num_of_revs_cat_201-300'),
 (0.006107083879110785, 'num_of_revs_cat_300+'),
 (0.005193688675314045, 'accommodates_cat_13-16')]

# Binary Price Random Forest

In [26]:
# Calculate the quartiles.
quartiles = reviews_ml['price'].quantile([.25, .5, .75])
lowerq = quartiles[0.25]
mean = quartiles[0.5]
upperq = quartiles[0.75]
iqr = upperq - lowerq 
lower_bound = lowerq - (1.5 * iqr)
upper_bound = upperq + (1.5 * iqr)

In [27]:
# Loop through each row and bin them together accordingly.
reviews_ml['price_binary'] = reviews_ml['price']

for i in reviews_ml['price_binary']:
    
    if i > mean:
        reviews_ml['price_binary'] = reviews_ml['price_binary'].replace(i, 1)
        
    else:
        reviews_ml['price_binary'] = reviews_ml['price_binary'].replace(i, 0)

In [28]:
# Preview columns.
reviews_ml_90.columns

Index(['accommodates', 'review_scores_rating', 'review_scores_accuracy',
       'review_scores_cleanliness', 'review_scores_checkin',
       'review_scores_communication', 'review_scores_location',
       'review_scores_value', 'rpp_90_binary', 'num_of_revs_cat_1-25',
       'num_of_revs_cat_101-150', 'num_of_revs_cat_151-200',
       'num_of_revs_cat_201-300', 'num_of_revs_cat_26-50',
       'num_of_revs_cat_300+', 'num_of_revs_cat_51-100'],
      dtype='object')

In [29]:
# Define features and target sets.
# 'accommodates' column has to be removed because it is too cause_and_effect with price. About 80% accuracy with it included.
X = reviews_ml_90.drop(columns=['rpp_90_binary', 'accommodates'])

y = reviews_ml['price_binary']

In [30]:
# Splitting into Train and Test sets.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [31]:
# Creating StandardScaler instance.
scaler = StandardScaler()

# Fitting StandardScaler.
X_scaler = scaler.fit(X_train)

# Scaling data.
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [32]:
# Create a random forest classifier.
rf_model = RandomForestClassifier(n_estimators=500, random_state=1)

# Fitting the model.
rf_model = rf_model.fit(X_train_scaled, y_train)

# Making predictions using the testing data.
predictions = rf_model.predict(X_test_scaled)

In [33]:
# Calculating the confusion matrix.
matrix = confusion_matrix(y_test, predictions)
matrix_df = pd.DataFrame(
    matrix, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"]
)

# Calculating the accuracy score.
acc_score = accuracy_score(y_test, predictions)

# Displaying results.
print("Confusion Matrix")
display(matrix_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, predictions))

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,808,553
Actual 1,468,885


Accuracy Score : 0.6238025055268975
Classification Report
              precision    recall  f1-score   support

         0.0       0.63      0.59      0.61      1361
         1.0       0.62      0.65      0.63      1353

    accuracy                           0.62      2714
   macro avg       0.62      0.62      0.62      2714
weighted avg       0.62      0.62      0.62      2714



In [34]:
# Calculate feature importance.
importances = rf_model.feature_importances_

# Sort the features by their importance.
sorted(zip(rf_model.feature_importances_, X.columns), reverse=True)

[(0.16930573269756086, 'review_scores_location'),
 (0.1614071160893729, 'review_scores_value'),
 (0.13920118509132506, 'review_scores_cleanliness'),
 (0.13081108605635772, 'review_scores_rating'),
 (0.11990226956839088, 'review_scores_accuracy'),
 (0.10776980069760073, 'review_scores_communication'),
 (0.1073075001908408, 'review_scores_checkin'),
 (0.011176010439778332, 'num_of_revs_cat_1-25'),
 (0.010757547922328668, 'num_of_revs_cat_51-100'),
 (0.010540413746128225, 'num_of_revs_cat_26-50'),
 (0.009591526138724915, 'num_of_revs_cat_300+'),
 (0.00818288757202981, 'num_of_revs_cat_201-300'),
 (0.007432758834628282, 'num_of_revs_cat_101-150'),
 (0.0066141649549329735, 'num_of_revs_cat_151-200')]

# Check for more Outliers

In [35]:
# Create an empty list to hold our possible outliers.
outliers = []

# Calculate the IQR and quantitatively determine if there are any potential outliers in the 'price' column.
quartiles = reviews_ml_90['accommodates'].quantile([0.05, .25, .5, .75, 0.95])
lowerq = quartiles[0.05]
upperq = quartiles[0.95]
iqr = upperq - lowerq 
lower_bound = lowerq - (1.5 * iqr)
upper_bound = upperq + (1.5 * iqr)

In [36]:
# Determine outliers using upper and lower bounds.
for i in reviews_ml_90['accommodates']:
    if (i < lower_bound) | (i > upper_bound):
        outliers.append(i)

# Print out all the possible outliers.
print(f'Count: {len(outliers)}, Potential outliers: {outliers}')

Count: 0, Potential outliers: []


In [37]:
# Create an empty list to hold our possible outliers.
outliers_2 = []

# Calculate the IQR and quantitatively determine if there are any potential outliers in the 'price' column.
quartiles = reviews_ml_90['review_scores_rating'].quantile([0.05, .25, .5, .75, 0.95])
lowerq = quartiles[0.05]
upperq = quartiles[0.95]
iqr = upperq - lowerq 
lower_bound = lowerq - (1.5 * iqr)
upper_bound = upperq + (1.5 * iqr)

In [38]:
# Determine outliers using upper and lower bounds.
for i in reviews_ml_90['review_scores_rating']:
    if (i < lower_bound) | (i > upper_bound):
        outliers_2.append(i)

# Print out all the possible outliers.
print(f'Count: {len(outliers_2)}, Potential outliers: {outliers_2}')

Count: 36, Potential outliers: [1.0, 1.0, 1.5, 2.0, 1.0, 2.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 2.0, 1.0, 1.0, 1.0, 1.0, 1.0, 2.5, 2.67, 2.5, 1.0, 1.0, 1.5, 2.33, 2.5, 1.5, 2.5, 2.0, 1.0, 2.0, 2.0, 1.0, 2.0, 1.0]


In [39]:
# Loop through each row and bin them together accordingly.
reviews_ml_90['ratings_cat'] = reviews_ml_90['review_scores_rating']

for i in reviews_ml_90['ratings_cat']:

    if 4.5 < i <= 5:
        reviews_ml_90['ratings_cat'] = reviews_ml_90['ratings_cat'].replace(i, '4.51-5.0')

    elif 4 <= i <= 4.5:
        reviews_ml_90['ratings_cat'] = reviews_ml_90['ratings_cat'].replace(i, '4.0-4.5')
        
    elif 3 <= i < 4:
        reviews_ml_90['ratings_cat'] = reviews_ml_90['ratings_cat'].replace(i, '3.0-3.99')

    elif 2 <= i < 3:
        reviews_ml_90['ratings_cat'] = reviews_ml_90['ratings_cat'].replace(i, '2.0-2.99')
        
    else:
        reviews_ml_90['ratings_cat'] = reviews_ml_90['ratings_cat'].replace(i, '<2.0')
        
reviews_ml_90['ratings_cat'].value_counts()

ratings_cat
4.51-5.0    9448
4.0-4.5     1155
3.0-3.99     214
<2.0          23
2.0-2.99      13
Name: count, dtype: int64

In [40]:
# Create new dataframe with 'review_scores_rating' outliers intact and 'review_score_rating' column dropped
# (for later ml purposes).
reviews_ml_90_with_rat_outliers = reviews_ml_90.drop(columns=['review_scores_rating'])

In [41]:
# Remove outliers from 'review_scores_rating' column.
for i in reviews_ml_90['review_scores_rating']:
    if (i < lower_bound) | (i > upper_bound):
        reviews_ml_90['review_scores_rating'] = reviews_ml_90['review_scores_rating'].replace(i, 'DROP')

reviews_ml_90 = reviews_ml_90.drop(reviews_ml_90[reviews_ml_90['review_scores_rating'] == 'DROP'].index)

In [42]:
# Check to see that 36 rows were dropped.
len(reviews_ml_90)

10817

# Binary RpP Random Forest with 'review_score_ratings' Outliers dropped

In [43]:
# Preview dataframe.
reviews_ml_90

Unnamed: 0,accommodates,review_scores_rating,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,rpp_90_binary,num_of_revs_cat_1-25,num_of_revs_cat_101-150,num_of_revs_cat_151-200,num_of_revs_cat_201-300,num_of_revs_cat_26-50,num_of_revs_cat_300+,num_of_revs_cat_51-100,ratings_cat
0,4,4.93,4.93,4.64,4.79,5.00,5.00,4.71,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,4.51-5.0
1,6,4.81,4.82,4.71,4.99,4.97,4.77,4.80,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,4.51-5.0
2,6,4.77,4.76,4.56,4.91,4.91,4.81,4.76,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,4.51-5.0
3,14,4.5,4.54,4.21,4.54,4.54,4.83,4.42,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0-4.5
4,12,4.72,4.67,4.56,4.92,4.83,4.79,4.58,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,4.51-5.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10848,4,5.0,5.00,5.00,5.00,5.00,4.00,4.50,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,4.51-5.0
10849,6,5.0,5.00,5.00,5.00,5.00,5.00,5.00,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,4.51-5.0
10850,2,5.0,5.00,5.00,5.00,5.00,5.00,5.00,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,4.51-5.0
10851,2,5.0,5.00,5.00,5.00,5.00,5.00,5.00,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,4.51-5.0


In [44]:
# Define features and target sets.
X = reviews_ml_90.drop(columns=['rpp_90_binary', 'ratings_cat'])

y = reviews_ml_90['rpp_90_binary']

In [45]:
# Preview features dataset.
X.head(5)

Unnamed: 0,accommodates,review_scores_rating,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,num_of_revs_cat_1-25,num_of_revs_cat_101-150,num_of_revs_cat_151-200,num_of_revs_cat_201-300,num_of_revs_cat_26-50,num_of_revs_cat_300+,num_of_revs_cat_51-100
0,4,4.93,4.93,4.64,4.79,5.0,5.0,4.71,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1,6,4.81,4.82,4.71,4.99,4.97,4.77,4.8,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2,6,4.77,4.76,4.56,4.91,4.91,4.81,4.76,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,14,4.5,4.54,4.21,4.54,4.54,4.83,4.42,1.0,0.0,0.0,0.0,0.0,0.0,0.0
4,12,4.72,4.67,4.56,4.92,4.83,4.79,4.58,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [46]:
# Splitting into Train and Test sets.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [47]:
# Creating StandardScaler instance.
scaler = StandardScaler()

# Fitting StandardScaler.
X_scaler = scaler.fit(X_train)

# Scaling data.
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [48]:
# Create a random forest classifier.
rf_model = RandomForestClassifier(n_estimators=500, random_state=1)

# Fitting the model.
rf_model = rf_model.fit(X_train_scaled, y_train)

# Making predictions using the testing data.
predictions = rf_model.predict(X_test_scaled)

In [49]:
# Calculating the confusion matrix.
matrix = confusion_matrix(y_test, predictions)
matrix_df = pd.DataFrame(
    matrix, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"]
)

# Calculating the accuracy score.
acc_score = accuracy_score(y_test, predictions)

# Displaying results.
print("Confusion Matrix")
display(matrix_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, predictions))

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,746,621
Actual 1,484,854


Accuracy Score : 0.5914972273567468
Classification Report
              precision    recall  f1-score   support

         0.0       0.61      0.55      0.57      1367
         1.0       0.58      0.64      0.61      1338

    accuracy                           0.59      2705
   macro avg       0.59      0.59      0.59      2705
weighted avg       0.59      0.59      0.59      2705



In [50]:
# Calculate feature importance.
importances = rf_model.feature_importances_

# Sort the features by their importance.
sorted(zip(rf_model.feature_importances_, X.columns), reverse=True)

[(0.1384868640131458, 'review_scores_location'),
 (0.13342279303725782, 'review_scores_value'),
 (0.12356838354250164, 'review_scores_cleanliness'),
 (0.12253293441247985, 'review_scores_rating'),
 (0.11366193405978264, 'review_scores_accuracy'),
 (0.10178547065637521, 'accommodates'),
 (0.10117837112373447, 'review_scores_checkin'),
 (0.10068863769039371, 'review_scores_communication'),
 (0.012729132239838672, 'num_of_revs_cat_1-25'),
 (0.01197299461724169, 'num_of_revs_cat_26-50'),
 (0.011801822703539945, 'num_of_revs_cat_51-100'),
 (0.009107492935296972, 'num_of_revs_cat_101-150'),
 (0.006847898790681202, 'num_of_revs_cat_151-200'),
 (0.006584337323166106, 'num_of_revs_cat_201-300'),
 (0.005630932854564544, 'num_of_revs_cat_300+')]

# Binary RpP Random Forest with 'review_score_ratings' Categorized (reviews outliers intact)

In [51]:
# Preview dataframe.
reviews_ml_90_with_rat_outliers

Unnamed: 0,accommodates,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,rpp_90_binary,num_of_revs_cat_1-25,num_of_revs_cat_101-150,num_of_revs_cat_151-200,num_of_revs_cat_201-300,num_of_revs_cat_26-50,num_of_revs_cat_300+,num_of_revs_cat_51-100,ratings_cat
0,4,4.93,4.64,4.79,5.00,5.00,4.71,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,4.51-5.0
1,6,4.82,4.71,4.99,4.97,4.77,4.80,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,4.51-5.0
2,6,4.76,4.56,4.91,4.91,4.81,4.76,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,4.51-5.0
3,14,4.54,4.21,4.54,4.54,4.83,4.42,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0-4.5
4,12,4.67,4.56,4.92,4.83,4.79,4.58,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,4.51-5.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10848,4,5.00,5.00,5.00,5.00,4.00,4.50,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,4.51-5.0
10849,6,5.00,5.00,5.00,5.00,5.00,5.00,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,4.51-5.0
10850,2,5.00,5.00,5.00,5.00,5.00,5.00,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,4.51-5.0
10851,2,5.00,5.00,5.00,5.00,5.00,5.00,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,4.51-5.0


In [52]:
# Create categorical variable list.
cat_columns = reviews_ml_90_with_rat_outliers.dtypes[reviews_ml_90_with_rat_outliers.dtypes == "object"].index.tolist()

In [53]:
# Create a OneHotEncoder instance.
enc = OneHotEncoder(sparse=False)

# Fit and transform the OneHotEncoder using the categorical variable list.
encode_df = pd.DataFrame(enc.fit_transform(reviews_ml_90_with_rat_outliers[cat_columns]))

# Add the encoded variable names to the dataframe.
encode_df.columns = enc.get_feature_names_out()
encode_df



Unnamed: 0,ratings_cat_2.0-2.99,ratings_cat_3.0-3.99,ratings_cat_4.0-4.5,ratings_cat_4.51-5.0,ratings_cat_<2.0
0,0.0,0.0,0.0,1.0,0.0
1,0.0,0.0,0.0,1.0,0.0
2,0.0,0.0,0.0,1.0,0.0
3,0.0,0.0,1.0,0.0,0.0
4,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...
10848,0.0,0.0,0.0,1.0,0.0
10849,0.0,0.0,0.0,1.0,0.0
10850,0.0,0.0,0.0,1.0,0.0
10851,0.0,0.0,0.0,1.0,0.0


In [54]:
# Merge one-hot encoded features and drop the originals.
reviews_ml_90_with_rat_outliers = reviews_ml_90_with_rat_outliers.merge(encode_df, left_index=True, right_index=True)
reviews_ml_90_with_rat_outliers = reviews_ml_90_with_rat_outliers.drop(columns=cat_columns)
reviews_ml_90_with_rat_outliers.head(5)

Unnamed: 0,accommodates,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,rpp_90_binary,num_of_revs_cat_1-25,num_of_revs_cat_101-150,num_of_revs_cat_151-200,num_of_revs_cat_201-300,num_of_revs_cat_26-50,num_of_revs_cat_300+,num_of_revs_cat_51-100,ratings_cat_2.0-2.99,ratings_cat_3.0-3.99,ratings_cat_4.0-4.5,ratings_cat_4.51-5.0,ratings_cat_<2.0
0,4,4.93,4.64,4.79,5.0,5.0,4.71,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,6,4.82,4.71,4.99,4.97,4.77,4.8,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,6,4.76,4.56,4.91,4.91,4.81,4.76,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
3,14,4.54,4.21,4.54,4.54,4.83,4.42,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,12,4.67,4.56,4.92,4.83,4.79,4.58,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [55]:
# Define features and target sets.
X = reviews_ml_90_with_rat_outliers.drop(columns=['rpp_90_binary'])

y = reviews_ml_90_with_rat_outliers['rpp_90_binary']

In [56]:
# Splitting into Train and Test sets.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [57]:
# Creating StandardScaler instance.
scaler = StandardScaler()

# Fitting StandardScaler.
X_scaler = scaler.fit(X_train)

# Scaling data.
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [58]:
# Create a random forest classifier.
rf_model = RandomForestClassifier(n_estimators=500, random_state=1)

# Fitting the model.
rf_model = rf_model.fit(X_train_scaled, y_train)

# Making predictions using the testing data.
predictions = rf_model.predict(X_test_scaled)

In [59]:
# Calculating the confusion matrix.
matrix = confusion_matrix(y_test, predictions)
matrix_df = pd.DataFrame(
    matrix, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"]
)

# Calculating the accuracy score.
acc_score = accuracy_score(y_test, predictions)

# Displaying results.
print("Confusion Matrix")
display(matrix_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, predictions))

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,773,539
Actual 1,553,849


Accuracy Score : 0.5976418570375829
Classification Report
              precision    recall  f1-score   support

         0.0       0.58      0.59      0.59      1312
         1.0       0.61      0.61      0.61      1402

    accuracy                           0.60      2714
   macro avg       0.60      0.60      0.60      2714
weighted avg       0.60      0.60      0.60      2714



In [60]:
# Calculate feature importance.
importances = rf_model.feature_importances_

# Sort the features by their importance.
sorted(zip(rf_model.feature_importances_, X.columns), reverse=True)

[(0.15696816935348423, 'review_scores_value'),
 (0.15267385094130173, 'review_scores_location'),
 (0.14123211651587908, 'review_scores_cleanliness'),
 (0.12907840269591575, 'review_scores_accuracy'),
 (0.11420201543871691, 'review_scores_checkin'),
 (0.11216545541990566, 'review_scores_communication'),
 (0.11093833645705908, 'accommodates'),
 (0.013231896107801795, 'num_of_revs_cat_1-25'),
 (0.012679330316634454, 'num_of_revs_cat_26-50'),
 (0.012436404936883744, 'num_of_revs_cat_51-100'),
 (0.009307606849780803, 'num_of_revs_cat_101-150'),
 (0.007284661924819125, 'num_of_revs_cat_201-300'),
 (0.0071953695804809845, 'num_of_revs_cat_151-200'),
 (0.006513386748163672, 'ratings_cat_4.51-5.0'),
 (0.006238249492755322, 'num_of_revs_cat_300+'),
 (0.005664294513948332, 'ratings_cat_4.0-4.5'),
 (0.001626250388917697, 'ratings_cat_3.0-3.99'),
 (0.00030037257847533684, 'ratings_cat_<2.0'),
 (0.0002638297390762445, 'ratings_cat_2.0-2.99')]

# Isolation Forest with initial data from the top

In [61]:
# Define features and target sets.
X = reviews_ml_90.drop(columns=['rpp_90_binary', 'ratings_cat'])

y = reviews_ml_90['rpp_90_binary']

In [62]:
# Splitting into Train and Test sets.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [64]:
# Creating StandardScaler instance.
scaler = StandardScaler()

# Fitting StandardScaler.
X_scaler = scaler.fit(X_train)

# Scaling data.
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [65]:
# Create an isolation forest classifier.
if_model = IsolationForest(n_estimators=500, random_state=1)

# Fitting the model.
if_model = if_model.fit(X_train_scaled)

# Making predictions using the testing data.
predictions = if_model.predict(X_test_scaled)

In [66]:
# Calculating the confusion matrix.
matrix = confusion_matrix(y_test, predictions)
matrix_df = pd.DataFrame

# Calculating the accuracy score.
acc_score = accuracy_score(y_test, predictions)

# Displaying results.
print("Confusion Matrix")
display(matrix_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, predictions))

Confusion Matrix


pandas.core.frame.DataFrame

Accuracy Score : 0.46432532347504624
Classification Report
              precision    recall  f1-score   support

        -1.0       0.00      0.00      0.00         0
         0.0       0.00      0.00      0.00      1367
         1.0       0.52      0.94      0.67      1338

    accuracy                           0.46      2705
   macro avg       0.17      0.31      0.22      2705
weighted avg       0.26      0.46      0.33      2705



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


# Availability_365 as Target

In [67]:
# Take out unnecessary columns for machine learning with 365 day availability.
reviews_ml_365 = reviews_ml.drop(columns=['accommodates', 'price', 'number_of_reviews', 'availability_30', 'availability_60', 
                             'availability_90', 'revenue_30', 'revenue_60', 'revenue_90', 
                             'revenue_365', 'revenue_pp_30', 'revenue_pp_60', 'revenue_pp_90', 'revenue_pp_365', 
                             'rpp_90_quartile', 'rpp_90_binary'])

reviews_ml_365

Unnamed: 0,availability_365,review_scores_rating,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,num_of_revs_cat_1-25,num_of_revs_cat_101-150,num_of_revs_cat_151-200,num_of_revs_cat_201-300,num_of_revs_cat_26-50,num_of_revs_cat_300+,num_of_revs_cat_51-100,price_binary
0,178,4.93,4.93,4.64,4.79,5.00,5.00,4.71,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,0,4.81,4.82,4.71,4.99,4.97,4.77,4.80,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
2,123,4.77,4.76,4.56,4.91,4.91,4.81,4.76,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
3,205,4.50,4.54,4.21,4.54,4.54,4.83,4.42,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,261,4.72,4.67,4.56,4.92,4.83,4.79,4.58,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10848,63,5.00,5.00,5.00,5.00,5.00,4.00,4.50,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10849,136,5.00,5.00,5.00,5.00,5.00,5.00,5.00,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10850,84,5.00,5.00,5.00,5.00,5.00,5.00,5.00,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10851,78,5.00,5.00,5.00,5.00,5.00,5.00,5.00,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [68]:
# Preview columns.
reviews_ml_365.columns

Index(['availability_365', 'review_scores_rating', 'review_scores_accuracy',
       'review_scores_cleanliness', 'review_scores_checkin',
       'review_scores_communication', 'review_scores_location',
       'review_scores_value', 'num_of_revs_cat_1-25',
       'num_of_revs_cat_101-150', 'num_of_revs_cat_151-200',
       'num_of_revs_cat_201-300', 'num_of_revs_cat_26-50',
       'num_of_revs_cat_300+', 'num_of_revs_cat_51-100', 'price_binary'],
      dtype='object')

In [69]:
# Define features and target sets.
X = reviews_ml_365.drop(columns=['availability_365'])

y = reviews_ml_365['availability_365'].ravel()

In [70]:
# Splitting into Train and Test sets.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [71]:
# Creating StandardScaler instance.
scaler = StandardScaler()

# Fitting StandardScaler.
X_scaler = scaler.fit(X_train)

# Scaling data.
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [72]:
# Create a random forest regressor classifier.
regressor = RandomForestRegressor(n_estimators = 500, random_state = 0)

# Fitting the model.
regressor.fit(X_train, y_train)

# Predict the model.
prediction = regressor.predict(X_test)

In [73]:
# Print the metrics.
print("MSE:", mean_squared_error(y_test, prediction))

print("R2:", r2_score(y_test, prediction))

MSE: 15055.84368413138
R2: 0.013224708688496678


In [74]:
# Calculate feature importance.
importances = regressor.feature_importances_

# Sort the features by their importance.
sorted(zip(regressor.feature_importances_, X.columns), reverse=True)

[(0.1694447883175201, 'review_scores_value'),
 (0.14186947571387523, 'review_scores_cleanliness'),
 (0.13288359311174025, 'review_scores_location'),
 (0.12400173542201208, 'review_scores_rating'),
 (0.11937590521389586, 'review_scores_accuracy'),
 (0.10900485397666215, 'review_scores_communication'),
 (0.10712566040841599, 'review_scores_checkin'),
 (0.023854490263051118, 'price_binary'),
 (0.01478972573724084, 'num_of_revs_cat_26-50'),
 (0.012237553754715089, 'num_of_revs_cat_1-25'),
 (0.011924134744532888, 'num_of_revs_cat_51-100'),
 (0.009867801779396335, 'num_of_revs_cat_101-150'),
 (0.008913808864828912, 'num_of_revs_cat_201-300'),
 (0.00786492877398604, 'num_of_revs_cat_151-200'),
 (0.006841543918127206, 'num_of_revs_cat_300+')]

# With Price

In [75]:
# Take out unnecessary columns for machine learning with 365 day availability.
reviews_ml_365_price = reviews_ml.drop(columns=['number_of_reviews', 'availability_30', 'availability_365', 
                             'availability_90', 'revenue_30', 'revenue_60', 'revenue_90', 
                             'revenue_365', 'revenue_pp_30', 'revenue_pp_60', 'revenue_pp_90', 'revenue_pp_365', 
                             'rpp_90_quartile', 'rpp_90_binary'])

reviews_ml_365_price

Unnamed: 0,accommodates,price,availability_60,review_scores_rating,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,num_of_revs_cat_1-25,num_of_revs_cat_101-150,num_of_revs_cat_151-200,num_of_revs_cat_201-300,num_of_revs_cat_26-50,num_of_revs_cat_300+,num_of_revs_cat_51-100,price_binary
0,4,725.0,58,4.93,4.93,4.64,4.79,5.00,5.00,4.71,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,6,349.0,0,4.81,4.82,4.71,4.99,4.97,4.77,4.80,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
2,6,225.0,0,4.77,4.76,4.56,4.91,4.91,4.81,4.76,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
3,14,794.0,14,4.50,4.54,4.21,4.54,4.54,4.83,4.42,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,12,1250.0,31,4.72,4.67,4.56,4.92,4.83,4.79,4.58,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10848,4,152.0,26,5.00,5.00,5.00,5.00,5.00,4.00,4.50,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10849,6,144.0,45,5.00,5.00,5.00,5.00,5.00,5.00,5.00,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10850,2,157.0,55,5.00,5.00,5.00,5.00,5.00,5.00,5.00,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10851,2,171.0,49,5.00,5.00,5.00,5.00,5.00,5.00,5.00,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [76]:
# Preview columns.
reviews_ml_365_price.columns

Index(['accommodates', 'price', 'availability_60', 'review_scores_rating',
       'review_scores_accuracy', 'review_scores_cleanliness',
       'review_scores_checkin', 'review_scores_communication',
       'review_scores_location', 'review_scores_value', 'num_of_revs_cat_1-25',
       'num_of_revs_cat_101-150', 'num_of_revs_cat_151-200',
       'num_of_revs_cat_201-300', 'num_of_revs_cat_26-50',
       'num_of_revs_cat_300+', 'num_of_revs_cat_51-100', 'price_binary'],
      dtype='object')

In [77]:
# Define features and target sets.
X = reviews_ml_365_price.drop(columns=['availability_60'])

y = reviews_ml_365_price['availability_60'].ravel()

In [78]:
# Splitting into Train and Test sets.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [79]:
# Creating StandardScaler instance.
scaler = StandardScaler()

# Fitting StandardScaler.
X_scaler = scaler.fit(X_train)

# Scaling data.
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [80]:
# Create a random forest regressor classifier.
regressor = RandomForestRegressor(n_estimators = 500, random_state = 0)

# Fitting the model.
regressor.fit(X_train_scaled, y_train)

# Predict the model.
prediction = regressor.predict(X_test_scaled)

In [81]:
# Print the metrics.
print("MSE:", mean_squared_error(y_test, prediction, squared=False))

print("R2:", r2_score(y_test, prediction))

MSE: 20.150246241518033
R2: 0.013197713251124843


In [82]:
# Calculate feature importance.
importances = regressor.feature_importances_

# Sort the features by their importance.
sorted(zip(regressor.feature_importances_, X.columns), reverse=True)

[(0.22867650622791258, 'price'),
 (0.12672788804120202, 'review_scores_value'),
 (0.09927940755133499, 'review_scores_cleanliness'),
 (0.08871853841154666, 'review_scores_rating'),
 (0.08871630234943353, 'review_scores_location'),
 (0.08407773910050699, 'review_scores_accuracy'),
 (0.08233470362064438, 'accommodates'),
 (0.07673965941964951, 'review_scores_checkin'),
 (0.07342873630859242, 'review_scores_communication'),
 (0.010363212239229728, 'num_of_revs_cat_26-50'),
 (0.010032947803424803, 'num_of_revs_cat_1-25'),
 (0.008351464516818943, 'num_of_revs_cat_51-100'),
 (0.006867916328729738, 'num_of_revs_cat_101-150'),
 (0.004723854822147994, 'num_of_revs_cat_151-200'),
 (0.004425406776015509, 'num_of_revs_cat_201-300'),
 (0.0032994834878403703, 'price_binary'),
 (0.0032362329949697905, 'num_of_revs_cat_300+')]