In [3]:
import pandas as pd
import numpy as np
import category_encoders as ce

from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network import MLPRegressor

from sklearn.model_selection import KFold
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error as MSE

# Preparing data: load the two listings CSV files and stack them into one frame.
# DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0;
# pd.concat is the supported equivalent and accepts the same
# ignore_index / sort arguments.
df = pd.read_csv("listings-full-08.csv")
df_sept = pd.read_csv("listings-full-09.csv")
df = pd.concat([df, df_sept], ignore_index=True, sort=False)

# Columns excluded from the feature set.
df = df.drop(columns=["neighbourhood_cleansed",
                      "first_review_days_ago",
                      "last_review_days_ago",
                      "review_scores_accuracy",
                      "review_scores_cleanliness",
                      "review_scores_checkin",
                      "review_scores_communication",
                      "review_scores_location",
                      "review_scores_value"])

# info() writes its report to stdout itself and returns None, so wrapping it
# in print() only appended a stray "None" line to the cell output.
df.info()

# Dropping rows with missing values.
df = df.dropna(axis=0)

# Making dummy variables using the "one hot" method; use_cat_names=True names
# each dummy column "<column>_<category>".
onehot_encoder = ce.one_hot.OneHotEncoder(use_cat_names=True)
df = onehot_encoder.fit_transform(df)

# Dropping one dummy from each group to avoid the Dummy Variable Trap
# (perfect collinearity in the linear regressor).
# NOTE(review): these names are hard-coded; drop() raises KeyError if one of
# these categories has no rows left after dropna - confirm against the data.
df = df.drop(columns=["neighbourhood_group_cleansed_Staten Island",
                      "property_type_Treehouse",
                      "room_type_Hotel room",
                      "bed_type_Airbed",
                      "cancellation_policy_strict_14_with_grace_period"])

# Separating the independent variables from the dependent variable.
x = df.drop(columns=["price"])
y = df["price"]

# Score logs, one trio per model, filled in by the cross-validation loop.
#   rmse - root mean square error
#   r2   - r squared
#   mape - mean absolute percentage error
knn_rmse, knn_r2, knn_mape = [], [], []
ols_rmse, ols_r2, ols_mape = [], [], []
rf_rmse, rf_r2, rf_mape = [], [], []
mlp_rmse, mlp_r2, mlp_mape = [], [], []

# Ten-fold cross-validation of four regressors: k-nearest-neighbours, ordinary
# least squares, random forest and a multi-layer perceptron.
# One seed drives the fold shuffle AND the two stochastic learners, so a fresh
# kernel reproduces the same scores (previously only the split was seeded).
SEED = 89423


def _record_fold_metrics(y_true, y_pred, rmse_log, r2_log, mape_log):
    """Score one model on one fold and append the results to its logs.

    Parameters
    ----------
    y_true : pandas.Series
        Observed target values of the held-out fold.
    y_pred : numpy.ndarray
        Predictions for the same rows, in the same order.
    rmse_log, r2_log, mape_log : list
        Lists that receive this fold's RMSE, r squared and mean absolute
        percentage error (as a fraction, not multiplied by 100).
    """
    rmse_log.append(np.sqrt(MSE(y_true, y_pred)))
    r2_log.append(r2_score(y_true, y_pred))
    # A true value of zero makes the relative error +/-inf (NaN for 0/0);
    # those rows are left out of the average.
    abs_pct_err = np.absolute((y_pred - y_true) / y_true)
    abs_pct_err = abs_pct_err.replace([np.inf, -np.inf], np.nan).dropna()
    mape_log.append(abs_pct_err.mean())


#11 seems to be a good value of k
k = 11

kf = KFold(n_splits=10, shuffle=True, random_state=SEED)
for train_index, test_index in kf.split(x):
    x_train, x_test = x.iloc[train_index], x.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    print("fold")

    # NOTE(review): features are fed in unscaled; k-NN and the MLP are usually
    # sensitive to feature scale - consider standardising inside each fold.

    # k-nearest-neighbor regressor
    knn = KNeighborsRegressor(n_neighbors=k)
    knn.fit(x_train, y_train)
    y_predicted_knn = knn.predict(x_test)
    _record_fold_metrics(y_test, y_predicted_knn, knn_rmse, knn_r2, knn_mape)

    # linear regressor
    ols = LinearRegression().fit(x_train, y_train)
    y_predicted_ols = ols.predict(x_test)
    _record_fold_metrics(y_test, y_predicted_ols, ols_rmse, ols_r2, ols_mape)

    # random forest regressor (seeded: bootstrap and feature sampling are random)
    rf = RandomForestRegressor(n_estimators=21, random_state=SEED)
    rf.fit(x_train, y_train)
    y_predicted_rf = rf.predict(x_test)
    _record_fold_metrics(y_test, y_predicted_rf, rf_rmse, rf_r2, rf_mape)

    # multi-layer perceptron (seeded: weight init and batch shuffling are random)
    mlp = MLPRegressor(random_state=SEED)
    mlp.fit(x_train, y_train)
    y_predicted_mlp = mlp.predict(x_test)
    _record_fold_metrics(y_test, y_predicted_mlp, mlp_rmse, mlp_r2, mlp_mape)

# Report every model's metrics averaged over the folds.
# The MLP RMSE line used to be labelled "mlf", which mislabelled the model;
# all four models now share one print template so the labels cannot drift.
print("With ten-fold Cross-Validation:")
for label, rmse_log, r2_log, mape_log in (
    ("11nn", knn_rmse, knn_r2, knn_mape),
    ("ols", ols_rmse, ols_r2, ols_mape),
    ("rf", rf_rmse, rf_r2, rf_mape),
    ("mlp", mlp_rmse, mlp_r2, mlp_mape),
):
    print(label + " RMSE=" + str(np.mean(rmse_log)))
    print(label + " r^2 score=" + str(np.mean(r2_log)))
    # MAPE is stored as a fraction per fold; scale to percent for display.
    print(label + " mean absolute precentage error=" + str(np.mean(mape_log) * 100) + "%")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 97272 entries, 0 to 97271
Data columns (total 24 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   price                         97272 non-null  int64  
 1   host_response_time            64495 non-null  object 
 2   host_response_rate            64495 non-null  float64
 3   host_is_superhost             97230 non-null  float64
 4   neighbourhood_group_cleansed  97272 non-null  object 
 5   latitude                      97272 non-null  float64
 6   longitude                     97272 non-null  float64
 7   is_location_exact             97272 non-null  int64  
 8   property_type                 97272 non-null  object 
 9   room_type                     97272 non-null  object 
 10  accommodates                  97272 non-null  int64  
 11  bathrooms                     97168 non-null  float64
 12  bedrooms                      97209 non-null  float64
 13  b