In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from datetime import datetime
from sklearn.model_selection import KFold
from sklearn.preprocessing import OneHotEncoder
from sklearn.decomposition import PCA

In [2]:
# loading dataset
df_dev = pd.read_csv("../datasets/NYC_Airbnb/development.csv")
# df_eval = pd.read_csv("../datasets/NYC_Airbnb/evaluation.csv")
df_dev.drop(columns=["name", "host_name", "host_id"], inplace=True)

In [3]:
# fixing NaN values
df_dev.fillna(
    value = {
        "last_review": "2008-08-11", # launch date of AirBnB
        "reviews_per_month": 0
    },
    inplace=True
)

In [4]:
# converting last_review feature to a numerical value representing days from the newest review
last_reviews = pd.to_datetime(df_dev["last_review"])
newest_review = datetime.today()
df_dev["last_review"] = last_reviews.apply(lambda x: abs((newest_review - x).days))

In [5]:
# one-hot encoding of categorical features
df_dev = pd.get_dummies(df_dev, columns=["room_type", "neighbourhood_group", "neighbourhood"], prefix=["room_type=", "neig_group=", "neig="])
#enc = OneHotEncoder(sparse=False)
#enc.fit(df_dev["room_type"].values.reshape(-1, 1))
#room_type_one_hot = enc.transform(df_dev["room_type"].values.reshape(-1, 1))
#enc.fit(df_dev["neighbourhood_group"].values.reshape(-1, 1))
#neig_group_one_hot = enc.transform(df_dev["neighbourhood_group"].values.reshape(-1, 1))
#enc.fit(df_dev["neighbourhood"].values.reshape(-1, 1))
#neig_one_hot = enc.transform(df_dev["neighbourhood"].values.reshape(-1, 1))

#df_dev.drop(columns=["room_type", "neighbourhood_group", "neighbourhood"], inplace=True)

In [6]:
# getting dataset as numpy arrays
X_dev = df_dev.drop(columns=["id", "price"], inplace=False).values
y_dev = df_dev["price"].values

In [8]:
K = 5
kf = KFold(n_splits=K)
scores = np.zeros((K))
n = 0
for train_indices, validation_indices in kf.split(X_dev):
    X_train = X_dev[train_indices]
    X_valid = X_dev[validation_indices]
    y_train = y_dev[train_indices]
    y_valid = y_dev[validation_indices]
    model = RandomForestRegressor(n_estimators=500, max_features="sqrt", max_depth=30, n_jobs=16)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_valid)
    scores[n] = r2_score(y_valid, y_pred)
    n += 1
print("Scores: ", scores)
print("Mean: ", scores.mean())

Scores:  [0.25156191 0.14560448 0.12692966 0.15869533 0.12409321]
Mean:  0.16137691842159432


In [8]:
# testing random forest
X_train, X_test, y_train, y_test = train_test_split(X_dev, y_dev, test_size=0.2)
model = RandomForestRegressor(n_estimators=500, max_features="sqrt", max_depth=30, n_jobs=16)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
score = r2_score(y_test, y_pred)
print(score)

0.20500422389166773


In [None]:
# JUNK CODE DO NOT EXECUTE
X_train, X_test, y_train, y_test = train_test_split(X_dev, y_dev, test_size=0.2)

# separating categorical features from numerical ones
X_train_categorical = X_train[:, 8:]
X_test_categorical = X_test[:, 8:]
X_train_numerical = X_train[:, :8]
X_test_numerical = X_test[:, :8]

# feature normalization
scaler = StandardScaler()
scaler.fit(X_train_numerical)
X_train_numerical = scaler.transform(X_train_numerical)
X_test_numerical = scaler.transform(X_test_numerical)

# applying PCA
#pca = PCA(n_components=(X_train_numerical.shape[1] - FEATURES_TO_REMOVE), svd_solver="auto")
#pca.fit(X_train_numerical)
#X_train_numerical = pca.transform(X_train_numerical)
#X_test_numerical = pca.transform(X_test_numerical)

# recomposing X_train and X_test
X_train = np.hstack((X_train_numerical, X_train_categorical,))
X_test = np.hstack((X_test_numerical, X_test_categorical,))