In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import cross_validate, GridSearchCV, train_test_split
from sklearn.ensemble import RandomForestRegressor

In [2]:
listing_data = pd.read_csv("./data/modelling_data")
listing_data.head()

Unnamed: 0,name,borough,neighbourhood,room_type_encoded,price,bathrooms,accommodates,bedrooms,price_range_encoded,minimum_nights,...,neighbourhood_Willowbrook,neighbourhood_Windsor Terrace,neighbourhood_Woodhaven,neighbourhood_Woodlawn,neighbourhood_Woodrow,neighbourhood_Woodside,borough_Brooklyn,borough_Manhattan,borough_Queens,borough_Staten Island
0,"Spacious Brooklyn Duplex, Patio + Garden",Brooklyn,Sunset Park,0,275,1.0,4,2.0,2,21,...,0,0,0,0,0,0,1,0,0,0
1,Cozy Clean Guest Room - Family Apt,Manhattan,Upper West Side,1,75,1.0,1,1.0,0,2,...,0,0,0,0,0,0,0,1,0,0
2,BlissArtsSpace!,Brooklyn,Bedford-Stuyvesant,1,60,1.0,2,1.0,1,30,...,0,0,0,0,0,0,1,0,0,0
3,Large Furnished Room Near B'way,Manhattan,Midtown,1,68,1.0,2,1.0,1,2,...,0,0,0,0,0,0,0,1,0,0
4,Uptown Sanctuary w/ Private Bath (Month to Month),Manhattan,East Harlem,1,65,1.0,1,1.0,1,30,...,0,0,0,0,0,0,0,1,0,0


In [3]:
X_train, X_test, y_train, y_test = train_test_split(listing_data.drop(columns='price'),
                                                    listing_data.price, test_size=0.3,
                                                    random_state=18)


names_list = ['name', 'borough','neighbourhood']
names_train = X_train[names_list]
names_test = X_test[names_list]
X_train.drop(columns=names_list, inplace=True)
X_test.drop(columns=names_list, inplace=True)
X_train.shape, X_test.shape

((26221, 232), (11238, 232))

In the previous step, the random forest regression had an RSME of 62.69. 

I will perform a gridsearchCV to tune the parameters and calculate the new performance.

In [4]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

param_grid = {
    'n_estimators': np.arange(50,110,10),
    'max_depth': np.arange(5,25,5)
}

rf = RandomForestRegressor()

grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5, n_jobs=-1)
grid_search.fit(X_train, y_train)

print("Best hyperparameters:", grid_search.best_params_)
print("Best score:", grid_search.best_score_)

Best hyperparameters: {'max_depth': 15, 'n_estimators': 80}
Best score: 0.866128929070032


In [7]:
rfr = RandomForestRegressor(n_estimators=80, max_depth=15, random_state=18)
rfr.fit(X_train, y_train)
y_pred = rfr.predict(X_test)

from sklearn.metrics import r2_score, mean_squared_error
r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
print("R-squared:", r2)
print("MSE:", mse)
rmse = np.sqrt(mse)
print("This random forest model's predictions would be off by an average of ${} from the true values".format(round(rmse,2)))

R-squared: 0.8554399030350232
MSE: 3942.739833943989
This random forest model's predictions would be off by an average of $62.79 from the true values


# Engineering and adding more features 

The following are popular landmarks from each borough of New York City: <br>

- **Manhattan**: Empire State Building (40.7485° N, 73.9857° W)
- **Brooklyn**: Brooklyn Bridge (40.7061° N, 73.9969° W)
- **Queens**: Flushing Meadows-Corona Park (40.7400° N, 73.8407° W)
- **The Bronx**: Yankee Stadium (40.8296° N, 73.9262° W)
- **Staten Island**: Staten Island Ferry (40.6437° N, 74.0719° W)

In [3]:
coordinate_data = pd.read_csv('./data/coordinate_data')
coordinate_data.head()

Unnamed: 0,id,name,longitude,latitude,price,borough
0,5136,"Spacious Brooklyn Duplex, Patio + Garden",-73.99454,40.66265,275,Brooklyn
1,5203,Cozy Clean Guest Room - Family Apt,-73.96751,40.8038,75,Manhattan
2,5121,BlissArtsSpace!,-73.95512,40.68535,60,Brooklyn
3,5178,Large Furnished Room Near B'way,-73.98317,40.76457,68,Manhattan
4,6872,Uptown Sanctuary w/ Private Bath (Month to Month),-73.94255,40.80107,65,Manhattan


In [4]:
landmarks = {
    'Manhattan': {'name': 'Empire State Building', 'lat': 40.7485, 'lon': -73.9857},
    'Brooklyn': {'name': 'Brooklyn Bridge', 'lat': 40.7061, 'lon': -73.9969},
    'Queens': {'name': 'Flushing Meadows - Corona Park', 'lat': 40.7400, 'lon': -73.8407},
    'Bronx': {'name': 'Yankee Stadium', 'lat': 40.8296, 'lon': -73.9262},
    'Staten Island': {'name': 'Staten Island Ferry', 'lat': 40.6437, 'lon': -74.0719}
}

In [5]:
import math

# defining a function to calculate the distance between a listing and its respective landmark

def haversine_distance(row):
    # getting the name and coordinate of landmark for the borough of the listing
    borough = row['borough']
    landmark_name = landmarks[borough]['name']
    landmark_lat = landmarks[borough]['lat']
    landmark_lon = landmarks[borough]['lon']

    # getting the lat and lon of the listing
    lat = row['latitude']
    lon = row['longitude']
    
    # converting the lat and lon to radians
    lat, lon, landmark_lat, landmark_lon = map(math.radians, [lat, lon, landmark_lat, landmark_lon])

    # Haversine formula
    dlon = landmark_lon - lon
    dlat = landmark_lat - lat
    a = math.sin(dlat / 2) ** 2 + math.cos(lat) * math.cos(landmark_lat) * math.sin(dlon / 2) ** 2
    c = 2 *math.asin(math.sqrt(a))
    r = 6371 # earth radius
    distance = c * r
    return distance, landmark_name

In [6]:
# Calculate the distance between each listing and its respective landmark
coordinate_data[['landmark_distance_km', 'landmark_name']] = coordinate_data.apply(haversine_distance, axis=1, result_type='expand')

In [8]:
coordinate_data.head()

Unnamed: 0,id,name,longitude,latitude,price,borough,landmark_distance_km,landmark_name
0,5136,"Spacious Brooklyn Duplex, Patio + Garden",-73.99454,40.66265,275,Brooklyn,4.835516,Brooklyn Bridge
1,5203,Cozy Clean Guest Room - Family Apt,-73.96751,40.8038,75,Manhattan,6.336971,Empire State Building
2,5121,BlissArtsSpace!,-73.95512,40.68535,60,Brooklyn,4.210733,Brooklyn Bridge
3,5178,Large Furnished Room Near B'way,-73.98317,40.76457,68,Manhattan,1.799564,Empire State Building
4,6872,Uptown Sanctuary w/ Private Bath (Month to Month),-73.94255,40.80107,65,Manhattan,6.882754,Empire State Building


In [7]:
df = pd.merge(listing_data, coordinate_data['landmark_distance_km'], left_index=True, right_index=True, how='left')

In [8]:
df.drop(["name", "borough", "neighbourhood"], axis=1, inplace=True)

In [11]:
df.head()

Unnamed: 0,room_type_encoded,price,bathrooms,accommodates,bedrooms,price_range_encoded,minimum_nights,availability_365,neighbourhood_Arden Heights,neighbourhood_Arrochar,...,neighbourhood_Windsor Terrace,neighbourhood_Woodhaven,neighbourhood_Woodlawn,neighbourhood_Woodrow,neighbourhood_Woodside,borough_Brooklyn,borough_Manhattan,borough_Queens,borough_Staten Island,landmark_distance_km
0,0,275,1.0,4,2.0,2,21,267,0,0,...,0,0,0,0,0,1,0,0,0,4.835516
1,1,75,1.0,1,1.0,0,2,0,0,0,...,0,0,0,0,0,0,1,0,0,6.336971
2,1,60,1.0,2,1.0,1,30,322,0,0,...,0,0,0,0,0,1,0,0,0,4.210733
3,1,68,1.0,2,1.0,1,2,79,0,0,...,0,0,0,0,0,0,1,0,0,1.799564
4,1,65,1.0,1,1.0,1,30,300,0,0,...,0,0,0,0,0,0,1,0,0,6.882754


In [9]:
X = df.drop(['price'], axis=1)
y = df['price']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=18)

In [10]:
from sklearn.ensemble import RandomForestRegressor
rfr = RandomForestRegressor(n_estimators=80, max_depth=15, random_state=18)
rfr.fit(X_train, y_train)
y_pred = rfr.predict(X_test)

In [None]:
from sklearn.metrics import r2_score, mean_squared_error
r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
print("R-squared:", r2)
print("MSE:", mse)
rmse = np.sqrt(mse)
print("This random forest model's predictions would be off by an average of ${} from the true values".format(round(rmse,2)))

That is an improvement from the original model. 

Finally, I'll try the popular gradient boosting model with XGBoost to see if it improves upon the RandomForest.

In [19]:
import xgboost as xgb

# Define the parameter grid to search over
param_grid = {
    'max_depth': [3,5,7],
    'learning_rate': [0.01, 0.05, 0.1,0.3],
    'n_estimators': [50,100,200]
}

# Create an XGBoost regressor object
xgb_reg = xgb.XGBRegressor()

# Use GridSearchCV to search over the parameter grid
grid_search = GridSearchCV(
    estimator=xgb_reg, 
    param_grid=param_grid, 
    cv=5, 
    scoring='neg_mean_squared_error',
    n_jobs=-1
)

# Fit the grid search object to the training data
grid_search.fit(X_train, y_train)

# Get the best hyperparameters and corresponding model
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

# Evaluate the model on the test data
y_pred = best_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
print(f"Root mean squared error: {rmse}")

Root mean squared error: 61.44057628304587


In [20]:
print(best_params)

{'learning_rate': 0.1, 'max_depth': 7, 'n_estimators': 100}


So the performance slightly dropped with the XGB model. The best model is the RFR with the added landmark proximity feature.

## ROC Analysis

Assuming the company's current strategy is using the mean.

In [11]:
test_results = X_test.copy()
test_results['predicted_price'] = rfr.predict(X_test)

In [12]:
from sklearn.dummy import DummyRegressor

dummy = DummyRegressor(strategy='mean')
dummy.fit(X_train, y_train)


DummyRegressor()

In [21]:
print(dummy.predict(X_test))

[177.9889783 177.9889783 177.9889783 ... 177.9889783 177.9889783
 177.9889783]


### NEXT STEPS

#### 1. Additional feature engineering:
  - Using `amenities` as an additional feature.
  - Using another encooding method for `room_type`
  - Using different landmarks - possibly extracted from the `name` feature if available


#### 2. Apply models on other cities 

