In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_absolute_error 
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import RandomizedSearchCV, train_test_split, GridSearchCV
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
brooklyn = pd.read_csv('brooklyn.csv')
queens = pd.read_csv('queens.csv')
manhattan = pd.read_csv('manhattan.csv')


In [3]:
X = brooklyn[['bedrooms', 'bathrooms', 'size_sqft', 'min_to_subway', 'floor', 'building_age_yrs', 'no_fee', 'has_roofdeck', 'has_washer_dryer', 'has_doorman', 'has_elevator', 'has_dishwasher', 'has_patio', 'has_gym']]
y = brooklyn[['rent']]

In [4]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size = 0.8, test_size = 0.2, random_state = 1)

In [5]:
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(810, 14)
(203, 14)
(810, 1)
(203, 1)


In [6]:
print(len(y))
print(len(X))

1013
1013


In [7]:
mlr = LinearRegression()
parameters = {'fit_intercept':(True, False), 'normalize': (True, False), 'n_jobs': (1, -1)}
mlr_grid_cv = GridSearchCV(param_grid = parameters, estimator = mlr, scoring = 'neg_mean_squared_error')
mlr_grid_cv.fit(X_train, y_train)
mlr_grid_cv.best_score_
mlr_best = mlr_grid_cv.best_estimator_

In [8]:
from sklearn.naive_bayes import MultinomialNB
mnnb = MultinomialNB()
mnnb.fit(X_train, y_train)
mnnb_score = mnnb.score(X_train, y_train)

  y = column_or_1d(y, warn=True)


In [9]:
from sklearn.svm import SVR
import warnings; warnings.simplefilter('ignore')
svr = SVR()
parameters = {'kernel': ('linear', 'rbf'), 'gamma': ('scale', 'auto'),'max_iter': (1, -1) }
svr_grid_cv = GridSearchCV(estimator = svr,param_grid = parameters, scoring = 'neg_mean_squared_error', return_train_score = True)
svr_grid_cv.fit(X_train, y_train)
svr_grid_cv.best_score_


-1114736.4531109778

In [10]:
svr_best = svr_grid_cv.best_estimator_
print(svr_best)

SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='scale',
    kernel='linear', max_iter=-1, shrinking=True, tol=0.001, verbose=False)


In [11]:
svr_best.fit(X_train, y_train)

SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='scale',
    kernel='linear', max_iter=-1, shrinking=True, tol=0.001, verbose=False)

In [12]:
from sklearn.linear_model import Lasso
from sklearn.metrics import mean_squared_error
import warnings; warnings.simplefilter('ignore')
lasso = Lasso(random_state = 1)
parameters = parameters={'alpha':[1e-15,1e-10,1e-8,1e-3,1e-2,1,5,10,20,30,35,40,45,50,55,100], 'fit_intercept': (True, False), 'normalize': (True, False), 'selection':('cyclic', 'random')}
lasso_regressor = GridSearchCV(lasso, parameters, scoring = 'neg_mean_squared_error', cv = 5)
lasso_regressor.fit(X_train, y_train)
lasso_regressor.best_score_


-1017393.1128575733

In [13]:
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier()
knn.fit(X_train, y_train)
parameters = {'n_neighbors': [1, 2, 3, 4, 5], 'weights': ('uniform', 'distance'), 'algorithm': ('auto', 'ball_tree', 'kd_tree', 'brute'), 'leaf_size': range(20, 40)}
knn_gs = GridSearchCV(estimator = knn, param_grid = parameters, scoring = 'neg_mean_squared_error', return_train_score = True)
knn_gs.fit(X_train, y_train)


GridSearchCV(cv=None, error_score=nan,
             estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30,
                                            metric='minkowski',
                                            metric_params=None, n_jobs=None,
                                            n_neighbors=5, p=2,
                                            weights='uniform'),
             iid='deprecated', n_jobs=None,
             param_grid={'algorithm': ('auto', 'ball_tree', 'kd_tree', 'brute'),
                         'leaf_size': range(20, 40),
                         'n_neighbors': [1, 2, 3, 4, 5],
                         'weights': ('uniform', 'distance')},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
             scoring='neg_mean_squared_error', verbose=0)

In [14]:
knn_best = knn_gs.best_estimator_
print(knn_best)

KNeighborsClassifier(algorithm='ball_tree', leaf_size=20, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=1, p=2,
                     weights='uniform')


In [15]:
knn_best_score = knn_gs.best_score_
print(knn_best_score)

-2004181.6358024694


In [16]:
knn_pred = knn_best.predict(X_train)

In [17]:
lasso_best = lasso_regressor.best_estimator_
print(lasso_best)

Lasso(alpha=10, copy_X=True, fit_intercept=False, max_iter=1000, normalize=True,
      positive=False, precompute=False, random_state=1, selection='random',
      tol=0.0001, warm_start=False)


In [18]:
from sklearn.ensemble import RandomForestRegressor
rf = RandomForestRegressor(random_state = 1 )

n_estimators = [int(x) for x in np.linspace(start = 100, stop = 1000, num = 10)]
max_features = ['auto', 'sqrt']
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
min_samples_split = [2, 5, 10]
min_samples_leaf = [1, 2, 4]
bootstrap = [True, False]
params = {'n_estimators':n_estimators, 'max_features': max_features, 'max_depth': max_depth, 'min_samples_split': min_samples_split, 'min_samples_leaf': min_samples_leaf, 'bootstrap': bootstrap}
rf_regressor = RandomizedSearchCV(estimator = rf, param_distributions = params, cv = 5, n_iter = 50, scoring = 'neg_mean_squared_error', n_jobs = 4, verbose = 5, return_train_score = True, random_state = 1)
rf_regressor.fit(X_train, y_train)


Fitting 5 folds for each of 50 candidates, totalling 250 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  10 tasks      | elapsed:    5.1s
[Parallel(n_jobs=4)]: Done  64 tasks      | elapsed:   19.3s
[Parallel(n_jobs=4)]: Done 154 tasks      | elapsed:   41.4s
[Parallel(n_jobs=4)]: Done 250 out of 250 | elapsed:  1.1min finished


RandomizedSearchCV(cv=5, error_score=nan,
                   estimator=RandomForestRegressor(bootstrap=True,
                                                   ccp_alpha=0.0,
                                                   criterion='mse',
                                                   max_depth=None,
                                                   max_features='auto',
                                                   max_leaf_nodes=None,
                                                   max_samples=None,
                                                   min_impurity_decrease=0.0,
                                                   min_impurity_split=None,
                                                   min_samples_leaf=1,
                                                   min_samples_split=2,
                                                   min_weight_fraction_leaf=0.0,
                                                   n_estimators=100,
                              

In [19]:
rf_regressor.best_score_

-705650.2835314281

In [20]:
rf_best = rf_regressor.best_estimator_

In [21]:
print(rf_best)

RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=110, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=800, n_jobs=None, oob_score=False,
                      random_state=1, verbose=0, warm_start=False)


In [22]:
rf_best.fit(X_train, y_train)

RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=110, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=800, n_jobs=None, oob_score=False,
                      random_state=1, verbose=0, warm_start=False)

In [23]:
rf_pred_on_train = rf_best.predict(X_train)
rf_pred_on_test = rf_best.predict(X_test)

In [24]:
len_pred_train = len(rf_pred_on_train)
len_pred_test = len(rf_pred_on_test)
print(len_pred_train)
print(len_pred_test)

810
203


In [25]:
from sklearn import tree
dcr = tree.DecisionTreeRegressor(random_state = 1)
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
min_samples_split = [2 ,3 , 4]
min_samples_leaf = [1 , 2 , 3]

params = {'max_depth': max_depth, 'min_samples_split': min_samples_split, 'min_samples_leaf': min_samples_leaf}
dcr_rs = RandomizedSearchCV(estimator = dcr, param_distributions = params, n_iter = 15, scoring = 'neg_mean_squared_error', cv =5 )
dcr_rs.fit(X_train,y_train)


RandomizedSearchCV(cv=5, error_score=nan,
                   estimator=DecisionTreeRegressor(ccp_alpha=0.0,
                                                   criterion='mse',
                                                   max_depth=None,
                                                   max_features=None,
                                                   max_leaf_nodes=None,
                                                   min_impurity_decrease=0.0,
                                                   min_impurity_split=None,
                                                   min_samples_leaf=1,
                                                   min_samples_split=2,
                                                   min_weight_fraction_leaf=0.0,
                                                   presort='deprecated',
                                                   random_state=1,
                                                   splitter='best'),
                   iid='depr

In [27]:
dcr_rs_best = dcr_rs.best_estimator_
print(dcr_rs_best)

DecisionTreeRegressor(ccp_alpha=0.0, criterion='mse', max_depth=10,
                      max_features=None, max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=2, min_samples_split=3,
                      min_weight_fraction_leaf=0.0, presort='deprecated',
                      random_state=1, splitter='best')


In [29]:
dcr_rs_best_score = dcr_rs.best_score_
print(dcr_rs_best_score)

-1085079.1423293457


In [30]:
print('Random Forest Regressor train score: ', + rf_best.score(X_train, y_train))

Random Forest Regressor train score:  0.9592342548915662


In [31]:
print('Support Vector Machine train score: ', + svr_best.score(X_train, y_train) )

Support Vector Machine train score:  0.5267951240194464


In [32]:
print('Lasso Regressor train score: ', + lasso_best.score(X_train, y_train))

Lasso Regressor train score:  0.5826669887322684


In [33]:
print('Multiple Linear Regression train score: ', + mlr_best.score(X_train, y_train))

Multiple Linear Regression train score:  0.5847367135112033


In [34]:
print('Multinomial Naive Bayes train score: ', + mnnb_score)

Multinomial Naive Bayes train score:  0.07160493827160494


In [35]:
print('K Nearest Neighbors train score: ', + knn_best.score(X_train, y_train))

K Nearest Neighbors train score:  0.9864197530864197


In [36]:
print('Decision Tree regressor best score: ', + dcr_rs_best.score(X_train, y_train))

Decision Tree regressor best score:  0.9158499142929163
