# New Baseline Regression Models with Transformed Data

##### Modeling Step 2

### Notebook Summary:

#### Objective: create a new baseline model to predict AirBnB listing prices with the newly created features

* We now operate on 138 features resulting from data extraction from existing features and additional ETL efforts with files originating from the County of San Diego website 
* We create a new Linear Regression model and this time obtain an accuracy (R²) of approximately 80% on the test set. Training and validation accuracies are slightly higher
* We then compare results with Decision Tree, Support Vector, and K-Neighbors regressions. For each, we tune parameters as necessary to minimize validation RMSE. Model results are generally good, but not as good as the Linear Regression results
* We test the same model on both normalized and scaled versions of the same data

#### Conclusions: 
* This model seems to offer a satisfactory bias/variance trade off (See plots)
* Geographic features (distance to ocean, number of recreation structures in the vicinity, parks, etc.) provide significant improvement to accuracy. Other features related to the amenities offered, host attributes and listing description text topics also play a significant role
* As our new baseline model, we pick Linear Regression

#### Next Steps: 
###### In the poly_regressions notebook we try additional techniques to further improve model accuracy, including adding approximately 200 new features created from ratios of certain existing features, interaction and polynomial transformations, and feature selection

In [1]:
import pandas as pd
import numpy as np
import seaborn as sb
import matplotlib.pyplot as plt
from pandas.plotting import scatter_matrix
import matplotlib.cm as cm
%matplotlib inline

In [2]:
from sklearn import linear_model
from sklearn.metrics import r2_score, mean_squared_error,mean_absolute_error
from sklearn.model_selection import KFold,cross_val_predict,cross_val_score, cross_validate, train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder, LabelBinarizer, PolynomialFeatures, MinMaxScaler, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn import preprocessing
from sklearn.neighbors import LocalOutlierFactor, KNeighborsRegressor
from sklearn.feature_selection import RFE, f_regression, RFECV
from sklearn.ensemble import RandomForestRegressor, BaggingRegressor, AdaBoostRegressor, GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR

In [3]:
import sys
sys.path.append('./../lib')
from airbnb_modeling import detect_feature_importance, scale_data, normalize_data, eval_metrics, plot_residuals, plot_predictions, rank_to_dict
from parse_methods import parse_columns
from airbnb_modeling import detect_interactions, add_interactions, map_variable, plot_rmse_instances,plot_rmse_features, plot_accuracy_instances
from model_visualizations import plot_learning_curve

  from pandas.core import datetools


NameError: name 'np' is not defined

In [None]:
# Load the augmented listings export, discard the two rows at positions
# 4323-4324 (NOTE(review): reason not recorded here -- confirm), and renumber
# the index 0..n-1 so that later index-based merges line up.
listings = pd.read_csv(
    '../Datasources/listings_augmented/listings_augmented_2018-05-31_V3.csv',
    low_memory=False,
)
listings = listings.drop(listings.index[4323:4325])
listings.index = range(len(listings))

In [None]:
#Restore `excluded` (the variables excluded in the featuresExploration notebook)
#from IPython's %store database; that notebook must have run `%store excluded` first
%store -r excluded

In [None]:
# Keep every listings column that is not in the restored `excluded` collection.
cols = [name for name in listings.columns if name not in excluded]
X = listings.loc[:, cols]

In [None]:
# Number of missing values per column, shown only for columns that have any.
X.isnull().sum().loc[lambda counts: counts > 0]

In [None]:
# Impute missing values with each column's mean. Columns for which X.mean()
# yields no value (e.g. non-numeric ones) are left as they are.
X = X.fillna(X.mean())

In [None]:
# Run the project helper parse_columns (from ../lib/parse_methods) over the
# listed amenity / verification / description-topic columns.
# NOTE(review): the helper's exact transformation is not visible in this
# notebook -- check parse_methods.py before relying on the resulting dtypes.
X = parse_columns(X, ['has_Pets_Allowed','has_Wheelchair_Accessible','has_First_Aid_Kit',
'has_Cat(s)','has_24-Hour_Check-in','uses_jumio','description-Topic0','description-Topic1',
'description-Topic4','description-Topic5','description-Topic6','description-Topic10',
'description-Topic11','description-Topic12','description-Topic13','description-Topic15',
'description-Topic17','description-Topic18','description-Dominant_Topic'])

In [None]:
# Drop every column whose name matches 'enc', then split what is left into
# the 'bin' columns (kept out of normalization) and all remaining columns.
enc_cols = X.filter(regex='enc').columns
X = X[[name for name in X.columns if name not in enc_cols]]
donotscale = X.filter(regex='bin').columns
cols = [name for name in X.columns if name not in donotscale]

In [None]:
# Take these three columns out of the list that gets normalized. Because they
# are in neither `cols` nor `donotscale`, they do not reach X_normed at all.
# The list is rebuilt instead of calling cols.remove(), so re-running this cell
# is harmless (list.remove raises ValueError on the second run).
not_normalized = ('space', 'amenity_level', 'hol_skew_of_price')
cols = [name for name in cols if name not in not_normalized]

In [None]:
# Normalize the non-'bin' columns and wrap the result in a DataFrame, which
# receives a fresh default 0..n-1 index.
# Caution: sklearn's preprocessing.normalize rescales each ROW (sample) to unit
# L2 norm by default; it does not scale columns.
# NOTE(review): `cols` appears to still contain 'price_y' here (it is read back
# from X_normed later to build the target), so the target is rescaled together
# with the features of the same row -- confirm this is intended.
helper_df = pd.DataFrame(preprocessing.normalize(X[cols]), columns = cols)

In [None]:
# Re-attach the untouched 'bin' columns to the normalized ones, matching rows
# on the index (inner join).
X_normed = pd.merge(helper_df, X[donotscale], left_index=True, right_index=True)

Now adding new features by taking ratios between features 

In [None]:
# Columns that take part in the pairwise ratio features: everything whose name
# matches one of the patterns below, followed by a few hand-picked columns.
other = ['calculated_host_listings_count','extra_people', 'minimum_nights', 'number_of_reviews']
candidates = []
for pattern in ('event', 'park', 'ocean', 'ratio'):
    candidates.extend(X_normed.filter(regex=pattern).columns)
candidates.extend(other)

In [None]:
import itertools
# Every unordered pair of candidate columns, in the order combinations() yields them.
a = list(itertools.combinations(candidates, 2))

In [None]:
# Keep only the pairs in which neither column name contains "bin".
new = [pair for pair in a if not any("bin" in name for name in pair)]

In [None]:
# Work on a copy of X_normed and append one '<numerator>/<denominator>_ratio'
# column per selected pair. Zero denominators yield inf (or NaN for 0/0).
X_ratios = X_normed.copy()
for numerator, denominator in new:
    name = str(numerator) + '/' + str(denominator) + '_ratio'
    X_ratios[name] = X_ratios[numerator].div(X_ratios[denominator])

In [None]:
#Lots of nulls above so dropping columns that have more than 300 nulls and imputing the remaining ones
# +/-inf (division by zero) is turned into NaN FIRST, so that those invalid
# values count towards the 300-null limit. Previously the drop ran before the
# replacement, so a column consisting mostly of inf survived the drop and was
# then mean-imputed.
X_ratios = X_ratios.replace([np.inf, -np.inf], np.nan)
# thresh is the minimum number of non-null values a column needs to be kept.
X_ratios = X_ratios.dropna(axis = 1,thresh = len(X_ratios)-300)
X_ratios = X_ratios.fillna(X_ratios.mean())

In [None]:
# Target: the 'price_y' column with missing values replaced by its mean.
price = X_normed['price_y']
y_normed = price.fillna(price.mean())

# Remove every column whose name matches 'price' from both feature matrices,
# so that neither the target nor price-derived columns remain as features.
normed_price_cols = list(X_normed.filter(regex='price').columns)
X_normed = X_normed[X_normed.columns.drop(normed_price_cols)]
ratio_price_cols = list(X_ratios.filter(regex='price').columns)
X_ratios = X_ratios[X_ratios.columns.drop(ratio_price_cols)]

In [None]:
#Checking that features are clean - expected output: False (no NaN present), then True (every value finite)
print np.any(np.isnan(X_normed))
print np.all(np.isfinite(X_normed))

Step 1: Simple Model with Linear Regression

In [None]:
# Hold out 30% of the rows as the test set; random_state is fixed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X_normed,y_normed, test_size=0.3, random_state=42)

In [None]:
print 'Final Number of Features Used: ', len(X_train.columns)

In [None]:
# Ordinary least squares with an intercept. `normalize=False` was only the
# default restated, so it is omitted (newer scikit-learn releases no longer
# accept that argument).
lin_reg = linear_model.LinearRegression(fit_intercept=True)
lin_reg.fit(X_train, y_train)

In [None]:
lin_reg_rmse_test = np.sqrt(mean_squared_error(y_test, lin_reg.predict(X_test)))
print 'Test RMSE for Initial Linear Regression: ', lin_reg_rmse_test

In [None]:
# Untuned decision tree used as a non-linear point of comparison.
# random_state is fixed (same seed as the train/test split) because the tree
# builder permutes features at every split and breaks ties between equally good
# splits at random; without a seed the tree, its CV scores and its feature
# importances change from run to run. GridSearchCV clones this estimator, so
# the seed carries over to the tuned trees as well.
tree_reg = DecisionTreeRegressor(random_state=42)
tree_reg.fit(X_train,y_train)

Using Cross Validation

In [None]:
# 10-fold cross-validation of the linear model on the training set, collecting
# R2, negated MSE and negated MAE for both the train and validation folds.
scores_lin = cross_validate(
    lin_reg, X_train, y_train,
    scoring=['r2', 'neg_mean_squared_error', 'neg_mean_absolute_error'],
    cv=10,
    return_train_score=True,
)

In [None]:
# Same 10-fold evaluation for the untuned decision tree.
scores_tree = cross_validate(
    tree_reg, X_train, y_train,
    scoring=['r2', 'neg_mean_squared_error', 'neg_mean_absolute_error'],
    cv=10,
    return_train_score=True,
)

In [None]:
# Report the cross-validation results of both baseline models via the project
# helper eval_metrics (from ../lib/airbnb_modeling), which receives the
# dictionaries returned by cross_validate.
print 'Evaluation Metrics for Linear Regression with CV: '
eval_metrics(scores_lin)
print '----'
print '----'
print 'Evaluation Metrics for Tree Regression with CV: '
eval_metrics(scores_tree)

In [None]:
# Project helper from ../lib/airbnb_modeling; presumably plots RMSE against the
# number of training instances for the linear model -- confirm in the helper.
plot_rmse_instances(lin_reg, X_train, y_train)

In [None]:
# Same helper plot for the untuned decision tree (see ../lib/airbnb_modeling).
plot_rmse_instances(tree_reg, X_train, y_train)

Using Nested Cross Validation with GridSearchCV - Testing Decision Tree, Support Vector, KNN Regressions

In [None]:
# Candidate tree depths 3 through 9 (np.arange excludes the upper bound).
param_grid = {'max_depth': np.arange(3, 10)}

In [None]:
# 4-fold grid search over the tree depth, scored by negated MSE.
# return_train_score=True is requested explicitly because the summary cell
# below reads cv_results_['mean_train_score']; relying on the default triggers
# a deprecation warning on older scikit-learn and a KeyError on newer releases
# where the default is False.
gs = GridSearchCV(tree_reg, param_grid, cv=4, scoring='neg_mean_squared_error',
                  return_train_score=True)

In [None]:
# Run the depth search on the training set; populates gs.cv_results_ and gs.best_estimator_.
gs.fit(X_train, y_train)

In [None]:
# Nested cross-validation: cross_validate clones `gs`, so the whole grid search
# is repeated inside each of the 10 outer folds. Expensive.
# Note: scores_dec_tree is not read again in the visible part of this notebook.
scores_dec_tree = cross_validate(gs, X_train, y_train, cv=10, return_train_score=True,
                         scoring='neg_mean_squared_error')

In [None]:
# cv_results_ holds one mean score per grid candidate. The .mean() below
# therefore averages over ALL candidate depths -- these figures are not the
# scores of the best candidate. Scores are negated MSE, hence the sign flip
# before taking the square root.
gs_results_dtree = gs.cv_results_
print 'Avg Mean Train Score: ', np.sqrt(-gs_results_dtree['mean_train_score'].mean())
print 'Avg Mean Val Score: ', np.sqrt(-gs_results_dtree['mean_test_score'].mean())

In [None]:
# Keep the tuned tree now: `gs` is reassigned by the SVR and KNN searches below.
best_model_dtree = gs.best_estimator_

In [None]:
# Helper plot (see ../lib/airbnb_modeling) for the depth-tuned tree.
plot_rmse_instances(best_model_dtree, X_train, y_train)

SVR

In [None]:
# Template estimator for the grid search. GridSearchCV works on clones, so
# this particular object is never fitted itself.
sv_reg = SVR()

In [None]:
# Search space for the linear-kernel SVR: 30 values of C spaced
# logarithmically between 10**-0.1 and 10**10, and epsilon from 1 to 7.
# 'degree' was removed from the grid: SVR ignores it for every kernel except
# 'poly', so listing three degrees only tripled the number of identical fits.
# NOTE(review): epsilon >= 1 looks large if the target is on the ~0-0.35 scale
# suggested by the residual plots further down -- confirm the intended range.
Cs = np.logspace(-0.1, 10, 30)
param_grid = [
    {'C': Cs, 'epsilon': list(range(1, 8)), 'kernel': ['linear']},
]

In [None]:
# 4-fold grid search for the SVR, scored by negated MSE.
# return_train_score=True is requested explicitly because the summary cell
# below reads cv_results_['mean_train_score']; relying on the default triggers
# a deprecation warning on older scikit-learn and a KeyError on newer releases
# where the default is False.
gs = GridSearchCV(sv_reg, param_grid, cv=4, scoring='neg_mean_squared_error',
                  return_train_score=True)

In [None]:
# Run the SVR search on the training set; populates gs.cv_results_ and gs.best_estimator_.
gs.fit(X_train, y_train)

In [None]:
# Nested cross-validation: the complete SVR grid search is repeated inside
# each of the 10 outer folds (cross_validate clones `gs`). Very expensive.
scores_sv_reg = cross_validate(gs, X_train, y_train, cv=10, return_train_score=True,
                         scoring='neg_mean_squared_error')

In [None]:
# cv_results_ holds one mean score per grid candidate, so .mean() averages
# over ALL (C, epsilon) candidates -- these are not the best candidate's scores.
# Scores are negated MSE, hence the sign flip before the square root.
gs_results_svreg = gs.cv_results_
print 'Avg Mean Train Score: ', np.sqrt(-gs_results_svreg['mean_train_score'].mean())
print 'Avg Mean Val Score: ', np.sqrt(-gs_results_svreg['mean_test_score'].mean())

In [None]:
# Keep the tuned SVR now: `gs` is reassigned by the KNN search below.
best_model_svr = gs.best_estimator_

In [None]:
# Helper plot (see ../lib/airbnb_modeling) for the tuned SVR.
plot_rmse_instances(best_model_svr, X_train, y_train)

KNN

In [None]:
# Template estimator for the KNN grid search (GridSearchCV fits clones of it).
neigh_reg = KNeighborsRegressor()

In [None]:
# KNN search space: 2, 3 or 4 neighbours, Euclidean distance only.
param_grid = [
    {'n_neighbors':[2,3,4], 
     'metric':['euclidean']},]
#Distance definitions, for reference:
#euclidean: sqrt(sum((x - y)^2))
#minkowski: sum(|x - y|^p)^(1/p)

In [None]:
# 3-fold grid search for the KNN regressor, scored by negated MSE.
# return_train_score=True is requested explicitly because the summary cell
# below reads cv_results_['mean_train_score']; relying on the default triggers
# a deprecation warning on older scikit-learn and a KeyError on newer releases
# where the default is False.
gs = GridSearchCV(neigh_reg, param_grid, cv=3, scoring='neg_mean_squared_error',
                  return_train_score=True)

In [None]:
# Run the KNN search on the training set; populates gs.cv_results_ and gs.best_estimator_.
gs.fit(X_train, y_train)

In [None]:
# Nested cross-validation with 3 outer folds (the tree and SVR runs use 10);
# the grid search is repeated inside every outer fold.
scores_neigh_reg = cross_validate(gs, X_train, y_train, cv=3, return_train_score=True,
                         scoring=('r2', 'neg_mean_squared_error','neg_mean_absolute_error'))

In [None]:
# cv_results_ holds one mean score per grid candidate, so .mean() averages
# over all three n_neighbors candidates rather than reporting the best one.
# Scores are negated MSE, hence the sign flip before the square root.
gs_results_nn = gs.cv_results_
mean_train_score_nnreg = np.sqrt(-gs_results_nn['mean_train_score'].mean())
mean_val_score = np.sqrt(-gs_results_nn['mean_test_score'].mean())
print 'Avg Mean Train Score: ', mean_train_score_nnreg
print 'Avg Mean Val Score: ', mean_val_score

In [None]:
# Tuned KNN regressor (refit on the full training set by GridSearchCV).
best_model_kneigh = gs.best_estimator_

In [None]:
#Helper plot for the tuned KNN model is left disabled because it takes way too long to run
#plot_rmse_instances(best_model_kneigh, X_train, y_train)

In [None]:
# Held-out test-set predictions of the linear baseline and of the three tuned
# models; these feed the metric and residual cells below and are %store'd at the end.
test_predictions_lin_reg = lin_reg.predict(X_test)
test_predictions_tree_reg = best_model_dtree.predict(X_test)
test_predictions_best_model_svr = best_model_svr.predict(X_test)
test_predictions_best_model_kneigh = best_model_kneigh.predict(X_test)

In [None]:
print 'Evaluation Metrics for Decision Tree Regression'
print 'Test R2: ',r2_score(y_test, test_predictions_lin_reg)
print 'Test RMSE: ',np.sqrt(mean_squared_error(y_test, test_predictions_lin_reg))
print 'Test MAE: ',mean_absolute_error(y_test, test_predictions_lin_reg)
map_variable(y_test-test_predictions_lin_reg, listings)

In [None]:
from yellowbrick.regressor import ResidualsPlot

In [None]:
# Out-of-fold (10-fold) predictions on the training set for the residual plot.
# Uses the depth-tuned tree so the CV points come from the same model as the
# test predictions plotted beside them (the SVR and KNN cells follow the same
# pattern); previously the untuned `tree_reg` was used here.
tree_reg_pred_cv = cross_val_predict(best_model_dtree, X_train, y_train, cv=10)

In [None]:
# Residuals (prediction - actual) against predicted value for the decision
# tree: out-of-fold training predictions versus held-out test predictions.
fig, ax = plt.subplots(figsize=(20,10))
cv_residuals = tree_reg_pred_cv - y_train
test_residuals = test_predictions_tree_reg - y_test
ax.scatter(tree_reg_pred_cv, cv_residuals,
           c='steelblue', marker='o', edgecolor='white',
           label='CV Train Data')
ax.scatter(test_predictions_tree_reg, test_residuals,
           c='limegreen', marker='x', edgecolor='red',
           label='Test Data')
ax.set_xlabel('Predicted Values')
ax.set_ylabel('Residuals')
ax.legend(loc='upper right')
# Zero-residual reference line over the hard-coded range 0 to 0.35.
ax.hlines(y=0, color='black', xmin=0, xmax=.35, lw=3)
ax.set_title('Predicted Values vs Residuals - Decision Tree Regression')
plt.show()

In [None]:
# Test-set R2, RMSE and MAE for the tuned SVR, followed by the project helper
# map_variable (from ../lib/airbnb_modeling) applied to the test residuals.
print 'Evaluation Metrics for Support Vector Regression'
print 'Test R2: ',r2_score(y_test, test_predictions_best_model_svr)
print 'Test RMSE: ',np.sqrt(mean_squared_error(y_test, test_predictions_best_model_svr))
print 'Test MAE: ',mean_absolute_error(y_test, test_predictions_best_model_svr)
map_variable(y_test-test_predictions_best_model_svr, listings)

In [None]:
# Out-of-fold (10-fold) training-set predictions of the tuned SVR, for the residual plot.
reg_svr_best_model_pred_cv = cross_val_predict(best_model_svr, X_train, y_train, cv=10)

In [None]:
# Residuals (prediction - actual) against predicted value for the tuned SVR:
# out-of-fold training predictions versus held-out test predictions.
fig, ax = plt.subplots(figsize=(10,5))
cv_residuals = reg_svr_best_model_pred_cv - y_train
test_residuals = test_predictions_best_model_svr - y_test
ax.scatter(reg_svr_best_model_pred_cv, cv_residuals,
           c='steelblue', marker='o', edgecolor='white',
           label='CV Train Data')
ax.scatter(test_predictions_best_model_svr, test_residuals,
           c='limegreen', marker='x', edgecolor='red',
           label='Test Data')
ax.set_xlabel('Predicted Values')
ax.set_ylabel('Residuals')
ax.legend(loc='upper right')
# Zero-residual reference line over the hard-coded range 0 to 0.35.
ax.hlines(y=0, color='black', xmin=0, xmax=.35, lw=3)
ax.set_title('Predicted Values vs Residuals - Tuned Support Vector Regression')
plt.show()

In [None]:
# Out-of-fold (10-fold) training-set predictions of the tuned KNN model, for the residual plot.
kneigh_best_model_pred_cv = cross_val_predict(best_model_kneigh, X_train, y_train, cv=10)

In [None]:
# Residuals (prediction - actual) against predicted value for the tuned KNN
# model: out-of-fold training predictions versus held-out test predictions.
fig, ax = plt.subplots(figsize=(20,10))
cv_residuals = kneigh_best_model_pred_cv - y_train
test_residuals = test_predictions_best_model_kneigh - y_test
ax.scatter(kneigh_best_model_pred_cv, cv_residuals,
           c='steelblue', marker='o', edgecolor='white',
           label='CV Train Data')
ax.scatter(test_predictions_best_model_kneigh, test_residuals,
           c='limegreen', marker='x', edgecolor='red',
           label='Test Data')
ax.set_xlabel('Predicted Values')
ax.set_ylabel('Residuals')
ax.legend(loc='upper right')
# Zero-residual reference line over the hard-coded range -0.05 to 0.35.
ax.hlines(y=0, color='black', xmin=-.05, xmax=.35, lw=3)
ax.set_title('Predicted Values vs Residuals - Tuned K Neighbor Regression')
plt.show()

In [None]:
# Test-set R2, RMSE and MAE for the tuned KNN model, followed by the project
# helper map_variable (from ../lib/airbnb_modeling) applied to the test residuals.
print 'Evaluation Metrics for K Nearest Neighbors Regression'
print 'Test R2: ',r2_score(y_test, test_predictions_best_model_kneigh)
print 'Test RMSE: ',np.sqrt(mean_squared_error(y_test, test_predictions_best_model_kneigh))
print 'Test MAE: ',mean_absolute_error(y_test, test_predictions_best_model_kneigh)
map_variable(y_test-test_predictions_best_model_kneigh, listings)

In [None]:
#Storing feature importances for the Linear Regression, Decision Tree and Support Vector models

In [None]:
# Maps a model label to the feature ranking produced by rank_to_dict.
ranks = {}

In [None]:
# Per-model feature rankings built from absolute coefficients / importances,
# using the project helper rank_to_dict (from ../lib/airbnb_modeling).
ranks["Linear_Reg"] = rank_to_dict(np.abs(lin_reg.coef_), X_normed.columns)
# NOTE(review): this uses the untuned tree fitted earlier, not best_model_dtree -- confirm.
ranks["DT_Reg"] = rank_to_dict(np.abs(tree_reg.feature_importances_), X_normed.columns)
# `sv_reg` is only the template handed to GridSearchCV (which fits clones), so
# it is never fitted and reading sv_reg.coef_ raises. The fitted linear-kernel
# model is best_model_svr. Its coef_ has shape (1, n_features), so it is
# flattened to the 1-D shape that lin_reg.coef_ has.
ranks["SV_Reg"] = rank_to_dict(np.abs(best_model_svr.coef_).ravel(), X_normed.columns)

In [None]:
# Persist the rankings in IPython's %store database so other notebooks can restore them with `%store -r`.
%store ranks

In [None]:
# Persist the cross-validation score dictionaries for use in other notebooks.
%store scores_lin
%store scores_tree
%store scores_sv_reg
%store scores_neigh_reg

In [None]:
# Persist the fitted models (the three tuned estimators and the linear baseline).
%store best_model_svr 
%store best_model_kneigh
%store best_model_dtree
%store lin_reg

In [None]:
# Persist the feature matrices, targets, test split and listings frame.
%store X_ratios
%store X_normed
%store X_test
%store y_normed
%store y_test
%store listings

In [None]:
# Persist the raw grid-search results for the SVR and KNN searches.
# scores_lin and scores_tree were already stored in an earlier cell; storing
# them again simply overwrites the entries with the same values.
%store scores_lin
%store scores_tree
%store gs_results_svreg
%store gs_results_nn

In [None]:
# Persist the test-set predictions of all four models.
%store test_predictions_lin_reg
%store test_predictions_tree_reg
%store test_predictions_best_model_svr
%store test_predictions_best_model_kneigh