In [1]:
# standard imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
# Additional imports
from sklearn import metrics
from sklearn.linear_model import Ridge
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.cross_validation import train_test_split
import time

In [2]:
def load_df(df_file):
    # function to load csv files from data folder into DataFrame
    basepath = 'data/'
    filepath = basepath + df_file
    load_df = pd.DataFrame.from_csv(filepath)
    return load_df

In [3]:
df_all = load_df('df_all.csv')

In [34]:
df_all.shape

(1836, 17)

In [35]:
df_all.head()

Unnamed: 0,Bedrooms,Capacity,Price,Review_Count,Room_Type,Star_Rating,Avg_Price,log_Price,log_Avg_Price,log_Review_Count,Has_Star_Rating,log_Capacity,City_DA,City_DC,City_LA,City_NY,City_SF
0,1,1,38,1,0,0.0,92.918301,3.637586,4.414075,0.0,0,0.0,0,0,0,1,0
1,1,1,61,67,1,4.5,92.918301,4.110874,4.414075,4.204693,1,0.0,0,0,0,1,0
2,1,2,58,6,1,5.0,92.918301,4.060443,4.414075,1.791759,1,0.693147,0,0,0,1,0
3,1,2,56,36,1,5.0,92.918301,4.025352,4.414075,3.583519,1,0.693147,0,0,0,1,0
4,0,2,130,4,2,5.0,92.918301,4.867534,4.414075,1.386294,1,0.693147,0,0,0,1,0


In [5]:
regression_features = ['log_Review_Count','Star_Rating','Has_Star_Rating','Bedrooms','log_Capacity','Room_Type',
                       'City_NY','City_SF','City_DA','City_DC','City_LA']
X = df_all[regression_features]
y = df_all.log_Price

# split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 123)

In [6]:
def train_model(clf, X_train, y_train):
    """Trains the classifier"""
    print "----------------------------"
    print "Training {}...".format(clf.__class__.__name__)
    start = time.time()
    clf.fit(X_train, y_train)
    end = time.time()
    print "Done!\nTraining time (secs): {:.3f}".format(end - start)
    
    return clf

In [7]:
def model_predictions(clf, X_train, X_test):
    """Generate predictions from the training and testing data"""
    train_predictions = clf.predict(X_train)
    test_predictions = clf.predict(X_test)
    
    return (train_predictions, test_predictions)

In [8]:
def score_model(train_preds, test_preds, train_truth, test_truth):
    """Return the R-squared on the test data, and the RMSE on the train and test data"""
    train_RMSE = np.sqrt(metrics.mean_squared_error(train_truth, train_preds))
    test_RMSE = np.sqrt(metrics.mean_squared_error(test_truth, test_preds))
    
    test_rsquare = metrics.r2_score(test_truth, test_preds)
    
    return (train_RMSE, test_RMSE, test_rsquare)

In [9]:
def regression_model(clf, X_train, X_test, y_train, y_test):
    """This function:
     - trains the model on the training
     - makes predictions on the test set
     - reports the r_squared and RMSE of the model
    """
    clf = train_model(clf, X_train, y_train)
    train_predictions, test_predictions = model_predictions(clf, X_train, X_test)
    train_RMSE, test_RMSE, test_rsquare = score_model(train_predictions, test_predictions, y_train, y_test)
    
    print "RMSE for training set: {}".format(train_RMSE)
    print "RMSE score for test set: {}".format(test_RMSE)
    print "R-Squared for test set: {}".format(test_rsquare)
    
    return clf 

In [10]:
lr_ridge = Ridge()
lr_ols = LinearRegression()
lr_lasso = Lasso()
lr_elasticnet = ElasticNet()

In [11]:
lr_ridge = regression_model(lr_ridge, X_train, X_test, y_train, y_test)

----------------------------
Training Ridge...
Done!
Training time (secs): 0.016
RMSE for training set: 0.299120258281
RMSE score for test set: 0.307587431567
R-Squared for test set: 0.604488303974


In [13]:
zip(regression_features,lr_ridge.coef_)

[('log_Review_Count', 0.028749550830963966),
 ('Star_Rating', 0.16446968966852374),
 ('Has_Star_Rating', -0.84940431545876405),
 ('Bedrooms', 0.046434748476839857),
 ('log_Capacity', 0.21339652301495851),
 ('Room_Type', 0.50146715667403308),
 ('City_NY', 0.27987871019536537),
 ('City_SF', 0.49004034918077327),
 ('City_DA', 0.011829603214813246),
 ('City_DC', 0.14184689585272608),
 ('City_LA', 0.1070203604732762)]

In [14]:
lr_ols = regression_model(lr_ols, X_train, X_test, y_train, y_test)

----------------------------
Training LinearRegression...
Done!
Training time (secs): 0.110
RMSE for training set: 0.29884203303
RMSE score for test set: 0.308058619347
R-Squared for test set: 0.60327562101


In [15]:
zip(regression_features,lr_ols.coef_)

[('log_Review_Count', 0.02940535069278457),
 ('Star_Rating', 0.21454280681665469),
 ('Has_Star_Rating', -1.0987599497457279),
 ('Bedrooms', 0.045926256563343329),
 ('log_Capacity', 0.21148470266927472),
 ('Room_Type', 0.5029616283056898),
 ('City_NY', 0.28899307066236102),
 ('City_SF', 0.49852215489204332),
 ('City_DA', 0.017045948298588229),
 ('City_DC', 0.15025350350503219),
 ('City_LA', 0.11404300351544711)]

In [16]:
lr_lasso = regression_model(lr_lasso, X_train, X_test, y_train, y_test)

----------------------------
Training Lasso...
Done!
Training time (secs): 0.002
RMSE for training set: 0.471416661391
RMSE score for test set: 0.489695327447
R-Squared for test set: -0.0024762210639


In [17]:
zip(regression_features,lr_lasso.coef_)

[('log_Review_Count', 0.0),
 ('Star_Rating', 0.0),
 ('Has_Star_Rating', 0.0),
 ('Bedrooms', -0.0),
 ('log_Capacity', 0.0),
 ('Room_Type', 0.0),
 ('City_NY', 0.0),
 ('City_SF', 0.0),
 ('City_DA', -0.0),
 ('City_DC', 0.0),
 ('City_LA', -0.0)]

In [None]:
lr_elasticnet = regression_model(lr_elasticnet, X_train, X_test, y_train, y_test)

In [19]:
zip(regression_features,lr_elasticnet.coef_)

[('log_Review_Count', 0.0),
 ('Star_Rating', 0.0),
 ('Has_Star_Rating', 0.0),
 ('Bedrooms', -0.0),
 ('log_Capacity', 0.0),
 ('Room_Type', 0.0),
 ('City_NY', 0.0),
 ('City_SF', 0.0),
 ('City_DA', -0.0),
 ('City_DC', 0.0),
 ('City_LA', -0.0)]

### Seems like the regularization is too high on the lasso and elastic net models since all of the coefficients are 0, let's try different reg parameters

In [29]:
alphas = [10e-7,10e-6, 10e-5, 10e-4, 10e-3, 10e-2, 10e-1, 0, 1, 10, 100, 1000, 10000, 100000] #default is 1

In [32]:
coefs = []
for alpha in alphas:
    print "Alpha: {}".format(alpha)
    lr_elasticnet_cv = ElasticNet(alpha=alpha, l1_ratio = 0.01)
    lr_elasticnet_cv = regression_model(lr_elasticnet, X_train, X_test, y_train, y_test)
    coefs.append(zip(regression_features,lr_elasticnet_cv.coef_))  

Alpha: 1e-06
----------------------------
Training ElasticNet...
Done!
Training time (secs): 0.001
RMSE for training set: 0.471416661391
RMSE score for test set: 0.489695327447
R-Squared for test set: -0.0024762210639
Alpha: 1e-05
----------------------------
Training ElasticNet...
Done!
Training time (secs): 0.001
RMSE for training set: 0.471416661391
RMSE score for test set: 0.489695327447
R-Squared for test set: -0.0024762210639
Alpha: 0.0001
----------------------------
Training ElasticNet...
Done!
Training time (secs): 0.001
RMSE for training set: 0.471416661391
RMSE score for test set: 0.489695327447
R-Squared for test set: -0.0024762210639
Alpha: 0.001
----------------------------
Training ElasticNet...
Done!
Training time (secs): 0.001
RMSE for training set: 0.471416661391
RMSE score for test set: 0.489695327447
R-Squared for test set: -0.0024762210639
Alpha: 0.01
----------------------------
Training ElasticNet...
Done!
Training time (secs): 0.001
RMSE for training set: 0.4714

In [33]:
coefs

[[('log_Review_Count', 0.0),
  ('Star_Rating', 0.0),
  ('Has_Star_Rating', 0.0),
  ('Bedrooms', -0.0),
  ('log_Capacity', 0.0),
  ('Room_Type', 0.0),
  ('City_NY', 0.0),
  ('City_SF', 0.0),
  ('City_DA', -0.0),
  ('City_DC', 0.0),
  ('City_LA', -0.0)],
 [('log_Review_Count', 0.0),
  ('Star_Rating', 0.0),
  ('Has_Star_Rating', 0.0),
  ('Bedrooms', -0.0),
  ('log_Capacity', 0.0),
  ('Room_Type', 0.0),
  ('City_NY', 0.0),
  ('City_SF', 0.0),
  ('City_DA', -0.0),
  ('City_DC', 0.0),
  ('City_LA', -0.0)],
 [('log_Review_Count', 0.0),
  ('Star_Rating', 0.0),
  ('Has_Star_Rating', 0.0),
  ('Bedrooms', -0.0),
  ('log_Capacity', 0.0),
  ('Room_Type', 0.0),
  ('City_NY', 0.0),
  ('City_SF', 0.0),
  ('City_DA', -0.0),
  ('City_DC', 0.0),
  ('City_LA', -0.0)],
 [('log_Review_Count', 0.0),
  ('Star_Rating', 0.0),
  ('Has_Star_Rating', 0.0),
  ('Bedrooms', -0.0),
  ('log_Capacity', 0.0),
  ('Room_Type', 0.0),
  ('City_NY', 0.0),
  ('City_SF', 0.0),
  ('City_DA', -0.0),
  ('City_DC', 0.0),
  ('City_L