In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression, LassoLars, TweedieRegressor
from sklearn.preprocessing import PolynomialFeatures

import wrangle

In [3]:
# Pull the three modeling frames from the project's wrangle module.
# NOTE(review): presumably these are the already-split, already-scaled
# train / validate / test sets, in that order — confirm in wrangle.py.
train_scaled, validate_scaled, test_scaled = wrangle.wrangle_split_scale()

In [4]:
# Peek at the first five training rows (head() defaults to 5).
train_scaled.head()

Unnamed: 0,bedroomcnt,bathroomcnt,square_feet,taxvaluedollarcnt,yearbuilt,taxamount,fips_name
1483210,0.2,0.272727,0.072759,0.044189,1949,1633.33,Los Angeles
635933,0.4,0.272727,0.074562,0.054043,1964,1711.51,Los Angeles
647772,0.6,0.363636,0.123669,0.378193,1979,8660.46,Orange
1251279,0.2,0.090909,0.011526,0.005963,1920,758.49,Los Angeles
772816,0.4,0.272727,0.040804,0.096488,1972,2528.32,Orange


In [5]:
# Check the column dtypes before modeling; non-numeric columns cannot be fed to the regressors.
train_scaled.dtypes

bedroomcnt           float64
bathroomcnt          float64
square_feet          float64
taxvaluedollarcnt    float64
yearbuilt              int64
taxamount            float64
fips_name             object
dtype: object

### Split each set into features (x) and target (y) for train, validate, and test

In [6]:
# Separate the features (x_*) from the target (y_*) for every split.
# - 'taxvaluedollarcnt' is the target, so it must not appear in the features.
# - 'fips_name' is a string (object dtype) column, so it is dropped rather than
#   passed to the regressors.
# The y_* frames are explicit copies because later cells add prediction columns
# to them; a copy keeps those writes from touching the *_scaled source frames.
# NOTE(review): 'taxamount' is presumably computed from the assessed value, which
# would leak the target into the features — confirm whether it belongs here.
x_train = train_scaled.drop(columns=['taxvaluedollarcnt', 'fips_name'])
y_train = train_scaled[['taxvaluedollarcnt']].copy()

x_validate = validate_scaled.drop(columns=['taxvaluedollarcnt', 'fips_name'])
y_validate = validate_scaled[['taxvaluedollarcnt']].copy()

# The test split must come from test_scaled (not train_scaled) and needs its
# own y_test so the final model can be scored on truly unseen data.
x_test = test_scaled.drop(columns=['taxvaluedollarcnt', 'fips_name'])
y_test = test_scaled[['taxvaluedollarcnt']].copy()

In [7]:
# Display the training features to confirm the target and fips_name columns are gone.
x_train

Unnamed: 0,bedroomcnt,bathroomcnt,square_feet,yearbuilt,taxamount
1483210,0.2,0.272727,0.072759,1949,1633.33
635933,0.4,0.272727,0.074562,1964,1711.51
647772,0.6,0.363636,0.123669,1979,8660.46
1251279,0.2,0.090909,0.011526,1920,758.49
772816,0.4,0.272727,0.040804,1972,2528.32
...,...,...,...,...,...
602633,0.2,0.272727,0.037199,1950,2608.45
1749433,0.2,0.090909,0.027913,1954,3874.34
1921252,0.6,0.272727,0.075982,1993,4376.51
1322629,0.6,0.272727,0.040422,1922,3909.02


In [8]:
# Display the training target frame (a single-column DataFrame at this point).
y_train

Unnamed: 0,taxvaluedollarcnt
1483210,0.044189
635933,0.054043
647772,0.378193
1251279,0.005963
772816,0.096488
...,...
602633,0.093856
1749433,0.165289
1921252,0.147756
1322629,0.151755


In [9]:
# Mean of the training target — the constant that the mean baseline predicts.
y_train['taxvaluedollarcnt'].mean()

0.19552691997145147

## Setting up the baseline
For regression, we need to decide whether to use the mean or the median as the baseline, so let's look at both.

In [10]:
#Create mean baseline model
y_train['baseline_mean_pred'] = y_train['taxvaluedollarcnt'].mean()
y_validate['baseline_mean_pred'] = y_validate['taxvaluedollarcnt'].mean()

#Create median baseline model
y_train['baseline_median_pred'] = y_train['taxvaluedollarcnt'].median()
y_validate['baseline_median_pred'] = y_validate['taxvaluedollarcnt'].median()

In [11]:
# Compute the RMSE of both baselines on train and validate.
# `results` is (re)created here, so this is the cell to re-run first when the
# leaderboard needs to be rebuilt from scratch.
results = []

baseline_mean = dict(
    model='baseline_mean',
    RMSE_train=mean_squared_error(
        y_train['taxvaluedollarcnt'], y_train['baseline_mean_pred']
    ) ** 0.5,
    RMSE_validate=mean_squared_error(
        y_validate['taxvaluedollarcnt'], y_validate['baseline_mean_pred']
    ) ** 0.5,
)

baseline_median = dict(
    model='baseline_median',
    RMSE_train=mean_squared_error(
        y_train['taxvaluedollarcnt'], y_train['baseline_median_pred']
    ) ** 0.5,
    RMSE_validate=mean_squared_error(
        y_validate['taxvaluedollarcnt'], y_validate['baseline_median_pred']
    ) ** 0.5,
)

# Log both baselines, mean first, so later models are appended after them.
results.extend([baseline_mean, baseline_median])

In [12]:
# Tabulate the scores collected so far (only the two baselines at this point).
pd.DataFrame(results)

Unnamed: 0,model,RMSE_train,RMSE_validate
0,baseline_mean,0.157652,0.157191
1,baseline_median,0.162263,0.161799


# Linear Regression

In [13]:
# Ordinary least squares on the training features.
# The `normalize` argument is not passed: it was deprecated in scikit-learn 1.0
# and removed in 1.2 (where it raises a TypeError). For an unpenalised linear
# regression, rescaling the features does not change the fitted predictions
# (up to floating-point round-off), so leaving it out keeps the results
# equivalent while running on current scikit-learn.
lr = LinearRegression()

# y_train is a DataFrame that also holds prediction columns, so the target
# column is selected explicitly.
lr.fit(x_train, y_train['taxvaluedollarcnt'])

# Store predictions next to the actual values for later scoring.
y_train['lr_train_pred'] = lr.predict(x_train)

y_validate['lr_validate_pred'] = lr.predict(x_validate)

y_train

Unnamed: 0,taxvaluedollarcnt,baseline_mean_pred,baseline_median_pred,lr_train_pred
1483210,0.044189,0.195527,0.157119,0.061441
635933,0.054043,0.195527,0.157119,0.062347
647772,0.378193,0.195527,0.157119,0.355415
1251279,0.005963,0.195527,0.157119,0.018660
772816,0.096488,0.195527,0.157119,0.091064
...,...,...,...,...
602633,0.093856,0.195527,0.157119,0.096933
1749433,0.165289,0.195527,0.157119,0.148353
1921252,0.147756,0.195527,0.157119,0.169871
1322629,0.151755,0.195527,0.157119,0.149965


In [14]:
# Score the linear regression on train and validate, then add it to the leaderboard.
lr_train_mse = mean_squared_error(y_train['taxvaluedollarcnt'], y_train['lr_train_pred'])
lr_validate_mse = mean_squared_error(y_validate['taxvaluedollarcnt'], y_validate['lr_validate_pred'])

lr_rmse = dict(
    model='linear regression',
    RMSE_train=lr_train_mse ** 0.5,
    RMSE_validate=lr_validate_mse ** 0.5,
)

# Re-running this cell appends a duplicate row; re-run the cell that resets
# `results` first if the leaderboard needs rebuilding.
results.append(lr_rmse)
pd.DataFrame(results)

Unnamed: 0,model,RMSE_train,RMSE_validate
0,baseline_mean,0.157652,0.157191
1,baseline_median,0.162263,0.161799
2,linear regression,0.033879,0.03433


# LassoLars

In [15]:
# L1-regularised regression (LassoLars) on the same features.
# NOTE(review): in the recorded run every lars prediction equals the train mean
# (0.195527) and its RMSE matches baseline_mean, i.e. alpha=1.0 appears to shrink
# all coefficients to zero on this 0-1 scaled target — consider a much smaller alpha.
lars = LassoLars(alpha=1.0).fit(x_train, y_train['taxvaluedollarcnt'])

# Keep the predictions alongside the actual values for scoring.
y_train['lars_train_pred'] = lars.predict(x_train)
y_validate['lars_validate_pred'] = lars.predict(x_validate)

y_train

Unnamed: 0,taxvaluedollarcnt,baseline_mean_pred,baseline_median_pred,lr_train_pred,lars_train_pred
1483210,0.044189,0.195527,0.157119,0.061441,0.195527
635933,0.054043,0.195527,0.157119,0.062347,0.195527
647772,0.378193,0.195527,0.157119,0.355415,0.195527
1251279,0.005963,0.195527,0.157119,0.018660,0.195527
772816,0.096488,0.195527,0.157119,0.091064,0.195527
...,...,...,...,...,...
602633,0.093856,0.195527,0.157119,0.096933,0.195527
1749433,0.165289,0.195527,0.157119,0.148353,0.195527
1921252,0.147756,0.195527,0.157119,0.169871,0.195527
1322629,0.151755,0.195527,0.157119,0.149965,0.195527


In [16]:
# Score the LassoLars model on train and validate, then add it to the leaderboard.
lars_train_mse = mean_squared_error(y_train['taxvaluedollarcnt'], y_train['lars_train_pred'])
lars_validate_mse = mean_squared_error(y_validate['taxvaluedollarcnt'], y_validate['lars_validate_pred'])

lars_rmse = dict(
    model='LassoLars regression',
    RMSE_train=lars_train_mse ** 0.5,
    RMSE_validate=lars_validate_mse ** 0.5,
)

# Re-running this cell appends a duplicate row; re-run the cell that resets
# `results` first if the leaderboard needs rebuilding.
results.append(lars_rmse)
pd.DataFrame(results)

Unnamed: 0,model,RMSE_train,RMSE_validate
0,baseline_mean,0.157652,0.157191
1,baseline_median,0.162263,0.161799
2,linear regression,0.033879,0.03433
3,LassoLars regression,0.157652,0.157191


# Polynomial

In [17]:
# Build the degree-2 polynomial feature expansion.
pr = PolynomialFeatures(degree=2)

# Learn the expansion from the training features only, then expand them.
x_train_degree2 = pr.fit(x_train).transform(x_train)

# Apply the already-fitted expansion to validate (never re-fit on held-out data).
x_validate_degree2 = pr.transform(x_validate)

In [18]:
# Linear regression on the degree-2 polynomial features.
# The `normalize` argument is not passed: it was deprecated in scikit-learn 1.0
# and removed in 1.2 (where it raises a TypeError). For an unpenalised linear
# regression, rescaling the features does not change the fitted predictions
# beyond floating-point round-off.
# NOTE(review): the expanded features span very different magnitudes (e.g.
# squared yearbuilt / taxamount vs. squared 0-1 columns); if numerical
# conditioning becomes a concern, scale the features before this step.
lr2 = LinearRegression()

# y_train is a DataFrame that also holds prediction columns, so the target
# column is selected explicitly.
lr2.fit(x_train_degree2, y_train['taxvaluedollarcnt'])

# predict train
y_train['poly_train_pred'] = lr2.predict(x_train_degree2)

# predict validate
y_validate['poly_validate_pred'] = lr2.predict(x_validate_degree2)

In [19]:
# Display y_train, which now also carries the polynomial model's train predictions.
y_train

Unnamed: 0,taxvaluedollarcnt,baseline_mean_pred,baseline_median_pred,lr_train_pred,lars_train_pred,poly_train_pred
1483210,0.044189,0.195527,0.157119,0.061441,0.195527,0.057735
635933,0.054043,0.195527,0.157119,0.062347,0.195527,0.060377
647772,0.378193,0.195527,0.157119,0.355415,0.195527,0.362174
1251279,0.005963,0.195527,0.157119,0.018660,0.195527,0.004370
772816,0.096488,0.195527,0.157119,0.091064,0.195527,0.091646
...,...,...,...,...,...,...
602633,0.093856,0.195527,0.157119,0.096933,0.195527,0.099854
1749433,0.165289,0.195527,0.157119,0.148353,0.195527,0.152173
1921252,0.147756,0.195527,0.157119,0.169871,0.195527,0.162794
1322629,0.151755,0.195527,0.157119,0.149965,0.195527,0.148879


In [20]:
# Score the polynomial regression on train and validate, then add it to the leaderboard.
poly_train_mse = mean_squared_error(y_train['taxvaluedollarcnt'], y_train['poly_train_pred'])
poly_validate_mse = mean_squared_error(y_validate['taxvaluedollarcnt'], y_validate['poly_validate_pred'])

poly_rmse = dict(
    model='Polynomial regression',
    RMSE_train=poly_train_mse ** 0.5,
    RMSE_validate=poly_validate_mse ** 0.5,
)

# Re-running this cell appends a duplicate row; re-run the cell that resets
# `results` first if the leaderboard needs rebuilding.
results.append(poly_rmse)
pd.DataFrame(results)

Unnamed: 0,model,RMSE_train,RMSE_validate
0,baseline_mean,0.157652,0.157191
1,baseline_median,0.162263,0.161799
2,linear regression,0.033879,0.03433
3,LassoLars regression,0.157652,0.157191
4,Polynomial regression,0.03269,0.033056


# TweedieRegressor

In [21]:
# Generalized linear model (TweedieRegressor with its default settings) on the
# same, non-polynomial features.
twr = TweedieRegressor().fit(x_train, y_train['taxvaluedollarcnt'])

# Keep the predictions alongside the actual values for scoring.
y_train['twr_train_pred'] = twr.predict(x_train)
y_validate['twr_validate_pred'] = twr.predict(x_validate)

y_train

Unnamed: 0,taxvaluedollarcnt,baseline_mean_pred,baseline_median_pred,lr_train_pred,lars_train_pred,poly_train_pred,twr_train_pred
1483210,0.044189,0.195527,0.157119,0.061441,0.195527,0.057735,0.057391
635933,0.054043,0.195527,0.157119,0.062347,0.195527,0.060377,0.059162
647772,0.378193,0.195527,0.157119,0.355415,0.195527,0.362174,0.352601
1251279,0.005963,0.195527,0.157119,0.018660,0.195527,0.004370,0.023268
772816,0.096488,0.195527,0.157119,0.091064,0.195527,0.091646,0.092974
...,...,...,...,...,...,...,...
602633,0.093856,0.195527,0.157119,0.096933,0.195527,0.099854,0.098667
1749433,0.165289,0.195527,0.157119,0.148353,0.195527,0.152173,0.151899
1921252,0.147756,0.195527,0.157119,0.169871,0.195527,0.162794,0.169238
1322629,0.151755,0.195527,0.157119,0.149965,0.195527,0.148879,0.156954


In [22]:
# Score the TweedieRegressor on train and validate, then add it to the leaderboard.
twr_train_mse = mean_squared_error(y_train['taxvaluedollarcnt'], y_train['twr_train_pred'])
twr_validate_mse = mean_squared_error(y_validate['taxvaluedollarcnt'], y_validate['twr_validate_pred'])

twr_rmse = dict(
    model='TweedieRegressor regression',
    RMSE_train=twr_train_mse ** 0.5,
    RMSE_validate=twr_validate_mse ** 0.5,
)

# Re-running this cell appends a duplicate row; re-run the cell that resets
# `results` first if the leaderboard needs rebuilding.
results.append(twr_rmse)
pd.DataFrame(results)

Unnamed: 0,model,RMSE_train,RMSE_validate
0,baseline_mean,0.157652,0.157191
1,baseline_median,0.162263,0.161799
2,linear regression,0.033879,0.03433
3,LassoLars regression,0.157652,0.157191
4,Polynomial regression,0.03269,0.033056
5,TweedieRegressor regression,0.034162,0.034609


In [23]:
# Final leaderboard: lowest train RMSE first, with validate RMSE breaking ties.
results_df = pd.DataFrame(results)
results_df.sort_values(by=['RMSE_train', 'RMSE_validate'], ascending=True)

Unnamed: 0,model,RMSE_train,RMSE_validate
4,Polynomial regression,0.03269,0.033056
2,linear regression,0.033879,0.03433
5,TweedieRegressor regression,0.034162,0.034609
0,baseline_mean,0.157652,0.157191
3,LassoLars regression,0.157652,0.157191
1,baseline_median,0.162263,0.161799
