In [23]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression, LassoLars, TweedieRegressor
from sklearn.preprocessing import PolynomialFeatures

import wrangle

import warnings
warnings.filterwarnings("ignore")

In [24]:
train_scaled, validate_scaled, test_scaled = wrangle.wrangle_split_scale()

In [25]:
train_scaled.head(5)

Unnamed: 0.1,Unnamed: 0,bedroomcnt,bathroomcnt,square_feet,taxvaluedollarcnt,taxamount,yearbuilt,zipcode,fips_name,fips_name_Orange,fips_name_Ventura
35112,35112,0.6,0.363636,0.153707,682000,9822.42,1999,96993.0,Orange,1,0
12968,12968,0.4,0.0,0.076198,68288,1098.85,1950,96212.0,Los Angeles,0,0
5870,5870,0.4,0.181818,0.095535,233647,2877.34,1953,97040.0,Orange,1,0
37565,37565,0.4,0.181818,0.104793,74948,1190.28,1960,97118.0,Ventura,0,1
47416,47416,0.4,0.545455,0.26694,1649536,18987.3,1998,96979.0,Orange,1,0


In [26]:
train_scaled.dtypes

Unnamed: 0             int64
bedroomcnt           float64
bathroomcnt          float64
square_feet          float64
taxvaluedollarcnt      int64
taxamount            float64
yearbuilt              int64
zipcode              float64
fips_name             object
fips_name_Orange       uint8
fips_name_Ventura      uint8
dtype: object

### turn it into our train,validate,test sets

In [27]:
# Create the object
# Since taxamount and taxvalue amount are so similar I don't want to move forward with 
# it in my model.
x_train = train_scaled.drop(columns = ['taxvaluedollarcnt', 'taxamount', 'fips_name', 'Unnamed: 0'])
y_train = pd.DataFrame(train_scaled[['taxvaluedollarcnt']])

x_validate = validate_scaled.drop(columns = ['taxvaluedollarcnt', 'taxamount','fips_name', 'Unnamed: 0'])
y_validate = pd.DataFrame(validate_scaled[['taxvaluedollarcnt']])

x_test = train_scaled.drop(columns = ['taxvaluedollarcnt', 'taxamount', 'fips_name', 'Unnamed: 0'])
y_train = pd.DataFrame(train_scaled[['taxvaluedollarcnt']])

In [28]:
x_train

Unnamed: 0,bedroomcnt,bathroomcnt,square_feet,yearbuilt,zipcode,fips_name_Orange,fips_name_Ventura
35112,0.6,0.363636,0.153707,1999,96993.0,1,0
12968,0.4,0.000000,0.076198,1950,96212.0,0,0
5870,0.4,0.181818,0.095535,1953,97040.0,1,0
37565,0.4,0.181818,0.104793,1960,97118.0,0,1
47416,0.4,0.545455,0.266940,1998,96979.0,1,0
...,...,...,...,...,...,...,...
10209,0.6,0.363636,0.198034,1972,96206.0,0,0
24383,0.8,0.636364,0.307169,2003,97041.0,1,0
24870,0.4,0.181818,0.075952,1955,97025.0,1,0
2677,0.2,0.000000,0.019582,1927,96282.0,0,0


In [29]:
y_train

Unnamed: 0,taxvaluedollarcnt
35112,682000
12968,68288
5870,233647
37565,74948
47416,1649536
...,...
10209,815861
24383,874267
24870,225466
2677,343000


In [30]:
y_train['taxvaluedollarcnt'].mean()

513131.8967355784

## Setting up the baseline:
For Regression, we want to decide if we need to use mean or median baseline, so sometimes its best to look at both. This way we can compare the numbers to see consistency between train and validate to make sure there is no overfitting happening.

In [31]:
#Create mean baseline model
y_train['baseline_mean_pred'] = y_train['taxvaluedollarcnt'].mean()
y_validate['baseline_mean_pred'] = y_validate['taxvaluedollarcnt'].mean()

#Create median baseline model
y_train['baseline_median_pred'] = y_train['taxvaluedollarcnt'].median()
y_validate['baseline_median_pred'] = y_validate['taxvaluedollarcnt'].median()

In [32]:
#determine RSMSE
results = []

baseline_mean = {
    'model':'baseline_mean',
    'RMSE_train': mean_squared_error(y_train['taxvaluedollarcnt'], y_train['baseline_mean_pred'])**(0.5),
    'RMSE_validate': mean_squared_error(y_validate['taxvaluedollarcnt'], y_validate['baseline_mean_pred'])**(0.5)

}
baseline_median = {
    'model':'baseline_median',
    'RMSE_train': mean_squared_error(y_train['taxvaluedollarcnt'], y_train['baseline_median_pred'])**(0.5),
    'RMSE_validate': mean_squared_error(y_validate['taxvaluedollarcnt'], y_validate['baseline_median_pred'])**(0.5)

}
results.append(baseline_mean)
results.append(baseline_median)

In [33]:
pd.DataFrame(results)

Unnamed: 0,model,RMSE_train,RMSE_validate
0,baseline_mean,596704.058777,621286.160927
1,baseline_median,612533.700708,636062.789038


# Linear Regression

In [34]:
lr = LinearRegression(normalize = True)

lr.fit(x_train, y_train['taxvaluedollarcnt'])

y_train['lr_train_pred'] = lr.predict(x_train)

y_validate['lr_validate_pred'] = lr.predict(x_validate)

y_train

Unnamed: 0,taxvaluedollarcnt,baseline_mean_pred,baseline_median_pred,lr_train_pred
35112,682000,513131.896736,374778.0,5.685022e+05
12968,68288,513131.896736,374778.0,2.097126e+05
5870,233647,513131.896736,374778.0,4.618026e+05
37565,74948,513131.896736,374778.0,4.217020e+05
47416,1649536,513131.896736,374778.0,1.357090e+06
...,...,...,...,...
10209,815861,513131.896736,374778.0,8.282406e+05
24383,874267,513131.896736,374778.0,1.336844e+06
24870,225466,513131.896736,374778.0,3.627495e+05
2677,343000,513131.896736,374778.0,1.542520e+05


In [35]:
lr_rmse = {
    'model':'linear regression',
    'RMSE_train': mean_squared_error(y_train['taxvaluedollarcnt'], y_train['lr_train_pred'])**(0.5),
    'RMSE_validate': mean_squared_error(y_validate['taxvaluedollarcnt'], y_validate['lr_validate_pred'])**(0.5)

}
results.append(lr_rmse)
pd.DataFrame(results)

Unnamed: 0,model,RMSE_train,RMSE_validate
0,baseline_mean,596704.058777,621286.160927
1,baseline_median,612533.700708,636062.789038
2,linear regression,472280.911553,489893.568821


# LassoLars

In [36]:
lars = LassoLars(alpha=1.0)

lars.fit(x_train, y_train['taxvaluedollarcnt'])

y_train['lars_train_pred'] = lars.predict(x_train)

y_validate['lars_validate_pred'] = lars.predict(x_validate)

y_train

Unnamed: 0,taxvaluedollarcnt,baseline_mean_pred,baseline_median_pred,lr_train_pred,lars_train_pred
35112,682000,513131.896736,374778.0,5.685022e+05,5.686724e+05
12968,68288,513131.896736,374778.0,2.097126e+05,2.102051e+05
5870,233647,513131.896736,374778.0,4.618026e+05,4.612739e+05
37565,74948,513131.896736,374778.0,4.217020e+05,4.220001e+05
47416,1649536,513131.896736,374778.0,1.357090e+06,1.355855e+06
...,...,...,...,...,...
10209,815861,513131.896736,374778.0,8.282406e+05,8.283105e+05
24383,874267,513131.896736,374778.0,1.336844e+06,1.336381e+06
24870,225466,513131.896736,374778.0,3.627495e+05,3.623269e+05
2677,343000,513131.896736,374778.0,1.542520e+05,1.539970e+05


In [37]:
lars_rmse = {
    'model':'LassoLars regression',
    'RMSE_train': mean_squared_error(y_train['taxvaluedollarcnt'], y_train['lars_train_pred'])**(0.5),
    'RMSE_validate': mean_squared_error(y_validate['taxvaluedollarcnt'], y_validate['lars_validate_pred'])**(0.5)

}
results.append(lars_rmse)
pd.DataFrame(results)

Unnamed: 0,model,RMSE_train,RMSE_validate
0,baseline_mean,596704.058777,621286.160927
1,baseline_median,612533.700708,636062.789038
2,linear regression,472280.911553,489893.568821
3,LassoLars regression,472281.284547,489905.973584


# Polynomial

In [38]:
#Polynomial Features
pr = PolynomialFeatures(degree=2)

x_train_degree2 = pr.fit_transform(x_train)

x_validate_degree2 = pr.transform(x_validate)

In [39]:
# create the model object
lr2 = LinearRegression(normalize=True)

# fit the model to our training data. We must specify the column in y_train, 
# since we have converted it to a dataframe from a series! 
lr2.fit(x_train_degree2, y_train['taxvaluedollarcnt'])

# predict train
y_train['poly_train_pred'] = lr2.predict(x_train_degree2)

# predict validate
y_validate['poly_validate_pred'] = lr2.predict(x_validate_degree2)

In [40]:
y_train

Unnamed: 0,taxvaluedollarcnt,baseline_mean_pred,baseline_median_pred,lr_train_pred,lars_train_pred,poly_train_pred
35112,682000,513131.896736,374778.0,5.685022e+05,5.686724e+05,6.546999e+05
12968,68288,513131.896736,374778.0,2.097126e+05,2.102051e+05,3.299276e+05
5870,233647,513131.896736,374778.0,4.618026e+05,4.612739e+05,3.531319e+05
37565,74948,513131.896736,374778.0,4.217020e+05,4.220001e+05,3.183072e+05
47416,1649536,513131.896736,374778.0,1.357090e+06,1.355855e+06,1.374060e+06
...,...,...,...,...,...,...
10209,815861,513131.896736,374778.0,8.282406e+05,8.283105e+05,7.656415e+05
24383,874267,513131.896736,374778.0,1.336844e+06,1.336381e+06,1.301868e+06
24870,225466,513131.896736,374778.0,3.627495e+05,3.623269e+05,3.096119e+05
2677,343000,513131.896736,374778.0,1.542520e+05,1.539970e+05,1.931964e+05


In [41]:
poly_rmse = {
    'model':'Polynomial regression',
    'RMSE_train': mean_squared_error(y_train['taxvaluedollarcnt'], y_train['poly_train_pred'])**(0.5),
    'RMSE_validate': mean_squared_error(y_validate['taxvaluedollarcnt'], y_validate['poly_validate_pred'])**(0.5)

}
results.append(poly_rmse)
pd.DataFrame(results)

Unnamed: 0,model,RMSE_train,RMSE_validate
0,baseline_mean,596704.058777,621286.160927
1,baseline_median,612533.700708,636062.789038
2,linear regression,472280.911553,489893.568821
3,LassoLars regression,472281.284547,489905.973584
4,Polynomial regression,450927.291425,468084.675542


# TweedieRegressor

In [42]:
twr = TweedieRegressor()

twr.fit(x_train, y_train['taxvaluedollarcnt'])

y_train['twr_train_pred'] = twr.predict(x_train)

y_validate['twr_validate_pred'] = twr.predict(x_validate)

y_train

Unnamed: 0,taxvaluedollarcnt,baseline_mean_pred,baseline_median_pred,lr_train_pred,lars_train_pred,poly_train_pred,twr_train_pred
35112,682000,513131.896736,374778.0,5.685022e+05,5.686724e+05,6.546999e+05,689540.241541
12968,68288,513131.896736,374778.0,2.097126e+05,2.102051e+05,3.299276e+05,436903.931560
5870,233647,513131.896736,374778.0,4.618026e+05,4.612739e+05,3.531319e+05,468498.357391
37565,74948,513131.896736,374778.0,4.217020e+05,4.220001e+05,3.183072e+05,503894.174164
47416,1649536,513131.896736,374778.0,1.357090e+06,1.355855e+06,1.374060e+06,693137.601760
...,...,...,...,...,...,...,...
10209,815861,513131.896736,374778.0,8.282406e+05,8.283105e+05,7.656415e+05,557791.638324
24383,874267,513131.896736,374778.0,1.336844e+06,1.336381e+06,1.301868e+06,725573.409143
24870,225466,513131.896736,374778.0,3.627495e+05,3.623269e+05,3.096119e+05,477066.564127
2677,343000,513131.896736,374778.0,1.542520e+05,1.539970e+05,1.931964e+05,328183.946130


In [43]:
twr_rmse = {
    'model':'TweedieRegressor regression',
    'RMSE_train': mean_squared_error(y_train['taxvaluedollarcnt'], y_train['twr_train_pred'])**(0.5),
    'RMSE_validate': mean_squared_error(y_validate['taxvaluedollarcnt'], y_validate['twr_validate_pred'])**(0.5)

}
results.append(twr_rmse)
pd.DataFrame(results)

Unnamed: 0,model,RMSE_train,RMSE_validate
0,baseline_mean,596704.058777,621286.160927
1,baseline_median,612533.700708,636062.789038
2,linear regression,472280.911553,489893.568821
3,LassoLars regression,472281.284547,489905.973584
4,Polynomial regression,450927.291425,468084.675542
5,TweedieRegressor regression,581166.179948,605838.269172


In [44]:
results_df = pd.DataFrame(results)
results_df.sort_values(['RMSE_train', 'RMSE_validate'])

Unnamed: 0,model,RMSE_train,RMSE_validate
4,Polynomial regression,450927.291425,468084.675542
2,linear regression,472280.911553,489893.568821
3,LassoLars regression,472281.284547,489905.973584
5,TweedieRegressor regression,581166.179948,605838.269172
0,baseline_mean,596704.058777,621286.160927
1,baseline_median,612533.700708,636062.789038
