In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression, LassoLars, TweedieRegressor
from sklearn.preprocessing import PolynomialFeatures

import wrangle

In [2]:
train_scaled, validate_scaled, test_scaled = wrangle.wrangle_split_scale()

In [3]:
train_scaled.head(5)

Unnamed: 0.1,Unnamed: 0,bedroomcnt,bathroomcnt,square_feet,taxvaluedollarcnt,taxamount,yearbuilt,zipcode,fips_name,fips_name_Orange,fips_name_Ventura
35112,35112,0.4,0.181818,0.087946,32461,929.72,1926,95982.0,Los Angeles,0,0
12968,12968,0.0,0.0,0.022846,16714,422.74,1924,95984.0,Los Angeles,0,0
5870,5870,0.4,0.181818,0.096053,233647,2877.34,1953,97040.0,Orange,1,0
37565,37565,0.4,0.272727,0.153374,450261,4748.24,1995,96989.0,Orange,1,0
47416,47416,0.4,0.0,0.055028,165984,2333.17,1951,96213.0,Los Angeles,0,0


In [4]:
train_scaled.dtypes

Unnamed: 0             int64
bedroomcnt           float64
bathroomcnt          float64
square_feet          float64
taxvaluedollarcnt      int64
taxamount            float64
yearbuilt              int64
zipcode              float64
fips_name             object
fips_name_Orange       uint8
fips_name_Ventura      uint8
dtype: object

### turn it into our train,validate,test sets

In [5]:
x_train = train_scaled.drop(columns = ['taxvaluedollarcnt', 'fips_name'])
y_train = pd.DataFrame(train_scaled[['taxvaluedollarcnt']])

x_validate = validate_scaled.drop(columns = ['taxvaluedollarcnt', 'fips_name'])
y_validate = pd.DataFrame(validate_scaled[['taxvaluedollarcnt']])

x_test = train_scaled.drop(columns = ['taxvaluedollarcnt', 'fips_name'])
y_train = pd.DataFrame(train_scaled[['taxvaluedollarcnt']])

In [6]:
x_train

Unnamed: 0.1,Unnamed: 0,bedroomcnt,bathroomcnt,square_feet,taxamount,yearbuilt,zipcode,fips_name_Orange,fips_name_Ventura
35112,35112,0.4,0.181818,0.087946,929.72,1926,95982.0,0,0
12968,12968,0.0,0.000000,0.022846,422.74,1924,95984.0,0,0
5870,5870,0.4,0.181818,0.096053,2877.34,1953,97040.0,1,0
37565,37565,0.4,0.272727,0.153374,4748.24,1995,96989.0,1,0
47416,47416,0.4,0.000000,0.055028,2333.17,1951,96213.0,0,0
...,...,...,...,...,...,...,...,...,...
10209,10209,0.4,0.181818,0.092941,5404.62,1977,96941.0,1,0
24383,24383,0.4,0.181818,0.058631,2144.51,1991,97330.0,0,0
24870,24870,0.4,0.181818,0.072470,2287.93,1957,97318.0,0,0
2677,2677,0.2,0.000000,0.020144,4096.17,1927,96282.0,0,0


In [7]:
y_train

Unnamed: 0,taxvaluedollarcnt
35112,32461
12968,16714
5870,233647
37565,450261
47416,165984
...,...
10209,519967
24383,106101
24870,63451
2677,343000


In [8]:
y_train['taxvaluedollarcnt'].mean()

512484.22575762786

## Setting up the baseline:
For Regression, we want to decide if we need to use mean or median baseline, so lets look at both

In [9]:
#Create mean baseline model
y_train['baseline_mean_pred'] = y_train['taxvaluedollarcnt'].mean()
y_validate['baseline_mean_pred'] = y_validate['taxvaluedollarcnt'].mean()

#Create median baseline model
y_train['baseline_median_pred'] = y_train['taxvaluedollarcnt'].median()
y_validate['baseline_median_pred'] = y_validate['taxvaluedollarcnt'].median()

In [10]:
#determine RSMSE
results = []

baseline_mean = {
    'model':'baseline_mean',
    'RMSE_train': mean_squared_error(y_train['taxvaluedollarcnt'], y_train['baseline_mean_pred'])**(0.5),
    'RMSE_validate': mean_squared_error(y_validate['taxvaluedollarcnt'], y_validate['baseline_mean_pred'])**(0.5)

}
baseline_median = {
    'model':'baseline_median',
    'RMSE_train': mean_squared_error(y_train['taxvaluedollarcnt'], y_train['baseline_median_pred'])**(0.5),
    'RMSE_validate': mean_squared_error(y_validate['taxvaluedollarcnt'], y_validate['baseline_median_pred'])**(0.5)

}
results.append(baseline_mean)
results.append(baseline_median)

In [11]:
pd.DataFrame(results)

Unnamed: 0,model,RMSE_train,RMSE_validate
0,baseline_mean,612417.949481,607410.094119
1,baseline_median,628005.785271,623748.462807


# Linear Regression

In [12]:
lr = LinearRegression(normalize = True)

lr.fit(x_train, y_train['taxvaluedollarcnt'])

y_train['lr_train_pred'] = lr.predict(x_train)

y_validate['lr_validate_pred'] = lr.predict(x_validate)

y_train

Unnamed: 0,taxvaluedollarcnt,baseline_mean_pred,baseline_median_pred,lr_train_pred
35112,32461,512484.225758,373432.0,59355.907417
12968,16714,512484.225758,373432.0,19479.968396
5870,233647,512484.225758,373432.0,265331.998322
37565,450261,512484.225758,373432.0,402597.548383
47416,165984,512484.225758,373432.0,155765.933001
...,...,...,...,...
10209,519967,512484.225758,373432.0,464851.553955
24383,106101,512484.225758,373432.0,112210.120330
24870,63451,512484.225758,373432.0,151115.844107
2677,343000,512484.225758,373432.0,327211.615173


In [13]:
lr_rmse = {
    'model':'linear regression',
    'RMSE_train': mean_squared_error(y_train['taxvaluedollarcnt'], y_train['lr_train_pred'])**(0.5),
    'RMSE_validate': mean_squared_error(y_validate['taxvaluedollarcnt'], y_validate['lr_validate_pred'])**(0.5)

}
results.append(lr_rmse)
pd.DataFrame(results)

Unnamed: 0,model,RMSE_train,RMSE_validate
0,baseline_mean,612417.949481,607410.094119
1,baseline_median,628005.785271,623748.462807
2,linear regression,87994.261043,89416.290932


# LassoLars

In [14]:
lars = LassoLars(alpha=1.0)

lars.fit(x_train, y_train['taxvaluedollarcnt'])

y_train['lars_train_pred'] = lars.predict(x_train)

y_validate['lars_validate_pred'] = lars.predict(x_validate)

y_train

Unnamed: 0,taxvaluedollarcnt,baseline_mean_pred,baseline_median_pred,lr_train_pred,lars_train_pred
35112,32461,512484.225758,373432.0,59355.907417,58873.700397
12968,16714,512484.225758,373432.0,19479.968396,18634.166039
5870,233647,512484.225758,373432.0,265331.998322,264802.956581
37565,450261,512484.225758,373432.0,402597.548383,402419.898160
47416,165984,512484.225758,373432.0,155765.933001,156437.179781
...,...,...,...,...,...
10209,519967,512484.225758,373432.0,464851.553955,464846.796818
24383,106101,512484.225758,373432.0,112210.120330,113352.631960
24870,63451,512484.225758,373432.0,151115.844107,151457.019409
2677,343000,512484.225758,373432.0,327211.615173,327067.179902


In [15]:
lars_rmse = {
    'model':'LassoLars regression',
    'RMSE_train': mean_squared_error(y_train['taxvaluedollarcnt'], y_train['lars_train_pred'])**(0.5),
    'RMSE_validate': mean_squared_error(y_validate['taxvaluedollarcnt'], y_validate['lars_validate_pred'])**(0.5)

}
results.append(lars_rmse)
pd.DataFrame(results)

Unnamed: 0,model,RMSE_train,RMSE_validate
0,baseline_mean,612417.949481,607410.094119
1,baseline_median,628005.785271,623748.462807
2,linear regression,87994.261043,89416.290932
3,LassoLars regression,87996.818373,89408.084239


# Polynomial

In [16]:
# make the polynomial features to get a new set of features
pr = PolynomialFeatures(degree=2)

# fit and transform X_train_scaled
x_train_degree2 = pr.fit_transform(x_train)

# transform X_validate_scaled & X_test_scaled
x_validate_degree2 = pr.transform(x_validate)

In [17]:
# create the model object
lr2 = LinearRegression(normalize=True)

# fit the model to our training data. We must specify the column in y_train, 
# since we have converted it to a dataframe from a series! 
lr2.fit(x_train_degree2, y_train['taxvaluedollarcnt'])

# predict train
y_train['poly_train_pred'] = lr2.predict(x_train_degree2)

# predict validate
y_validate['poly_validate_pred'] = lr2.predict(x_validate_degree2)

In [18]:
y_train

Unnamed: 0,taxvaluedollarcnt,baseline_mean_pred,baseline_median_pred,lr_train_pred,lars_train_pred,poly_train_pred
35112,32461,512484.225758,373432.0,59355.907417,58873.700397,62605.669946
12968,16714,512484.225758,373432.0,19479.968396,18634.166039,15160.602550
5870,233647,512484.225758,373432.0,265331.998322,264802.956581,257161.419539
37565,450261,512484.225758,373432.0,402597.548383,402419.898160,389964.655074
47416,165984,512484.225758,373432.0,155765.933001,156437.179781,175122.339029
...,...,...,...,...,...,...
10209,519967,512484.225758,373432.0,464851.553955,464846.796818,463816.021935
24383,106101,512484.225758,373432.0,112210.120330,113352.631960,98726.092887
24870,63451,512484.225758,373432.0,151115.844107,151457.019409,153636.251397
2677,343000,512484.225758,373432.0,327211.615173,327067.179902,307099.498301


In [19]:
poly_rmse = {
    'model':'Polynomial regression',
    'RMSE_train': mean_squared_error(y_train['taxvaluedollarcnt'], y_train['poly_train_pred'])**(0.5),
    'RMSE_validate': mean_squared_error(y_validate['taxvaluedollarcnt'], y_validate['poly_validate_pred'])**(0.5)

}
results.append(poly_rmse)
pd.DataFrame(results)

Unnamed: 0,model,RMSE_train,RMSE_validate
0,baseline_mean,612417.949481,607410.094119
1,baseline_median,628005.785271,623748.462807
2,linear regression,87994.261043,89416.290932
3,LassoLars regression,87996.818373,89408.084239
4,Polynomial regression,84157.593951,86404.592871


# TweedieRegressor

In [20]:
twr = TweedieRegressor()

twr.fit(x_train, y_train['taxvaluedollarcnt'])

y_train['twr_train_pred'] = twr.predict(x_train)

y_validate['twr_validate_pred'] = twr.predict(x_validate)

y_train

Unnamed: 0,taxvaluedollarcnt,baseline_mean_pred,baseline_median_pred,lr_train_pred,lars_train_pred,poly_train_pred,twr_train_pred
35112,32461,512484.225758,373432.0,59355.907417,58873.700397,62605.669946,60203.047819
12968,16714,512484.225758,373432.0,19479.968396,18634.166039,15160.602550,16902.287597
5870,233647,512484.225758,373432.0,265331.998322,264802.956581,257161.419539,228989.640558
37565,450261,512484.225758,373432.0,402597.548383,402419.898160,389964.655074,379842.032503
47416,165984,512484.225758,373432.0,155765.933001,156437.179781,175122.339029,174914.947775
...,...,...,...,...,...,...,...
10209,519967,512484.225758,373432.0,464851.553955,464846.796818,463816.021935,441415.495612
24383,106101,512484.225758,373432.0,112210.120330,113352.631960,98726.092887,148047.108170
24870,63451,512484.225758,373432.0,151115.844107,151457.019409,153636.251397,169257.652269
2677,343000,512484.225758,373432.0,327211.615173,327067.179902,307099.498301,333865.806643


In [21]:
twr_rmse = {
    'model':'TweedieRegressor regression',
    'RMSE_train': mean_squared_error(y_train['taxvaluedollarcnt'], y_train['twr_train_pred'])**(0.5),
    'RMSE_validate': mean_squared_error(y_validate['taxvaluedollarcnt'], y_validate['twr_validate_pred'])**(0.5)

}
results.append(twr_rmse)
pd.DataFrame(results)

Unnamed: 0,model,RMSE_train,RMSE_validate
0,baseline_mean,612417.949481,607410.094119
1,baseline_median,628005.785271,623748.462807
2,linear regression,87994.261043,89416.290932
3,LassoLars regression,87996.818373,89408.084239
4,Polynomial regression,84157.593951,86404.592871
5,TweedieRegressor regression,91225.847442,93019.416726


In [22]:
results_df = pd.DataFrame(results)
results_df.sort_values(['RMSE_train', 'RMSE_validate'])

Unnamed: 0,model,RMSE_train,RMSE_validate
4,Polynomial regression,84157.593951,86404.592871
2,linear regression,87994.261043,89416.290932
3,LassoLars regression,87996.818373,89408.084239
5,TweedieRegressor regression,91225.847442,93019.416726
0,baseline_mean,612417.949481,607410.094119
1,baseline_median,628005.785271,623748.462807
