In [1]:
# Ignore warnings
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
from sklearn.feature_selection import RFE
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

import acquire
import prepare

In [2]:
pd.options.display.float_format = '{:,.2f}'.format

In [3]:
rawdf = acquire.get_zillow_data()

In [4]:
rawdf.head()

Unnamed: 0,longitude,latitude,bedrooms,bathrooms,square_feet,lot_size,has_pool,fips_code,age,assessed_value,tax_amount
0,-118740133,34251502,4.0,2.0,1323.0,6825.0,,6111,53.0,453000.0,5235.32
1,-118217048,33798657,4.0,2.0,1339.0,6107.0,,6037,40.0,298371.0,3714.58
2,-118531253,34039467,2.0,2.0,1231.0,11837.0,,6037,67.0,161897.0,2031.39
3,-117613897,33663005,5.0,3.0,3008.0,6760.0,1.0,6059,22.0,472384.0,7234.74
4,-118393309,34227422,2.0,1.0,996.0,7518.0,,6037,77.0,58693.0,751.66


In [5]:
df = prepare.prep_zillow(rawdf)

In [6]:
df.shape

(47281, 23)

In [11]:
df.head()

Unnamed: 0,longitude,latitude,bedrooms,bathrooms,square_feet,lot_size,has_pool,fips_code,age,assessed_value,...,bathrooms_size,county_Los Angeles,county_Orange,county_Ventura,bedrooms_size_small,bedrooms_size_medium,bedrooms_size_large,bathrooms_size_small,bathrooms_size_medium,bathrooms_size_large
0,-118740133,34251502,4.0,2.0,1323.0,6825.0,0.0,6111,53,453000.0,...,small,0,0,1,0,1,0,1,0,0
1,-118217048,33798657,4.0,2.0,1339.0,6107.0,0.0,6037,40,298371.0,...,small,1,0,0,0,1,0,1,0,0
2,-118531253,34039467,2.0,2.0,1231.0,11837.0,0.0,6037,67,161897.0,...,small,1,0,0,1,0,0,1,0,0
3,-117613897,33663005,5.0,3.0,3008.0,6760.0,1.0,6059,22,472384.0,...,medium,0,1,0,0,0,1,0,1,0
6,-118689128,34284379,3.0,2.5,1666.0,3301.0,0.0,6111,20,362552.0,...,small,0,0,1,0,1,0,1,0,0


## Simple Model
$$ y = mx + b $$

$$y = f(x)$$

In [8]:
scaled_df = df.copy()

In [9]:
scaler = MinMaxScaler()

In [10]:
scaler.fit(scaled_df[['square_feet', 'lot_size', 'age']])

MinMaxScaler()

In [11]:
scaled_df[['square_feet', 'lot_size', 'age']] = scaler.transform(scaled_df[['square_feet', 'lot_size', 'age']])

In [12]:
scaled_df.head()

Unnamed: 0,longitude,latitude,bedrooms,bathrooms,square_feet,lot_size,has_pool,fips_code,age,assessed_value,...,bathrooms_size,county_Los Angeles,county_Orange,county_Ventura,bedrooms_size_small,bedrooms_size_medium,bedrooms_size_large,bathrooms_size_small,bathrooms_size_medium,bathrooms_size_large
0,-118740133,34251502,4.0,2.0,0.13,0.05,0.0,6111,0.38,453000.0,...,small,0,0,1,0,1,0,1,0,0
1,-118217048,33798657,4.0,2.0,0.13,0.04,0.0,6037,0.28,298371.0,...,small,1,0,0,0,1,0,1,0,0
2,-118531253,34039467,2.0,2.0,0.12,0.09,0.0,6037,0.48,161897.0,...,small,1,0,0,1,0,0,1,0,0
3,-117613897,33663005,5.0,3.0,0.4,0.05,1.0,6059,0.15,472384.0,...,medium,0,1,0,0,0,1,0,1,0
4,-118393309,34227422,2.0,1.0,0.08,0.06,0.0,6037,0.55,58693.0,...,small,1,0,0,1,0,0,1,0,0


In [13]:
s_train, s_validate, s_test = prepare.split(scaled_df)

In [14]:
pd.set_option('display.max_columns', None)

In [17]:
cols = ['square_feet', 'lot_size', 'has_pool', 'age', 'county_Los Angeles', 'county_Orange', 'bedrooms', 'bathrooms']

X_train = s_train[cols]
y_train = s_train.assessed_value

X_validate = s_validate[cols]
y_validate = s_validate.assessed_value

X_test = s_test[cols]
y_test = s_test.assessed_value

In [21]:
train_predictions = pd.DataFrame({
    'actual': s_train.assessed_value
}) 
validate_predictions = pd.DataFrame({
    'actual': s_validate.assessed_value
}) 

## Multiple  Regression + RFE

In [19]:
from sklearn.linear_model import LinearRegression
lm = LinearRegression()
k = 6

# 1. Transform our X
rfe = RFE(lm, n_features_to_select=5)
rfe.fit(X_train, y_train)
print('selected top 8 features:', X_train.columns[rfe.support_])
X_train_rfe = rfe.transform(X_train)

selected top 8 features: Index(['square_feet', 'lot_size', 'age', 'county_Los Angeles', 'bedrooms'], dtype='object')


### Looping through k

In [22]:
for k in range(2,8):
    lm = LinearRegression()
    # 1. Transform our X
    rfe = RFE(lm, n_features_to_select=k)
    rfe.fit(X_train, y_train)
    # 2. Use the transformed x in our model
    X_train_rfe = rfe.transform(X_train)
    X_validate_rfe = rfe.transform(X_validate)
    lm.fit(X_train_rfe, y_train)
    
    # Make predictions
    train_predictions['multiple_rfe_k=', k] = lm.predict(X_train_rfe)
    validate_predictions['multiple_rfe_k=', k] = lm.predict(X_validate_rfe)
    k+=1

In [23]:
train_predictions.head()

Unnamed: 0,actual,"(multiple_rfe_k=, 2)","(multiple_rfe_k=, 3)","(multiple_rfe_k=, 4)","(multiple_rfe_k=, 5)","(multiple_rfe_k=, 6)","(multiple_rfe_k=, 7)"
27797,339675.0,427543.61,457229.19,429248.28,421741.62,417489.23,411993.11
18599,913000.0,562177.81,592701.08,682423.31,680160.04,669224.02,658384.19
512,543000.0,170947.34,161485.15,223216.51,230087.86,244580.55,244224.0
18002,547000.0,348856.82,382724.94,387402.68,384078.6,383429.19,379395.83
44818,397288.0,330636.74,365378.25,366886.52,367149.54,373503.19,404321.63


In [24]:
train_predictions['baseline'] = y_train.mean()

In [25]:
def calculate_rmse(y_predicted):
    return mean_squared_error(train_predictions.actual, y_predicted, squared = False)

train_predictions.apply(calculate_rmse).sort_values()

actual                       0.00
(multiple_rfe_k=, 7)   232,439.10
(multiple_rfe_k=, 6)   232,815.36
(multiple_rfe_k=, 5)   233,566.15
(multiple_rfe_k=, 4)   233,620.68
(multiple_rfe_k=, 3)   235,668.09
(multiple_rfe_k=, 2)   236,841.47
baseline               274,767.86
dtype: float64

In [27]:
validate_predictions['baseline'] = y_validate.mean()

In [28]:
validate_predictions.head()

Unnamed: 0,actual,"(multiple_rfe_k=, 2)","(multiple_rfe_k=, 3)","(multiple_rfe_k=, 4)","(multiple_rfe_k=, 5)","(multiple_rfe_k=, 6)","(multiple_rfe_k=, 7)",baseline
46740,629682.0,321471.41,304925.54,346627.95,339405.72,347812.14,346171.45,406783.72
18140,187026.0,469130.6,498559.38,478239.42,481696.22,481770.43,472452.04,406783.72
1508,768308.0,819603.59,786081.38,777394.88,779554.62,732972.62,749730.97,406783.72
7139,188466.0,403396.11,384394.31,394463.66,386548.95,414469.99,409149.96,406783.72
30114,73980.0,262122.37,248808.16,280281.54,285707.84,276369.4,273728.21,406783.72


In [29]:
def calculate_rmse(y_predicted):
    return mean_squared_error(validate_predictions.actual, y_predicted, squared = False)

validate_predictions.apply(calculate_rmse).sort_values()

actual                       0.00
(multiple_rfe_k=, 7)   231,828.39
(multiple_rfe_k=, 6)   232,008.57
(multiple_rfe_k=, 5)   232,843.79
(multiple_rfe_k=, 4)   233,020.30
(multiple_rfe_k=, 3)   234,724.57
(multiple_rfe_k=, 2)   236,085.70
baseline               273,944.34
dtype: float64

In [30]:
# k=7 is the best

231828-232439

-611

**K=7 has the lowest rmse, highest performance**

## Polynomial Features

In [31]:
train_pred = pd.DataFrame({
    'actual': s_train.assessed_value
}) 
validate_pred = pd.DataFrame({
    'actual': s_validate.assessed_value
}) 

In [48]:
from sklearn.preprocessing import PolynomialFeatures

# 1. Generate Polynomial Features, k=2
poly = PolynomialFeatures(degree=2, include_bias=False, interaction_only=False)
poly.fit(X_train)
X_train_poly = pd.DataFrame(
    poly.transform(X_train),
    columns=poly.get_feature_names(X_train.columns),
    index=X_train.index,
)

In [49]:
lm = LinearRegression()
lm.fit(X_train_poly, y_train)
X_validate_poly = poly.transform(X_validate)
train_pred['baseline'] = y_train.mean()
validate_pred['baseline'] = y_validate.mean()
train_pred['polynomial degree 2'] = lm.predict(X_train_poly)
validate_pred['polynomial degree 2'] = lm.predict(X_validate_poly)
train_pred.head()

Unnamed: 0,actual,baseline,polynomial degree 2,polynomial degree 2 only interaction
27797,339675.0,407658.98,533287.61,537645.58
18599,913000.0,407658.98,636001.73,665864.73
512,543000.0,407658.98,337269.36,352657.34
18002,547000.0,407658.98,410062.86,410121.81
44818,397288.0,407658.98,373780.36,380158.07


In [35]:
validate_pred.head()

Unnamed: 0,actual,baseline,polynomial degree 2
46740,629682.0,406783.72,287334.63
18140,187026.0,406783.72,430215.98
1508,768308.0,406783.72,696422.9
7139,188466.0,406783.72,373457.15
30114,73980.0,406783.72,334905.86


In [40]:
train_rmse = mean_squared_error(train_pred.actual,train_pred['polynomial degree 2'], squared = False)
validate_rmse = mean_squared_error(validate_pred.actual,validate_pred['polynomial degree 2'], squared = False)

In [38]:
train_rmse_b = mean_squared_error(train_pred.actual,train_pred['baseline'], squared = False)
validate_rmse_b = mean_squared_error(validate_pred.actual,validate_pred['baseline'], squared = False)

In [41]:
train_rmse, validate_rmse

(225563.1468062608, 224965.8539400748)

In [42]:
validate_rmse-train_rmse

-597.292866186006

In [39]:
train_rmse_b, validate_rmse_b

(274767.8629314713, 273944.3392279506)

In [50]:
# Interaction terms only

poly = PolynomialFeatures(degree=2, include_bias=False, interaction_only=True)
poly.fit(X_train)
X_train_poly = pd.DataFrame(
    poly.transform(X_train),
    columns=poly.get_feature_names(X_train.columns),
    index=X_train.index,
)
lm = LinearRegression()
lm.fit(X_train_poly, y_train)

X_validate_poly = poly.transform(X_validate)
train_pred['polynomial degree 2 only interaction'] = lm.predict(X_train_poly)
validate_pred['polynomial degree 2 only interaction'] = lm.predict(X_validate_poly)
train_pred.head()

Unnamed: 0,actual,baseline,polynomial degree 2,polynomial degree 2 only interaction
27797,339675.0,407658.98,533287.61,537645.58
18599,913000.0,407658.98,636001.73,665864.73
512,543000.0,407658.98,337269.36,352657.34
18002,547000.0,407658.98,410062.86,410121.81
44818,397288.0,407658.98,373780.36,380158.07


In [51]:
train_rmse = mean_squared_error(train_pred.actual,train_pred['polynomial degree 2 only interaction'], squared = False)
validate_rmse = mean_squared_error(validate_pred.actual,validate_pred['polynomial degree 2 only interaction'], squared = False)
train_rmse, validate_rmse

(225829.11823267405, 225167.73661349496)

In [52]:
validate_rmse-train_rmse

-661.3816191790975

In [53]:
# k=3
poly = PolynomialFeatures(degree=3, include_bias=False, interaction_only=False)
poly.fit(X_train)
X_train_poly = pd.DataFrame(
    poly.transform(X_train),
    columns=poly.get_feature_names(X_train.columns),
    index=X_train.index,
)

In [54]:
lm = LinearRegression()
lm.fit(X_train_poly, y_train)
X_validate_poly = poly.transform(X_validate)
train_pred['baseline'] = y_train.mean()
validate_pred['baseline'] = y_validate.mean()
train_pred['polynomial degree 3'] = lm.predict(X_train_poly)
validate_pred['polynomial degree 3'] = lm.predict(X_validate_poly)
train_pred.head()

Unnamed: 0,actual,baseline,polynomial degree 2,polynomial degree 2 only interaction,polynomial degree 3
27797,339675.0,407658.98,533287.61,537645.58,582286.17
18599,913000.0,407658.98,636001.73,665864.73,638633.23
512,543000.0,407658.98,337269.36,352657.34,271184.3
18002,547000.0,407658.98,410062.86,410121.81,383339.48
44818,397288.0,407658.98,373780.36,380158.07,362223.42


In [55]:
train_rmse = mean_squared_error(train_pred.actual,train_pred['polynomial degree 3'], squared = False)
validate_rmse = mean_squared_error(validate_pred.actual,validate_pred['polynomial degree 3'], squared = False)
train_rmse, validate_rmse

(222833.2843929147, 223820.32945259503)

In [56]:
# k=4
poly = PolynomialFeatures(degree=4, include_bias=False, interaction_only=False)
poly.fit(X_train)
X_train_poly = pd.DataFrame(
    poly.transform(X_train),
    columns=poly.get_feature_names(X_train.columns),
    index=X_train.index,
)

In [57]:
lm = LinearRegression()
lm.fit(X_train_poly, y_train)
X_validate_poly = poly.transform(X_validate)
train_pred['baseline'] = y_train.mean()
validate_pred['baseline'] = y_validate.mean()
train_pred['polynomial degree 4'] = lm.predict(X_train_poly)
validate_pred['polynomial degree 4'] = lm.predict(X_validate_poly)
train_pred.head()

Unnamed: 0,actual,baseline,polynomial degree 2,polynomial degree 2 only interaction,polynomial degree 3,polynomial degree 4
27797,339675.0,407658.98,533287.61,537645.58,582286.17,583336.55
18599,913000.0,407658.98,636001.73,665864.73,638633.23,719444.55
512,543000.0,407658.98,337269.36,352657.34,271184.3,189598.25
18002,547000.0,407658.98,410062.86,410121.81,383339.48,386924.89
44818,397288.0,407658.98,373780.36,380158.07,362223.42,391558.28


In [58]:
train_rmse = mean_squared_error(train_pred.actual,train_pred['polynomial degree 4'], squared = False)
validate_rmse = mean_squared_error(validate_pred.actual,validate_pred['polynomial degree 4'], squared = False)
train_rmse, validate_rmse

(220599.9432621597, 228423.90903466812)

**K=3 has the lowest rmse**

## Lasso-Lars

In [59]:
from sklearn.linear_model import LassoLars
# create the model object
lars = LassoLars(alpha=0)

# fit the model to our training data
lars.fit(X_train, y_train)

# predict validate
X_train_pred_lars = lars.predict(X_train)

pd.Series(lars.coef_, index=X_train.columns).sort_values()

lot_size              -156,273.52
bedrooms               -52,310.25
county_Los Angeles     -50,624.11
county_Ventura               0.00
county_Orange           12,648.51
has_pool                34,981.79
bathrooms               40,450.36
age                     71,769.57
square_feet          1,166,547.13
dtype: float64

In [60]:
# create the model object
lars = LassoLars(alpha=1)

# fit the model to our training data
lars.fit(X_train, y_train)

# predict validate
X_train_pred_lars = lars.predict(X_train)
X_validate_pred_lars = lars.predict(X_validate)
# Add lassolars predictions to our predictions DataFrame
train_pred['lasso_lars'] = X_train_pred_lars
validate_pred['lasso_lars'] = X_validate_pred_lars

In [61]:
train_rmse = mean_squared_error(train_pred.actual,train_pred['lasso_lars'], squared = False)
validate_rmse = mean_squared_error(validate_pred.actual,validate_pred['lasso_lars'], squared = False)
train_rmse, validate_rmse

(232417.49582058165, 231827.48777021127)

In [62]:
validate_rmse-train_rmse

-590.0080503703794

## Generalized Linear Model

In [63]:
from sklearn.linear_model import TweedieRegressor

# create the model object
glm = TweedieRegressor(power=1, alpha=0)

# fit the model to our training data
glm.fit(X_train, y_train)

# predict train
X_train_predict_glm = glm.predict(X_train)
X_validate_predict_glm = glm.predict(X_validate)
# Add lassolars predictions to our predictions DataFrame
train_pred['glm'] = X_train_predict_glm
validate_pred['glm'] = X_validate_predict_glm

In [64]:
train_rmse = mean_squared_error(train_pred.actual,train_pred['glm'], squared = False)
validate_rmse = mean_squared_error(validate_pred.actual,validate_pred['glm'], squared = False)
train_rmse, validate_rmse

(235324.13823328284, 234667.38804766495)

In [65]:
validate_rmse-train_rmse

-656.750185617886

## Evaluation