In [1]:
# Ignore warnings
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
from sklearn.feature_selection import RFE
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

import acquire
import prepare

In [2]:
rawdf = acquire.get_zillow_data()

In [3]:
df = prepare.prep_zillow(rawdf)

In [4]:
df.head()

Unnamed: 0,longitude,latitude,bedrooms,bathrooms,square_feet,lot_size,has_pool,fips_code,age,assessed_value,...,bathrooms_size,county_Los Angeles,county_Orange,county_Ventura,bedrooms_size_small,bedrooms_size_medium,bedrooms_size_large,bathrooms_size_small,bathrooms_size_medium,bathrooms_size_large
0,-118740133,34251502,4.0,2.0,1323.0,6825.0,0.0,6111,53,453000.0,...,small,0,0,1,0,1,0,1,0,0
1,-118217048,33798657,4.0,2.0,1339.0,6107.0,0.0,6037,40,298371.0,...,small,1,0,0,0,1,0,1,0,0
2,-118531253,34039467,2.0,2.0,1231.0,11837.0,0.0,6037,67,161897.0,...,small,1,0,0,1,0,0,1,0,0
3,-117613897,33663005,5.0,3.0,3008.0,6760.0,1.0,6059,22,472384.0,...,medium,0,1,0,0,0,1,0,1,0
4,-118393309,34227422,2.0,1.0,996.0,7518.0,0.0,6037,77,58693.0,...,small,1,0,0,1,0,0,1,0,0


In [5]:
train, validate, test = prepare.split(df)

In [6]:
train.head()

Unnamed: 0,longitude,latitude,bedrooms,bathrooms,square_feet,lot_size,has_pool,fips_code,age,assessed_value,...,bathrooms_size,county_Los Angeles,county_Orange,county_Ventura,bedrooms_size_small,bedrooms_size_medium,bedrooms_size_large,bathrooms_size_small,bathrooms_size_medium,bathrooms_size_large
37570,-117621344,33597035,4.0,3.0,2391.0,4620.0,0.0,6059,22,115324.0,...,medium,0,1,0,0,1,0,0,1,0
34758,-118079657,34551890,3.0,2.0,1296.0,6307.0,0.0,6037,35,100794.0,...,small,1,0,0,0,1,0,1,0,0
17145,-118064295,34017797,3.0,2.0,1341.0,4972.0,0.0,6037,60,330792.0,...,small,1,0,0,0,1,0,1,0,0
37725,-118663461,34267619,3.0,2.0,1531.0,7105.0,1.0,6111,38,298084.0,...,small,0,0,1,0,1,0,1,0,0
22413,-118633552,34507840,5.0,3.0,2477.0,5991.0,1.0,6037,19,420256.0,...,medium,1,0,0,0,0,1,0,1,0


In [66]:
predictions = pd.DataFrame({
    'actual': s_train.assessed_value
}) 

In [67]:
predictions.head()

Unnamed: 0,actual
37570,115324.0
34758,100794.0
17145,330792.0
37725,298084.0
22413,420256.0


## Simple Model
$$ y = mx + b $$

$$y = f(x)$$

In [18]:
scaled_df.dtypes

longitude                  object
latitude                   object
bedrooms                  float64
bathrooms                 float64
square_feet               float64
lot_size                  float64
has_pool                  float64
fips_code                  object
age                         int64
assessed_value            float64
tax_amount                float64
county                     object
bedrooms_size            category
bathrooms_size           category
county_Los Angeles          uint8
county_Orange               uint8
county_Ventura              uint8
bedrooms_size_small         uint8
bedrooms_size_medium        uint8
bedrooms_size_large         uint8
bathrooms_size_small        uint8
bathrooms_size_medium       uint8
bathrooms_size_large        uint8
dtype: object

In [12]:
scaled_df = df.copy()

In [14]:
scaler = MinMaxScaler()

In [15]:
scaler.fit(scaled_df[['square_feet', 'lot_size', 'age']])

MinMaxScaler()

In [19]:
scaled_df[['square_feet', 'lot_size', 'age']] = scaler.transform(scaled_df[['square_feet', 'lot_size', 'age']])

In [23]:
scaled_df.head()

Unnamed: 0,longitude,latitude,bedrooms,bathrooms,square_feet,lot_size,has_pool,fips_code,age,assessed_value,tax_amount,county,bedrooms_size,bathrooms_size,county_Los Angeles,county_Orange,county_Ventura,bedrooms_size_small,bedrooms_size_medium,bedrooms_size_large,bathrooms_size_small,bathrooms_size_medium,bathrooms_size_large
0,-118740133,34251502,4.0,2.0,0.113282,0.040121,0.0,6111,0.376812,453000.0,5235.32,Ventura,medium,small,0,0,1,0,1,0,1,0,0
1,-118217048,33798657,4.0,2.0,0.115469,0.03534,0.0,6037,0.282609,298371.0,3714.58,Los Angeles,medium,small,1,0,0,0,1,0,1,0,0
2,-118531253,34039467,2.0,2.0,0.100711,0.073497,0.0,6037,0.478261,161897.0,2031.39,Los Angeles,small,small,1,0,0,1,0,0,1,0,0
3,-117613897,33663005,5.0,3.0,0.343536,0.039688,1.0,6059,0.152174,472384.0,7234.74,Orange,large,medium,0,1,0,0,0,1,0,1,0
4,-118393309,34227422,2.0,1.0,0.068598,0.044736,0.0,6037,0.550725,58693.0,751.66,Los Angeles,small,small,1,0,0,1,0,0,1,0,0


In [21]:
s_train, s_validate, s_test = prepare.split(scaled_df)

In [22]:
pd.set_option('display.max_columns', None)

In [24]:
cols = ['square_feet', 'lot_size', 'has_pool', 'age', 'county_Los Angeles', 'county_Orange', 'county_Ventura', 'bedrooms_size_small', 'bedrooms_size_medium', 'bathrooms_size_small', 'bathrooms_size_medium']

X_train = s_train[cols]
y_train = s_train.assessed_value

X_validate = s_validate[cols]
y_validate = s_validate.assessed_value

X_test = s_test[cols]
y_test = s_test.assessed_value

## Multiple  Regression + RFE

In [68]:
from sklearn.linear_model import LinearRegression
lm = LinearRegression()
k = 2

# 1. Transform our X
rfe = RFE(lm, n_features_to_select=2)
rfe.fit(X_train, y_train)
print('selected top 2 features:', X_train.columns[rfe.support_])
X_train_rfe = rfe.transform(X_train)

selected top 2 features: Index(['square_feet', 'bathrooms_size_small'], dtype='object')


In [69]:
# 2. Use the transformed x in our model
lm.fit(X_train_rfe, y_train)

# 3. Make predictions
X_train_rfe = rfe.transform(X_train)
predictions['multiple_rfe'] = lm.predict(X_train_rfe)

predictions.head()

Unnamed: 0,actual,multiple_rfe
37570,115324.0,605725.122674
34758,100794.0,312572.36194
17145,330792.0,324629.193518
37725,298084.0,375535.815732
22413,420256.0,628767.067466


## Polynomial Features

In [70]:
from sklearn.preprocessing import PolynomialFeatures

# 1. Generate Polynomial Features
poly = PolynomialFeatures(degree=2, include_bias=False, interaction_only=False)
poly.fit(X_train)
X_train_poly = pd.DataFrame(
    poly.transform(X_train),
    columns=poly.get_feature_names(X_train.columns),
    index=train.index,
)
X_train_poly.head()

Unnamed: 0,square_feet,lot_size,has_pool,age,county_Los Angeles,county_Orange,county_Ventura,bedrooms_size_small,bedrooms_size_medium,bathrooms_size_small,bathrooms_size_medium,square_feet^2,square_feet lot_size,square_feet has_pool,square_feet age,square_feet county_Los Angeles,square_feet county_Orange,square_feet county_Ventura,square_feet bedrooms_size_small,square_feet bedrooms_size_medium,square_feet bathrooms_size_small,square_feet bathrooms_size_medium,lot_size^2,lot_size has_pool,lot_size age,lot_size county_Los Angeles,lot_size county_Orange,lot_size county_Ventura,lot_size bedrooms_size_small,lot_size bedrooms_size_medium,lot_size bathrooms_size_small,lot_size bathrooms_size_medium,has_pool^2,has_pool age,has_pool county_Los Angeles,has_pool county_Orange,has_pool county_Ventura,has_pool bedrooms_size_small,has_pool bedrooms_size_medium,has_pool bathrooms_size_small,has_pool bathrooms_size_medium,age^2,age county_Los Angeles,age county_Orange,age county_Ventura,age bedrooms_size_small,age bedrooms_size_medium,age bathrooms_size_small,age bathrooms_size_medium,county_Los Angeles^2,county_Los Angeles county_Orange,county_Los Angeles county_Ventura,county_Los Angeles bedrooms_size_small,county_Los Angeles bedrooms_size_medium,county_Los Angeles bathrooms_size_small,county_Los Angeles bathrooms_size_medium,county_Orange^2,county_Orange county_Ventura,county_Orange bedrooms_size_small,county_Orange bedrooms_size_medium,county_Orange bathrooms_size_small,county_Orange bathrooms_size_medium,county_Ventura^2,county_Ventura bedrooms_size_small,county_Ventura bedrooms_size_medium,county_Ventura bathrooms_size_small,county_Ventura bathrooms_size_medium,bedrooms_size_small^2,bedrooms_size_small bedrooms_size_medium,bedrooms_size_small bathrooms_size_small,bedrooms_size_small bathrooms_size_medium,bedrooms_size_medium^2,bedrooms_size_medium bathrooms_size_small,bedrooms_size_medium bathrooms_size_medium,bathrooms_size_small^2,bathrooms_size_small bathrooms_size_medium,bathrooms_size_medium^2
37570,0.259224,0.025438,0.0,0.152174,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.067197,0.006594,0.0,0.039447,0.0,0.259224,0.0,0.0,0.259224,0.0,0.259224,0.000647,0.0,0.003871,0.0,0.025438,0.0,0.0,0.025438,0.0,0.025438,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.023157,0.0,0.152174,0.0,0.0,0.152174,0.0,0.152174,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0
34758,0.109593,0.036672,0.0,0.246377,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.012011,0.004019,0.0,0.027001,0.109593,0.0,0.0,0.0,0.109593,0.109593,0.0,0.001345,0.0,0.009035,0.036672,0.0,0.0,0.0,0.036672,0.036672,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.060702,0.246377,0.0,0.0,0.0,0.246377,0.246377,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0
17145,0.115742,0.027782,0.0,0.427536,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.013396,0.003216,0.0,0.049484,0.115742,0.0,0.0,0.0,0.115742,0.115742,0.0,0.000772,0.0,0.011878,0.027782,0.0,0.0,0.0,0.027782,0.027782,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.182787,0.427536,0.0,0.0,0.0,0.427536,0.427536,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0
37725,0.141705,0.041986,1.0,0.268116,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.02008,0.00595,0.141705,0.037993,0.0,0.0,0.141705,0.0,0.141705,0.141705,0.0,0.001763,0.041986,0.011257,0.0,0.0,0.041986,0.0,0.041986,0.041986,0.0,1.0,0.268116,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.071886,0.0,0.0,0.268116,0.0,0.268116,0.268116,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0
22413,0.270976,0.034567,1.0,0.130435,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.073428,0.009367,0.270976,0.035345,0.270976,0.0,0.0,0.0,0.0,0.0,0.270976,0.001195,0.034567,0.004509,0.034567,0.0,0.0,0.0,0.0,0.0,0.034567,1.0,0.130435,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.017013,0.130435,0.0,0.0,0.0,0.0,0.0,0.130435,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [58]:
X_train_poly.shape, y_train.shape, predictions.shape

((27794, 77), (27794,), (27794, 2))

In [71]:
lm = LinearRegression()
lm.fit(X_train_poly, y_train)

predictions['polynomial degree 2'] = lm.predict(X_train_poly)
predictions.head()

Unnamed: 0,actual,multiple_rfe,polynomial degree 2
37570,115324.0,605725.122674,676796.611514
34758,100794.0,312572.36194,199958.057135
17145,330792.0,324629.193518,293606.057135
37725,298084.0,375535.815732,400529.362428
22413,420256.0,628767.067466,398322.057135


In [72]:
feature_names = poly.get_feature_names(X_train.columns)
pd.Series(lm.coef_, index=feature_names).sort_values()

square_feet county_Ventura                  -1.908391e+17
square_feet county_Orange                   -1.908391e+17
square_feet county_Los Angeles              -1.908391e+17
square_feet age                             -1.799032e+06
square_feet lot_size                        -1.495580e+06
                                                 ...     
square_feet bathrooms_size_small             4.954320e+05
square_feet^2                                5.378560e+05
bedrooms_size_small bathrooms_size_medium    5.395382e+05
square_feet bathrooms_size_medium            8.284800e+05
square_feet                                  1.908391e+17
Length: 77, dtype: float64

In [73]:
# Interaction terms only

poly = PolynomialFeatures(degree=2, include_bias=False, interaction_only=True)
poly.fit(X_train)
X_train_poly = pd.DataFrame(
    poly.transform(X_train),
    columns=poly.get_feature_names(X_train.columns),
    index=train.index,
)
lm = LinearRegression()
lm.fit(X_train_poly, y_train)

X_validate_poly = poly.transform(X_validate)
predictions['polynomial only interaction'] = lm.predict(X_train_poly)

predictions.head()

Unnamed: 0,actual,multiple_rfe,polynomial degree 2,polynomial only interaction
37570,115324.0,605725.122674,676796.611514,679182.057135
34758,100794.0,312572.36194,199958.057135,194478.057135
17145,330792.0,324629.193518,293606.057135,294674.057135
37725,298084.0,375535.815732,400529.362428,399509.330899
22413,420256.0,628767.067466,398322.057135,380608.057135


In [74]:
pd.Series(lm.coef_, index=poly.get_feature_names(X_train.columns)).sort_values()

square_feet county_Ventura                  -3.958965e+16
square_feet county_Orange                   -3.958965e+16
square_feet county_Los Angeles              -3.958965e+16
square_feet age                             -1.936590e+06
square_feet lot_size                        -1.095710e+06
                                                 ...     
bedrooms_size_small bathrooms_size_small     4.422563e+05
bedrooms_size_small bathrooms_size_medium    5.022850e+05
age county_Los Angeles                       5.399915e+05
square_feet bathrooms_size_medium            6.540320e+05
square_feet                                  3.958965e+16
Length: 66, dtype: float64

## Lasso-Lars

In [76]:
from sklearn.linear_model import LassoLars
# create the model object
lars = LassoLars(alpha=0)

# fit the model to our training data
lars.fit(X_train, y_train)

# predict validate
X_train_pred_lars = lars.predict(X_train)

pd.Series(lars.coef_, index=X_train.columns).sort_values()

bathrooms_size_small    -2.801133e+05
bathrooms_size_medium   -2.495691e+05
lot_size                -2.115608e+05
county_Los Angeles      -1.494747e+04
county_Ventura           0.000000e+00
has_pool                 3.009361e+04
county_Orange            3.814465e+04
bedrooms_size_medium     1.008212e+05
age                      1.659780e+05
bedrooms_size_small      1.919951e+05
square_feet              2.052581e+06
dtype: float64

In [77]:
# create the model object
lars = LassoLars(alpha=1)

# fit the model to our training data
lars.fit(X_train, y_train)

# predict validate
X_train_pred_lars = lars.predict(X_train)

# Add lassolars predictions to our predictions DataFrame
predictions['lasso_lars'] = X_train_pred_lars

## Generalized Linear Model

In [79]:
from sklearn.linear_model import TweedieRegressor

# create the model object
glm = TweedieRegressor(power=1, alpha=0)

# fit the model to our training data
glm.fit(X_train, y_train)

# predict train
X_train_predict_glm = glm.predict(X_train)

# Add lassolars predictions to our predictions DataFrame
predictions['glm'] = X_train_predict_glm

## Evaluation

In [80]:
# add a baseline model
predictions['baseline'] = s_train.assessed_value.mean()

In [82]:
pd.options.display.float_format = '{:,.2f}'.format

In [83]:
def calculate_mse(y_predicted):
    return mean_squared_error(predictions.actual, y_predicted)

predictions.apply(calculate_mse).sort_values()

actual                                      0.00
polynomial degree 2            92,063,785,766.33
polynomial only interaction    92,130,653,737.70
lasso_lars                     96,607,774,682.03
multiple_rfe                  100,215,052,954.96
glm                           101,736,778,230.44
baseline                      151,015,432,605.67
dtype: float64

In [84]:
def calculate_rmse(y_predicted):
    return mean_squared_error(predictions.actual, y_predicted, squared=False)

predictions.apply(calculate_rmse).sort_values()

actual                              0.00
polynomial degree 2           303,420.15
polynomial only interaction   303,530.32
lasso_lars                    310,817.91
multiple_rfe                  316,567.61
glm                           318,962.03
baseline                      388,607.04
dtype: float64