In [1]:
# Ignore warnings
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
from sklearn.feature_selection import RFE
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
import sklearn.preprocessing
import acquire
import prepare

In [2]:
pd.options.display.float_format = '{:,.2f}'.format

In [3]:
rawdf = acquire.get_zillow_data()

In [4]:
rawdf.shape

(52441, 11)

In [5]:
rawdf.head()

Unnamed: 0,longitude,latitude,bedrooms,bathrooms,square_feet,lot_size,has_pool,fips_code,age,assessed_value,tax_amount
0,-118740133,34251502,4.0,2.0,1323.0,6825.0,,6111,53.0,453000.0,5235.32
1,-118217048,33798657,4.0,2.0,1339.0,6107.0,,6037,40.0,298371.0,3714.58
2,-118531253,34039467,2.0,2.0,1231.0,11837.0,,6037,67.0,161897.0,2031.39
3,-117613897,33663005,5.0,3.0,3008.0,6760.0,1.0,6059,22.0,472384.0,7234.74
4,-118393309,34227422,2.0,1.0,996.0,7518.0,,6037,77.0,58693.0,751.66


In [6]:
df = prepare.prep_zillow(rawdf)

In [7]:
df.shape

(45324, 23)

In [8]:
df.head()

Unnamed: 0,longitude,latitude,bedrooms,bathrooms,square_feet,lot_size,has_pool,fips_code,age,assessed_value,...,bathrooms_size,county_Los Angeles,county_Orange,county_Ventura,bedrooms_size_small,bedrooms_size_medium,bedrooms_size_large,bathrooms_size_small,bathrooms_size_medium,bathrooms_size_large
0,-118740133,34251502,4.0,2.0,1323.0,6825.0,0.0,6111,53,453000.0,...,small,0,0,1,0,1,0,1,0,0
1,-118217048,33798657,4.0,2.0,1339.0,6107.0,0.0,6037,40,298371.0,...,small,1,0,0,0,1,0,1,0,0
2,-118531253,34039467,2.0,2.0,1231.0,11837.0,0.0,6037,67,161897.0,...,small,1,0,0,1,0,0,1,0,0
3,-117613897,33663005,5.0,3.0,3008.0,6760.0,1.0,6059,22,472384.0,...,medium,0,1,0,0,0,1,0,1,0
4,-118393309,34227422,2.0,1.0,996.0,7518.0,0.0,6037,77,58693.0,...,small,1,0,0,1,0,0,1,0,0


## Simple Model
$$ y = mx + b $$

$$y = f(x)$$

Tested out all scalers, and MinMaxScaler performs the best

In [9]:
scaled_df = df.copy()

In [10]:
scaler = MinMaxScaler()

In [None]:
# scaler = sklearn.preprocessing.RobustScaler()

In [None]:
# # Quantile scaler
# qt = sklearn.preprocessing.QuantileTransformer(n_quantiles=10, output_distribution='normal', random_state=0)

In [None]:
# qt.fit(scaled_df[['square_feet', 'lot_size', 'age', 'bedrooms', 'bathrooms']])

In [None]:
# scaled_df[['square_feet', 'lot_size', 'age', 'bedrooms', 'bathrooms']] = qt.transform(scaled_df[['square_feet', 'lot_size', 'age', 'bedrooms', 'bathrooms']])

In [11]:
scaler.fit(scaled_df[['square_feet', 'lot_size', 'age', 'bedrooms', 'bathrooms']])

MinMaxScaler()

In [12]:
scaled_df[['square_feet', 'lot_size', 'age', 'bedrooms', 'bathrooms']] = scaler.transform(scaled_df[['square_feet', 'lot_size', 'age', 'bedrooms', 'bathrooms']])

In [13]:
scaled_df.head()

Unnamed: 0,longitude,latitude,bedrooms,bathrooms,square_feet,lot_size,has_pool,fips_code,age,assessed_value,...,bathrooms_size,county_Los Angeles,county_Orange,county_Ventura,bedrooms_size_small,bedrooms_size_medium,bedrooms_size_large,bathrooms_size_small,bathrooms_size_medium,bathrooms_size_large
0,-118740133,34251502,0.6,0.2,0.19,0.06,0.0,6111,0.38,453000.0,...,small,0,0,1,0,1,0,1,0,0
1,-118217048,33798657,0.6,0.2,0.2,0.05,0.0,6037,0.28,298371.0,...,small,1,0,0,0,1,0,1,0,0
2,-118531253,34039467,0.2,0.2,0.17,0.11,0.0,6037,0.48,161897.0,...,small,1,0,0,1,0,0,1,0,0
3,-117613897,33663005,0.8,0.4,0.58,0.06,1.0,6059,0.15,472384.0,...,medium,0,1,0,0,0,1,0,1,0
4,-118393309,34227422,0.2,0.0,0.12,0.07,0.0,6037,0.55,58693.0,...,small,1,0,0,1,0,0,1,0,0


In [14]:
s_train, s_validate, s_test = prepare.split(scaled_df)

In [15]:
pd.set_option('display.max_columns', None)

In [16]:
cols = ['square_feet', 'lot_size', 'has_pool', 'age', 'county_Los Angeles', 'county_Orange', 'bedrooms', 'bathrooms']

X_train = s_train[cols]
y_train = s_train.assessed_value

X_validate = s_validate[cols]
y_validate = s_validate.assessed_value

X_test = s_test[cols]
y_test = s_test.assessed_value

In [17]:
train_predictions = pd.DataFrame({
    'actual': s_train.assessed_value
}) 
validate_predictions = pd.DataFrame({
    'actual': s_validate.assessed_value
}) 

In [18]:
final_train_predictions = pd.DataFrame({
    'actual': s_train.assessed_value
}) 
final_validate_predictions = pd.DataFrame({
    'actual': s_validate.assessed_value
}) 
final_test_predictions = pd.DataFrame({
    'actual': s_test.assessed_value
})

## Multiple  Regression + RFE

In [19]:
from sklearn.linear_model import LinearRegression
lm = LinearRegression()
k = 6

# 1. Transform our X
rfe = RFE(lm, n_features_to_select=6)
rfe.fit(X_train, y_train)
print('selected top 8 features:', X_train.columns[rfe.support_])
X_train_rfe = rfe.transform(X_train)

selected top 8 features: Index(['square_feet', 'lot_size', 'has_pool', 'county_Los Angeles', 'bedrooms',
       'bathrooms'],
      dtype='object')


### Looping through k

In [20]:
for k in range(2,8):
    lm = LinearRegression()
    # 1. Transform our X
    rfe = RFE(lm, n_features_to_select=k)
    rfe.fit(X_train, y_train)
    # 2. Use the transformed x in our model
    X_train_rfe = rfe.transform(X_train)
    X_validate_rfe = rfe.transform(X_validate)
    lm.fit(X_train_rfe, y_train)
    
    # Make predictions
    train_predictions['multiple_rfe_k=', k] = lm.predict(X_train_rfe)
    validate_predictions['multiple_rfe_k=', k] = lm.predict(X_validate_rfe)
    k+=1

In [21]:
train_predictions.head()

Unnamed: 0,actual,"(multiple_rfe_k=, 2)","(multiple_rfe_k=, 3)","(multiple_rfe_k=, 4)","(multiple_rfe_k=, 5)","(multiple_rfe_k=, 6)","(multiple_rfe_k=, 7)"
36490,702091.0,365446.15,336583.84,331268.49,307288.35,333329.05,332132.09
19337,442099.0,362022.82,367326.64,366844.52,407063.44,402521.84,402606.77
10447,76613.0,337243.53,338393.1,341151.05,320630.55,316125.78,315559.81
42559,802437.0,468703.54,456971.93,465270.27,495540.45,488181.68,483442.41
34091,188011.0,635997.56,585372.02,587489.63,550727.63,565314.42,568408.62


In [22]:
train_predictions['baseline'] = y_train.mean()

In [23]:
def calculate_rmse(y_predicted):
    return mean_squared_error(train_predictions.actual, y_predicted, squared = False)

train_predictions.apply(calculate_rmse).sort_values()

actual                       0.00
(multiple_rfe_k=, 7)   207,896.69
(multiple_rfe_k=, 6)   207,913.39
(multiple_rfe_k=, 5)   208,248.34
(multiple_rfe_k=, 4)   210,238.38
(multiple_rfe_k=, 3)   210,605.53
(multiple_rfe_k=, 2)   211,872.25
baseline               237,872.07
dtype: float64

In [24]:
validate_predictions['baseline'] = y_validate.mean()

In [25]:
validate_predictions.head()


Unnamed: 0,actual,"(multiple_rfe_k=, 2)","(multiple_rfe_k=, 3)","(multiple_rfe_k=, 4)","(multiple_rfe_k=, 5)","(multiple_rfe_k=, 6)","(multiple_rfe_k=, 7)",baseline
50904,170700.0,424588.29,405834.97,392597.52,366442.4,359170.19,354588.36,377092.24
22280,395742.0,501825.12,530898.33,538752.58,571359.34,595129.29,593258.35,377092.24
16371,222815.0,226145.81,243074.86,238132.12,225899.06,225193.42,225776.54,377092.24
33772,816260.0,404804.07,417595.86,438151.75,413479.96,406814.12,411092.34,377092.24
28672,478000.0,487522.62,444643.04,472608.76,437511.13,427483.55,432266.64,377092.24


In [26]:
def calculate_rmse(y_predicted):
    return mean_squared_error(validate_predictions.actual, y_predicted, squared = False)

validate_predictions.apply(calculate_rmse).sort_values()

actual                       0.00
(multiple_rfe_k=, 7)   205,507.56
(multiple_rfe_k=, 6)   205,561.22
(multiple_rfe_k=, 5)   205,988.21
(multiple_rfe_k=, 4)   207,997.73
(multiple_rfe_k=, 3)   208,212.92
(multiple_rfe_k=, 2)   209,451.26
baseline               235,949.69
dtype: float64

In [None]:
# k=7 is the best

228490-227162

**K=7 has the lowest rmse, highest performance**

In [None]:
# Adding k=7 to final table
lm = LinearRegression()
 # 1. Transform our X
    
rfe = RFE(lm, n_features_to_select=7)
rfe.fit(X_train, y_train)
# 2. Use the transformed x in our model
X_train_rfe = rfe.transform(X_train)
X_validate_rfe = rfe.transform(X_validate)
X_test_rfe = rfe.transform(X_test)
lm.fit(X_train_rfe, y_train)
    
# Make predictions
final_train_predictions['multiple_rfe_k=7'] = lm.predict(X_train_rfe)
final_validate_predictions['multiple_rfe_k=7'] = lm.predict(X_validate_rfe)
final_test_predictions['multiple_rfe_k=7']=lm.predict(X_test_rfe)

## Polynomial Features

In [27]:
train_pred = pd.DataFrame({
    'actual': s_train.assessed_value
}) 
validate_pred = pd.DataFrame({
    'actual': s_validate.assessed_value
}) 

In [28]:
from sklearn.preprocessing import PolynomialFeatures

# 1. Generate Polynomial Features, k=2
poly = PolynomialFeatures(degree=2, include_bias=False, interaction_only=False)
poly.fit(X_train)
X_train_poly = pd.DataFrame(
    poly.transform(X_train),
    columns=poly.get_feature_names(X_train.columns),
    index=X_train.index,
)

In [29]:
lm = LinearRegression()
lm.fit(X_train_poly, y_train)
X_validate_poly = poly.transform(X_validate)
train_pred['baseline'] = y_train.mean()
validate_pred['baseline'] = y_validate.mean()
train_pred['polynomial degree 2'] = lm.predict(X_train_poly)
validate_pred['polynomial degree 2'] = lm.predict(X_validate_poly)
train_pred.head()

Unnamed: 0,actual,baseline,polynomial degree 2
36490,702091.0,380065.99,330259.81
19337,442099.0,380065.99,388463.44
10447,76613.0,380065.99,304541.25
42559,802437.0,380065.99,663056.97
34091,188011.0,380065.99,558063.25


In [30]:
validate_pred.head()

Unnamed: 0,actual,baseline,polynomial degree 2
50904,170700.0,377092.24,312777.74
22280,395742.0,377092.24,652268.35
16371,222815.0,377092.24,258911.62
33772,816260.0,377092.24,431382.56
28672,478000.0,377092.24,457754.49


In [31]:
train_rmse = mean_squared_error(train_pred.actual,train_pred['polynomial degree 2'], squared = False)
validate_rmse = mean_squared_error(validate_pred.actual,validate_pred['polynomial degree 2'], squared = False)

In [32]:
train_rmse_b = mean_squared_error(train_pred.actual,train_pred['baseline'], squared = False)
validate_rmse_b = mean_squared_error(validate_pred.actual,validate_pred['baseline'], squared = False)

In [33]:
train_rmse, validate_rmse

(202295.3609893082, 199981.93217925238)

In [34]:
validate_rmse-train_rmse

-2313.428810055804

In [35]:
train_rmse_b, validate_rmse_b

(237872.06885584313, 235949.6867801346)

In [36]:
# Interaction terms only

poly = PolynomialFeatures(degree=2, include_bias=False, interaction_only=True)
poly.fit(X_train)
X_train_poly = pd.DataFrame(
    poly.transform(X_train),
    columns=poly.get_feature_names(X_train.columns),
    index=X_train.index,
)
lm = LinearRegression()
lm.fit(X_train_poly, y_train)

X_validate_poly = poly.transform(X_validate)
train_pred['polynomial degree 2 only interaction'] = lm.predict(X_train_poly)
validate_pred['polynomial degree 2 only interaction'] = lm.predict(X_validate_poly)
train_pred.head()

Unnamed: 0,actual,baseline,polynomial degree 2,polynomial degree 2 only interaction
36490,702091.0,380065.99,330259.81,330661.04
19337,442099.0,380065.99,388463.44,386507.93
10447,76613.0,380065.99,304541.25,303762.97
42559,802437.0,380065.99,663056.97,666817.9
34091,188011.0,380065.99,558063.25,565992.28


In [37]:
train_rmse = mean_squared_error(train_pred.actual,train_pred['polynomial degree 2 only interaction'], squared = False)
validate_rmse = mean_squared_error(validate_pred.actual,validate_pred['polynomial degree 2 only interaction'], squared = False)
train_rmse, validate_rmse

(202395.46231037687, 200172.90419428024)

In [38]:
validate_rmse-train_rmse

-2222.558116096625

In [39]:
# k=3
poly = PolynomialFeatures(degree=3, include_bias=False, interaction_only=False)
poly.fit(X_train)
X_train_poly = pd.DataFrame(
    poly.transform(X_train),
    columns=poly.get_feature_names(X_train.columns),
    index=X_train.index,
)

In [40]:
lm = LinearRegression()
lm.fit(X_train_poly, y_train)
X_validate_poly = poly.transform(X_validate)
train_pred['baseline'] = y_train.mean()
validate_pred['baseline'] = y_validate.mean()
train_pred['polynomial degree 3'] = lm.predict(X_train_poly)
validate_pred['polynomial degree 3'] = lm.predict(X_validate_poly)
train_pred.head()

Unnamed: 0,actual,baseline,polynomial degree 2,polynomial degree 2 only interaction,polynomial degree 3
36490,702091.0,380065.99,330259.81,330661.04,324717.82
19337,442099.0,380065.99,388463.44,386507.93,366797.48
10447,76613.0,380065.99,304541.25,303762.97,309260.45
42559,802437.0,380065.99,663056.97,666817.9,785175.52
34091,188011.0,380065.99,558063.25,565992.28,592092.2


In [41]:
train_rmse = mean_squared_error(train_pred.actual,train_pred['polynomial degree 3'], squared = False)
validate_rmse = mean_squared_error(validate_pred.actual,validate_pred['polynomial degree 3'], squared = False)
train_rmse, validate_rmse

(200429.34706541212, 199529.26893855724)

In [42]:
# k=4
poly = PolynomialFeatures(degree=4, include_bias=False, interaction_only=False)
poly.fit(X_train)
X_train_poly = pd.DataFrame(
    poly.transform(X_train),
    columns=poly.get_feature_names(X_train.columns),
    index=X_train.index,
)

In [43]:
lm = LinearRegression()
lm.fit(X_train_poly, y_train)
X_validate_poly = poly.transform(X_validate)
train_pred['baseline'] = y_train.mean()
validate_pred['baseline'] = y_validate.mean()
train_pred['polynomial degree 4'] = lm.predict(X_train_poly)
validate_pred['polynomial degree 4'] = lm.predict(X_validate_poly)
train_pred.head()

Unnamed: 0,actual,baseline,polynomial degree 2,polynomial degree 2 only interaction,polynomial degree 3,polynomial degree 4
36490,702091.0,380065.99,330259.81,330661.04,324717.82,321177.35
19337,442099.0,380065.99,388463.44,386507.93,366797.48,361572.64
10447,76613.0,380065.99,304541.25,303762.97,309260.45,300658.64
42559,802437.0,380065.99,663056.97,666817.9,785175.52,760988.45
34091,188011.0,380065.99,558063.25,565992.28,592092.2,575510.96


In [44]:
train_rmse = mean_squared_error(train_pred.actual,train_pred['polynomial degree 4'], squared = False)
validate_rmse = mean_squared_error(validate_pred.actual,validate_pred['polynomial degree 4'], squared = False)
train_rmse, validate_rmse

(198875.59208328978, 200124.43203787412)

In [45]:
# k=5
poly = PolynomialFeatures(degree=5, include_bias=False, interaction_only=False)
poly.fit(X_train)
X_train_poly = pd.DataFrame(
    poly.transform(X_train),
    columns=poly.get_feature_names(X_train.columns),
    index=X_train.index,
)

In [46]:
lm = LinearRegression()
lm.fit(X_train_poly, y_train)
X_validate_poly = poly.transform(X_validate)
train_pred['baseline'] = y_train.mean()
validate_pred['baseline'] = y_validate.mean()
train_pred['polynomial degree 5'] = lm.predict(X_train_poly)
validate_pred['polynomial degree 5'] = lm.predict(X_validate_poly)
train_pred.head()

Unnamed: 0,actual,baseline,polynomial degree 2,polynomial degree 2 only interaction,polynomial degree 3,polynomial degree 4,polynomial degree 5
36490,702091.0,380065.99,330259.81,330661.04,324717.82,321177.35,307018.44
19337,442099.0,380065.99,388463.44,386507.93,366797.48,361572.64,364794.92
10447,76613.0,380065.99,304541.25,303762.97,309260.45,300658.64,305885.55
42559,802437.0,380065.99,663056.97,666817.9,785175.52,760988.45,744694.18
34091,188011.0,380065.99,558063.25,565992.28,592092.2,575510.96,626801.99


In [47]:
train_rmse = mean_squared_error(train_pred.actual,train_pred['polynomial degree 4'], squared = False)
validate_rmse = mean_squared_error(validate_pred.actual,validate_pred['polynomial degree 4'], squared = False)
train_rmse, validate_rmse

(198875.59208328978, 200124.43203787412)

**K=3 has the lowest rmse**

In [None]:
# Adding k=3 to final table
poly = PolynomialFeatures(degree=3, include_bias=False, interaction_only=False)
poly.fit(X_train)
X_train_poly = pd.DataFrame(
    poly.transform(X_train),
    columns=poly.get_feature_names(X_train.columns),
    index=X_train.index,
)
lm = LinearRegression()
lm.fit(X_train_poly, y_train)
X_validate_poly = poly.transform(X_validate)
X_test_poly = poly.transform(X_test)
final_train_predictions['baseline'] = y_train.mean()
final_validate_predictions['baseline'] = y_validate.mean()
final_test_predictions['baseline'] = y_test.mean()
final_train_predictions['polynomial degree 3'] = lm.predict(X_train_poly)
final_validate_predictions['polynomial degree 3'] = lm.predict(X_validate_poly)
final_test_predictions['polynomial degree 3'] = lm.predict(X_test_poly)

## Lasso-Lars

In [48]:
from sklearn.linear_model import LassoLars
# create the model object
lars = LassoLars(alpha=0)

# fit the model to our training data
lars.fit(X_train, y_train)

# predict validate
X_train_pred_lars = lars.predict(X_train)

pd.Series(lars.coef_, index=X_train.columns).sort_values()

bedrooms             -203,239.61
lot_size             -154,118.26
county_Los Angeles    -64,778.51
county_Orange            -286.41
has_pool               34,416.29
age                    40,127.32
bathrooms             160,689.93
square_feet           770,247.49
dtype: float64

In [49]:
# create the model object
lars = LassoLars(alpha=1)

# fit the model to our training data
lars.fit(X_train, y_train)

# predict validate
X_train_pred_lars = lars.predict(X_train)
X_validate_pred_lars = lars.predict(X_validate)
# Add lassolars predictions to our predictions DataFrame
train_pred['lasso_lars'] = X_train_pred_lars
validate_pred['lasso_lars'] = X_validate_pred_lars

In [50]:
train_rmse = mean_squared_error(train_pred.actual,train_pred['lasso_lars'], squared = False)
validate_rmse = mean_squared_error(validate_pred.actual,validate_pred['lasso_lars'], squared = False)
train_rmse, validate_rmse

(208110.17015086598, 205840.74432369764)

In [51]:
validate_rmse-train_rmse

-2269.4258271683357

In [None]:
# Adding to final table
# create the model object
lars = LassoLars(alpha=1)

# fit the model to our training data
lars.fit(X_train, y_train)

# predict validate
X_train_pred_lars = lars.predict(X_train)
X_validate_pred_lars = lars.predict(X_validate)
X_test_pred_lars = lars.predict(X_test)
# Add lassolars predictions to our predictions DataFrame
final_train_predictions['lasso_lars'] = X_train_pred_lars
final_validate_predictions['lasso_lars'] = X_validate_pred_lars
final_test_predictions['lasso_lars'] = X_test_pred_lars

## Generalized Linear Model

In [52]:
from sklearn.linear_model import TweedieRegressor

# create the model object
glm = TweedieRegressor(power=1, alpha=0)

# fit the model to our training data
glm.fit(X_train, y_train)

# predict train
X_train_predict_glm = glm.predict(X_train)
X_validate_predict_glm = glm.predict(X_validate)
# Add lassolars predictions to our predictions DataFrame
train_pred['glm'] = X_train_predict_glm
validate_pred['glm'] = X_validate_predict_glm

In [53]:
train_rmse = mean_squared_error(train_pred.actual,train_pred['glm'], squared = False)
validate_rmse = mean_squared_error(validate_pred.actual,validate_pred['glm'], squared = False)
train_rmse, validate_rmse

(209139.56707843233, 206658.25762675548)

In [54]:
validate_rmse-train_rmse

-2481.309451676847

In [55]:
# Adding to final table

glm = TweedieRegressor(power=1, alpha=0)

# fit the model to our training data
glm.fit(X_train, y_train)

# predict train
X_train_predict_glm = glm.predict(X_train)
X_validate_predict_glm = glm.predict(X_validate)
X_test_predict_glm = glm.predict(X_test)
# Add lassolars predictions to our predictions DataFrame
final_train_predictions['glm'] = X_train_predict_glm
final_validate_predictions['glm'] = X_validate_predict_glm
final_test_predictions['glm'] = X_test_predict_glm

## Evaluation

In [None]:
def calculate_rmse(y_predicted):
    return mean_squared_error(final_train_predictions.actual, y_predicted, squared = False)

final_train_predictions.apply(calculate_rmse).sort_values()

In [None]:
def calculate_rmse(y_predicted):
    return mean_squared_error(final_validate_predictions.actual, y_predicted, squared = False)

final_validate_predictions.apply(calculate_rmse).sort_values()

In [None]:
def calculate_rmse(y_predicted):
    return mean_squared_error(final_test_predictions.actual, y_predicted, squared = False)

final_test_predictions.apply(calculate_rmse).sort_values()

In [None]:
(277387.85-225323.49)/277387.85