In [1]:
# Ignore warnings
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
from sklearn.feature_selection import RFE
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

import acquire
import prepare

In [2]:
pd.options.display.float_format = '{:,.2f}'.format

In [3]:
rawdf = acquire.get_zillow_data()

In [4]:
rawdf.head()

Unnamed: 0,longitude,latitude,bedrooms,bathrooms,square_feet,lot_size,has_pool,fips_code,age,assessed_value,tax_amount
0,-118740133,34251502,4.0,2.0,1323.0,6825.0,,6111,53.0,453000.0,5235.32
1,-118217048,33798657,4.0,2.0,1339.0,6107.0,,6037,40.0,298371.0,3714.58
2,-118531253,34039467,2.0,2.0,1231.0,11837.0,,6037,67.0,161897.0,2031.39
3,-117613897,33663005,5.0,3.0,3008.0,6760.0,1.0,6059,22.0,472384.0,7234.74
4,-118393309,34227422,2.0,1.0,996.0,7518.0,,6037,77.0,58693.0,751.66


In [5]:
df = prepare.prep_zillow(rawdf)

In [5]:
q1 = np.percentile(df.assessed_value, 25)
q3 = np.percentile(df.assessed_value, 75)
iqr = q3 - q1
print("IQR equals: " + str(iqr))

IQR equals: 403123.0


In [5]:
df[df['assessed_value']>=1410860].shape[0]/df.shape[0]

0.03759595430459573

In [6]:
df[df['assessed_value']<=120866].shape[0]/df.shape[0]

0.14091431104305602

In [23]:
print("Upper bond for square_feet: ", q3 + 2*iqr)
print("Lower bond for square_feet: ", q1 - 0.2*iqr)

Upper bond for square_feet:  1209298.5
Lower bond for square_feet:  120866.4


In [6]:
df = df[df['assessed_value']<= 1410860]

In [7]:
df = df[df['assessed_value']>= 120866]

In [8]:
df.shape

(40773, 23)

In [11]:
df.head()

Unnamed: 0,longitude,latitude,bedrooms,bathrooms,square_feet,lot_size,has_pool,fips_code,age,assessed_value,...,bathrooms_size,county_Los Angeles,county_Orange,county_Ventura,bedrooms_size_small,bedrooms_size_medium,bedrooms_size_large,bathrooms_size_small,bathrooms_size_medium,bathrooms_size_large
0,-118740133,34251502,4.0,2.0,1323.0,6825.0,0.0,6111,53,453000.0,...,small,0,0,1,0,1,0,1,0,0
1,-118217048,33798657,4.0,2.0,1339.0,6107.0,0.0,6037,40,298371.0,...,small,1,0,0,0,1,0,1,0,0
2,-118531253,34039467,2.0,2.0,1231.0,11837.0,0.0,6037,67,161897.0,...,small,1,0,0,1,0,0,1,0,0
3,-117613897,33663005,5.0,3.0,3008.0,6760.0,1.0,6059,22,472384.0,...,medium,0,1,0,0,0,1,0,1,0
6,-118689128,34284379,3.0,2.5,1666.0,3301.0,0.0,6111,20,362552.0,...,small,0,0,1,0,1,0,1,0,0


## Simple Model
$$ y = mx + b $$

$$y = f(x)$$

In [9]:
scaled_df = df.copy()

In [10]:
scaler = MinMaxScaler()

In [12]:
scaler.fit(scaled_df[['square_feet', 'lot_size', 'age']])

MinMaxScaler()

In [13]:
scaled_df[['square_feet', 'lot_size', 'age']] = scaler.transform(scaled_df[['square_feet', 'lot_size', 'age']])

In [14]:
scaled_df.head()

Unnamed: 0,longitude,latitude,bedrooms,bathrooms,square_feet,lot_size,has_pool,fips_code,age,assessed_value,...,bathrooms_size,county_Los Angeles,county_Orange,county_Ventura,bedrooms_size_small,bedrooms_size_medium,bedrooms_size_large,bathrooms_size_small,bathrooms_size_medium,bathrooms_size_large
0,-118740133,34251502,4.0,2.0,0.12,0.04,0.0,6111,0.38,453000.0,...,small,0,0,1,0,1,0,1,0,0
1,-118217048,33798657,4.0,2.0,0.12,0.04,0.0,6037,0.28,298371.0,...,small,1,0,0,0,1,0,1,0,0
2,-118531253,34039467,2.0,2.0,0.11,0.07,0.0,6037,0.48,161897.0,...,small,1,0,0,1,0,0,1,0,0
3,-117613897,33663005,5.0,3.0,0.36,0.04,1.0,6059,0.15,472384.0,...,medium,0,1,0,0,0,1,0,1,0
6,-118689128,34284379,3.0,2.5,0.17,0.02,0.0,6111,0.14,362552.0,...,small,0,0,1,0,1,0,1,0,0


In [15]:
s_train, s_validate, s_test = prepare.split(scaled_df)

In [16]:
pd.set_option('display.max_columns', None)

In [58]:
cols = ['square_feet', 'lot_size', 'has_pool', 'age', 'county_Los Angeles', 'county_Orange', 'county_Ventura', 'bedrooms', 'bathrooms']

X_train = s_train[cols]
y_train = s_train.assessed_value

X_validate = s_validate[cols]
y_validate = s_validate.assessed_value

X_test = s_test[cols]
y_test = s_test.assessed_value

In [59]:
train_predictions = pd.DataFrame({
    'actual': s_train.assessed_value
}) 
validate_predictions = pd.DataFrame({
    'actual': s_validate.assessed_value
}) 

## Multiple  Regression + RFE

In [94]:
lm = LinearRegression()
k = 6

# 1. Transform our X
rfe = RFE(lm, n_features_to_select=7)
rfe.fit(X_train, y_train)
print('selected top 8 features:', X_train.columns[rfe.support_])
X_train_rfe = rfe.transform(X_train)

selected top 8 features: Index(['square_feet', 'lot_size', 'age', 'county_Los Angeles', 'county_Orange',
       'bedrooms', 'bathrooms'],
      dtype='object')


In [60]:
from sklearn.linear_model import LinearRegression
lm = LinearRegression()
k = 2

# 1. Transform our X
rfe = RFE(lm, n_features_to_select=2)
rfe.fit(X_train, y_train)
print('selected top 2 features:', X_train.columns[rfe.support_])
X_train_rfe = rfe.transform(X_train)

selected top 2 features: Index(['square_feet', 'lot_size'], dtype='object')


In [61]:
# 2. Use the transformed x in our model
lm.fit(X_train_rfe, y_train)

# 3. Make predictions
X_train_rfe = rfe.transform(X_train)
X_validate_rfe = rfe.transform(X_validate)
train_predictions['multiple_rfe_k=2'] = lm.predict(X_train_rfe)
validate_predictions['multiple_rfe_k=2'] = lm.predict(X_validate_rfe)
train_predictions.head()

Unnamed: 0,actual,multiple_rfe_k=2
36884,206848.0,341099.68
24353,586029.0,448529.17
8475,929910.0,639535.2
34083,270562.0,401434.95
8792,199316.0,403451.96


In [62]:
validate_predictions.head()

Unnamed: 0,actual,multiple_rfe_k=2
3808,547219.0,464715.09
16336,155332.0,403249.14
10616,160969.0,333628.86
46216,1409388.0,719507.32
22066,1242640.0,515276.8


### Looping through k

In [63]:
for k in range(3,10):
    lm = LinearRegression()
    # 1. Transform our X
    rfe = RFE(lm, n_features_to_select=k)
    rfe.fit(X_train, y_train)
    # 2. Use the transformed x in our model
    X_train_rfe = rfe.transform(X_train)
    X_validate_rfe = rfe.transform(X_validate)
    lm.fit(X_train_rfe, y_train)
    
    # Make predictions
    train_predictions['multiple_rfe_k=', k] = lm.predict(X_train_rfe)
    validate_predictions['multiple_rfe_k=', k] = lm.predict(X_validate_rfe)
    k+=1

In [64]:
train_predictions.head()

Unnamed: 0,actual,multiple_rfe_k=2,"(multiple_rfe_k=, 3)","(multiple_rfe_k=, 4)","(multiple_rfe_k=, 5)","(multiple_rfe_k=, 6)","(multiple_rfe_k=, 7)","(multiple_rfe_k=, 8)","(multiple_rfe_k=, 9)"
36884,206848.0,341099.68,315055.73,344213.69,380726.52,393900.92,372193.1,372676.82,372676.82
24353,586029.0,448529.17,455491.75,435139.05,401720.98,393148.11,393360.77,388832.54,388832.54
8475,929910.0,639535.2,620337.63,639946.15,683367.55,660264.15,666787.98,661105.35,661105.35
34083,270562.0,401434.95,402186.64,379891.03,380023.74,383145.06,382899.0,379941.88,379941.88
8792,199316.0,403451.96,410649.35,392387.19,351644.99,351009.02,351246.85,347351.49,347351.49


In [65]:
train_predictions['baseline'] = y_train.mean()

In [80]:
def calculate_rmse(y_predicted):
    return mean_squared_error(train_predictions.actual, y_predicted, squared = False)

train_predictions.apply(calculate_rmse).sort_values()

actual                       0.00
(multiple_rfe_k=, 8)   229,179.31
(multiple_rfe_k=, 9)   229,179.31
(multiple_rfe_k=, 7)   229,325.43
(multiple_rfe_k=, 6)   229,445.27
(multiple_rfe_k=, 5)   230,110.53
(multiple_rfe_k=, 4)   231,673.65
(multiple_rfe_k=, 3)   233,540.19
multiple_rfe_k=2       234,072.12
baseline               271,718.54
dtype: float64

In [67]:
validate_predictions.head()

Unnamed: 0,actual,multiple_rfe_k=2,"(multiple_rfe_k=, 3)","(multiple_rfe_k=, 4)","(multiple_rfe_k=, 5)","(multiple_rfe_k=, 6)","(multiple_rfe_k=, 7)","(multiple_rfe_k=, 8)","(multiple_rfe_k=, 9)"
3808,547219.0,464715.09,467908.21,445153.72,415078.42,402368.79,402765.42,419188.21,419188.21
16336,155332.0,403249.14,378607.98,342139.03,347468.12,341838.97,342439.76,340119.27,340119.27
10616,160969.0,333628.86,331523.31,310657.93,300393.7,276618.66,276864.4,275442.43,275442.43
46216,1409388.0,719507.32,698545.59,712263.37,728976.54,723871.26,730410.76,722863.04,722863.04
22066,1242640.0,515276.8,526259.78,508221.38,524772.24,548639.98,548259.84,563227.65,563227.65


In [68]:
def calculate_rmse(y_predicted):
    return mean_squared_error(validate_predictions.actual, y_predicted, squared = False)

validate_predictions.apply(calculate_rmse).sort_values()

actual                       0.00
(multiple_rfe_k=, 8)   231,283.29
(multiple_rfe_k=, 9)   231,283.29
(multiple_rfe_k=, 7)   231,500.75
(multiple_rfe_k=, 6)   231,722.23
(multiple_rfe_k=, 5)   232,615.12
(multiple_rfe_k=, 4)   234,542.14
(multiple_rfe_k=, 3)   236,990.28
multiple_rfe_k=2       237,521.58
dtype: float64

In [28]:
# k=8/9 is the best

232847-230261

2586

## Polynomial Features

In [69]:
train_pred = pd.DataFrame({
    'actual': s_train.assessed_value
}) 
validate_pred = pd.DataFrame({
    'actual': s_validate.assessed_value
}) 

In [70]:
from sklearn.preprocessing import PolynomialFeatures

# 1. Generate Polynomial Features
poly = PolynomialFeatures(degree=2, include_bias=False, interaction_only=False)
poly.fit(X_train)
X_train_poly = pd.DataFrame(
    poly.transform(X_train),
    columns=poly.get_feature_names(X_train.columns),
    index=X_train.index,
)
X_train_poly.head()

Unnamed: 0,square_feet,lot_size,has_pool,age,county_Los Angeles,county_Orange,county_Ventura,bedrooms,bathrooms,square_feet^2,square_feet lot_size,square_feet has_pool,square_feet age,square_feet county_Los Angeles,square_feet county_Orange,square_feet county_Ventura,square_feet bedrooms,square_feet bathrooms,lot_size^2,lot_size has_pool,lot_size age,lot_size county_Los Angeles,lot_size county_Orange,lot_size county_Ventura,lot_size bedrooms,lot_size bathrooms,has_pool^2,has_pool age,has_pool county_Los Angeles,has_pool county_Orange,has_pool county_Ventura,has_pool bedrooms,has_pool bathrooms,age^2,age county_Los Angeles,age county_Orange,age county_Ventura,age bedrooms,age bathrooms,county_Los Angeles^2,county_Los Angeles county_Orange,county_Los Angeles county_Ventura,county_Los Angeles bedrooms,county_Los Angeles bathrooms,county_Orange^2,county_Orange county_Ventura,county_Orange bedrooms,county_Orange bathrooms,county_Ventura^2,county_Ventura bedrooms,county_Ventura bathrooms,bedrooms^2,bedrooms bathrooms,bathrooms^2
36884,0.09,0.02,0.0,0.21,0.0,0.0,1.0,2.0,2.0,0.01,0.0,0.0,0.02,0.0,0.0,0.09,0.18,0.18,0.0,0.0,0.01,0.0,0.0,0.02,0.05,0.05,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.04,0.0,0.0,0.21,0.42,0.42,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2.0,2.0,4.0,4.0,4.0
24353,0.18,0.03,0.0,0.45,1.0,0.0,0.0,4.0,2.0,0.03,0.01,0.0,0.08,0.18,0.0,0.0,0.71,0.35,0.0,0.0,0.01,0.03,0.0,0.0,0.13,0.07,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.2,0.45,0.0,0.0,1.8,0.9,1.0,0.0,0.0,4.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,16.0,8.0,4.0
8475,0.33,0.04,0.0,0.09,0.0,1.0,0.0,3.0,2.5,0.11,0.01,0.0,0.03,0.0,0.33,0.0,1.0,0.83,0.0,0.0,0.0,0.0,0.04,0.0,0.11,0.09,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.01,0.0,0.09,0.0,0.28,0.24,0.0,0.0,0.0,0.0,0.0,1.0,0.0,3.0,2.5,0.0,0.0,0.0,9.0,7.5,6.25
34083,0.14,0.03,0.0,0.42,1.0,0.0,0.0,3.0,2.0,0.02,0.0,0.0,0.06,0.14,0.0,0.0,0.41,0.28,0.0,0.0,0.01,0.03,0.0,0.0,0.09,0.06,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.18,0.42,0.0,0.0,1.26,0.84,1.0,0.0,0.0,3.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9.0,6.0,4.0
8792,0.14,0.03,0.0,0.48,1.0,0.0,0.0,4.0,2.0,0.02,0.0,0.0,0.07,0.14,0.0,0.0,0.56,0.28,0.0,0.0,0.01,0.03,0.0,0.0,0.12,0.06,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.23,0.48,0.0,0.0,1.91,0.96,1.0,0.0,0.0,4.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,16.0,8.0,4.0


In [59]:
X_train_poly.shape, y_train.shape, train_pred.shape

((22832, 77), (22832,), (22832, 1))

In [71]:
lm = LinearRegression()
lm.fit(X_train_poly, y_train)
X_validate_poly = poly.transform(X_validate)

train_pred['polynomial degree 2'] = lm.predict(X_train_poly)
validate_pred['polynomial degree 2'] = lm.predict(X_validate_poly)
train_pred.head()

Unnamed: 0,actual,polynomial degree 2
36884,206848.0,333957.42
24353,586029.0,405215.81
8475,929910.0,775170.92
34083,270562.0,386109.11
8792,199316.0,374108.44


In [72]:
validate_pred.head()

Unnamed: 0,actual,polynomial degree 2
3808,547219.0,427532.7
16336,155332.0,252211.58
10616,160969.0,286522.09
46216,1409388.0,904431.92
22066,1242640.0,590157.41


In [73]:
feature_names = poly.get_feature_names(X_train.columns)
pd.Series(lm.coef_, index=feature_names).sort_values()

square_feet county_Ventura          -13,872,719,207,724,570.00
square_feet county_Orange           -13,872,719,207,707,312.00
square_feet county_Los Angeles      -13,872,719,207,609,390.00
square_feet age                                  -1,847,109.06
square_feet^2                                    -1,519,103.00
age^2                                              -517,864.05
lot_size bathrooms                                 -233,600.73
lot_size county_Los Angeles                        -222,236.57
age county_Orange                                  -188,426.41
county_Los Angeles                                 -108,050.25
county_Los Angeles^2                               -107,821.99
lot_size age                                       -101,072.77
age bathrooms                                       -73,968.25
has_pool county_Orange                              -26,079.88
county_Los Angeles bedrooms                         -19,195.06
bedrooms                                            -17

In [74]:
train_rmse = mean_squared_error(train_pred.actual,train_pred['polynomial degree 2'], squared = False)
validate_rmse = mean_squared_error(validate_pred.actual,validate_pred['polynomial degree 2'], squared = False)

In [75]:
train_rmse, validate_rmse

(221548.45614485122, 225043.66645641494)

In [76]:
validate_rmse-train_rmse

3495.2103115637146

In [77]:
# Interaction terms only

poly = PolynomialFeatures(degree=2, include_bias=False, interaction_only=True)
poly.fit(X_train)
X_train_poly = pd.DataFrame(
    poly.transform(X_train),
    columns=poly.get_feature_names(X_train.columns),
    index=X_train.index,
)
lm = LinearRegression()
lm.fit(X_train_poly, y_train)

X_validate_poly = poly.transform(X_validate)
train_pred['polynomial degree 2 only interaction'] = lm.predict(X_train_poly)
validate_pred['polynomial degree 2 only interaction'] = lm.predict(X_validate_poly)
train_pred.head()

Unnamed: 0,actual,polynomial degree 2,polynomial degree 2 only interaction
36884,206848.0,333957.42,335382.04
24353,586029.0,405215.81,389010.4
8475,929910.0,775170.92,797394.46
34083,270562.0,386109.11,373351.96
8792,199316.0,374108.44,369206.55


In [78]:
train_rmse = mean_squared_error(train_pred.actual,train_pred['polynomial degree 2 only interaction'], squared = False)
validate_rmse = mean_squared_error(validate_pred.actual,validate_pred['polynomial degree 2 only interaction'], squared = False)
train_rmse, validate_rmse

(222418.36276885946, 225438.12991192046)

In [79]:
validate_rmse-train_rmse

3019.767143061006

In [None]:
for k in range(3,9):
    
    poly = PolynomialFeatures(degree=k, include_bias=False, interaction_only=False)
    poly.fit(X_train)
    X_train_poly = pd.DataFrame(
        poly.transform(X_train),
        columns=poly.get_feature_names(X_train.columns),
        index=X_train.index,
    )
    lm = LinearRegression()
    lm.fit(X_train_poly, y_train)

    X_validate_poly = poly.transform(X_validate)
    train_pred['polynomial degree', k] = lm.predict(X_train_poly)
    validate_pred['polynomial degree', k] = lm.predict(X_validate_poly)

In [76]:
train_pred.head()

Unnamed: 0,actual,polynomial degree 2,polynomial degree 2 only interaction,"(polynomial degree, 3)","(polynomial degree, 4)","(polynomial degree, 5)","(polynomial degree, 6)","(polynomial degree, 7)","(polynomial degree, 8)"
36884,206848.0,387162.32,398656.0,387162.32,387162.32,387162.32,387162.32,387162.32,387162.32
24353,586029.0,434986.92,426720.0,434986.92,434986.92,434986.92,434986.92,434986.92,434986.92
8475,929910.0,757985.48,775744.0,757985.48,757985.48,757985.48,757985.48,757985.48,757985.48
34083,270562.0,374778.92,368480.0,374778.92,374778.92,374778.92,374778.92,374778.92,374778.92
8792,199316.0,404240.92,397984.0,404240.92,404240.92,404240.92,404240.92,404240.92,404240.92


In [77]:
def calculate_rmse(y_predicted):
    return mean_squared_error(train_pred.actual, y_predicted, squared = False)

train_pred.apply(calculate_rmse).sort_values()

actual                                       0.00
polynomial degree 2                    222,136.75
(polynomial degree, 3)                 222,136.75
(polynomial degree, 4)                 222,136.75
(polynomial degree, 5)                 222,136.75
(polynomial degree, 6)                 222,136.75
(polynomial degree, 7)                 222,136.75
(polynomial degree, 8)                 222,136.75
polynomial degree 2 only interaction   222,477.22
dtype: float64

## Lasso-Lars

In [81]:
from sklearn.linear_model import LassoLars
# create the model object
lars = LassoLars(alpha=0)

# fit the model to our training data
lars.fit(X_train, y_train)

# predict validate
X_train_pred_lars = lars.predict(X_train)

pd.Series(lars.coef_, index=X_train.columns).sort_values()

lot_size              -136,344.85
county_Los Angeles     -48,976.72
bedrooms               -45,422.93
county_Ventura               0.00
has_pool                21,177.85
county_Orange           27,327.99
bathrooms               36,907.55
age                    185,572.80
square_feet          1,272,839.94
dtype: float64

In [82]:
# create the model object
lars = LassoLars(alpha=1)

# fit the model to our training data
lars.fit(X_train, y_train)

# predict validate
X_train_pred_lars = lars.predict(X_train)
X_validate_pred_lars = lars.predict(X_validate)
# Add lassolars predictions to our predictions DataFrame
train_pred['lasso_lars'] = X_train_pred_lars
validate_pred['lasso_lars'] = X_validate_pred_lars

In [83]:
train_rmse = mean_squared_error(train_pred.actual,train_pred['lasso_lars'], squared = False)
validate_rmse = mean_squared_error(validate_pred.actual,validate_pred['lasso_lars'], squared = False)
train_rmse, validate_rmse

(229180.02327432, 231292.90323119983)

In [84]:
validate_rmse-train_rmse

2112.8799568798277

## Generalized Linear Model

In [85]:
from sklearn.linear_model import TweedieRegressor

# create the model object
glm = TweedieRegressor(power=1, alpha=0)

# fit the model to our training data
glm.fit(X_train, y_train)

# predict train
X_train_predict_glm = glm.predict(X_train)
X_validate_predict_glm = glm.predict(X_validate)
# Add lassolars predictions to our predictions DataFrame
train_pred['glm'] = X_train_predict_glm
validate_pred['glm'] = X_validate_predict_glm

In [86]:
train_rmse = mean_squared_error(train_pred.actual,train_pred['glm'], squared = False)
validate_rmse = mean_squared_error(validate_pred.actual,validate_pred['glm'], squared = False)
train_rmse, validate_rmse

(233008.17669950996, 233016.66516689927)

In [87]:
validate_rmse-train_rmse

8.488467389310244

## Evaluation