In [1]:
# Ignore warnings
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
from sklearn.feature_selection import RFE
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

import acquire
import prepare

In [2]:
pd.options.display.float_format = '{:,.2f}'.format

In [3]:
rawdf = acquire.get_zillow_data()

In [4]:
rawdf.head()

Unnamed: 0,longitude,latitude,bedrooms,bathrooms,square_feet,lot_size,has_pool,fips_code,age,assessed_value,tax_amount
0,-118217048,33798657,4.0,2.0,1339.0,6107.0,,6037,40.0,298371.0,3714.58
1,-118531253,34039467,2.0,2.0,1231.0,11837.0,,6037,67.0,161897.0,2031.39
2,-117613897,33663005,5.0,3.0,3008.0,6760.0,1.0,6059,22.0,472384.0,7234.74
3,-118393309,34227422,2.0,1.0,996.0,7518.0,,6037,77.0,58693.0,751.66
4,-118306261,33745855,4.0,2.0,1329.0,7005.0,,6037,73.0,62194.0,886.96


In [5]:
df = prepare.prep_zillow(rawdf)

In [6]:
df.shape

(47123, 23)

In [7]:
df.head()

Unnamed: 0,longitude,latitude,bedrooms,bathrooms,square_feet,lot_size,has_pool,fips_code,age,assessed_value,...,bathrooms_size,county_Los Angeles,county_Orange,county_Ventura,bedrooms_size_small,bedrooms_size_medium,bedrooms_size_large,bathrooms_size_small,bathrooms_size_medium,bathrooms_size_large
0,-118217048,33798657,4.0,2.0,1339.0,6107.0,0.0,6037,40,298371.0,...,small,1,0,0,0,1,0,1,0,0
1,-118531253,34039467,2.0,2.0,1231.0,11837.0,0.0,6037,67,161897.0,...,small,1,0,0,1,0,0,1,0,0
2,-117613897,33663005,5.0,3.0,3008.0,6760.0,1.0,6059,22,472384.0,...,medium,0,1,0,0,0,1,0,1,0
3,-118393309,34227422,2.0,1.0,996.0,7518.0,0.0,6037,77,58693.0,...,small,1,0,0,1,0,0,1,0,0
4,-118306261,33745855,4.0,2.0,1329.0,7005.0,0.0,6037,73,62194.0,...,small,1,0,0,0,1,0,1,0,0


## Simple Model
$$ y = mx + b $$

$$y = f(x)$$

In [8]:
scaled_df = df.copy()

In [9]:
scaler = MinMaxScaler()

In [42]:
scaler.fit(scaled_df[['square_feet', 'lot_size', 'age', 'bedrooms', 'bathrooms']])

MinMaxScaler()

In [43]:
scaled_df[['square_feet', 'lot_size', 'age', 'bedrooms', 'bathrooms']] = scaler.transform(scaled_df[['square_feet', 'lot_size', 'age', 'bedrooms', 'bathrooms']])

In [44]:
scaled_df.head()

Unnamed: 0,longitude,latitude,bedrooms,bathrooms,square_feet,lot_size,has_pool,fips_code,age,assessed_value,tax_amount,county,bedrooms_size,bathrooms_size,county_Los Angeles,county_Orange,county_Ventura,bedrooms_size_small,bedrooms_size_medium,bedrooms_size_large,bathrooms_size_small,bathrooms_size_medium,bathrooms_size_large
0,-118217048,33798657,0.6,0.18,0.13,0.05,0.0,6037,0.28,298371.0,3714.58,Los Angeles,medium,small,1,0,0,0,1,0,1,0,0
1,-118531253,34039467,0.2,0.18,0.12,0.11,0.0,6037,0.48,161897.0,2031.39,Los Angeles,small,small,1,0,0,1,0,0,1,0,0
2,-117613897,33663005,0.8,0.36,0.4,0.06,1.0,6059,0.15,472384.0,7234.74,Orange,large,medium,0,1,0,0,0,1,0,1,0
3,-118393309,34227422,0.2,0.0,0.08,0.06,0.0,6037,0.55,58693.0,751.66,Los Angeles,small,small,1,0,0,1,0,0,1,0,0
4,-118306261,33745855,0.6,0.18,0.13,0.06,0.0,6037,0.52,62194.0,886.96,Los Angeles,medium,small,1,0,0,0,1,0,1,0,0


In [45]:
s_train, s_validate, s_test = prepare.split(scaled_df)

In [46]:
pd.set_option('display.max_columns', None)

In [47]:
cols = ['square_feet', 'lot_size', 'has_pool', 'age', 'county_Los Angeles', 'county_Orange', 'bedrooms', 'bathrooms']

X_train = s_train[cols]
y_train = s_train.assessed_value

X_validate = s_validate[cols]
y_validate = s_validate.assessed_value

X_test = s_test[cols]
y_test = s_test.assessed_value

In [48]:
train_predictions = pd.DataFrame({
    'actual': s_train.assessed_value
}) 
validate_predictions = pd.DataFrame({
    'actual': s_validate.assessed_value
}) 

In [84]:
final_train_predictions = pd.DataFrame({
    'actual': s_train.assessed_value
}) 
final_validate_predictions = pd.DataFrame({
    'actual': s_validate.assessed_value
}) 
final_test_predictions = pd.DataFrame({
    'actual': s_test.assessed_value
})

## Multiple  Regression + RFE

In [49]:
from sklearn.linear_model import LinearRegression
lm = LinearRegression()
k = 6

# 1. Transform our X
rfe = RFE(lm, n_features_to_select=5)
rfe.fit(X_train, y_train)
print('selected top 8 features:', X_train.columns[rfe.support_])
X_train_rfe = rfe.transform(X_train)

selected top 8 features: Index(['square_feet', 'lot_size', 'age', 'bedrooms', 'bathrooms'], dtype='object')


### Looping through k

In [50]:
for k in range(2,8):
    lm = LinearRegression()
    # 1. Transform our X
    rfe = RFE(lm, n_features_to_select=k)
    rfe.fit(X_train, y_train)
    # 2. Use the transformed x in our model
    X_train_rfe = rfe.transform(X_train)
    X_validate_rfe = rfe.transform(X_validate)
    lm.fit(X_train_rfe, y_train)
    
    # Make predictions
    train_predictions['multiple_rfe_k=', k] = lm.predict(X_train_rfe)
    validate_predictions['multiple_rfe_k=', k] = lm.predict(X_validate_rfe)
    k+=1

In [51]:
train_predictions.head()

Unnamed: 0,actual,"(multiple_rfe_k=, 2)","(multiple_rfe_k=, 3)","(multiple_rfe_k=, 4)","(multiple_rfe_k=, 5)","(multiple_rfe_k=, 6)","(multiple_rfe_k=, 7)"
14540,471024.0,284478.18,296936.86,294389.84,294708.55,341281.65,336950.48
11215,61339.0,296492.45,307360.21,307658.37,306977.58,348857.74,345070.64
27417,75483.0,358554.1,373228.26,372143.25,371398.82,415978.05,412265.89
21049,430000.0,260668.1,239544.08,240170.97,239692.28,221087.66,218623.23
25174,619000.0,300861.27,274414.91,278273.31,281313.47,272432.94,268301.14


In [52]:
train_predictions['baseline'] = y_train.mean()

In [53]:
def calculate_rmse(y_predicted):
    return mean_squared_error(train_predictions.actual, y_predicted, squared = False)

train_predictions.apply(calculate_rmse).sort_values()

actual                       0.00
(multiple_rfe_k=, 7)   232,763.09
(multiple_rfe_k=, 6)   233,166.16
(multiple_rfe_k=, 5)   234,858.62
(multiple_rfe_k=, 4)   234,875.93
(multiple_rfe_k=, 3)   235,076.59
(multiple_rfe_k=, 2)   235,705.45
baseline               274,862.72
dtype: float64

In [54]:
validate_predictions['baseline'] = y_validate.mean()

In [55]:
validate_predictions.head()


Unnamed: 0,actual,"(multiple_rfe_k=, 2)","(multiple_rfe_k=, 3)","(multiple_rfe_k=, 4)","(multiple_rfe_k=, 5)","(multiple_rfe_k=, 6)","(multiple_rfe_k=, 7)",baseline
40104,410538.0,419474.8,414057.38,413000.57,413635.57,394462.55,423705.62,405736.35
18167,339849.0,376005.02,376343.82,378991.27,380363.94,362945.37,393570.22,405736.35
7121,222431.0,360714.14,363077.74,364780.39,362573.74,397646.21,393115.36,405736.35
30194,857504.0,387825.21,361887.72,364578.94,365770.98,353101.33,349126.13,405736.35
27143,99085.0,358092.85,360803.55,353729.52,351212.51,388819.46,383003.17,405736.35


In [56]:
def calculate_rmse(y_predicted):
    return mean_squared_error(validate_predictions.actual, y_predicted, squared = False)

validate_predictions.apply(calculate_rmse).sort_values()

actual                       0.00
(multiple_rfe_k=, 7)   230,221.87
(multiple_rfe_k=, 6)   230,727.85
(multiple_rfe_k=, 5)   232,270.96
(multiple_rfe_k=, 4)   232,304.06
(multiple_rfe_k=, 3)   232,661.45
(multiple_rfe_k=, 2)   233,345.88
baseline               272,468.83
dtype: float64

In [83]:
# k=7 is the best

230221-232763

-2542

**K=7 has the lowest rmse, highest performance**

In [86]:
# Adding k=7 to final table
lm = LinearRegression()
 # 1. Transform our X
    
rfe = RFE(lm, n_features_to_select=7)
rfe.fit(X_train, y_train)
# 2. Use the transformed x in our model
X_train_rfe = rfe.transform(X_train)
X_validate_rfe = rfe.transform(X_validate)
X_test_rfe = rfe.transform(X_test)
lm.fit(X_train_rfe, y_train)
    
# Make predictions
final_train_predictions['multiple_rfe_k=7'] = lm.predict(X_train_rfe)
final_validate_predictions['multiple_rfe_k=7'] = lm.predict(X_validate_rfe)
final_test_predictions['multiple_rfe_k=7']=lm.predict(X_test_rfe)

## Polynomial Features

In [57]:
train_pred = pd.DataFrame({
    'actual': s_train.assessed_value
}) 
validate_pred = pd.DataFrame({
    'actual': s_validate.assessed_value
}) 

In [58]:
from sklearn.preprocessing import PolynomialFeatures

# 1. Generate Polynomial Features, k=2
poly = PolynomialFeatures(degree=2, include_bias=False, interaction_only=False)
poly.fit(X_train)
X_train_poly = pd.DataFrame(
    poly.transform(X_train),
    columns=poly.get_feature_names(X_train.columns),
    index=X_train.index,
)

In [59]:
lm = LinearRegression()
lm.fit(X_train_poly, y_train)
X_validate_poly = poly.transform(X_validate)
train_pred['baseline'] = y_train.mean()
validate_pred['baseline'] = y_validate.mean()
train_pred['polynomial degree 2'] = lm.predict(X_train_poly)
validate_pred['polynomial degree 2'] = lm.predict(X_validate_poly)
train_pred.head()

Unnamed: 0,actual,baseline,polynomial degree 2
14540,471024.0,409241.54,285097.77
11215,61339.0,409241.54,313418.0
27417,75483.0,409241.54,333493.35
21049,430000.0,409241.54,243253.97
25174,619000.0,409241.54,337785.11


In [60]:
validate_pred.head()

Unnamed: 0,actual,baseline,polynomial degree 2
40104,410538.0,405736.35,425943.78
18167,339849.0,405736.35,401633.18
7121,222431.0,405736.35,402310.14
30194,857504.0,405736.35,363816.89
27143,99085.0,405736.35,370505.32


In [61]:
train_rmse = mean_squared_error(train_pred.actual,train_pred['polynomial degree 2'], squared = False)
validate_rmse = mean_squared_error(validate_pred.actual,validate_pred['polynomial degree 2'], squared = False)

In [65]:
train_rmse_b = mean_squared_error(train_pred.actual,train_pred['baseline'], squared = False)
validate_rmse_b = mean_squared_error(validate_pred.actual,validate_pred['baseline'], squared = False)

In [62]:
train_rmse, validate_rmse

(226018.23613893724, 223545.9546342702)

In [63]:
validate_rmse-train_rmse

-2472.281504667044

In [66]:
train_rmse_b, validate_rmse_b

(274862.7167862455, 272468.8282290647)

In [67]:
# Interaction terms only

poly = PolynomialFeatures(degree=2, include_bias=False, interaction_only=True)
poly.fit(X_train)
X_train_poly = pd.DataFrame(
    poly.transform(X_train),
    columns=poly.get_feature_names(X_train.columns),
    index=X_train.index,
)
lm = LinearRegression()
lm.fit(X_train_poly, y_train)

X_validate_poly = poly.transform(X_validate)
train_pred['polynomial degree 2 only interaction'] = lm.predict(X_train_poly)
validate_pred['polynomial degree 2 only interaction'] = lm.predict(X_validate_poly)
train_pred.head()

Unnamed: 0,actual,baseline,polynomial degree 2,polynomial degree 2 only interaction
14540,471024.0,409241.54,285097.77,293158.68
11215,61339.0,409241.54,313418.0,319185.77
27417,75483.0,409241.54,333493.35,331075.64
21049,430000.0,409241.54,243253.97,233450.18
25174,619000.0,409241.54,337785.11,328970.75


In [68]:
train_rmse = mean_squared_error(train_pred.actual,train_pred['polynomial degree 2 only interaction'], squared = False)
validate_rmse = mean_squared_error(validate_pred.actual,validate_pred['polynomial degree 2 only interaction'], squared = False)
train_rmse, validate_rmse

(226317.07147912346, 223939.20747348687)

In [69]:
validate_rmse-train_rmse

-2377.8640056365984

In [70]:
# k=3
poly = PolynomialFeatures(degree=3, include_bias=False, interaction_only=False)
poly.fit(X_train)
X_train_poly = pd.DataFrame(
    poly.transform(X_train),
    columns=poly.get_feature_names(X_train.columns),
    index=X_train.index,
)

In [71]:
lm = LinearRegression()
lm.fit(X_train_poly, y_train)
X_validate_poly = poly.transform(X_validate)
train_pred['baseline'] = y_train.mean()
validate_pred['baseline'] = y_validate.mean()
train_pred['polynomial degree 3'] = lm.predict(X_train_poly)
validate_pred['polynomial degree 3'] = lm.predict(X_validate_poly)
train_pred.head()

Unnamed: 0,actual,baseline,polynomial degree 2,polynomial degree 2 only interaction,polynomial degree 3
14540,471024.0,409241.54,285097.77,293158.68,299474.65
11215,61339.0,409241.54,313418.0,319185.77,308079.89
27417,75483.0,409241.54,333493.35,331075.64,335684.56
21049,430000.0,409241.54,243253.97,233450.18,263100.74
25174,619000.0,409241.54,337785.11,328970.75,337630.3


In [72]:
train_rmse = mean_squared_error(train_pred.actual,train_pred['polynomial degree 3'], squared = False)
validate_rmse = mean_squared_error(validate_pred.actual,validate_pred['polynomial degree 3'], squared = False)
train_rmse, validate_rmse

(223633.46235207212, 222070.91872805462)

In [73]:
# k=4
poly = PolynomialFeatures(degree=4, include_bias=False, interaction_only=False)
poly.fit(X_train)
X_train_poly = pd.DataFrame(
    poly.transform(X_train),
    columns=poly.get_feature_names(X_train.columns),
    index=X_train.index,
)

In [74]:
lm = LinearRegression()
lm.fit(X_train_poly, y_train)
X_validate_poly = poly.transform(X_validate)
train_pred['baseline'] = y_train.mean()
validate_pred['baseline'] = y_validate.mean()
train_pred['polynomial degree 4'] = lm.predict(X_train_poly)
validate_pred['polynomial degree 4'] = lm.predict(X_validate_poly)
train_pred.head()

Unnamed: 0,actual,baseline,polynomial degree 2,polynomial degree 2 only interaction,polynomial degree 3,polynomial degree 4
14540,471024.0,409241.54,285097.77,293158.68,299474.65,309613.66
11215,61339.0,409241.54,313418.0,319185.77,308079.89,318497.96
27417,75483.0,409241.54,333493.35,331075.64,335684.56,302993.86
21049,430000.0,409241.54,243253.97,233450.18,263100.74,258815.31
25174,619000.0,409241.54,337785.11,328970.75,337630.3,340044.95


In [75]:
train_rmse = mean_squared_error(train_pred.actual,train_pred['polynomial degree 4'], squared = False)
validate_rmse = mean_squared_error(validate_pred.actual,validate_pred['polynomial degree 4'], squared = False)
train_rmse, validate_rmse

(221616.71599597967, 225548.63048489933)

**K=3 has the lowest rmse**

In [88]:
# Adding k=3 to final table
poly = PolynomialFeatures(degree=3, include_bias=False, interaction_only=False)
poly.fit(X_train)
X_train_poly = pd.DataFrame(
    poly.transform(X_train),
    columns=poly.get_feature_names(X_train.columns),
    index=X_train.index,
)
lm = LinearRegression()
lm.fit(X_train_poly, y_train)
X_validate_poly = poly.transform(X_validate)
X_test_poly = poly.transform(X_test)
final_train_predictions['baseline'] = y_train.mean()
final_validate_predictions['baseline'] = y_validate.mean()
final_test_predictions['baseline'] = y_test.mean()
final_train_predictions['polynomial degree 3'] = lm.predict(X_train_poly)
final_validate_predictions['polynomial degree 3'] = lm.predict(X_validate_poly)
final_test_predictions['polynomial degree 3'] = lm.predict(X_test_poly)

## Lasso-Lars

In [76]:
from sklearn.linear_model import LassoLars
# create the model object
lars = LassoLars(alpha=0)

# fit the model to our training data
lars.fit(X_train, y_train)

# predict validate
X_train_pred_lars = lars.predict(X_train)

pd.Series(lars.coef_, index=X_train.columns).sort_values()

bedrooms              -269,204.77
lot_size              -117,309.33
county_Los Angeles     -54,913.82
county_Orange           11,562.02
has_pool                36,232.93
age                     83,563.09
bathrooms              219,997.52
square_feet          1,170,303.06
dtype: float64

In [77]:
# create the model object
lars = LassoLars(alpha=1)

# fit the model to our training data
lars.fit(X_train, y_train)

# predict validate
X_train_pred_lars = lars.predict(X_train)
X_validate_pred_lars = lars.predict(X_validate)
# Add lassolars predictions to our predictions DataFrame
train_pred['lasso_lars'] = X_train_pred_lars
validate_pred['lasso_lars'] = X_validate_pred_lars

In [78]:
train_rmse = mean_squared_error(train_pred.actual,train_pred['lasso_lars'], squared = False)
validate_rmse = mean_squared_error(validate_pred.actual,validate_pred['lasso_lars'], squared = False)
train_rmse, validate_rmse

(232745.2303032925, 230203.61065288217)

In [79]:
validate_rmse-train_rmse

-2541.6196504103427

In [90]:
# Adding to final table
# create the model object
lars = LassoLars(alpha=1)

# fit the model to our training data
lars.fit(X_train, y_train)

# predict validate
X_train_pred_lars = lars.predict(X_train)
X_validate_pred_lars = lars.predict(X_validate)
X_test_pred_lars = lars.predict(X_test)
# Add lassolars predictions to our predictions DataFrame
final_train_predictions['lasso_lars'] = X_train_pred_lars
final_validate_predictions['lasso_lars'] = X_validate_pred_lars
final_test_predictions['lasso_lars'] = X_test_pred_lars

## Generalized Linear Model

In [80]:
from sklearn.linear_model import TweedieRegressor

# create the model object
glm = TweedieRegressor(power=1, alpha=0)

# fit the model to our training data
glm.fit(X_train, y_train)

# predict train
X_train_predict_glm = glm.predict(X_train)
X_validate_predict_glm = glm.predict(X_validate)
# Add lassolars predictions to our predictions DataFrame
train_pred['glm'] = X_train_predict_glm
validate_pred['glm'] = X_validate_predict_glm

In [81]:
train_rmse = mean_squared_error(train_pred.actual,train_pred['glm'], squared = False)
validate_rmse = mean_squared_error(validate_pred.actual,validate_pred['glm'], squared = False)
train_rmse, validate_rmse

(235321.89586322848, 233617.59325644202)

In [82]:
validate_rmse-train_rmse

-1704.3026067864557

In [91]:
# Adding to final table

glm = TweedieRegressor(power=1, alpha=0)

# fit the model to our training data
glm.fit(X_train, y_train)

# predict train
X_train_predict_glm = glm.predict(X_train)
X_validate_predict_glm = glm.predict(X_validate)
X_test_predict_glm = glm.predict(X_test)
# Add lassolars predictions to our predictions DataFrame
final_train_predictions['glm'] = X_train_predict_glm
final_validate_predictions['glm'] = X_validate_predict_glm
final_test_predictions['glm'] = X_test_predict_glm

## Evaluation

In [94]:
def calculate_rmse(y_predicted):
    return mean_squared_error(final_train_predictions.actual, y_predicted, squared = False)

final_train_predictions.apply(calculate_rmse).sort_values()

actual                      0.00
polynomial degree 3   223,633.46
lasso_lars            232,745.23
multiple_rfe_k=7      232,763.09
glm                   235,321.90
baseline              274,862.72
dtype: float64

In [95]:
def calculate_rmse(y_predicted):
    return mean_squared_error(final_validate_predictions.actual, y_predicted, squared = False)

final_validate_predictions.apply(calculate_rmse).sort_values()

actual                      0.00
polynomial degree 3   222,070.92
lasso_lars            230,203.61
multiple_rfe_k=7      230,221.87
glm                   233,617.59
baseline              272,468.83
dtype: float64

In [96]:
def calculate_rmse(y_predicted):
    return mean_squared_error(final_test_predictions.actual, y_predicted, squared = False)

final_test_predictions.apply(calculate_rmse).sort_values()

actual                      0.00
polynomial degree 3   225,323.49
lasso_lars            233,614.86
multiple_rfe_k=7      233,644.57
glm                   237,682.56
baseline              277,387.85
dtype: float64

In [97]:
(277387.85-225323.49)/277387.85

0.18769517121964782