In [1]:
from imports import *

# set columnns display format
pd.set_option('display.max_columns', None)
# default pandas decimal number display format
# pd.options.display.float_format = '{:20,.3f}'.format

In [2]:
# import raw, messy data
df = acquire.get_zillow_data()

In [3]:
# clean data
df = prepare.prep_zillow(df)

In [4]:
# scale and split the data
train, validate, test = prepare.split_scale(df)

In [5]:
# adding location clusters A to dataframe
cols = ['latitude', 'longitude', 'zip_bin_insgfnt high', 'zip_bin_insgfnt low',
       'zip_bin_sgfnt high']
kmeans = KMeans(n_clusters=4)
kmeans.fit(train[cols])

train['location_clusters_a'] = kmeans.predict(train[cols])
validate['location_clusters_a'] = kmeans.predict(validate[cols])
test['location_clusters_a'] = kmeans.predict(test[cols])

In [6]:
# adding location clusters B to dataframe
cols = ['county_Los Angeles', 'county_Orange', 'zip_bin_insgfnt high', 'zip_bin_insgfnt low',
       'zip_bin_sgfnt high']
kmeans = KMeans(n_clusters=5)
kmeans.fit(train[cols])

train['location_clusters_b'] = kmeans.predict(train[cols])
validate['location_clusters_b'] = kmeans.predict(validate[cols])
test['location_clusters_b'] = kmeans.predict(test[cols])

In [7]:
# adding area clusters to dataframe - somewhat significant

cols =['total_sqft', 'lot_sqft', 'living_sqft']
kmeans = KMeans(n_clusters=3)
kmeans.fit(train[cols])

train['area_clusters'] = kmeans.predict(train[cols])
validate['area_clusters'] = kmeans.predict(validate[cols])
test['area_clusters'] = kmeans.predict(test[cols])

In [19]:
# adding size clusters A to dataframe - not significant

cols = ['bedrooms', 'bathrooms', 'full_bath']
kmeans = KMeans(n_clusters=3)
kmeans.fit(train[cols])
train['size_clusters_a'] = kmeans.predict(train[cols])
validate['size_clusters_a'] = kmeans.predict(validate[cols])
test['size_clusters_a'] = kmeans.predict(test[cols])

In [20]:
# adding size clusters B to dataframe - not significant

cols = ['bedrooms', 'bathrooms', 'full_bath', 'roomcnt']
kmeans = KMeans(n_clusters=4)
kmeans.fit(train[cols])
train['size_clusters_b'] = kmeans.predict(train[cols])
validate['size_clusters_b'] = kmeans.predict(validate[cols])
test['size_clusters_b'] = kmeans.predict(test[cols])

In [14]:
# adding value clusters to dataframe - not significant

cols = ['structure_value', 'assessed_value', 'land_value','taxamount']
kmeans = KMeans(n_clusters=3)
kmeans.fit(train[cols])
train['value_clusters'] = kmeans.predict(train[cols])
validate['value_clusters'] = kmeans.predict(validate[cols])
test['value_clusters'] = kmeans.predict(test[cols])

- ['age', 'location_clusters_a', 'area_clusters','size_clusters_a','value_clusters','transaction_month']
    Train:
    (multiple_rfe_k=, 4)    0.363455
    (multiple_rfe_k=, 5)    0.363463
    (multiple_rfe_k=, 2)    0.363488
    (multiple_rfe_k=, 3)    0.363528
    baseline                0.364527
    
    Validate:
    (multiple_rfe_k=, 4)    0.364318
    (multiple_rfe_k=, 5)    0.364327
    (multiple_rfe_k=, 2)    0.364370
    (multiple_rfe_k=, 3)    0.364388
    basline                 0.366898
    
    **Polynomial k=2**
    Basline (0.16262411066536314, 0.15806831374792724)
    (0.16163094482814297, 0.1576410238958071)
    
    Polynomial k=3
    (0.16097933947491191, 0.15775996884892424)
    
    Lasso-Lars
    (0.16262411066536314, 0.1580787480680725)
    
- ['age', 'location_clusters_a', 'total_sqft','lot_sqft','size_clusters_a','value_clusters','transaction_month']

In [25]:
# set x and y
cols = ['age', 'location_clusters_a', 'total_sqft','lot_sqft','size_clusters_a','value_clusters','transaction_month']

X_train = train[cols]
y_train = train.logerror

X_validate = validate[cols]
y_validate = validate.logerror

X_test = test[cols]
y_test = test.logerror

## Multiple  Regression + RFE

In [24]:
from sklearn.linear_model import LinearRegression
lm = LinearRegression()
k = 3

# 1. Transform our X
rfe = RFE(lm, n_features_to_select=5)
rfe.fit(X_train, y_train)
print('selected top 3 features:', X_train.columns[rfe.support_])
X_train_rfe = rfe.transform(X_train)

selected top 3 features: Index(['age', 'location_clusters_a', 'area_clusters', 'size_clusters_a',
       'size_clusters_b'],
      dtype='object')


In [26]:
train_predictions = pd.DataFrame({
    'actual': y_train.logerror
}) 
validate_predictions = pd.DataFrame({
    'actual': y_validate
})

In [27]:
train_predictions['baseline'] = y_train.mean()
validate_predictions['basline']=y_validate.mean()

In [28]:
for k in range(2,6):
    lm = LinearRegression()
    # 1. Transform our X
    rfe = RFE(lm, n_features_to_select=k)
    rfe.fit(X_train, y_train)
    # 2. Use the transformed x in our model
    X_train_rfe = rfe.transform(X_train)
    X_validate_rfe = rfe.transform(X_validate)
    lm.fit(X_train_rfe, y_train)
    
    # Make predictions
    train_predictions['multiple_rfe_k=', k] = lm.predict(X_train_rfe)
    validate_predictions['multiple_rfe_k=', k] = lm.predict(X_validate_rfe)
    k+=1

In [29]:
def calculate_rmse(y_predicted):
    return mean_squared_error(train_predictions.actual, y_predicted, squared = False)

train_predictions.apply(calculate_rmse).sort_values()

actual                  0.000000
(multiple_rfe_k=, 4)    0.363455
(multiple_rfe_k=, 5)    0.363463
(multiple_rfe_k=, 2)    0.363488
(multiple_rfe_k=, 3)    0.363528
baseline                0.364527
dtype: float64

In [30]:
def calculate_rmse(y_predicted):
    return mean_squared_error(validate_predictions.actual, y_predicted, squared = False)

validate_predictions.apply(calculate_rmse).sort_values()

actual                  0.000000
(multiple_rfe_k=, 4)    0.364318
(multiple_rfe_k=, 5)    0.364327
(multiple_rfe_k=, 2)    0.364370
(multiple_rfe_k=, 3)    0.364388
basline                 0.366898
dtype: float64

## Polynomial Features

In [32]:
train_pred = pd.DataFrame({
    'actual': train.logerror
}) 
validate_pred = pd.DataFrame({
    'actual': validate.logerror
}) 
train_pred['baseline'] = y_train.mean()
validate_pred['basline']=y_validate.mean()

In [33]:
from sklearn.preprocessing import PolynomialFeatures

# 1. Generate Polynomial Features, k=2
poly = PolynomialFeatures(degree=2, include_bias=False, interaction_only=False)
poly.fit(X_train)
X_train_poly = pd.DataFrame(
    poly.transform(X_train),
    columns=poly.get_feature_names(X_train.columns),
    index=X_train.index,
)

In [34]:
lm = LinearRegression()
lm.fit(X_train_poly, y_train)
X_validate_poly = poly.transform(X_validate)
train_pred['baseline'] = y_train.mean()
validate_pred['baseline'] = y_validate.mean()
train_pred['polynomial degree 2'] = lm.predict(X_train_poly)
validate_pred['polynomial degree 2'] = lm.predict(X_validate_poly)

In [48]:
baseline_rmse = mean_squared_error(train_pred.actual,train_pred['baseline'], squared = False)
baseline_rmse

0.16262411066536314

In [49]:
validate_rmse = mean_squared_error(validate_pred.actual,validate_pred['baseline'], squared = False)
validate_rmse

0.15806831374792724

In [43]:
train_rmse = mean_squared_error(train_pred.actual,train_pred['polynomial degree 2'], squared = False)
validate_rmse = mean_squared_error(validate_pred.actual,validate_pred['polynomial degree 2'], squared = False)
train_rmse, validate_rmse

(0.16163094482814297, 0.1576410238958071)

In [35]:
# Interaction terms only

poly = PolynomialFeatures(degree=2, include_bias=False, interaction_only=True)
poly.fit(X_train)
X_train_poly = pd.DataFrame(
    poly.transform(X_train),
    columns=poly.get_feature_names(X_train.columns),
    index=X_train.index,
)
lm = LinearRegression()
lm.fit(X_train_poly, y_train)

X_validate_poly = poly.transform(X_validate)
train_pred['polynomial degree 2 only interaction'] = lm.predict(X_train_poly)
validate_pred['polynomial degree 2 only interaction'] = lm.predict(X_validate_poly)

In [40]:
train_rmse = mean_squared_error(train_pred.actual,train_pred['polynomial degree 2 only interaction'], squared = False)
validate_rmse = mean_squared_error(validate_pred.actual,validate_pred['polynomial degree 2 only interaction'], squared = False)
train_rmse, validate_rmse

(0.16223911452454318, 0.15809481556209198)

In [36]:
# k=3
poly = PolynomialFeatures(degree=3, include_bias=False, interaction_only=False)
poly.fit(X_train)
X_train_poly = pd.DataFrame(
    poly.transform(X_train),
    columns=poly.get_feature_names(X_train.columns),
    index=X_train.index,
)
lm = LinearRegression()
lm.fit(X_train_poly, y_train)
X_validate_poly = poly.transform(X_validate)
train_pred['baseline'] = y_train.mean()
validate_pred['baseline'] = y_validate.mean()
train_pred['polynomial degree 3'] = lm.predict(X_train_poly)
validate_pred['polynomial degree 3'] = lm.predict(X_validate_poly)

In [50]:
train_rmse = mean_squared_error(train_pred.actual,train_pred['polynomial degree 3'], squared = False)
validate_rmse = mean_squared_error(validate_pred.actual,validate_pred['polynomial degree 3'], squared = False)
train_rmse, validate_rmse

(0.16097933947491191, 0.15775996884892424)

In [38]:
# k=4
poly = PolynomialFeatures(degree=4, include_bias=False, interaction_only=False)
poly.fit(X_train)
X_train_poly = pd.DataFrame(
    poly.transform(X_train),
    columns=poly.get_feature_names(X_train.columns),
    index=X_train.index,
)
lm = LinearRegression()
lm.fit(X_train_poly, y_train)
X_validate_poly = poly.transform(X_validate)
train_pred['baseline'] = y_train.mean()
validate_pred['baseline'] = y_validate.mean()
train_pred['polynomial degree 4'] = lm.predict(X_train_poly)
validate_pred['polynomial degree 4'] = lm.predict(X_validate_poly)

In [51]:
train_rmse = mean_squared_error(train_pred.actual,train_pred['polynomial degree 4'], squared = False)
validate_rmse = mean_squared_error(validate_pred.actual,validate_pred['polynomial degree 4'], squared = False)
train_rmse, validate_rmse

(0.1603366092078218, 0.15824011533835622)

## Lasso-Lars

In [52]:
# create the model object
lars = LassoLars(alpha=1)

# fit the model to our training data
lars.fit(X_train, y_train)

# predict validate
X_train_pred_lars = lars.predict(X_train)
X_validate_pred_lars = lars.predict(X_validate)
# Add lassolars predictions to our predictions DataFrame
train_pred['lasso_lars'] = X_train_pred_lars
validate_pred['lasso_lars'] = X_validate_pred_lars

In [53]:
train_rmse = mean_squared_error(train_pred.actual,train_pred['lasso_lars'], squared = False)
validate_rmse = mean_squared_error(validate_pred.actual,validate_pred['lasso_lars'], squared = False)
train_rmse, validate_rmse

(0.16262411066536314, 0.1580787480680725)

## Generalized Linear Model

In [54]:
from sklearn.linear_model import TweedieRegressor

# create the model object
glm = TweedieRegressor(power=1, alpha=0)

# fit the model to our training data
glm.fit(X_train, y_train)

# predict train
X_train_predict_glm = glm.predict(X_train)
X_validate_predict_glm = glm.predict(X_validate)
# Add lassolars predictions to our predictions DataFrame
train_pred['glm'] = X_train_predict_glm
validate_pred['glm'] = X_validate_predict_glm

ValueError: Some value(s) of y are out of the valid range for family TweedieDistribution