In [1]:
from imports import *

# set columnns display format
pd.set_option('display.max_columns', None)
# default pandas decimal number display format
# pd.options.display.float_format = '{:20,.3f}'.format

In [2]:
# import raw, messy data
df = acquire.get_zillow_data()

In [3]:
# clean data
df = prepare.prep_zillow(df)

In [4]:
# scale and split the data
train, validate, test = prepare.split_scale(df)

In [9]:
# adding location clusters to dataframe
cols = ['county_Los Angeles', 'county_Orange', 'zip_code']
kmeans = KMeans(n_clusters=4)
kmeans.fit(train[cols])

train['location_clusters'] = kmeans.predict(train[cols])
validate['location_clusters'] = kmeans.predict(validate[cols])
test['location_clusters'] = kmeans.predict(test[cols])

In [11]:
# adding area clusters to dataframe - somewhat significant

cols =['total_sqft', 'lot_sqft', 'living_sqft']
kmeans = KMeans(n_clusters=3)
kmeans.fit(train[cols])

train['area_clusters'] = kmeans.predict(train[cols])
validate['area_clusters'] = kmeans.predict(validate[cols])
test['area_clusters'] = kmeans.predict(test[cols])

In [12]:
# adding size clusters to dataframe - not significant

cols = ['bedrooms', 'bathrooms', 'full_bath']
kmeans = KMeans(n_clusters=3)
kmeans.fit(train[cols])
train['size_clusters'] = kmeans.predict(train[cols])
validate['size_clusters'] = kmeans.predict(validate[cols])
test['size_clusters'] = kmeans.predict(test[cols])

In [13]:
# adding value clusters to dataframe - not significant

cols = ['structure_value', 'assessed_value', 'land_value','taxamount']
kmeans = KMeans(n_clusters=3)
kmeans.fit(train[cols])
train['value_clusters'] = kmeans.predict(train[cols])
validate['value_clusters'] = kmeans.predict(validate[cols])
test['value_clusters'] = kmeans.predict(test[cols])

In [14]:
# set x and y
cols = ['age', 'location_clusters', 'size_clusters', 'value_clusters','transaction_month']

X_train = train[cols]
y_train = train.logerror

X_validate = validate[cols]
y_validate = validate.logerror

X_test = test[cols]
y_test = test.logerror

## Multiple  Regression + RFE

In [16]:
from sklearn.linear_model import LinearRegression
lm = LinearRegression()
k = 3

# 1. Transform our X
rfe = RFE(lm, n_features_to_select=3)
rfe.fit(X_train, y_train)
print('selected top 3 features:', X_train.columns[rfe.support_])
X_train_rfe = rfe.transform(X_train)

selected top 3 features: Index(['age', 'location_clusters', 'size_clusters'], dtype='object')


In [17]:
train_predictions = pd.DataFrame({
    'actual': train.assessed_value
}) 
validate_predictions = pd.DataFrame({
    'actual': validate.assessed_value
})

In [18]:
train_predictions['baseline'] = y_train.mean()
validate_predictions['basline']=y_validate.mean()

In [22]:
for k in range(2,6):
    lm = LinearRegression()
    # 1. Transform our X
    rfe = RFE(lm, n_features_to_select=k)
    rfe.fit(X_train, y_train)
    # 2. Use the transformed x in our model
    X_train_rfe = rfe.transform(X_train)
    X_validate_rfe = rfe.transform(X_validate)
    lm.fit(X_train_rfe, y_train)
    
    # Make predictions
    train_predictions['multiple_rfe_k=', k] = lm.predict(X_train_rfe)
    validate_predictions['multiple_rfe_k=', k] = lm.predict(X_validate_rfe)
    k+=1

In [23]:
def calculate_rmse(y_predicted):
    return mean_squared_error(train_predictions.actual, y_predicted, squared = False)

train_predictions.apply(calculate_rmse).sort_values()

actual                  0.000000
(multiple_rfe_k=, 4)    0.363425
(multiple_rfe_k=, 5)    0.363429
(multiple_rfe_k=, 3)    0.363556
(multiple_rfe_k=, 2)    0.363805
baseline                0.364527
dtype: float64

In [24]:
def calculate_rmse(y_predicted):
    return mean_squared_error(validate_predictions.actual, y_predicted, squared = False)

validate_predictions.apply(calculate_rmse).sort_values()

actual                  0.000000
(multiple_rfe_k=, 4)    0.364359
(multiple_rfe_k=, 5)    0.364364
(multiple_rfe_k=, 3)    0.364486
(multiple_rfe_k=, 2)    0.364746
basline                 0.366898
dtype: float64