In [1]:
# Imports necessary to acquire, prepare, explore, visualize, analyze, and model data

import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import env
import acquire
import prepare 

from tabulate import tabulate
from IPython.display import Markdown, display

from math import sqrt
from scipy import stats

import sklearn.preprocessing
from sklearn.model_selection import train_test_split
from scipy.stats import pearsonr, spearmanr

from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression, LassoLars, TweedieRegressor
from sklearn.preprocessing import PolynomialFeatures
from sklearn.impute import SimpleImputer

np.random.seed(123)

Using the project's acquire and prepare functions to acquire and wrangle the data.

In [2]:
# Load the raw dataset through the project's acquire module.
df = acquire.get_zillow_data()

In [6]:
# Peek at the county land-use code column.
df['propertycountylandusecode']

0

In [None]:
# Start from a fresh acquire and pass the frame through each prepare step in
# order; re-acquiring here means the cell does not depend on an earlier `df`.
df = (
    acquire.get_zillow_data()
    .pipe(prepare.remove_outliers)
    .pipe(prepare.handle_nulls)
    .pipe(prepare.rename_columns)
    .pipe(prepare.prepare_locs)
)

In [None]:
# Column names, dtypes, and non-null counts of the prepared frame.
df.info()

This looks good for now. Some things to think about exploring:

- Curious whether transaction date and log error are related: does the time of year of the transaction increase or decrease the likelihood of error?

- Look for like things: how closely linked are bedrooms and bathrooms, and could they be combined into one feature?

- What does the chart of correlations with log error look like?

- What can we bin? Keep this in mind throughout exploration.

- Look at square feet and year built together. Do homes get bigger over time?

- What if we tried k-means on latitude, longitude, and square feet?

- Start here, and keep adding ideas to this list as they come up.

- Look at pool.

In [None]:
# Keep only rows with square_feet below 1123, then drop any row that still has
# a null. Boolean indexing selects the same rows as .where() + dropna(), but
# .where() first blanks every non-matching row to NaN, which upcasts integer
# columns to float; indexing leaves the dtypes untouched.
df = df[df.square_feet < 1123].dropna()

In [None]:
# Three-way split: hold out 20% of the rows as `test`, then carve 30% of the
# remainder off as `validate`; what is left is `train`. The fixed random_state
# makes both splits return the exact same records on every re-run (useful for
# exploration and modeling; the author suggests dropping it when employing the
# model).
train_and_validate, test = train_test_split(df, test_size=0.2, random_state=123)
train, validate = train_test_split(train_and_validate, test_size=0.3, random_state=123)

def printmd(string):
    """Render ``string`` as Markdown in the cell output (for styled text)."""
    rendered = Markdown(string)
    display(rendered)
print()
# Report how many records landed in each subset after splitting.
for template, subset in (
    ('**Number of Train Records:** {:,}', train),
    ('**Number of Validate Records:** {:,}', validate),
    ('**Number of Test Records:** {:,}', test),
):
    printmd(template.format(len(subset)))


In [None]:
from sklearn.cluster import KMeans
import sklearn.preprocessing

In [None]:
scaler = sklearn.preprocessing.MinMaxScaler()
scaler.fit(train[['bathrooms', 'bedrooms', 'square_feet']])
# inserts the 4 selected features into the scaler

train[['scalbathrooms', 'scalbedrooms', 'scalsquare_feet']] = scaler.transform(train[['bathrooms', 'bedrooms', 'square_feet']])

In [None]:
X = train[['scalbathrooms', 'scalbedrooms', 'scalsquare_feet']]

kmeans = KMeans(n_clusters=7)
kmeans.fit(X)

kmeans.predict(X)

train['cluster'] = kmeans.predict(X)

centroids = pd.DataFrame(kmeans.cluster_centers_, columns=X.columns)
centroids

In [None]:
scaler = sklearn.preprocessing.MinMaxScaler()
scaler.fit(validate[['bathrooms', 'bedrooms', 'square_feet']])
# inserts the 4 selected features into the scaler

validate[['scalbathrooms', 'scalbedrooms', 'scalsquare_feet']] = scaler.transform(validate[['bathrooms', 'bedrooms', 'square_feet']])

In [None]:
X = validate[['scalbathrooms', 'scalbedrooms', 'scalsquare_feet']]

kmeans = KMeans(n_clusters=7)
kmeans.fit(X)

kmeans.predict(X)

validate['cluster'] = kmeans.predict(X)

centroids = pd.DataFrame(kmeans.cluster_centers_, columns=X.columns)
centroids

In [None]:
scaler = sklearn.preprocessing.MinMaxScaler()
scaler.fit(test[['bathrooms', 'bedrooms', 'square_feet']])
# inserts the 4 selected features into the scaler

test[['scalbathrooms', 'scalbedrooms', 'scalsquare_feet']] = scaler.transform(test[['bathrooms', 'bedrooms', 'square_feet']])

In [None]:
X = test[['scalbathrooms', 'scalbedrooms', 'scalsquare_feet']]

kmeans = KMeans(n_clusters=7)
kmeans.fit(X)

kmeans.predict(X)

test['cluster'] = kmeans.predict(X)

centroids = pd.DataFrame(kmeans.cluster_centers_, columns=X.columns)
centroids

In [None]:
scaler = sklearn.preprocessing.MinMaxScaler()
scaler.fit(train[['year_built', 'square_feet']])
# inserts the 4 selected features into the scaler

train[['scalyear', 'scalsquare_feet']] = scaler.transform(train[['year_built', 'square_feet']])

In [None]:
X = train[['scalyear', 'scalsquare_feet']]

kmeans = KMeans(n_clusters=7)
kmeans.fit(X)

kmeans.predict(X)

train['cluster2'] = kmeans.predict(X)

centroids = pd.DataFrame(kmeans.cluster_centers_, columns=X.columns)
centroids

In [None]:
scaler = sklearn.preprocessing.MinMaxScaler()
scaler.fit(validate[['year_built', 'square_feet']])
# inserts the 4 selected features into the scaler

validate[['scalyear', 'scalsquare_feet']] = scaler.transform(validate[['year_built', 'square_feet']])

In [None]:
X = validate[['scalyear', 'scalsquare_feet']]

kmeans = KMeans(n_clusters=7)
kmeans.fit(X)

kmeans.predict(X)

validate['cluster2'] = kmeans.predict(X)

centroids = pd.DataFrame(kmeans.cluster_centers_, columns=X.columns)
centroids

In [None]:
scaler = sklearn.preprocessing.MinMaxScaler()
scaler.fit(test[['year_built', 'square_feet']])
# inserts the 4 selected features into the scaler

test[['scalyear', 'scalsquare_feet']] = scaler.transform(test[['year_built', 'square_feet']])

In [None]:
X = test[['scalyear', 'scalsquare_feet']]

kmeans = KMeans(n_clusters=7)
kmeans.fit(X)

kmeans.predict(X)

test['cluster2'] = kmeans.predict(X)

centroids = pd.DataFrame(kmeans.cluster_centers_, columns=X.columns)
centroids

In [None]:
# Elbow plot: inertia for k = 2..11 on the scaled year/size features.
# The features are taken from train explicitly; the leftover `X` from the
# preceding cell holds the test subset, and k should not be chosen by looking
# at test data.
elbow_X = train[['scalyear', 'scalsquare_feet']]

plt.figure(figsize=(9, 6))
pd.Series({k: KMeans(k).fit(elbow_X).inertia_ for k in range(2, 12)}).plot(marker='x')
plt.xticks(range(2, 12))
plt.xlabel('k')
plt.ylabel('inertia')
plt.title('Change in inertia as k increases');

In [None]:
# Unfitted 30-cluster model with random centroid initialization.
kmeans = KMeans(n_clusters=30, init="random")

In [None]:
# Latitude/longitude of the train rows as a plain NumPy array.
coords = train.loc[:, ['latitude', 'longitude']].to_numpy()

In [None]:
# Preview the first rows only rather than rendering the whole frame.
train.head()

In [None]:
coords = train[['latitude', 'longitude']].to_numpy()

kmeans.fit(coords)

kmeans.predict(coords)

train['cluster3'] = kmeans.predict(coords)


In [None]:
coords = validate[['latitude', 'longitude']].to_numpy()

kmeans.fit(coords)

kmeans.predict(coords)

validate['cluster3'] = kmeans.predict(coords)


In [None]:
coords = test[['latitude', 'longitude']].to_numpy()

kmeans.fit(coords)

kmeans.predict(coords)

test['cluster3'] = kmeans.predict(coords)


In [None]:
# Preview the first rows (now including the cluster columns) rather than
# rendering the whole frame.
train.head()

In [None]:
# Size buckets by square_feet. Boolean indexing selects the same rows as
# .where() + dropna() but keeps column dtypes intact (.where() blanks the
# non-matching rows to NaN first, upcasting integer columns to float).
xsmall_train = train[train.square_feet < 1184].dropna()
xsmall_validate = validate[validate.square_feet < 1184].dropna()
xsmall_test = test[test.square_feet < 1184].dropna()

# NOTE(review): the buckets below are built from train only, and `df` was
# restricted to square_feet < 1123 before the split, so these three will be
# empty unless that upstream filter changes.
small = train[(train.square_feet >= 1184) & (train.square_feet < 1423)].dropna()
med = train[(train.square_feet >= 1423) & (train.square_feet < 1988)].dropna()
large = train[train.square_feet >= 1988].dropna()

In [None]:
# One-hot encode the location cluster id for each split. The validate and test
# dummies are reindexed to train's dummy columns so all three splits end up
# with identical columns even when a cluster id does not occur in a split
# (a missing id becomes an all-zero column).
# dummy1 = pd.get_dummies(train['cluster'])
# dummy2 = pd.get_dummies(train['cluster2'])
dummy3 = pd.get_dummies(train['cluster3'])

# dummy4 = pd.get_dummies(validate['cluster'])
# dummy5 = pd.get_dummies(validate['cluster2'])
dummy6 = pd.get_dummies(validate['cluster3']).reindex(columns=dummy3.columns, fill_value=0)

# dummy7 = pd.get_dummies(test['cluster'])
# dummy8 = pd.get_dummies(test['cluster2'])
dummy9 = pd.get_dummies(test['cluster3']).reindex(columns=dummy3.columns, fill_value=0)


# Attach the dummy columns next to the existing columns of each split.
train = pd.concat([train, dummy3], axis=1)
validate = pd.concat([validate, dummy6], axis=1)
test = pd.concat([test, dummy9], axis=1)

In [None]:
# Check the columns and dtypes of validate after the dummy columns were attached.
validate.info()

In [None]:
# Model inputs (X) and target (y) for each split.
# NOTE(review): cluster3 is an integer cluster label and enters the model as a
# single numeric column — confirm that is intended.
feature_cols = ['cluster3', 'bedrooms', 'bathrooms', 'year_built']
target_cols = ['log_error']

X_train, y_train = train[feature_cols], train[target_cols]
X_validate, y_validate = validate[feature_cols], validate[target_cols]
X_test, y_test = test[feature_cols], test[target_cols]

In [None]:
# Confirm X_validate holds only the selected feature columns.
X_validate.info()

In [None]:
# Min-max scaler for the three numeric model inputs, fitted on train only.
scaled_inputs = ['bedrooms', 'bathrooms', 'year_built']
scaler = sklearn.preprocessing.MinMaxScaler()
scaler.fit(X_train[scaled_inputs])

In [None]:
# Preview the first rows only rather than rendering the whole frame.
X_validate.head()

In [None]:
# Apply the fitted scaler to the same three columns of every split
# (v -> train, v1 -> validate, v2 -> test).
v, v1, v2 = (
    scaler.transform(split[['bedrooms', 'bathrooms', 'year_built']])
    for split in (X_train, X_validate, X_test)
)

In [None]:
# Wrap the scaled arrays in DataFrames with explicit string column names.
# With the default integer labels (0, 1, 2) the feature frame would mix int and
# str column names once these are concatenated next to 'cluster3', and recent
# scikit-learn releases refuse to fit on a DataFrame whose column names are of
# mixed types.
scaled_cols = ['scaled_bedrooms', 'scaled_bathrooms', 'scaled_year_built']
v = pd.DataFrame(data=v, columns=scaled_cols)
v1 = pd.DataFrame(data=v1, columns=scaled_cols)
v2 = pd.DataFrame(data=v2, columns=scaled_cols)

In [None]:
# Give each scaled frame the row index of the split it was computed from, so
# the column-wise concat lines the rows up correctly.
for scaled_frame, source_frame in ((v, X_train), (v1, X_validate), (v2, X_test)):
    scaled_frame.index = source_frame.index

In [None]:
# Append the scaled columns to the right of each split's feature frame.
X_train = pd.concat((X_train, v), axis='columns')
X_validate = pd.concat((X_validate, v1), axis='columns')
X_test = pd.concat((X_test, v2), axis='columns')

In [None]:
# Check that X_train now carries both the original and the scaled columns.
X_train.info()

In [None]:
# Drop the unscaled originals from every feature frame.
unscaled_cols = ['bedrooms', 'bathrooms', 'year_built']

X_train = X_train.drop(unscaled_cols, axis=1)
X_validate = X_validate.drop(unscaled_cols, axis=1)
X_test = X_test.drop(unscaled_cols, axis=1)

# Keep each target frame down to the single log_error column.
y_train = y_train[['log_error']]
y_validate = y_validate[['log_error']]
y_test = y_test[['log_error']]

In [None]:
#X_validate[['scaledsize', 'scaledyear']] = scaler.transform(X_validate[['square_feet', 'year_built']])

In [None]:
# X_validate[['scaledsize', 'scaledyear']] = scaler.transform(X_validate[['square_feet', 'year_built']])
# X_test[['scaledsize', 'scaledyear']] = data=scaler.transform(X_test[['square_feet', 'year_built']]) 

In [None]:
# X_train

In [None]:
# X_train = X_train.drop(columns = ['square_feet', 'year_built'])
# X_validate = X_validate.drop(columns = ['square_feet', 'year_built'])
# X_test = X_test.drop(columns = ['square_feet','year_built'])

In [None]:
# Baseline "model": always predict the mean log error observed in train.
# The same train-derived value is used for validate, so the out-of-sample
# baseline is not computed from the very data it is scored against.
baseline_log_error = y_train.log_error.mean()
y_train['pred_mean'] = baseline_log_error
y_validate['pred_mean'] = baseline_log_error

# Baseline RMSE (square root of MSE) for train and validate.
rmse_train = mean_squared_error(y_train.log_error, y_train.pred_mean)**(1/2)
rmse_validate = mean_squared_error(y_validate.log_error, y_validate.pred_mean)**(1/2)

print("Baseline RMSE\nTrain/In-Sample: ", (rmse_train))
print("Baseline RMSE\nValidate/Out-of-Sample: ", (rmse_validate))

In [None]:
#lm = LinearRegression(normalize=True)
# create the model

In [None]:
#lm.fit(X_train, y_train.log_error)
# fit the model to scaled training data

In [None]:
#y_train['value_predict_lm'] = lm.predict(X_train)
# computes model predictions

In [None]:
#rmse_train = mean_squared_error(y_train.log_error, y_train.value_predict_lm)**(1/2)

In [None]:
#X_train

In [None]:
#_validate['value_predict_lm'] = lm.predict(X_validate)

In [None]:
X_train.info()

# Create the model. The `normalize` argument was deprecated in scikit-learn 1.0
# and removed in 1.2, so passing it raises a TypeError on current releases.
# For ordinary least squares the predictions do not depend on it.
lm = LinearRegression()

# Fit the model to the scaled training data.
lm.fit(X_train, y_train.log_error)

# Model predictions on train.
y_train['value_predict_lm'] = lm.predict(X_train)

# Model RMSE on train.
rmse_train = mean_squared_error(y_train.log_error, y_train.value_predict_lm)**(1/2)

In [None]:
# Check X_validate's columns before it is passed to the model for prediction.
X_validate.info()

In [None]:
# Create the model. The `normalize` argument was deprecated in scikit-learn 1.0
# and removed in 1.2, so passing it raises a TypeError on current releases.
# For ordinary least squares the predictions do not depend on it.
lm = LinearRegression()

# Fit the model to the scaled training data.
lm.fit(X_train, y_train.log_error)

# Predictions and RMSE on train.
y_train['value_predict_lm'] = lm.predict(X_train)
rmse_train = mean_squared_error(y_train.log_error, y_train.value_predict_lm)**(1/2)

# Predictions and RMSE on validate, using the model fitted on train.
y_validate['value_predict_lm'] = lm.predict(X_validate)
rmse_validate = mean_squared_error(y_validate.log_error, y_validate.value_predict_lm)**(1/2)

printmd("**OLS Linear Regression Performance**")
print("---------------------------------------")
print("RMSE for OLS using LinearRegression\nTraining/In-Sample: ", (rmse_train))
print("---------------------------------------")
print("RMSE for OLS using LinearRegression\nValidation/Out-of-Sample: ", (rmse_validate))

In [None]:
# Is the average prediction lower than the average actual log error on train?
y_train.value_predict_lm.mean() < y_train.log_error.mean()

In [None]:
# Average model prediction on train.
y_train.value_predict_lm.mean()

In [None]:
# Average actual log error on train.
y_train.log_error.mean()

In [None]:
# small = train.copy().where((train.square_feet >= 1184) & (train.square_feet < 1393))
# small = small.dropna()

In [None]:
# xlarge = train.copy().where(train.square_feet >= 4000)
# xlarge = large.dropna()

In [None]:
# xlarge = train.copy().where(train.square_feet >= 2000)
# xlarge = large.dropna()