In [1]:
# Imports necessary to acquire, prepare, explore, visualize, analyze, and model data

import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import env
import acquire
import prepare 

from tabulate import tabulate
from IPython.display import Markdown, display

from math import sqrt
from scipy import stats

import sklearn.preprocessing
from sklearn.model_selection import train_test_split
from scipy.stats import pearsonr, spearmanr

from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression, LassoLars, TweedieRegressor
from sklearn.preprocessing import PolynomialFeatures
from sklearn.impute import SimpleImputer

np.random.seed(123)

using functions created to acquire and wrangle the data

In [2]:
df = acquire.get_zillow_data()

df = prepare.remove_outliers(df)
df = prepare.handle_nulls(df)
df = prepare.rename_columns(df)
df = prepare.prepare_locs(df)

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 47936 entries, 0 to 52440
Data columns (total 15 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   parcelid          47936 non-null  int64         
 1   bathrooms         47936 non-null  float64       
 2   bedrooms          47936 non-null  float64       
 3   square_feet       47936 non-null  int64         
 4   fips              47936 non-null  int64         
 5   garages           47936 non-null  int64         
 6   garage_size       47936 non-null  int64         
 7   lot_size          47936 non-null  int64         
 8   has_pool          47936 non-null  bool          
 9   year_built        47936 non-null  int64         
 10  log_error         47936 non-null  float64       
 11  transaction_date  47936 non-null  datetime64[ns]
 12  county            47936 non-null  object        
 13  latitude          47936 non-null  float64       
 14  longitude         4793

this looks good for now... let's think about some things to explore-

-curious to see if transaction date and log error have any relation, like does the time of year of transation increase/dec likelihood of error?

-let's look for like-things... how closely linked are bedroom and bathroom, could they be combined to one feature?

-what does the correlatoin to log error chart look like?

-what can we bin? keep this in mind through exploration.

-look at square feet and year together. do homes get bigger over time?

-what if we tried K means on lat, long and square feet?

-let's start here and keep adding ideas here when they come up

-look at pool

In [4]:
df = df.copy().where(df.square_feet < 1123)
df = df.dropna()

In [5]:
train, test = train_test_split(df, test_size=.2, random_state=123)
train, validate = train_test_split(train, test_size=.3, random_state=123)
# Splits data into 3 subsets: train, validate, test. Random state specifying that data is split
# with the exact same records when the code is re-run (useful for exploration and modeling, yet
# I suggest dropping this when employing the model)

def printmd(string): # function to format text style
    display(Markdown(string))
print()
printmd('**Number of Train Records:** {:,}'
     .format(len(train)))
printmd('**Number of Validate Records:** {:,}'
     .format(len(validate)))
printmd('**Number of Test Records:** {:,}'
     .format(len(test)))
# returns the amount of records in each subset after splitting





**Number of Train Records:** 3,818

**Number of Validate Records:** 1,637

**Number of Test Records:** 1,364

In [6]:
from sklearn.cluster import KMeans
import sklearn.preprocessing

In [None]:
scaler = sklearn.preprocessing.MinMaxScaler()
scaler.fit(train[['bathrooms', 'bedrooms', 'square_feet']])
# inserts the 4 selected features into the scaler

train[['scalbathrooms', 'scalbedrooms', 'scalsquare_feet']] = scaler.transform(train[['bathrooms', 'bedrooms', 'square_feet']])

In [None]:
X = train[['scalbathrooms', 'scalbedrooms', 'scalsquare_feet']]

kmeans = KMeans(n_clusters=7)
kmeans.fit(X)

kmeans.predict(X)

train['cluster'] = kmeans.predict(X)

centroids = pd.DataFrame(kmeans.cluster_centers_, columns=X.columns)
centroids

In [None]:
scaler = sklearn.preprocessing.MinMaxScaler()
scaler.fit(validate[['bathrooms', 'bedrooms', 'square_feet']])
# inserts the 4 selected features into the scaler

validate[['scalbathrooms', 'scalbedrooms', 'scalsquare_feet']] = scaler.transform(validate[['bathrooms', 'bedrooms', 'square_feet']])

In [None]:
X = validate[['scalbathrooms', 'scalbedrooms', 'scalsquare_feet']]

kmeans = KMeans(n_clusters=7)
kmeans.fit(X)

kmeans.predict(X)

validate['cluster'] = kmeans.predict(X)

centroids = pd.DataFrame(kmeans.cluster_centers_, columns=X.columns)
centroids

In [None]:
scaler = sklearn.preprocessing.MinMaxScaler()
scaler.fit(test[['bathrooms', 'bedrooms', 'square_feet']])
# inserts the 4 selected features into the scaler

test[['scalbathrooms', 'scalbedrooms', 'scalsquare_feet']] = scaler.transform(test[['bathrooms', 'bedrooms', 'square_feet']])

In [None]:
X = test[['scalbathrooms', 'scalbedrooms', 'scalsquare_feet']]

kmeans = KMeans(n_clusters=7)
kmeans.fit(X)

kmeans.predict(X)

test['cluster'] = kmeans.predict(X)

centroids = pd.DataFrame(kmeans.cluster_centers_, columns=X.columns)
centroids

In [None]:
scaler = sklearn.preprocessing.MinMaxScaler()
scaler.fit(train[['year_built', 'square_feet']])
# inserts the 4 selected features into the scaler

train[['scalyear', 'scalsquare_feet']] = scaler.transform(train[['year_built', 'square_feet']])

In [None]:
X = train[['scalyear', 'scalsquare_feet']]

kmeans = KMeans(n_clusters=7)
kmeans.fit(X)

kmeans.predict(X)

train['cluster2'] = kmeans.predict(X)

centroids = pd.DataFrame(kmeans.cluster_centers_, columns=X.columns)
centroids

In [None]:
scaler = sklearn.preprocessing.MinMaxScaler()
scaler.fit(validate[['year_built', 'square_feet']])
# inserts the 4 selected features into the scaler

validate[['scalyear', 'scalsquare_feet']] = scaler.transform(validate[['year_built', 'square_feet']])

In [None]:
X = validate[['scalyear', 'scalsquare_feet']]

kmeans = KMeans(n_clusters=7)
kmeans.fit(X)

kmeans.predict(X)

validate['cluster2'] = kmeans.predict(X)

centroids = pd.DataFrame(kmeans.cluster_centers_, columns=X.columns)
centroids

In [None]:
scaler = sklearn.preprocessing.MinMaxScaler()
scaler.fit(test[['year_built', 'square_feet']])
# inserts the 4 selected features into the scaler

test[['scalyear', 'scalsquare_feet']] = scaler.transform(test[['year_built', 'square_feet']])

In [None]:
X = test[['scalyear', 'scalsquare_feet']]

kmeans = KMeans(n_clusters=7)
kmeans.fit(X)

kmeans.predict(X)

test['cluster2'] = kmeans.predict(X)

centroids = pd.DataFrame(kmeans.cluster_centers_, columns=X.columns)
centroids

In [None]:
plt.figure(figsize=(9, 6))
pd.Series({k: KMeans(k).fit(X).inertia_ for k in range(2, 12)}).plot(marker='x')
plt.xticks(range(2, 12))
plt.xlabel('k')
plt.ylabel('inertia')
plt.title('Change in inertia as k increases')

In [7]:
kmeans = KMeans(
    init="random",
    n_clusters=30)

In [8]:
coords = train[['latitude', 'longitude']].to_numpy()

In [9]:
train

Unnamed: 0,parcelid,bathrooms,bedrooms,square_feet,fips,garages,garage_size,lot_size,has_pool,year_built,log_error,transaction_date,county,latitude,longitude
50711,11445451.0,1.0,3.0,938.0,6037.0,0.0,0.0,8084.0,False,1949.0,-0.010825,2017-09-08,los_angeles,33.862078,-118.347547
22675,12917947.0,1.0,3.0,1122.0,6037.0,0.0,0.0,5977.0,False,1952.0,0.025276,2017-05-05,los_angeles,34.096247,-117.924726
37635,11944898.0,1.0,1.0,800.0,6037.0,0.0,0.0,2172.0,False,1921.0,-0.119306,2017-07-10,los_angeles,34.093347,-118.276062
17884,12565279.0,1.0,2.0,946.0,6037.0,0.0,0.0,4036.0,False,1948.0,-0.032992,2017-04-14,los_angeles,33.872200,-118.195445
46935,13879501.0,1.0,3.0,1052.0,6059.0,1.0,360.0,7500.0,False,1951.0,0.041223,2017-08-23,orange,33.839456,-117.905136
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6255,11402602.0,1.0,2.0,884.0,6037.0,0.0,0.0,3958.0,False,1926.0,0.303324,2017-02-10,los_angeles,33.957517,-118.367536
18230,12683288.0,1.0,2.0,884.0,6037.0,0.0,0.0,5390.0,False,1951.0,0.003837,2017-04-17,los_angeles,33.801235,-118.344439
14384,12720970.0,1.0,3.0,1047.0,6037.0,0.0,0.0,5235.0,False,1955.0,0.058952,2017-03-29,los_angeles,33.933328,-118.082629
30518,10986123.0,2.0,3.0,920.0,6037.0,0.0,0.0,3553.0,False,1958.0,-0.036523,2017-06-09,los_angeles,34.265456,-118.417831


In [10]:
coords = train[['latitude', 'longitude']].to_numpy()

kmeans.fit(coords)

kmeans.predict(coords)

train['cluster3'] = kmeans.predict(coords)


In [11]:
coords = validate[['latitude', 'longitude']].to_numpy()

kmeans.fit(coords)

kmeans.predict(coords)

validate['cluster3'] = kmeans.predict(coords)


In [12]:
coords = test[['latitude', 'longitude']].to_numpy()

kmeans.fit(coords)

kmeans.predict(coords)

test['cluster3'] = kmeans.predict(coords)


In [13]:
train

Unnamed: 0,parcelid,bathrooms,bedrooms,square_feet,fips,garages,garage_size,lot_size,has_pool,year_built,log_error,transaction_date,county,latitude,longitude,cluster3
50711,11445451.0,1.0,3.0,938.0,6037.0,0.0,0.0,8084.0,False,1949.0,-0.010825,2017-09-08,los_angeles,33.862078,-118.347547,5
22675,12917947.0,1.0,3.0,1122.0,6037.0,0.0,0.0,5977.0,False,1952.0,0.025276,2017-05-05,los_angeles,34.096247,-117.924726,26
37635,11944898.0,1.0,1.0,800.0,6037.0,0.0,0.0,2172.0,False,1921.0,-0.119306,2017-07-10,los_angeles,34.093347,-118.276062,20
17884,12565279.0,1.0,2.0,946.0,6037.0,0.0,0.0,4036.0,False,1948.0,-0.032992,2017-04-14,los_angeles,33.872200,-118.195445,8
46935,13879501.0,1.0,3.0,1052.0,6059.0,1.0,360.0,7500.0,False,1951.0,0.041223,2017-08-23,orange,33.839456,-117.905136,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6255,11402602.0,1.0,2.0,884.0,6037.0,0.0,0.0,3958.0,False,1926.0,0.303324,2017-02-10,los_angeles,33.957517,-118.367536,1
18230,12683288.0,1.0,2.0,884.0,6037.0,0.0,0.0,5390.0,False,1951.0,0.003837,2017-04-17,los_angeles,33.801235,-118.344439,17
14384,12720970.0,1.0,3.0,1047.0,6037.0,0.0,0.0,5235.0,False,1955.0,0.058952,2017-03-29,los_angeles,33.933328,-118.082629,23
30518,10986123.0,2.0,3.0,920.0,6037.0,0.0,0.0,3553.0,False,1958.0,-0.036523,2017-06-09,los_angeles,34.265456,-118.417831,29


In [None]:
xsmall_train = train.copy().where(train.square_feet < 1184)
xsmall_train = xsmall_train.dropna()

xsmall_validate = validate.copy().where(validate.square_feet < 1184)
xsmall_validate = xsmall_validate.dropna()

xsmall_test = test.copy().where(test.square_feet < 1184)
xsmall_test = xsmall_test.dropna()

small = train.copy().where((train.square_feet >= 1184) & (train.square_feet < 1423))
small = small.dropna()

med = train.copy().where((train.square_feet >= 1423) & (train.square_feet < 1988))
med = med.dropna()

large = train.copy().where((train.square_feet >= 1988))
large = large.dropna()

In [14]:
# dummy1 = pd.get_dummies(train['cluster'])
# dummy2 = pd.get_dummies(train['cluster2'])
dummy3 = pd.get_dummies(train['cluster3'])

# dummy4 = pd.get_dummies(validate['cluster'])
# dummy5 = pd.get_dummies(validate['cluster2'])
dummy6 = pd.get_dummies(validate['cluster3'])

# dummy7 = pd.get_dummies(test['cluster'])
# dummy8 = pd.get_dummies(test['cluster2'])
dummy9 = pd.get_dummies(test['cluster3'])


train = pd.concat([train, dummy3], axis=1)
validate = pd.concat([validate, dummy6], axis=1)
test = pd.concat([test, dummy9], axis=1)

In [16]:
validate.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1637 entries, 27275 to 41258
Data columns (total 46 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   parcelid          1637 non-null   float64       
 1   bathrooms         1637 non-null   float64       
 2   bedrooms          1637 non-null   float64       
 3   square_feet       1637 non-null   float64       
 4   fips              1637 non-null   float64       
 5   garages           1637 non-null   float64       
 6   garage_size       1637 non-null   float64       
 7   lot_size          1637 non-null   float64       
 8   has_pool          1637 non-null   object        
 9   year_built        1637 non-null   float64       
 10  log_error         1637 non-null   float64       
 11  transaction_date  1637 non-null   datetime64[ns]
 12  county            1637 non-null   object        
 13  latitude          1637 non-null   float64       
 14  longitude         1

In [19]:
X_train = train[['cluster3', 'bedrooms', 'bathrooms', 'year_built']]
# creates dataframe that drops all column except the selected features for modeling
y_train = train[['log_error']]
# creates dataframe of target variable (y) only

X_validate = validate[['cluster3', 'bedrooms', 'bathrooms', 'year_built']]
y_validate = validate[['log_error']]
# repeat above for validate set

X_test = test[['cluster3', 'bedrooms', 'bathrooms', 'year_built']]
y_test = test[['log_error']]

In [20]:
X_validate.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1637 entries, 27275 to 41258
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   cluster3    1637 non-null   int32  
 1   bedrooms    1637 non-null   float64
 2   bathrooms   1637 non-null   float64
 3   year_built  1637 non-null   float64
dtypes: float64(3), int32(1)
memory usage: 57.6 KB


In [21]:
scaler = sklearn.preprocessing.MinMaxScaler()
# employs the min max scaler
scaler.fit(X_train[['bedrooms', 'bathrooms', 'year_built']])
# inserts the 4 selected features into the scaler

MinMaxScaler()

In [None]:
X_validate

In [22]:
v = scaler.transform(X_train[['bedrooms', 'bathrooms', 'year_built']]) 
v1 = scaler.transform(X_validate[['bedrooms', 'bathrooms', 'year_built']]) 
v2 = scaler.transform(X_test[['bedrooms', 'bathrooms', 'year_built']]) 

In [23]:
v = pd.DataFrame(data = v)
v1 = pd.DataFrame(data = v1)
v2 = pd.DataFrame(data = v2)

In [24]:
v.index = X_train.index
v1.index = X_validate.index
v2.index = X_test.index

In [25]:
X_train = pd.concat([X_train, v], axis=1)
X_validate = pd.concat([X_validate, v1], axis=1)
X_test = pd.concat([X_test, v2], axis=1)

In [26]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3818 entries, 50711 to 45839
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   cluster3    3818 non-null   int32  
 1   bedrooms    3818 non-null   float64
 2   bathrooms   3818 non-null   float64
 3   year_built  3818 non-null   float64
 4   0           3818 non-null   float64
 5   1           3818 non-null   float64
 6   2           3818 non-null   float64
dtypes: float64(6), int32(1)
memory usage: 223.7 KB


In [27]:
X_train = X_train.drop(columns=['bedrooms', 'bathrooms', 'year_built'])
# creates dataframe that drops all column except the selected features for modeling
y_train = y_train[['log_error']]
# creates dataframe of target variable (y) only

X_validate = X_validate.drop(columns=['bedrooms', 'bathrooms', 'year_built'])
y_validate = y_validate[['log_error']]
# repeat above for validate set

X_test = X_test.drop(columns=['bedrooms', 'bathrooms', 'year_built'])
y_test = y_test[['log_error']]

In [None]:
#X_validate[['scaledsize', 'scaledyear']] = scaler.transform(X_validate[['square_feet', 'year_built']])

In [None]:
# X_validate[['scaledsize', 'scaledyear']] = scaler.transform(X_validate[['square_feet', 'year_built']])
# X_test[['scaledsize', 'scaledyear']] = data=scaler.transform(X_test[['square_feet', 'year_built']]) 

In [None]:
# X_train

In [None]:
# X_train = X_train.drop(columns = ['square_feet', 'year_built'])
# X_validate = X_validate.drop(columns = ['square_feet', 'year_built'])
# X_test = X_test.drop(columns = ['square_feet','year_built'])

In [28]:
y_train['pred_mean'] = y_train.log_error.mean()
y_validate['pred_mean'] = y_validate.log_error.mean()
# calculates mean prior to computing rmse

rmse_train = mean_squared_error(y_train.log_error, y_train.pred_mean)**(1/2)
rmse_validate = mean_squared_error(y_validate.log_error, y_validate.pred_mean)**(1/2)
# computes baseline RMSE for train and validate sets (square root of MSE)

print("Baseline RMSE\nTrain/In-Sample: ", (rmse_train)), 
print("Baseline RMSE\nValidate/Out-of-Sample: ", (rmse_validate))

Baseline RMSE
Train/In-Sample:  0.09416681190758816
Baseline RMSE
Validate/Out-of-Sample:  0.09990204714131118


In [None]:
#lm = LinearRegression(normalize=True)
# create the model

In [None]:
#lm.fit(X_train, y_train.log_error)
# fit the model to scaled training data

In [None]:
#y_train['value_predict_lm'] = lm.predict(X_train)
# computes model predictions

In [None]:
#rmse_train = mean_squared_error(y_train.log_error, y_train.value_predict_lm)**(1/2)

In [None]:
#X_train

In [None]:
#_validate['value_predict_lm'] = lm.predict(X_validate)

In [None]:
X_train.info()

lm = LinearRegression(normalize=True)
# create the model

lm.fit(X_train, y_train.log_error)
# fit the model to scaled training data

y_train['value_predict_lm'] = lm.predict(X_train)
# computes model predictions

rmse_train = mean_squared_error(y_train.log_error, y_train.value_predict_lm)**(1/2)
# computes model rmse

In [None]:
X_validate.info()

In [29]:
lm = LinearRegression(normalize=True)
# create the model

lm.fit(X_train, y_train.log_error)
# fit the model to scaled training data

y_train['value_predict_lm'] = lm.predict(X_train)
# computes model predictions

rmse_train = mean_squared_error(y_train.log_error, y_train.value_predict_lm)**(1/2)
# computes model rmse

y_validate['value_predict_lm'] = lm.predict(X_validate)
rmse_validate = mean_squared_error(y_validate.log_error, y_validate.value_predict_lm)**(1/2)
# comutes predictions and rmse with validate data

printmd("**OLS Linear Regression Performance**")
print("---------------------------------------")
print("RMSE for OLS using LinearRegression\nTraining/In-Sample: ", (rmse_train))
print("---------------------------------------")
print("RMSE for OLS using LinearRegression\nValidation/Out-of-Sample: ", (rmse_validate))

**OLS Linear Regression Performance**

---------------------------------------
RMSE for OLS using LinearRegression
Training/In-Sample:  0.09407122204925293
---------------------------------------
RMSE for OLS using LinearRegression
Validation/Out-of-Sample:  0.10008368525781866


In [None]:
y_train['value_predict_lm'].mean() < y_train['log_error'].mean()

In [None]:
y_train['value_predict_lm'].mean()

In [None]:
y_train['log_error'].mean()

In [None]:
# small = train.copy().where((train.square_feet >= 1184) & (train.square_feet < 1393))
# small = small.dropna()

In [None]:
# xlarge = train.copy().where(train.square_feet >= 4000)
# xlarge = large.dropna()

In [None]:
# xlarge = train.copy().where(train.square_feet >= 2000)
# xlarge = large.dropna()