![banner](./data/home-sales-shutterstock-295804091-1068x601.jpg)

# King County Home Sales
**Authors:** [Jerry Vasquez](https://www.linkedin.com/in/jerry-vasquez-832b71224/), [Paul Lindquist](https://www.linkedin.com/in/paul-lindquist/), [Vu Brown](https://www.linkedin.com/in/austin-brown-b5211384/)

## Overview
***
This is our overview

## Business Problem
***
This is our business problem

## Data
***
This is where the data is sourced from with focuses:

## Methods
***
Descriptive analysis, etc.

## Exploratory Data Analysis
***
Notes on EDA

In [None]:
# Import libraries
from collections import Counter
import folium
import itertools
from math import sqrt
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
import time
import scipy.stats as stats
import seaborn as sns
sns.set_theme(palette='magma_r')
from sklearn.dummy import DummyRegressor
from sklearn.feature_selection import RFECV
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split, cross_validate, ShuffleSplit
from sklearn.preprocessing import PolynomialFeatures, StandardScaler, LabelEncoder, MinMaxScaler
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor

pd.set_option('display.max_rows', 500) # Allows Jupyter Notebook to expand how much data is shown.

In [None]:
# Load DataFrame
df = pd.read_csv('./data/kc_house_data.csv')

In [None]:
df.head()

In [None]:
df.shape

In [None]:
df.columns

In [None]:
df.info()

In [None]:
df.nunique(axis=0)

In [None]:
df.describe().apply(lambda s: s.apply(lambda x: format(x, 'f')))

In [None]:
# Correlate numeric/boolean columns only: pandas >= 2.0 no longer drops
# object columns silently in DataFrame.corr() and raises instead.
df.select_dtypes(include=['number', 'bool']).corr()

In [None]:
# Features ranked by absolute correlation with price.
# Only numeric/boolean columns are correlated, because DataFrame.corr()
# raises on object columns in pandas >= 2.0 instead of skipping them.
price_corr = (
    df.select_dtypes(include=['number', 'bool'])
      .corr()['price']
      .abs()
      .sort_values(ascending=False)
)
price_corr

In [None]:
plt.figure(figsize=(12,8))
sns.scatterplot(data=df, x='long', y='lat', hue='price', palette='magma_r');

## Data Cleaning, Preparation, Feature Engineering for Inferential Model
***

In [None]:
infer_df = df.copy()

### Identifying and dropping duplicates

In [None]:
# Create a function to identify duplicates
def determine_dupes(series):
    """Report how many values of *series* occur more than once.

    Prints the number of distinct values that are duplicated and the total
    number of rows those duplicated values account for.

    Parameters
    ----------
    series : pandas.Series
        The series to inspect.

    Returns
    -------
    pandas.Series
        The result of ``series.value_counts()`` (value -> occurrence count).
    """
    series_vcs = series.value_counts()
    # Select the duplicated values with a mask instead of relying on
    # value_counts() happening to be sorted in descending order.
    dupes = series_vcs[series_vcs > 1]
    print("Amount of unique duplicates: " + str(len(dupes)))
    print("Total amount of duplicates: " + str(dupes.sum()))

    return series_vcs

In [None]:
# Run duplicates function for 'id' series
determine_dupes(infer_df.id)

In [None]:
# display(infer_df.loc[infer_df.id == 795000620])
# display(infer_df[infer_df.duplicated(subset=['id'], keep=False)].head(20))
# display(infer_df[infer_df.duplicated(subset=['id'], keep='first')].head(10))

In [None]:
# Drop duplicates found within 'id' series
infer_df = infer_df.drop_duplicates(subset=['id'], keep='last')
infer_df.info()

In [None]:
# # Consider dropping duplicates based upon latitude and longitude
# infer_df[infer_df.duplicated(subset=['lat','long'], keep=False)].sort_values('lat')
# infer_df = infer_df.drop_duplicates(subset=['lat', 'long'], keep='last')

### Make features more workable by handling missing/placeholder values and converting object series to integers

In [None]:
# Fill the gaps in each series, then cast the numeric ones to int64:
#   yr_renovated   NaN -> 0
#   view           NaN -> 'NONE'
#   waterfront     NaN -> 'NO'
#   sqft_basement  '?' -> 0.0
infer_df['yr_renovated'] = infer_df['yr_renovated'].fillna(0).astype('int64')

infer_df['view'] = infer_df['view'].fillna('NONE')

infer_df['waterfront'] = infer_df['waterfront'].fillna('NO')

# sqft_basement is cast via float64 first, then truncated to int64
infer_df.loc[infer_df['sqft_basement'] == '?', 'sqft_basement'] = 0.0
infer_df['sqft_basement'] = infer_df['sqft_basement'].astype('float64').astype('int64')

In [None]:
infer_df.info()

Grade

In [None]:
infer_df.grade.value_counts()

In [None]:
# Change 'grade' series objects to corresponding integers
infer_df.grade = pd.to_numeric(infer_df.grade.map(lambda x: x.split()[0]))
infer_df.grade.value_counts()

Condition

In [None]:
infer_df.condition.value_counts()

In [None]:
# Change 'condition' series objects to corresponding integers
# Integer values from https://info.kingcounty.gov/assessor/esales/Glossary.aspx
# The result is assigned back rather than using replace(..., inplace=True) on
# infer_df['condition']: that chained in-place call operates on a temporary
# Series under pandas Copy-on-Write and would leave infer_df unchanged.
condition_codes = {'Poor': 1, 'Fair': 2, 'Average': 3, 'Good': 4, 'Very Good': 5}
infer_df['condition'] = infer_df['condition'].replace(condition_codes)
infer_df.condition.value_counts()

Waterfront

In [None]:
infer_df.waterfront.value_counts()

In [None]:
# Change 'waterfront' series objects to integers
lb_make = LabelEncoder()
infer_df['waterfront'] = lb_make.fit_transform(infer_df['waterfront'])
infer_df.waterfront.value_counts()
# 0:NO, 1:YES

View

In [None]:
infer_df.view.value_counts()

In [None]:
# Change 'view' series objects to corresponding integers
# Integer values mirrored from 'condition' series
# NOTE(review): no label maps to 1, so the scale jumps from 0 to 2 -- confirm
# this gap is intended.
# The result is assigned back rather than using replace(..., inplace=True) on
# infer_df['view']: that chained in-place call operates on a temporary
# Series under pandas Copy-on-Write and would leave infer_df unchanged.
view_codes = {'NONE': 0, 'FAIR': 2, 'AVERAGE': 3, 'GOOD': 4, 'EXCELLENT': 5}
infer_df['view'] = infer_df['view'].replace(view_codes)
infer_df.view.value_counts()

Date

In [None]:
# Change 'date' series to datetime data type (may not be needed)
infer_df['date'] = pd.to_datetime(infer_df['date'])

In [None]:
# info() prints its report and returns None, so call it directly instead of
# wrapping it in display(), which would also render a stray "None"
infer_df.info()
display(infer_df.head())

### Identify and drop outliers

Price outliers

In [None]:
infer_df.price.hist(bins=100);

In [None]:
print(infer_df.price.mean()+3*infer_df.price.std())

In [None]:
# Keep only sales priced below mean + 3 standard deviations
price_cutoff = infer_df.price.mean() + 3 * infer_df.price.std()
infer_df = infer_df[infer_df.price < price_cutoff]
# info() prints its own report and returns None; display() would add a stray "None"
infer_df.info()

Bedroom outliers

In [None]:
# infer_df.loc[infer_df.bedrooms == 10].sort_values('sqft_living', ascending=False).head(20)

# # 33 bedrooms for a 1620 sqft house is a mistake. 
# # 11 bedrooms for a 3000 sqft house is also a mistake.
# # We'll drop those records
# infer_df.drop(infer_df.loc[infer_df['bedrooms']==33].index, inplace=True)
# infer_df.drop(infer_df.loc[infer_df['bedrooms']==11].index, inplace=True)
# display(infer_df.info())

In [None]:
# Examine number of bedrooms for outliers
infer_df.bedrooms.value_counts()

In [None]:
print(infer_df.bedrooms.mean()-3*infer_df.bedrooms.std())
print(infer_df.bedrooms.mean()+3*infer_df.bedrooms.std())

In [None]:
# Keep only homes whose bedroom count is below mean + 3 standard deviations
bedrooms_cutoff = infer_df.bedrooms.mean() + 3 * infer_df.bedrooms.std()
infer_df = infer_df[infer_df.bedrooms < bedrooms_cutoff]
# info() prints its own report and returns None; display() would add a stray "None"
infer_df.info()
infer_df.bedrooms.value_counts()

Bathroom outliers

In [None]:
infer_df.bathrooms.value_counts()
# infer_df.loc[infer_df.bathrooms == 0.5]

In [None]:
print(infer_df.bathrooms.mean()-3*infer_df.bathrooms.std())
print(infer_df.bathrooms.mean()+3*infer_df.bathrooms.std())

In [None]:
# Keep only homes whose bathroom count is below mean + 3 standard deviations
bathrooms_cutoff = infer_df.bathrooms.mean() + 3 * infer_df.bathrooms.std()
infer_df = infer_df[infer_df.bathrooms < bathrooms_cutoff]
# info() prints its own report and returns None; display() would add a stray "None"
infer_df.info()
infer_df.bathrooms.value_counts()

Floors outliers

In [None]:
infer_df.floors.value_counts()

In [None]:
print(infer_df.floors.mean()-3*infer_df.floors.std())
print(infer_df.floors.mean()+3*infer_df.floors.std())

In [None]:
# Keep only homes whose floor count is below mean + 3 standard deviations
floors_cutoff = infer_df.floors.mean() + 3 * infer_df.floors.std()
infer_df = infer_df[infer_df.floors < floors_cutoff]
# info() prints its own report and returns None; display() would add a stray "None"
infer_df.info()
infer_df.floors.value_counts()

Sq. Ft. Living outliers

In [None]:
fig, axs = plt.subplots(figsize=(10,4))
axs.scatter(infer_df.sqft_living, infer_df.price)
axs.axvline(infer_df.sqft_living.mean()+3.5*infer_df.sqft_living.std())
axs.set_title('sqft_living');

In [None]:
# Print both bounds at 3.5 standard deviations, the multiplier used for the
# sqft_living cutoff
print(infer_df.sqft_living.mean()-3.5*infer_df.sqft_living.std())
print(infer_df.sqft_living.mean()+3.5*infer_df.sqft_living.std())

In [None]:
# Keep only homes whose living area is below mean + 3.5 standard deviations
sqft_living_cutoff = infer_df.sqft_living.mean() + 3.5 * infer_df.sqft_living.std()
infer_df = infer_df[infer_df.sqft_living < sqft_living_cutoff]
# info() prints its own report and returns None; display() would add a stray "None"
infer_df.info()

In [None]:
fig, axs = plt.subplots(figsize=(10,4))
axs.scatter(infer_df.sqft_living, infer_df.price)
axs.set_title('sqft_living');

Sq. Ft. Lot outliers

In [None]:
fig, axs = plt.subplots(figsize=(20,10))
axs.scatter(infer_df.sqft_lot, infer_df.price)
axs.axvline(infer_df.sqft_lot.mean()+3*infer_df.sqft_lot.std())
axs.set_title('sqft_lot');

In [None]:
# Print both bounds at 3 standard deviations, the multiplier used for the
# sqft_lot cutoff
print(infer_df.sqft_lot.mean()-3*infer_df.sqft_lot.std())
print(infer_df.sqft_lot.mean()+3*infer_df.sqft_lot.std())

In [None]:
# Keep only homes whose lot size is below mean + 3 standard deviations
sqft_lot_cutoff = infer_df.sqft_lot.mean() + 3 * infer_df.sqft_lot.std()
infer_df = infer_df[infer_df.sqft_lot < sqft_lot_cutoff]
# info() prints its own report and returns None; display() would add a stray "None"
infer_df.info()

In [None]:
fig, axs = plt.subplots(figsize=(20,10))
axs.scatter(infer_df.sqft_lot, infer_df.price)
axs.set_title('sqft_lot');

Sq. Ft. Above outliers

In [None]:
fig, axs = plt.subplots(figsize=(10,4))
axs.scatter(infer_df.sqft_above, infer_df.price)
axs.axvline(infer_df.sqft_above.mean()+3*infer_df.sqft_above.std())
axs.set_title('sqft_above');

In [None]:
print(infer_df.sqft_above.mean()-3*infer_df.sqft_above.std())
print(infer_df.sqft_above.mean()+3*infer_df.sqft_above.std())

In [None]:
# Keep only homes whose above-ground area is below mean + 3 standard deviations
sqft_above_cutoff = infer_df.sqft_above.mean() + 3 * infer_df.sqft_above.std()
infer_df = infer_df[infer_df.sqft_above < sqft_above_cutoff]
# info() prints its own report and returns None; display() would add a stray "None"
infer_df.info()
infer_df.sqft_above.value_counts()

In [None]:
fig, axs = plt.subplots(figsize=(10,4))
axs.scatter(infer_df.sqft_above, infer_df.price)
axs.set_title('sqft_above');

Sq. Ft. Basement outliers

In [None]:
fig, axs = plt.subplots(figsize=(10,4))
axs.scatter(infer_df.sqft_basement, infer_df.price)
axs.axvline(infer_df.sqft_basement.mean()+3*infer_df.sqft_basement.std())
axs.set_title('sqft_basement');

In [None]:
print(infer_df.sqft_basement.mean()-3*infer_df.sqft_basement.std())
print(infer_df.sqft_basement.mean()+3*infer_df.sqft_basement.std())

In [None]:
# Keep only homes whose basement area is below mean + 3 standard deviations
sqft_basement_cutoff = infer_df.sqft_basement.mean() + 3 * infer_df.sqft_basement.std()
infer_df = infer_df[infer_df.sqft_basement < sqft_basement_cutoff]
# info() prints its own report and returns None; display() would add a stray "None"
infer_df.info()

In [None]:
fig, axs = plt.subplots(figsize=(10,4))
axs.scatter(infer_df.sqft_basement, infer_df.price)
axs.set_title('sqft_basement');

# Inferential Model

## Modeling
***
Notes on models

In [None]:
# Create model training and testing data

# Trial 1
# X = infer_df.drop(columns=['price', 'id', 'date', 'condition', 'sqft_above',
#                            'sqft_basement', 'yr_built', 'yr_renovated', 'zipcode',
#                            'lat', 'long', 'sqft_living15', 'sqft_lot15'])

# Trial 2 (Difference from Trial 1--> Dropped grade)
# X = infer_df.drop(columns=['price', 'id', 'date', 'condition', 'grade', 'sqft_above',
#                            'sqft_basement', 'yr_built', 'yr_renovated', 'zipcode', 
#                            'lat', 'long', 'sqft_living15', 'sqft_lot15'])

# Trial 3 (Difference from Trial 2--> Dropped bedrooms and bathrooms)
# X = infer_df.drop(columns=['price', 'id', 'date', 'bedrooms', 'bathrooms', 'condition',
#                            'grade', 'sqft_above', 'sqft_basement', 'yr_built', 'yr_renovated',
#                            'zipcode', 'lat', 'long', 'sqft_living15', 'sqft_lot15'])

# Trial 4 (Difference from Trial 3--> Dropped waterfront)
# X = infer_df.drop(columns=['price', 'id', 'date', 'bedrooms', 'bathrooms', 'waterfront',
#                            'condition', 'grade', 'sqft_above', 'sqft_basement', 'yr_built',
#                            'yr_renovated', 'zipcode', 'lat', 'long', 'sqft_living15', 'sqft_lot15'])

# Trial 5 (Difference from Trial 1--> Dropped waterfront)
# X = infer_df.drop(columns=['price', 'id', 'date', 'waterfront', 'condition', 'sqft_above',
#                            'sqft_basement', 'yr_built', 'yr_renovated', 'zipcode', 'lat', 'long',
#                            'sqft_living15', 'sqft_lot15'])

# Trial 6 (Difference from Trial 1--> Dropped waterfront, view, grade; Added sqft_above, sqft_basement)
# X = infer_df.drop(columns=['price', 'id', 'date', 'waterfront', 'view', 'condition', 'grade',
#                            'yr_built', 'yr_renovated', 'zipcode', 'lat', 'long', 'sqft_living15', 'sqft_lot15'])

# Trial 7 (Difference from Trial 1--> Added condition, sqft_above, sqft_basement)
X = infer_df.drop(columns=['price', 'id', 'date', 'yr_built', 'yr_renovated',
                           'zipcode', 'lat', 'long', 'sqft_living15', 'sqft_lot15'])

y = infer_df.price

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [None]:
# Show feature correlation of training data
train_data = pd.concat([X_train, y_train], axis=1)
corr = train_data.corr()

fig, ax = plt.subplots(figsize=(12,12))
sns.heatmap(data=corr, mask=np.triu(np.ones_like(corr, dtype=bool)),
            ax=ax,annot=True, cbar_kws={"label": "Correlation",
                                        "orientation": "horizontal",
                                        "pad": .2, "extend": "both"});

# Trial 1 - Multicollinearity Concerns:
# sqft_living & bathrooms
# sqft_living & grade

# Trial 2 - Multicollinearity Concerns:
# sqft_living & bathrooms

# Trials 3, 4 - Multicollinearity Concerns:
# sqft_living & price

# Trial 5 - Multicollinearity Concerns:
# sqft_living & bathrooms
# sqft_living & grade

# Trial 6 - Multicollinearity Concerns:
# sqft_living & bathrooms
# sqft_living & sqft_above

# Trial 7 - Multicollinearity Concerns:
# sqft_living & bathrooms
# sqft_living & grade
# sqft_living & sqft_above
# sqft_above & grade

In [None]:
# Show scatter plots of training data compared to target

# Trials 1, 2, 5, 6
# fig, axes = plt.subplots(ncols=3, nrows=3, figsize=(16, 10))

# Trials 3, 4
# fig, axes = plt.subplots(ncols=3, nrows=2, figsize=(16, 7))

# Trial 7 
fig, axes = plt.subplots(ncols=3, nrows=4, figsize=(16, 10))

fig.set_tight_layout(True)

# Fill the grid row by row (three panels per row), one feature per panel,
# each plotted against the training target
for position, feature in enumerate(X_train.columns):
    grid_row, grid_col = divmod(position, 3)
    panel = axes[grid_row][grid_col]
    panel.scatter(X_train[feature], y_train)
    panel.set_xlabel(feature)
    panel.set_ylabel('price')

    
# Trial 1
# fig.delaxes(axes[2][2])

# Trials 2, 5, 6
# fig.delaxes(axes[2][1])
# fig.delaxes(axes[2][2])

# Trial 3
# fig.delaxes(axes[1][2])

# Trial 4
# fig.delaxes(axes[1][1])
# fig.delaxes(axes[1][2])

# Trial 7
fig.delaxes(axes[3][2])

In [None]:
# Create baseline model with DummyRegressor
baseline = DummyRegressor()
baseline.fit(X_train, y_train)
baseline.score(X_test, y_test)

In [None]:
# Run first model with the single most highly correlated feature (chosen per trial below)

# Trials 1, 5, 7
most_correlated_feature = 'grade'

# Trial 2, 3, 4, 6
# most_correlated_feature = 'sqft_living'

first_model = LinearRegression()

splitter = ShuffleSplit(n_splits=3, test_size=0.25, random_state=0)

first_scores = cross_validate(estimator=first_model,
                                 X=X_train[[most_correlated_feature]],
                                 y=y_train, return_train_score=True,
                                 cv=splitter)

print('First Model')
print('Train score: ', first_scores['train_score'].mean())
print('Validation score: ', first_scores['test_score'].mean())

# Trials 1, 5, 7:
# First Model
# Train score:  0.4026275172522893
# Validation score:  0.39002239307457764

# Trials 2, 3, 4, 6:
# First Model
# Train score:  0.3824853370125143
# Validation score:  0.37867372602330196

In [None]:
# Examine OLS summary table to examine coefficients of first model
sm.OLS(y_train, sm.add_constant(X_train[[most_correlated_feature]])).fit().summary()

In [None]:
# Run second model with additional, correlated features

# Trial 1
# select_features = X_train[['bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot',
#                            'floors', 'waterfront', 'view', 'grade']].copy()

# Trial 2
# select_features = X_train[['bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot',
#                            'floors', 'waterfront', 'view']].copy()

# Trial 3
# select_features = X_train[['sqft_living', 'sqft_lot',
#                            'floors', 'waterfront', 'view']].copy()

# Trial 4
# select_features = X_train[['sqft_living', 'sqft_lot',
#                            'floors', 'view']].copy()

# Trial 5
# select_features = X_train[['bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot',
#                            'floors', 'view', 'grade']].copy()


# Trial 6
# select_features = X_train[['bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot',
#                            'floors', 'sqft_above', 'sqft_basement']].copy()

# Trial 7
select_features = X_train[['bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot',
                           'floors', 'waterfront', 'view', 'condition', 'grade',
                           'sqft_above', 'sqft_basement']].copy()

second_model = LinearRegression()

second_model_scores = cross_validate(estimator=second_model,
                                     X=select_features, y=y_train,
                                     return_train_score=True, cv=splitter)

print('Second Model')
print('Train score: ', second_model_scores['train_score'].mean())
print('Validation score: ', second_model_scores['test_score'].mean())
print('First Model')
print('Train score: ', first_scores['train_score'].mean())
print('Validation score: ', first_scores['test_score'].mean())

# Trial 1:
# Second Model
# Train score:  0.496510401293593
# Validation score:  0.49119845883778046
# First Model
# Train score:  0.4026275172522893
# Validation score:  0.39002239307457764

# Trial 2:
# Second Model
# Train score:  0.4288909091195216
# Validation score:  0.430601865447946
# First Model
# Train score:  0.3824853370125143
# Validation score:  0.37867372602330196

# Trial 3:
# Second Model
# Train score:  0.42285711826946865
# Validation score:  0.42393550511133987
# First Model
# Train score:  0.3824853370125143
# Validation score:  0.37867372602330196

# Trial 4:
# Second Model
# Train score:  0.42142246455384114
# Validation score:  0.4216979102573514
# First Model
# Train score:  0.3824853370125143
# Validation score:  0.37867372602330196

# Trial 5:
# Second Model
# Train score:  0.4948170285278602
# Validation score:  0.4885522178278873
# First Model
# Train score:  0.4026275172522893
# Validation score:  0.39002239307457764

# Trial 6:
# Second Model
# Train score:  0.40003825863753684
# Validation score:  0.3941216058870845
# First Model
# Train score:  0.3824853370125143
# Validation score:  0.37867372602330196

# Trial 7:
# Second Model
# Train score:  0.5166094744927036
# Validation score:  0.5121567236780332
# First Model
# Train score:  0.4026275172522893
# Validation score:  0.39002239307457764

In [None]:
# Examine OLS summary table to examine coefficients of second model
sm.OLS(y_train, sm.add_constant(select_features)).fit().summary()

In [None]:
# Run third model with features with high p-value removed

# Remove features due to high p-value and possible multicollinearity
# Trials 1, 3, 4, 5
# N/A

# Trial #2
# less_features = select_features.drop(columns=['bathrooms']).copy()

# Trial #6
# less_features = select_features.drop(columns=['bathrooms', 'sqft_above', 'sqft_basement']).copy()

# Trial #7a
# less_features = select_features.drop(columns=['floors', 'sqft_above', 'sqft_basement']).copy()

# Trial #7b
less_features = select_features.drop(columns=['floors', 'waterfront', 'sqft_above', 'sqft_basement']).copy()

third_model = LinearRegression()

third_model_scores = cross_validate(estimator=third_model,
                                     X=less_features, y=y_train,
                                     return_train_score=True, cv=splitter)

print('Third Model')
print('Train score: ', third_model_scores['train_score'].mean())
print('Validation score: ', third_model_scores['test_score'].mean())
print('Second Model')
print('Train score: ', second_model_scores['train_score'].mean())
print('Validation score: ', second_model_scores['test_score'].mean())
print('First Model')
print('Train score: ', first_scores['train_score'].mean())
print('Validation score: ', first_scores['test_score'].mean())

# Trial 2:
# Third Model
# Train score:  0.42882665785284235
# Validation score:  0.43085378605110547
# Second Model
# Train score:  0.4288909091195216
# Validation score:  0.430601865447946
# First Model
# Train score:  0.3824853370125143
# Validation score:  0.37867372602330196

# Trial 6:
# Third Model
# Train score:  0.3966865317353403
# Validation score:  0.39227606270023035
# Second Model
# Train score:  0.40003825863753684
# Validation score:  0.3941216058870845
# First Model
# Train score:  0.3824853370125143
# Validation score:  0.37867372602330196

# Trial 7a:
# Third Model
# Train score:  0.5115716743687072
# Validation score:  0.5087137858963889
# Second Model
# Train score:  0.5166094744927036
# Validation score:  0.5121567236780332
# First Model
# Train score:  0.4026275172522893
# Validation score:  0.39002239307457764

# Trial 7b:
# Third Model
# Train score:  0.5097722382826495
# Validation score:  0.5059020926945704
# Second Model
# Train score:  0.5166094744927036
# Validation score:  0.5121567236780332
# First Model
# Train score:  0.4026275172522893
# Validation score:  0.39002239307457764

In [None]:
# Examine OLS summary table to examine coefficients of third model
# Trials 1, 3, 4, 5
# N/A

# Trials 2, 6, 7
sm.OLS(y_train, sm.add_constant(less_features)).fit().summary()

In [None]:
# Build final model and score it
# Trial 1
# final_features = ['bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot',
#                   'floors', 'waterfront', 'view', 'grade']

# Trial 2
# final_features = ['bedrooms', 'sqft_living', 'sqft_lot',
#                   'floors', 'waterfront', 'view']

# Trial 3
# final_features = ['sqft_living', 'sqft_lot',
#                   'floors', 'waterfront', 'view']

# Trial 4
# final_features = ['sqft_living', 'sqft_lot',
#                   'floors', 'view']

# Trial  5
# final_features = ['bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot',
#                   'floors', 'view', 'grade']

# Trial  6
# final_features = ['bedrooms', 'sqft_living', 'sqft_lot', 'floors']

# Trial  7a
# final_features = ['bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot',
#                   'waterfront', 'view', 'condition', 'grade']

# Trial  7b
final_features = ['bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot',
                  'view', 'condition', 'grade']

X_train_final = X_train[final_features]
X_test_final = X_test[final_features]

final_model = LinearRegression()
final_model.fit(X_train_final, y_train)

final_model.score(X_test_final, y_test)

# Trial 1 Score: 0.511874207507455
# Trial 2 Score: 0.451456418439514
# Trial 3 Score: 0.4438049237298961
# Trial 4 Score: 0.44217810233401955
# Trial 5 Score: 0.5092022681763618
# Trial 6 Score: 0.41074053515426934
# Trial 7a Score: 0.5290284607510969
# Trial 7b Score: 0.5267250835532156

## Results
***
Ca-ching

In [None]:
# Show feature correlation of training data

# Trial 1
# final_features_include_price = ['price', 'bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot',
#                                 'floors', 'waterfront', 'view', 'grade']

# Trial 2: 
# final_features_include_price = ['price', 'bedrooms', 'sqft_living', 'sqft_lot',
#                                 'floors', 'waterfront', 'view']

# Trial 3: 
# final_features_include_price = ['price', 'sqft_living', 'sqft_lot',
#                                 'floors', 'waterfront', 'view']

# Trial 4:
# final_features_include_price = ['price', 'sqft_living', 'sqft_lot', 'floors', 'view']

# Trial 5:
# final_features_include_price = ['price', 'bedrooms', 'bathrooms', 'sqft_living',
#                                 'sqft_lot', 'floors', 'view', 'grade']

# Trial 6:
# final_features_include_price = ['price', 'bedrooms', 'sqft_living', 'sqft_lot', 'floors']

# Trial 7a:
# final_features_include_price = ['price', 'bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot',
#                                 'waterfront', 'view', 'condition', 'grade']

# Trial 7b:
final_features_include_price = ['price', 'bedrooms', 'bathrooms', 'sqft_living',
                                'sqft_lot', 'view', 'condition', 'grade']


final_features_infer_df = infer_df[final_features_include_price]
corr = final_features_infer_df.corr()

fig, ax = plt.subplots(figsize=(16,10))
sns.heatmap(data=corr, mask=np.triu(np.ones_like(corr, dtype=bool)),
            ax=ax,annot=True, cbar_kws={"label": "Correlation",
                                        "orientation": "horizontal",
                                        "pad": .2, "extend": "both"})
ax.set_title('Correlation Heatmap of Inferential Model Features');
plt.savefig('./data/correlation_heatmap.jpg', dpi=300)

In [None]:
# Check RMSE
# Take the square root of the MSE explicitly: the `squared=False` shortcut
# was deprecated in scikit-learn 1.4 and removed in 1.6.
RMSE = np.sqrt(mean_squared_error(y_test, final_model.predict(X_test_final)))

RMSE

# Trial 1 RMSE: 173586.9619946568
# Trial 2 RMSE: 184016.51147291518
# Trial 3 RMSE: 185295.46668845217
# Trial 4 RMSE: 185566.2552490975
# Trial 5 RMSE: 174061.41023609342
# Trial 6 RMSE: 190723.6316965016
# Trial 7a RMSE: 170509.49027367876
# Trial 7b RMSE: 170925.93650008747

In [None]:
# Coefficients and intercept of final model
print(pd.Series(final_model.coef_, index=X_train_final.columns, name="Coefficients"))
print("Intercept:", final_model.intercept_)

# Trial 1:
# bedrooms       -12701.455316
# bathrooms      -21503.827436
# sqft_living       131.799022
# sqft_lot           -1.004437
# floors          -9873.331838
# waterfront     201394.511585
# view            40887.654188
# grade           91304.929574
# Name: Coefficients, dtype: float64
# Intercept: -355895.2075841872

# Trial 2:
# bedrooms       -28110.327140
# sqft_living       209.106764
# sqft_lot           -0.960157
# floors          23539.481932
# waterfront     180402.784943
# view            45219.673734
# Name: Coefficients, dtype: float64
# Intercept: 137305.31369272142

# Trial 3:
# sqft_living       187.609784
# sqft_lot           -0.858892
# floors          26848.382311
# waterfront     184712.899065
# view            47335.907533
# Name: Coefficients, dtype: float64
# Intercept: 79592.55837849632

# Trial 4:
# sqft_living      186.707086
# sqft_lot          -0.822644
# floors         27252.036823
# view           50456.581770
# Name: Coefficients, dtype: float64
# Intercept: 80128.47086002585

# Trial 5:
# bedrooms      -12925.470940
# bathrooms     -21570.658078
# sqft_living      131.268752
# sqft_lot          -0.965824
# floors         -9301.170791
# view           44287.779986
# grade          91020.604718
# Name: Coefficients, dtype: float64
# Intercept: -353361.70400261175

# Trial 6:
# bedrooms      -34895.027351
# sqft_living      225.647839
# sqft_lot          -0.906406
# floors         17789.230753
# Name: Coefficients, dtype: float64
# Intercept: 147838.76873069798

# Trial 7a:
# bedrooms       -15484.751485
# bathrooms      -18483.736434
# sqft_living       128.488448
# sqft_lot           -1.037998
# waterfront     207724.208474
# view            39203.973986
# condition       48150.886040
# grade           95244.182246
# Name: Coefficients, dtype: float64
# Intercept: -554302.4580548927

# Trial 7b:
# bedrooms      -15734.933848
# bathrooms     -18384.720739
# sqft_living      127.929927
# sqft_lot          -1.000784
# view           42698.987942
# condition      47902.050447
# grade          95015.067619
# Name: Coefficients, dtype: float64
# Intercept: -550675.8977261952

In [None]:
# Restore the original matplotlib rc settings before drawing. reset_orig has
# to be called; the bare attribute reference did nothing.
sns.reset_orig()
coef_s = pd.Series(final_model.coef_, index=X_train_final.columns, name="Coefficients")
coef_s = coef_s.sort_values(ascending=False)
# Colour each bar by the sign of its coefficient so the colours stay correct
# whichever feature set the final model uses
colors_ax = ['darkgreen' if v > 0 else 'darkred' for v in coef_s]
display(coef_s)

fig, ax = plt.subplots(figsize=(16, 10))
ax.bar(coef_s.index, coef_s.values, color=colors_ax)
ax.axhline(y=0, color='black')
ax.set_xlabel('Features')
ax.set_ylabel('Coefficients ($)')
ax.set_title('Coefficients of Features')
xlocs, xlabs = plt.xticks()
# Label every bar with its dollar value: just above positive bars and just
# below the others
for xloc, v in zip(xlocs, coef_s):
    amount = str(abs(round(v, 2)))
    if v > 0:
        plt.text(xloc - 0.225, v + 1000, '$' + amount)
    else:
        plt.text(xloc - 0.225, v - 3000, '-$' + amount)
plt.savefig('./data/coefficients.jpg', dpi=300);

In [None]:
# Checking that the independence (aka no multicollinearity) assumption holds
# NOTE(review): the feature matrix is passed without a constant column, so
# these may be uncentered VIFs -- confirm whether sm.add_constant is wanted.
vif = [
    variance_inflation_factor(X_train_final.values, position)
    for position in range(len(X_train_final.columns))
]
pd.Series(vif, index=X_train_final.columns, name="Variance Inflation Factor")

# Trial 1:
# bedrooms       21.471617
# bathrooms      24.188882
# sqft_living    21.695238
# sqft_lot        1.790673
# floors         12.856621
# waterfront      1.088276
# view            1.232208
# grade          32.032787
# Name: Variance Inflation Factor, dtype: float64

# Trial 2: 
# bedrooms       14.669954
# sqft_living    15.641708
# sqft_lot        1.735450
# floors          7.823340
# waterfront      1.088188
# view            1.226896
# Name: Variance Inflation Factor, dtype: float64

# Trial 3: 
# sqft_living    8.207268
# sqft_lot       1.726465
# floors         6.828538
# waterfront     1.088187
# view           1.216567
# Name: Variance Inflation Factor, dtype: float64

# Trial 4:
# sqft_living    8.178242
# sqft_lot       1.722206
# floors         6.819759
# view           1.123939
# Name: Variance Inflation Factor, dtype: float64

# Trial 5:
# bedrooms       21.471491
# bathrooms      24.188339
# sqft_living    21.665891
# sqft_lot        1.786698
# floors         12.849574
# view            1.139868
# grade          32.030612
# Name: Variance Inflation Factor, dtype: float64

# Trial 6:
# bedrooms       14.537254
# sqft_living    14.949514
# sqft_lot        1.730840
# floors          7.793032
# Name: Variance Inflation Factor, dtype: float64

# Trial 7a:
# bedrooms       24.021589
# bathrooms      21.743362
# sqft_living    23.314119
# sqft_lot        1.763216
# waterfront      1.087689
# view            1.225547
# condition      20.083208
# grade          44.842451
# Name: Variance Inflation Factor, dtype: float64

# Trial 7b:
# bedrooms       24.020809
# bathrooms      21.743050
# sqft_living    23.284192
# sqft_lot        1.759836
# view            1.134468
# condition      20.083027
# grade          44.833321
# Name: Variance Inflation Factor, dtype: float64

In [None]:
vif_s = pd.Series(vif, index=X_train_final.columns, name="Variance Inflation Factor")
vif_s = vif_s.sort_values(ascending=False)
# Colour bars relative to the threshold drawn on the chart, so the colours
# stay correct whichever feature set the final model uses
vif_threshold = 5
colors_ax = ['darkred' if v > vif_threshold else 'darkgreen' for v in vif_s]

fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(vif_s.index, vif_s.values, color=colors_ax)
ax.axhline(y=vif_threshold, color='black')
ax.set_xlabel('Features')
ax.set_ylabel('VIF')
ax.set_title('Variance Inflation Factors of Features');
ax

In [None]:
# Checking linearity assumption holds
preds = final_model.predict(X_test_final)
fig, ax = plt.subplots()

# A perfect fit is predicted == actual (y = x), so the reference line needs
# the same values on both axes. Passing a single array to plot() would draw
# it against its positional index, offsetting the line from y = x.
perfect_line = np.array([y_test.min(), y_test.max()])
ax.plot(perfect_line, perfect_line, linestyle="--", color="red", label="Perfect Fit")
ax.scatter(y_test, preds, alpha=0.5)
ax.set_xlabel("Actual Price")
ax.set_ylabel("Predicted Price")
ax.legend();

In [None]:
# Checking normality assumption holds
residuals = (y_test - preds)
sm.graphics.qqplot(residuals, dist=stats.norm, line='45', fit=True);

In [None]:
# Checking homoscedasticity: residuals plotted against predictions, with a
# zero reference line drawn across the predicted values
fig, ax = plt.subplots()

ax.scatter(preds, residuals, alpha=0.5)
ax.plot(preds, [0] * len(X_test))
ax.set_xlabel("Predicted Value")
ax.set_ylabel("Actual - Predicted Value");

## Predictive Model - Experimentation

In [None]:
# # Create model training and testing data
# X = df.drop(columns=['id', 'date', 'yr_built', 'yr_renovated',
#                      'zipcode', 'lat', 'long', 'sqft_living15', 'sqft_lot15'])
# X = np.log(X+1)
# y = X.price
# X = X.drop(columns='price')

# X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [None]:
# # Examine skew of final features and target
# X_train.hist(figsize=(12,12));

In [None]:
# # Examine target ('price') distribution
# sns.distplot(y_train, fit=stats.norm)
# fig = plt.figure()
# stats.probplot(y_train, plot=plt);

In [None]:
# # Show feature correlation of training data
# train_data = pd.concat([X_train, y_train], axis=1)
# corr = train_data.corr()

# fig, ax = plt.subplots(figsize=(12,12))
# sns.heatmap(data=corr, mask=np.triu(np.ones_like(corr, dtype=bool)),
#             ax=ax,annot=True, cbar_kws={"label": "Correlation",
#                                         "orientation": "horizontal",
#                                         "pad": .2, "extend": "both"});

In [None]:
# # Show scatter plots of training data compared to target
# fig, axes = plt.subplots(ncols=3, nrows=4, figsize=(16, 10))

# fig.set_tight_layout(True)

# for index, col in enumerate(X_train.columns):
#     ax = axes[index//3][index%3]
#     ax.scatter(X_train[col], y_train) #, alpha=0.2)
#     ax.set_xlabel(col)
#     ax.set_ylabel('price')

# fig.delaxes(axes[3][2])

In [None]:
# # Examine target ('price') distribution
# sns.distplot(y_train, fit=stats.norm)
# fig = plt.figure()
# stats.probplot(y_train, plot=plt);

In [None]:
# # Run log function to normalize target data
# y_train_log = np.log(y_train+1)
# y_test_log = np.log(y_test+1)

In [None]:
# # Re-examine target ('price') 
# sns.distplot(y_train_log, fit=stats.norm)
# fig = plt.figure()
# stats.probplot(y_train_log, plot=plt);

In [None]:
# # Examine skew of final features
# X_train.hist(figsize=(12,12));

In [None]:
# # Apply log to continuous features and re-examine skew
# X_train_continuous_log = pd.DataFrame([])
# X_train_continuous_log['sqft_living_log'] = np.log(X_train['sqft_living']+1)
# X_train_continuous_log['sqft_lot_log'] = np.log(X_train['sqft_lot']+1)
# X_train_continuous_log['sqft_basement_log'] = np.log(X_train['sqft_basement']+1)
# X_train_continuous_log.hist(figsize=(12,12));

In [None]:
# X_train_discrete = X_train[['bedrooms', 'floors']]
# train_data = pd.concat([X_train_continuous_log, X_train_discrete, y_train_log], axis=1)
# train_data

In [None]:
# # Scale all training features
# train_columns = train_data.columns
# scaler = StandardScaler()
# train_scaled = scaler.fit_transform(train_data)
# train_scaled_df = pd.DataFrame(train_scaled, columns=train_columns)
# X_train_scaled = train_scaled_df.drop(columns=['price'])
# y_train_scaled = train_scaled_df.price
# display(X_train_scaled)
# display(y_train_scaled)

In [None]:
# # Repeat the above process for the test data
# # X_test_continuous_log = pd.DataFrame([])
# # X_test_continuous_log['sqft_living_log'] = np.log(X_test['sqft_living'])
# # X_test_continuous_log['sqft_lot_log'] = np.log(X_test['sqft_lot'])

# # X_test_discrete = X_test[['bedrooms', 'floors']]
# # test_data = pd.concat([X_test_continuous_log, X_test_discrete, y_test_log], axis=1)
# # display(test_data)

# test_data = pd.concat([X_test, y_test], axis=1)
# test_columns = test_data.columns
# scaler = StandardScaler()
# test_scaled = scaler.fit_transform(test_data)
# test_scaled_df = pd.DataFrame(test_scaled, columns=test_columns)
# X_test_scaled = test_scaled_df.drop(columns=['price'])
# y_test_scaled = test_scaled_df.price
# display(X_test_scaled)
# display(y_test_scaled)

In [None]:
# final_features = X_train_scaled.columns

# X_train_final = X_train_scaled[final_features]
# X_test_final = X_test_scaled[final_features]

# final_model = LinearRegression()
# final_model.fit(X_train_final, y_train_scaled)

# final_model.score(X_test_final, y_test_scaled)

In [None]:
# sm.OLS(y_train_scaled, sm.add_constant(X_train_scaled)).fit().summary()

In [None]:
# # Check RMSE
# RMSE = mean_squared_error(y_test_scaled, final_model.predict(X_test_final), squared=False)

# RMSE

In [None]:
# # Coefficients and intercept of final model
# print(pd.Series(final_model.coef_, index=X_train_final.columns, name="Coefficients"))
# print("Intercept:", final_model.intercept_)


In [None]:
# # Checking linearity assumption holds
# preds = final_model.predict(X_test_final)
# fig, ax = plt.subplots()

# perfect_line = np.arange(y_test_scaled.min(), y_test_scaled.max())
# ax.plot(perfect_line, linestyle="--", color="red", label="Perfect Fit")
# ax.scatter(y_test_scaled, preds, alpha=0.5)
# ax.set_xlabel("Actual Price")
# ax.set_ylabel("Predicted Price")
# ax.legend();

In [None]:
# # Checking that the normality assumption holds
# residuals = (y_test_scaled - preds)
# sm.graphics.qqplot(residuals, dist=stats.norm, line='45', fit=True);

In [None]:
# # Checking that the independence (i.e. no multicollinearity) assumption holds
# vif = [variance_inflation_factor(X_train_final.values, i) for i in range(X_train_final.shape[1])]
# pd.Series(vif, index=X_train_final.columns, name="Variance Inflation Factor")

In [None]:
# # Checking homoscedasticity
# fig, ax = plt.subplots()

# ax.scatter(preds, residuals, alpha=0.5)
# ax.plot(preds, [0 for i in range(len(X_test_scaled))])
# ax.set_xlabel("Predicted Value")
# ax.set_ylabel("Actual - Predicted Value");

In [None]:
# # take a small sample because putting the whole DataFrame on the map would be too much to load
# sample = df.sample(20,random_state=33)
# # creates a map centered on the sample's mean latitude/longitude (roughly the center of King County)
# mp = folium.Map(location=[sample.lat.mean(),
#                           sample.long.mean()], zoom_start=10, control_scale=True)
# # adds a marker for each home in the sample, with its price as the popup
# for index, location_info in sample.iterrows():
#     folium.Marker([location_info["lat"], location_info["long"]],
#                   popup="$" + str(location_info["price"])).add_to(mp)
# mp

## Recommendations
***
Here they are:

## Overall Conclusions
***
They are: