#### Zillow Clustering Analysis
**Artifact: Jupyter Notebook Report**

Created by: Mijail Q. Mariano

Presented: Tuesday, August 09th 2022

-----

**Required Libraries & Modules**

In [None]:
%matplotlib inline
import matplotlib as mlp
# mlp.rcParams['figure.dpi'] = 300

# disabling warning messages
import warnings
warnings.filterwarnings("ignore")

# importing key libraries
import pandas as pd
pd.set_option('display.max_rows', None)
pd.options.display.float_format = '{:.2f}'.format

# numpy import
import numpy as np

# scipy import
import scipy.stats as stats
import math
from math import sqrt

# datetime module for home transaction dates
import datetime

# importing acquire module
import acquire
from acquire import get_zillow_dataset, \
                    clean_zillow_dataset, \
                    age_of_homes, \
                    get_lower_and_upper_bounds, \
                    zillow_outliers, \
                    clean_months, \
                    null_df, \
                    train_iterative_imputer, \
                    impute_val_and_test, \
                    train_validate_test_split, \
                    get_transaction_quarters, \
                    get_dummy_dataframes, \
                    get_cluster_dummy

# importing data visualization libraries
import matplotlib.pyplot as plt
import seaborn as sns 
sns.set(style = "darkgrid")

# sklearn data science library
from sklearn.impute import KNNImputer
from sklearn.experimental import enable_iterative_imputer
from sklearn.metrics import mean_squared_error
from sklearn.impute import IterativeImputer
from sklearn.cluster import KMeans

from sklearn.preprocessing import RobustScaler, StandardScaler, MinMaxScaler, PolynomialFeatures
from sklearn.linear_model import LinearRegression, LassoLars, TweedieRegressor
from sklearn.decomposition import PCA 
from sklearn.model_selection import RepeatedKFold
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.inspection import permutation_importance
from sklearn.feature_selection import SelectKBest, RFE
from sklearn.feature_selection import RFECV

----
### **``Project Scope and Objectives:``**

<u>``Scope:``</u> Apply machine learning clustering techniques to better segment Zillow's real-estate data and reduce the overall error in predicted home evaluations as measured by Zillow's "Zestimate". 

<u>``Objectives:``</u>

1. Acquire and clean the Zillow dataset
2. Apply and treat outliers in the dataset (make recommendations)
3. Identify and treat missing values in the dataset
4. Generate statistical hypotheses for testing
5. Use dataset features to create clusters
6. Generate visualizations to interpret clusters and make selections for modeling
7. Use clusters to model logerror predictions and evaluate results
8. Make final recommendations



-----

### **``Data Acquisition and Preparation``**

**``Key Highlights``**

- Used domain knowledge and research to focus on key questions for analysis
- Renamed and converted columns/features to proper data type
- Dropped initial records and features with > 80% Null values
- Added columns:
    - "Home Age"
    - "Transactions by Month"
    - "Transactions by Quarter"





*These records were homes larger than the majority of homes in the dataset; including them would obscure the analysis and make future predictions less accurate.*

In [None]:
# acquiring and preparing initial zillow dataset
# The three helpers are imported from the local `acquire` module and are applied in
# sequence, each one receiving the frame returned by the previous step.
# NOTE(review): their internals are not visible in this notebook — presumably
# load -> clean -> add a home-age feature; confirm against acquire.py.

df = get_zillow_dataset()
df = clean_zillow_dataset(df)
df = age_of_homes(df)

# preview of the prepared frame
df.head()

----
```Dataset Summary Statistics:```

In [None]:
# dataframe info, with the columns listed alphabetically so they are easy to scan

sorted_cols = df.columns.sort_values()
df.loc[:, sorted_cols].info()

In [None]:
# dataframe summary statistics: one row per numeric column, plus a max - min "range" column

summary_stats = (
    df.describe()
      .transpose()
      .assign(range = lambda stats_tbl: stats_tbl["max"] - stats_tbl["min"])
)

# rows ordered alphabetically by feature name
summary_stats.sort_index()

-----
```Handling Outliers:```

In [None]:
# using "iqr" method to determine lower and upper bounds for continuous variables
# NOTE(review): the helper lives in the local `acquire` module; the IQR multiplier and
# the set of columns it reports are not visible here — confirm in acquire.py.
# The call is the cell's last expression, so whatever it returns is displayed.

get_lower_and_upper_bounds(df)

In [None]:
# cleaning dataset for outliers at lower and upper bounds
# `df` is rebound to the helper's return value, so every later cell works on the
# outlier-treated frame.
# NOTE(review): which columns/thresholds are applied is defined in acquire.zillow_outliers — confirm there.

df = zillow_outliers(df)

#### <u>**Note on Outliers:**</u>


----

In [None]:
# adding transactions by month column
# NOTE(review): later cells read a "transaction_month" column holding month names
# ("January" ... "September"); presumably this helper creates it — confirm in acquire.py.

df = clean_months(df)
df.head()

----
#### **``Splitting the Zillow Dataset for Hypothesis Testing:``**

Highlights:

- initial dataset: (52238, 14)
- dataset after cleaning: (38293, 15)

In [None]:
# splitting the Zillow dataset
# Produces the three partitions used throughout the rest of the notebook.
# NOTE(review): split proportions and random seed are set inside
# acquire.train_validate_test_split and are not visible here — confirm for reproducibility.

train, validate, test = train_validate_test_split(df)

In [None]:
# detecting and handling missing values in the training dataset
# The result is stored under its own name: assigning it to `null_df` would overwrite the
# imported helper of the same name, and the cell would then fail when re-run
# (a DataFrame is not callable).

train_null_summary = null_df(train)
train_null_summary

In [None]:
# handling remaining null values using sklearn's iterative imputer
# NOTE(review): the imputer configuration lives in acquire.train_iterative_imputer — confirm there.

train_imputed = train_iterative_imputer(train)
# per-column count of remaining nulls, displayed as the cell output
train_imputed.isnull().sum() # checks out!

In [None]:
# handling validate and test datasets
# `train` is passed alongside validate/test.
# NOTE(review): presumably the imputer is fit on train only and then applied to the
# other two splits — confirm in acquire.impute_val_and_test.

validate_imputed, test_imputed = impute_val_and_test(train, validate, test)

-----
### <u>``Analyzing the Target Variable (logerror)``</u>

*logerror = log(Predicted: Zestimate) − log(Actual: Home Transaction Price)*

In [None]:
# setting alpha for hypothesis tests
# significance level that the ANOVA cells in this notebook compare `p_value` against

alpha = 0.05

In [None]:
# examining logerror by county and home value
# sns.jointplot is a figure-level function that builds its own figure, so no plt.figure()
# call is made beforehand (it would only emit an extra, empty figure in the output).

sns.set(font_scale = .5)

# a fixed 5000-row sample keeps the scatter readable and reproducible
p = sns.jointplot(
    data = train_imputed.sample(5000, random_state = 14),
    x = "home_value", 
    y = "logerror", 
    hue = "county_by_fips",
    s = 4)

p.fig.suptitle("Logerror by County Home Value")
p.fig.tight_layout()
p.fig.subplots_adjust(top = .95) # Reduce plot to make room 

plt.show()

In [None]:
# examining logerror distribution by county (exclusive of outliers)
# For each county: print the variance of logerror, then draw its histogram.

county_lst = list(train_imputed["county_by_fips"].unique())

for ele in county_lst:
    # filter once and reuse for both the statistic and the plot
    county_logerror = train_imputed.loc[train_imputed["county_by_fips"] == ele, "logerror"]

    print(f'County: {ele}')
    # the printed statistic is the variance, so it is labelled as such
    print(f'logerror variance: {round(county_logerror.var(), 5)}')
    print('-------------------------')

    plt.figure(figsize = (8,3))
    # x-axis limited to +/- 0.2 so the bulk of the distribution is visible
    plt.xlim(-.2, .2)

    plt.title('Distribution of logerror for {}'.format(ele))
    plt.hist(county_logerror, bins = 50)
    # show inside the loop so each figure appears directly under its own printed summary
    plt.show()

----
**```Hypothesis Question Number 1: Is there a difference in logerror across transaction month?```**

Given the many economic factors that may influence the housing market, I believe that this could also lead to challenges in accurate and timely home evaluations - thus leading to over- or under-estimating a home's true value.

**Null Hypothesis:** "There's **not** a statistical logerror difference across transaction months."

**Alternative Hypothesis:** "There **is** a statistical logerror difference across transaction months."

$\alpha$: 0.05

- Monthly logerror variances are relatively equal.

In [None]:
# plotting mean logerror per transaction month (bars carry a 95% confidence interval)

# calendar order for the x-axis; only January-September are plotted
month_order = [
    'January',
    'February',
    'March',
    'April',
    'May',
    'June',
    'July',
    'August',
    'September']

plt.figure(figsize = (10, 6))
sns.set(font_scale = 0.6)

sns.barplot(
    data = train_imputed,
    x = "transaction_month",
    y = "logerror",
    order = month_order,
    palette = "crest_r",
    ci = 95)

plt.title("Yearly Glance: Log Error by Month")
plt.xlabel(None)
plt.show()

In [None]:
# creating transaction yearly quarter columns (future testing)
# Each flag column is True when the row's transaction month falls in that quarter.

quarter_months = {
    "q1_transaction": ["January", "February", "March"],
    "q2_transaction": ["April", "May", "June"],
    "q3_transaction": ["July", "August", "September"],
}

# same three boolean columns, in the same order, on train / validate / test
for split_df in (train_imputed, validate_imputed, test_imputed):
    for flag_col, months_in_quarter in quarter_months.items():
        split_df[flag_col] = split_df["transaction_month"].isin(months_in_quarter)

In [None]:
# melting quarter transaction columns
# The three boolean flags are collapsed into one "transaction_quarter" column whose value
# is the name of the flag that is True for the row.
# NOTE: idxmax returns the first column on ties, so a row with no True flag would be
# labelled "q1_transaction"; the plots in this notebook only cover January-September.

quarter_flags = ["q1_transaction", "q2_transaction", "q3_transaction"]

for split_df in (train_imputed, validate_imputed, test_imputed):
    # idxmax already returns a Series aligned to the frame, so no to_frame() is needed
    split_df["transaction_quarter"] = split_df[quarter_flags].idxmax(axis = 1)

    # dropping redundant columns
    split_df.drop(columns = quarter_flags, inplace = True)

# preview the frame that was actually modified (previously this displayed `train`,
# which does not contain the new column)
train_imputed.head()

In [None]:
# ANOVA hypothesis test for: logerror across transaction months
# One logerror sample per month (January-September) is passed to a one-way ANOVA.

month_names = [
    "January", "February", "March",
    "April", "May", "June",
    "July", "August", "September"]

logerror_by_month = [
    train_imputed.loc[train_imputed["transaction_month"] == month_name, "logerror"]
    for month_name in month_names]

f_statistic, p_value = stats.f_oneway(*logerror_by_month)

# reporting the test result and comparing the p_value to alpha
print(f'f statistic = {f_statistic.round(5)}')
print(f'p value = {p_value.round(5)}')
print("-----------------")

if not p_value < alpha:
    print('We fail to reject the null hypothesis. Means are similar across the groups.')
else:
    print('We reject the null hypothesis. Means are different across the groups.')

----
```Hypothesis Question Number 2: Is there a difference in logerror across home sizes (binned living sq-feet)?```

Social and cultural changes, such as the age at which people start families and growing interest in smaller, more sustainable, eco-friendly lifestyles, can all play a role in determining how a home's size is valued. I believe factors such as these can undoubtedly make predicting the precise true value of a home more difficult.

**Null Hypothesis:** "There's **not** a statistical logerror difference across home sizes."

**Alternative Hypothesis:** "There **is** a statistical logerror difference across home sizes."

$\alpha$: 0.05

- Home Size logerror variances are not equal.

<u>**Binned Home Sizes:**</u>

* 360 - 1241 sq. ft (smallest)
* 1241 - 1566 sq. ft
* 1566 - 2037 sq. ft
* 2037 - 3855 sq ft (largest)

In [None]:
# generating new home size column
# Quartile edges are learned from the TRAIN split only and then re-used for validate and
# test. Binning each split on its own quartiles would give every split different
# boundaries, so the same label would mean a different size range per split.

sq_ft_labels = ["360_to_1240_sqfeet", "1241_to_1565_sqfeet", "1566_to_2036_sqfeet", "2037_to_3855_sqfeet"]

train_sqft_bins, sqft_edges = pd.qcut(
    train_imputed["living_sq_feet"],
    q = 4,
    labels = sq_ft_labels,
    retbins = True)

train_imputed["living_sqfeet_binned"] = train_sqft_bins

# open the outer edges so validate/test values outside the train range still fall into
# the first/last bin instead of becoming NaN
sqft_edges = np.array(sqft_edges, dtype = float)
sqft_edges[0], sqft_edges[-1] = -np.inf, np.inf

for split_df in (validate_imputed, test_imputed):
    split_df["living_sqfeet_binned"] = pd.cut(
        split_df["living_sq_feet"],
        bins = sqft_edges,
        labels = sq_ft_labels)

train_imputed.head()

In [None]:
# plotting mean logerror per home-size bin, largest homes first (95% confidence interval)

size_order = [
    "2037_to_3855_sqfeet",
    "1566_to_2036_sqfeet",
    "1241_to_1565_sqfeet",
    "360_to_1240_sqfeet"]

plt.figure(figsize = (10, 6))
sns.set(font_scale = 0.6)

sns.barplot(
    data = train_imputed,
    x = "living_sqfeet_binned",
    y = "logerror",
    order = size_order,
    palette = "crest_r",
    ci = 95)

plt.title("Log Error by Home Size")
plt.xlabel(None)
plt.show()

In [None]:
# ANOVA hypothesis test for: logerror across binned home sizes
# One logerror sample per size bin is passed to a one-way ANOVA.

size_bins = [
    "360_to_1240_sqfeet",
    "1241_to_1565_sqfeet",
    "1566_to_2036_sqfeet",
    "2037_to_3855_sqfeet"]

logerror_by_size = [
    train_imputed.loc[train_imputed["living_sqfeet_binned"] == size_bin, "logerror"]
    for size_bin in size_bins]

f_statistic, p_value = stats.f_oneway(*logerror_by_size)

# reporting the test result and comparing the p_value to alpha
print(f'f statistic = {f_statistic.round(5)}')
print(f'p value = {p_value.round(5)}')
print("-----------------")

if not p_value < alpha:
    print('We fail to reject the null hypothesis. Means are similar across the groups.')
else:
    print('We reject the null hypothesis. Means are different across the groups.')

----
```Hypothesis Question Number 3: Is there a difference in logerror across building era?```

As time passes so do the architectural design methods and the kinds of homes that are built. For example, colonial style homes may be reminiscent of a 17th-18th century time period. 

A mid-century modern design home - may provide a feeling of both nostalgia and future creativity. In either case, I believe that as time passes so do the building styles of home and ultimately which home styles are more prevailing in current times. Unfortunately, similar to home styles - these preference trends are just as difficult to predict but could be valuable to understand.

**Null Hypothesis:** "There's **not** a statistical logerror difference across building era."

**Alternative Hypothesis:** "There **is** a statistical logerror difference across building era."

$\alpha$: 0.05

- Build Era logerror variances are not equal.

<u>**Home Building Eras:**</u>

* 1977 - 2015: New Century
* 1960 - 1976: Late 20th Century
* 1950 - 1959: Mid 20th Century
* 1907 - 1949: Early 20th Century

In [None]:
# creating new home build era column 
# Labels run from the lowest home_age quartile (newest homes) to the highest (oldest).
# Quartile edges are learned from the TRAIN split only and re-used for validate and test,
# so an era label covers the same age range in every split.

age_labels = ["new_century", "late_20th_century", "mid_20th_century", "early_20th_century"]

train_age_bins, age_edges = pd.qcut(
    train_imputed["home_age"],
    q = 4,
    labels = age_labels,
    retbins = True)

train_imputed["home_age_binned"] = train_age_bins

# open the outer edges so validate/test ages outside the train range still fall into
# the first/last bin instead of becoming NaN
age_edges = np.array(age_edges, dtype = float)
age_edges[0], age_edges[-1] = -np.inf, np.inf

for split_df in (validate_imputed, test_imputed):
    split_df["home_age_binned"] = pd.cut(
        split_df["home_age"],
        bins = age_edges,
        labels = age_labels)

train_imputed.head()

In [None]:
# plotting mean logerror per building era, newest era first
# error bars use a 20% confidence interval (ci = 20)

era_order = ["new_century", "late_20th_century", "mid_20th_century", "early_20th_century"]

plt.figure(figsize = (10, 6))
sns.set(font_scale = 0.6)

sns.barplot(
    data = train_imputed,
    x = "home_age_binned",
    y = "logerror",
    order = era_order,
    palette = "magma_r",
    ci = 20)

plt.title("Differences in Home Era Preferences")
plt.xlabel(None)
plt.show()

In [None]:
# ANOVA hypothesis test for: logerror across home building eras
# One logerror sample per era is passed to a one-way ANOVA.

era_names = ["new_century", "late_20th_century", "mid_20th_century", "early_20th_century"]

logerror_by_era = [
    train_imputed.loc[train_imputed["home_age_binned"] == era_name, "logerror"]
    for era_name in era_names]

f_statistic, p_value = stats.f_oneway(*logerror_by_era)

# reporting the test result and comparing the p_value to alpha
print(f'f statistic = {f_statistic.round(5)}')
print(f'p value = {p_value.round(5)}')
print("-----------------")

if not p_value < alpha:
    print('We fail to reject the null hypothesis. Means are similar across the groups.')
else:
    print('We reject the null hypothesis. Means are different across the groups.')

-----
### <u>``Scaling Data and Clustering Features``</u>

**Highlights:**

In [None]:
# creating dummy variables for clustering
# NOTE(review): later cells select columns such as "transaction_month_January" and
# "living_sqfeet_binned_360_to_1240_sqfeet" from these frames, so the helper presumably
# one-hot encodes the categorical columns — confirm in acquire.get_dummy_dataframes.

train_dummy, validate_dummy, test_dummy = get_dummy_dataframes(train_imputed, validate_imputed, test_imputed)
train_dummy.head()

In [None]:
# previewing the effect of sklearn's MinMaxScaler, one continuous feature at a time

# continuous features = every column whose dtype is not excluded below, minus the
# target (logerror) and home_value
excluded_dtypes = ["object", "category", "uint8", "int64", "bool"]
cont_lst = [
    ele
    for ele in train_dummy.select_dtypes(exclude = excluded_dtypes).columns.tolist()
    if ele not in ("logerror", "home_value")]

for col in cont_lst:
    # a throw-away scaler fit on this single column
    scaler = MinMaxScaler()
    x_scaled = scaler.fit_transform(train_dummy[[col]])

    plt.figure(figsize=(18, 6))

    # left panel: original distribution
    plt.subplot(121)
    sns.histplot(train_dummy[[col]], bins = 25, edgecolor = 'black', label = col)
    plt.title(f'Original: {col}')
    plt.legend()

    # right panel: scaled distribution
    plt.subplot(122)
    ax = sns.histplot(x_scaled, bins=25, edgecolor = 'black', label = "scaled")
    # removing axes scientific notation 
    ax.ticklabel_format(style = "plain") 
    plt.title(f'Scaled: {col}')
    plt.legend()

In [None]:
# scaling the continuous features (cont_lst) in every dataset
# The scaler is fit on the train dummy frame only and then applied everywhere.

scaler = MinMaxScaler()
scaler.fit(train_dummy[cont_lst])

# dummy datasets first, then the non-dummy datasets
frames_to_scale = (
    train_dummy, validate_dummy, test_dummy,
    train_imputed, validate_imputed, test_imputed)

for frame in frames_to_scale:
    frame[cont_lst] = scaler.transform(frame[cont_lst])

train_dummy.head()

----
#### **``Cluster Number 1: Monthly Cluster``**

In [None]:
# setting features to cluster on: lot size plus the one-hot transaction month columns

month_cluster_cols = [
    'property_sq_feet',
    'transaction_month_January',
    'transaction_month_February',
    'transaction_month_March',
    'transaction_month_April',
    'transaction_month_May',
    'transaction_month_June',
    'transaction_month_July',
    'transaction_month_August',
    'transaction_month_September'
    ]

# the same column selection is taken from each split
X_month = train_dummy[month_cluster_cols]   # train df
X_val = validate_dummy[month_cluster_cols]  # validate df
X_test = test_dummy[month_cluster_cols]     # test df

X_month.head()

In [None]:
# initial fitting of KMeans cluster
# random_state is fixed so the centroids, and therefore the cluster ids written to the
# datasets further down, are identical on every Restart & Run All.

kmeans = KMeans(n_clusters = 9, random_state = 14)
kmeans.fit(X_month)

# cluster id for every training row
train_clusters = kmeans.predict(X_month)

In [None]:
# KMeans Elbow-method: inertia of a fresh model for each candidate k

k_values = range(1, 12)

with plt.style.context('seaborn-whitegrid'):

    plt.figure(figsize=(10, 3))
    # seeded so the curve is reproducible between runs
    pd.Series({k: KMeans(k, random_state = 14).fit(X_month).inertia_ for k in k_values}).plot(marker = 'o')
    # ticks match the k values that were actually evaluated
    plt.xticks(k_values)
    
    # fixed y-range chosen by the author
    # NOTE(review): inertia values above 21000 are clipped — confirm this is still appropriate
    plt.ylim(0, 21000)
    plt.xlabel('k')
    plt.ylabel('inertia')
    plt.title('Change in inertia as k increases')

In [None]:
# plotting the cluster assignments for several values of k (one panel per k)

fig, axs = plt.subplots(3, 2, figsize=(13, 13), sharex=True, sharey=True)

# the sample is identical for every k, so it is drawn once outside the loop
X1_features = X_month.sample(1000, random_state = 14)

# logerror is attached (aligned on the index) for plotting only; it is not used for fitting
X1 = X1_features.assign(logerror = train_dummy["logerror"])

# six panels -> six values of k
for ax, k in zip(axs.ravel(), range(4, 10)):

    # creating and fitting KMeans (seeded for reproducible colouring)
    clusters = KMeans(k, random_state = 14).fit(X1_features).predict(X1_features)

    ax.scatter(
        X1["property_sq_feet"],
        X1["logerror"],
        c = clusters,
        s = 2)

    ax.set(title='k = {}'.format(k), xlabel = 'property_sq_feet', ylabel = 'logerror')

In [None]:
# assigning every row of each dataset to one of the fitted month clusters

train_month_clusters, val_month_clusters, test_month_clusters = (
    kmeans.predict(features) for features in (X_month, X_val, X_test))

# a reproducible peek at ten of the training assignments
pd.DataFrame({"Cluster Sample": train_month_clusters}).sample(10, random_state = 14)

In [None]:
# adding the month cluster ids back to the original (non-dummy) datasets
# The arrays are assigned positionally, one id per row.

month_cluster_by_split = zip(
    (train_imputed, validate_imputed, test_imputed),
    (train_month_clusters, val_month_clusters, test_month_clusters))

for split_df, month_ids in month_cluster_by_split:
    split_df["month_clusters"] = month_ids

train_imputed.head()

In [None]:
# comparing the actual transaction month (left) with the KMeans month clusters (right),
# both plotted as property_sq_feet vs. logerror on the same 1000-row sample

sns.set(font_scale = 0.6)
plt.subplots(1, 2, figsize = (16, 6), sharex = True, sharey = False)

# nine colours: one per month / per cluster
nine_color_palette = ['#a6cee3','#1f78b4','#b2df8a','#33a02c','#fb9a99','#e31a1c','#fdbf6f','#ff7f00','#cab2d6']
month_plot_sample = train_imputed.sample(1000, random_state = 14)

# left panel: coloured by actual month
plt.subplot(121)
sns.scatterplot(
        data = month_plot_sample,
        x = "property_sq_feet",
        y = "logerror",
        hue = "transaction_month",
        palette = nine_color_palette
)

plt.title("Actual: Transaction Month & Property Sq. Feet")
plt.legend(loc = 'upper left', ncol = 3)
plt.ylim(-.2, .2)

# right panel: coloured by cluster id
plt.subplot(122)
sns.scatterplot(
        data = month_plot_sample,
        x = "property_sq_feet",
        y = "logerror",
        hue = "month_clusters",
        palette = nine_color_palette
)

plt.title("Predicted: Clusters of Transaction Month & Property Sq. Feet")
plt.legend(loc = 'upper left', ncol = 3)
plt.ylim(-.2, .2)
plt.show()

----
#### **``Cluster Number 2: Home Size Cluster``**

In [None]:
# setting features to cluster on: home age plus the one-hot home-size bins

home_size_cluster_cols = [
    'home_age',
    'living_sqfeet_binned_360_to_1240_sqfeet',
    'living_sqfeet_binned_1241_to_1565_sqfeet',
    'living_sqfeet_binned_1566_to_2036_sqfeet',
    'living_sqfeet_binned_2037_to_3855_sqfeet'
]

# the same column selection is taken from each split
X_home_size = train_dummy[home_size_cluster_cols]
val_home_size = validate_dummy[home_size_cluster_cols]
test_home_size = test_dummy[home_size_cluster_cols]

X_home_size.head()

In [None]:
# creating the KMeans cluster object
# random_state is fixed so the cluster ids are identical on every Restart & Run All.
# NOTE: this rebinds `kmeans`; the month-cluster model fitted earlier is no longer
# reachable under that name after this cell.

kmeans = KMeans(n_clusters = 4, random_state = 14)
kmeans.fit(X_home_size)

# cluster id for every training row
clusters = kmeans.predict(X_home_size)

In [None]:
# KMeans Elbow-method: inertia of a fresh model for each candidate k

k_values = range(2, 12)

with plt.style.context('seaborn-whitegrid'):

    plt.figure(figsize=(10, 3))
    # seeded so the curve is reproducible between runs
    pd.Series({k: KMeans(k, random_state = 14).fit(X_home_size).inertia_ for k in k_values}).plot(marker = 'o')
    # ticks match the k values that were actually evaluated
    plt.xticks(k_values)
    
    plt.xlabel('k')
    plt.ylabel('inertia')
    plt.title('Change in inertia as k increases')

In [None]:
# plotting the cluster assignments for several values of k (one panel per k)

fig, axs = plt.subplots(3, 2, figsize=(13, 13), sharex=True, sharey=True)

# the sample is identical for every k, so it is drawn once outside the loop
X2_features = X_home_size.sample(2000, random_state = 14)

# logerror is attached (aligned on the index) for plotting only; it is not used for fitting
X2 = X2_features.assign(logerror = train_dummy["logerror"])

for ax, k in zip(axs.ravel(), range(2, 8)):
    # creating and fitting KMeans (seeded for reproducible colouring)
    clusters = KMeans(k, random_state = 14).fit(X2_features).predict(X2_features)

    ax.scatter(
        X2["home_age"], 
        X2["logerror"], 
        c = clusters,
        s = 2)

    ax.set(title='k = {}'.format(k), xlabel = 'home age', ylabel = 'logerror')

In [None]:
# assigning every row of each dataset to one of the fitted home-size clusters

train_size_clusters, val_size_clusters, test_size_clusters = (
    kmeans.predict(features) for features in (X_home_size, val_home_size, test_home_size))

# a reproducible peek at ten of the training assignments
pd.DataFrame({"Cluster Sample": train_size_clusters}).sample(10, random_state = 54)

In [None]:
# adding the home-size cluster ids to the original (non-dummy) datasets
# The arrays are assigned positionally, one id per row.

size_cluster_by_split = zip(
    (train_imputed, validate_imputed, test_imputed),
    (train_size_clusters, val_size_clusters, test_size_clusters))

for split_df, size_ids in size_cluster_by_split:
    split_df["size_clusters"] = size_ids

train_imputed.head()

In [None]:
# comparing the actual home-size bin (left) with the KMeans size clusters (right),
# both plotted as home_age vs. logerror on the same 300-row sample

plt.subplots(1, 2, figsize = (16, 6), sharex = True, sharey = False)

# four colours: one per size bin / per cluster
four_color_palette = ['#66c2a5','#fc8d62','#8da0cb','#e78ac3']
size_plot_sample = train_imputed.sample(300, random_state = 14)

# left panel: coloured by actual size bin
plt.subplot(121)
sns.scatterplot(
        data = size_plot_sample,
        x = "home_age",
        y = "logerror",
        hue = "living_sqfeet_binned",
        palette = four_color_palette
)

plt.title("Actual: Home Size Binned & Home Age")
plt.legend(loc = 'upper left', ncol = 2)
plt.ylim(-.2, .2)

# right panel: coloured by cluster id
plt.subplot(122)
sns.scatterplot(
        data = size_plot_sample,
        x = "home_age",
        y = "logerror",
        hue = "size_clusters",
        palette = four_color_palette
)

plt.title("Predicted: Clusters of Home Size Binned & Home Age")
plt.legend(loc = 'upper left', ncol = 2)
plt.ylim(-.2, .2)
plt.show()

----
#### **``Cluster Number 3: Build Era Clusters``**

In [None]:
# setting features to cluster on: living area plus the one-hot build-era bins

era_cluster_cols = [
    'living_sq_feet',
    'home_age_binned_new_century',
    'home_age_binned_late_20th_century',
    'home_age_binned_mid_20th_century',
    'home_age_binned_early_20th_century']

# the same column selection is taken from each split
X_era = train_dummy[era_cluster_cols]
val_era = validate_dummy[era_cluster_cols]
test_era = test_dummy[era_cluster_cols]

X_era.head()

In [None]:
# creating the KMeans cluster object
# random_state is fixed so the cluster ids are identical on every Restart & Run All.
# NOTE: this rebinds `kmeans`; the previously fitted cluster model is no longer
# reachable under that name after this cell.

kmeans = KMeans(n_clusters = 4, random_state = 14)
kmeans.fit(X_era)

# cluster id for every training row
clusters = kmeans.predict(X_era)

In [None]:
# KMeans Elbow-method: inertia of a fresh model for each candidate k

k_values = range(1, 12)

with plt.style.context('seaborn-whitegrid'):

    plt.figure(figsize=(10, 3))
    # seeded so the curve is reproducible between runs
    pd.Series({k: KMeans(k, random_state = 14).fit(X_era).inertia_ for k in k_values}).plot(marker = 'o')
    # ticks match the k values that were actually evaluated
    plt.xticks(k_values)
    
    plt.xlabel('k')
    plt.ylabel('inertia')
    plt.title('Change in inertia as k increases')

In [None]:
# plotting the cluster assignments for several values of k (one panel per k)

fig, axs = plt.subplots(2, 2, figsize=(13, 13), sharex=True, sharey=True)

# the sample is identical for every k, so it is drawn once outside the loop
X3_features = X_era.sample(2000, random_state = 54)

# logerror is attached (aligned on the index) for plotting only; it is not used for fitting
X3 = X3_features.assign(logerror = train_dummy["logerror"])

for ax, k in zip(axs.ravel(), range(2, 6)):

    # creating and fitting KMeans (seeded for reproducible colouring)
    clusters = KMeans(k, random_state = 14).fit(X3_features).predict(X3_features)

    ax.scatter(
        X3["living_sq_feet"], 
        X3["logerror"], 
        c = clusters,
        s = 2)

    ax.set(title='k = {}'.format(k), xlabel = 'living sq. feet', ylabel = 'logerror')

In [None]:
# assigning every row of each dataset to one of the fitted build-era clusters

train_era_clusters, validate_era_clusters, test_era_clusters = (
    kmeans.predict(features) for features in (X_era, val_era, test_era))

# first ten training assignments
pd.DataFrame({"Cluster Sample": train_era_clusters}).head(10)

In [None]:
# adding the build-era cluster ids to the original (non-dummy) datasets
# The arrays are assigned positionally, one id per row.

era_cluster_by_split = zip(
    (train_imputed, validate_imputed, test_imputed),
    (train_era_clusters, validate_era_clusters, test_era_clusters))

for split_df, era_ids in era_cluster_by_split:
    split_df["era_clusters"] = era_ids

train_imputed.head()

In [None]:
# comparing the actual build era (left) with the KMeans era clusters (right),
# both plotted as living_sq_feet vs. logerror on the same 1000-row sample

plt.subplots(1, 2, figsize = (16, 6), sharex = True, sharey = False)

# four colours: one per era / per cluster
four_color_palette = ['#66c2a5','#fc8d62','#8da0cb','#e78ac3']
era_plot_sample = train_imputed.sample(1000, random_state = 54)

# left panel: coloured by actual build era
plt.subplot(121)
sns.scatterplot(
        data = era_plot_sample,
        x = "living_sq_feet",
        y = "logerror",
        hue = "home_age_binned",
        palette = four_color_palette
)

plt.title("Actual: Home Build Era by Living Sq. Feet")
plt.legend(loc = 'upper right', ncol = 2)
plt.ylim(-.2, .2)

# right panel: coloured by cluster id
plt.subplot(122)
sns.scatterplot(
        data = era_plot_sample,
        x = "living_sq_feet",
        y = "logerror",
        hue = "era_clusters",
        palette = four_color_palette
)

plt.title("Predicted: Clusters of Home Build Era & Living Sq. Feet")
plt.legend(loc = 'upper right', ncol = 2)
plt.ylim(-.2, .2)
plt.show()

In [None]:
# plotting logerror against home value: one panel per era cluster, coloured by size cluster

cluster_plot_sample = train_imputed.sample(500, random_state = 54)
size_cluster_palette = ['#a6cee3','#1f78b4','#b2df8a','#33a02c']

sns.relplot(
    data = cluster_plot_sample,
    x = "home_value",
    y = "logerror",
    hue = "size_clusters",
    col = "era_clusters",
    col_wrap = 2,
    palette = size_cluster_palette)

plt.show()

----
### **``Modeling``**

In [None]:
# creating dummy dataframes with generated clusters for modeling
# NOTE: this rebinds train_dummy / validate_dummy / test_dummy, replacing the frames that
# were used for clustering above.
# NOTE(review): the encoding of the cluster columns is defined in acquire.get_cluster_dummy — confirm there.

train_dummy, validate_dummy, test_dummy = get_cluster_dummy(train_imputed, validate_imputed, test_imputed)
print(f'dataframe shape: {train_dummy.shape}')
train_dummy.head()

In [None]:
# establishing a mean logerror baseline for train & validate datasets
# NOTE(review): the helper returns both frames; presumably it adds a baseline column and/or
# prints baseline error — its body is not visible here, confirm in acquire.establish_baseline.

train_dummy, validate_dummy = acquire.establish_baseline(train_dummy, validate_dummy)

----
#### ``2nd Data Split: Taking forward needed X variables and y (logerror) variable``

In [None]:
# 2nd split: separating each dataset into X (model inputs) and y (logerror target)

# columns removed from X: the target, identifiers, and every raw / binned / one-hot
# column listed below
non_feature_cols = [
    "logerror", 
    "home_value",
    'blockgroup_assignment',
    'parcel_id',
    'transaction_quarter',
    'county_zoning_code',
    'home_age_binned_new_century',
    'home_age_binned_late_20th_century',
    'home_age_binned_mid_20th_century',
    'home_age_binned_early_20th_century',
    'living_sqfeet_binned_360_to_1240_sqfeet',
    'living_sqfeet_binned_1241_to_1565_sqfeet',
    'living_sqfeet_binned_1566_to_2036_sqfeet',
    'living_sqfeet_binned_2037_to_3855_sqfeet',
    'transaction_month_April',
    'transaction_month_August',
    'transaction_month_February',
    'transaction_month_January',
    'transaction_month_July',
    'transaction_month_June',
    'transaction_month_March',
    'transaction_month_May',
    'transaction_month_September',
    'county_by_fips_LA County',
    'county_by_fips_Orange County',
    'county_by_fips_Ventura County',
    'bathroom_count_1.0',
    'bathroom_count_1.5',
    'bathroom_count_2.0',
    'bathroom_count_2.5',
    'bathroom_count_3.0',
    'bathroom_count_3.5',
    'bathroom_count_4.0',
    'bathroom_count_4.5',
    'bedroom_count_1.0',
    'bedroom_count_2.0',
    'bedroom_count_3.0',
    'bedroom_count_4.0',
    'bedroom_count_5.0',
    "year_built",
    "home_age",
    "living_sq_feet",
    "property_sq_feet",
    "latitude",
    "longitude"
]

# train
X_train = train_dummy.drop(columns = non_feature_cols)
y_train = train_dummy["logerror"]

# validate
X_validate = validate_dummy.drop(columns = non_feature_cols)
y_validate = validate_dummy["logerror"]

# test
X_test = test_dummy.drop(columns = non_feature_cols)
y_test = test_dummy["logerror"]

In [None]:
# checking the shape of the training inputs and target (row counts should match)

for training_part in (X_train, y_train):
    print(training_part.shape)

In [None]:
# feature ranking via the project's recursive-feature-elimination helper; the result's
# index is reset before display.
# NOTE(review): the third argument (10) is presumably the number of features to keep —
# confirm against acquire.recursive_feature_eliminate.
acquire.recursive_feature_eliminate(X_train, y_train, 10).reset_index(drop = True)

In [None]:
# selecting features with sklearn's cross-validated recursive feature elimination (RFECV)
# A linear regression is the ranking estimator and at least 5 features are always kept.

rfecv = RFECV(
    estimator = LinearRegression(),
    min_features_to_select = 5).fit(X_train, y_train)

# names of the columns RFECV marked as selected
feature_lst = list(X_train.columns[rfecv.support_])

# alphabetical table of the selected features
(pd.DataFrame({"Features": feature_lst})
   .sort_values("Features")
   .reset_index(drop = True))

In [None]:
# keeping only the RFECV-selected features in every X split

X_train, X_validate, X_test = (
    split_features[feature_lst] for split_features in (X_train, X_validate, X_test))

# checking the shape 

print(f'dataframe shape: {X_train.shape}')
X_train.head()

In [None]:
# calculating and plotting feature importance
# A random forest regressor is fit on the selected features and its impurity-based
# importances are drawn as a horizontal bar chart, smallest first.
plt.figure(figsize = (10, 5))
sns.set(style = "darkgrid", font_scale = .75)

rf = RandomForestRegressor(random_state = 123)
rf = rf.fit(X_train, y_train)

# positions of the features ordered by ascending importance
sorted_idx = rf.feature_importances_.argsort()

# x / y are passed by keyword: seaborn >= 0.12 accepts only `data` positionally, so the
# former positional call raises a TypeError there, while keywords work on every version
sns.barplot(
    x = rf.feature_importances_[sorted_idx],
    y = X_train.columns[sorted_idx],
    orient = "h",
    color = "b")

plt.title("Feature Importance")
plt.show()

-----
#### <u>**``Model Plots and Evaluation on Validate Dataset:``**</u>

In [None]:
def round_up(n, decimals=0):
    """Round `n` up (toward positive infinity) to `decimals` decimal places.

    The ceiling is taken on the decimal representation of the number rather than on
    `n * 10 ** decimals`, because the binary product can land a hair above the true
    value (0.07 * 100 == 7.000000000000001) and push the result one step too high.

    Parameters
    ----------
    n : int or float
        Value to round.
    decimals : int, default 0
        Number of decimal places to keep; negative values round to tens, hundreds, ...

    Returns
    -------
    float
        The smallest multiple of 10 ** -decimals that is >= n.

    Raises
    ------
    ValueError
        If `n` is NaN.
    OverflowError
        If `n` is infinite.
    """
    from decimal import Decimal, ROUND_CEILING

    value = float(n)
    if not math.isfinite(value):
        # nan / inf cannot be rounded; math.ceil raises the ValueError / OverflowError
        return math.ceil(value)

    shift = int(decimals)
    # repr() gives the shortest decimal string that round-trips the float,
    # so 0.07 is treated as exactly 0.07 instead of its binary neighbour
    scaled = Decimal(repr(value)).scaleb(shift)
    ceiled = scaled.to_integral_value(rounding = ROUND_CEILING)

    # adding 0.0 turns a possible -0.0 into 0.0
    return float(ceiled.scaleb(-shift)) + 0.0

# mean of the validate target, rounded up to two decimal places
round_up(y_validate.mean(), 2)

In [None]:
# creating a dataframe w/X variables, y_validate actuals, and (later) model predictions

# creating the independent and dependent variables
X_var = pd.DataFrame(X_validate[feature_lst])
y_var = pd.DataFrame({'logerror actual': y_validate})
predictions = pd.concat([X_var, y_var], axis = 1).reset_index(drop = True)

# baseline mean predictions: derived from the training target only, so the baseline
# never sees the validate actuals it is compared against
baseline = round_up(y_train.mean(), 2)
predictions["baseline_mean_predictions"] = baseline

predictions.head()

----
#### <u>**``Linear Models:``**</u>

In [None]:
# generating models using selected RFECV features

lr = LinearRegression()
lr_model = lr.fit(X_train, y_train)

lars = LassoLars()
lars_model = lars.fit(X_train, y_train)

glm = TweedieRegressor(alpha = 1, power = 0)
glm_model = glm.fit(X_train, y_train)

# builtin round() works whether .score() returns a numpy scalar or a plain Python float
# (a plain float has no .round method)
print("Training R-squared w/Linear Regression:", round(lr_model.score(X_train, y_train), 4))
print("Training R-squared w/Lasso Lars:", round(lars_model.score(X_train, y_train), 4))
print("Training R-squared w/Tweedie Regressor:", round(glm_model.score(X_train, y_train), 4))

----
#### <u>**``Non-linear Models:``**</u>

In [None]:
# Principal Component Analysis "PCA" regression:
# project the features onto 2 components, then fit a linear regression on them

pca = PCA(n_components = 2)

# components are learned from train only; validate is projected with the fitted object
X_reduced_train = pca.fit_transform(X_train)
X_reduced_validate = pca.transform(X_validate)

# fitting the regression on the reduced training data
regr = LinearRegression().fit(X_reduced_train, y_train)

# storing the validate predictions next to the actuals
predictions["pca_predictions"] = regr.predict(X_reduced_validate)

In [None]:
# Polynomial non-linear model
# step 1: generating Polynomial Features (degree 2, without the constant bias column)

poly = PolynomialFeatures(degree = 2, include_bias = False)
poly.fit(X_train)

# `get_feature_names` was removed in scikit-learn 1.2; use its replacement when available
# and fall back to the old method only on versions that predate it
feature_namer = getattr(poly, "get_feature_names_out", None) or poly.get_feature_names

# the transformed rows correspond one-to-one to X_train, so its index is reused
X_train_poly = pd.DataFrame(
    poly.transform(X_train),
    columns = feature_namer(X_train.columns),
    index = X_train.index)

In [None]:
# step 2: fitting a linear regression on the degree-2 polynomial features

lm_poly = LinearRegression().fit(X_train_poly, y_train)

# expanding validate with the already-fitted transformer, then storing its predictions
X_validate_poly = poly.transform(X_validate)
predictions['polynomial degree 2'] = lm_poly.predict(X_validate_poly)
predictions.head()

In [None]:
# scoring the PCA regression model with RMSE on train and validate

train_model = regr.predict(X_reduced_train)
validate_model = regr.predict(X_reduced_validate)

rmse_train = sqrt(mean_squared_error(y_train, train_model))
rmse_validate = sqrt(mean_squared_error(y_validate, validate_model))

# first entry of the running list of per-model metrics
metrics = dict(
    model = "PCA",
    train_rmse = rmse_train,
    validate_rmse = rmse_validate)

model_performance = [metrics]

print('RMSE for PCA model on the train dataset: {:.2f}'.format(rmse_train))
print('RMSE for PCA model on the validate dataset: {:.2f}'.format(rmse_validate))

In [None]:
# scoring the polynomial deg. 2 regression model with RMSE on train and validate

train_model = lm_poly.predict(X_train_poly)
validate_model = lm_poly.predict(X_validate_poly)

rmse_train = sqrt(mean_squared_error(y_train, train_model))
rmse_validate = sqrt(mean_squared_error(y_validate, validate_model))

# adding this model's metrics to the running list
metrics = dict(
    model = "Polynomial deg.2",
    train_rmse = rmse_train,
    validate_rmse = rmse_validate)

model_performance.append(metrics)

print('RMSE for Polynomial Deg. 2 model on the train dataset: {:.2f}'.format(rmse_train))
print('RMSE for Polynomial Deg. 2 model on the validate dataset: {:.2f}'.format(rmse_validate))

In [None]:
# evaluating linear regression models: RMSE on train and validate for each fitted model

models = [lr_model, lars_model, glm_model]

for model in models:

    train_model = model.predict(X_train)
    rmse_train = sqrt(mean_squared_error(y_train,
                                         train_model))
    
    validate_model = model.predict(X_validate)
    rmse_validate = sqrt(mean_squared_error(y_validate,
                                         validate_model))

    # the class name is a stable label; str(model) depends on how the installed sklearn
    # formats estimator parameters, which makes later string matching brittle
    metrics = {
    "model": type(model).__name__,
    "train_rmse": rmse_train,
    "validate_rmse": rmse_validate}

    model_performance.append(metrics)

    # note: the printed values are rounded UP to two decimals by round_up
    print('RMSE for {} model on the train dataset: {}'.format(model, round_up(rmse_train, 2)))
    print('RMSE for {} model on the validate dataset: {}'.format(model, round_up(rmse_validate, 2)))
    print()

In [None]:
# collecting every model's metrics into one table, with estimator reprs shortened to plain names

model_performance = pd.DataFrame(model_performance).assign(
    model = lambda table: table["model"].replace(
        {"LinearRegression()": "LinearRegression", 
         "LassoLars()": "LassoLars", 
         "TweedieRegressor(alpha=1, power=0)": "TweedieRegressor"}))

model_performance

In [None]:
# generating validate model predictions and assigning to dataframe (rounded to 4 decimals)

lr_predictions = lr_model.predict(X_validate)
predictions["linear_predictions"] = lr_predictions.round(4)

lars_predictions = lars_model.predict(X_validate)
predictions["lars_predictions"] = lars_predictions.round(4)

# the Tweedie (GLM) column must come from glm_model, not from the LassoLars model
glm_predictions = glm_model.predict(X_validate)
predictions["glm_predictions"] = glm_predictions.round(4)

predictions.head()

In [None]:
# melting columns for residual plot
# (presumably reshapes `predictions` to long format, one row per model/prediction — TODO confirm in acquire.py)
melt_df = acquire.get_melted_table(predictions)

# plotting model residuals from the melted table via the project's `acquire` helper
acquire.plot_model_residuals(melt_df)

In [None]:
# plotting model predictions against the target via the project's `acquire` helper
# NOTE(review): the target in this notebook is `logerror`, not home value — confirm what the helper labels
acquire.plot_models(melt_df)

In [None]:
# model logerror distribution
# (presumably plots the distribution of each prediction column in `predictions` — TODO confirm in acquire.py)
acquire.model_distributions(predictions)

In [None]:
# expanding X_test with the polynomial transformer fitted earlier
X_test_poly = poly.transform(X_test)

# test actuals alongside the polynomial model's predictions
test_df = pd.DataFrame(y_test).assign(
    model_predictions = lm_poly.predict(X_test_poly))
test_df.head()

In [None]:
# RMSE of the polynomial deg. 2 model on the held-out test dataset

rmse_test = sqrt(
    mean_squared_error(
        test_df['logerror'],
        test_df['model_predictions']))

print('RMSE for Polynomial Deg. 2 model on the test dataset: {:.2f}'.format(rmse_test))

In [None]:
# returning RMSE report
# NOTE(review): the helper takes no arguments, so it cannot read the metrics computed in this notebook —
# confirm in acquire.py that its figures are not hard-coded and stale
acquire.final_rmse()