In [1]:
%matplotlib inline
import matplotlib as mlp
mlp.rcParams['figure.dpi'] = 200

# diasbling warning messages
import warnings
warnings.filterwarnings("ignore")

# importing key libraries
import pandas as pd
pd.set_option('display.max_rows', None)
pd.options.display.float_format = '{:20,.2f}'.format
import numpy as np

# importing wrangle/acquire module
import wrangle
from wrangle import get_zillow_dataset, \
                    null_df, \
                    drop_nulls, \
                    clean_zillow_dataset, \
                    train_validate_test_split

# importing data visualization libraries
import matplotlib.pyplot as plt
import seaborn as sns 
sns.set()

# sklearn data science library
from sklearn.impute import KNNImputer
from sklearn.impute import MissingIndicator
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

### **``Clustering Module: Exploratory Analysis Exercises``**


Ask at least 5 questions about the data, keeping in mind that your target variable is logerror. 

<u>**``Example Questions (place holder):``**</u>

1. Is logerror significantly different for properties in LA County vs Orange County vs Ventura County?

<br></br>
*Answer these questions through a mix of statistical tests and visualizations.*

**``Bonus:``**

* Compute the mean(logerror) by zipcode and the overall mean(logerror). Write a loop that will run a t-test between the overall mean and the mean for each zip code. 

* We want to identify the zip codes where the error is significantly higher or lower than the expected error.

----

In [2]:
# importing zillow dataset

df = get_zillow_dataset()
df.shape

(52319, 68)

In [3]:
# cleaning dataset for feature/row null % > 80%
# df = drop_nulls(df, .8, .8)

df = clean_zillow_dataset(df)
df.shape

dataframe shape: (52311, 16)


(52311, 16)

In [4]:
df.head(5)

Unnamed: 0,bathroom_count,bedroom_count,living_sq_feet,county_by_fips,land_assessed_value,latitude,logerror,longitude,property_sq_feet,county_zoning_code,blockgroup_assignment,home_assessed_value,taxamount,home_value,transaction_date,year_built
0,3.5,4.0,3100.0,Orange County,537569.0,33634931.0,0.03,-117869207.0,4506.0,122,60590630.07,485713.0,11013.72,1023282.0,2017-01-01,1998.0
1,1.0,2.0,1465.0,Ventura County,376000.0,34449266.0,0.06,-119281531.0,12647.0,1110,61110010.02,88000.0,5672.48,464000.0,2017-01-01,1967.0
2,2.0,3.0,1243.0,Orange County,479489.0,33886168.0,0.01,-117823170.0,8432.0,122,60590218.02,85289.0,6488.3,564778.0,2017-01-01,1962.0
3,3.0,4.0,2376.0,LA County,36225.0,34245180.0,-0.1,-118240722.0,13038.0,101,60373001.0,108918.0,1777.51,145143.0,2017-01-01,1970.0
4,3.0,4.0,2962.0,LA County,496619.0,34145202.0,-0.0,-118179824.0,63000.0,101,60374608.0,276684.0,9516.26,773303.0,2017-01-01,1950.0


In [5]:
df = wrangle.clean_months(df)
df.head()

Unnamed: 0,bathroom_count,bedroom_count,living_sq_feet,county_by_fips,land_assessed_value,latitude,logerror,longitude,property_sq_feet,county_zoning_code,blockgroup_assignment,home_assessed_value,taxamount,home_value,year_built,transaction_month
0,3.5,4.0,3100.0,Orange County,537569.0,33634931.0,0.03,-117869207.0,4506.0,122,60590630.07,485713.0,11013.72,1023282.0,1998.0,January
1,1.0,2.0,1465.0,Ventura County,376000.0,34449266.0,0.06,-119281531.0,12647.0,1110,61110010.02,88000.0,5672.48,464000.0,1967.0,January
2,2.0,3.0,1243.0,Orange County,479489.0,33886168.0,0.01,-117823170.0,8432.0,122,60590218.02,85289.0,6488.3,564778.0,1962.0,January
3,3.0,4.0,2376.0,LA County,36225.0,34245180.0,-0.1,-118240722.0,13038.0,101,60373001.0,108918.0,1777.51,145143.0,1970.0,January
4,3.0,4.0,2962.0,LA County,496619.0,34145202.0,-0.0,-118179824.0,63000.0,101,60374608.0,276684.0,9516.26,773303.0,1950.0,January


In [6]:
# creating a new column for total age of home

df = wrangle.age_of_homes(df)
df["home_age"].head(20)

0                   24.00
1                   55.00
2                   60.00
3                   52.00
4                   72.00
5                  100.00
6                   52.00
7                   23.00
8                   42.00
9                   18.00
10                  41.00
11                  83.00
12                  44.00
13                  24.00
14                 114.00
15                  78.00
16                  50.00
17                  50.00
18                 102.00
19                  55.00
Name: home_age, dtype: float64

In [7]:
# remaining nulls and percentage of feature

remaining_nulls = null_df(df)
remaining_nulls

Unnamed: 0,Total Null,Feature Null %
property_sq_feet,360,0.01
year_built,106,0.0
home_age,106,0.0
home_assessed_value,79,0.0
living_sq_feet,73,0.0
taxamount,4,0.0
land_assessed_value,1,0.0
home_value,1,0.0
county_by_fips,0,0.0
latitude,0,0.0


In [8]:
# splitting the dataset to 1. fill-in remaining nulls and 2. determine outlier cutoffs

train, validate, test = train_validate_test_split(df)

train shape: (29293, 17)
validate shape: (12555, 17)
test shape: (10463, 17)


In [9]:
# 2nd split: splitting larger datasets into x and y variables

X_train = train.drop(columns = "logerror")
y_train = train['logerror']

X_validate = validate.drop(columns = "logerror")
y_validate = validate['logerror']

X_test = test.drop(columns = "logerror")
y_test = test['logerror']

In [10]:
# viewing X_train variables/features

X_train.columns.sort_values().tolist()

['bathroom_count',
 'bedroom_count',
 'blockgroup_assignment',
 'county_by_fips',
 'county_zoning_code',
 'home_age',
 'home_assessed_value',
 'home_value',
 'land_assessed_value',
 'latitude',
 'living_sq_feet',
 'longitude',
 'property_sq_feet',
 'taxamount',
 'transaction_month',
 'year_built']

In [11]:
# checking nulls in X_train dataset

nulls_in_X = null_df(X_train)
nulls_in_X

Unnamed: 0,Total Null,Feature Null %
property_sq_feet,208,0.01
year_built,64,0.0
home_age,64,0.0
living_sq_feet,45,0.0
home_assessed_value,41,0.0
taxamount,1,0.0
bathroom_count,0,0.0
bedroom_count,0,0.0
county_by_fips,0,0.0
land_assessed_value,0,0.0


In [12]:

X_train.describe().T.round(3)

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
bathroom_count,29293.0,2.29,1.02,0.0,2.0,2.0,3.0,18.0
bedroom_count,29293.0,3.3,0.95,0.0,3.0,3.0,4.0,14.0
living_sq_feet,29248.0,1919.4,997.24,128.0,1267.75,1657.0,2306.0,18654.0
land_assessed_value,29293.0,332028.08,590451.92,161.0,76690.0,218233.0,407884.0,48952198.0
latitude,29293.0,34023098.28,273169.58,33340620.0,33828615.0,34027079.0,34187915.0,34812397.0
longitude,29293.0,-118195165.1,357197.23,-119475416.0,-118402034.0,-118154196.0,-117929756.0,-117555373.0
property_sq_feet,29085.0,11665.61,96680.71,236.0,5581.0,6855.0,8770.0,6971010.0
blockgroup_assignment,29293.0,60494603.46,209491.56,60371011.1,60374012.01,60376201.01,60590423.24,61110091.0
home_assessed_value,29252.0,196273.62,255184.4,129.0,77199.5,131310.5,225871.0,7893568.0
taxamount,29292.0,6443.57,8978.74,49.18,2678.15,4645.71,7352.78,586639.3


----
### ```determining and handling outliers```

In [13]:
# adding upperbound columns to X_train dataset

# X_train = wrangle.add_upper_outlier_columns(X_train)
# print(X_train.shape)
X_train.head()

Unnamed: 0,bathroom_count,bedroom_count,living_sq_feet,county_by_fips,land_assessed_value,latitude,longitude,property_sq_feet,county_zoning_code,blockgroup_assignment,home_assessed_value,taxamount,home_value,year_built,transaction_month,home_age
35549,2.0,3.0,1244.0,LA County,26279.0,34166955.0,-118357164.0,7885.0,100,60371253.1,32283.0,764.51,58562.0,1943.0,2017-06-30 00:00:00,79.0
18097,3.0,4.0,2898.0,LA County,408465.0,34120410.0,-118058997.0,5468.0,100,60374318.0,518378.0,10802.46,926843.0,1993.0,2017-04-17 00:00:00,29.0
18950,1.0,3.0,1166.0,LA County,312268.0,34078141.0,-118297087.0,2002.0,100,60371926.2,147520.0,5518.32,459788.0,1922.0,2017-04-20 00:00:00,100.0
36943,1.0,2.0,803.0,LA County,397978.0,34177765.0,-118511941.0,5610.0,100,60371390.01,99494.0,6099.58,497472.0,1949.0,2017-07-07 00:00:00,73.0
52305,2.0,4.0,2166.0,LA County,73500.0,33809933.0,-118375646.0,6143.0,101,60376513.02,66047.0,1770.95,139547.0,1954.0,2017-09-19 00:00:00,68.0


In [14]:
# sampling of feature "taxamount" quantiles 

print("Q2 quantile of feature : ", np.quantile(X_train["taxamount"], .50))
print("Q1 quantile of feature : ", np.quantile(X_train["taxamount"], .25))
print("Q3 quantile of feature : ", np.quantile(X_train["taxamount"], .75))
print("100th quantile of feature : ", np.quantile(X_train["taxamount"], .1)) 

Q2 quantile of feature :  nan
Q1 quantile of feature :  nan
Q3 quantile of feature :  nan
100th quantile of feature :  nan


In [15]:
# detect outliers by looping through columns for values >= 1

uppercap_df = []

for col in X_train.select_dtypes("number"):
    
    k = 1.5
    
    # determing 1st and 3rd quartile
    q1, q3 = X_train[col].quantile([.25, 0.75])

    # calculate interquartile range
    iqr = q3 - q1

    # set feature/data upperbound limit
    upper_bound = q3 + k * iqr
    
    # boolean mask to determine total number of outliers
    df = X_train[X_train[col] > upper_bound]
    
    if df.shape[0] > 0:
        
        output = {
            "Feature": col, \
            "Upper_Bound": upper_bound, \
            "Total Outliers": df.shape[0]
            }
    
        uppercap_df.append(output)
    
df = pd.DataFrame(uppercap_df).sort_values(by = "Total Outliers", ascending = False, ).reset_index(drop = True)
df

Unnamed: 0,Feature,Upper_Bound,Total Outliers
0,property_sq_feet,13553.5,3111
1,blockgroup_assignment,60915040.08,2481
2,home_assessed_value,448878.25,2174
3,taxamount,14364.72,2131
4,home_value,1244563.0,1978
5,land_assessed_value,904675.0,1650
6,living_sq_feet,3863.38,1242
7,bathroom_count,4.5,877
8,bedroom_count,5.5,443
9,home_age,115.5,132


In [16]:
# creating a function to conduct this action in the future

def sum_outliers(df, k = 1.5):
    
    # placeholder for df values
    uppercap_df = []

    for col in df.select_dtypes("number"):

        # determing 1st and 3rd quartile
        q1, q3 = df[col].quantile([.25, 0.75])

        # calculate interquartile range
        iqr = q3 - q1

        # set feature/data upperbound limit
        upper_bound = q3 + k * iqr

        # boolean mask to determine total number of outliers
        mask = df[df[col] > upper_bound]

        if mask.shape[0] > 0:

            output = {
                "Feature": col, \
                "Upper_Bound": upper_bound, \
                "Total Outliers": mask.shape[0]
                }

            uppercap_df.append(output)
    
    new_df = pd.DataFrame(uppercap_df).sort_values(by = "Total Outliers", ascending = False, ).reset_index(drop = True)
    
    return new_df

In [None]:
# returning the outlier sum dataframe

sum_outliers(X_train)

Unnamed: 0,Feature,Upper_Bound,Total Outliers
0,property_sq_feet,13553.5,3111
1,blockgroup_assignment,60915040.08,2481
2,home_assessed_value,448878.25,2174
3,taxamount,14364.72,2131
4,home_value,1244563.0,1978
5,land_assessed_value,904675.0,1650
6,living_sq_feet,3863.38,1242
7,bathroom_count,4.5,877
8,bedroom_count,5.5,443
9,home_age,115.5,132


In [None]:
# creating a function to determine outliers based on "iqr" and then capping at upperboud limit

def capp_outliers(df, k = 1.5):
    
    # determining continuous features/columns
    for col in df.select_dtypes("number"):
        # determing 1st and 3rd quartile
        q1, q3 = df[col].quantile([.25, 0.75])
        
        # calculate interquartile range
        iqr = q3 - q1
        
        # set feature/data upperbound limit
        upper_bound = q3 + k * iqr
        
        # cap/convert outliers to upperbound
        df[col] = df[col].apply(lambda x: upper_bound if x > upper_bound else x)
    
        # renaming cols
        df.rename(columns = {col: col + "-capped"}, inplace = True)
        
    # returning the updated dataframe
    return df

In [None]:
# capping the outliers at feature upper-bound limit

X_train = capp_outliers(X_train)

In [None]:
# checking the shape

X_train.shape

(29293, 18)

In [None]:
# checking the head of df

X_train.head()

Unnamed: 0,bathroom_count-capped,bedroom_count-capped,living_sq_feet-capped,county_by_fips,land_assessed_value-capped,latitude-capped,longitude-capped,property_sq_feet-capped,property_id-capped,county_zoning_code,blockgroup_assignment-capped,home_assessed_value-capped,taxamount-capped,home_value-capped,transaction_date,year_built-capped,transaction_month,home_age-capped
35549,2.0,3.0,1244.0,LA County,26279.0,34166955.0,-118357164.0,7885.0,10928726.0,100,60371253.1,32283.0,764.51,58562.0,2017-06-30,1943.0,2017-06-30 00:00:00,79.0
18097,3.0,4.0,2898.0,LA County,408465.0,34120410.0,-118058997.0,5468.0,11925904.0,100,60374318.0,448878.25,10802.46,926843.0,2017-04-17,1993.0,2017-04-17 00:00:00,29.0
18950,1.0,3.0,1166.0,LA County,312268.0,34078141.0,-118297087.0,2002.0,11991988.0,100,60371926.2,147520.0,5518.32,459788.0,2017-04-20,1922.0,2017-04-20 00:00:00,100.0
36943,1.0,2.0,803.0,LA County,397978.0,34177765.0,-118511941.0,5610.0,10850571.0,100,60371390.01,99494.0,6099.58,497472.0,2017-07-07,1949.0,2017-07-07 00:00:00,73.0
52305,2.0,4.0,2166.0,LA County,73500.0,33809933.0,-118375646.0,6143.0,12669704.0,101,60376513.02,66047.0,1770.95,139547.0,2017-09-19,1954.0,2017-09-19 00:00:00,68.0


----

### ```identifying and handling null/missing values```

In [None]:
# classifying features/varibles by data type (discrete/continuous)

cat_lst = []
num_lst = []

for col in list(X_train.columns):
    if X_train[col].dtype == "int" or X_train[col].dtype == "float":
        num_lst.append(col)
    else:
        cat_lst.append(col)

print(f'Discrete Features:\n{cat_lst}')
print()
print(f'Continuous Features:\n{num_lst}')

Discrete Features:
['county_by_fips', 'county_zoning_code', 'transaction_date', 'transaction_month']

Continuous Features:
['bathroom_count-capped', 'bedroom_count-capped', 'living_sq_feet-capped', 'land_assessed_value-capped', 'latitude-capped', 'longitude-capped', 'property_sq_feet-capped', 'property_id-capped', 'blockgroup_assignment-capped', 'home_assessed_value-capped', 'taxamount-capped', 'home_value-capped', 'year_built-capped', 'home_age-capped']


In [None]:
# replacing null instances with "np.NaN" for better use of sklearn iterative imputer

X_train = X_train.replace('?', np.NaN)
X_train.shape

(29293, 18)

In [None]:
# using sklearn's iterative imputer to determine/fill-in remaining missing values

numeric_cols = X_train[num_lst]

impute_it = IterativeImputer(missing_values = np.NaN, skip_complete = True, random_state = 123)
imputed_df = impute_it.fit_transform(numeric_cols)

In [None]:
# checking there are no nulls in inputed dataframe

pd.DataFrame(imputed_df, columns = num_lst).isnull().sum()

bathroom_count-capped           0
bedroom_count-capped            0
living_sq_feet-capped           0
land_assessed_value-capped      0
latitude-capped                 0
longitude-capped                0
property_sq_feet-capped         0
property_id-capped              0
blockgroup_assignment-capped    0
home_assessed_value-capped      0
taxamount-capped                0
home_value-capped               0
year_built-capped               0
home_age-capped                 0
dtype: int64

In [None]:
# creating a new df for fill-in missing X_train values

missing_vals = pd.DataFrame(imputed_df, columns = numeric_cols, index = X_train.index)
missing_vals.head()

In [None]:
# assigning determined inputed values back to dataframe

X_train[cont_lst] = missing_vals
X_train.head()

In [None]:
# checking the dataframe information

X_train.info()