In [1]:
%matplotlib inline
import matplotlib as mlp
mlp.rcParams['figure.dpi'] = 200

# diasbling warning messages
import warnings
warnings.filterwarnings("ignore")

# importing key libraries
import pandas as pd
pd.set_option('display.max_rows', None)
import numpy as np

# importing wrangle/acquire module
import wrangle
from wrangle import get_zillow_dataset, \
                    null_df, \
                    drop_nulls, \
                    clean_zillow_dataset, \
                    train_validate_test_split

# importing data visualization libraries
import matplotlib.pyplot as plt
import seaborn as sns 
sns.set()

# sklearn data science library
from sklearn.impute import KNNImputer
from sklearn.impute import MissingIndicator
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

### **``Clustering Module: Exploratory Analysis Exercises``**


Ask at least 5 questions about the data, keeping in mind that your target variable is logerror. 

<u>**``Example Questions (place holder):``**</u>

1. Is logerror significantly different for properties in LA County vs Orange County vs Ventura County?

<br></br>
*Answer these questions through a mix of statistical tests and visualizations.*

**``Bonus:``**

* Compute the mean(logerror) by zipcode and the overall mean(logerror). Write a loop that will run a t-test between the overall mean and the mean for each zip code. 

* We want to identify the zip codes where the error is significantly higher or lower than the expected error.

----

In [2]:
# importing zillow dataset

df = get_zillow_dataset()
df.shape

(52319, 68)

In [3]:
df["transactiondate"].sort_values().head(20)

0     2017-01-01
1     2017-01-01
2     2017-01-01
3     2017-01-01
4     2017-01-01
5     2017-01-01
50    2017-01-02
49    2017-01-02
48    2017-01-02
47    2017-01-02
46    2017-01-02
45    2017-01-02
42    2017-01-02
43    2017-01-02
51    2017-01-02
41    2017-01-02
40    2017-01-02
39    2017-01-02
44    2017-01-02
52    2017-01-02
Name: transactiondate, dtype: object

In [4]:
# cleaning dataset for feature/row null % > 80%
# df = drop_nulls(df, .8, .8)

df = clean_zillow_dataset(df)
df.shape

(52311, 17)

In [5]:
df.head(5)

Unnamed: 0,bathroom_count,bedroom_count,living_sq_feet,county_by_fips,land_assessed_value,latitude,logerror,longitude,property_sq_feet,property_id,county_zoning_code,blockgroup_assignment,home_assessed_value,taxamount,home_value,transaction_date,year_built
0,3.5,4.0,3100.0,Orange County,537569.0,33634931.0,0.025595,-117869207.0,4506.0,14297519,122,60590630.0,485713.0,11013.72,1023282.0,2017-01-01,1998.0
1,1.0,2.0,1465.0,Ventura County,376000.0,34449266.0,0.055619,-119281531.0,12647.0,17052889,1110,61110010.0,88000.0,5672.48,464000.0,2017-01-01,1967.0
2,2.0,3.0,1243.0,Orange County,479489.0,33886168.0,0.005383,-117823170.0,8432.0,14186244,122,60590220.0,85289.0,6488.3,564778.0,2017-01-01,1962.0
3,3.0,4.0,2376.0,LA County,36225.0,34245180.0,-0.10341,-118240722.0,13038.0,12177905,101,60373000.0,108918.0,1777.51,145143.0,2017-01-01,1970.0
4,3.0,4.0,2962.0,LA County,496619.0,34145202.0,-0.001011,-118179824.0,63000.0,12095076,101,60374610.0,276684.0,9516.26,773303.0,2017-01-01,1950.0


In [6]:
df = wrangle.clean_months(df)
df.head()

Unnamed: 0,bathroom_count,bedroom_count,living_sq_feet,county_by_fips,land_assessed_value,latitude,logerror,longitude,property_sq_feet,property_id,county_zoning_code,blockgroup_assignment,home_assessed_value,taxamount,home_value,transaction_date,year_built,transaction_month
0,3.5,4.0,3100.0,Orange County,537569.0,33634931.0,0.025595,-117869207.0,4506.0,14297519,122,60590630.0,485713.0,11013.72,1023282.0,2017-01-01,1998.0,January
1,1.0,2.0,1465.0,Ventura County,376000.0,34449266.0,0.055619,-119281531.0,12647.0,17052889,1110,61110010.0,88000.0,5672.48,464000.0,2017-01-01,1967.0,January
2,2.0,3.0,1243.0,Orange County,479489.0,33886168.0,0.005383,-117823170.0,8432.0,14186244,122,60590220.0,85289.0,6488.3,564778.0,2017-01-01,1962.0,January
3,3.0,4.0,2376.0,LA County,36225.0,34245180.0,-0.10341,-118240722.0,13038.0,12177905,101,60373000.0,108918.0,1777.51,145143.0,2017-01-01,1970.0,January
4,3.0,4.0,2962.0,LA County,496619.0,34145202.0,-0.001011,-118179824.0,63000.0,12095076,101,60374610.0,276684.0,9516.26,773303.0,2017-01-01,1950.0,January


In [7]:
# creating a new column for total age of home

df = wrangle.age_of_homes(df)
df["home_age"].head(20)

0      24.0
1      55.0
2      60.0
3      52.0
4      72.0
5     100.0
6      52.0
7      23.0
8      42.0
9      18.0
10     41.0
11     83.0
12     44.0
13     24.0
14    114.0
15     78.0
16     50.0
17     50.0
18    102.0
19     55.0
Name: home_age, dtype: float64

In [8]:
# remaining nulls and percentage of feature

remaining_nulls = null_df(df)
remaining_nulls

Unnamed: 0,Total Null,Feature Null %
property_sq_feet,360,0.006882
home_age,106,0.002026
year_built,106,0.002026
home_assessed_value,79,0.00151
living_sq_feet,73,0.001395
taxamount,4,7.6e-05
land_assessed_value,1,1.9e-05
home_value,1,1.9e-05
blockgroup_assignment,0,0.0
transaction_month,0,0.0


In [9]:
# splitting the dataset to 1. fill-in remaining nulls and 2. determine outlier cutoffs

train, validate, test = train_validate_test_split(df)

train shape: (29293, 19)
validate shape: (12555, 19)
test shape: (10463, 19)


In [10]:
# 2nd split: splitting larger datasets into x and y variables

X_train = train.drop(columns = "logerror")
y_train = train['logerror']

X_validate = validate.drop(columns = "logerror")
y_validate = validate['logerror']

X_test = test.drop(columns = "logerror")
y_test = test['logerror']

In [11]:
# viewing X_train variables/features

X_train.columns.sort_values().tolist()

['bathroom_count',
 'bedroom_count',
 'blockgroup_assignment',
 'county_by_fips',
 'county_zoning_code',
 'home_age',
 'home_assessed_value',
 'home_value',
 'land_assessed_value',
 'latitude',
 'living_sq_feet',
 'longitude',
 'property_id',
 'property_sq_feet',
 'taxamount',
 'transaction_date',
 'transaction_month',
 'year_built']

In [12]:
# checking nulls in X_train dataset

nulls_in_X = null_df(X_train)
nulls_in_X

Unnamed: 0,Total Null,Feature Null %
property_sq_feet,208,0.007101
home_age,64,0.002185
year_built,64,0.002185
living_sq_feet,45,0.001536
home_assessed_value,41,0.0014
taxamount,1,3.4e-05
blockgroup_assignment,0,0.0
transaction_month,0,0.0
transaction_date,0,0.0
home_value,0,0.0


In [13]:
# classifying features/varibles by data type (discrete/continuous)

disc_lst = []
cont_lst = []

for col in list(X_train.columns):
    if X_train[col].dtype == "int" or X_train[col].dtype == "float":
        cont_lst.append(col)
    else:
        disc_lst.append(col)

print(f'Discrete Features:\n{disc_lst}')
print()
print(f'Continuous Features:\n{cont_lst}')

Discrete Features:
['county_by_fips', 'county_zoning_code', 'transaction_date', 'transaction_month']

Continuous Features:
['bathroom_count', 'bedroom_count', 'living_sq_feet', 'land_assessed_value', 'latitude', 'longitude', 'property_sq_feet', 'property_id', 'blockgroup_assignment', 'home_assessed_value', 'taxamount', 'home_value', 'year_built', 'home_age']


In [14]:
# replacing null instances with "np.NaN" for better use of sklearn iterative imputer

X_train = X_train.replace('?', np.NaN)
X_train.shape

(29293, 18)

In [15]:
# using sklearn's iterative imputer to determine/fill-in remaining missing values

numeric_cols = X_train[cont_lst]

impute_it = IterativeImputer(missing_values = np.NaN, skip_complete = True, random_state = 123)
imputed_df = impute_it.fit_transform(numeric_cols)

In [16]:
# ensuring there are no nulls in inputed dataframe

pd.DataFrame(imputed_df, columns = cont_lst).isnull().sum()

bathroom_count           0
bedroom_count            0
living_sq_feet           0
land_assessed_value      0
latitude                 0
longitude                0
property_sq_feet         0
property_id              0
blockgroup_assignment    0
home_assessed_value      0
taxamount                0
home_value               0
year_built               0
home_age                 0
dtype: int64

In [17]:
# creating a new df for fill-in missing X_train values

missing_vals = pd.DataFrame(imputed_df, columns = cont_lst, index = X_train.index)
missing_vals.head()

Unnamed: 0,bathroom_count,bedroom_count,living_sq_feet,land_assessed_value,latitude,longitude,property_sq_feet,property_id,blockgroup_assignment,home_assessed_value,taxamount,home_value,year_built,home_age
35549,2.0,3.0,1244.0,26279.0,34166955.0,-118357164.0,7885.0,10928726.0,60371250.0,32283.0,764.51,58562.0,1943.0,79.0
18097,3.0,4.0,2898.0,408465.0,34120410.0,-118058997.0,5468.0,11925904.0,60374320.0,518378.0,10802.46,926843.0,1993.0,29.0
18950,1.0,3.0,1166.0,312268.0,34078141.0,-118297087.0,2002.0,11991988.0,60371930.0,147520.0,5518.32,459788.0,1922.0,100.0
36943,1.0,2.0,803.0,397978.0,34177765.0,-118511941.0,5610.0,10850571.0,60371390.0,99494.0,6099.58,497472.0,1949.0,73.0
52305,2.0,4.0,2166.0,73500.0,33809933.0,-118375646.0,6143.0,12669704.0,60376510.0,66047.0,1770.95,139547.0,1954.0,68.0


In [18]:
# assigning determined inputed values back to dataframe

X_train[cont_lst] = missing_vals
X_train.head()

Unnamed: 0,bathroom_count,bedroom_count,living_sq_feet,county_by_fips,land_assessed_value,latitude,longitude,property_sq_feet,property_id,county_zoning_code,blockgroup_assignment,home_assessed_value,taxamount,home_value,transaction_date,year_built,transaction_month,home_age
35549,2.0,3.0,1244.0,LA County,26279.0,34166955.0,-118357164.0,7885.0,10928726.0,100,60371250.0,32283.0,764.51,58562.0,2017-06-30,1943.0,2017-06-30 00:00:00,79.0
18097,3.0,4.0,2898.0,LA County,408465.0,34120410.0,-118058997.0,5468.0,11925904.0,100,60374320.0,518378.0,10802.46,926843.0,2017-04-17,1993.0,2017-04-17 00:00:00,29.0
18950,1.0,3.0,1166.0,LA County,312268.0,34078141.0,-118297087.0,2002.0,11991988.0,100,60371930.0,147520.0,5518.32,459788.0,2017-04-20,1922.0,2017-04-20 00:00:00,100.0
36943,1.0,2.0,803.0,LA County,397978.0,34177765.0,-118511941.0,5610.0,10850571.0,100,60371390.0,99494.0,6099.58,497472.0,2017-07-07,1949.0,2017-07-07 00:00:00,73.0
52305,2.0,4.0,2166.0,LA County,73500.0,33809933.0,-118375646.0,6143.0,12669704.0,101,60376510.0,66047.0,1770.95,139547.0,2017-09-19,1954.0,2017-09-19 00:00:00,68.0


In [19]:
# checking the dataframe information

X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 29293 entries, 35549 to 36098
Data columns (total 18 columns):
 #   Column                 Non-Null Count  Dtype         
---  ------                 --------------  -----         
 0   bathroom_count         29293 non-null  float64       
 1   bedroom_count          29293 non-null  float64       
 2   living_sq_feet         29293 non-null  float64       
 3   county_by_fips         29293 non-null  object        
 4   land_assessed_value    29293 non-null  float64       
 5   latitude               29293 non-null  float64       
 6   longitude              29293 non-null  float64       
 7   property_sq_feet       29293 non-null  float64       
 8   property_id            29293 non-null  float64       
 9   county_zoning_code     29293 non-null  object        
 10  blockgroup_assignment  29293 non-null  float64       
 11  home_assessed_value    29293 non-null  float64       
 12  taxamount              29293 non-null  float64       
 1

In [20]:
# sampling feature quantiles 

print("Q2 quantile of feature : ", np.quantile(X_train["taxamount"], .50))
print("Q1 quantile of feature : ", np.quantile(X_train["taxamount"], .25))
print("Q3 quantile of feature : ", np.quantile(X_train["taxamount"], .75))
print("100th quantile of feature : ", np.quantile(X_train["taxamount"], .1)) 

Q2 quantile of feature :  4645.79
Q1 quantile of feature :  2678.16
Q3 quantile of feature :  7354.06
100th quantile of feature :  1290.26


In [21]:
# viewing upperbound samples in "taxamount"

wrangle.get_upper_outliers(df["taxamount"]).sort_values(by = "taxamount", ascending = False).head(20)

Unnamed: 0,taxamount
5707,572183.1225
5247,276541.8825
11159,274068.4225
24497,262341.6525
30635,255479.4325
51728,215279.3425
26864,210767.5225
7364,209526.1125
44924,199833.0925
4838,187407.1325


In [22]:
# adding upperbound columns to X_train dataset

X_train = wrangle.add_upper_outlier_columns(X_train)
print(X_train.shape)
X_train.head()

(29293, 32)


Unnamed: 0,bathroom_count,bedroom_count,living_sq_feet,county_by_fips,land_assessed_value,latitude,longitude,property_sq_feet,property_id,county_zoning_code,...,latitude_outliers_upper,longitude_outliers_upper,property_sq_feet_outliers_upper,property_id_outliers_upper,blockgroup_assignment_outliers_upper,home_assessed_value_outliers_upper,taxamount_outliers_upper,home_value_outliers_upper,year_built_outliers_upper,home_age_outliers_upper
0,2.0,3.0,1244.0,LA County,26279.0,34166955.0,-118357164.0,7885.0,10928726.0,100,...,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0
1,3.0,4.0,2898.0,LA County,408465.0,34120410.0,-118058997.0,5468.0,11925904.0,100,...,0.0,0,0.0,0.0,0.0,69910.5,0.0,0.0,0,0.0
2,1.0,3.0,1166.0,LA County,312268.0,34078141.0,-118297087.0,2002.0,11991988.0,100,...,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0
3,1.0,2.0,803.0,LA County,397978.0,34177765.0,-118511941.0,5610.0,10850571.0,100,...,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0
4,2.0,4.0,2166.0,LA County,73500.0,33809933.0,-118375646.0,6143.0,12669704.0,101,...,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0


In [23]:
X_train.dtypes.sort_values()

longitude_outliers_upper                         int64
year_built_outliers_upper                        int64
bathroom_count                                 float64
home_value_outliers_upper                      float64
taxamount_outliers_upper                       float64
home_assessed_value_outliers_upper             float64
blockgroup_assignment_outliers_upper           float64
property_id_outliers_upper                     float64
property_sq_feet_outliers_upper                float64
latitude_outliers_upper                        float64
land_assessed_value_outliers_upper             float64
living_sq_feet_outliers_upper                  float64
bedroom_count_outliers_upper                   float64
bathroom_count_outliers_upper                  float64
home_age                                       float64
year_built                                     float64
home_value                                     float64
taxamount                                      float64
home_asses