In [1]:
%matplotlib inline
import matplotlib as mlp
# raise the resolution of inline figures
mlp.rcParams['figure.dpi'] = 200

# disabling warning messages
# NOTE(review): this filter ignores every warning category for the whole session
import warnings
warnings.filterwarnings("ignore")

# importing key libraries
import pandas as pd
# display every row of a DataFrame/Series instead of truncating the output
pd.set_option('display.max_rows', None)
import numpy as np

# importing wrangle/acquire module
import wrangle
from wrangle import get_zillow_dataset, \
                    null_df, \
                    drop_nulls, \
                    train_validate_test_split

# importing data visualization libraries
import matplotlib.pyplot as plt
import seaborn as sns 
sns.set()

# sklearn data science library
from sklearn.impute import KNNImputer
from sklearn.impute import MissingIndicator
# IterativeImputer is experimental: enable_iterative_imputer has to be imported before it
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

### ``Clustering Module: Exploratory Analysis Exercises``


Ask at least 5 questions about the data, keeping in mind that your target variable is logerror. 

**``Example Questions:``**

1. Is logerror significantly different for properties in LA County vs Orange County vs Ventura County?


*Answer these questions through a mix of statistical tests and visualizations.*

**``Bonus:``**

* Compute the mean(logerror) by zipcode and the overall mean(logerror). Write a loop that will run a t-test between the overall mean and the mean for each zip code. 

* We want to identify the zip codes where the error is significantly higher or lower than the expected error.

----

In [2]:
# importing zillow dataset
# get_zillow_dataset comes from the local wrangle module; where it reads the data from is not visible here
df = get_zillow_dataset()
# (rows, columns) of the raw data
df.shape

(52319, 68)

In [3]:
# cleaning dataset for feature/row null % > 80%
# NOTE(review): the two .8 arguments are presumably the column and row thresholds — confirm their order,
# and whether they mean "max share of nulls" or "min share of non-nulls", against wrangle.drop_nulls
df = drop_nulls(df, .8, .8)
# shape after dropping, for comparison with the raw shape
df.shape

(52311, 29)

In [4]:
# remaining nulls and percentage of feature
# null_df (wrangle module) returns a per-feature summary; the displayed columns are
# "Total Null" and "Feature Null %" (the latter is shown as a fraction, e.g. 0.019728)
remaining_nulls = null_df(df)
remaining_nulls

Unnamed: 0,Total Null,Feature Null %
regionidcity,1032,0.019728
lotsizesquarefeet,360,0.006882
finishedsquarefeet12,238,0.00455
calculatedbathnbr,127,0.002428
fullbathcnt,127,0.002428
censustractandblock,113,0.00216
yearbuilt,106,0.002026
structuretaxvaluedollarcnt,79,0.00151
calculatedfinishedsquarefeet,73,0.001395
regionidzip,23,0.00044


In [5]:
# splitting the dataset to 1. fill-in remaining nulls and 2. determine outlier cutoffs
# NOTE(review): presumably the split comes first so those steps are derived from train only — confirm
# train_validate_test_split (wrangle module) prints the three shapes; here roughly 56% / 24% / 20% of the rows

train, validate, test = train_validate_test_split(df)

train shape: (29293, 29)
validate shape: (12555, 29)
test shape: (10463, 29)


In [6]:
# 2nd split: separating every subset into its features (X_*) and the logerror target (y_*)

X_train, y_train = train.drop(columns = "logerror"), train["logerror"]
X_validate, y_validate = validate.drop(columns = "logerror"), validate["logerror"]
X_test, y_test = test.drop(columns = "logerror"), test["logerror"]

In [7]:
# alphabetical list of the X_train feature names
sorted(X_train.columns)

['assessmentyear',
 'bathroomcnt',
 'bedroomcnt',
 'calculatedbathnbr',
 'calculatedfinishedsquarefeet',
 'censustractandblock',
 'finishedsquarefeet12',
 'fips',
 'fullbathcnt',
 'id',
 'landtaxvaluedollarcnt',
 'latitude',
 'longitude',
 'lotsizesquarefeet',
 'parcelid',
 'propertycountylandusecode',
 'propertylandusedesc',
 'propertylandusetypeid',
 'rawcensustractandblock',
 'regionidcity',
 'regionidcounty',
 'regionidzip',
 'roomcnt',
 'structuretaxvaluedollarcnt',
 'taxamount',
 'taxvaluedollarcnt',
 'transactiondate',
 'yearbuilt']

In [8]:
# checking nulls in X_train dataset
# same null_df summary as used on the full dataframe, now restricted to the training features

nulls_in_X = null_df(X_train)
nulls_in_X

Unnamed: 0,Total Null,Feature Null %
regionidcity,600,0.020483
lotsizesquarefeet,208,0.007101
finishedsquarefeet12,133,0.00454
fullbathcnt,72,0.002458
calculatedbathnbr,72,0.002458
yearbuilt,64,0.002185
censustractandblock,55,0.001878
calculatedfinishedsquarefeet,45,0.001536
structuretaxvaluedollarcnt,41,0.0014
regionidzip,15,0.000512


In [9]:
# classifying features/variables by data type (discrete/continuous)
#
# the split is made on the storage dtype only: every numeric column goes to cont_lst,
# everything else (object/string/date-like) goes to disc_lst.
# select_dtypes is used instead of comparing dtype to "int"/"float": those strings resolve to
# platform-dependent widths and would silently miss e.g. int32 or float32 columns.
# NOTE(review): numeric does not imply continuous — id/code columns (id, parcelid, fips, regionid*)
# also land in cont_lst.

cont_lst = X_train.select_dtypes(include = "number").columns.tolist()
disc_lst = X_train.select_dtypes(exclude = "number").columns.tolist()

print(f'Discrete Features:\n{disc_lst}')
print()
print(f'Continuous Features:\n{cont_lst}')

Discrete Features:
['propertycountylandusecode', 'transactiondate', 'propertylandusedesc']

Continuous Features:
['id', 'parcelid', 'bathroomcnt', 'bedroomcnt', 'calculatedbathnbr', 'calculatedfinishedsquarefeet', 'finishedsquarefeet12', 'fips', 'fullbathcnt', 'latitude', 'longitude', 'lotsizesquarefeet', 'propertylandusetypeid', 'rawcensustractandblock', 'regionidcity', 'regionidcounty', 'regionidzip', 'roomcnt', 'yearbuilt', 'structuretaxvaluedollarcnt', 'taxvaluedollarcnt', 'assessmentyear', 'landtaxvaluedollarcnt', 'taxamount', 'censustractandblock']


In [10]:
# replacing '?' placeholder strings with np.nan so sklearn's iterative imputer treats them as missing
# (np.nan is used because the np.NaN alias was removed in NumPy 2.0)

X_train = X_train.replace('?', np.nan)
# replace does not add or drop rows/columns, so the shape is unchanged
X_train.shape

(29293, 28)

In [11]:
# using sklearn's iterative imputer to determine/fill-in remaining missing values

# only the numeric columns collected in cont_lst are handed to the imputer
# NOTE(review): cont_lst also holds identifier/code columns (id, parcelid, fips, ...) — confirm
# they are meant to act as predictors during imputation
numeric_cols = X_train[cont_lst]

# np.nan marks the missing entries (the np.NaN alias was removed in NumPy 2.0);
# random_state pins the imputer's random number generator
impute_it = IterativeImputer(missing_values = np.nan, skip_complete = True, random_state = 123)
# the result carries no column labels or index; later cells re-wrap it in a DataFrame
imputed_df = impute_it.fit_transform(numeric_cols)

In [17]:
# per-column count of nulls left in the imputed array (every count should be 0)

(
    pd.DataFrame(data = imputed_df, columns = cont_lst)
      .isna()
      .sum()
)

id                              0
parcelid                        0
bathroomcnt                     0
bedroomcnt                      0
calculatedbathnbr               0
calculatedfinishedsquarefeet    0
finishedsquarefeet12            0
fips                            0
fullbathcnt                     0
latitude                        0
longitude                       0
lotsizesquarefeet               0
propertylandusetypeid           0
rawcensustractandblock          0
regionidcity                    0
regionidcounty                  0
regionidzip                     0
roomcnt                         0
yearbuilt                       0
structuretaxvaluedollarcnt      0
taxvaluedollarcnt               0
assessmentyear                  0
landtaxvaluedollarcnt           0
taxamount                       0
censustractandblock             0
dtype: int64

In [13]:
# creating a new df for fill-in missing X_train values
# the imputed array is labelled with the cont_lst column names and X_train's own index,
# so a later assignment back into X_train aligns on the original row labels

missing_vals = pd.DataFrame(imputed_df, columns = cont_lst, index = X_train.index)
missing_vals.head()

Unnamed: 0,id,parcelid,bathroomcnt,bedroomcnt,calculatedbathnbr,calculatedfinishedsquarefeet,finishedsquarefeet12,fips,fullbathcnt,latitude,...,regionidcounty,regionidzip,roomcnt,yearbuilt,structuretaxvaluedollarcnt,taxvaluedollarcnt,assessmentyear,landtaxvaluedollarcnt,taxamount,censustractandblock
35549,2835290.0,10928726.0,2.0,3.0,2.0,1244.0,1244.0,6037.0,2.0,34166955.0,...,3101.0,96446.0,0.0,1943.0,32283.0,58562.0,2016.0,26279.0,764.51,60371250000000.0
18097,1855117.0,11925904.0,3.0,4.0,3.0,2898.0,2898.0,6037.0,3.0,34120410.0,...,3101.0,96268.0,0.0,1993.0,518378.0,926843.0,2016.0,408465.0,10802.46,60374320000000.0
18950,2809180.0,11991988.0,1.0,3.0,1.0,1166.0,1166.0,6037.0,1.0,34078141.0,...,3101.0,95985.0,0.0,1922.0,147520.0,459788.0,2016.0,312268.0,5518.32,60371930000000.0
36943,1030712.0,10850571.0,1.0,2.0,1.0,803.0,803.0,6037.0,1.0,34177765.0,...,3101.0,96349.0,0.0,1949.0,99494.0,497472.0,2016.0,397978.0,6099.58,60371390000000.0
52305,2871267.0,12669704.0,2.0,4.0,2.0,2166.0,2166.0,6037.0,2.0,33809933.0,...,3101.0,96123.0,0.0,1954.0,66047.0,139547.0,2016.0,73500.0,1770.95,60376510000000.0


In [14]:
# assigning determined imputed values back to dataframe
# every column in cont_lst is overwritten with the imputed frame, not only the cells that were missing
# NOTE(review): the numeric columns come back as floats (e.g. id, parcelid display as 2835290.0) —
# confirm that is acceptable for identifier columns

X_train[cont_lst] = missing_vals
X_train.head()

Unnamed: 0,id,parcelid,bathroomcnt,bedroomcnt,calculatedbathnbr,calculatedfinishedsquarefeet,finishedsquarefeet12,fips,fullbathcnt,latitude,...,roomcnt,yearbuilt,structuretaxvaluedollarcnt,taxvaluedollarcnt,assessmentyear,landtaxvaluedollarcnt,taxamount,censustractandblock,transactiondate,propertylandusedesc
35549,2835290.0,10928726.0,2.0,3.0,2.0,1244.0,1244.0,6037.0,2.0,34166955.0,...,0.0,1943.0,32283.0,58562.0,2016.0,26279.0,764.51,60371250000000.0,2017-06-30,Single Family Residential
18097,1855117.0,11925904.0,3.0,4.0,3.0,2898.0,2898.0,6037.0,3.0,34120410.0,...,0.0,1993.0,518378.0,926843.0,2016.0,408465.0,10802.46,60374320000000.0,2017-04-17,Single Family Residential
18950,2809180.0,11991988.0,1.0,3.0,1.0,1166.0,1166.0,6037.0,1.0,34078141.0,...,0.0,1922.0,147520.0,459788.0,2016.0,312268.0,5518.32,60371930000000.0,2017-04-20,Single Family Residential
36943,1030712.0,10850571.0,1.0,2.0,1.0,803.0,803.0,6037.0,1.0,34177765.0,...,0.0,1949.0,99494.0,497472.0,2016.0,397978.0,6099.58,60371390000000.0,2017-07-07,Single Family Residential
52305,2871267.0,12669704.0,2.0,4.0,2.0,2166.0,2166.0,6037.0,2.0,33809933.0,...,0.0,1954.0,66047.0,139547.0,2016.0,73500.0,1770.95,60376510000000.0,2017-09-19,Single Family Residential


In [15]:
# checking the dataset information
# (column dtypes and non-null counts after the imputed values were assigned back)

X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 29293 entries, 35549 to 36098
Data columns (total 28 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   id                            29293 non-null  float64
 1   parcelid                      29293 non-null  float64
 2   bathroomcnt                   29293 non-null  float64
 3   bedroomcnt                    29293 non-null  float64
 4   calculatedbathnbr             29293 non-null  float64
 5   calculatedfinishedsquarefeet  29293 non-null  float64
 6   finishedsquarefeet12          29293 non-null  float64
 7   fips                          29293 non-null  float64
 8   fullbathcnt                   29293 non-null  float64
 9   latitude                      29293 non-null  float64
 10  longitude                     29293 non-null  float64
 11  lotsizesquarefeet             29293 non-null  float64
 12  propertycountylandusecode     29293 non-null  object 
 1