In [1]:
%matplotlib inline
import matplotlib as mlp
mlp.rcParams['figure.dpi'] = 200

# disabling warning messages
import warnings
warnings.filterwarnings("ignore")

# importing key libraries
import pandas as pd
pd.set_option('display.max_rows', None)
pd.options.display.float_format = '{:.2f}'.format
import numpy as np

# importing wrangle/acquire module
import wrangle
from wrangle import get_zillow_dataset, \
                    null_df, \
                    drop_nulls, \
                    clean_zillow_dataset, \
                    train_validate_test_split

# importing data visualization libraries
import matplotlib.pyplot as plt
import seaborn as sns 
sns.set()

# sklearn data science library
from sklearn.impute import KNNImputer
from sklearn.impute import MissingIndicator
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

In [2]:
# data acquisition: load the Zillow dataset through the wrangle helper
# and print its (rows, columns) shape as a baseline for the cleaning steps

df = get_zillow_dataset()
print(df.shape)

(52319, 68)


In [3]:
# initial clean of the dataset
# where: features and records with null % > 80% are dropped
# NOTE(review): the 80% threshold is not visible in this notebook; it is
# presumably implemented inside wrangle.clean_zillow_dataset — confirm there.
# The shape line in the output appears to be printed by the helper itself.

df = clean_zillow_dataset(df)
df.head()

dataframe shape: (52311, 16)


Unnamed: 0,bathroom_count,bedroom_count,living_sq_feet,county_by_fips,land_assessed_value,latitude,logerror,longitude,property_sq_feet,county_zoning_code,blockgroup_assignment,home_assessed_value,taxamount,home_value,transaction_date,year_built
0,3.5,4.0,3100.0,Orange County,537569.0,33634931.0,0.03,-117869207.0,4506.0,122,60590630.07,485713.0,11013.72,1023282.0,2017-01-01,1998.0
1,1.0,2.0,1465.0,Ventura County,376000.0,34449266.0,0.06,-119281531.0,12647.0,1110,61110010.02,88000.0,5672.48,464000.0,2017-01-01,1967.0
2,2.0,3.0,1243.0,Orange County,479489.0,33886168.0,0.01,-117823170.0,8432.0,122,60590218.02,85289.0,6488.3,564778.0,2017-01-01,1962.0
3,3.0,4.0,2376.0,LA County,36225.0,34245180.0,-0.1,-118240722.0,13038.0,101,60373001.0,108918.0,1777.51,145143.0,2017-01-01,1970.0
4,3.0,4.0,2962.0,LA County,496619.0,34145202.0,-0.0,-118179824.0,63000.0,101,60374608.0,276684.0,9516.26,773303.0,2017-01-01,1950.0


In [4]:
# calculating and adding the age of each home as a new `home_age` column
# NOTE(review): intended as the age thru 2017 (the transaction year), but the
# output below shows year_built 1998 -> home_age 24.0, i.e. 2022 - 1998.
# Confirm which reference year wrangle.age_of_homes should use.

df = wrangle.age_of_homes(df)
df.head()

Unnamed: 0,bathroom_count,bedroom_count,living_sq_feet,county_by_fips,land_assessed_value,latitude,logerror,longitude,property_sq_feet,county_zoning_code,blockgroup_assignment,home_assessed_value,taxamount,home_value,transaction_date,year_built,home_age
0,3.5,4.0,3100.0,Orange County,537569.0,33634931.0,0.03,-117869207.0,4506.0,122,60590630.07,485713.0,11013.72,1023282.0,2017-01-01,1998.0,24.0
1,1.0,2.0,1465.0,Ventura County,376000.0,34449266.0,0.06,-119281531.0,12647.0,1110,61110010.02,88000.0,5672.48,464000.0,2017-01-01,1967.0,55.0
2,2.0,3.0,1243.0,Orange County,479489.0,33886168.0,0.01,-117823170.0,8432.0,122,60590218.02,85289.0,6488.3,564778.0,2017-01-01,1962.0,60.0
3,3.0,4.0,2376.0,LA County,36225.0,34245180.0,-0.1,-118240722.0,13038.0,101,60373001.0,108918.0,1777.51,145143.0,2017-01-01,1970.0,52.0
4,3.0,4.0,2962.0,LA County,496619.0,34145202.0,-0.0,-118179824.0,63000.0,101,60374608.0,276684.0,9516.26,773303.0,2017-01-01,1950.0,72.0


In [5]:
# adding a `transaction_month` column holding the month name of each transaction
# NOTE(review): per the output below, `transaction_date` is no longer present
# after this call — confirm that replacing it (rather than keeping both) is intended.

df = wrangle.clean_months(df)
df.head()

Unnamed: 0,bathroom_count,bedroom_count,living_sq_feet,county_by_fips,land_assessed_value,latitude,logerror,longitude,property_sq_feet,county_zoning_code,blockgroup_assignment,home_assessed_value,taxamount,home_value,year_built,home_age,transaction_month
0,3.5,4.0,3100.0,Orange County,537569.0,33634931.0,0.03,-117869207.0,4506.0,122,60590630.07,485713.0,11013.72,1023282.0,1998.0,24.0,January
1,1.0,2.0,1465.0,Ventura County,376000.0,34449266.0,0.06,-119281531.0,12647.0,1110,61110010.02,88000.0,5672.48,464000.0,1967.0,55.0,January
2,2.0,3.0,1243.0,Orange County,479489.0,33886168.0,0.01,-117823170.0,8432.0,122,60590218.02,85289.0,6488.3,564778.0,1962.0,60.0,January
3,3.0,4.0,2376.0,LA County,36225.0,34245180.0,-0.1,-118240722.0,13038.0,101,60373001.0,108918.0,1777.51,145143.0,1970.0,52.0,January
4,3.0,4.0,2962.0,LA County,496619.0,34145202.0,-0.0,-118179824.0,63000.0,101,60374608.0,276684.0,9516.26,773303.0,1950.0,72.0,January


In [6]:
# dataframe info, with the columns listed in alphabetical order

sorted_cols = df.columns.sort_values()
df.loc[:, sorted_cols].info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 52311 entries, 0 to 52318
Data columns (total 17 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   bathroom_count         52311 non-null  float64
 1   bedroom_count          52311 non-null  float64
 2   blockgroup_assignment  52311 non-null  float64
 3   county_by_fips         52311 non-null  object 
 4   county_zoning_code     52311 non-null  object 
 5   home_age               52205 non-null  float64
 6   home_assessed_value    52232 non-null  float64
 7   home_value             52310 non-null  float64
 8   land_assessed_value    52310 non-null  float64
 9   latitude               52311 non-null  float64
 10  living_sq_feet         52238 non-null  float64
 11  logerror               52311 non-null  float64
 12  longitude              52311 non-null  float64
 13  property_sq_feet       51951 non-null  float64
 14  taxamount              52307 non-null  float64
 15  tr

In [7]:
# checking the remaining nulls per feature: total count and share of all rows
# NOTE(review): "Feature Null %" appears to be a fraction rather than a
# percentage (360 nulls out of 52311 rows is displayed as 0.01) — confirm in
# wrangle.null_df and rename or rescale the column if so.

remaining_nulls = null_df(df)
remaining_nulls

Unnamed: 0,Total Null,Feature Null %
property_sq_feet,360,0.01
home_age,106,0.0
year_built,106,0.0
home_assessed_value,79,0.0
living_sq_feet,73,0.0
taxamount,4,0.0
land_assessed_value,1,0.0
home_value,1,0.0
blockgroup_assignment,0,0.0
bathroom_count,0,0.0


In [8]:
# summing the total number of outliers per numeric feature (target variable excluded)
# NOTE(review): only an upper bound is reported, so low-side outliers are
# presumably not counted — confirm in wrangle.sum_outliers.
# NOTE(review): blockgroup_assignment looks like an identifier rather than a
# magnitude, so an outlier count for it may not be meaningful.

wrangle.sum_outliers(df)

Unnamed: 0,Feature,Upper_Bound,Total Outliers
0,property_sq_feet,13624.25,5537
1,blockgroup_assignment,60915040.12,4384
2,home_assessed_value,450377.75,3892
3,taxamount,14456.18,3811
4,home_value,1257364.25,3533
5,land_assessed_value,907636.38,3011
6,living_sq_feet,3863.0,2272
7,bathroom_count,4.5,1577
8,bedroom_count,5.5,777
9,home_age,115.5,240


In [9]:
# identifying the numeric features to cap:
# every numeric column except the target ("logerror") and "year_built"
# NOTE(review): the resulting list still contains latitude, longitude and
# blockgroup_assignment, which look like coordinates/identifiers rather than
# magnitudes — confirm they should be handled like the other features.

num_lst = [
    col
    for col in df.select_dtypes("number").columns
    if col not in ("logerror", "year_built")
]
num_lst

['bathroom_count',
 'bedroom_count',
 'living_sq_feet',
 'land_assessed_value',
 'latitude',
 'longitude',
 'property_sq_feet',
 'blockgroup_assignment',
 'home_assessed_value',
 'taxamount',
 'home_value',
 'home_age']

In [10]:
# check the df shape before capping, to compare against the shape afterwards

df.shape

(52311, 17)

In [11]:
# let's handle the outliers by "capping" them at each feature's upper bound
# (per the output below, capped columns come back renamed with a "-capped" suffix)
# NOTE(review): this runs on the full dataframe, before the train/validate/test
# split, and the capped values match the upper bounds computed on the full
# dataframe — so validate/test rows influence the caps. Consider deriving the
# caps from train only.
# NOTE(review): in the output below, row 1's blockgroup_assignment changes from
# 61110010.02 to 60915040.12; capping an identifier-like column changes what it
# refers to. Confirm that latitude, longitude and blockgroup_assignment belong
# in num_lst.

df = wrangle.capp_outliers(df, num_lst)
df.head()

Unnamed: 0,bathroom_count-capped,bedroom_count-capped,living_sq_feet-capped,county_by_fips,land_assessed_value-capped,latitude-capped,logerror,longitude-capped,property_sq_feet-capped,county_zoning_code,blockgroup_assignment-capped,home_assessed_value-capped,taxamount-capped,home_value-capped,year_built,home_age-capped,transaction_month
0,3.5,4.0,3100.0,Orange County,537569.0,33634931.0,0.03,-117869207.0,4506.0,122,60590630.07,450377.75,11013.72,1023282.0,1998.0,24.0,January
1,1.0,2.0,1465.0,Ventura County,376000.0,34449266.0,0.06,-119281531.0,12647.0,1110,60915040.12,88000.0,5672.48,464000.0,1967.0,55.0,January
2,2.0,3.0,1243.0,Orange County,479489.0,33886168.0,0.01,-117823170.0,8432.0,122,60590218.02,85289.0,6488.3,564778.0,1962.0,60.0,January
3,3.0,4.0,2376.0,LA County,36225.0,34245180.0,-0.1,-118240722.0,13038.0,101,60373001.0,108918.0,1777.51,145143.0,1970.0,52.0,January
4,3.0,4.0,2962.0,LA County,496619.0,34145202.0,-0.0,-118179824.0,13624.25,101,60374608.0,276684.0,9516.26,773303.0,1950.0,72.0,January


In [12]:
# re-check the df shape after capping: row and column counts should be unchanged,
# since the capped columns replace the originals

df.shape

(52311, 17)

In [13]:
# list the column names after capping (capped features carry a "-capped" suffix)

df.columns.tolist()

['bathroom_count-capped',
 'bedroom_count-capped',
 'living_sq_feet-capped',
 'county_by_fips',
 'land_assessed_value-capped',
 'latitude-capped',
 'logerror',
 'longitude-capped',
 'property_sq_feet-capped',
 'county_zoning_code',
 'blockgroup_assignment-capped',
 'home_assessed_value-capped',
 'taxamount-capped',
 'home_value-capped',
 'year_built',
 'home_age-capped',
 'transaction_month']

In [14]:
# TODO: handle remaining nulls/missing data (this cell is currently empty)



----
### Splitting the Zillow dataset for hypothesis testing & exploration

In [15]:
# splitting the dataset into train/validate/test in order to
# 1. fill in the remaining nulls and 2. determine outlier cutoffs
# NOTE(review): outliers were already capped on the full dataframe before this
# split, so the cutoffs are not currently derived after splitting — reconcile
# with the intent stated here.
# NOTE(review): no seed is passed here; confirm the helper fixes a random state
# so the split is reproducible.
# The shape lines in the output appear to be printed by the helper itself.

train, validate, test = train_validate_test_split(df)

train shape: (29293, 17)
validate shape: (12555, 17)
test shape: (10463, 17)


In [16]:
# 2nd split: separating each subset into features (X) and the target (y = logerror)

X_train, y_train = train.drop(columns="logerror"), train["logerror"]
X_validate, y_validate = validate.drop(columns="logerror"), validate["logerror"]
X_test, y_test = test.drop(columns="logerror"), test["logerror"]

In [17]:
# checking outliers (not nulls) in the train features
# NOTE(review): features that were already capped still show outliers here.
# The train-only upper bounds are lower than the earlier full-dataframe caps
# (e.g. property_sq_feet: 13553.5 here vs 13624.25 before), so values sitting
# at the earlier cap presumably exceed the train bounds — confirm in
# wrangle.sum_outliers.

wrangle.sum_outliers(X_train)

Unnamed: 0,Feature,Upper_Bound,Total Outliers
0,property_sq_feet-capped,13553.5,3111
1,blockgroup_assignment-capped,60915040.08,2481
2,home_assessed_value-capped,448878.25,2174
3,taxamount-capped,14364.72,2131
4,home_value-capped,1244563.0,1978
5,land_assessed_value-capped,904675.0,1650
6,latitude-capped,34726865.0,19


In [18]:
# collecting the numeric feature names of X_train and printing how many there are
# (this rebinds num_lst, replacing the list that was built before capping)

num_lst = X_train.select_dtypes("number").columns.tolist()

print(len(num_lst))

13


In [19]:
# number of nulls, and their share of rows, per feature in the train features

null_df(X_train)

Unnamed: 0,Total Null,Feature Null %
property_sq_feet-capped,208,0.01
year_built,64,0.0
home_age-capped,64,0.0
living_sq_feet-capped,45,0.0
home_assessed_value-capped,41,0.0
taxamount-capped,1,0.0
bathroom_count-capped,0,0.0
bedroom_count-capped,0,0.0
county_by_fips,0,0.0
land_assessed_value-capped,0,0.0
