In [3]:
%matplotlib inline
from env import get_connection, password, host, user
import acquire_zillow
import acquire_mall
import prepare_mall
import prepare_zillow
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, r2_score

In [4]:
# Load the Zillow properties file (tab-separated despite the .csv extension).
# low_memory=False makes pandas infer each column's dtype from the whole file
# rather than chunk by chunk; the warning fragment in this cell's saved output
# looks like the mixed-type DtypeWarning that this option is meant to avoid.
properties = pd.read_csv('zillow_properties.csv', sep='\t', low_memory=False)

  interactivity=interactivity, compiler=compiler, result=result)


In [5]:
# Remove what appear to be the saved index column ('Unnamed: 0') and a
# duplicated parcel id column ('parcelid.1').
# Rebinding with errors='ignore' (instead of inplace=True) keeps the cell safe
# to re-run: a second execution is a no-op rather than a KeyError.
properties = properties.drop(columns=['Unnamed: 0', 'parcelid.1'], errors='ignore')

In [6]:
# Dimensions (rows, columns) after the column drop in the previous cell.
properties.shape

(167888, 60)

### Filling nulls with zeros. These columns were chosen because they are either binary flags that were never recorded as 0, or some type of count that was left blank because it was 0.

In [7]:
# Columns where a missing value is treated as zero / "not present".
zero_fill_cols = [
    'hashottuborspa', 'poolcnt', 'poolsizesum',
    'basementsqft', 'decktypeid', 'fireplacecnt',
    'garagecarcnt',
]
properties = prepare_zillow.replace_nulls_with_zeros(properties, zero_fill_cols)

### Keeping only properties that are single-unit and have latitude and longitude.
The columns converted to object dtype were chosen because they are either plain id numbers or binary yes/no flags.

In [8]:
# Id-like and yes/no columns that the helper is asked to treat as objects.
# NOTE(review): pooltypeid10/2/7 and fireplaceflag are not zero-filled in the
# previous step and do not show up in the later isna() summary, so they appear
# to be removed by the null-threshold drop — confirm that is intended.
object_cols = [
    'parcelid', 'buildingqualitytypeid',
    'decktypeid', 'fips', 'hashottuborspa',
    'pooltypeid10', 'pooltypeid2', 'pooltypeid7',
    'regionidcity', 'regionidcounty',
    'regionidneighborhood', 'regionidzip',
    'fireplaceflag',
]
properties = prepare_zillow.single_lat_lon_objects(properties, object_cols)


In [9]:
# Dimensions after the single-unit / lat-lon filtering step above.
properties.shape

(97618, 60)

In [10]:
# Print, for each column with gaps, the count and percentage of missing values.
prepare_zillow.missing_vals_cols(properties)

buildingqualitytypeid has 954 missing values, accounting for 0.98% of that column.
calculatedbathnbr has 8 missing values, accounting for 0.01% of that column.
finishedfloor1squarefeet has 97618 missing values, accounting for 100.00% of that column.
calculatedfinishedsquarefeet has 4 missing values, accounting for 0.00% of that column.
finishedsquarefeet12 has 15 missing values, accounting for 0.02% of that column.
finishedsquarefeet13 has 97618 missing values, accounting for 100.00% of that column.
finishedsquarefeet15 has 97607 missing values, accounting for 99.99% of that column.
finishedsquarefeet50 has 97618 missing values, accounting for 100.00% of that column.
finishedsquarefeet6 has 97618 missing values, accounting for 100.00% of that column.
fullbathcnt has 8 missing values, accounting for 0.01% of that column.
garagetotalsqft has 97610 missing values, accounting for 99.99% of that column.
lotsizesquarefeet has 2516 missing values, accounting for 2.58% of that column.
pooltype

### Do any columns have too many NaN's to be useful?
After looking at the printout and seeing what percentage of each column is null, I decided that any column that is more than 30% null should be dropped.

In [11]:
# Drop columns whose share of nulls exceeds the 0.30 threshold.
# NOTE(review): the return value is not captured; the column count falling
# from 60 to 38 between the surrounding shape checks suggests the helper
# mutates `properties` in place — confirm against prepare_zillow.
prepare_zillow.drop_col_if_too_many_nulls(properties, .30)

Nulls are still present, but not nearly as many.

In [12]:
# Re-run the missing-value report to see what remains after the column drop.
prepare_zillow.missing_vals_cols(properties)

buildingqualitytypeid has 954 missing values, accounting for 0.98% of that column.
calculatedbathnbr has 8 missing values, accounting for 0.01% of that column.
calculatedfinishedsquarefeet has 4 missing values, accounting for 0.00% of that column.
finishedsquarefeet12 has 15 missing values, accounting for 0.02% of that column.
fullbathcnt has 8 missing values, accounting for 0.01% of that column.
lotsizesquarefeet has 2516 missing values, accounting for 2.58% of that column.
propertyzoningdesc has 403 missing values, accounting for 0.41% of that column.
regionidcity has 2054 missing values, accounting for 2.10% of that column.
regionidzip has 18 missing values, accounting for 0.02% of that column.
yearbuilt has 20 missing values, accounting for 0.02% of that column.
structuretaxvaluedollarcnt has 110 missing values, accounting for 0.11% of that column.
taxamount has 8 missing values, accounting for 0.01% of that column.
censustractandblock has 199 missing values, accounting for 0.20% o

In [13]:
# Dimensions after the null-threshold column drop.
properties.shape

(97618, 38)

### Dropping the rows with remaining null values.

In [14]:
# Discard every row that still contains at least one null.
# Rebinding instead of inplace=True keeps the step explicit and chainable.
properties = properties.dropna()

In [15]:
# Dimensions after dropping the rows that still contained nulls.
properties.shape

(90578, 38)

In [16]:
# Per-column null counts; every entry should be 0 after the dropna step.
properties.isna().sum()

parcelid                        0
basementsqft                    0
bathroomcnt                     0
bedroomcnt                      0
buildingqualitytypeid           0
calculatedbathnbr               0
decktypeid                      0
calculatedfinishedsquarefeet    0
finishedsquarefeet12            0
fips                            0
fireplacecnt                    0
fullbathcnt                     0
garagecarcnt                    0
hashottuborspa                  0
latitude                        0
longitude                       0
lotsizesquarefeet               0
poolcnt                         0
poolsizesum                     0
propertycountylandusecode       0
propertyzoningdesc              0
rawcensustractandblock          0
regionidcity                    0
regionidcounty                  0
regionidzip                     0
roomcnt                         0
unitcnt                         0
yearbuilt                       0
structuretaxvaluedollarcnt      0
taxvaluedollar

In [17]:
# Show any rows where the two finished-area columns disagree; an empty
# result means the columns carry the same values.
area_mismatch = properties['calculatedfinishedsquarefeet'].ne(properties['finishedsquarefeet12'])
properties.loc[area_mismatch]

Unnamed: 0,parcelid,basementsqft,bathroomcnt,bedroomcnt,buildingqualitytypeid,calculatedbathnbr,decktypeid,calculatedfinishedsquarefeet,finishedsquarefeet12,fips,...,structuretaxvaluedollarcnt,taxvaluedollarcnt,assessmentyear,landtaxvaluedollarcnt,taxamount,censustractandblock,logerror,transactiondate,heatingorsystemdesc,propertylandusedesc


##### Dropping `finishedsquarefeet12` since its values are the same as `calculatedfinishedsquarefeet`.

In [18]:
# finishedsquarefeet12 is dropped as a duplicate of calculatedfinishedsquarefeet.
# Rebinding with errors='ignore' (instead of inplace=True) means re-running
# this cell is a no-op rather than a KeyError.
properties = properties.drop(columns=['finishedsquarefeet12'], errors='ignore')

##### Dropping `fullbathcnt` since it is similar to `bathroomcnt`.

In [19]:
# fullbathcnt is dropped in favour of bathroomcnt.
# Rebinding with errors='ignore' (instead of inplace=True) means re-running
# this cell is a no-op rather than a KeyError.
properties = properties.drop(columns=['fullbathcnt'], errors='ignore')

##### I don't think land use code or zoning description will be a factor in estimate error.
They will be dropped.

In [20]:
# List the distinct county land-use codes before deciding whether to keep the column.
properties['propertycountylandusecode'].unique()

array(['0100', '010C', '010E', '0104', '0101', '01DC', '012C', '010G',
       '0103', '010M', '01HC', '0108', '0109', '010H', '010V', '0102',
       '0130', '012E', '0110', '010', '0131', '0133', '0105', '012D',
       '0113'], dtype=object)

In [21]:
# Drop the land-use code and zoning description columns.
# Rebinding with errors='ignore' (instead of inplace=True) means re-running
# this cell is a no-op rather than a KeyError.
# NOTE(review): the saved output under this cell (array([6037.0], dtype=object))
# cannot come from this statement, which displays nothing; it looks like stale
# output from an earlier version of the cell — re-run and re-save.
properties = properties.drop(columns=['propertycountylandusecode', 'propertyzoningdesc'], errors='ignore')

array([6037.0], dtype=object)

##### Are the descriptions useful? Example: `airconditioningdesc` can be centralized or refrigeration, but can also just be 'yes'.
Seeing that properties can have 'presumably' no air conditioning but still have a bed or bath, it seems like this column may not be useful.

In [None]:
# Exploratory check, left disabled: summary stats for rows with no
# air-conditioning description that still have at least one bed or bath.
# NOTE(review): `airconditioningdesc` does not appear among the columns shown
# after the null-threshold drop, so this would likely fail if re-enabled at
# this point — confirm, then either move it before that drop or delete it.
# properties[(properties.airconditioningdesc.isna()) & ((properties.bathroomcnt > 0) | (properties.bedroomcnt > 0))].describe()

### Handling of Outliers
Outliers will be determined using the interquartile range (IQR): any value more than 1.5 × IQR above Q3 or more than 1.5 × IQR below Q1 is treated as an outlier. I will be using my `detect_and_remove_outliers()` function to do so.

In [None]:
# Apply the project's IQR-based outlier removal to the cleaned frame.
# NOTE(review): the return value is not captured, so this only has an effect
# if the helper mutates `properties` in place; if it returns a new frame the
# result is discarded — confirm against prepare_zillow.
prepare_zillow.detect_and_remove_outliers(properties)

In [None]:
# Dimensions after the outlier step (changes only if the helper above
# modified `properties` in place).
properties.shape