## We finna wrangle

In these exercises, you will complete the first step toward the above goal: acquire and prepare the necessary Zillow data from the zillow database in the Codeup database server.

In [1]:
import warnings
warnings.filterwarnings('ignore')
import os

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

import mason_functions as mf

In [2]:
def get_db_url(db_name):
    """Return the `mysql+pymysql` connection URL for the database `db_name`.

    The `host`, `user` and `password` values are imported from the local
    `env` module each time the function is called, so the credentials are
    never written into the notebook itself.
    """
    from env import host, password, user

    connection_url = f'mysql+pymysql://{user}:{password}@{host}/{db_name}'
    return connection_url

### Exercise I
Acquire bedroomcnt, bathroomcnt, calculatedfinishedsquarefeet, taxvaluedollarcnt, yearbuilt, taxamount, and fips from the zillow database for all 'Single Family Residential' properties.

In [3]:
#sql query: the seven requested columns from properties_2017
#NOTE(review): propertylandusetypeid 261 is presumably the 'Single Family Residential' code -- confirm against the db
sql = '''
SELECT bedroomcnt, bathroomcnt, calculatedfinishedsquarefeet, taxvaluedollarcnt, yearbuilt, taxamount, fips
FROM properties_2017
WHERE propertylandusetypeid = 261
'''

#connection string for the zillow database
url = get_db_url('zillow')

#local csv cache, so the codeup rdbms is only queried when no cached copy exists
cache_file = 'properties_2017.csv'

if not os.path.isfile(cache_file):
    #no cache yet: pull from the database, then save a copy (index included)
    df = pd.read_sql(sql, url)
    df.to_csv(cache_file)
else:
    #cache hit: the first csv column is the index that to_csv wrote out
    df = pd.read_csv(cache_file, index_col = 0)

In [4]:
#at a glance
#df.info() prints its report itself and returns None, so wrapping it in
#print() only adds a stray 'None' line to the cell output
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2152863 entries, 0 to 2152862
Data columns (total 7 columns):
 #   Column                        Dtype  
---  ------                        -----  
 0   bedroomcnt                    float64
 1   bathroomcnt                   float64
 2   calculatedfinishedsquarefeet  float64
 3   taxvaluedollarcnt             float64
 4   yearbuilt                     float64
 5   taxamount                     float64
 6   fips                          float64
dtypes: float64(7)
memory usage: 131.4 MB
None


Unnamed: 0,bedroomcnt,bathroomcnt,calculatedfinishedsquarefeet,taxvaluedollarcnt,yearbuilt,taxamount,fips
0,0.0,0.0,,27516.0,,,6037.0
1,0.0,0.0,,10.0,,,6037.0
2,0.0,0.0,,10.0,,,6037.0
3,0.0,0.0,,2108.0,,174.21,6037.0
4,4.0,2.0,3633.0,296425.0,2005.0,6941.39,6037.0


### Exercise II
Using your acquired Zillow data, walk through the summarization and cleaning steps in your wrangle.ipynb file like we did above. You may handle the missing values however you feel is appropriate and meaningful; remember to document your process and decisions using markdown and code commenting where helpful.

In [5]:
#summary statistics, transposed so each column of df reads as one row
df.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
bedroomcnt,2152852.0,3.287196,0.954754,0.0,3.0,3.0,4.0,25.0
bathroomcnt,2152852.0,2.230688,0.99928,0.0,2.0,2.0,3.0,32.0
calculatedfinishedsquarefeet,2144379.0,1862.855178,1222.125124,1.0,1257.0,1623.0,2208.0,952576.0
taxvaluedollarcnt,2152370.0,461896.237963,699676.0496,1.0,188170.25,327671.0,534527.0,98428909.0
yearbuilt,2143526.0,1960.949681,22.162196,1801.0,1949.0,1958.0,1976.0,2016.0
taxamount,2148421.0,5634.865978,8178.910249,1.85,2534.98,4108.95,6414.32,1337755.86
fips,2152863.0,6048.377335,20.433292,6037.0,6037.0,6037.0,6059.0,6111.0


In [6]:
#more perspective: a random sample of 10 rows
#random_state pins the draw so the same rows come back on every run
df.sample(10, random_state = 42)

Unnamed: 0,bedroomcnt,bathroomcnt,calculatedfinishedsquarefeet,taxvaluedollarcnt,yearbuilt,taxamount,fips
1248472,3.0,2.0,1883.0,284931.0,2002.0,4341.75,6037.0
1584740,4.0,3.0,2354.0,195239.0,1977.0,2505.04,6037.0
570691,2.0,1.0,950.0,1395382.0,1932.0,18481.71,6037.0
1431140,4.0,3.0,2759.0,413366.0,1978.0,4733.84,6059.0
1890433,3.0,2.0,1306.0,238238.0,1949.0,3233.98,6037.0
627509,4.0,2.0,1356.0,700600.0,1944.0,8487.45,6037.0
1174800,3.0,2.0,1708.0,355127.0,1953.0,4320.61,6037.0
2009840,3.0,3.0,1546.0,539747.0,1989.0,6574.3,6037.0
1055816,4.0,2.0,1612.0,457972.0,1964.0,5370.68,6111.0
631764,2.0,1.0,1437.0,280843.0,1936.0,3695.7,6037.0


In [7]:
#which columns contain at least one missing value?
df.isna().any()

bedroomcnt                       True
bathroomcnt                      True
calculatedfinishedsquarefeet     True
taxvaluedollarcnt                True
yearbuilt                        True
taxamount                        True
fips                            False
dtype: bool

In [8]:
#number of missing values in each column
df.isna().sum()

bedroomcnt                        11
bathroomcnt                       11
calculatedfinishedsquarefeet    8484
taxvaluedollarcnt                493
yearbuilt                       9337
taxamount                       4442
fips                               0
dtype: int64

In [9]:
#names of the columns that have at least one missing value
has_nulls = df.isna().any()
df.columns[has_nulls]

Index(['bedroomcnt', 'bathroomcnt', 'calculatedfinishedsquarefeet',
       'taxvaluedollarcnt', 'yearbuilt', 'taxamount'],
      dtype='object')

In [10]:
#how many bedrooms can one single family residence have?
#counts are listed least common first, with NaN kept as its own row
df['bedroomcnt'].value_counts(dropna = False, ascending = True)

25.0         1
16.0         2
18.0         3
15.0         6
14.0         7
NaN         11
12.0        12
13.0        16
11.0        34
10.0       121
9.0        291
8.0       1107
7.0       4807
0.0      13187
1.0      23166
6.0      25166
5.0     150866
2.0     335473
4.0     634289
3.0     964298
Name: bedroomcnt, dtype: int64

In [11]:
#inspect the rows that have no bedroom count recorded
df.loc[df['bedroomcnt'].isna()]

Unnamed: 0,bedroomcnt,bathroomcnt,calculatedfinishedsquarefeet,taxvaluedollarcnt,yearbuilt,taxamount,fips
107763,,,,67366.0,1926.0,780.54,6059.0
118612,,,,43992.0,1946.0,541.64,6059.0
193993,,,1348.0,840698.0,1952.0,,6059.0
1141339,,,200.0,188972.0,,,6037.0
1324608,,,990.0,435000.0,1906.0,,6037.0
1442975,,,,273196.0,,,6037.0
1647346,,,400.0,28347.0,1954.0,,6037.0
1701026,,,,407930.0,1926.0,,6037.0
1722707,,,,477161.0,,,6037.0
1776422,,,,38855.0,,,6037.0


In [12]:
#same frequency check for bathrooms: least common first, NaN kept as its own row
df['bathroomcnt'].value_counts(dropna = False, ascending = True)

19.50         1
32.00         1
14.50         1
19.00         1
11.50         3
1.75          3
12.50         3
17.00         4
20.00         6
18.00         8
NaN          11
16.00        12
10.50        14
0.50         16
15.00        17
14.00        25
13.00        39
9.50         50
12.00        73
8.50        110
11.00       146
10.00       325
7.50        384
9.00        713
6.50       1333
8.00       1692
7.00       4394
5.50       6217
6.00      10747
0.00      13027
4.50      19506
5.00      28362
3.50      28518
1.50      31211
4.00      82155
2.50     142981
1.00     414324
3.00     422841
2.00     943589
Name: bathroomcnt, dtype: int64

### Exercise III
Store all of the necessary functions to automate your process from acquiring the data to returning a cleaned dataframe with no missing values in your wrangle.py file. Name your final function wrangle_zillow.