In [1]:
import pandas as pd
import numpy as np
import os
from env import get_db_url
import wrangle

In [2]:
def get_zillow_data():
    '''Returns a dataframe of all single family residential properties from 2017.'''
    filename = "zillow.csv"

    if os.path.isfile(filename):
        return pd.read_csv(filename)
    else:
        sql_query = '''
        SELECT properties_2017.bedroomcnt AS Number_of_Bedrooms,
        properties_2017.bathroomcnt AS Number_of_Bathrooms,
        properties_2017.calculatedfinishedsquarefeet AS Square_Footage, 
        properties_2017.taxvaluedollarcnt AS Tax_Appraised_Value, 
        properties_2017.yearbuilt AS Year_Built, 
        properties_2017.taxamount AS Tax_Assessed, properties_2017.fips
        FROM properties_2017
        JOIN propertylandusetype using (propertylandusetypeid)
        WHERE propertylandusedesc = "Single Family Residential";
        '''
        df = pd.read_sql(sql_query, get_db_url('zillow'))
        df.to_csv(filename)
    return df



In [3]:
df = get_zillow_data()
df.head()

Unnamed: 0.1,Unnamed: 0,Number_of_Bedrooms,Number_of_Bathrooms,Square_Footage,Tax_Appraised_Value,Year_Built,Tax_Assessed,fips
0,0,0.0,0.0,,27516.0,,,6037.0
1,1,0.0,0.0,,10.0,,,6037.0
2,2,0.0,0.0,,10.0,,,6037.0
3,3,0.0,0.0,,2108.0,,174.21,6037.0
4,4,4.0,2.0,3633.0,296425.0,2005.0,6941.39,6037.0


In [4]:

sql_query = '''
      SELECT properties_2017.bedroomcnt AS Number_of_Bedrooms,
properties_2017.bathroomcnt AS Number_of_Bathrooms,
properties_2017.calculatedfinishedsquarefeet AS Square_Footage, 
properties_2017.taxvaluedollarcnt AS Tax_Appraised_Value, 
properties_2017.yearbuilt AS Year_Built, 
properties_2017.taxamount AS Tax_Assessed, properties_2017.fips
FROM properties_2017
JOIN propertylandusetype using (propertylandusetypeid)
WHERE propertylandusedesc = "Single Family Residential"
       '''
df = pd.read_sql(sql_query, get_db_url('zillow'))

In [5]:
df.head()

Unnamed: 0,Number_of_Bedrooms,Number_of_Bathrooms,Square_Footage,Tax_Appraised_Value,Year_Built,Tax_Assessed,fips
0,0.0,0.0,,27516.0,,,6037.0
1,0.0,0.0,,10.0,,,6037.0
2,0.0,0.0,,10.0,,,6037.0
3,0.0,0.0,,2108.0,,174.21,6037.0
4,4.0,2.0,3633.0,296425.0,2005.0,6941.39,6037.0


In [6]:
df.shape

(2152863, 7)

In [7]:
df.isnull().mean()

Number_of_Bedrooms     0.000005
Number_of_Bathrooms    0.000005
Square_Footage         0.003941
Tax_Appraised_Value    0.000229
Year_Built             0.004337
Tax_Assessed           0.002063
fips                   0.000000
dtype: float64

In [8]:
# nulls are a small percentage of the df, good to drop them
df = df.dropna()

In [9]:
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Number_of_Bedrooms,2140235.0,3.301419,0.9326,0.0,3.0,3.0,4.0,25.0
Number_of_Bathrooms,2140235.0,2.240352,0.990549,0.0,2.0,2.0,3.0,32.0
Square_Footage,2140235.0,1863.19397,1221.754161,1.0,1258.0,1623.0,2208.0,952576.0
Tax_Appraised_Value,2140235.0,460641.625164,677157.635675,22.0,189166.0,328296.0,534606.0,90188462.0
Year_Built,2140235.0,1960.967545,22.150563,1801.0,1949.0,1958.0,1976.0,2016.0
Tax_Assessed,2140235.0,5616.711322,7814.562798,6.34,2540.85,4111.47,6411.93,1078101.87
fips,2140235.0,6048.309556,20.34491,6037.0,6037.0,6037.0,6059.0,6111.0


In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2140235 entries, 4 to 2152862
Data columns (total 7 columns):
 #   Column               Dtype  
---  ------               -----  
 0   Number_of_Bedrooms   float64
 1   Number_of_Bathrooms  float64
 2   Square_Footage       float64
 3   Tax_Appraised_Value  float64
 4   Year_Built           float64
 5   Tax_Assessed         float64
 6   fips                 float64
dtypes: float64(7)
memory usage: 130.6 MB


In [11]:
# value counts for each column
for column in df.columns:
    print(column)
    print(df[column].value_counts())
    print("-----------------")

Number_of_Bedrooms
3.0     962944
4.0     633608
2.0     334221
5.0     150671
6.0      25117
1.0      22895
7.0       4792
0.0       4397
8.0       1103
9.0        290
10.0       118
11.0        34
13.0        15
12.0        12
14.0         7
15.0         5
18.0         3
16.0         2
25.0         1
Name: Number_of_Bedrooms, dtype: int64
-----------------
Number_of_Bathrooms
2.00     942463
3.00     422398
1.00     412582
2.50     142827
4.00      82039
1.50      31157
3.50      28464
5.00      28306
4.50      19474
6.00      10717
5.50       6201
7.00       4381
0.00       4274
8.00       1681
6.50       1330
9.00        707
7.50        382
10.00       322
11.00       145
8.50        108
12.00        73
9.50         50
13.00        39
14.00        25
15.00        17
0.50         16
10.50        14
16.00        12
18.00         8
20.00         6
17.00         4
1.75          3
12.50         3
11.50         3
19.50         1
14.50         1
32.00         1
19.00         1
Name: Numbe

In [12]:
# what should the minimum square footage be? 120 
# https://worldpopulationreview.com/state-rankings/minimum-house-size-by-state
# what percentage of homes under 120 sq ft are in the df?
round(len(df[df['Square_Footage']< 120])/ df.shape[0],4)

0.0001

In [13]:
# how many homes have no bedrooms?
len(df[df['Number_of_Bedrooms']==0])

4397

In [14]:
# bathrooms?
len(df[df['Number_of_Bathrooms']==0])

4274

In [16]:
# dropping rows that don't meet minimum home requirements

df = df.drop(df.loc[df['Square_Footage']< 120], inplace=True)
df = df.drop(df.loc[df['Number_of_Bedrooms']<1], inplace=True)
df = df.drop(df.loc[df['Number_of_Bathrooms']<.5] ,inplace=True)

AttributeError: 'NoneType' object has no attribute 'drop'

In [17]:
df.describe().T

AttributeError: 'NoneType' object has no attribute 'describe'

In [None]:
# can a house really have 0 beds and 0 baths? 
df[(df.Number_of_Bedrooms < 1) & (df.Number_of_Bathrooms < .5)]