# Acquire

The first step is to acquire the data from the Codeup database.

In [18]:
# Libraries and notebook-wide display settings

# Silence every warning for the rest of the session.
# NOTE(review): this blanket filter also hides deprecation notices; narrow it if one needs to surface.
import warnings
warnings.filterwarnings("ignore")

# Wrangling
import pandas as pd
import numpy as np

# Exploring
import scipy.stats as stats

# Visualizing (render matplotlib figures inline in the notebook)
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

# Display floats with thousands separators and two decimals, in a 20-character-wide field.
pd.options.display.float_format = '{:20,.2f}'.format

# Project-module imports, kept disabled in this cell:
# import acquire
# import summarize
# import prepare

In [19]:
import acquire

In [11]:
import os
import env

In [3]:
# TODO: Decide how to handle nulls and document the approach.
# TODO: Handle outliers and document the approach (add to README).

In [None]:
# NOTE(review): this string is a free-standing description, not a docstring attached to any function in this cell.
# Presumably it documents acquire.get_zillow_data — confirm, and consider moving it into that function.
'''
This function will allow the user to retrieve all tables from the Zillow database from the Codeup DB source. 
It will acquire the data, import it as a dataframe, and then write that dataframe to a .csv file in the local directory.
'''

In [5]:
# Query that builds one flat Zillow table:
#   - sq1: for each parcelid in predictions_2017, its latest transactiondate (aliased pid / tdate).
#   - sq2: the predictions_2017 rows (id, logerror) that match that parcel/date pair.
#   - properties_2017 is inner-joined to sq2 on parcelid, so only properties with a prediction row are returned.
#   - The *type lookup tables and unique_properties are LEFT JOINed, so a missing match leaves NULLs
#     instead of dropping the property.
#   - Rows without a latitude or longitude are excluded.
# SELECT * keeps every column from the joined sources, including sq2's id, pid and tdate.
# NOTE(review): this variable is not referenced by the other cells shown here; presumably
# acquire.get_zillow_data runs an equivalent query — confirm.
sql_query = '''
                SELECT * 
                    FROM properties_2017
                    JOIN (select id, logerror, pid, tdate FROM predictions_2017 pred_2017
                    JOIN (SELECT parcelid AS pid, Max(transactiondate) as tdate FROM predictions_2017 GROUP BY parcelid) AS sq1
                    ON (pred_2017.parcelid = sq1.pid AND pred_2017.transactiondate = sq1.tdate)) AS sq2
                    ON (properties_2017.parcelid = sq2.pid)
                    LEFT JOIN airconditioningtype USING (airconditioningtypeid)
                    LEFT JOIN architecturalstyletype USING (architecturalstyletypeid)
                    LEFT JOIN buildingclasstype USING (buildingclasstypeid)
                    LEFT JOIN heatingorsystemtype USING (heatingorsystemtypeid)
                    LEFT JOIN propertylandusetype USING (propertylandusetypeid)
                    LEFT JOIN storytype USING (storytypeid)
                    LEFT JOIN typeconstructiontype USING (typeconstructiontypeid)
                    LEFT JOIN unique_properties USING (parcelid)
                    WHERE latitude IS NOT NULL AND longitude IS NOT NULL;
                '''

In [12]:
# Load the Zillow data through the project's acquire module and preview the first rows.
# NOTE(review): get_zillow_data is defined outside this notebook; whether it queries the
# database or reads a cached .csv cannot be seen from here — confirm in acquire.py.

df = acquire.get_zillow_data()
df.head()

Unnamed: 0,parcelid,typeconstructiontypeid,storytypeid,propertylandusetypeid,heatingorsystemtypeid,buildingclasstypeid,architecturalstyletypeid,airconditioningtypeid,id,basementsqft,...,logerror,pid,tdate,airconditioningdesc,architecturalstyledesc,buildingclassdesc,heatingorsystemdesc,propertylandusedesc,storydesc,typeconstructiondesc
0,14297519,,,261.0,,,,,1727539,,...,0.03,14297519,2017-01-01,,,,,Single Family Residential,,
1,17052889,,,261.0,,,,,1387261,,...,0.06,17052889,2017-01-01,,,,,Single Family Residential,,
2,14186244,,,261.0,,,,,11677,,...,0.01,14186244,2017-01-01,,,,,Single Family Residential,,
3,12177905,,,261.0,2.0,,,,2288172,,...,-0.1,12177905,2017-01-01,,,,Central,Single Family Residential,,
4,10887214,,,266.0,2.0,,,1.0,1970746,,...,0.01,10887214,2017-01-01,Central,,,Central,Condominium,,


In [17]:
# Sanity check: the (rows, columns) shape of the dataframe should match the shape of
# the result set I built in MySQLPro.

df.shape

(77381, 70)

In [15]:
# Use .info() to see each column's dtype and non-null count.
# The non-null counts serve as a baseline to compare against the null counts
# when I get into the prep stage.

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 77381 entries, 0 to 77380
Data columns (total 70 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   parcelid                      77381 non-null  int64  
 1   typeconstructiontypeid        222 non-null    float64
 2   storytypeid                   50 non-null     float64
 3   propertylandusetypeid         77381 non-null  float64
 4   heatingorsystemtypeid         49440 non-null  float64
 5   buildingclasstypeid           15 non-null     float64
 6   architecturalstyletypeid      206 non-null    float64
 7   airconditioningtypeid         24953 non-null  float64
 8   id                            77381 non-null  int64  
 9   basementsqft                  50 non-null     float64
 10  bathroomcnt                   77381 non-null  float64
 11  bedroomcnt                    77381 non-null  float64
 12  buildingqualitytypeid         49672 non-null  float64
 13  c

In [16]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

Unnamed: 0,parcelid,typeconstructiontypeid,storytypeid,propertylandusetypeid,heatingorsystemtypeid,buildingclasstypeid,architecturalstyletypeid,airconditioningtypeid,id,basementsqft,...,structuretaxvaluedollarcnt,taxvaluedollarcnt,assessmentyear,landtaxvaluedollarcnt,taxamount,taxdelinquencyyear,censustractandblock,id.1,logerror,pid
count,77381.0,222.0,50.0,77381.0,49440.0,15.0,206.0,24953.0,77381.0,50.0,...,77269.0,77380.0,77381.0,77379.0,77376.0,2886.0,77137.0,77381.0,77381.0,77381.0
mean,13007150.51,6.04,7.0,261.83,3.92,3.93,7.39,1.81,1495138.9,679.72,...,189314.53,490134.48,2016.0,301095.41,5995.53,14.09,60496733646264.16,38848.69,0.02,13007150.51
std,3481345.65,0.56,0.0,5.14,3.59,0.26,2.73,2.97,860907.05,689.7,...,230087.42,653443.97,0.0,492596.03,7622.84,2.19,1535241981138.57,22402.43,0.17,3481345.65
min,10711855.0,4.0,7.0,31.0,1.0,3.0,2.0,1.0,349.0,38.0,...,44.0,1000.0,2016.0,161.0,19.92,3.0,60371011101000.0,0.0,-4.66,10711855.0
25%,11538305.0,6.0,7.0,261.0,2.0,4.0,7.0,1.0,752070.0,273.0,...,84265.0,207000.0,2016.0,85504.0,2715.63,14.0,60373109005002.0,19461.0,-0.02,11538305.0
50%,12531568.0,6.0,7.0,261.0,2.0,4.0,7.0,1.0,1497932.0,515.0,...,136499.0,358975.5,2016.0,203372.0,4450.69,15.0,60376032003008.0,38870.0,0.01,12531568.0
75%,14211831.0,6.0,7.0,266.0,7.0,4.0,7.0,1.0,2240535.0,796.5,...,218787.0,569001.5,2016.0,366796.5,6927.79,15.0,60590423251008.0,58253.0,0.04,14211831.0
max,167689317.0,13.0,7.0,275.0,24.0,4.0,21.0,13.0,2982274.0,3560.0,...,11421790.0,49061236.0,2016.0,48952198.0,586639.3,99.0,483030105084015.06,77613.0,5.26,167689317.0
