In [7]:
# render matplotlib figures inline in the notebook output
%matplotlib inline
import warnings
# NOTE(review): this silences every warning category for the rest of the session,
# so deprecation notices from the libraries imported below will not be shown either
warnings.filterwarnings("ignore")

# data manipulation and statistics
import pandas as pd
import numpy as np
import scipy.stats as stats

# visualization libraries
import matplotlib as mlp
mlp.rcParams['figure.dpi'] = 200  # default resolution for every figure created afterwards
from matplotlib import pyplot as plt
import seaborn as sns
sns.set()  # switch plots to seaborn's default theme

# project-local zillow modules and helper functions
import wrangle
from acquire import get_zillow_dataset, clean_zillow_dataset, zillow_outliers, train_validate_test_split
import prepare
from prepare import display_all

# sklearn: feature selection, linear regression, and data splitting
import sklearn
from sklearn.feature_selection import SelectKBest, RFE, f_regression
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

# database credentials / connection helper; in the cells shown here these are
# referenced only by the commented-out MySQL pull
import env
from env import user, password, host, get_connection

----
### ``Data Acquisition & Preparation:``

In [8]:
# initial query for MySQL zillow data (kept for reference; the data is now loaded via get_zillow_dataset())
# NOTE(review): predictions_2017 is joined on `id`, not `parcelid`. The resulting frame carries both
# `parcelid` and `parcelid.1` with different values on the same row, so `logerror` / `transactiondate`
# may be attached to the wrong property -- confirm whether USING (parcelid) was intended.
# NOTE(review): `transactiondate >= 2017` compares the transaction date with a bare number; a date
# literal such as '2017-01-01' would state the intent explicitly -- TODO confirm.
# query = '''
# SELECT *
#     FROM properties_2017
#         JOIN predictions_2017 USING (id)
#             JOIN propertylandusetype USING (propertylandusetypeid)
#                 WHERE transactiondate >= 2017
#                     AND propertylandusedesc = 'Single Family Residential' '''

In [9]:
# pulling the MySQL zillow data (disabled; needs `query` from the cell above and the credentials imported from env)
# url = get_connection(user, password, host, "zillow")
# df = pd.read_sql(query, url)

In [10]:
# creating a csv file to store the query result locally (disabled)
# NOTE(review): the target is an absolute, user-specific path; a path relative to the project would keep this portable
# df.to_csv("/Users/mijailmariano/codeup-data-science/regression-exercises/zillow_regression.csv", index = False)

In [11]:
# load the zillow data through the project helper imported from acquire
zillow_df = get_zillow_dataset()
# preview the first five rows; display_all comes from prepare and is used here so the wide frame is shown in full
display_all(zillow_df.head()) # checks out!

Unnamed: 0,propertylandusetypeid,id,parcelid,airconditioningtypeid,architecturalstyletypeid,basementsqft,bathroomcnt,bedroomcnt,buildingclasstypeid,buildingqualitytypeid,calculatedbathnbr,decktypeid,finishedfloor1squarefeet,calculatedfinishedsquarefeet,finishedsquarefeet12,finishedsquarefeet13,finishedsquarefeet15,finishedsquarefeet50,finishedsquarefeet6,fips,fireplacecnt,fullbathcnt,garagecarcnt,garagetotalsqft,hashottuborspa,heatingorsystemtypeid,latitude,longitude,lotsizesquarefeet,poolcnt,poolsizesum,pooltypeid10,pooltypeid2,pooltypeid7,propertycountylandusecode,propertyzoningdesc,rawcensustractandblock,regionidcity,regionidcounty,regionidneighborhood,regionidzip,roomcnt,storytypeid,threequarterbathnbr,typeconstructiontypeid,unitcnt,yardbuildingsqft17,yardbuildingsqft26,yearbuilt,numberofstories,fireplaceflag,structuretaxvaluedollarcnt,taxvaluedollarcnt,assessmentyear,landtaxvaluedollarcnt,taxamount,taxdelinquencyflag,taxdelinquencyyear,censustractandblock,parcelid.1,logerror,transactiondate,propertylandusedesc
0,261.0,941,17223154,,,,2.5,4.0,,,2.5,,990.0,2088.0,2088.0,,,990.0,,6111.0,1.0,2.0,2.0,445.0,,,34238617.0,-118813281.0,6692.0,,,,,,1111,,61110080.0,27110.0,2061.0,,97118.0,8.0,,1.0,,,484.0,,1990.0,2.0,,281174.0,562348.0,2016.0,281174.0,6405.58,,,61110080000000.0,14710129,-0.002189,2017-01-05,Single Family Residential
1,261.0,940,17223031,,,,3.0,4.0,,,3.0,,1376.0,2572.0,2572.0,,,1376.0,,6111.0,2.0,3.0,2.0,620.0,,,34235499.0,-118808366.0,10360.0,,,,,,1111,,61110080.0,27110.0,2061.0,,97118.0,8.0,,,,,474.0,,1994.0,2.0,,271103.0,444563.0,2016.0,173460.0,5151.84,,,61110080000000.0,12477465,0.053014,2017-01-05,Single Family Residential
2,261.0,939,17222931,,,,3.5,4.0,,,3.5,66.0,1643.0,3213.0,3213.0,,,1643.0,,6111.0,2.0,3.0,2.0,548.0,,,34243775.0,-118812925.0,11059.0,1.0,810.0,,,1.0,1111,,61110080.0,27110.0,2061.0,,97118.0,9.0,,1.0,,,1534.0,,1994.0,2.0,,397886.0,613950.0,2016.0,216064.0,7042.4,,,61110080000000.0,14608599,0.010948,2017-01-05,Single Family Residential
3,261.0,937,17222339,,,,2.0,3.0,,,2.0,,2170.0,2170.0,2170.0,,,2170.0,,6111.0,2.0,2.0,2.0,636.0,,,34244934.0,-118803182.0,10297.0,,,,,,1111,,61110080.0,27110.0,2061.0,,97118.0,7.0,,,,,1002.0,,1987.0,1.0,,311147.0,622294.0,2016.0,311147.0,7143.4,,,61110080000000.0,12473718,0.053193,2017-01-05,Single Family Residential
4,261.0,935,17222139,,,,2.0,2.0,,,2.0,,874.0,1457.0,1457.0,,,874.0,,6111.0,1.0,2.0,2.0,420.0,,,34247692.0,-118818283.0,8767.0,,,,,,1111,,61110080.0,27110.0,2061.0,,97118.0,4.0,,,,,168.0,,1986.0,2.0,,270000.0,538000.0,2016.0,268000.0,6153.12,,,61110080000000.0,14603860,0.080817,2017-01-05,Single Family Residential


In [12]:
# exploring the dataset further
# remember the (rows, columns) of the raw frame so the null-drop cell can report the before/after shapes
initial_shape = zillow_df.shape

In [13]:
zillow_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 56080 entries, 0 to 56079
Data columns (total 63 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   propertylandusetypeid         56080 non-null  float64
 1   id                            56080 non-null  int64  
 2   parcelid                      56080 non-null  int64  
 3   airconditioningtypeid         13651 non-null  float64
 4   architecturalstyletypeid      61 non-null     float64
 5   basementsqft                  43 non-null     float64
 6   bathroomcnt                   56080 non-null  float64
 7   bedroomcnt                    56080 non-null  float64
 8   buildingclasstypeid           0 non-null      float64
 9   buildingqualitytypeid         36991 non-null  float64
 10  calculatedbathnbr             55694 non-null  float64
 11  decktypeid                    360 non-null    float64
 12  finishedfloor1squarefeet      4757 non-null   float64
 13  c

In [14]:
# number of missing values in every column
# display_all (from prepare) stands in for the option_context form below -- presumably it lifts pandas' row/column display limits:
#     with pd.option_context('display.max_rows', None):
#         display(zillow_df.isnull().sum())

display_all(zillow_df.isna().sum())

propertylandusetypeid               0
id                                  0
parcelid                            0
airconditioningtypeid           42429
architecturalstyletypeid        56019
basementsqft                    56037
bathroomcnt                         0
bedroomcnt                          0
buildingclasstypeid             56080
buildingqualitytypeid           19089
calculatedbathnbr                 386
decktypeid                      55720
finishedfloor1squarefeet        51323
calculatedfinishedsquarefeet      231
finishedsquarefeet12              421
finishedsquarefeet13            56080
finishedsquarefeet15            56080
finishedsquarefeet50            51323
finishedsquarefeet6             55890
fips                                0
fireplacecnt                    48449
fullbathcnt                       386
garagecarcnt                    37833
garagetotalsqft                 37833
hashottuborspa                  54840
heatingorsystemtypeid           19004
latitude    

In [15]:
# fraction of missing values in each column, rounded to three decimals
display_all(zillow_df.isna().mean().round(3))

propertylandusetypeid           0.000
id                              0.000
parcelid                        0.000
airconditioningtypeid           0.757
architecturalstyletypeid        0.999
basementsqft                    0.999
bathroomcnt                     0.000
bedroomcnt                      0.000
buildingclasstypeid             1.000
buildingqualitytypeid           0.340
calculatedbathnbr               0.007
decktypeid                      0.994
finishedfloor1squarefeet        0.915
calculatedfinishedsquarefeet    0.004
finishedsquarefeet12            0.008
finishedsquarefeet13            1.000
finishedsquarefeet15            1.000
finishedsquarefeet50            0.915
finishedsquarefeet6             0.997
fips                            0.000
fireplacecnt                    0.864
fullbathcnt                     0.007
garagecarcnt                    0.675
garagetotalsqft                 0.675
hashottuborspa                  0.978
heatingorsystemtypeid           0.339
latitude    

In [16]:
# cleaning df: drop every feature whose share of NULL values exceeds the threshold

NULL_THRESHOLD = 0.2  # maximum tolerated fraction of missing values per column

# compute the null share of every column once, on the frame as it stands before any drop
null_share = zillow_df.isnull().mean()

# names of the columns above the threshold, kept in their original column order
dropped_cols = null_share[null_share > NULL_THRESHOLD].index.tolist()

# one drop call instead of copying the whole frame once per sparse column
zillow_df = zillow_df.drop(columns = dropped_cols)

# returning initial shape vs. null drop shape
print(f'initial df shape: {initial_shape}')
print(f'shape after null drop: {zillow_df.shape}')
print('dropped columns:', *dropped_cols, sep = '\n- ')

initial df shape: (56080, 63)
shape after null drop: (56080, 30)
dropped columns:
- airconditioningtypeid
- architecturalstyletypeid
- basementsqft
- buildingclasstypeid
- buildingqualitytypeid
- decktypeid
- finishedfloor1squarefeet
- finishedsquarefeet13
- finishedsquarefeet15
- finishedsquarefeet50
- finishedsquarefeet6
- fireplacecnt
- garagecarcnt
- garagetotalsqft
- hashottuborspa
- heatingorsystemtypeid
- poolcnt
- poolsizesum
- pooltypeid10
- pooltypeid2
- pooltypeid7
- propertyzoningdesc
- regionidneighborhood
- storytypeid
- threequarterbathnbr
- typeconstructiontypeid
- unitcnt
- yardbuildingsqft17
- yardbuildingsqft26
- numberofstories
- fireplaceflag
- taxdelinquencyflag
- taxdelinquencyyear


In [17]:
# list every column whose name contains "id"
# most (possibly all) of these are expected to be unnecessary, but they are checked before deciding

mask = zillow_df.columns.str.contains("id")
zillow_df.columns[mask]

Index(['propertylandusetypeid', 'id', 'parcelid', 'regionidcity',
       'regionidcounty', 'regionidzip', 'parcelid.1'],
      dtype='object')

In [18]:
# will drop column "parcelid.1" from the mysql predictions_2017 table as this does not appear to have any significance to my current zillow dataframe
# NOTE(review): the two ids differ on every row displayed below, and the commented-out acquisition query joins
# predictions_2017 USING (id) -- verify that logerror / transactiondate describe the same parcel before discarding this column
zillow_df[["parcelid", "parcelid.1"]] 

Unnamed: 0,parcelid,parcelid.1
0,17223154,14710129
1,17223031,12477465
2,17222931,14608599
3,17222339,12473718
4,17222139,14603860
...,...,...
56075,11040656,14445769
56076,11039751,12901433
56077,11038761,11087327
56078,11038661,10744723


In [19]:
# remove the second parcel id column, then confirm the new shape and preview the frame
zillow_df = zillow_df.drop("parcelid.1", axis = "columns")

print(zillow_df.shape)
zillow_df.head()

(56080, 29)


Unnamed: 0,propertylandusetypeid,id,parcelid,bathroomcnt,bedroomcnt,calculatedbathnbr,calculatedfinishedsquarefeet,finishedsquarefeet12,fips,fullbathcnt,...,yearbuilt,structuretaxvaluedollarcnt,taxvaluedollarcnt,assessmentyear,landtaxvaluedollarcnt,taxamount,censustractandblock,logerror,transactiondate,propertylandusedesc
0,261.0,941,17223154,2.5,4.0,2.5,2088.0,2088.0,6111.0,2.0,...,1990.0,281174.0,562348.0,2016.0,281174.0,6405.58,61110080000000.0,-0.002189,2017-01-05,Single Family Residential
1,261.0,940,17223031,3.0,4.0,3.0,2572.0,2572.0,6111.0,3.0,...,1994.0,271103.0,444563.0,2016.0,173460.0,5151.84,61110080000000.0,0.053014,2017-01-05,Single Family Residential
2,261.0,939,17222931,3.5,4.0,3.5,3213.0,3213.0,6111.0,3.0,...,1994.0,397886.0,613950.0,2016.0,216064.0,7042.4,61110080000000.0,0.010948,2017-01-05,Single Family Residential
3,261.0,937,17222339,2.0,3.0,2.0,2170.0,2170.0,6111.0,2.0,...,1987.0,311147.0,622294.0,2016.0,311147.0,7143.4,61110080000000.0,0.053193,2017-01-05,Single Family Residential
4,261.0,935,17222139,2.0,2.0,2.0,1457.0,1457.0,6111.0,2.0,...,1986.0,270000.0,538000.0,2016.0,268000.0,6153.12,61110080000000.0,0.080817,2017-01-05,Single Family Residential
