# Explore Exercises
1. Ask at least 5 questions about the data, keeping in mind that your target variable is logerror. For example: is logerror significantly different for properties in LA County vs. Orange County vs. Ventura County?

2. Answer those questions through a mix of statistical tests and visualizations.

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import scipy.stats as stats

import acquire
import wrangle

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

import warnings
# Silence every warning for the rest of the notebook.
# NOTE(review): this also hides deprecation and numerical warnings from
# pandas/scipy/sklearn — consider narrowing it to specific categories.
warnings.filterwarnings('ignore')

In [2]:
# Load the Zillow data through the project's acquire module.
df = acquire.get_zillow()

In [3]:
# Preview the first five rows to check the columns that were loaded.
df.head()

Unnamed: 0,id,parcelid,logerror,transactiondate,id.1,parcelid.1,airconditioningtypeid,architecturalstyletypeid,basementsqft,bathroomcnt,...,buildingclasstypeid,buildingclassdesc,heatingorsystemtypeid,heatingorsystemdesc,propertylandusetypeid,propertylandusedesc,storytypeid,storydesc,typeconstructiontypeid,typeconstructiondesc
0,0,14297519,0.025595,2017-01-01,1727539,14297519,,,,3.5,...,,,,,261,Single Family Residential,,,,
1,1,17052889,0.055619,2017-01-01,1387261,17052889,,,,1.0,...,,,,,261,Single Family Residential,,,,
2,2,14186244,0.005383,2017-01-01,11677,14186244,,,,2.0,...,,,,,261,Single Family Residential,,,,
3,3,12177905,-0.10341,2017-01-01,2288172,12177905,,,,3.0,...,,,2.0,Central,261,Single Family Residential,,,,
4,4,10887214,0.00694,2017-01-01,1970746,10887214,1.0,,,3.0,...,,,2.0,Central,266,Condominium,,,,


In [4]:
def zillow_split(df, target, train_validate_size=0.8, train_size=0.7, random_state=123):
    '''
    Split a Zillow DataFrame into train, validate and test sets, then separate
    each set into its features (X) and its target column (y).

    Parameters
    ----------
    df : DataFrame to split (e.g. the result of acquire.get_zillow()).
    target : name of the target column, e.g. 'logerror'.
    train_validate_size : fraction of df kept for train + validate; the rest
        becomes test. Defaults to 0.8.
    train_size : fraction of the train + validate portion kept for train; the
        rest becomes validate. Defaults to 0.7.
    random_state : seed passed to both train_test_split calls so the split is
        reproducible. Defaults to 123.

    Returns
    -------
    train, validate, test, X_train, y_train, X_validate, y_validate, X_test, y_test

    Raises
    ------
    KeyError if target is not a column of df.

    Side effect: prints the shape of train, validate and test.
    '''
    # Fail before doing any splitting work when the target column is missing
    if target not in df.columns:
        raise KeyError(f'target column {target!r} not found in df')

    # First carve off the test set, then divide the remainder into train/validate
    train_validate, test = train_test_split(
        df, train_size=train_validate_size, random_state=random_state
    )
    train, validate = train_test_split(
        train_validate, train_size=train_size, random_state=random_state
    )

    def _split_xy(subset):
        # Features are every column except the target; y is the target column
        return subset.drop(columns=[target]), subset[target]

    X_train, y_train = _split_xy(train)
    X_validate, y_validate = _split_xy(validate)
    X_test, y_test = _split_xy(test)

    # Report the size of each dataset
    print(f'train -> {train.shape}')
    print(f'validate -> {validate.shape}')
    print(f'test -> {test.shape}')

    return train, validate, test, X_train, y_train, X_validate, y_validate, X_test, y_test

In [5]:
# Split df into train/validate/test and their X/y pieces, with logerror as the target.
# NOTE(review): this calls wrangle.zillow_split, not the zillow_split defined in the
# previous cell — confirm the two are identical, or drop the local copy.
train, validate, test, X_train, y_train, X_validate, y_validate, X_test, y_test = wrangle.zillow_split(df, 'logerror')

train -> (43444, 77)
validate -> (18620, 77)
test -> (15516, 77)


In [16]:
# Pairwise correlation matrix of the numeric columns in train.
# numeric_only=True is explicit because pandas 2.0 changed the default to False,
# which makes a bare .corr() raise on text columns such as propertylandusedesc.
zillow_correlation = train.corr(numeric_only=True)

In [17]:
# Correlation of each column with logerror, sorted from most positive to most negative
logerror_corr = zillow_correlation['logerror'].sort_values(ascending=False)
logerror_corr

logerror                        1.000000
basementsqft                    0.456891
buildingclasstypeid             0.391056
buildingclasstypeid             0.391056
yardbuildingsqft26              0.175675
poolsizesum                     0.048855
finishedsquarefeet12            0.043762
calculatedfinishedsquarefeet    0.039371
garagetotalsqft                 0.035306
bedroomcnt                      0.034465
garagecarcnt                    0.030147
calculatedbathnbr               0.029246
fullbathcnt                     0.027912
bathroomcnt                     0.026313
fireplacecnt                    0.024381
finishedsquarefeet6             0.019666
roomcnt                         0.017292
lotsizesquarefeet               0.013636
threequarterbathnbr             0.013482
longitude                       0.011826
parcelid                        0.010303
parcelid                        0.010303
finishedsquarefeet13            0.007681
structuretaxvaluedollarcnt      0.007351
censustractandbl