# Exercises

Using the Zillow database as the data source, create a Jupyter notebook named `explore_zillow` and do the following:

- Ask at least 5 questions about the data, keeping in mind that your target variable is logerror. e.g. Is logerror significantly different for properties in LA County vs Orange County vs Ventura County?

- Answer those questions through a mix of statistical tests and visualizations.

In [None]:
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

### Imports

In [None]:
import warnings
warnings.filterwarnings("ignore")

# Wrangling
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Statistical Tests
import scipy.stats as stats

# Visualizing
import matplotlib.pyplot as plt
from matplotlib import cm
import seaborn as sns
from sklearn.model_selection import learning_curve

pd.options.display.float_format = '{:20,.2f}'.format

import env
import explore
import acquire
import summarize
from summarize import df_summary

In [None]:
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

### Acquire/Wrangle Data

In [None]:
# Load the raw Zillow data through the project's acquire module
df = acquire.get_zillow_data()

# Preview the first rows
df.head()

In [None]:
# df_summary's result is unpacked into four objects: info, describe, nulls, value_count
info, describe, nulls, value_count = df_summary (df)

In [None]:
def handle_missing_values(df, prop_required_column = .5, prop_required_row = .70):
    """Drop sparse columns, then sparse rows, from ``df`` in place.

    A column is kept only if it has at least ``prop_required_column`` of the
    rows populated; afterwards a row is kept only if it has at least
    ``prop_required_row`` of the *remaining* columns populated.

    Args:
        df: DataFrame to prune. It is modified in place.
        prop_required_column: minimum proportion of non-null values a column
            needs in order to be kept.
        prop_required_row: minimum proportion of non-null values a row needs
            in order to be kept.

    Returns:
        The same ``df`` object that was passed in.
    """
    # Minimum number of non-null entries a column must have
    min_non_null_per_column = int(round(prop_required_column * df.shape[0], 0))
    df.dropna(axis='columns', thresh=min_non_null_per_column, inplace=True)

    # Computed after the column pass, so it is based on the surviving columns
    min_non_null_per_row = int(round(prop_required_row * df.shape[1], 0))
    df.dropna(axis='index', thresh=min_non_null_per_row, inplace=True)

    return df

In [None]:
def remove_columns(df, cols_to_remove):
    """Return a copy of ``df`` without the columns named in ``cols_to_remove``.

    Args:
        df: source DataFrame; it is not modified.
        cols_to_remove: list of column labels to drop.

    Returns:
        A new DataFrame lacking the listed columns.
    """
    return df.drop(labels=cols_to_remove, axis='columns')

In [None]:
# Function to read and wrangle data:

def wrangle_zillow():
    """Acquire the Zillow data and return a cleaned frame with no nulls.

    Steps, in order:
      1. Keep rows whose ``propertylandusetypeid`` is in the single-unit list.
      2. Keep rows with at least one bedroom and bathroom, a ``unitcnt`` of
         at most 1 (or missing), and more than 350 finished square feet.
      3. Drop sparse columns and rows via ``handle_missing_values`` (defaults).
      4. Add a ``county`` label derived from ``fips``.
      5. Drop columns that are not needed.
      6. Fill nulls in ``unitcnt``, ``heatingorsystemdesc``,
         ``lotsizesquarefeet`` and ``buildingqualitytypeid``.
      7. Remove outliers on tax value, finished square feet and lot size.
      8. Drop any row that still contains a null.

    Returns:
        pandas.DataFrame: the wrangled data.
    """
    df = acquire.get_zillow_data()

    # Restrict df to only properties that meet single unit use criteria
    single_use = [261, 262, 263, 264, 266, 268, 273, 276, 279]
    df = df[df.propertylandusetypeid.isin(single_use)]

    # Restrict df to only those properties with at least 1 bath & bed and 350 sqft area
    keep = (
        (df.bedroomcnt > 0)
        & (df.bathroomcnt > 0)
        & ((df.unitcnt <= 1) | df.unitcnt.isnull())
        & (df.calculatedfinishedsquarefeet > 350)
    )
    # .copy() gives an independent frame, so the in-place drops and column
    # assignments below never act on a slice of the acquired data.
    df = df[keep].copy()

    # Handle missing values i.e. drop columns and rows based on a threshold
    df = handle_missing_values(df)

    # Add column for counties (any fips other than 6037/6059 is labelled Ventura)
    df['county'] = np.where(df.fips == 6037, 'Los_Angeles',
                           np.where(df.fips == 6059, 'Orange',
                                   'Ventura'))

    # drop columns not needed
    df = remove_columns(df, [
        'id',
        'calculatedbathnbr', 'finishedsquarefeet12', 'fullbathcnt', 'heatingorsystemtypeid',
        'propertycountylandusecode', 'propertylandusetypeid', 'propertyzoningdesc',
        'censustractandblock', 'propertylandusedesc',
    ])

    # Fill nulls by assigning the result back to the column. The previous
    # `df.col.fillna(..., inplace=True)` form is chained assignment: it can
    # operate on a temporary Series and leave the frame untouched.

    # replace nulls in unitcnt with 1
    df['unitcnt'] = df['unitcnt'].fillna(1)

    # assume that since this is Southern CA, null means 'None' for heating system
    df['heatingorsystemdesc'] = df['heatingorsystemdesc'].fillna('None')

    # replace nulls with fixed values for select columns
    # (hard-coded; presumably medians computed earlier -- TODO confirm)
    df['lotsizesquarefeet'] = df['lotsizesquarefeet'].fillna(7313)
    df['buildingqualitytypeid'] = df['buildingqualitytypeid'].fillna(6.0)

    # Remove outliers. Each filter must be assigned back to df; the
    # square-footage and lot-size filters were previously evaluated and
    # discarded, so they had no effect.
    df = df[df.taxvaluedollarcnt < 5_000_000]
    df = df[df.calculatedfinishedsquarefeet < 8000]
    # One acre is 43,560 sq ft.
    # NOTE(review): the original line compared against 8000 while its comment
    # said "1 acre"; the acre value is used here -- confirm the intended cap.
    df = df[df.lotsizesquarefeet < 43_560]

    # Just to be sure we caught all nulls, drop them here
    df = df.dropna()

    return df

In [None]:
# Replace the raw frame with the wrangled one and preview it
df = wrangle_zillow()
df.head()

In [None]:
# Row and column counts after wrangling
df.shape

In [None]:
# Column dtypes and non-null counts after wrangling
df.info()

In [None]:
# Remove lot-size outliers using IQR fences: keep only rows strictly between
# q1 - 1.5 * IQR and q3 + 1.5 * IQR.
multiplier = 1.5

q1, q3 = df.lotsizesquarefeet.quantile([.25, .75])
iqr = q3 - q1

lower_bound = q1 - (multiplier * iqr)
upper_bound = q3 + (multiplier * iqr)

# Drop both the low and the high outliers in a single pass
within_fences = (df.lotsizesquarefeet > lower_bound) & (df.lotsizesquarefeet < upper_bound)
df = df[within_fences]
df.shape

In [None]:
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

### Prep Data 

In [None]:
# Columns treated as continuous variables
con_vars = [
    'transactiondate',
    'taxamount',
    'landtaxvaluedollarcnt',
    'taxvaluedollarcnt',
    'structuretaxvaluedollarcnt',
    'roomcnt',
    'lotsizesquarefeet',
    'longitude',
    'latitude',
    'fips',
    'calculatedfinishedsquarefeet',
    'bedroomcnt',
    'bathroomcnt',
]

In [None]:
def scale_my_data(train, validate, test):
    """Add standard-scaled copies of selected columns to each split.

    A ``StandardScaler`` is fit on ``train`` only and then used to transform
    all three frames. For every source column ``<name>`` a new column
    ``<name>_scaled`` is written onto the frame it came from; the source
    columns themselves are left as they are.

    Args:
        train: training split; the scaler is fit on this frame.
        validate: validation split.
        test: test split.

    Returns:
        The same three frames ``(train, validate, test)``, each with the
        ``*_scaled`` columns added in place.
    """
    features = [
        'roomcnt',
        'lotsizesquarefeet',
        'longitude',
        'latitude',
        'fips',
        'calculatedfinishedsquarefeet',
        'bedroomcnt',
        'bathroomcnt',
    ]
    scaled_names = [f'{name}_scaled' for name in features]

    # Fit on train only, so validate and test are scaled with train's statistics
    scaler = StandardScaler().fit(train[features])

    for frame in (train, validate, test):
        frame[scaled_names] = scaler.transform(frame[features])

    return train, validate, test

def prep_mall(df):
    '''
    Split df into train, validate and test with
    explore.train_validate_test_split (target 'logerror', seed 123), then
    add the scaled columns to each split with scale_my_data.

    Returns the df that was passed in, followed by the three splits.

    NOTE: despite the name, nothing in this function is specific to a mall
    dataset; the name is kept so existing calls continue to work.
    '''
    splits = explore.train_validate_test_split(df, target='logerror', seed=123)
    train, validate, test = scale_my_data(*splits)

    return df, train, validate, test

In [None]:
# Split and scale; prep_mall hands back df along with the three splits
df, train, validate, test = prep_mall(df)
train.head()

In [None]:
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

## Exploration

In [None]:
# Target column; the same name is supplied for both target roles
continuous_target = 'logerror'
categorical_target = 'logerror'

# Feature groupings handed to the explore helpers
# NOTE(review): wrangle_zillow maps fips to three counties, so it may not be
# strictly binary -- confirm how the explore helpers treat it.
binary_vars = ['fips']
quant_vars = ['calculatedfinishedsquarefeet', 'lotsizesquarefeet', 'buildingqualitytypeid']

In [None]:
# Univariate exploration, run on the training split only
explore.explore_univariate(train, binary_vars, quant_vars)

## Takeaways

- 63.17 percent of the properties are located in FIPS 6037 (Los Angeles County).
- The mean calculated finished square footage is 1,737.22 sq ft.
- With major outliers removed, the mean lot size is 7,169.83 sq ft, which is roughly 0.16 of an acre.

In [None]:
# Bivariate exploration on the training split; 'logerror' is passed as both
# the categorical and the continuous target
explore.explore_bivariate(train, categorical_target, continuous_target, binary_vars, quant_vars)

### Question 1

### Question 2

### Question 3

### Question 4

### Question 5

## Bonus:

Compute the mean(logerror) by zipcode and the overall mean(logerror). Write a loop that will run a t-test between the overall mean and the mean for each zip code. We want to identify the zip codes where the error is significantly higher or lower than the expected error. 