In [1]:
import numpy as np
import pandas as pd

from wrangle import *
from env import *

In [108]:
sql_db = "zillow"
query = "SELECT * FROM properties_2017 JOIN predictions_2017 USING(parcelid) WHERE (`propertylandusetypeid` = 261) & (YEAR(`transactiondate`) = 2017);"
df = get_data(sql_db,query)

Reading CSV


In [91]:
def summary(df):
    # saving output of .isnull().sum() to a DataFrame
    null_count = df.isnull().sum().reset_index(name='count')
    
    # setting a name to columns in new DataFrame
    null_count.columns = ['column_name', 'num_rows_missing']

    # total count of nulls in DataFrame
    total_count = df.shape[0]
    
    # creating column 'pct_row_missing' with calculated percent value 
    # using null_count and total_count
    null_count['pct_rows_missing'] = (null_count['num_rows_missing']/total_count)
    
    #return null_count[null_count['num_rows_missing']!=0]
    return null_count

In [93]:
summarized_df = summary(df)
summarized_df.sort_values(by=['pct_rows_missing'],ascending=False)

Unnamed: 0,column_name,num_rows_missing,pct_rows_missing
14,finishedsquarefeet13,52441,1.000000
7,buildingclasstypeid,52441,1.000000
15,finishedsquarefeet15,52441,1.000000
4,basementsqft,52394,0.999104
42,storytypeid,52394,0.999104
...,...,...,...
25,latitude,0,0.000000
18,fips,0,0.000000
6,bedroomcnt,0,0.000000
5,bathroomcnt,0,0.000000


In [94]:
def separate_columns_list(df):
    '''
        Creates 2 lists separating continous & discrete
        variables.
        
        Parameters
        ----------
        df : Pandas DataFrame
            The DataFrame from which columns will be sorted.
        
        Returns
        ----------
        continuous_columns : list
            Columns in DataFrame with numerical values.
        discrete_columns : list
            Columns in DataFrame with categorical values.
    '''
    continuous_columns = []
    discrete_columns = []
    
    for column in df.columns:
        if (df[column].dtype == 'int' or df[column].dtype == 'float') and ('id' not in column):
            continuous_columns.append(column)
        elif(df[column].dtype == 'int' or df[column].dtype == 'float') and ('id' in column):
            discrete_columns.append(column)
        else:
            discrete_columns.append(column)

    return continuous_columns, discrete_columns

In [95]:
continuous_columns, discrete_columns = separate_columns_list(df)

In [96]:
df['regionidneighborhood'].isnull().mean()

0.6370587898781488

In [97]:
df['regionidneighborhood'].notnull().mean()

0.3629412101218512

In [98]:
def handle_missing_values(df, prop_required_column, prop_required_row):
    '''
        Drops columns and rows in a Pandas DataFrame if they have a proportion
        of null values less than the required cutoffs.

        Parameters
        ----------
        df : Pandas DataFrame
            The DataFrame from which null values are to be dropped.
        prop_required_column : float
            The proportion of null values required for a column to be dropped.
        row_cutoff : float
            The proportion of null values required for a row to be dropped.

        Returns
        ----------
        df : Pandas DataFrame
            The DataFrame with columns and rows with null values dropped.
    '''
    
    # Drop columns with a proportion of null values greater than col_cutoff
    cols_to_drop = [col for col in df.columns if df[col].notnull().mean() < prop_required_column]
    df.drop(columns=cols_to_drop, inplace=True)

    # Drop rows with a proportion of null values greater than row_cutoff
    rows_to_drop = [row for row in df.index if df.loc[row].notnull().mean() < prop_required_row]
    df.drop(index=rows_to_drop, inplace=True)

    # Return the new DataFrame
    return df

In [99]:
df_handled = handle_missing_values(df,.65,.97)

In [100]:
summary(df_handled)

Unnamed: 0,column_name,num_rows_missing,pct_rows_missing
0,parcelid,0,0.0
1,id,0,0.0
2,bathroomcnt,0,0.0
3,bedroomcnt,0,0.0
4,calculatedbathnbr,0,0.0
5,calculatedfinishedsquarefeet,0,0.0
6,finishedsquarefeet12,0,0.0
7,fips,0,0.0
8,fullbathcnt,0,0.0
9,latitude,0,0.0


In [124]:
df_handled.isnull().sum()

parcelid                        0
id                              0
bathroomcnt                     0
bedroomcnt                      0
calculatedbathnbr               0
calculatedfinishedsquarefeet    0
finishedsquarefeet12            0
fips                            0
fullbathcnt                     0
latitude                        0
longitude                       0
lotsizesquarefeet               0
propertycountylandusecode       0
propertylandusetypeid           0
rawcensustractandblock          0
regionidcity                    0
regionidcounty                  0
regionidzip                     0
roomcnt                         0
yearbuilt                       0
structuretaxvaluedollarcnt      0
taxvaluedollarcnt               0
assessmentyear                  0
landtaxvaluedollarcnt           0
taxamount                       0
censustractandblock             0
id.1                            0
logerror                        0
transactiondate                 0
dtype: int64

In [11]:
def handle_outliers(df):
    return 0

In [129]:
def all_outliers(df):

    # calculate interquartile range
    Q1 = df.quantile(0.25)
    Q3 = df.quantile(0.75)
    IQR = Q3 - Q1

    # detect outliers
    outliers = (df < (Q1 - 1.5 * IQR)) | (df > (Q3 + 1.5 * IQR))

    return outliers

In [130]:
outliers = all_outliers(df_handled)

  outliers = (df < (Q1 - 1.5 * IQR)) | (df > (Q3 + 1.5 * IQR))


In [142]:
outliers_series = outliers.sum()
print(outliers_series[outliers_series > 0].sort_values(ascending=False))

logerror                        6167
lotsizesquarefeet               5414
censustractandblock             4331
rawcensustractandblock          4330
fips                            4330
structuretaxvaluedollarcnt      3815
taxamount                       3698
taxvaluedollarcnt               3441
landtaxvaluedollarcnt           2863
calculatedfinishedsquarefeet    2237
finishedsquarefeet12            2237
regionidcity                    1618
bathroomcnt                     1456
calculatedbathnbr               1456
longitude                       1436
fullbathcnt                     1434
bedroomcnt                      1298
yearbuilt                        279
latitude                          35
roomcnt                           15
regionidzip                       11
parcelid                           5
dtype: int64


In [143]:
def upper_outlier_detector(dataframe,column,k=1.5):
    q1,q3 = dataframe[column].quantile([0.25,0.75])
    iqr = q3-q1
    upper_bound = q3+k*iqr
    
    return np.where(dataframe[column]>upper_bound,1,0)

In [161]:
def lower_outlier_detector(dataframe,column,k=1.5):
    q1,q3 = dataframe[column].quantile([0.25,0.75])
    iqr = q3-q1
    
    lower_bound=q1-k*iqr

    return np.where(dataframe[column]<lower_bound,1,0)

# Test Upper Outliers

In [145]:
upper_outlier_detector(df_handled,'bathroomcnt')

array([0, 0, 0, ..., 0, 0, 0])

In [152]:
test = pd.DataFrame()

In [154]:
my_list = ['roomcnt','bathroomcnt']
for col in my_list:
    test[f'{col}_upper_outliers']=upper_outlier_detector(df_handled,col)
test.head()

Unnamed: 0,roomcnt_upper_outliers,bathroomcnt_upper_outliers
0,0,0
1,0,0
2,0,0
3,0,0
4,0,0


In [157]:
test.sum()

roomcnt_upper_outliers          15
bathroomcnt_upper_outliers    1456
dtype: int64

# Testing Lower Outliers

In [162]:
lower_outlier_detector(df_handled,'bathroomcnt')

array([0, 0, 0, ..., 0, 0, 0])

In [163]:
lower_outlier_test = pd.DataFrame()

In [165]:
my_list = ['roomcnt','bathroomcnt']
for col in my_list:
    test[f'{col}_upper_outliers']=lower_outlier_detector(df_handled,col)
lower_outlier_test.head()

In [166]:
lower_outlier_test.sum()

Series([], dtype: float64)

<hr style="border:2px solid gray">

# E X E R C I S E S

# Zillow

## Acquire

#### 1. Acquire

#### 1. Acquire

## Acquire

#### 1. Acquire

#### 1. Acquire