# Preprocessing

In [1]:
import os
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

import warnings
# Silence every warning for the rest of the session
# NOTE(review): this also hides pandas warnings raised by the functions below - confirm that is intended
warnings.filterwarnings('ignore')

# Directory containing the raw input CSV files
PATH = 'Data/Raw/'

## Features and Outliers

In [2]:
def read_raw(filepath):
    ''' Return Pandas DataFrames of raw training and testing sets

        filepath :: directory holding 'train.csv' and 'test.csv'
                    (a trailing path separator is optional)
        Return   :: train, test - both without the 'Id' column and
                    with a fresh 0..n-1 index
    '''

    def _load(name):
        # os.path.join works whether or not the directory ends with a separator;
        # plain string concatenation would produce e.g. 'Data/Rawtrain.csv'
        frame = pd.read_csv(os.path.join(filepath, name))
        return frame.drop('Id', axis=1).reset_index(drop=True)

    # Read raw csv
    train = _load('train.csv')
    test = _load('test.csv')
    return(train, test)

In [3]:
def use_cols(df, frac=0.1):
    ''' Select the columns to use for modeling and exploration
        A column is kept when its share of missing observations is at most 'frac'
        Return :: list of kept column names, in their original order
    '''

    # Share of null entries per column
    null_share = df.isnull().sum() / df.shape[0]

    # Keep the columns that stay within the allowed share
    keep = [name for name, share in null_share.items() if share <= frac]
    return(keep)

In [4]:
def drop_outliers(df):
    ''' Returns Pandas DataFrame without 2 possible outliers

        The outliers are the two largest 'SalePrice' values of 'df'. Every row
        whose price is at or above the smaller of those two is removed, so rows
        tied with the second-largest price are dropped as well.

        df     :: DataFrame with a 'SalePrice' column (not modified)
        Return :: filtered copy with a fresh 0..n-1 index
    '''
    # Nothing to rank in an empty frame
    if df.empty:
        return(df.reset_index(drop=True))

    # Work on the frame that was passed in instead of re-reading a fixed csv,
    # so earlier preprocessing of 'df' (e.g. the dropped 'Id' column) is kept
    outliers = np.sort(df['SalePrice'])[-2:]
    new_train = df[df['SalePrice'] < np.min(outliers)].reset_index(drop=True)

    return(new_train)

In [5]:
def drop_missing(df, frac=0.1):
    ''' Keep only the features whose share of missing values is at most 'frac'
        The column selection is delegated to use_cols
        Return :: DataFrame restricted to the kept columns
    '''
    return(df[use_cols(df, frac=frac)])

In [6]:
def drop_data(PATH, frac=0, outliers=False):
    ''' Return :: train, test
        Drop outliers and features with missing observations

        PATH     :: location of the raw csv files, passed to read_raw
        frac     :: largest share of missing values a feature may have
                    (measured over train and test together) to be kept
        outliers :: when truthy, the training rows flagged by drop_outliers
                    are removed first
    '''

    # Read Data
    train, test = read_raw(PATH)

    # Drop Outliers
    if outliers:
        train = drop_outliers(train)

    # Screen features on train + test together so both keep the same columns
    df_full = pd.concat([train.drop('SalePrice', axis=1), test], axis=0)

    # Drop Missing Features
    df = drop_missing(df_full, frac=frac)

    # Split Data
    # Explicit copies: adding a column to a bare .iloc slice is chained
    # assignment, which pandas only tolerates with a (here silenced) warning
    length = train.shape[0]
    df_train = df.iloc[:length, :].copy()
    # The first 'length' rows are the training rows in their original order,
    # so the target can be attached by position
    df_train['SalePrice'] = train['SalePrice'].to_numpy()

    df_test = df.iloc[length:, :].copy()

    return(df_train, df_test)

## Missing Values

__Used for Analysis__

In [7]:
def get_missing_features(df):
    ''' Count the null entries of every column and keep the columns that have any
        Return :: Series indexed by column name holding the null counts
    '''
    null_counts = df.isnull().sum()
    has_nulls = null_counts > 0
    return(null_counts[has_nulls])

In [8]:
def get_missing_dataframe(df):
    ''' Tabulate the null counts of the features that have missing values
        Return :: one-column DataFrame ('Missing'), sorted ascending by count
    '''
    counts = get_missing_features(df)
    table = pd.DataFrame(counts, columns=['Missing'])
    return(table.sort_values('Missing'))

__Impute Functions__

In [9]:
def impute_garage(df):
    ''' Fill missing garage features in place and return the same DataFrame

        'GarageCars' and 'GarageArea' are filled with 0, the remaining garage
        features with the string 'None'. Columns that are not present in 'df'
        (e.g. already removed as too sparse) are skipped.
    '''
    # Single missing values
    for col in ('GarageCars', 'GarageArea'):
        if col in df.columns:
            # Assign the result back: fillna(inplace=True) on df[col] acts on a
            # temporary object and does not reach df under pandas Copy-on-Write
            df[col] = df[col].fillna(0)

    # Many missing values
    # NOTE(review): 'GarageYrBlt' looks numeric; filling it with the string
    # 'None' leaves a mixed-type column - confirm this is intended
    features_garage2 = ['GarageType', 'GarageCond', 'GarageYrBlt', 'GarageFinish', 'GarageQual']
    present = [col for col in features_garage2 if col in df.columns]
    if present:
        df[present] = df[present].fillna('None')

    return(df)

In [10]:
def impute_bsmt(df):
    ''' Fill missing basement features in place and return the same DataFrame

        Numeric basement features are filled with 0, categorical ones with the
        string 'None'. Columns that are not present in 'df' (e.g. already
        removed as too sparse) are skipped instead of raising a KeyError.
    '''
    features_num = ['BsmtFinSF1', 'TotalBsmtSF', 'BsmtUnfSF', 'BsmtFinSF2', 'BsmtFullBath', 'BsmtHalfBath']
    features_cat = ['BsmtFinType1', 'BsmtFinType2', 'BsmtQual', 'BsmtExposure', 'BsmtCond']

    for features, fill in ((features_num, 0), (features_cat, 'None')):
        # Restrict to the columns that survived earlier feature dropping
        present = [col for col in features if col in df.columns]
        if present:
            df[present] = df[present].fillna(fill)
    return(df)

In [11]:
def impute_others(train, test):
    ''' Fill the remaining missing values in place

        'MasVnrArea' is filled with 0. Each listed categorical feature is
        filled, in both sets, with the most frequent value of the TRAINING set
        (the first one in sorted order when several values tie).
        Features absent from a frame, or with no observed value in the
        training set, are left untouched.

        Return :: train, test (the same objects that were passed in)
    '''
    # Replace with 0
    for frame in (train, test):
        if 'MasVnrArea' in frame.columns:
            # Assign back instead of fillna(inplace=True) on a column selection,
            # which does not reach the frame under pandas Copy-on-Write
            frame['MasVnrArea'] = frame['MasVnrArea'].fillna(0)

    missing = ['Exterior1st', 'Exterior2nd','Electrical', 'KitchenQual', 'SaleType', 'Utilities', 
               'Functional', 'MSZoning', 'MasVnrType']
    for m in missing:
        if m not in train.columns:
            continue

        # mode() has to be CALLED; passing the bound method itself would put
        # the method object into the cells instead of a category
        modes = train[m].mode()
        if modes.empty:
            continue
        fill = modes.iloc[0]

        train[m] = train[m].fillna(fill)
        if m in test.columns:
            test[m] = test[m].fillna(fill)
    return(train, test)

In [12]:
def impute_df(train, test):
    ''' Run every imputation step on the training and testing sets
        Return :: train, test
    '''
    # Garage and basement features are imputed the same way for both sets
    train = impute_bsmt(impute_garage(train))
    test = impute_bsmt(impute_garage(test))

    # The remaining features are handled jointly by impute_others
    train, test = impute_others(train, test)
    return(train, test)

### Preprocess DataSet

In [13]:
def preprocess(PATH, frac, outliers):
    ''' Return :: train, test
        Full preprocessing: drop outliers / sparse features, then impute

        PATH     :: location of the raw csv files
        frac     :: largest share of missing values a feature may have to be kept
        outliers :: passed to drop_data; when True the flagged training rows
                    are removed
    '''
    # Get DataFrame
    train, test = drop_data(PATH, frac=frac, outliers=outliers)

    # Get Missing
    # (the previously built, never used 'salePrice' / 'df_full' locals are gone;
    #  they only cost an extra concat of the whole data)
    df_train, df_test = impute_df(train, test)
    return(df_train, df_test)

In [14]:
# Build DataFrames
PATH = 'Data/Raw/'

# outliers=True  -> drop_data removes the rows flagged by drop_outliers
# outliers=False -> every training row is kept, so the '*_outliers' frames
#                   are the ones that still contain the outliers
train, test = preprocess(PATH, frac=0.1, outliers=True)
train_outliers, test_outliers = preprocess(PATH, frac=0.1, outliers=False)

In [15]:
# Save to CSV
# index=False keeps the row index out of the written files
# NOTE(review): only the frames that keep the outliers are written in this cell;
# 'train' / 'test' (outliers removed) are not saved here - confirm that is intended
train_outliers.to_csv('Data/Inputs/train_outliers.csv', index=False)
test_outliers.to_csv('Data/Inputs/test_outliers.csv', index=False)