## Data Cleaning for Home Mortgage Dataset

### Load Raw Dataset

In [1]:
# import dependencies
import pandas as pd
from path import Path 
import numpy as np 
import os
from random import randint

In [3]:
# locate the raw CSV relative to the notebook's working directory
path = Path('./data/raw_data.csv')
# low_memory=False parses the file in one pass instead of in chunks, which
# avoids mixed-type inference within a column (at the cost of more memory)
df = pd.read_csv(filepath_or_buffer=path, low_memory=False)


In [4]:
# preview the first five rows to sanity-check the load
df.head(n=5)

Unnamed: 0,as_of_year,respondent_id,agency_name,agency_abbr,agency_code,loan_type_name,loan_type,property_type_name,property_type,loan_purpose_name,...,edit_status_name,edit_status,sequence_number,population,minority_population,hud_median_family_income,tract_to_msamd_income,number_of_owner_occupied_units,number_of_1_to_4_family_units,application_date_indicator
0,2017,0000451965,Consumer Financial Protection Bureau,CFPB,9,Conventional,1,One-to-four family dwelling (other than manufa...,1,Home improvement,...,preprocessing,1,299322036995,4824.0,37.23,75200.0,57.419998,818.0,1626.0,
1,2017,0000146672,Consumer Financial Protection Bureau,CFPB,9,Conventional,1,One-to-four family dwelling (other than manufa...,1,Home purchase,...,preprocessing,1,750442599848,7404.0,57.52,63200.0,116.010002,1215.0,1743.0,
2,2017,0000451965,Consumer Financial Protection Bureau,CFPB,9,Conventional,1,One-to-four family dwelling (other than manufa...,1,Home purchase,...,preprocessing,1,589059911788,3372.0,33.189999,97400.0,141.740005,592.0,1105.0,
3,2017,13-6131491,Department of Housing and Urban Development,HUD,7,Conventional,1,One-to-four family dwelling (other than manufa...,1,Refinancing,...,preprocessing,1,757438893415,8787.0,65.129997,97400.0,97.269997,1463.0,2164.0,
4,2017,33-0419992,Department of Housing and Urban Development,HUD,7,FHA-insured,2,One-to-four family dwelling (other than manufa...,1,Refinancing,...,preprocessing,1,126735412706,5356.0,23.1,75200.0,126.690002,1711.0,2102.0,


#### Extract all columns

In [5]:
# keep a handle on the column index, then report the frame's dimensions
# followed by the full list of column names
df_columns = df.columns
print(
    f'Row Count: {len(df)} | Column Count: {len(df_columns)}',
    df_columns,
    sep='\n',
)


Row Count: 1709502 | Column Count: 78
Index(['as_of_year', 'respondent_id', 'agency_name', 'agency_abbr',
       'agency_code', 'loan_type_name', 'loan_type', 'property_type_name',
       'property_type', 'loan_purpose_name', 'loan_purpose',
       'owner_occupancy_name', 'owner_occupancy', 'loan_amount_000s',
       'preapproval_name', 'preapproval', 'action_taken_name', 'action_taken',
       'msamd_name', 'msamd', 'state_name', 'state_abbr', 'state_code',
       'county_name', 'county_code', 'census_tract_number',
       'applicant_ethnicity_name', 'applicant_ethnicity',
       'co_applicant_ethnicity_name', 'co_applicant_ethnicity',
       'applicant_race_name_1', 'applicant_race_1', 'applicant_race_name_2',
       'applicant_race_2', 'applicant_race_name_3', 'applicant_race_3',
       'applicant_race_name_4', 'applicant_race_4', 'applicant_race_name_5',
       'applicant_race_5', 'co_applicant_race_name_1', 'co_applicant_race_1',
       'co_applicant_race_name_2', 'co_applicant_ra

#### Group the dataset's columns by type

In [6]:
# Each list below names the columns belonging to one logical group.
# Many fields come as a pair: a `*_name` text label followed by its numeric
# code column (e.g. 'loan_type_name' / 'loan_type'), so pairs share a line.

# tracking columns (constant / float values)
tracking_columns = [
    'edit_status_name', 'edit_status',
    'sequence_number',
]

# demographic columns (float values)
demographics_columns = [
    'population',
    'minority_population',
    'hud_median_family_income',
    'tract_to_msamd_income',
    'number_of_owner_occupied_units',
    'number_of_1_to_4_family_units',
]

# applicant sex columns (categorical: text label + numeric code)
applicant_sex_columns = [
    'applicant_sex_name', 'applicant_sex',
    'co_applicant_sex_name', 'co_applicant_sex',
]

# race / ethnicity columns (categorical: text label + numeric code)
race_ethnicity_columns = [
    'applicant_ethnicity_name', 'applicant_ethnicity',
    'co_applicant_ethnicity_name', 'co_applicant_ethnicity',
    'applicant_race_name_1', 'applicant_race_1',
    'applicant_race_name_2', 'applicant_race_2',
    'applicant_race_name_3', 'applicant_race_3',
    'applicant_race_name_4', 'applicant_race_4',
    'applicant_race_name_5', 'applicant_race_5',
    'co_applicant_race_name_1', 'co_applicant_race_1',
    'co_applicant_race_name_2', 'co_applicant_race_2',
    'co_applicant_race_name_3', 'co_applicant_race_3',
    'co_applicant_race_name_4', 'co_applicant_race_4',
    'co_applicant_race_name_5', 'co_applicant_race_5',
]

# loan info: monetary columns (float values)
monetary_columns = [
    'loan_amount_000s',
    'applicant_income_000s',
    'rate_spread',
]

# loan info: categorical columns
loan_info_columns = [
    'loan_type_name', 'loan_type',
    'property_type_name', 'property_type',
    'loan_purpose_name', 'loan_purpose',
    'owner_occupancy_name', 'owner_occupancy',
    'purchaser_type_name', 'purchaser_type',
    'hoepa_status_name', 'hoepa_status',
    'lien_status_name', 'lien_status',
]

# pre-approval and approval (action taken) columns
preapproval_columns = ['preapproval_name', 'preapproval']
approval_columns = ['action_taken_name', 'action_taken']

# denial reason columns
denial_reason_columns = [
    'denial_reason_name_1', 'denial_reason_1',
    'denial_reason_name_2', 'denial_reason_2',
    'denial_reason_name_3', 'denial_reason_3',
]

# date / time / location columns
date_time_loc_columns = [
    'as_of_year',
    'respondent_id',
    'agency_name', 'agency_abbr', 'agency_code',
    'msamd_name', 'msamd',
    'state_name', 'state_abbr', 'state_code',
    'county_name', 'county_code',
    'census_tract_number',
    'application_date_indicator',
]


In [7]:
# compile all column types into column_types_dict
# map each group label to its list of column names so the groups can be
# iterated over and counted together
column_types_dict = {
    'tracking_columns': tracking_columns,
    'demographics_columns': demographics_columns,
    'applicant_sex_columns': applicant_sex_columns,
    'race_ethnicity_columns': race_ethnicity_columns,
    'monetary_columns': monetary_columns,
    'loan_info_columns': loan_info_columns,
    'preapproval_columns': preapproval_columns,
    'approval_columns': approval_columns,
    'denial_reason_columns': denial_reason_columns,
    'date_time_loc_columns': date_time_loc_columns,
}

In [8]:
# total up how many column names have been assigned across all groups
def count_column_types(column_dict):
    """Return the combined length of every value stored in ``column_dict``.

    Args:
        column_dict: mapping whose values are sized collections
            (in this notebook, lists of column names).

    Returns:
        The sum of ``len(value)`` over all values; 0 for an empty mapping.
    """
    return sum(len(names) for names in column_dict.values())

In [9]:
# summarize the grouping: total names assigned, group labels, number of groups
print(f'Total Column Count: {count_column_types(column_types_dict)}')
print(f'Column Types: {list(column_types_dict.keys())}')
print(f'Column Type Count: {len(column_types_dict.values())}')

# A matching total alone cannot prove that no column was lost: a misspelled
# name still counts as one entry, and a duplicate can offset a missing column.
# Compare the actual names against the DataFrame so any mismatch fails here.
grouped_columns = [name for names in column_types_dict.values() for name in names]
missing_columns = sorted(set(df.columns) - set(grouped_columns))
unknown_columns = sorted(set(grouped_columns) - set(df.columns))
duplicate_columns = sorted(
    name for name in set(grouped_columns) if grouped_columns.count(name) > 1
)
assert not missing_columns, f'Columns not assigned to any group: {missing_columns}'
assert not unknown_columns, f'Grouped names that are not columns of df: {unknown_columns}'
assert not duplicate_columns, f'Columns assigned to more than one group: {duplicate_columns}'

# All good!

Total Column Count: 78
Column Types: ['tracking_columns', 'demographics_columns', 'applicant_sex_columns', 'race_ethnicity_columns', 'monetary_columns', 'loan_info_columns', 'preapproval_columns', 'approval_columns', 'denial_reason_columns', 'date_time_loc_columns']
Column Type Count: 10
