In [39]:
# import dependencies
import pandas as pd
from path import Path 
import numpy as np 
import os
from random import randint

In [40]:
# set path and load dataframe
path = Path('../data/cleaned_data.csv')
df = pd.read_csv(path, low_memory=False)
df.head()

Unnamed: 0,as_of_year,respondent_id,agency_name,agency_abbr,agency_code,loan_type_name,loan_type,property_type_name,property_type,loan_purpose_name,...,edit_status_name,edit_status,sequence_number,population,minority_population,hud_median_family_income,tract_to_msamd_income,number_of_owner_occupied_units,number_of_1_to_4_family_units,application_date_indicator
0,2017,0000451965,Consumer Financial Protection Bureau,CFPB,9,Conventional,1,One-to-four family dwelling (other than manufa...,1,Home improvement,...,preprocessing,1,191856311741,4824.0,37.23,75200.0,57.419998,818.0,1626.0,
1,2017,0000146672,Consumer Financial Protection Bureau,CFPB,9,Conventional,1,One-to-four family dwelling (other than manufa...,1,Home purchase,...,preprocessing,1,798333441520,7404.0,57.52,63200.0,116.010002,1215.0,1743.0,
2,2017,0000451965,Consumer Financial Protection Bureau,CFPB,9,Conventional,1,One-to-four family dwelling (other than manufa...,1,Home purchase,...,preprocessing,1,172228252986,3372.0,33.189999,97400.0,141.740005,592.0,1105.0,
3,2017,13-6131491,Department of Housing and Urban Development,HUD,7,Conventional,1,One-to-four family dwelling (other than manufa...,1,Refinancing,...,preprocessing,1,723494359341,8787.0,65.129997,97400.0,97.269997,1463.0,2164.0,
4,2017,33-0419992,Department of Housing and Urban Development,HUD,7,FHA-insured,2,One-to-four family dwelling (other than manufa...,1,Refinancing,...,preprocessing,1,503311073815,5356.0,23.1,75200.0,126.690002,1711.0,2102.0,


In [41]:
row_count = len(df)
print(f'Row Count: {row_count}')

Row Count: 1709502


In [42]:
column_count = len(df.columns)
print(f'Column Count: {column_count}')

Column Count: 78


In [43]:
status_log_dict = {}

def update_status(data, dict, status, level):
    data.copy() 

    status = status
    level = int(level)

    dict[status] = [level]

    data['edit_status_name'] = status
    data['edit_status'] = level

    return data 

In [44]:
status = 'eda'
level = 2
df = update_status(df, status_log_dict, status, level)

In [45]:
# find out value counts for a few columns
df['lien_status_name'].value_counts()

Secured by a first lien          1410741
Not applicable                    232537
Secured by a subordinate lien      46872
Not secured by a lien              19352
Name: lien_status_name, dtype: int64

In [46]:
df.co_applicant_sex_name.value_counts()

No co-applicant                                                                      827046
Female                                                                               538535
Male                                                                                 140590
Not applicable                                                                       131337
Information not provided by applicant in mail, Internet, or telephone application     71994
Name: co_applicant_sex_name, dtype: int64

In [47]:
df.purchaser_type_name.value_counts()

Loan was not originated or was not sold in calendar year covered by register    889882
Fannie Mae (FNMA)                                                               257027
Freddie Mac (FHLMC)                                                             171704
Ginnie Mae (GNMA)                                                               157912
Commercial bank, savings bank or savings association                             89712
Life insurance company, credit union, mortgage bank, or finance company          82005
Other type of purchaser                                                          31732
Private securitization                                                           17192
Affiliate institution                                                            12301
Farmer Mac (FAMC)                                                                   35
Name: purchaser_type_name, dtype: int64

In [48]:
df['hoepa_status_name'].value_counts()

Not a HOEPA loan    1708993
HOEPA loan              509
Name: hoepa_status_name, dtype: int64

In [49]:
print(df['denial_reason_name_1'].value_counts())
print(df.denial_reason_name_1.count())


Debt-to-income ratio                              49027
Credit history                                    28323
Credit application incomplete                     22986
Collateral                                        19853
Other                                             14597
Unverifiable information                           9042
Insufficient cash (downpayment, closing costs)     4850
Employment history                                 1530
Mortgage insurance denied                            67
Name: denial_reason_name_1, dtype: int64
150275


In [50]:
# Define column_value_counts function to determine how data is organized 
def column_value_counts(data):

    data = data.copy()

    column_list = []
    value_counts_dict = {}

    # loop through columns in dataframe 
    for column in data.columns:

        # check if path exists, if not create it
        if not os.path.isdir(f'../eda/column_value_counts/{column}'):
            os.makedirs(f'../eda/column_value_counts/{column}')

        if 'edit_status' or 'sequence_number' not in column:
            # assign value_counts for column to col_vc
            col_vc = data[column].value_counts()
            # make a dataframe, vc_df, from col_vc; reset index values, reset column names, and sort
            vc_df = pd.DataFrame(col_vc.reset_index().values, columns=['Value','Count']).sort_index(axis=0, ascending=True)
            # export column value counts df to csv in column_value_counts folder
            vc_df.to_csv(f'../eda/column_value_counts/{column}/Value_Counts_{column}.csv', index=False)
            # set value_counts_dict {key: column, value: value counts df}
            value_counts_dict[column] = vc_df
            # append column name to column_list
            column_list.append(column)

    return column_list, value_counts_dict

In [51]:
column_list, value_counts_dict = column_value_counts(df)

In [52]:
value_counts_dict['lien_status']


Unnamed: 0,Value,Count
0,1,1410741
1,4,232537
2,2,46872
3,3,19352


In [53]:
value_counts_dict['purchaser_type_name']


Unnamed: 0,Value,Count
0,Loan was not originated or was not sold in cal...,889882
1,Fannie Mae (FNMA),257027
2,Freddie Mac (FHLMC),171704
3,Ginnie Mae (GNMA),157912
4,"Commercial bank, savings bank or savings assoc...",89712
5,"Life insurance company, credit union, mortgage...",82005
6,Other type of purchaser,31732
7,Private securitization,17192
8,Affiliate institution,12301
9,Farmer Mac (FAMC),35


In [54]:
def compile_value_counts_df(vc_dict):
    # initialize compiled value counts dataframe
    comp_vc_df = pd.DataFrame()

    # iterate through value counts dictionary to extract value counts data for each column
    for key in vc_dict.keys():
        # set vc_df to the key's corresponding value (eg. column's value count dataframe)
        vc_df = vc_dict[key]
        vc_df.sort_values(by=['Count'], ascending=False)
        # create new Column by the same name, with key's name inputed
        vc_df['Column'] = f'{key}'
        # concatenate vc_df with comp_vc_df
        comp_vc_df = pd.concat([comp_vc_df, vc_df], ignore_index=True, axis=0)
    #return comp_vc_df
    return comp_vc_df

In [55]:
vc_df = compile_value_counts_df(vc_dict=value_counts_dict)

In [56]:
print(len(vc_df))
vc_df = vc_df[['Column','Value','Count']]
vc_df.to_csv('../eda/columnar_value_counts.csv', index=False)
vc_df.head()


1747979


Unnamed: 0,Column,Value,Count
0,as_of_year,2017,1709502
1,respondent_id,451965,153445
2,respondent_id,852218,72384
3,respondent_id,7197000003,59888
4,respondent_id,480228,52812


In [57]:
vc_df_sorted = vc_df.sort_values(by=['Count'], ascending=False)

vc_df_sorted.to_csv('../eda/columnar_VC_high_low.csv', index=False)

In [58]:
vc_df_sorted.head()

Unnamed: 0,Column,Value,Count
0,as_of_year,2017,1709502
6097,state_name,California,1709502
17793,edit_status,2,1709502
6099,state_code,6,1709502
6098,state_abbr,CA,1709502


In [59]:
# define get_unique_values_df function to make a dataframe of each column and its unique values
def get_unique_values_df(df):

    # initialize unique_dict and unique_values_list
    unique_values_list = []

    # copy dataframe
    df = df.copy()

    # iterate through columns
    for column in df.columns:
        if 'edit_status' or 'sequence_number' not in column:
            # extract unique values
            unique_values = df[column].unique()
            # append unique values from each column to unique_values_list
            unique_values_list.append(unique_values)


    # make a dataframe of column name and unique values 
    values_df = pd.DataFrame({'Column Name':[col for col in df.columns if 'edit_status' or 'sequence_number' not in col], 'Unique_Count': [len(item) for item in unique_values_list], 'Unique_Values': unique_values_list}) 

    # return dataframe
    return values_df

In [60]:
unique_values_df = get_unique_values_df(df)
unique_values_df.to_csv('../eda/column_data.csv', index=False)


In [61]:
unique_values_df.head()

Unnamed: 0,Column Name,Unique_Count,Unique_Values
0,as_of_year,1,[2017]
1,respondent_id,1212,"[0000451965, 0000146672, 13-6131491, 33-041999..."
2,agency_name,6,"[Consumer Financial Protection Bureau, Departm..."
3,agency_abbr,6,"[CFPB, HUD, OCC, FRS, NCUA, FDIC]"
4,agency_code,6,"[9, 7, 1, 2, 5, 3]"


In [62]:
print('Done!')

Done!


GO TO data_splitting.ipynb
