# US Mortgage Market Analysis

## Importing Libraries

In [14]:
import numpy as np
import pandas as pd

## Loading Data

In [120]:
# Load only the first 10,000 rows of the 2021 file to keep the read fast
mortgage_df = pd.read_csv(
    filepath_or_buffer="./datasets/year_2021.csv",
    nrows=10000,
)

In [121]:
# Draw a 1,000-row random sample. The fixed seed keeps the sample (and every
# output derived from it) identical across kernel restarts and re-runs.
mortgage_df = mortgage_df.sample(n=1000, random_state=42)

## Filtering Missing Data

In [123]:
# Record the frame's dimensions (rows, columns) at this point in the notebook
num_rows = mortgage_df.shape[0]
num_cols = mortgage_df.shape[1]

In [124]:
# Drop rows in which every value is missing
mortgage_df = mortgage_df.dropna(how="all")

# Drop columns that are more than 95% missing, i.e. keep only columns with at
# least 5% non-missing values. The threshold is derived from the row count
# *after* the row drop above (not from a count captured in an earlier cell),
# and rounded up to the whole number of non-missing values `thresh` expects.
min_non_na = int(np.ceil(len(mortgage_df) * 0.05))
mortgage_df = mortgage_df.dropna(thresh=min_non_na, axis=1)

# Columns that are entirely missing are already removed by the threshold above
# whenever the frame has at least one row, so no separate how="all" column
# pass is needed.

# Reset index values
mortgage_df = mortgage_df.reset_index(drop=True)

In [125]:
# Frequency table of missing values per column, most-missing first
na_counts = mortgage_df.isna().sum().sort_values(ascending=False)

# Keep only the columns that have at least one missing value
na_counts = na_counts[na_counts > 0]

# Convert to a relative frequency table. Divide by the frame's current row
# count rather than a row count captured before rows were dropped, so the
# fractions always describe the frame as it is now.
na_freq_tab = na_counts / len(mortgage_df)

In [126]:
# Fraction of missing values per column, limited to columns with at least one missing value
na_freq_tab

applicant_ethnicity-2        0.903
lender_credits               0.831
discount_points              0.810
co-applicant_age_above_62    0.597
rate_spread                  0.396
debt_to_income_ratio         0.320
loan_to_value_ratio          0.320
origination_charges          0.235
total_loan_costs             0.235
interest_rate                0.187
property_value               0.154
dtype: float64

In [129]:
# Print each column's name, non-null count, and dtype
mortgage_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 73 columns):
 #   Column                                    Non-Null Count  Dtype  
---  ------                                    --------------  -----  
 0   activity_year                             1000 non-null   int64  
 1   lei                                       1000 non-null   object 
 2   derived_msa-md                            1000 non-null   int64  
 3   state_code                                1000 non-null   object 
 4   county_code                               1000 non-null   int64  
 5   census_tract                              1000 non-null   int64  
 6   conforming_loan_limit                     1000 non-null   object 
 7   derived_loan_product_type                 1000 non-null   object 
 8   derived_dwelling_category                 1000 non-null   object 
 9   derived_ethnicity                         1000 non-null   object 
 10  derived_race                         

In [130]:
# Frequency table of column data types: how many columns have each dtype
mortgage_df.dtypes.value_counts()

int64      48
object     13
float64    12
dtype: int64