How do mortgage application outcomes vary by lender and loan size by state?

applicant_id
applicant_income
state_name
respondent


In [1]:
import pandas as pd
from pathlib import Path

PROJECT_ROOT = Path.cwd()
SOURCE_FILE = PROJECT_ROOT / "Data" / "2010HMDA_sample.csv"

In [2]:
OUT_DIR = Path("test_decks")
OUT_DIR.mkdir(parents=True, exist_ok=True)

KEEP_COLS = [
    "action_taken",
    "state_abbr",
    "respondent_id",
    "loan_amount_000s",
    "applicant_income_000s",
]


Set the output directory
defined the keep_columns variable

In [3]:

def select_keep_cols(df: pd.DataFrame) -> pd.DataFrame:
    missing = [c for c in KEEP_COLS if c not in df.columns]
    if missing:
        raise ValueError(f"Missing required columns: {missing}")
    return df[KEEP_COLS].copy()

def sanity_checks(df: pd.DataFrame) -> None:
    # A) state_abbr must be exactly 2 letters
    invalid_state = df["state_abbr"].dropna().astype(str).str.len().ne(2)
    if invalid_state.any():
        bad = df.loc[invalid_state, "state_abbr"].head(5).tolist()
        raise AssertionError(f"state_abbr invalid (not 2 letters). Examples: {bad}")

    # B) action_taken must be in expected set
    allowed_actions = {1}
    action_numeric = pd.to_numeric(df["action_taken"], errors="coerce")
    bad_actions = set(action_numeric.dropna().unique()) - allowed_actions
    if bad_actions:
        raise AssertionError(f"action_taken has unexpected codes: {sorted(bad_actions)}")

    # C) loan_amount_000s must be positive when present
    loan = pd.to_numeric(df["loan_amount_000s"], errors="coerce")
    if (loan.dropna() <= 0).any():
        raise AssertionError("loan_amount_000s has non-positive values")


Defined select_keep_cols and sanity_checks

In [4]:
def transform(df: pd.DataFrame) -> pd.DataFrame:
    df = select_keep_cols(df)

    df["loan_amount_000s"] = pd.to_numeric(df["loan_amount_000s"], errors="coerce")
    df["applicant_income_000s"] = pd.to_numeric(df["applicant_income_000s"], errors="coerce")

    sanity_checks(df)
    return df

def run_file(label: str, df: pd.DataFrame) -> None:
    try:
        transform(df)
        print(f"PASS  {label}")
    except Exception as e:
        print(f"FAIL  {label} -> {type(e).__name__}: {e}")

Defined two more functions: transform and run_file

runs the sanity checks

In [5]:
base = pd.read_csv(SOURCE_FILE, low_memory=False)

base_small = base.sample(200, random_state=1)


deck_fail_state = base_small.copy()

deck_fail_state.loc[0, "state_abbr"] = "TXX"

deck_fail_state.loc[1, "state_abbr"] = "California"

path_fail_state = OUT_DIR / "deck_fail_state_abbr.csv"

deck_fail_state.to_csv(path_fail_state, index=False)


deck_fail_action = base_small.copy()

deck_fail_action.loc[0, "action_taken"] = 999

path_fail_action = OUT_DIR / "deck_fail_action_taken.csv"

deck_fail_action.to_csv(path_fail_action, index=False)


deck_fail_loan = base_small.copy()

deck_fail_loan.loc[0, "loan_amount_000s"] = -50

path_fail_loan = OUT_DIR / "deck_fail_loan_amount.csv"

deck_fail_loan.to_csv(path_fail_loan, index=False)


deck_fail_schema = base_small.drop(columns=["respondent_id"], errors="ignore").copy()

path_fail_schema = OUT_DIR / "deck_fail_missing_column.csv"

deck_fail_schema.to_csv(path_fail_schema, index=False)


print("Saved failing decks:")

print(" -", path_fail_state)

print(" -", path_fail_action)

print(" -", path_fail_loan)

print(" -", path_fail_schema)



Saved failing decks:
 - test_decks\deck_fail_state_abbr.csv
 - test_decks\deck_fail_action_taken.csv
 - test_decks\deck_fail_loan_amount.csv
 - test_decks\deck_fail_missing_column.csv


Deliberately creates errors

In [6]:
print("\n=== Running TEST DECKS ===\n")


df_test = pd.read_csv(path_fail_state, low_memory=False)

run_file("deck_fail_state_abbr.csv", df_test)


df_test = pd.read_csv(path_fail_action, low_memory=False)

run_file("deck_fail_action_taken.csv", df_test)


df_test = pd.read_csv(path_fail_loan, low_memory=False)

run_file("deck_fail_loan_amount.csv", df_test)


df_test = pd.read_csv(path_fail_schema, low_memory=False)

run_file("deck_fail_missing_column.csv", df_test)


print("\n=== Running ORIGINAL DATASET ===\n")

original_df = pd.read_csv(SOURCE_FILE, low_memory=False)

run_file(f"{SOURCE_FILE} (original)", original_df)


=== Running TEST DECKS ===

FAIL  deck_fail_state_abbr.csv -> AssertionError: 
FAIL  deck_fail_action_taken.csv -> AssertionError: action_taken has unexpected codes: [np.float64(999.0)]
FAIL  deck_fail_loan_amount.csv -> AssertionError: loan_amount_000s has non-positive values
FAIL  deck_fail_missing_column.csv -> ValueError: Missing required columns: ['respondent_id']

=== Running ORIGINAL DATASET ===

PASS  c:\Users\marks\Documents\GitHub\Learning-Journal\Compliance and Risk Reporting\Data\2010HMDA_sample.csv (original)
