In [None]:
import pandas as pd
# Load the raw CSV into `all_data`; the cells below filter and reshape this frame.
all_data = pd.read_csv('hmda_2017_ca_all-records_labels.csv')

In [None]:
from pandas.api.types import is_numeric_dtype

import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
import seaborn as sns

# modeling imports
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.model_selection import train_test_split, cross_validate, GridSearchCV, StratifiedKFold
from sklearn import metrics
from sklearn import preprocessing, pipeline, decomposition, compose
from sklearn_pandas import DataFrameMapper

import imblearn.under_sampling
import imblearn.over_sampling

In [None]:
# Set the denial-reason text aside (keyed by respondent_id) before those columns are
# dropped, in case we want to dig further into interpretability of the results later.
denials = all_data[['respondent_id'] + [f'denial_reason_name_{slot}' for slot in (1, 2, 3)]]

# Columns removed up front. The numbered families (race 1-5, denial reason 1-3)
# are generated rather than spelled out one by one.
race_slots = range(1, 6)
reason_slots = range(1, 4)
dropped_columns = [
    'agency_name', 'agency_code',
    'loan_type', 'property_type', 'loan_purpose',
    'owner_occupancy', 'preapproval', 'action_taken',
    'msamd', 'state_name', 'state_code', 'county_code',
    'applicant_ethnicity', 'co_applicant_ethnicity',
    *[f'applicant_race_{slot}' for slot in race_slots],
    *[f'co_applicant_race_{slot}' for slot in race_slots],
    'applicant_sex', 'co_applicant_sex',
    'purchaser_type',
    *[f'denial_reason_{slot}' for slot in reason_slots],
    *[f'denial_reason_name_{slot}' for slot in reason_slots],  # stored in denials table
    'hoepa_status', 'lien_status',
    'edit_status', 'edit_status_name',
    'sequence_number', 'application_date_indicator', 'rate_spread',
]
all_data.drop(columns=dropped_columns, inplace=True)

# Only do this if the data covers a single state and a single year
all_data.drop(columns=['as_of_year', 'state_abbr'], inplace=True)

# Shorten some long category labels
for long_label, short_label in (
    ('One-to-four family dwelling (other than manufactured housing)',
     '1 to 4 family (excl. manufactured)'),
    ('Information not provided by applicant in mail, Internet, or telephone application',
     'Info not provided'),
):
    all_data.replace(long_label, short_label, inplace=True)

## Subjective Cleaning

In [None]:
# Keep only the first two race listings per party; going beyond that is likely
# too much complexity.
extra_race_cols = [f'{party}_race_name_{slot}'
                   for party in ('applicant', 'co_applicant')
                   for slot in (3, 4, 5)]
all_data.drop(columns=extra_race_cols, inplace=True)

In [None]:
# Filter down to applications accepted or denied (remove preapprovals and incompletes)
# and make the outcome binary: 0 = loan originated, 1 = application denied.
OUTCOME_CODES = {
    'Loan originated': 0,
    'Application denied by financial institution': 1,
}
acc_or_den_filter = all_data['action_taken_name'].isin(list(OUTCOME_CODES))

# .copy() gives us an independent frame, so the column assignment below is not a
# write into a slice of the unfiltered data.
all_data = all_data[acc_or_den_filter].copy()

# Assign the recoded column back instead of calling
# all_data[col].replace(..., inplace=True): that chained in-place form edits a
# temporary Series and is a no-op under pandas Copy-on-Write. The explicit int
# cast keeps the target numeric for the sklearn / imblearn code further down.
all_data['action_taken_name'] = all_data['action_taken_name'].map(OUTCOME_CODES).astype(int)

In [None]:
# Drop all the entries where we don't have census data
all_data = all_data.dropna(subset=['population'])

In [None]:
# Drop rows where loan_amount_000s is missing
all_data = all_data.dropna(subset=['loan_amount_000s'])

In [None]:
# Columns excluded from modeling, with the reason noted next to each
low_value_cols = [
    'county_name',          # Too many
    'census_tract_number',  # Too many
    'purchaser_type_name',
    'hoepa_status_name',    # Most are not HOEPA
    'lien_status_name',     # Probably exclude since vast majority are in one category
]
all_data = all_data.drop(columns=low_value_cols)

In [None]:
# Unreported incomes seem to have similar loan amounts to reported incomes.
# Dropping the nulls for now; filling with the median is an alternative, but the
# sample is probably big enough to just drop them.
all_data = all_data.dropna(subset=['applicant_income_000s'])

In [None]:
# A null Metropolitan Statistical Area / Metropolitan Division name is labelled 'Other'.
# NaN never compares equal to anything (including itself), so `== np.nan` matches
# no rows; fillna is the null-aware way to do this.
all_data['msamd_name'] = all_data['msamd_name'].fillna('Other')

In [None]:
#all_data.info()

## Outlier Handling

In [None]:
# Remove multifamily dwellings from the data
multifamily = all_data['property_type_name'] == 'Multifamily dwelling'
multifamily_rows = all_data.index[multifamily]
all_data = all_data.drop(index=multifamily_rows)

In [None]:
# Exclude both huge and really small loans.
# The bounds (in thousands) are magic numbers - consider reviewing them.
loan_amounts = all_data['loan_amount_000s']
loan_amt_filter = loan_amounts.gt(25) & loan_amounts.lt(20000)
all_data = all_data.loc[loan_amt_filter]

## Feature Engineering

In [None]:
# Summarise the columns, dtypes and non-null counts left after the cleaning above.
all_data.info()

In [None]:
# binary column, has coapplicant?
def has_coapp(row):
    '''Return 0 if the row's co_applicant_sex_name is 'No co-applicant', otherwise 1.'''
    return 0 if row['co_applicant_sex_name'] == 'No co-applicant' else 1
# Same rule as has_coapp (0 only for 'No co-applicant', 1 for anything else,
# including nulls), but computed as one vectorised comparison instead of a
# Python-level call per row via apply(axis=1).
all_data['has_coapplicant'] = (all_data['co_applicant_sex_name'] != 'No co-applicant').astype(int)

In [None]:
# 1 if backed by ANY federal agency
def fed_insured(row):
    '''Return 0 if the row's loan_type_name is 'Conventional', otherwise 1.'''
    return 0 if row['loan_type_name'] == 'Conventional' else 1
# Same rule as fed_insured (0 only for 'Conventional', 1 for anything else,
# including nulls), computed as one vectorised comparison instead of a
# Python-level call per row via apply(axis=1).
all_data['fed_insured'] = (all_data['loan_type_name'] != 'Conventional').astype(int)
# The raw loan type is no longer needed once the binary flag exists.
all_data.drop('loan_type_name', axis=1, inplace=True)

## Helper Functions

In [None]:
def get_cols(df):
    '''Print the column names of df one per line, so the list can be pasted into Excel for notes.'''
    names = [str(col) for col in df.columns]
    if names:
        print('\n'.join(names))

In [None]:
def check_feature(df, feature):
    '''Show the distribution of a single column.

    Numeric columns get a histogram. Any other column has its value counts
    printed and is drawn as a horizontal bar chart of each value's share of
    all rows (in percent), with x ticks every 5 points.
    '''
    column = df[feature]
    if is_numeric_dtype(column):
        plt.hist(column)
        return

    counts = column.value_counts()
    print(counts)
    # Share of all rows (nulls included in the denominator), as a percentage
    share_pct = (counts / len(column)) * 100
    share_pct.sort_values().plot(kind = 'barh')
    plt.xticks(np.arange(0, 100, 5))

In [None]:
def create_model_df(df, features_list):
    '''Return the requested feature columns plus action_taken_name, without rows containing nulls.'''
    wanted = features_list + ['action_taken_name']
    subset = df[wanted]
    return subset.dropna()

## EDA

In [None]:
# Print the share of rows that falls in each outcome class
n_rows = len(all_data['action_taken_name'])
for class_count in all_data['action_taken_name'].value_counts():
    print(class_count / n_rows)

In [None]:
# Look at the distribution of the outcome column (the y used by the models below).
check_feature(all_data, 'action_taken_name')

In [None]:
#get_cols(all_data)

## Creating Models

#### Choose features and make a dataframe with those

In [None]:
# First candidate feature set. create_model_df appends the action_taken_name
# column and drops any row that has a null in the selected columns.
features_1 = ['loan_amount_000s', 'applicant_income_000s', 'hud_median_family_income']
model1 = create_model_df(all_data, features_1)

#### Split Train/Test

In [None]:
def make_splits(df):
    '''Split a modeling frame into X_train, X_test, y_train, y_test.

    df must hold the feature columns plus action_taken_name, which is used as y.
    Every other column becomes a feature (np.setdiff1d returns the names sorted).
    The split holds out 20% with a fixed random_state. Going through this one
    helper avoids accidentally pulling X and y from different frames.
    '''
    target = 'action_taken_name'
    feature_names = np.setdiff1d(df.columns, [target])
    return train_test_split(df[feature_names], df[target],
                            test_size = 0.2, random_state=42)

In [None]:
# Train/test split (20% held out, see make_splits) for the first feature set.
X_tr1, X_te1, y_tr1, y_te1 = make_splits(model1)

##### Preprocessing

In [None]:
def preprocess_columns(X_train, X_test):
    '''Standard-scale the numeric columns and one-hot encode the categorical ones.

    Everything is learned from X_train only and then applied to X_test, so no
    information from the test/validation rows leaks into the transformation:

    * column types and, per categorical column, the least frequent level (whose
      dummy column is dropped as the reference level) come from X_train rather
      than from the notebook-global frame;
    * each numeric column's scaler is fitted on X_train and re-used on X_test;
    * the encoded X_test is re-aligned to the encoded X_train columns, so a
      level missing from X_test becomes an all-zero column and a level unseen
      in X_train is discarded.

    Parameters
    ----------
    X_train, X_test : pandas.DataFrame
        Unprocessed feature frames with the same columns. Neither is modified.

    Returns
    -------
    (X_train_proc, X_test_proc) : tuple of pandas.DataFrame
        New frames with identical columns in identical order.
    '''
    # Work on copies: assigning scaled columns below must not alter the caller's data.
    X_train = X_train.copy()
    X_test = X_test.copy()

    num_cols = [col for col in X_train.columns if is_numeric_dtype(X_train[col])]
    cat_cols = [col for col in X_train.columns if col not in num_cols]

    # Dummy column to drop per categorical feature: the level with the lowest
    # training count (value_counts sorts descending, so it is the last entry).
    drop_dummy_cols = []
    for feature in cat_cols:
        min_cat = X_train[feature].value_counts().index[-1]
        drop_dummy_cols.append(f'{feature}_{min_cat}')  # naming convention of pd.get_dummies

    # Fit on train, transform test with the *same* mean / std.
    for column in num_cols:
        std_scale = preprocessing.StandardScaler()
        X_train[column] = std_scale.fit_transform(
            np.array(X_train[column]).reshape(-1, 1)).ravel()
        X_test[column] = std_scale.transform(
            np.array(X_test[column]).reshape(-1, 1)).ravel()

    # Encode the categoricals and drop each reference level
    X_train = pd.get_dummies(X_train).drop(columns=drop_dummy_cols)
    # Force the test frame into exactly the training layout
    X_test = pd.get_dummies(X_test).reindex(columns=X_train.columns, fill_value=0)

    return X_train, X_test

#### Pairplot

In [None]:
# Pairplot of the first feature set coloured by outcome. To keep it light it is
# drawn on a sample of 30% of the rows, taken with replacement and a fixed seed.
sns.pairplot(model1.sample(frac=0.3, replace=True, random_state=1), hue='action_taken_name');

In [None]:
#sorted(metrics.SCORERS.keys())

In [None]:
# 20% is denials (action_taken_name = 1)
# 80% is loan originated
def under_sample_bal(X_train, y_train):
    '''Randomly under-sample the originated class (0) of the training data.

    All denials (1) are kept and the originated class is cut to a quarter of
    its size. Returns the resampled pair X_train_rs, y_train_rs.
    '''
    denial_count = int(np.sum(y_train == 1))
    originated_count = np.sum(y_train == 0)
    targets = {1 : denial_count, 0 : int(originated_count/4)}

    sampler = imblearn.under_sampling.RandomUnderSampler(sampling_strategy = targets,
                                                         random_state=42)
    return sampler.fit_resample(X_train, y_train)

In [None]:
def find_c(X_train, y_train):
    '''Grid-search the LogisticRegression regularisation strength C.

    The training data is preprocessed and under-sampled once, then a 5-fold
    GridSearchCV scored on ROC AUC picks the best C from a fixed grid.

    Parameters
    ----------
    X_train : pandas.DataFrame
        Unprocessed training features. Not modified.
    y_train : pandas.Series
        Binary target aligned with X_train.

    Returns
    -------
    The C value with the best mean cross-validated ROC AUC.
    '''
    c_vals_grid = [{'C': [0.001, 0.01, 0.05, 0.1, 0.5, 1.0, 10.0, 100, 1000, 10000],}]

    # Grid search object that looks for the best C value
    search = GridSearchCV(estimator=LogisticRegression(), param_grid=c_vals_grid,
                          scoring='roc_auc', cv=5)

    # preprocess_columns expects a (train, test) pair; only the processed train
    # frame is needed here, so the same frame is supplied for both slots.
    # A copy is passed so that nothing preprocess_columns does to its inputs can
    # leak into the caller's X_train (which is preprocessed again later).
    X_scratch = X_train.copy()
    X_tr_proc, _ = preprocess_columns(X_scratch, X_scratch)

    # Resample to balance the classes
    X_tr_rs, y_tr_rs = under_sample_bal(X_tr_proc, y_train)

    # Fit the search on the processed and resampled training data
    search.fit(X_tr_rs, y_tr_rs)

    return search.best_params_['C']

In [None]:
#Take in unprocessed X_train, y_train
def cross_validate(X_train, y_train):
    '''Run 5-fold stratified cross-validation of the logistic model and print mean scores.

    C is tuned once with find_c on the full training data. Within each fold the
    training part is preprocessed and under-sampled, a LogisticRegression(C=C)
    is fitted, and the untouched validation part is scored.

    accuracy, recall, f1 and precision are computed from the predicted labels.
    roc_auc is computed from the predicted probability of class 1, because ROC
    AUC is a ranking metric and hard 0/1 labels collapse the curve to one point.

    NOTE: this function shadows sklearn.model_selection.cross_validate, which is
    imported under the same name at the top of the notebook.

    Parameters
    ----------
    X_train : pandas.DataFrame
        Unprocessed training features.
    y_train : pandas.Series
        Binary target aligned with X_train.

    Returns
    -------
    None. The mean of every metric over the folds is printed.
    '''
    C = find_c(X_train, y_train)
    scores = {'accuracy': [], 'recall': [], 'roc_auc': [], 'f1': [], 'precision': []}
    # Metrics that are evaluated on hard class predictions
    label_metrics = {
        'accuracy': metrics.accuracy_score,
        'recall': metrics.recall_score,
        'f1': metrics.f1_score,
        'precision': metrics.precision_score,
    }
    sfk = StratifiedKFold(n_splits=5)

    for train_index, test_index in sfk.split(X_train, y_train):
        # Train / validation parts for this fold (copies, so folds stay independent)
        X_tr, X_val = X_train.iloc[train_index].copy(), X_train.iloc[test_index].copy()
        y_tr, y_val = y_train.iloc[train_index].copy(), y_train.iloc[test_index].copy()

        # Preprocess, then balance only the training part
        X_tr_proc, X_val_proc = preprocess_columns(X_tr, X_val)
        X_tr_rs, y_tr_rs = under_sample_bal(X_tr_proc, y_tr)

        model = LogisticRegression(C=C)
        model.fit(X_tr_rs, y_tr_rs)

        y_pred = model.predict(X_val_proc)
        y_prob = model.predict_proba(X_val_proc)[:, 1]

        for name, scorer in label_metrics.items():
            scores[name].append(scorer(y_val, y_pred))
        scores['roc_auc'].append(metrics.roc_auc_score(y_val, y_prob))

    print('Means:')
    for metric in scores:
        mean = np.mean(scores[metric])
        print('{:<20s}{:>15.2f}'.format(metric, mean))

In [None]:
# Use the split that this notebook actually creates (make_splits(model1)).
# X_tr2 / y_tr2 are never defined here, so the call raised NameError on a fresh kernel.
cross_validate(X_tr1, y_tr1)

In [None]:
def test(X_train, X_test, y_train, y_test):
    '''Fit the tuned logistic regression on the training split and report test performance.

    C is chosen with find_c, the features are preprocessed, the training data is
    under-sampled, and a LogisticRegression(C=C) is fitted. The function then
    prints accuracy, recall, the confusion matrix, the ROC AUC and the model
    coefficients, draws the ROC curve, and finally prints precision.

    Parameters
    ----------
    X_train, X_test : pandas.DataFrame
        Unprocessed feature frames.
    y_train, y_test : pandas.Series
        Binary targets aligned with the corresponding X frame.

    Returns
    -------
    None. Results are printed / plotted.
    '''
    C = find_c(X_train, y_train)

    # Preprocess both X sets, then balance pos/neg class in the training data only
    X_train_proc, X_test_proc = preprocess_columns(X_train, X_test)
    X_train_rs, y_train_rs = under_sample_bal(X_train_proc, y_train)

    model = LogisticRegression(C=C)
    model.fit(X_train_rs, y_train_rs)

    # From here on use: X_train_rs, y_train_rs (fit) and X_test_proc, y_test (score)
    y_pred = model.predict(X_test_proc)
    # Probability of class 1, computed once and shared by the ROC curve and the AUC
    y_prob = model.predict_proba(X_test_proc)[:, 1]

    accuracy = metrics.accuracy_score(y_test, y_pred)
    recall = metrics.recall_score(y_test, y_pred)
    precision = metrics.precision_score(y_test, y_pred)

    print('Important:')
    print(f'Accuracy: {accuracy}')
    print(f'Recall: {recall}')

    print(metrics.confusion_matrix(y_test, y_pred))

    fpr, tpr, _ = metrics.roc_curve(y_test, y_prob)
    plt.plot(fpr, tpr,lw=2)
    plt.plot([0,1],[0,1],c='violet',ls='--')  # chance diagonal
    plt.xlim([-0.05,1.05])
    plt.ylim([-0.05,1.05])

    plt.xlabel('False positive rate')
    plt.ylabel('True positive rate')
    plt.title('ROC curve');
    print("ROC AUC score = ", metrics.roc_auc_score(y_test, y_prob))

    # TODO: print one coefficient per feature name instead of the raw array
    print(model.coef_)

    plt.show()

    print('\nLess Important:')
    print(f'Precision: {precision}')
In [None]:
# Use the split that this notebook actually creates (make_splits(model1)).
# The *2 names are never defined here, so the call raised NameError on a fresh kernel.
test(X_tr1, X_te1, y_tr1, y_te1)