In [1]:
import os
import pandas as pd
import psycopg2
from psycopg2 import sql
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from imblearn.ensemble import BalancedRandomForestClassifier
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced
from sklearn.preprocessing import LabelEncoder

In [2]:
text_columns = ['agency_name', 'agency_abbr', 'loan_type_name', 'property_type_name', 'loan_purpose_name', 'owner_occupancy_name', 'preapproval_name', 'action_taken_name', 'msamd_name', 'state_name', 'state_abbr', 'county_name', 'applicant_ethnicity_name', 'co_applicant_ethnicity_name', 'applicant_race_name_1', 'co_applicant_race_name_1', 'applicant_sex_name', 'co_applicant_sex_name', 'purchaser_type_name', 'hoepa_status_name', 'lien_status_name', 'application_id']
numeric_columns = ['agency_code', 'loan_type', 'property_type', 'loan_purpose', 'owner_occupancy', 'loan_amount_000s', 'preapproval', 'action_taken', 'msamd', 'state_code', 'county_code', 'census_tract_number', 'applicant_ethnicity', 'co_applicant_ethnicity', 'applicant_race_1', 'co_applicant_race_1', 'applicant_sex', 'co_applicant_sex', 'applicant_income_000s', 'purchaser_type', 'hoepa_status', 'lien_status', 'population', 'minority_population', 'hud_median_family_income', 'tract_to_msamd_income', 'number_of_owner_occupied_units', 'number_of_1_to_4_family_units', 'application_id']

### Set Up connection to AWS RDS

In [3]:
PGEND_POINT = 'hmda-final-db.cfo8yzx2osax.us-east-1.rds.amazonaws.com' # End_point
PGDATABASE_NAME = 'hmda_db' # Database Name example: youtube_test_db
PGUSER_NAME = 'postgres' # UserName
PGPASSWORD = 'Password' # Password

In [4]:
def connect():
    
    # Set up a connection to the postgres server.
    conn_string = "host="+ PGEND_POINT +" port="+ "5432" +" dbname="+ PGDATABASE_NAME +" user=" + PGUSER_NAME \
                  +" password="+ PGPASSWORD
    
    conn = psycopg2.connect(conn_string)
    print("Connected!")

    # Create a cursor object
    cursor = conn.cursor()
    
    return conn, cursor

In [5]:
conn, cursor = connect()

Connected!


### Query Machine Learning Dataset
Machine Learning Dataset will include categorical applicant/loan information, census data, and categoric

In [6]:
query = sql.SQL('''
SELECT 
    nd.loan_amount_000s,
    nd.applicant_income_000s,
    nd.population,
    nd.minority_population,
    nd.census_tract_number,
    nd.hud_median_family_income,
    nd.tract_to_msamd_income,
    nd.number_of_owner_occupied_units,
    nd.number_of_1_to_4_family_units,
    td.agency_abbr, 
    td.loan_type_name, 
    td.property_type_name, 
    td.loan_purpose_name,
    td.owner_occupancy_name, 
    td.msamd_name,
    td.county_name,
    td.purchaser_type_name,
    td.hoepa_status_name,
    td.lien_status_name,
    td.preapproval_name,
    nd.preapproval,
    td.action_taken_name,
    nd.action_taken
FROM
    numeric_data nd 
INNER JOIN text_data td 
    ON nd.application_id = td.application_id 
ORDER BY nd.loan_amount_000s
''')

In [7]:
cur = conn.cursor()
cur.execute(query)

In [8]:
query_results = cur.fetchall()


In [9]:
ml_columns = ['loan_amount_000s', 'applicant_income_000s', 'population', 'minority_population', 'census_tract_number', 'hud_median_family_income', 'tract_to_msamd_income', 'number_of_owner_occupied_units', 'number_of_1_to_4_family_units', 'agency_abbr', 'loan_type_name', 'property_type_name', 'loan_purpose_name', 'owner_occupancy_name', 'msamd_name', 'county_name', 'purchaser_type_name', 'hoepa_status_name', 'lien_status_name', 'preapproval_name', 'preapproval', 'action_taken_name', 'action_taken']

In [10]:
ml_df = pd.DataFrame(query_results, columns=ml_columns)
ml_df.head()

Unnamed: 0,loan_amount_000s,applicant_income_000s,population,minority_population,census_tract_number,hud_median_family_income,tract_to_msamd_income,number_of_owner_occupied_units,number_of_1_to_4_family_units,agency_abbr,...,owner_occupancy_name,msamd_name,county_name,purchaser_type_name,hoepa_status_name,lien_status_name,preapproval_name,preapproval,action_taken_name,action_taken
0,1,41,5149,50.0499992370605,4016.01,64300,124.059997558594,1513,1881,HUD,...,Owner-occupied as a principal dwelling,"Los Angeles, Long Beach, Glendale - CA",Los Angeles County,Loan was not originated or was not sold in cal...,Not a HOEPA loan,Secured by a first lien,Not applicable,3,Application denied by financial institution,3
1,1,50,5495,26.9899997711182,1042.0,131500,76.5500030517578,1539,2129,CFPB,...,Owner-occupied as a principal dwelling,San Rafael - CA,Marin County,Loan was not originated or was not sold in cal...,Not a HOEPA loan,Not secured by a lien,Not applicable,3,Application denied by financial institution,3
2,1,49,7274,55.75,101.02,75200,58.2200012207031,906,1909,HUD,...,Owner-occupied as a principal dwelling,"Sacramento, Roseville, Arden-Arcade - CA",Yolo County,Loan was not originated or was not sold in cal...,Not a HOEPA loan,Secured by a first lien,Not applicable,3,Application denied by financial institution,3
3,1,124,6003,56.3199996948242,3005.02,64300,145.520004272461,705,1532,CFPB,...,Owner-occupied as a principal dwelling,"Los Angeles, Long Beach, Glendale - CA",Los Angeles County,Loan was not originated or was not sold in cal...,Not a HOEPA loan,Not applicable,Not applicable,3,Loan purchased by the institution,6
4,1,66,6129,79.1600036621094,93.21,75200,98.6999969482422,1344,1710,HUD,...,Owner-occupied as a principal dwelling,"Sacramento, Roseville, Arden-Arcade - CA",Sacramento County,Loan was not originated or was not sold in cal...,Not a HOEPA loan,Secured by a first lien,Not applicable,3,Application denied by financial institution,3


In [11]:
ml_df.isna().sum()
ml_df = ml_df.drop_duplicates()
ml_df.duplicated().sum()

0

In [12]:
ml_df['minority_population'] = ml_df['minority_population'].astype('float64')
ml_df['census_tract_number'] = ml_df['census_tract_number'].astype('float64')
ml_df['tract_to_msamd_income'] = ml_df['tract_to_msamd_income'].astype('float64')

In [13]:
target_df = ml_df[['action_taken_name','action_taken']]
preapproval_df = ml_df[['preapproval_name', 'preapproval']]

## Target and Metric Assessment

In [14]:
target_df_count = target_df.groupby(['action_taken','action_taken_name'])['action_taken'].count()
target_df_count

action_taken  action_taken_name                                  
1             Loan originated                                        809668
2             Application approved but not accepted                   46278
3             Application denied by financial institution            207513
4             Application withdrawn by applicant                     217340
5             File closed for incompleteness                          70335
6             Loan purchased by the institution                      120740
7             Preapproval request denied by financial institution        24
8             Preapproval request approved but not accepted              13
Name: action_taken, dtype: int64

In [15]:
preapproval_df_count = preapproval_df.groupby(['preapproval_name','preapproval'])['preapproval'].count()
preapproval_df_count


preapproval_name               preapproval
Not applicable                 3              1272458
Preapproval was not requested  2               172193
Preapproval was requested      1                27260
Name: preapproval, dtype: int64

In [16]:
metric_df = pd.concat([target_df, preapproval_df], axis=1)

In [17]:
pa_req_df = metric_df.loc[metric_df['preapproval'] == 1]
pa_req_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 27260 entries, 2315 to 1473035
Data columns (total 4 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   action_taken_name  27260 non-null  object
 1   action_taken       27260 non-null  int64 
 2   preapproval_name   27260 non-null  object
 3   preapproval        27260 non-null  int64 
dtypes: int64(2), object(2)
memory usage: 1.0+ MB


In [18]:
pa_req_df_count = pa_req_df.groupby(['action_taken','action_taken_name'])['action_taken'].count()
pa_req_df_count

action_taken  action_taken_name                                  
1             Loan originated                                        20790
2             Application approved but not accepted                   1243
3             Application denied by financial institution             1540
4             Application withdrawn by applicant                      3278
5             File closed for incompleteness                           372
7             Preapproval request denied by financial institution       24
8             Preapproval request approved but not accepted             13
Name: action_taken, dtype: int64

In [19]:
pa_origination = len(pa_req_df.loc[pa_req_df['action_taken']==1])
pa_origination_rate = pa_origination / len(pa_req_df) * 100
print(f'Loan Origination: {pa_origination_rate:.2f}%')

Loan Origination: 76.27%


## Cleaning ML Data
1. remove rows with applications withdrawn by applicant action_taken == 3
2. remove rows with application closed for incompleteness action_taken == 5

In [20]:
# ml_df = ml_df.loc[ml_df['action_taken'] != 4 ].loc[ml_df['action_taken'] != 5]

In [21]:
len(ml_df)

1471911

In [22]:
def fix_target_data(member):
  if member == 1:
    return 1 
  else:
    return 0

In [23]:
ml_df['action_taken'] = ml_df['action_taken'].apply(fix_target_data)

In [24]:
ml_df = ml_df.drop(columns=['preapproval','preapproval_name','action_taken_name'])

In [25]:
row_count = len(ml_df)
column_count = len(ml_df.columns)

print(f'Row Count: {row_count} | Column Count: {column_count}')

Row Count: 1471911 | Column Count: 20


In [26]:
ml_df.dtypes

loan_amount_000s                    int64
applicant_income_000s               int64
population                          int64
minority_population               float64
census_tract_number               float64
hud_median_family_income            int64
tract_to_msamd_income             float64
number_of_owner_occupied_units      int64
number_of_1_to_4_family_units       int64
agency_abbr                        object
loan_type_name                     object
property_type_name                 object
loan_purpose_name                  object
owner_occupancy_name               object
msamd_name                         object
county_name                        object
purchaser_type_name                object
hoepa_status_name                  object
lien_status_name                   object
action_taken                        int64
dtype: object

In [27]:
def get_categorical_columns(data):
  data = data.copy()

  encoded_columns = data.dtypes[data.dtypes == 'object'].index.tolist()

  return encoded_columns

In [28]:
ml_cat = get_categorical_columns(ml_df)

In [29]:
len(ml_cat)

10

In [30]:
ml_df[ml_cat].nunique()

agency_abbr              6
loan_type_name           4
property_type_name       2
loan_purpose_name        3
owner_occupancy_name     3
msamd_name              29
county_name             37
purchaser_type_name     10
hoepa_status_name        2
lien_status_name         4
dtype: int64

In [31]:
for cat in ml_cat:
  if not os.path.isdir('./eda'):
    os.makedirs('./eda')
    
  with open(f'./eda/{cat}_value_counts.txt','w') as txt_file:
    col_value_counts = ml_df[cat].value_counts()

    txt_file.write(str(col_value_counts))
    
    print(col_value_counts)

HUD     860158
CFPB    466969
NCUA     70844
FDIC     36445
OCC      27019
FRS      10476
Name: agency_abbr, dtype: int64
Conventional          1190183
FHA-insured            185171
VA-guaranteed           93325
FSA/RHS-guaranteed       3232
Name: loan_type_name, dtype: int64
One-to-four family dwelling (other than manufactured housing)    1446807
Manufactured housing                                               25104
Name: property_type_name, dtype: int64
Refinancing         782004
Home purchase       579724
Home improvement    110183
Name: loan_purpose_name, dtype: int64
Owner-occupied as a principal dwelling        1305594
Not owner-occupied as a principal dwelling     165114
Not applicable                                   1203
Name: owner_occupancy_name, dtype: int64
Los Angeles, Long Beach, Glendale - CA                   306983
Riverside, San Bernardino, Ontario - CA                  223395
San Diego, Carlsbad - CA                                 138783
Sacramento, Roseville, A

In [32]:
def make_buckets(data, column, cutoff):
    data.copy()
    
    label_list = data[column].value_counts()

    replace_list = list(label_list[label_list < cutoff].index)

    for item in replace_list:
        data[column] = data[column].replace(item,'other')
    
    return data

In [33]:
# reduce the number of msamd and counties
ml_df = make_buckets(ml_df, 'msamd_name', 10000)
ml_df = make_buckets(ml_df, 'county_name', 10000)

In [34]:
ml_df[ml_cat].nunique()

agency_abbr              6
loan_type_name           4
property_type_name       2
loan_purpose_name        3
owner_occupancy_name     3
msamd_name              21
county_name             26
purchaser_type_name     10
hoepa_status_name        2
lien_status_name         4
dtype: int64

In [35]:
# note that columns with two values will produce two encoded columns in the preprocessing steps. Make sure to delete one of those encoded columns
# drop(columns='property_type_name Manufactured Housing','hoepa_status_name HOEPA loan')
ml_df['property_type_name'].value_counts()
ml_df['hoepa_status_name'].value_counts()

Not a HOEPA loan    1471632
HOEPA loan              279
Name: hoepa_status_name, dtype: int64

In [36]:
# need to correct buckets for lien_status

In [37]:
def fix_lien_status(member):
  if member == 'Not applicable' or member == 'Not secured by a lien':
    return 'Not secured by a lien'
  else:
    return member

In [38]:
ml_df['lien_status_name'] = ml_df['lien_status_name'].apply(fix_lien_status)
ml_df.lien_status_name.value_counts()

Secured by a first lien          1287256
Not secured by a lien             139228
Secured by a subordinate lien      45427
Name: lien_status_name, dtype: int64

In [39]:
def encode_data(data, encoded_columns):
    data = data.copy()

    # make df of columns to be encoded
    enc_df = data[encoded_columns]

    #initialize the encoder
    enc = OneHotEncoder(sparse=False)
    #fit and transform the encoded columns
    encoded = enc.fit_transform(enc_df)

    encoded_df = pd.DataFrame(encoded)

    encoded_df.columns = enc.get_feature_names_out(encoded_columns)

    encoded_df = encoded_df.drop(columns=['property_type_name_Manufactured housing','hoepa_status_name_Not a HOEPA loan',])

    data = data.merge(encoded_df, left_index=True, right_index=True).drop(columns=encoded_columns, axis=1)

    print('encode_data: Done!')
    return data

In [40]:
ml_df_encoded = encode_data(ml_df, ml_cat)
ml_df_encoded.columns

encode_data: Done!


Index(['loan_amount_000s', 'applicant_income_000s', 'population',
       'minority_population', 'census_tract_number',
       'hud_median_family_income', 'tract_to_msamd_income',
       'number_of_owner_occupied_units', 'number_of_1_to_4_family_units',
       'action_taken', 'agency_abbr_CFPB', 'agency_abbr_FDIC',
       'agency_abbr_FRS', 'agency_abbr_HUD', 'agency_abbr_NCUA',
       'agency_abbr_OCC', 'loan_type_name_Conventional',
       'loan_type_name_FHA-insured', 'loan_type_name_FSA/RHS-guaranteed',
       'loan_type_name_VA-guaranteed',
       'property_type_name_One-to-four family dwelling (other than manufactured housing)',
       'loan_purpose_name_Home improvement', 'loan_purpose_name_Home purchase',
       'loan_purpose_name_Refinancing', 'owner_occupancy_name_Not applicable',
       'owner_occupancy_name_Not owner-occupied as a principal dwelling',
       'owner_occupancy_name_Owner-occupied as a principal dwelling',
       'msamd_name_Anaheim, Santa Ana, Irvine - CA',
  

## Machine Learning Process
### Begin Preprocessing: Input/Target Split, Train/Test Split, Standardization

In [41]:
def split_target(data, target):
  data = data.copy()

  y = data[target]
  X = data.drop(columns=target)

  print('split_target: Done!')
  return X, y

In [42]:
X, y = split_target(ml_df_encoded, 'action_taken')

split_target: Done!


In [43]:
# train_test_split 
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, stratify=y)

In [44]:
def scale_data(train, test):
  scaler = StandardScaler()

  t_scaler = scaler.fit(train)

  train_scaled = t_scaler.transform(train)
  test_scaled = t_scaler.transform(test)

  print('scale_data: Done!')
  return train_scaled, test_scaled

In [45]:
X_train_scaled, X_test_scaled = scale_data(X_train, X_test)

scale_data: Done!


### Begin training balanced random forest model 

In [46]:
def brfc_model(X, y, test):
  model = BalancedRandomForestClassifier()
  model.fit(X, y)
  pred = model.predict(test)
  
  print('brfc_model: Done!')
  return model, pred

In [47]:
brfc, brfc_y_pred = brfc_model(X_train_scaled, y_train, X_test_scaled)

brfc_model: Done!


In [48]:
brfc_accuracy = balanced_accuracy_score(y_test, brfc_y_pred)
print('Balanced Accuracy Score: Balanced Random Forest Classifier')
print(f'Accuracy Score: {brfc_accuracy}')

Balanced Accuracy Score: Balanced Random Forest Classifier
Accuracy Score: 0.5385766050889399


In [49]:
def model_metrics(model, test, pred, title):
  if not os.path.isdir('./ml'):
    os.makedirs('./ml')
      
  with open(f'./ml/{title}_results.txt','w') as txt_file:
    model_accuracy = balanced_accuracy_score(test, pred)
    cm = confusion_matrix(test, pred)
    cm_df = pd.DataFrame(cm, columns=['Predicted 0','Predicted 1'], index=['Actual 0','Actual 1'])
    crib = classification_report_imbalanced(test,pred)
    txt_file.write(f'Accuracy Score: {model_accuracy} \n\n')
    txt_file.write(str(cm_df))
    txt_file.write(f'\n\n {crib}')
    print(model)
    print(f'Accuracy Score: {model_accuracy:.4f}')
    print(cm_df)
    print(crib)

In [50]:
brfc_model_name = 'Balanced Random Forest Classifier'
model_metrics(model=brfc_model_name, test=y_test, pred=brfc_y_pred, title='brfc_0')

Balanced Random Forest Classifier
Accuracy Score: 0.5386
          Predicted 0  Predicted 1
Actual 0        90056        75372
Actual 1        94504       107761
                   pre       rec       spe        f1       geo       iba       sup

          0       0.49      0.54      0.53      0.51      0.54      0.29    165428
          1       0.59      0.53      0.54      0.56      0.54      0.29    202265

avg / total       0.54      0.54      0.54      0.54      0.54      0.29    367693



In [51]:
def get_feature_importances(model, data, title):
  if not os.path.isdir('./ml'):
    os.makedirs('./ml')
      
  with open(f'./ml/{title}_feature_importances.txt','w') as txt_file:
  
    importances = model.feature_importances_
  
    zipped_importances = sorted(zip(importances, data.columns), reverse=True)
    importance_df = pd.DataFrame({'importance':[i[0] for i in zipped_importances], 'parameter':[i[1] for i in zipped_importances]})

    txt_file.write(f'feature importances - {title} \n\n')
    txt_file.write(importance_df.to_string())
    print(importance_df.head(20))
    return importance_df

In [52]:
brfc_importances = get_feature_importances(brfc, X, title='brfc_0')


    importance                                          parameter
0     0.128653                              applicant_income_000s
1     0.123659                                   loan_amount_000s
2     0.068657                                minority_population
3     0.068155                              tract_to_msamd_income
4     0.067435                                census_tract_number
5     0.066579                                         population
6     0.066217                     number_of_owner_occupied_units
7     0.066050                      number_of_1_to_4_family_units
8     0.030444                           hud_median_family_income
9     0.014125  purchaser_type_name_Loan was not originated or...
10    0.011760                      loan_purpose_name_Refinancing
11    0.010970                                    agency_abbr_HUD
12    0.010966                    loan_purpose_name_Home purchase
13    0.010196                                   agency_abbr_CFPB
14    0.00

Since Accuracy was so low, we need to go back and look at buckets again. let's reduce MSAMD and County to 10 buckets each

In [53]:
ml_df_1 = ml_df.copy()

In [54]:
ml_df_1 = make_buckets(ml_df_1, 'msamd_name', 34500)
ml_df_1 = make_buckets(ml_df_1, 'county_name', 35000)

In [55]:
ml_df_1.msamd_name.value_counts()

Los Angeles, Long Beach, Glendale - CA                   306983
other                                                    294941
Riverside, San Bernardino, Ontario - CA                  223395
San Diego, Carlsbad - CA                                 138783
Sacramento, Roseville, Arden-Arcade - CA                 122479
Oakland, Hayward, Berkeley - CA                          121615
Anaheim, Santa Ana, Irvine - CA                          117071
San Jose, Sunnyvale, Santa Clara - CA                     67416
San Francisco, Redwood City, South San Francisco - CA     44697
Oxnard, Thousand Oaks, Ventura - CA                       34531
Name: msamd_name, dtype: int64

In [56]:
ml_df_1.county_name.value_counts()

other                    423508
Los Angeles County       306983
San Diego County         138783
Riverside County         125636
Orange County            117071
San Bernardino County     97759
Sacramento County         76397
Santa Clara County        64159
Alameda County            63347
Contra Costa County       58268
Name: county_name, dtype: int64

In [57]:
ml_df_1_encoded = encode_data(ml_df_1, ml_cat)
ml_df_1_encoded.columns

encode_data: Done!


Index(['loan_amount_000s', 'applicant_income_000s', 'population',
       'minority_population', 'census_tract_number',
       'hud_median_family_income', 'tract_to_msamd_income',
       'number_of_owner_occupied_units', 'number_of_1_to_4_family_units',
       'action_taken', 'agency_abbr_CFPB', 'agency_abbr_FDIC',
       'agency_abbr_FRS', 'agency_abbr_HUD', 'agency_abbr_NCUA',
       'agency_abbr_OCC', 'loan_type_name_Conventional',
       'loan_type_name_FHA-insured', 'loan_type_name_FSA/RHS-guaranteed',
       'loan_type_name_VA-guaranteed',
       'property_type_name_One-to-four family dwelling (other than manufactured housing)',
       'loan_purpose_name_Home improvement', 'loan_purpose_name_Home purchase',
       'loan_purpose_name_Refinancing', 'owner_occupancy_name_Not applicable',
       'owner_occupancy_name_Not owner-occupied as a principal dwelling',
       'owner_occupancy_name_Owner-occupied as a principal dwelling',
       'msamd_name_Anaheim, Santa Ana, Irvine - CA',
  

In [58]:
X_1, y_1 = split_target(ml_df_1_encoded, 'action_taken')

split_target: Done!


In [59]:
len(X_1.columns)

60

In [60]:
X_1_train, X_1_test, y_1_train, y_1_test = train_test_split(X_1, y_1, random_state=1, stratify=y_1)

In [61]:
X_1_train_scaled, X_1_test_scaled = scale_data(X_1_train, X_1_test)

scale_data: Done!


In [62]:
brfc_1, brfc_1_pred = brfc_model(X_1_train_scaled, y_1_train, X_1_test_scaled)

brfc_model: Done!


In [63]:
brfc_1_name = 'Balanced Random Forest Classifier Reduced - Test 1'
model_metrics(model=brfc_1_name, test=y_1_test, pred=brfc_1_pred, title='brfc_1')

Balanced Random Forest Classifier Reduced - Test 1
Accuracy Score: 0.5385
          Predicted 0  Predicted 1
Actual 0        89696        75732
Actual 1        94089       108176
                   pre       rec       spe        f1       geo       iba       sup

          0       0.49      0.54      0.53      0.51      0.54      0.29    165428
          1       0.59      0.53      0.54      0.56      0.54      0.29    202265

avg / total       0.54      0.54      0.54      0.54      0.54      0.29    367693



In [64]:
brfc_1_importances = get_feature_importances(brfc_1, X_1, title='brfc_1')
brfc_1_importances.head(20)

    importance                                          parameter
0     0.137319                              applicant_income_000s
1     0.132736                                   loan_amount_000s
2     0.072548                                minority_population
3     0.072021                              tract_to_msamd_income
4     0.071320                                census_tract_number
5     0.070293                                         population
6     0.070134                     number_of_owner_occupied_units
7     0.069833                      number_of_1_to_4_family_units
8     0.031826                           hud_median_family_income
9     0.014489  purchaser_type_name_Loan was not originated or...
10    0.011863                      loan_purpose_name_Refinancing
11    0.011250                                    agency_abbr_HUD
12    0.011176                    loan_purpose_name_Home purchase
13    0.010303              purchaser_type_name_Fannie Mae (FNMA)
14    0.01

Unnamed: 0,importance,parameter
0,0.137319,applicant_income_000s
1,0.132736,loan_amount_000s
2,0.072548,minority_population
3,0.072021,tract_to_msamd_income
4,0.07132,census_tract_number
5,0.070293,population
6,0.070134,number_of_owner_occupied_units
7,0.069833,number_of_1_to_4_family_units
8,0.031826,hud_median_family_income
9,0.014489,purchaser_type_name_Loan was not originated or...


That's not good... let's try reducing purchaser name instead

In [65]:
ml_df_2 = ml_df.copy()

In [66]:
ml_df_2[ml_cat].nunique()

agency_abbr              6
loan_type_name           4
property_type_name       2
loan_purpose_name        3
owner_occupancy_name     3
msamd_name              21
county_name             26
purchaser_type_name     10
hoepa_status_name        2
lien_status_name         3
dtype: int64

In [67]:
ml_df_2 = make_buckets(ml_df_2, 'purchaser_type_name', 31000)


In [68]:
ml_df_2[ml_cat].purchaser_type_name.value_counts()

Loan was not originated or was not sold in calendar year covered by register    785855
Fannie Mae (FNMA)                                                               231027
Freddie Mac (FHLMC)                                                             145419
Ginnie Mae (GNMA)                                                                91902
Commercial bank, savings bank or savings association                             85576
Life insurance company, credit union, mortgage bank, or finance company          75933
other                                                                            56199
Name: purchaser_type_name, dtype: int64

In [69]:
ml_df_2_encoded = encode_data(ml_df_2, ml_cat)

encode_data: Done!


In [70]:
def split_train_build_brfc(data, target, name, title):
  
  X_, y_ = split_target(data, target)

  X_train_, X_test_, y_train_, y_test_ = train_test_split(X_, y_, random_state=1, stratify=y_)

  X_train_scaled_, X_test_scaled_ = scale_data(X_train_, X_test_)

  brfc_, brfc_pred = brfc_model(X_train_scaled_, y_train_, X_test_scaled_)

  model_metrics(model=name, test=y_test_, pred=brfc_pred, title=title)

  brfc_importances = get_feature_importances(brfc_, X_, title=title)

  return brfc_, brfc_pred, brfc_importances

In [71]:
brfc_2, brfc_2_pred, brfc_2_importances = split_train_build_brfc(data=ml_df_2_encoded, target='action_taken', name='Balanced Random Forest Model Reduced - Test 2', title='brfc_2')

split_target: Done!
scale_data: Done!
brfc_model: Done!
Balanced Random Forest Model Reduced - Test 2
Accuracy Score: 0.5387
          Predicted 0  Predicted 1
Actual 0        89877        75551
Actual 1        94243       108022
                   pre       rec       spe        f1       geo       iba       sup

          0       0.49      0.54      0.53      0.51      0.54      0.29    165428
          1       0.59      0.53      0.54      0.56      0.54      0.29    202265

avg / total       0.54      0.54      0.54      0.54      0.54      0.29    367693

    importance                                          parameter
0     0.129158                              applicant_income_000s
1     0.124672                                   loan_amount_000s
2     0.067179                                minority_population
3     0.066911                              tract_to_msamd_income
4     0.066195                                census_tract_number
5     0.065111                         

In [72]:
ml_df_3 = ml_df.copy()

In [73]:
def encode_labels(data, cat):
  data = data.copy()

  enc_data = data[cat]

  le = LabelEncoder()

  for col in enc_data.columns:
    data[col] = le.fit_transform(data[col])

  return data


In [74]:
ml_df_3_encoded = encode_labels(ml_df_3, ml_cat)
ml_df_3_encoded.columns


Index(['loan_amount_000s', 'applicant_income_000s', 'population',
       'minority_population', 'census_tract_number',
       'hud_median_family_income', 'tract_to_msamd_income',
       'number_of_owner_occupied_units', 'number_of_1_to_4_family_units',
       'agency_abbr', 'loan_type_name', 'property_type_name',
       'loan_purpose_name', 'owner_occupancy_name', 'msamd_name',
       'county_name', 'purchaser_type_name', 'hoepa_status_name',
       'lien_status_name', 'action_taken'],
      dtype='object')

In [75]:
brfc_3, brfc_3_pred, brfc_3_importances = split_train_build_brfc(data=ml_df_3_encoded, target='action_taken', name='Balanced Random Forest Model Encoded Labels - Test 3', title='brfc_3')

split_target: Done!
scale_data: Done!
brfc_model: Done!
Balanced Random Forest Model Encoded Labels - Test 3
Accuracy Score: 0.8944
          Predicted 0  Predicted 1
Actual 0       152761        12800
Actual 1        27119       175298
                   pre       rec       spe        f1       geo       iba       sup

          0       0.85      0.92      0.87      0.88      0.89      0.80    165561
          1       0.93      0.87      0.92      0.90      0.89      0.79    202417

avg / total       0.89      0.89      0.90      0.89      0.89      0.80    367978

    importance                       parameter
0     0.454992             purchaser_type_name
1     0.145060                lien_status_name
2     0.077933                loan_amount_000s
3     0.075475           applicant_income_000s
4     0.058027                     agency_abbr
5     0.025028           tract_to_msamd_income
6     0.024296             minority_population
7     0.023806             census_tract_number
8    