In [44]:
import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter

In [67]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.model_selection import train_test_split
from imblearn.ensemble import EasyEnsembleClassifier
from sklearn.metrics import balanced_accuracy_score, confusion_matrix
from imblearn.metrics import classification_report_imbalanced

In [46]:
path = Path('./cleaned_data.csv')
df = pd.read_csv(path, low_memory=False)

In [47]:
ml_columns = ['loan_amount_000s', 'applicant_income_000s', 'population', 'minority_population', 'census_tract_number', 'hud_median_family_income', 'tract_to_msamd_income', 'number_of_owner_occupied_units', 'number_of_1_to_4_family_units', 'agency_abbr', 'loan_type_name', 'property_type_name', 'loan_purpose_name', 'owner_occupancy_name', 'msamd_name', 'county_name', 'action_taken']

In [48]:
ml_df = df[ml_columns].copy()
ml_df = ml_df.dropna()
ml_df = ml_df.drop_duplicates()

ml_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1466293 entries, 0 to 1707304
Data columns (total 17 columns):
 #   Column                          Non-Null Count    Dtype  
---  ------                          --------------    -----  
 0   loan_amount_000s                1466293 non-null  float64
 1   applicant_income_000s           1466293 non-null  float64
 2   population                      1466293 non-null  float64
 3   minority_population             1466293 non-null  float64
 4   census_tract_number             1466293 non-null  float64
 5   hud_median_family_income        1466293 non-null  float64
 6   tract_to_msamd_income           1466293 non-null  float64
 7   number_of_owner_occupied_units  1466293 non-null  float64
 8   number_of_1_to_4_family_units   1466293 non-null  float64
 9   agency_abbr                     1466293 non-null  object 
 10  loan_type_name                  1466293 non-null  object 
 11  property_type_name              1466293 non-null  object 
 12  

In [49]:
def fix_target_data(member):
  if member == 1:
    return 1 
  else:
    return 0

In [50]:
ml_df['action_taken'] = ml_df['action_taken'].apply(fix_target_column)

In [51]:
ml_df['action_taken'].value_counts()

0    807981
1    658312
Name: action_taken, dtype: int64

In [52]:
def get_categorical_columns(data):
  data = data.copy()

  encoded_columns = data.dtypes[data.dtypes == 'object'].index.tolist()

  return encoded_columns

In [53]:
ml_cat = get_categorical_columns(ml_df)
ml_cat

['agency_abbr',
 'loan_type_name',
 'property_type_name',
 'loan_purpose_name',
 'owner_occupancy_name',
 'msamd_name',
 'county_name']

In [54]:
ml_df[ml_cat].nunique()

agency_abbr              6
loan_type_name           4
property_type_name       2
loan_purpose_name        3
owner_occupancy_name     3
msamd_name              29
county_name             37
dtype: int64

In [55]:
def encode_data(data, encoded_columns):
    data = data.copy()

    # make df of columns to be encoded
    enc_df = data[encoded_columns]

    #initialize the encoder
    enc = OneHotEncoder(sparse=False)
    #fit and transform the encoded columns
    encoded = enc.fit_transform(enc_df)

    encoded_df = pd.DataFrame(encoded)

    encoded_df.columns = enc.get_feature_names_out(encoded_columns)

    encoded_df = encoded_df.drop(columns=['property_type_name_Manufactured housing'])

    data = data.merge(encoded_df, left_index=True, right_index=True).drop(columns=encoded_columns, axis=1)

    print('encode_data: Done!')
    return data

In [56]:
ml_df_encoded = encode_data(ml_df, ml_cat)
ml_df_encoded

encode_data: Done!


Unnamed: 0,loan_amount_000s,applicant_income_000s,population,minority_population,census_tract_number,hud_median_family_income,tract_to_msamd_income,number_of_owner_occupied_units,number_of_1_to_4_family_units,action_taken,...,county_name_Santa Cruz County,county_name_Shasta County,county_name_Solano County,county_name_Sonoma County,county_name_Stanislaus County,county_name_Sutter County,county_name_Tulare County,county_name_Ventura County,county_name_Yolo County,county_name_Yuba County
0,570.0,144.0,4824.0,37.230000,61.01,75200.0,57.419998,818.0,1626.0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,185.0,51.0,7404.0,57.520000,432.27,63200.0,116.010002,1215.0,1743.0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1079.0,278.0,3372.0,33.189999,4038.00,97400.0,141.740005,592.0,1105.0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,417.0,125.0,8787.0,65.129997,3020.08,97400.0,97.269997,1463.0,2164.0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,379.0,86.0,5356.0,23.100000,58.01,75200.0,126.690002,1711.0,2102.0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1466287,600.0,246.0,2687.0,14.630000,80.02,79300.0,143.520004,830.0,1033.0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1466288,204.0,94.0,6494.0,21.280001,191.05,79300.0,137.100006,1859.0,2314.0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1466289,421.0,128.0,7458.0,34.139999,3040.05,97400.0,123.230003,1657.0,2201.0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1466290,204.0,66.0,18026.0,69.089996,31.23,53000.0,170.009995,3344.0,4670.0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [58]:
def split_target(data, target):
  data = data.copy()

  y = data[target]
  X = data.drop(columns=target)

  print('split_target: Done!')
  return X, y

In [59]:
X, y = split_target(ml_df_encoded, 'action_taken')

split_target: Done!


In [60]:
# train test split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=3, stratify=y)

In [61]:
def scale_data(train, test):
  scaler = StandardScaler()

  t_scaler = scaler.fit(train)

  train_scaled = t_scaler.transform(train)
  test_scaled = t_scaler.transform(test)

  print('scale_data: Done!')
  return train_scaled, test_scaled

In [62]:
X_train_scaled, X_test_scaled = scale_data(X_train, X_test)

scale_data: Done!


In [64]:
def build_eec(X_train, X_test, y_train, y_test):
    model = EasyEnsembleClassifier()
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)

    #balanced accuracy score
    accuracy_score = balanced_accuracy_score(y_test, y_pred)
    print(f'Balanced Accuracy Score: Easy Ensemble AdaBoost Classifier')
    print(accuracy_score)
    return model, y_pred,  accuracy_score

In [19]:
eec_model, eec_0_pred, eec_0_accuracy = build_eec(X_train, X_test, y_train, y_test)

Balanced Accuracy Score: Easy Ensemble AdaBoost Classifier
0.5653180103443203


In [63]:
def model_metrics(model_name, test, pred, file_title):
  if not os.path.isdir('./ml'):
    os.makedirs('./ml')
      
  with open(f'./ml/{file_title}_results.txt','w') as txt_file:
    model_accuracy = balanced_accuracy_score(test, pred)
    cm = confusion_matrix(test, pred)
    cm_df = pd.DataFrame(cm, columns=['Predicted 0','Predicted 1'], index=['Actual 0','Actual 1'])
    crib = classification_report_imbalanced(test,pred)
    txt_file.write(f'Accuracy Score: {model_accuracy} \n\n')
    txt_file.write(str(cm_df))
    txt_file.write(f'\n\n {crib}')
    print(model_name)
    print(f'Accuracy Score: {model_accuracy:.4f}')
    print(cm_df)
    print(crib)

In [25]:
model_metrics(model_name='EasyEnsembleClassifier', test=y_test, pred=eec_y_pred, title='eec_0')

EasyEnsembleClassifier
Accuracy Score: 0.5653
          Predicted 0  Predicted 1
Actual 0       114574        69092
Actual 1        66359        68194
                   pre       rec       spe        f1       geo       iba       sup

          0       0.63      0.62      0.51      0.63      0.56      0.32    183666
          1       0.50      0.51      0.62      0.50      0.56      0.31    134553

avg / total       0.58      0.57      0.56      0.57      0.56      0.32    318219



In order to increase the accuracy score of the model, try reducing the number of categorical buckets in the dataset. 

In [29]:
ml_df[ml_cat].nunique()

agency_abbr              6
loan_type_name           4
property_type_name       2
loan_purpose_name        3
owner_occupancy_name     3
msamd_name              29
county_name             37
dtype: int64

In [31]:
ml_df['county_name'].value_counts()

Los Angeles County        305314
San Diego County          138410
Riverside County          125232
Orange County             116254
San Bernardino County      97327
Sacramento County          76206
Santa Clara County         63857
Alameda County             63094
Contra Costa County        58044
Ventura County             34401
San Joaquin County         34358
Fresno County              32105
Kern County                30827
Placer County              25619
Solano County              24532
San Mateo County           24516
Stanislaus County          24054
Sonoma County              20920
San Francisco County       20027
Santa Barbara County       13833
Tulare County              13779
Monterey County            13195
El Dorado County           12783
San Luis Obispo County     12293
Marin County               10480
Santa Cruz County           9712
Merced County               9542
Butte County                8710
Shasta County               7638
Yolo County                 7594
Napa Count

In [32]:
# Define the make_buckets function, which combines categorical values with value counts less than the designated cutoff into an "Other" category. 

def make_buckets(data, column, cutoff):
    data.copy()
    
    label_list = data[column].value_counts()

    replace_list = list(label_list[label_list < cutoff].index)

    for item in replace_list:
        data[column] = data[column].replace(item,'other')
    
    return data

In [33]:
ml_df_buc = make_buckets(ml_df, 'county_name', 20000)
ml_df_buc = make_buckets(ml_df, 'msamd_name', 20000)

After bucketing county_name and msamd_name, re-run the encoding/preprocessing/machine learning process and measure the model's accuracy. 

In [34]:
ml_df_enc_buc = encode_data(ml_df_buc, ml_cat)
ml_df_enc_buc

encode_data: Done!


Unnamed: 0,loan_amount_000s,applicant_income_000s,population,minority_population,census_tract_number,hud_median_family_income,tract_to_msamd_income,number_of_owner_occupied_units,number_of_1_to_4_family_units,action_taken,...,county_name_San Diego County,county_name_San Francisco County,county_name_San Joaquin County,county_name_San Mateo County,county_name_Santa Clara County,county_name_Solano County,county_name_Sonoma County,county_name_Stanislaus County,county_name_Ventura County,county_name_other
0,570.0,144.0,4824.0,37.230000,61.01,75200.0,57.419998,818.0,1626.0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,185.0,51.0,7404.0,57.520000,432.27,63200.0,116.010002,1215.0,1743.0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1079.0,278.0,3372.0,33.189999,4038.00,97400.0,141.740005,592.0,1105.0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,417.0,125.0,8787.0,65.129997,3020.08,97400.0,97.269997,1463.0,2164.0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,379.0,86.0,5356.0,23.100000,58.01,75200.0,126.690002,1711.0,2102.0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1466287,600.0,246.0,2687.0,14.630000,80.02,79300.0,143.520004,830.0,1033.0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1466288,204.0,94.0,6494.0,21.280001,191.05,79300.0,137.100006,1859.0,2314.0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1466289,421.0,128.0,7458.0,34.139999,3040.05,97400.0,123.230003,1657.0,2201.0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1466290,204.0,66.0,18026.0,69.089996,31.23,53000.0,170.009995,3344.0,4670.0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [35]:
X_1, y_1 = split_X_y(ml_df_enc_buc, target='action_taken')

In [36]:
X_1_train, X_1_test, y_1_train, y_1_test = train_test_split(X_1, y_1, random_state=1, stratify=y_1)

In [41]:
X_1_train_scaled, X_1_test_scaled = scale_data(X_1_train, X_1_test)

In [42]:
eec_model_1, eec_1_pred = build_eec(X_1_train, X_1_test, y_1_train, y_1_test)

Balanced Accuracy Score: Easy Ensemble AdaBoost Classifier
0.5647925421835591


In [43]:
model_metrics(model_name='EasyEnsembleClassifier - REDUCED', test=y_1_test, pred=eec_1_pred, title='eec_1')

EasyEnsembleClassifier - REDUCED
Accuracy Score: 0.5648
          Predicted 0  Predicted 1
Actual 0       114583        69083
Actual 1        66507        68046
                   pre       rec       spe        f1       geo       iba       sup

          0       0.63      0.62      0.51      0.63      0.56      0.32    183666
          1       0.50      0.51      0.62      0.50      0.56      0.31    134553

avg / total       0.58      0.57      0.56      0.57      0.56      0.32    318219



In [68]:
def encode_labels(data, cat):
  data = data.copy()

  enc_data = data[cat]

  le = LabelEncoder()

  for col in enc_data.columns:
    data[col] = le.fit_transform(data[col])

  return data

In [69]:
ml_df_2_encoded = encode_labels(ml_df, ml_cat)

In [72]:
def split_train_build_brfc(data, target, model_name, file_title, graph_title):
  
  X_, y_ = split_target(data, target)

  X_train_, X_test_, y_train_, y_test_ = train_test_split(X_, y_, random_state=1, stratify=y_)

  X_train_scaled_, X_test_scaled_ = scale_data(X_train_, X_test_)

  eec_, eec_pred, eec_accuracy = build_eec(X_train_scaled_, X_test_scaled_, y_train_, y_test_)

  model_metrics(model_name=model_name, test=y_test_, pred=eec_pred, file_title=file_title)


  return eec_, eec_pred, eec_accuracy

In [74]:
eec_2, eec_2_pred, eec_accuracy = split_train_build_brfc(data=ml_df_2_encoded, target='action_taken', model_name='Easy Ensemble Classifier Model - Test 2', file_title='eec_2', graph_title='EEC Test 2')

split_target: Done!
scale_data: Done!
