In [1]:
# Import modules
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import tree
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# Figures inline and set visualization style
%matplotlib inline
sns.set()

In [2]:
url = 'https://raw.githubusercontent.com/malle72/finalproject/master/la_loans.csv'

In [3]:
loans_df = pd.read_csv(url)

In [4]:
loans_df.head()

Unnamed: 0,app_accepted,respondent_id,agency_name,agency_abbr,loan_type_name,property_type_name,loan_purpose_name,owner_occupancy_name,loan_amount_000s,preapproval_name,...,applicant_income_000s,purchaser_type_name,hoepa_status_name,lien_status_name,population,minority_population,hud_median_family_income,tract_to_msamd_income,number_of_owner_occupied_units,number_of_1_to_4_family_units
0,1,463735,Consumer Financial Protection Bureau,CFPB,Conventional,One-to-four family dwelling (other than manufa...,Home purchase,Owner-occupied as a principal dwelling,424,Not applicable,...,212.0,Loan was not originated or was not sold in cal...,Not a HOEPA loan,Secured by a first lien,8060.0,6.44,63300.0,179.789993,2432.0,3051.0
1,1,13951,Federal Deposit Insurance Corporation,FDIC,Conventional,One-to-four family dwelling (other than manufa...,Home purchase,Not owner-occupied as a principal dwelling,58,Not applicable,...,125.0,Loan was not originated or was not sold in cal...,Not a HOEPA loan,Secured by a first lien,3412.0,26.290001,53700.0,88.519997,541.0,1856.0
2,1,75-2921540,Department of Housing and Urban Development,HUD,Conventional,One-to-four family dwelling (other than manufa...,Refinancing,Owner-occupied as a principal dwelling,87,Not applicable,...,,Freddie Mac (FHLMC),Not a HOEPA loan,Secured by a first lien,11628.0,16.139999,68100.0,169.660004,3549.0,4049.0
3,0,233031,Consumer Financial Protection Bureau,CFPB,Conventional,One-to-four family dwelling (other than manufa...,Refinancing,Owner-occupied as a principal dwelling,80,Not applicable,...,160.0,Loan was not originated or was not sold in cal...,Not a HOEPA loan,Secured by a first lien,3412.0,26.290001,53700.0,88.519997,541.0,1856.0
4,0,233031,Consumer Financial Protection Bureau,CFPB,Conventional,One-to-four family dwelling (other than manufa...,Refinancing,Not owner-occupied as a principal dwelling,64,Not applicable,...,73.0,Loan was not originated or was not sold in cal...,Not a HOEPA loan,Secured by a subordinate lien,2667.0,66.970001,53700.0,65.779999,209.0,1081.0


In [5]:
loans_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 148996 entries, 0 to 148995
Data columns (total 29 columns):
 #   Column                          Non-Null Count   Dtype  
---  ------                          --------------   -----  
 0   app_accepted                    148996 non-null  int64  
 1   respondent_id                   148996 non-null  object 
 2   agency_name                     148996 non-null  object 
 3   agency_abbr                     148996 non-null  object 
 4   loan_type_name                  148996 non-null  object 
 5   property_type_name              148996 non-null  object 
 6   loan_purpose_name               148996 non-null  object 
 7   owner_occupancy_name            148996 non-null  object 
 8   loan_amount_000s                148996 non-null  int64  
 9   preapproval_name                148996 non-null  object 
 10  applicant_ethnicity_name        148996 non-null  object 
 11  co_applicant_ethnicity_name     148996 non-null  object 
 12  applicant_race_n

In [6]:
loans_df.describe()

Unnamed: 0,app_accepted,loan_amount_000s,applicant_sex,applicant_income_000s,population,minority_population,hud_median_family_income,tract_to_msamd_income,number_of_owner_occupied_units,number_of_1_to_4_family_units
count,148996.0,148996.0,148996.0,135141.0,145715.0,145715.0,145715.0,145715.0,145715.0,145715.0
mean,0.55923,164.155742,1.523229,90.640857,5790.106701,33.987342,59589.701815,112.088414,1562.809525,2202.367615
std,0.496481,374.476442,0.765829,502.567417,2860.910917,24.677483,6779.225197,39.329715,897.877141,1010.090677
min,0.0,1.0,1.0,1.0,0.0,0.0,47200.0,0.0,0.0,0.0
25%,0.0,75.0,1.0,43.0,3689.0,15.47,53700.0,84.019997,936.0,1480.0
50%,1.0,134.0,1.0,65.0,5293.0,26.639999,61600.0,108.879997,1416.0,2084.0
75%,1.0,203.0,2.0,102.0,7323.0,47.400002,63300.0,134.199997,2014.0,2721.0
max,1.0,43448.0,4.0,147417.0,16628.0,100.0,68100.0,333.570007,4838.0,5690.0


In [7]:
#loops through all the census numeric columns to replace NaN or missing values
for col in loans_df.columns[23:29]:
    no_na = loans_df[col].dropna() #an intermediary variable so that the median function can be called and used to replace missing values
    loans_df[col].fillna(np.median(no_na),inplace=True)

In [8]:
#loops through the secondary race fields and enters 'Not applicable' where the loan applicant didn't provide a second race
for col in loans_df.columns[13:16]:
    loans_df[col].fillna('Not applicable',inplace=True)

In [9]:
# replaces empty values in applicant_income_000s
loans_df['applicant_income_000s'].fillna(value=np.median(loans_df.applicant_income_000s.dropna()),inplace=True)

In [10]:
x_train, x_test, y_train, y_test = train_test_split(loans_df[loans_df.columns[1:]],loans_df.app_accepted,test_size=0.2,random_state=9)

In [11]:
first_test = tree.DecisionTreeClassifier(x_train,y_train)

In [12]:
first_test

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None,
                       criterion=       respondent_id                                  agency_name agency_abbr  \
8292          463735         Consumer Financial Protection Bureau        CFPB   
36868         808176         Consumer Financial Protection Bureau        CFPB   
68738         463735         Consumer Financial Protection Bureau        CFPB   
61273          12611        Federal Deposit Insurance Corporation        FDIC   
87536     52-2321476  Department of Housing and Urban Development         HUD   
.....
86364                           1471.0                        2133.0  

[119196 rows x 28 columns],
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       ra