# Modelo para decisão de risco de crédito

In [37]:
# install.packages(c("C50", "gmodels"))  # run once if the packages are not installed

In [29]:
# Carregar pacotes
library(C50)
library(gmodels)

In [4]:
# Import the credit data set.
# stringsAsFactors = TRUE is spelled out because R >= 4.0.0 reads strings as
# character by default, while C5.0() requires a factor outcome and the
# exploration below expects the categorical columns to be factors.
credit <- read.csv("../data/credito.csv", stringsAsFactors = TRUE)

In [11]:
# Size of the data set (rows, columns) followed by its first six rows
dim(credit)
head(credit, n = 6L)

checking_balance,months_loan_duration,credit_history,purpose,amount,savings_balance,employment_duration,percent_of_income,years_at_residence,age,other_credit,housing,existing_loans_count,job,dependents,phone,default
< 0 DM,6,critical,furniture/appliances,1169,unknown,> 7 years,4,4,67,none,own,2,skilled,1,yes,no
1 - 200 DM,48,good,furniture/appliances,5951,< 100 DM,1 - 4 years,2,2,22,none,own,1,skilled,1,no,yes
unknown,12,critical,education,2096,< 100 DM,4 - 7 years,2,3,49,none,own,1,unskilled,2,no,no
< 0 DM,42,good,furniture/appliances,7882,< 100 DM,4 - 7 years,2,4,45,none,other,1,skilled,2,no,no
< 0 DM,24,poor,car,4870,< 100 DM,1 - 4 years,3,4,53,none,other,2,skilled,2,no,yes
unknown,36,good,education,9055,unknown,1 - 4 years,2,4,35,none,other,1,unskilled,2,yes,no


# Explorar dados

In [5]:
# Column types and a preview of the values in each column
str(object = credit)

'data.frame':	1000 obs. of  17 variables:
 $ checking_balance    : Factor w/ 4 levels "< 0 DM","> 200 DM",..: 1 3 4 1 1 4 4 3 4 3 ...
 $ months_loan_duration: int  6 48 12 42 24 36 24 36 12 30 ...
 $ credit_history      : Factor w/ 5 levels "critical","good",..: 1 2 1 2 4 2 2 2 2 1 ...
 $ purpose             : Factor w/ 6 levels "business","car",..: 5 5 4 5 2 4 5 2 5 2 ...
 $ amount              : int  1169 5951 2096 7882 4870 9055 2835 6948 3059 5234 ...
 $ savings_balance     : Factor w/ 5 levels "< 100 DM","> 1000 DM",..: 5 1 1 1 1 5 4 1 2 1 ...
 $ employment_duration : Factor w/ 5 levels "< 1 year","> 7 years",..: 2 3 4 4 3 3 2 3 4 5 ...
 $ percent_of_income   : int  4 2 2 2 3 2 3 2 2 4 ...
 $ years_at_residence  : int  4 2 3 4 4 4 4 2 4 2 ...
 $ age                 : int  67 22 49 45 53 35 53 35 61 28 ...
 $ other_credit        : Factor w/ 3 levels "bank","none",..: 2 2 2 2 2 2 2 2 2 2 ...
 $ housing             : Factor w/ 3 levels "other","own",..: 2 2 2 1 1 1 2 3 2 2 ...
 $ exi

In [10]:
# Customer attributes: frequency of each checking and savings balance level
table(credit[["checking_balance"]])
table(credit[["savings_balance"]])


    < 0 DM   > 200 DM 1 - 200 DM    unknown 
       274         63        269        394 


     < 100 DM     > 1000 DM  100 - 500 DM 500 - 1000 DM       unknown 
          603            48           103            63           183 

In [12]:
# Loan characteristics: distribution of the loan duration and of the amount
summary(credit[["months_loan_duration"]])
summary(credit[["amount"]])

   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
    4.0    12.0    18.0    20.9    24.0    72.0 

   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
    250    1366    2320    3271    3972   18424 

In [13]:
# Target variable: number of loans that did and did not default
table(credit[["default"]])


 no yes 
700 300 

# Construir dados de treino e teste

In [14]:
# Build the row index used to split the data into training and test sets.
# A fixed seed makes the random split, and every result derived from it,
# reproducible between runs.
set.seed(123)
# Draw 90% of the rows (900 of the 1000 in this data set) rather than
# hard-coding the row count, so the split still works if the file changes.
n_rows <- nrow(credit)
train_sample <- sample(n_rows, size = round(0.9 * n_rows))

In [15]:
# Split the data frame: sampled rows go to training, the remaining rows to test
credit_train <- credit[train_sample, , drop = FALSE]
credit_test <- credit[-train_sample, , drop = FALSE]

In [18]:
# Compare the share of each target class in the training and test partitions
print("Treino")
prop.table(table(credit_train[["default"]]))
print("Teste")
prop.table(table(credit_test[["default"]]))

[1] "Treino"



       no       yes 
0.7033333 0.2966667 

[1] "Teste"



  no  yes 
0.67 0.33 

# Construir modelo

In [24]:
# Fit a C5.0 decision tree: every column except the 17th (default) is a
# predictor, and default is the outcome
credit_model <- C5.0(
  x = credit_train[-17],
  y = credit_train$default
)
credit_model


Call:
C5.0.default(x = credit_train[-17], y = credit_train$default)

Classification Tree
Number of samples: 900 
Number of predictors: 16 

Tree size: 56 

Non-standard options: attempt to group attributes


In [25]:
# Detailed model report, including the fitted decision tree
summary(object = credit_model)


Call:
C5.0.default(x = credit_train[-17], y = credit_train$default)


C5.0 [Release 2.07 GPL Edition]  	Wed Jul 22 17:22:49 2020
-------------------------------

Class specified by attribute `outcome'

Read 900 cases (17 attributes) from undefined.data

Decision tree:

checking_balance = unknown: no (358/45)
checking_balance in {< 0 DM,> 200 DM,1 - 200 DM}:
:...months_loan_duration > 30:
    :...employment_duration = unemployed: no (8)
    :   employment_duration in {< 1 year,> 7 years,1 - 4 years,4 - 7 years}:
    :   :...checking_balance = < 0 DM: yes (39/6)
    :       checking_balance = > 200 DM: no (6/1)
    :       checking_balance = 1 - 200 DM:
    :       :...savings_balance in {500 - 1000 DM,unknown}: no (8/1)
    :           savings_balance in {< 100 DM,> 1000 DM,100 - 500 DM}:
    :           :...months_loan_duration > 42: yes (16)
    :               months_loan_duration <= 42:
    :               :...phone = no: no (9/3)
    :                   phone = yes:
    :         

# Avaliar performance

In [27]:
# Predict the default class for the held-out test rows and display the result
credit_pred <- predict(credit_model, newdata = credit_test)
credit_pred

In [30]:
# Confusion matrix: observed classes in rows, predicted classes in columns.
# Row, column and chi-square proportions are switched off, leaving only the
# counts and their share of the table total.
CrossTable(
  x = credit_test$default,
  y = credit_pred,
  prop.r = FALSE,
  prop.c = FALSE,
  prop.chisq = FALSE,
  dnn = c("Observado", "Previsto")
)


 
   Cell Contents
|-------------------------|
|                       N |
|         N / Table Total |
|-------------------------|

 
Total Observations in Table:  100 

 
             | Previsto 
   Observado |        no |       yes | Row Total | 
-------------|-----------|-----------|-----------|
          no |        57 |        10 |        67 | 
             |     0.570 |     0.100 |           | 
-------------|-----------|-----------|-----------|
         yes |        10 |        23 |        33 | 
             |     0.100 |     0.230 |           | 
-------------|-----------|-----------|-----------|
Column Total |        67 |        33 |       100 | 
-------------|-----------|-----------|-----------|

 


# Melhorar performance

In [31]:
# Try to improve accuracy by boosting with 10 trials
credit_boost10 <- C5.0(
  x = credit_train[-17],
  y = credit_train$default,
  trials = 10
)
credit_boost10


Call:
C5.0.default(x = credit_train[-17], y = credit_train$default, trials = 10)

Classification Tree
Number of samples: 900 
Number of predictors: 16 

Number of boosting iterations: 10 
Average tree size: 45.3 

Non-standard options: attempt to group attributes


In [32]:
# Detailed report of the boosted model, one tree per trial
summary(object = credit_boost10)


Call:
C5.0.default(x = credit_train[-17], y = credit_train$default, trials = 10)


C5.0 [Release 2.07 GPL Edition]  	Wed Jul 22 17:26:51 2020
-------------------------------

Class specified by attribute `outcome'

Read 900 cases (17 attributes) from undefined.data

-----  Trial 0:  -----

Decision tree:

checking_balance = unknown: no (358/45)
checking_balance in {< 0 DM,> 200 DM,1 - 200 DM}:
:...months_loan_duration > 30:
    :...employment_duration = unemployed: no (8)
    :   employment_duration in {< 1 year,> 7 years,1 - 4 years,4 - 7 years}:
    :   :...checking_balance = < 0 DM: yes (39/6)
    :       checking_balance = > 200 DM: no (6/1)
    :       checking_balance = 1 - 200 DM:
    :       :...savings_balance in {500 - 1000 DM,unknown}: no (8/1)
    :           savings_balance in {< 100 DM,> 1000 DM,100 - 500 DM}:
    :           :...months_loan_duration > 42: yes (16)
    :               months_loan_duration <= 42:
    :               :...phone = no: no (9/3)
    :         

In [34]:
# Score the test rows with the boosted model
credit_boost_pred10 <- predict(credit_boost10, newdata = credit_test)

# Confusion matrix for the boosted model: observed classes in rows,
# predicted classes in columns
CrossTable(
  x = credit_test$default,
  y = credit_boost_pred10,
  prop.r = FALSE,
  prop.c = FALSE,
  prop.chisq = FALSE,
  dnn = c("Observado", "Previsto")
)


 
   Cell Contents
|-------------------------|
|                       N |
|         N / Table Total |
|-------------------------|

 
Total Observations in Table:  100 

 
             | Previsto 
   Observado |        no |       yes | Row Total | 
-------------|-----------|-----------|-----------|
          no |        59 |         8 |        67 | 
             |     0.590 |     0.080 |           | 
-------------|-----------|-----------|-----------|
         yes |        16 |        17 |        33 | 
             |     0.160 |     0.170 |           | 
-------------|-----------|-----------|-----------|
Column Total |        75 |        25 |       100 | 
-------------|-----------|-----------|-----------|

 


In [35]:
# Weighting the errors

# Dimension names for the cost matrix: the first dimension (rows) is the
# predicted class and the second (columns) is the observed class
matrix_dimensions <- list(
  Previsto = c("no", "yes"),
  Observado = c("no", "yes")
)
matrix_dimensions

In [36]:
# Build the cost matrix. Values fill column by column, so with predicted
# classes in rows and observed classes in columns: predicting "yes" for an
# observed "no" costs 1, and predicting "no" for an observed "yes" costs 4.
error_cost <- matrix(
  data = c(0, 1, 4, 0),
  nrow = 2,
  dimnames = matrix_dimensions
)
error_cost

# Fit the tree again, this time applying the error cost matrix
credit_cost <- C5.0(
  x = credit_train[-17],
  y = credit_train$default,
  costs = error_cost
)

# Score the test rows with the cost-sensitive model
credit_cost_pred <- predict(credit_cost, newdata = credit_test)

# Confusion matrix for the cost-sensitive model: observed classes in rows,
# predicted classes in columns
CrossTable(
  x = credit_test$default,
  y = credit_cost_pred,
  prop.r = FALSE,
  prop.c = FALSE,
  prop.chisq = FALSE,
  dnn = c("Observado", "Previsto")
)

Unnamed: 0,no,yes
no,0,4
yes,1,0



 
   Cell Contents
|-------------------------|
|                       N |
|         N / Table Total |
|-------------------------|

 
Total Observations in Table:  100 

 
             | Previsto 
   Observado |        no |       yes | Row Total | 
-------------|-----------|-----------|-----------|
          no |        43 |        24 |        67 | 
             |     0.430 |     0.240 |           | 
-------------|-----------|-----------|-----------|
         yes |         7 |        26 |        33 | 
             |     0.070 |     0.260 |           | 
-------------|-----------|-----------|-----------|
Column Total |        50 |        50 |       100 | 
-------------|-----------|-----------|-----------|

 
