Dados vindo do site https://archive.ics.uci.edu/ml/datasets/Credit+Approval

Data Set Information:

This file concerns credit card applications. All attribute names and values have been changed to meaningless symbols to protect confidentiality of the data.

This dataset is interesting because there is a good mix of attributes -- continuous, nominal with small numbers of values, and nominal with larger numbers of values. There are also a few missing values.


Attribute Information:


A1: b, a.

A2: continuous.

A3: continuous.

A4: u, y, l, t.

A5: g, p, gg.

A6: c, d, cc, i, j, k, m, r, q, w, x, e, aa, ff.

A7: v, h, bb, j, n, z, dd, ff, o.

A8: continuous.

A9: t, f.

A10: t, f.

A11: continuous.

A12: t, f.

A13: g, p, s.

A14: continuous.

A15: continuous.

A16: +,- (class attribute)



In [63]:
import numpy as np
import pandas as pd

In [64]:
df = pd.read_csv('crx.data', index_col=False)

# Renomeando as colunas
df.columns = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'Approved']

In [65]:
# Dando um check no conteudo
df.head()

Unnamed: 0,A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,Approved
0,a,58.67,4.46,u,g,q,h,3.04,t,t,6,f,g,43,560,+
1,a,24.5,0.5,u,g,q,h,1.5,t,f,0,f,g,280,824,+
2,b,27.83,1.54,u,g,w,v,3.75,t,t,5,t,g,100,3,+
3,b,20.17,5.625,u,g,w,v,1.71,t,f,0,f,s,120,0,+
4,b,32.08,4.0,u,g,m,v,2.5,t,f,0,t,g,360,0,+


In [66]:
df.describe()

Unnamed: 0,C,H,K,O
count,689.0,689.0,689.0,689.0
mean,4.765631,2.224819,2.402032,1018.862119
std,4.97847,3.348739,4.86618,5213.743149
min,0.0,0.0,0.0,0.0
25%,1.0,0.165,0.0,0.0
50%,2.75,1.0,0.0,5.0
75%,7.25,2.625,3.0,396.0
max,28.0,28.5,67.0,100000.0


In [67]:
# Convertendo o conteudo do nosso target para 0 ou 1
df['Approved'] = (df['Approved'] == '+').astype(int)

In [68]:
df.head()

Unnamed: 0,A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,Approved
0,a,58.67,4.46,u,g,q,h,3.04,t,t,6,f,g,43,560,1
1,a,24.5,0.5,u,g,q,h,1.5,t,f,0,f,g,280,824,1
2,b,27.83,1.54,u,g,w,v,3.75,t,t,5,t,g,100,3,1
3,b,20.17,5.625,u,g,w,v,1.71,t,f,0,f,s,120,0,1
4,b,32.08,4.0,u,g,m,v,2.5,t,f,0,t,g,360,0,1


In [69]:
# vai tratar valores nullos
df.isnull().sum()

# bom.. nao tem nulos. Mas não fique tao animado.. sempre tem pegadinha 


A           0
B           0
C           0
D           0
E           0
F           0
G           0
H           0
I           0
J           0
K           0
L           0
M           0
N           0
O           0
Approved    0
dtype: int64

In [70]:
# Vai converter as colunas categoricas
categorical_columns = ['A', 'D', 'E', 'F', 'G', 'I', 'J', 'K', 'L', 'M', 'N']
for c in categorical_columns:
    df[c] = pd.Categorical(df[c]).codes

In [71]:
# Vamos ver os tipos dos dados
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 689 entries, 0 to 688
Data columns (total 16 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   A         689 non-null    int8   
 1   B         689 non-null    object 
 2   C         689 non-null    float64
 3   D         689 non-null    int8   
 4   E         689 non-null    int8   
 5   F         689 non-null    int8   
 6   G         689 non-null    int8   
 7   H         689 non-null    float64
 8   I         689 non-null    int8   
 9   J         689 non-null    int8   
 10  K         689 non-null    int8   
 11  L         689 non-null    int8   
 12  M         689 non-null    int8   
 13  N         689 non-null    int16  
 14  O         689 non-null    int64  
 15  Approved  689 non-null    int32  
dtypes: float64(2), int16(1), int32(1), int64(1), int8(10), object(1)
memory usage: 29.7+ KB


In [72]:
# Sacou a pegadinha? a coluna B no info() fala que é Object, porém é para ser continua (de acordo com o texto la no inicio)

# Vamos tirar esse ? e sapegar um -1
df['B'] = df['B'].replace('?', -1)

In [73]:
# Vamos dar um double check pra ver se ficou algo null
df.isnull().sum()

A           0
B           0
C           0
D           0
E           0
F           0
G           0
H           0
I           0
J           0
K           0
L           0
M           0
N           0
O           0
Approved    0
dtype: int64

In [74]:
# Mais uma conferidinha pra ver se o TOP dessa coluna ainda é o ?
df['B'].describe()

count     689
unique    349
top        -1
freq       12
Name: B, dtype: int64

In [90]:
# Vamos colocar os numeros na mesma escala com MinMaxScaler
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
minmax = ['B', 'C', 'H', 'K', 'N', 'O']
scaler.fit(df[minmax])
df[minmax] = scaler.transform(df[minmax])

In [91]:
df.head()

Unnamed: 0,A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,Approved
0,1,0.7344,0.159286,2,1,11,4,0.106667,1,1,0.272727,0,0,0.065089,0.0056,1
1,1,0.313846,0.017857,2,1,11,4,0.052632,1,0,0.0,0,0,0.56213,0.00824,1
2,2,0.354831,0.055,2,1,13,8,0.131579,1,1,0.227273,1,0,0.183432,3e-05,1
3,2,0.260554,0.200893,2,1,13,8,0.06,1,0,0.0,0,2,0.218935,0.0,1
4,2,0.407138,0.142857,2,1,10,8,0.087719,1,0,0.0,1,0,0.674556,0.0,1


In [92]:
# vai definir X e y
entradas = ['A','B','C','D','E','F','G','H','I','J','K','L','M','N','O']
saida = ['Approved']

X = df[entradas]
y = df[saida]

In [98]:
# train test split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.7, random_state=42)

from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

In [101]:
# Aplica os algoritmos...
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeRegressor

models = [
    {'name': 'RandomForestClassifier' , 'model': RandomForestClassifier(n_estimators=500, max_depth=5) },
    {'name': 'DecisionTreeClassifier' , 'model': DecisionTreeClassifier() },
    {'name': 'ExtraTreesClassifier' , 'model': ExtraTreesClassifier() },
    {'name': 'LogisticRegression' , 'model': LogisticRegression() },
    {'name': 'SVC' , 'model': SVC() },
    {'name': 'DecisionTreeRegressor' , 'model': DecisionTreeRegressor() },
]

for m in models:
    model = m['model']
    model.fit(X_train, y_train)
    predict = model.predict(X_test)
    cm = confusion_matrix(y_test, predict)
    print(m['name'] + ':', '\n', cm)
    print('accuracy_score: ', accuracy_score(y_test, predict))
    print('classification_report:\n', classification_report(y_test, predict))
    
    print()


RandomForestClassifier: 
 [[242  28]
 [ 38 175]]
accuracy_score:  0.8633540372670807
classification_report:
               precision    recall  f1-score   support

           0       0.86      0.90      0.88       270
           1       0.86      0.82      0.84       213

    accuracy                           0.86       483
   macro avg       0.86      0.86      0.86       483
weighted avg       0.86      0.86      0.86       483


DecisionTreeClassifier: 
 [[211  59]
 [ 40 173]]
accuracy_score:  0.7950310559006211
classification_report:
               precision    recall  f1-score   support

           0       0.84      0.78      0.81       270
           1       0.75      0.81      0.78       213

    accuracy                           0.80       483
   macro avg       0.79      0.80      0.79       483
weighted avg       0.80      0.80      0.80       483


ExtraTreesClassifier: 
 [[237  33]
 [ 31 182]]
accuracy_score:  0.8674948240165632
classification_report:
               preci