In [5]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score
from yellowbrick.classifier import ConfusionMatrix 
import lightgbm as lgb

# Criando os modelos com base no Lightgdm

In [6]:
df = pd.read_csv("datasets/Credit.csv")

In [7]:
df.head()

Unnamed: 0,checking_status,duration,credit_history,purpose,credit_amount,savings_status,employment,installment_commitment,personal_status,other_parties,...,property_magnitude,age,other_payment_plans,housing,existing_credits,job,num_dependents,own_telephone,foreign_worker,class
0,<0,6,'critical/other existing credit',radio/tv,1169,'no known savings',>=7,4,'male single',none,...,'real estate',67,none,own,2,skilled,1,yes,yes,good
1,0<=X<200,48,'existing paid',radio/tv,5951,<100,1<=X<4,2,'female div/dep/mar',none,...,'real estate',22,none,own,1,skilled,1,none,yes,bad
2,'no checking',12,'critical/other existing credit',education,2096,<100,4<=X<7,2,'male single',none,...,'real estate',49,none,own,1,'unskilled resident',2,none,yes,good
3,<0,42,'existing paid',furniture/equipment,7882,<100,4<=X<7,2,'male single',guarantor,...,'life insurance',45,none,'for free',1,skilled,2,none,yes,good
4,<0,24,'delayed previously','new car',4870,<100,1<=X<4,3,'male single',none,...,'no known property',53,none,'for free',2,skilled,2,none,yes,bad


* Temos a variável `class` que vamos remover do dataframe localizada na posição 21.
* Então vamos pegar todas as variáveis, exceto a da posição 21. 

## Convertendo as classes para binários

In [8]:
df[df['class'] == "good"] = 1

In [9]:
df[df['class'] == "bad"] = 0

### Convertendo as features do dataframe para category

In [10]:
for col in df.columns:
    df[col] = df[col].astype('category')

In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 21 columns):
 #   Column                  Non-Null Count  Dtype   
---  ------                  --------------  -----   
 0   checking_status         1000 non-null   category
 1   duration                1000 non-null   category
 2   credit_history          1000 non-null   category
 3   purpose                 1000 non-null   category
 4   credit_amount           1000 non-null   category
 5   savings_status          1000 non-null   category
 6   employment              1000 non-null   category
 7   installment_commitment  1000 non-null   category
 8   personal_status         1000 non-null   category
 9   other_parties           1000 non-null   category
 10  residence_since         1000 non-null   category
 11  property_magnitude      1000 non-null   category
 12  age                     1000 non-null   category
 13  other_payment_plans     1000 non-null   category
 14  housing                 1

In [12]:
previsores = df.iloc[:,0:20].values
previsores

array([[1, 1, 1, ..., 1, 1, 1],
       [0, 0, 0, ..., 0, 0, 0],
       [1, 1, 1, ..., 1, 1, 1],
       ...,
       [1, 1, 1, ..., 1, 1, 1],
       [0, 0, 0, ..., 0, 0, 0],
       [1, 1, 1, ..., 1, 1, 1]], dtype=object)

* Criando um array somente com as classes agora

In [13]:
classe =  df.iloc[:,20].values

* Vamos utilizar o `LabelEncoder()` para transformar as variáveis categóricas em numéricas.

In [14]:
labelencoder1 = LabelEncoder()
previsores[:,0] = labelencoder1.fit_transform(previsores[:,0])

In [15]:
labelencoder2 = LabelEncoder()
previsores[:,2] = labelencoder2.fit_transform(previsores[:,2])

In [16]:
labelencoder3 = LabelEncoder()
previsores[:, 3] = labelencoder3.fit_transform(previsores[:, 3])

In [17]:
labelencoder4 = LabelEncoder()
previsores[:, 5] = labelencoder4.fit_transform(previsores[:, 5])

In [18]:
labelencoder5 = LabelEncoder()
previsores[:, 6] = labelencoder5.fit_transform(previsores[:, 6])

In [19]:
labelencoder6 = LabelEncoder()
previsores[:, 8] = labelencoder6.fit_transform(previsores[:, 8])

In [20]:
labelencoder7 = LabelEncoder()
previsores[:, 9] = labelencoder7.fit_transform(previsores[:, 9])

In [21]:
labelencoder8 = LabelEncoder()
previsores[:, 11] = labelencoder8.fit_transform(previsores[:, 11])

In [22]:
labelencoder9 = LabelEncoder()
previsores[:, 13] = labelencoder9.fit_transform(previsores[:, 13])

In [23]:
labelencoder10 = LabelEncoder()
previsores[:, 14] = labelencoder10.fit_transform(previsores[:, 14])

In [24]:
labelencoder11 = LabelEncoder()
previsores[:, 16] = labelencoder11.fit_transform(previsores[:, 16])

In [25]:
labelencoder12 = LabelEncoder()
previsores[:, 18] = labelencoder12.fit_transform(previsores[:, 18])

In [26]:
labelencoder13 = LabelEncoder()
previsores[:, 19] = labelencoder13.fit_transform(previsores[:, 19])

In [27]:
previsores

array([[1, 1, 1, ..., 1, 1, 1],
       [0, 0, 0, ..., 0, 0, 0],
       [1, 1, 1, ..., 1, 1, 1],
       ...,
       [1, 1, 1, ..., 1, 1, 1],
       [0, 0, 0, ..., 0, 0, 0],
       [1, 1, 1, ..., 1, 1, 1]], dtype=object)

In [28]:
X_treinamento, X_teste, y_treinamento, y_teste = train_test_split(previsores,
                                                                  classe,
                                                                  test_size = 0.3,
                                                                  random_state = 0)

In [29]:
X_treinamento

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [1, 1, 1, ..., 1, 1, 1],
       ...,
       [1, 1, 1, ..., 1, 1, 1],
       [0, 0, 0, ..., 0, 0, 0],
       [1, 1, 1, ..., 1, 1, 1]], dtype=object)

In [30]:
X_teste

array([[1, 1, 1, ..., 1, 1, 1],
       [1, 1, 1, ..., 1, 1, 1],
       [1, 1, 1, ..., 1, 1, 1],
       ...,
       [1, 1, 1, ..., 1, 1, 1],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=object)

In [31]:
y_treinamento

[0, 0, 1, 1, 1, ..., 0, 0, 1, 0, 1]
Length: 700
Categories (2, int64): [0, 1]

In [32]:
y_teste

[1, 1, 1, 1, 1, ..., 1, 0, 1, 0, 0]
Length: 300
Categories (2, int64): [0, 1]

## Criando o Dataset de Treino e teste para o Lightgbm

### Treino do Modelos

In [33]:
fit_params={'categorical_feature': 'auto'}

In [34]:
lightClf = lgb.LGBMClassifier()

In [35]:
lightClf.fit(X_treinamento, y_treinamento, **fit_params)

In [36]:
previsoes = lightClf.predict(X_teste, num_iteration=14)

In [37]:
previsoes

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1,
       1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1,
       1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1,
       1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1,
       1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1,
       1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1,
       0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0,
       1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0,
       0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1,
       0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0,
       0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1,
       1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0])

* Gerando a Matriz de Confusão para medir a accuracy do modelo

In [38]:
confusao = confusion_matrix(y_teste, previsoes)

In [39]:
confusao

array([[ 86,   0],
       [  0, 214]])

In [None]:
sns.set(rc={'figure.figsize':(15,15)})

In [None]:
cm = confusion_matrix(y_teste, previsoes)
sns.heatmap(cm,annot=True);

* Printando a Matriz de Confusão podemos verificar melhor a performance do modelo. 
    * Clientes que eram `bad` e foram classificados como `bad` foram 38.  
    * Clientes que eram `bad` e foram classificados como `good` foram 20.
    * Clientes que eram `good` e foram classificados como `bad` foram 48.
    * Clientes que eram `good` e foram classificados como `good` foram 194.

In [40]:
taxa_acerto = accuracy_score(y_teste, previsoes)
taxa_erro = 1 - taxa_acerto

In [41]:
taxa_acerto * 100

100.0

In [42]:
taxa_erro * 100

0.0

### Classification Report

In [43]:
from sklearn.metrics import classification_report
print(classification_report(y_teste, previsoes))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        86
           1       1.00      1.00      1.00       214

    accuracy                           1.00       300
   macro avg       1.00      1.00      1.00       300
weighted avg       1.00      1.00      1.00       300



### Probability

In [45]:
lightClf.predict_proba(X_teste, num_iteration=14) * 100

array([[ 7.42841458, 92.57158542],
       [ 7.42841458, 92.57158542],
       [ 7.42841458, 92.57158542],
       [ 7.42841458, 92.57158542],
       [ 7.42841458, 92.57158542],
       [ 7.42841458, 92.57158542],
       [ 7.42841458, 92.57158542],
       [ 7.42841458, 92.57158542],
       [ 7.42841458, 92.57158542],
       [83.76836555, 16.23163445],
       [83.76836555, 16.23163445],
       [83.76836555, 16.23163445],
       [ 7.42841458, 92.57158542],
       [ 7.42841458, 92.57158542],
       [83.76836555, 16.23163445],
       [ 7.42841458, 92.57158542],
       [ 7.42841458, 92.57158542],
       [83.76836555, 16.23163445],
       [ 7.42841458, 92.57158542],
       [ 7.42841458, 92.57158542],
       [ 7.42841458, 92.57158542],
       [ 7.42841458, 92.57158542],
       [ 7.42841458, 92.57158542],
       [ 7.42841458, 92.57158542],
       [ 7.42841458, 92.57158542],
       [83.76836555, 16.23163445],
       [ 7.42841458, 92.57158542],
       [ 7.42841458, 92.57158542],
       [ 7.42841458,