# LightGBM

In [1]:
import lightgbm as lgb
import pandas as pd
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

## Importando os dados

In [2]:
Y_treino = pd.read_csv('/home/lucasqueiros/DataScience/Projeto-Classificacao/data/processed/Y_treino.csv', sep=',', encoding='utf-8')
Y_teste = pd.read_csv('/home/lucasqueiros/DataScience/Projeto-Classificacao/data/processed/Y_teste.csv', sep=',', encoding='utf-8')

X_esc_treino = pd.read_csv('/home/lucasqueiros/DataScience/Projeto-Classificacao/data/processed/X_esc_treino.csv', sep=',', encoding='utf-8')
X_esc_teste = pd.read_csv('/home/lucasqueiros/DataScience/Projeto-Classificacao/data/processed/X_esc_teste.csv', sep=',', encoding='utf-8')

In [3]:
Y_teste = Y_teste.iloc[:, 1].values
Y_treino = Y_treino.iloc[:, 1].values

X_esc_teste = X_esc_teste.iloc[:, 1:].values
X_esc_treino = X_esc_treino.iloc[:, 1:].values

### Treinando modelo com dados escalonados

In [4]:
# Dataset para treino
dataset = lgb.Dataset(X_esc_treino,label=Y_treino)

In [5]:
# Parâmetros
parametros = {'num_leaves':250, # número de folhas
              'objective':'binary', # classificação Binária
              'max_depth':2,
              'learning_rate':.05,
              'max_bin':100}

In [6]:
lgbm=lgb.train(parametros,dataset,num_boost_round=200)

[LightGBM] [Info] Number of positive: 149, number of negative: 249
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.039864 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3000
[LightGBM] [Info] Number of data points in the train set: 398, number of used features: 30
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.374372 -> initscore=-0.513507
[LightGBM] [Info] Start training from score -0.513507


In [7]:
# Marcação do tempo de execução
from datetime import datetime
inicio=datetime.now()
lgbm=lgb.train(parametros,dataset)
fim=datetime.now()

tempo = fim - inicio
tempo

[LightGBM] [Info] Number of positive: 149, number of negative: 249
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002500 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3000
[LightGBM] [Info] Number of data points in the train set: 398, number of used features: 30
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.374372 -> initscore=-0.513507
[LightGBM] [Info] Start training from score -0.513507


datetime.timedelta(microseconds=330181)

In [8]:
previsoes_lgbm = lgbm.predict(X_esc_teste)
previsoes_lgbm

array([0.05133969, 0.99538632, 0.96806163, 0.0217888 , 0.0056528 ,
       0.99383555, 0.99537121, 0.83355951, 0.78402288, 0.02211905,
       0.04679442, 0.97419976, 0.03643123, 0.91895469, 0.00913194,
       0.99278396, 0.01230597, 0.00932864, 0.00873632, 0.98847947,
       0.07885895, 0.01100535, 0.99414146, 0.0212633 , 0.02308752,
       0.02674436, 0.01385819, 0.03104458, 0.00978392, 0.99323799,
       0.0143739 , 0.00830348, 0.07509882, 0.01230597, 0.00833478,
       0.01195714, 0.60731527, 0.01557847, 0.99183926, 0.06237738,
       0.00881286, 0.9857767 , 0.01704145, 0.00916102, 0.0730617 ,
       0.03961807, 0.00664712, 0.05780722, 0.04499901, 0.01142438,
       0.99307528, 0.99414146, 0.10506058, 0.06764897, 0.00808739,
       0.01531771, 0.00926869, 0.99577714, 0.94542244, 0.00803507,
       0.01028324, 0.99066065, 0.99577714, 0.05315163, 0.0130031 ,
       0.06550726, 0.99323799, 0.99383555, 0.01064915, 0.02159393,
       0.97978415, 0.99161733, 0.01457199, 0.99186828, 0.03271

In [9]:
previsoes_lgbm.shape

(171,)

In [10]:
# Quando for menor que 5 considera 0 e quando for maior ou igual a 5 considera 1
for i in range(0, 171):
    if previsoes_lgbm[i] >= .5:
       previsoes_lgbm[i] = 1
    else:
       previsoes_lgbm[i] = 0

In [11]:
print("Acurácia: %.2f%%" % (accuracy_score(Y_teste, previsoes_lgbm) * 100.0))

Acurácia: 96.49%


In [12]:
confusion_matrix(Y_teste, previsoes_lgbm)

array([[106,   2],
       [  4,  59]])

### Análise com dados de treino

In [17]:
previsoes_treino = lgbm.predict(X_esc_treino)
previsoes_treino

array([0.01265096, 0.01256404, 0.08754191, 0.00657121, 0.03426177,
       0.68124018, 0.03194046, 0.00871238, 0.01856654, 0.02245127,
       0.0256682 , 0.0127964 , 0.95980012, 0.98995271, 0.94210948,
       0.31519721, 0.02591163, 0.95422768, 0.03942848, 0.99309443,
       0.01541359, 0.0106324 , 0.0167273 , 0.00838512, 0.9258451 ,
       0.15548032, 0.00881958, 0.98258915, 0.0112118 , 0.02552573,
       0.06685553, 0.99239017, 0.00789385, 0.99625253, 0.99276728,
       0.0079825 , 0.01478931, 0.03522682, 0.73018294, 0.00793005,
       0.01258736, 0.01168292, 0.00744474, 0.88930248, 0.01018751,
       0.01452215, 0.00581597, 0.00562912, 0.00906533, 0.98952286,
       0.03656666, 0.99625253, 0.99323799, 0.00713631, 0.02034052,
       0.99239017, 0.02403782, 0.00496252, 0.02518931, 0.01100535,
       0.06809973, 0.05703842, 0.01046641, 0.64806996, 0.98731192,
       0.99625253, 0.00639845, 0.03386349, 0.99596209, 0.02346973,
       0.01328138, 0.94838291, 0.02354768, 0.90600534, 0.00527

In [18]:
previsoes_treino.shape

(398,)

In [19]:
# Quando for menor que 5 considera 0 e quando for maior ou igual a 5 considera 1
for i in range(0, 398):
    if previsoes_treino[i] >= .5:
       previsoes_treino[i] = 1
    else:
       previsoes_treino[i] = 0

In [20]:
previsoes_treino


array([0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 1., 1., 1., 0., 0.,
       1., 0., 1., 0., 0., 0., 0., 1., 0., 0., 1., 0., 0., 0., 1., 0., 1.,
       1., 0., 0., 0., 1., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 1., 0.,
       1., 1., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 1., 1., 1., 0., 0.,
       1., 0., 0., 1., 0., 1., 0., 1., 0., 1., 0., 0., 1., 0., 0., 0., 1.,
       0., 1., 0., 1., 0., 1., 0., 0., 1., 0., 0., 0., 0., 1., 0., 0., 0.,
       1., 0., 0., 1., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0.,
       0., 1., 0., 1., 0., 0., 0., 1., 0., 1., 1., 0., 0., 1., 0., 1., 1.,
       0., 0., 0., 0., 1., 0., 0., 1., 0., 1., 0., 0., 0., 1., 0., 1., 0.,
       0., 1., 1., 0., 0., 1., 0., 1., 1., 0., 1., 1., 0., 0., 1., 1., 1.,
       0., 0., 0., 0., 1., 0., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0.,
       0., 1., 1., 0., 0., 1., 0., 0., 0., 0., 0., 1., 0., 0., 1., 1., 0.,
       1., 0., 1., 0., 0., 0., 0., 0., 0., 1., 0., 0., 1., 0., 0., 0., 0.,
       0., 0., 1., 0., 0.

In [21]:
print("Acurácia: %.2f%%" % (accuracy_score(Y_treino, previsoes_treino) * 100.0))

Acurácia: 99.25%


In [22]:
confusion_matrix(Y_treino, previsoes_treino)

array([[249,   0],
       [  3, 146]])

### Validação Cruzada

In [23]:
# Separando os dados em folds
kfold = KFold(n_splits = 30, shuffle=True, random_state = 5)

In [24]:
# Criando o modelo
modelo = lgb.LGBMClassifier(num_leaves = 250, objective = 'binary',
                            max_depth = 2, learning_rate = .05, max_bin =100)

In [25]:
resultado = cross_val_score(modelo, X_esc_treino, Y_treino, cv = kfold)
resultado

[LightGBM] [Info] Number of positive: 143, number of negative: 241
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000234 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3000
[LightGBM] [Info] Number of data points in the train set: 384, number of used features: 30
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.372396 -> initscore=-0.521952
[LightGBM] [Info] Start training from score -0.521952
[LightGBM] [Info] Number of positive: 142, number of negative: 242
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000135 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3000
[LightGBM] [Info] Number of data points in the train set: 384, number of used features: 30
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.369792 -> initscore=-0.533111
[LightGBM] [Info] Start training from score -0.533111
[LightGBM] [Info] Number

array([0.85714286, 1.        , 0.92857143, 0.92857143, 1.        ,
       0.92857143, 1.        , 1.        , 1.        , 0.92307692,
       1.        , 0.92307692, 0.92307692, 0.92307692, 1.        ,
       1.        , 1.        , 0.92307692, 1.        , 1.        ,
       0.92307692, 0.92307692, 1.        , 1.        , 1.        ,
       1.        , 0.84615385, 0.84615385, 1.        , 1.        ])

In [26]:
# Usamos a média e o desvio padrão
print("Acurácia Média: %.2f%%" % (resultado.mean() * 100.0))

Acurácia Média: 95.99%


##### LightGBM = 96.49% (treino e teste) e 95.99% (validação cruzada) - previsores - lgb.LGBMClassifier(num_leaves = 250, objective = 'binary', max_depth = 2, learning_rate = .05, max_bin =100)