# Light Gradient Boosting Machine (LightGBM)

Determinar se um cliente realizou a compra através de uma propaganda, utilizando o Light Gradient Boosting Machine (LightGBM)

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Read the advertising dataset from the CSV file into a DataFrame.
dados = pd.read_csv(filepath_or_buffer='Social_Network_Ads.csv')

In [3]:
dados.head()

Unnamed: 0,User ID,Gender,Age,EstimatedSalary,Purchased
0,15624510,Male,19,19000,0
1,15810944,Male,35,20000,0
2,15668575,Female,26,43000,0
3,15603246,Female,27,57000,0
4,15804002,Male,19,76000,0


Coluna User ID não é necessária para o modelo pois é apenas um identificador do usuário

In [4]:
dados.columns

Index(['User ID', 'Gender', 'Age', 'EstimatedSalary', 'Purchased'], dtype='object')

In [5]:
# Remove the identifier column, which carries no predictive information.
# `errors='ignore'` keeps this cell re-runnable: executing it again after
# 'User ID' has already been removed is a no-op instead of a KeyError.
dados = dados.drop(columns='User ID', errors='ignore')

In [6]:
dados.head()

Unnamed: 0,Gender,Age,EstimatedSalary,Purchased
0,Male,19,19000,0
1,Male,35,20000,0
2,Female,26,43000,0
3,Female,27,57000,0
4,Male,19,76000,0


Convertendo a variável categórica Gender para numérica

In [7]:
from sklearn.preprocessing import LabelEncoder

In [8]:
# Encoder used in the next cell to turn the textual Gender labels into integers.
# NOTE(review): the variable name is a misspelling of "encoder"; it is left
# unchanged because the following cell refers to it by this exact spelling.
enconder = LabelEncoder()

In [9]:
# Swap the textual Gender column for its integer codes
# (as the preview below shows: Female -> 0, Male -> 1).
dados = dados.assign(Gender=enconder.fit_transform(dados['Gender']))

In [10]:
dados.head()

Unnamed: 0,Gender,Age,EstimatedSalary,Purchased
0,1,19,19000,0
1,1,35,20000,0
2,0,26,43000,0
3,0,27,57000,0
4,1,19,76000,0


Padronizando as colunas Age e EstimatedSalary (média 0 e desvio padrão 1)

In [11]:
# Continuous feature columns that get rescaled by the scaler a few cells below.
cols = [
    'Age',
    'EstimatedSalary',
]

In [12]:
from sklearn.preprocessing import StandardScaler

In [13]:
# Scaler instance applied in the next cell to the columns listed in `cols`.
sc = StandardScaler()

In [14]:
# Overwrite the selected columns with their standardised values.
# NOTE(review): the scaler is fitted on the complete dataset here, i.e. before
# the train/test split performed later, so test rows contribute to the fitted
# statistics. Consider fitting on the training portion only.
dados[cols] = sc.fit_transform(X=dados[cols])

In [15]:
dados.head()

Unnamed: 0,Gender,Age,EstimatedSalary,Purchased
0,1,-1.781797,-1.490046,0
1,1,-0.253587,-1.460681,0
2,0,-1.113206,-0.78529,0
3,0,-1.017692,-0.374182,0
4,1,-1.781797,0.183751,0


Determinando as variáveis X e Y

In [16]:
# Feature matrix: every column except the target, as a NumPy array.
X = dados.drop(columns='Purchased').to_numpy()
# Target vector: the Purchased flag, as a NumPy array.
Y = dados['Purchased'].to_numpy()

Separando em amostra de treino e teste

In [17]:
from sklearn.model_selection import train_test_split

In [18]:
# Hold out 25% of the rows for evaluation; the fixed random_state makes the
# split repeatable between runs.
X_treino, X_teste, Y_treino, Y_teste = train_test_split(
    X,
    Y,
    test_size=0.25,
    random_state=0,
)

Aplicando modelo LightGBM

In [19]:
import lightgbm as lgb

Antes de aplicar o modelo, precisamos primeiro converter a amostra de treino em um objeto `Dataset` do LightGBM

In [20]:
# Wrap the training features and labels in LightGBM's Dataset container,
# which is what lgb.train expects as its training set.
treino_lgb = lgb.Dataset(data=X_treino, label=Y_treino)

Precisamos definir os parâmetros de entrada do modelo

In [21]:
# Hyper-parameters handed to lgb.train (same keys, values and key order as a
# step-by-step build of the dict would give).
parametros = {
    'learning_rate': 0.003,
    'boosting_type': 'gbdt',
    'objective': 'binary',          # two-class classification
    'metric': 'binary_logloss',
    'sub_feature': 0.5,             # LightGBM alias of feature_fraction
    'num_leaves': 10,
    'min_data': 50,                 # LightGBM alias of min_data_in_leaf
    'max_depth': 10,
}

Realizando treinamento do modelo

In [22]:
# Number of boosting rounds passed to lgb.train in the next cell.
# NOTE(review): this name shadows Python's built-in iter() for the rest of the
# session; it is kept only because the training cell refers to it.
iter = 1000

In [23]:
# Fit the booster on the training Dataset for `iter` boosting rounds.
modelo = lgb.train(
    params=parametros,
    train_set=treino_lgb,
    num_boost_round=iter,
)

Realizando previsões na amostra de teste

In [24]:
# Score the held-out rows. With the 'binary' objective the booster returns one
# value between 0 and 1 per row rather than a hard 0/1 label.
Y_previsto = modelo.predict(data=X_teste)

In [25]:
Y_previsto

array([0.32426949, 0.10971938, 0.10137615, 0.08956071, 0.0562084 ,
       0.06696874, 0.10467198, 0.73038678, 0.02774269, 0.6778828 ,
       0.05042561, 0.02615439, 0.08971418, 0.17686419, 0.11316823,
       0.64899527, 0.16945864, 0.11316823, 0.90706404, 0.11796851,
       0.0562084 , 0.90706404, 0.11487534, 0.90706404, 0.06696874,
       0.90706404, 0.22606067, 0.08398511, 0.08835034, 0.18988494,
       0.22606067, 0.16945864, 0.62333407, 0.08971418, 0.02615439,
       0.02607343, 0.06458304, 0.0558601 , 0.10467198, 0.82423272,
       0.22606067, 0.09402934, 0.06458304, 0.11796851, 0.75618867,
       0.02615439, 0.16710506, 0.74011532, 0.06696874, 0.71797916,
       0.90706404, 0.11648036, 0.04690237, 0.65281629, 0.74011532,
       0.60939572, 0.09460798, 0.05695435, 0.82423272, 0.06696874,
       0.11822212, 0.7142406 , 0.05695435, 0.73038678, 0.06696874,
       0.71797916, 0.22606067, 0.02615439, 0.11659627, 0.20513741,
       0.64899527, 0.09985447, 0.02615439, 0.60939572, 0.22606

Observamos que os valores obtidos são probabilidades entre 0 e 1, e não as classes esperadas 0 ou 1. Desta forma, precisamos convertê-los para 0 e 1, usando 0,5 como limiar

In [26]:
# Turn the predicted scores into hard class labels using a 0.5 cut-off.
# One vectorised comparison replaces the element-by-element Python loop; the
# result is still a float array of 0.0 / 1.0 values, exactly what the loop
# produced (scores >= 0.5 become 1.0, everything else 0.0).
Y_previsto = np.where(Y_previsto >= 0.5, 1.0, 0.0)

In [27]:
Y_previsto

array([0., 0., 0., 0., 0., 0., 0., 1., 0., 1., 0., 0., 0., 0., 0., 1., 0.,
       0., 1., 0., 0., 1., 0., 1., 0., 1., 0., 0., 0., 0., 0., 0., 1., 0.,
       0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 1., 0., 0., 1., 0., 1., 1.,
       0., 0., 1., 1., 1., 0., 0., 1., 0., 0., 1., 0., 1., 0., 1., 0., 0.,
       0., 0., 1., 0., 0., 1., 0., 0., 0., 0., 1., 1., 1., 1., 0., 0., 1.,
       0., 0., 1., 0., 0., 0., 1., 0., 0., 0., 1., 0., 1., 1., 1.])

In [28]:
from sklearn.metrics import confusion_matrix

In [29]:
# Cross-tabulate actual labels (rows) against predicted labels (columns)
# for the test set.
cm = confusion_matrix(y_true=Y_teste, y_pred=Y_previsto)

In [30]:
cm

array([[64,  4],
       [ 3, 29]])

In [31]:
from sklearn.metrics import accuracy_score

In [32]:
# Fraction of test rows whose predicted label matches the true label.
# The value is computed on the held-out test set, so it is exposed under an
# accurate name; `score_treino` ("training score") is kept as an alias because
# the following cell displays it.
score_teste = accuracy_score(Y_teste, Y_previsto)
score_treino = score_teste

In [33]:
score_treino

0.93