In [5]:
import warnings
warnings.filterwarnings("ignore")

from os import listdir, path
import os
import math

import pandas as pd
import numpy as np
import datetime as dt

from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error

import lightgbm as lgb
from sklearn.externals import joblib
import joblib

from statsmodels.graphics.correlation import plot_corr

from sklearn import linear_model
import statsmodels.api as sm

from statsmodels.graphics.gofplots import qqplot

from scipy.stats import shapiro

from statsmodels.stats.outliers_influence import OLSInfluence as influence

from statsmodels.stats.outliers_influence import variance_inflation_factor as vif

import matplotlib.pyplot as plt
from matplotlib.colors import Normalize
import seaborn as sns

In [15]:
def train_lgbm(train, train_labels):
    """Grid-search LightGBM regressor settings, then fit a final booster.

    A 3-fold ``GridSearchCV`` over an ``lgb.LGBMRegressor`` selects the best
    combination from a small fixed grid. Every selected value is merged into
    the base parameter dict, and the final model is trained on the full
    training data with ``lgb.train``.

    Parameters
    ----------
    train
        Feature matrix; passed to both ``lgb.Dataset`` and ``GridSearchCV.fit``.
    train_labels
        Regression targets aligned with ``train``.

    Returns
    -------
    tuple
        ``(lgbm, lgb)``: the object returned by ``lgb.train`` and the
        ``lightgbm`` module itself.
    """
    train_data = lgb.Dataset(train, label=train_labels)

    # Base parameters for the final lgb.train call; tuned values are merged in below.
    # NOTE(review): 'rmsle' does not look like a documented LightGBM metric
    # name (cf. 'rmse') -- confirm it has the intended effect.
    params = {'boosting_type': 'gbdt',
              'max_depth': -1,
              'objective': 'regression',
              'nthread': 5,
              'num_leaves': 64,
              'learning_rate': 0.07,
              'metric': 'rmsle'
              }

    # Search space explored by the cross-validated grid search.
    gridParams = {'max_depth': [-1, 6],
                  'learning_rate': [0.09, 0.1],
                  'n_estimators': [100, 1000],
                  'num_leaves': [64, 100],
                  'boosting_type': ['gbdt'],
                  'objective': ['regression'],
                  'random_state': [0],
                  'colsample_bytree': [0.63],
                  'subsample': [0.7]
                  }

    # Regressor used as the estimator inside the grid search.
    mdl = lgb.LGBMRegressor(boosting_type=params['boosting_type'],
                            objective=params['objective'],
                            n_jobs=-2,
                            max_depth=params['max_depth']
                            )

    # Create and run the grid search (3-fold CV).
    grid = GridSearchCV(mdl, gridParams, verbose=0, cv=3, n_jobs=-2)
    grid.fit(train, train_labels)

    # Carry over *every* value chosen by the search. Previously max_depth
    # (searched over [-1, 6]) and random_state were left out, so the final
    # model could be trained with a configuration that was never the one
    # selected by cross-validation.
    params.update(grid.best_params_)

    # Train the final model on the full training set with the merged parameters.
    # NOTE(review): verbose_eval is not accepted by newer LightGBM releases --
    # confirm against the installed version.
    lgbm = lgb.train(params,
                     train_data,
                     verbose_eval=0
                     )
    print('Done')
    # NOTE(review): the second element is the lightgbm module, not a model;
    # kept unchanged so existing callers that unpack two values still work.
    return lgbm, lgb

In [16]:
def predict_lgbm(model, test):
    """Run ``model.predict`` on ``test`` and hand back whatever it returns.

    Parameters
    ----------
    model
        Any object exposing a ``predict`` method.
    test
        Input forwarded unchanged to ``model.predict``.
    """
    return model.predict(test)

In [3]:
# List the contents of the DADOS directory. The explicit %ls line magic does
# not depend on automagic being enabled or on `ls` not being shadowed by a
# Python variable.
%ls DADOS

DadosBO_2017_1(ROUBO DE VEÍCULOS).txt
DadosBO_2017_10(ROUBO DE VEÍCULOS).txt
DadosBO_2017_11(ROUBO DE VEÍCULOS).txt
DadosBO_2017_12(ROUBO DE VEÍCULOS).txt
DadosBO_2017_2(ROUBO DE VEÍCULOS).txt
DadosBO_2017_3(ROUBO DE VEÍCULOS).txt
DadosBO_2017_4(ROUBO DE VEÍCULOS).txt
DadosBO_2017_5(ROUBO DE VEÍCULOS).txt
DadosBO_2017_6(ROUBO DE VEÍCULOS).txt
DadosBO_2017_7(ROUBO DE VEÍCULOS).txt
DadosBO_2017_8(ROUBO DE VEÍCULOS).txt
DadosBO_2017_9(ROUBO DE VEÍCULOS).txt
DadosBO_2018_1(ROUBO DE VEÍCULOS).txt
DadosBO_2018_10(ROUBO DE VEÍCULOS).txt
DadosBO_2018_11(ROUBO DE VEÍCULOS).txt
DadosBO_2018_12(ROUBO DE VEÍCULOS).txt
DadosBO_2018_2(ROUBO DE VEÍCULOS).txt
DadosBO_2018_3(ROUBO DE VEÍCULOS).txt
DadosBO_2018_4(ROUBO DE VEÍCULOS).txt
DadosBO_2018_5(ROUBO DE VEÍCULOS).txt
DadosBO_2018_6(ROUBO DE VEÍCULOS).txt
DadosBO_2018_7(ROUBO DE VEÍCULOS).txt
DadosBO_2018_8(ROUBO DE VEÍCULOS).txt
DadosBO_2018_9(ROUBO DE VEÍCULOS).txt
DadosBO_2019_1(ROUBO DE VEÍCULOS).txt
DadosBO_2019_2(ROUB

In [12]:
# Load DADOS/final.csv (semicolon-separated) into the working DataFrame `df`.
df = pd.read_csv(
    'DADOS/final.csv',
    sep=';',
)

In [13]:
# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0,COUNT_NUM_BO,DATAOCORRENCIA,ANOMES,ANO,MES,DIA_DA_SEMANA,FERIADO,ID_PERIDOOCORRENCIA,ID_VITIMAFATAL,ID_FLAGRANTE,...,IDADE,ID_ESTADOCIVIL,ID_PROFISSAO,ID_GRAUINSTRUCAO,ID_CORCUTIS,ID_UF_VEICULO,ID_CIDADE_VEICULO,ID_DESCR_COR_VEICULO,ID_DESCR_MARCA_VEICULO,ANO_MODELO
0,1.0,01/01/2017,201701.0,2017.0,1.0,1.0,1.0,3.0,1.0,1.0,...,,1.0,1.0,1.0,1.0,2.0,76.0,2.0,444.0,2013.0
1,1.0,01/01/2017,201701.0,2017.0,1.0,1.0,1.0,3.0,1.0,1.0,...,,1.0,1.0,1.0,1.0,2.0,16.0,3.0,445.0,2012.0
2,1.0,01/01/2017,201701.0,2017.0,1.0,1.0,1.0,3.0,1.0,1.0,...,,1.0,1.0,1.0,1.0,2.0,2.0,2.0,446.0,2016.0
3,1.0,01/01/2017,201701.0,2017.0,1.0,1.0,1.0,3.0,1.0,1.0,...,,1.0,1.0,1.0,1.0,2.0,14.0,6.0,134.0,2014.0
4,3.0,01/01/2017,201701.0,2017.0,1.0,1.0,1.0,3.0,1.0,1.0,...,,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0


In [14]:
# Print the column dtypes and non-null counts to spot missing data.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 239820 entries, 0 to 239819
Data columns (total 29 columns):
COUNT_NUM_BO              238634 non-null float64
DATAOCORRENCIA            238634 non-null object
ANOMES                    122880 non-null float64
ANO                       238634 non-null float64
MES                       238634 non-null float64
DIA_DA_SEMANA             238634 non-null float64
FERIADO                   238634 non-null float64
ID_PERIDOOCORRENCIA       238634 non-null float64
ID_VITIMAFATAL            238634 non-null float64
ID_FLAGRANTE              238634 non-null float64
ID_LOGRADOURO             238634 non-null float64
ID_BAIRRO                 238634 non-null float64
ID_CIDADE                 238634 non-null object
LATITUDE                  217000 non-null object
LONGITUDE                 217000 non-null object
ID_DESCRICAOLOCAL         238634 non-null float64
ID_NATURALIDADE           238634 non-null float64
ID_NACIONALIDADE          238634 non-null f