In [137]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from hyperopt import fmin, tpe, hp
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split 

%matplotlib inline

In [245]:
# Load the training data; the relative path is resolved against the notebook's working directory.
df = pd.read_csv('database_fires.csv')

In [246]:
# Preview the first rows to check that the file was parsed as expected.
df.head()

Unnamed: 0,id,estado,estacao,data,precipitacao,temp_max,temp_min,insolacao,evaporacao_piche,temp_comp_med,umidade_rel_med,vel_vento_med,altitude,fires
0,1,AC,50484,01/01/2016,,31.4,,2.3,,,96.0,0.0,170.0,0
1,2,AC,50484,02/01/2016,0.0,32.5,23.3,4.4,,26.44,95.0,0.51444,170.0,0
2,3,AC,50484,03/01/2016,35.2,30.5,24.0,0.6,,25.78,97.0,0.51444,170.0,0
3,4,AC,50484,04/01/2016,60.2,31.7,22.3,2.0,,25.94,96.0,0.0,170.0,0
4,5,AC,50484,05/01/2016,28.4,28.5,23.0,0.1,,25.04,95.0,0.0,170.0,0


In [247]:
# Summarise column dtypes and non-null counts to spot columns with missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 251767 entries, 0 to 251766
Data columns (total 14 columns):
id                  251767 non-null int64
estado              251767 non-null object
estacao             251767 non-null int64
data                251767 non-null object
precipitacao        250360 non-null float64
temp_max            224873 non-null float64
temp_min            240179 non-null float64
insolacao           211218 non-null float64
evaporacao_piche    178196 non-null float64
temp_comp_med       209929 non-null float64
umidade_rel_med     218944 non-null float64
vel_vento_med       208457 non-null float64
altitude            251767 non-null float64
fires               251767 non-null int64
dtypes: float64(9), int64(3), object(2)
memory usage: 26.9+ MB


In [248]:
# Count the distinct values in the 'estacao' column.
df['estacao'].nunique()

239

In [249]:
# Parse the day-first date strings into datetimes.
df['data'] = pd.to_datetime(df['data'], format='%d/%m/%Y')
# Derive the month name as a feature. The vectorised .dt accessor replaces a
# per-row Python lambda and yields the same strings (e.g. 'January').
df['mes'] = df['data'].dt.month_name()

In [250]:
# Confirm that 'data' is now a datetime and that the new 'mes' column is present.
df.head()

Unnamed: 0,id,estado,estacao,data,precipitacao,temp_max,temp_min,insolacao,evaporacao_piche,temp_comp_med,umidade_rel_med,vel_vento_med,altitude,fires,mes
0,1,AC,50484,2016-01-01,,31.4,,2.3,,,96.0,0.0,170.0,0,January
1,2,AC,50484,2016-01-02,0.0,32.5,23.3,4.4,,26.44,95.0,0.51444,170.0,0,January
2,3,AC,50484,2016-01-03,35.2,30.5,24.0,0.6,,25.78,97.0,0.51444,170.0,0,January
3,4,AC,50484,2016-01-04,60.2,31.7,22.3,2.0,,25.94,96.0,0.0,170.0,0,January
4,5,AC,50484,2016-01-05,28.4,28.5,23.0,0.1,,25.04,95.0,0.0,170.0,0,January


In [251]:
# Check the class balance of the target column.
df['fires'].value_counts()

0    188271
1     63496
Name: fires, dtype: int64

In [252]:
# Cast every object-typed column to pandas 'category' so LightGBM can treat it
# as a categorical feature.
for col in df.select_dtypes(include='object').columns:
    df[col] = df[col].astype('category')

# Features: everything except the row id, the raw date and the target.
X = df.drop(columns=['id', 'data', 'fires'])
# Target: the binary 'fires' column.
y = df['fires']

In [253]:
# Hold out 20% of the rows as a test set, stratified on the target so both
# splits keep the same class ratio. A fixed random_state makes the split (and
# every metric computed from it) reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, stratify=y, random_state=42
)

## Otimização

In [162]:
# Hyperopt search space. The keys are the labels read back inside the objective
# and reported in the `best` dictionary returned by fmin.
space = dict(
    # Log-uniform priors: learning rate and the two regularisation strengths.
    Learning_Rate=hp.loguniform('Learning_Rate', np.log(0.005), np.log(0.5)),
    Lambda_L1=hp.loguniform('Lambda_L1', np.log(0.001), np.log(1)),
    Lambda_L2=hp.loguniform('Lambda_L2', np.log(0.001), np.log(1)),
    # Uniform priors: row and column subsampling fractions.
    Bagging_Fraction=hp.uniform('Bagging_Fraction', 0.1, 1),
    Feature_Fraction=hp.uniform('Feature_Fraction', 0.1, 1),
    # Quantised (step 1) priors; sampled as floats and cast to int by the objective.
    Min_Data_Leaf=hp.quniform('Min_Data_Leaf', 1, 200, 1),
    Maximum_Leaves=hp.quniform('Maximum_Leaves', 2, 50, 1),
    Bagging_Frequence=hp.quniform('Bagging_Frequence', 1, 10, 1),
    Maximum_Bins=hp.quniform('Maximum_Bins', 15, 255, 1),
)

In [165]:
# Kept so that any cell still referring to `train_set` keeps working; the
# objective below deliberately builds its own Dataset for every trial.
train_set = lgb.Dataset(X_train, label = y_train)
def obj(x):
    """Hyperopt objective: cross-validated binary error for one sampled point.

    Parameters
    ----------
    x : dict
        One sample from the search space, keyed by the labels 'Learning_Rate',
        'Lambda_L1', 'Lambda_L2', 'Bagging_Fraction', 'Feature_Fraction',
        'Min_Data_Leaf', 'Maximum_Leaves', 'Bagging_Frequence' and
        'Maximum_Bins'. Quantised values arrive as floats and are cast to int.

    Returns
    -------
    float
        The lowest mean 5-fold `binary_error` reached over the boosting rounds
        (lower is better, which is what fmin minimises).
    """
    params = {
        'bagging_freq': int(x['Bagging_Frequence']),
        'bagging_fraction': x['Bagging_Fraction'],
        'boost_from_average': 'false',
        'boost': 'gbdt',
        'feature_fraction': x['Feature_Fraction'],
        'learning_rate': x['Learning_Rate'],
        'metric': 'binary_error',
        # 'min_data' is a LightGBM alias of 'min_data_in_leaf'; the former
        # hard-coded 'min_data': 1 contradicted this tuned value and was dropped.
        'min_data_in_leaf': int(x['Min_Data_Leaf']),
        'num_leaves': int(x['Maximum_Leaves']),
        'num_threads': 2,
        'tree_learner': 'serial',
        'objective': 'binary',
        'reg_alpha': x['Lambda_L1'],
        'reg_lambda': x['Lambda_L2'],
        'verbosity': 0,
        'max_bin': int(x['Maximum_Bins']),
        'min_data_in_bin': 1,
    }
    # max_bin / min_data_in_bin are Dataset-level parameters: a Dataset is binned
    # once, when it is first constructed. Re-using a single Dataset across trials
    # would therefore pin the binning to the first trial's values (or raise on
    # newer LightGBM versions), so each trial gets a fresh, unconstructed Dataset.
    trial_set = lgb.Dataset(X_train, label=y_train)
    cv_results = lgb.cv(
        params,
        trial_set,
        num_boost_round=1000000,
        nfold=5,
        metrics='binary_error',
        early_stopping_rounds=1000,
        verbose_eval=5000,
    )
    return np.min(cv_results['binary_error-mean'])

In [167]:
# Minimise the objective with the TPE algorithm over 100 evaluations; `best`
# holds the sampled value for each label of the search space.
best = fmin(
    fn=obj,
    space=space,
    algo=tpe.suggest,
    max_evals=100,
)

[5000]	cv_agg's binary_error: 0.192252 + 0.000697149                                 
[10000]	cv_agg's binary_error: 0.188707 + 0.000744635                                
[15000]	cv_agg's binary_error: 0.186974 + 0.000804888                                
[20000]	cv_agg's binary_error: 0.185787 + 0.00103239                                 
[25000]	cv_agg's binary_error: 0.18474 + 0.000789481                                 
[30000]	cv_agg's binary_error: 0.184214 + 0.000786872                                
[5000]	cv_agg's binary_error: 0.185539 + 0.000758064                                  
[10000]	cv_agg's binary_error: 0.180857 + 0.000962212                             
[15000]	cv_agg's binary_error: 0.178693 + 0.000809076                             
[20000]	cv_agg's binary_error: 0.177426 + 0.000576425                             
[5000]	cv_agg's binary_error: 0.175773 + 0.000743477                                    
[5000]	cv_agg's binary_error: 0.177973 + 0.000726948       

In [168]:
# Show the best hyper-parameters found; quantised values are still floats here.
best

{'Bagging_Fraction': 0.8360017521127768,
 'Bagging_Frequence': 6.0,
 'Feature_Fraction': 0.7697152656464976,
 'Lambda_L1': 0.10039162642626634,
 'Lambda_L2': 0.10270041356582635,
 'Learning_Rate': 0.0051953511991254565,
 'Maximum_Bins': 158.0,
 'Maximum_Leaves': 50.0,
 'Min_Data_Leaf': 51.0}

## Validação

In [227]:
# Final LightGBM parameters, built from the best point found by hyperopt.
# Quantised hyper-parameters come back as floats and are cast to int.
params = {
    'bagging_freq': int(best['Bagging_Frequence']),
    'bagging_fraction': best['Bagging_Fraction'],
    'boost_from_average': 'false',
    'boost': 'gbdt',
    'feature_fraction': best['Feature_Fraction'],
    'learning_rate': best['Learning_Rate'],
    'metric': 'binary_error',
    # 'min_data' is a LightGBM alias of 'min_data_in_leaf'; the former
    # hard-coded 'min_data': 1 contradicted this tuned value and was dropped.
    'min_data_in_leaf': int(best['Min_Data_Leaf']),
    'num_leaves': int(best['Maximum_Leaves']),
    'num_threads': 2,
    'tree_learner': 'serial',
    'objective': 'binary',
    'reg_alpha': best['Lambda_L1'],
    'reg_lambda': best['Lambda_L2'],
    'verbosity': 1,
    'max_bin': int(best['Maximum_Bins']),
    'min_data_in_bin': 1,
}

In [254]:
# 5-fold stratified splitter. Shuffling without a seed would produce different
# folds (and different out-of-fold metrics) on every run, so the seed is fixed.
skf = StratifiedKFold(5, shuffle=True, random_state=42)
# Out-of-fold predicted probabilities, one slot per training row.
oof = np.zeros(len(X_train))

In [255]:
# Out-of-fold validation: fit on four folds, early-stop on the fifth, and store
# that fold's predicted probabilities in `oof`.
# The frame and series are passed to split() directly: the splitter only needs
# the row count and the labels, whereas `X_train.values` on a mixed-dtype frame
# materialises a large object array for nothing.
for train_idx, val_idx in skf.split(X_train, y_train):
    train_data = lgb.Dataset(X_train.iloc[train_idx], y_train.iloc[train_idx])
    val_data = lgb.Dataset(X_train.iloc[val_idx], y_train.iloc[val_idx])
    lgbm = lgb.train(
        params,
        train_data,
        num_boost_round=10000,
        valid_sets=[train_data, val_data],
        early_stopping_rounds=1000,
        verbose_eval=5000,
    )
    # Predict with the best iteration found by early stopping, not the last one.
    oof[val_idx] = lgbm.predict(X_train.iloc[val_idx], num_iteration=lgbm.best_iteration)

Training until validation scores don't improve for 1000 rounds
Early stopping, best iteration is:
[3875]	training's binary_error: 0.156837	valid_1's binary_error: 0.174585
Training until validation scores don't improve for 1000 rounds
[5000]	training's binary_error: 0.153156	valid_1's binary_error: 0.173448
Early stopping, best iteration is:
[6041]	training's binary_error: 0.149047	valid_1's binary_error: 0.172703
Training until validation scores don't improve for 1000 rounds
[5000]	training's binary_error: 0.15277	valid_1's binary_error: 0.1702
Early stopping, best iteration is:
[6245]	training's binary_error: 0.148389	valid_1's binary_error: 0.169704
Training until validation scores don't improve for 1000 rounds
[5000]	training's binary_error: 0.153713	valid_1's binary_error: 0.170299
Early stopping, best iteration is:
[5273]	training's binary_error: 0.152578	valid_1's binary_error: 0.169977
Training until validation scores don't improve for 1000 rounds
Early stopping, best iteration

In [256]:
# ROC AUC of the out-of-fold probabilities against the training labels.
print(roc_auc_score(y_train, oof))

0.8677779567178748


In [257]:
# Turn out-of-fold probabilities into 0/1 labels: strictly above 0.5 means 1.
preds = (oof > 0.5).astype(int).tolist()

In [258]:
# Per-class precision, recall and F1 of the thresholded out-of-fold predictions.
print(classification_report(y_train, preds))

              precision    recall  f1-score   support

           0       0.85      0.93      0.89    150616
           1       0.71      0.53      0.61     50797

   micro avg       0.83      0.83      0.83    201413
   macro avg       0.78      0.73      0.75    201413
weighted avg       0.82      0.83      0.82    201413



In [259]:
# Confusion matrix of the out-of-fold predictions (rows: true class, columns: predicted class).
print(confusion_matrix(y_train, preds))

[[139795  10821]
 [ 23733  27064]]


In [270]:
# Overall accuracy of the thresholded out-of-fold predictions.
print(accuracy_score(y_train, preds))

0.828442056868226


## Teste

In [261]:
# Refit on the whole training split for a fixed 10000 boosting rounds; no
# validation set is passed here, so there is no early stopping.
train_data = lgb.Dataset(X_train, label=y_train)
lgbm = lgb.train(params, train_data, num_boost_round=10000)

In [266]:
# Predicted probabilities of the positive class for the held-out test split.
preds_prob_teste = lgbm.predict(X_test)

In [267]:
# Turn test probabilities into 0/1 labels: strictly above 0.5 means 1.
preds2 = (preds_prob_teste > 0.5).astype(int).tolist()

In [268]:
# Per-class precision, recall and F1 on the held-out test split.
print(classification_report(y_test, preds2))

              precision    recall  f1-score   support

           0       0.86      0.93      0.89     37655
           1       0.71      0.54      0.61     12699

   micro avg       0.83      0.83      0.83     50354
   macro avg       0.79      0.73      0.75     50354
weighted avg       0.82      0.83      0.82     50354



In [269]:
# Overall accuracy on the held-out test split.
print(accuracy_score(y_test, preds2))

0.8293283552448664


## Treino dataset inteiro

In [187]:
# Final model: refit on every labelled row (train + test) for a fixed 10000
# boosting rounds before scoring the unlabelled submission file.
train_data = lgb.Dataset(X, label=y)
lgbm = lgb.train(params, train_data, num_boost_round=10000)

## Respostas

In [229]:
# Load the unlabelled rows to be scored; the relative path is resolved against the working directory.
respostas = pd.read_csv('respostas.csv')

In [230]:
# Preview the scoring file; it has the training columns minus the 'fires' target.
respostas.head()

Unnamed: 0,id,estado,estacao,data,precipitacao,temp_max,temp_min,insolacao,evaporacao_piche,temp_comp_med,umidade_rel_med,vel_vento_med,altitude
0,251768,AC,50484,27/01/2016,5.8,34.7,23.7,8.8,,27.84,92.0,1.02888,170.0
1,251769,AC,50484,28/01/2016,0.0,30.0,23.7,0.2,,26.6,93.0,0.0,170.0
2,251770,AC,50484,29/01/2016,0.0,35.5,24.1,7.1,,27.78,97.0,0.0,170.0
3,251771,AC,50484,05/08/2016,,,,,,,,0.0,170.0
4,251772,AC,88948,28/05/2019,0.0,34.7,21.6,8.3,2.0,26.56,86.75,0.666667,160.0


In [231]:
# Apply the same date handling as for the training data: parse the day-first
# strings into datetimes.
respostas['data'] = pd.to_datetime(respostas['data'], format='%d/%m/%Y')
# Derive the month name. The vectorised .dt accessor replaces a per-row Python
# lambda and yields the same strings (e.g. 'January').
respostas['mes'] = respostas['data'].dt.month_name()

In [232]:
# Encode the categorical columns with the *training* category sets instead of
# categories inferred from the scoring file alone, so both frames share one
# encoding. Values never seen in training become NaN (missing).
respostas['estado'] = pd.Categorical(
    respostas['estado'], categories=df['estado'].cat.categories
)
respostas['mes'] = pd.Categorical(
    respostas['mes'], categories=df['mes'].cat.categories
)
# Same feature set as at fit time: drop the row id and the raw date.
features = respostas.drop(['id', 'data'], axis=1)

In [233]:
# Check that the feature columns and their order match the training matrix X.
features.head()

Unnamed: 0,estado,estacao,precipitacao,temp_max,temp_min,insolacao,evaporacao_piche,temp_comp_med,umidade_rel_med,vel_vento_med,altitude,mes
0,AC,50484,5.8,34.7,23.7,8.8,,27.84,92.0,1.02888,170.0,January
1,AC,50484,0.0,30.0,23.7,0.2,,26.6,93.0,0.0,170.0,January
2,AC,50484,0.0,35.5,24.1,7.1,,27.78,97.0,0.0,170.0,January
3,AC,50484,,,,,,,,0.0,170.0,August
4,AC,88948,0.0,34.7,21.6,8.3,2.0,26.56,86.75,0.666667,160.0,May


In [271]:
# Predicted probabilities of the positive class for the submission rows,
# using whichever model is currently bound to `lgbm`.
preds_sub = lgbm.predict(features)

In [242]:
# Turn submission probabilities into 0/1 labels: strictly above 0.5 means 1.
preds_subB = (preds_sub > 0.5).astype(int).tolist()

In [243]:
# Attach the predicted labels and write only the id/prediction pair, without
# the DataFrame index, as the submission file.
respostas['fires'] = preds_subB
respostas.to_csv('lightgbm2.csv', columns=['id', 'fires'], index=False)