In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from hyperopt import fmin, tpe, hp
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split 

%matplotlib inline

In [2]:
# Load the training table from a CSV file in the working directory.
df = pd.read_csv('database_fires.csv')

In [3]:
# Preview the first rows to check that the columns were parsed as expected.
df.head()

Unnamed: 0,id,estado,estacao,data,precipitacao,temp_max,temp_min,insolacao,evaporacao_piche,temp_comp_med,umidade_rel_med,vel_vento_med,altitude,fires
0,1,AC,50484,01/01/2016,,31.4,,2.3,,,96.0,0.0,170.0,0
1,2,AC,50484,02/01/2016,0.0,32.5,23.3,4.4,,26.44,95.0,0.51444,170.0,0
2,3,AC,50484,03/01/2016,35.2,30.5,24.0,0.6,,25.78,97.0,0.51444,170.0,0
3,4,AC,50484,04/01/2016,60.2,31.7,22.3,2.0,,25.94,96.0,0.0,170.0,0
4,5,AC,50484,05/01/2016,28.4,28.5,23.0,0.1,,25.04,95.0,0.0,170.0,0


In [4]:
# Dtypes and non-null counts per column; counts below the row total mark columns with missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 251767 entries, 0 to 251766
Data columns (total 14 columns):
id                  251767 non-null int64
estado              251767 non-null object
estacao             251767 non-null int64
data                251767 non-null object
precipitacao        250360 non-null float64
temp_max            224873 non-null float64
temp_min            240179 non-null float64
insolacao           211218 non-null float64
evaporacao_piche    178196 non-null float64
temp_comp_med       209929 non-null float64
umidade_rel_med     218944 non-null float64
vel_vento_med       208457 non-null float64
altitude            251767 non-null float64
fires               251767 non-null int64
dtypes: float64(9), int64(3), object(2)
memory usage: 26.9+ MB


In [5]:
# Number of distinct `estacao` values (presumably weather-station codes — confirm against the data dictionary).
df['estacao'].nunique()

239

In [6]:
# Parse the day/month/year strings into proper datetimes.
df['data'] = pd.to_datetime(df['data'], format='%d/%m/%Y')
# Month name as a feature. The vectorised .dt accessor yields the same strings as
# calling Timestamp.month_name() row by row, without a Python-level loop over every row.
df['mes'] = df['data'].dt.month_name()

In [7]:
# Re-inspect the frame: `data` is now a datetime column and `mes` holds the month name.
df.head()

Unnamed: 0,id,estado,estacao,data,precipitacao,temp_max,temp_min,insolacao,evaporacao_piche,temp_comp_med,umidade_rel_med,vel_vento_med,altitude,fires,mes
0,1,AC,50484,2016-01-01,,31.4,,2.3,,,96.0,0.0,170.0,0,January
1,2,AC,50484,2016-01-02,0.0,32.5,23.3,4.4,,26.44,95.0,0.51444,170.0,0,January
2,3,AC,50484,2016-01-03,35.2,30.5,24.0,0.6,,25.78,97.0,0.51444,170.0,0,January
3,4,AC,50484,2016-01-04,60.2,31.7,22.3,2.0,,25.94,96.0,0.0,170.0,0,January
4,5,AC,50484,2016-01-05,28.4,28.5,23.0,0.1,,25.04,95.0,0.0,170.0,0,January


In [8]:
# Class balance of the target: row counts for fires == 0 versus fires == 1.
df['fires'].value_counts()

0    188271
1     63496
Name: fires, dtype: int64

In [9]:
# Cast every object-typed column to pandas' categorical dtype.
object_columns = [name for name in df.columns if df[name].dtype == 'object']
for name in object_columns:
    df[name] = df[name].astype('category')

# Features: everything except the row id, the raw date and the target itself.
X = df.drop(columns=['id', 'data', 'fires'])
# Target: the binary fire indicator.
y = df['fires']

## Otimização

In [11]:
def _log_uniform(label, low, high):
    """Log-uniform prior on [low, high], registered with hyperopt under ``label``."""
    return hp.loguniform(label, np.log(low), np.log(high))


def _unit_step(label, low, high):
    """Uniform prior on [low, high] quantised to steps of 1 (samples are floats, cast with int() on use)."""
    return hp.quniform(label, low, high, 1)


# Hyperopt search space. Keys double as hyperopt labels and are the names looked up
# again when the LightGBM parameter dict is assembled.
space = {
    # Continuous parameters searched on a log scale.
    'Learning_Rate': _log_uniform('Learning_Rate', 0.005, 0.5),
    'Min_Sum_Hessian': _log_uniform('Min_Sum_Hessian', 0.0001, 50),
    'Min_Gain_Split': _log_uniform('Min_Gain_Split', 0.00001, 1),
    'Lambda_L1': _log_uniform('Lambda_L1', 0.001, 1),
    'Lambda_L2': _log_uniform('Lambda_L2', 0.001, 1),
    # Row / column subsampling fractions.
    'Bagging_Fraction': hp.uniform('Bagging_Fraction', 0.1, 1),
    'Feature_Fraction': hp.uniform('Feature_Fraction', 0.1, 1),
    # Integer-valued parameters.
    'Min_Data_Leaf': _unit_step('Min_Data_Leaf', 1, 500),
    'Maximum_Leaves': _unit_step('Maximum_Leaves', 2, 100),
    'Max_Depth': _unit_step('Max_Depth', 1, 100),
    'Bagging_Frequence': _unit_step('Bagging_Frequence', 1, 10),
    'Maximum_Bins': _unit_step('Maximum_Bins', 15, 255),
}

In [14]:
# Kept at module level in case another cell still refers to it; obj() below builds
# its own Dataset per trial instead of sharing this one.
train_set = lgb.Dataset(X, label = y)
def obj(x):
    """Hyperopt objective: cross-validated binary error for one sampled configuration.

    Parameters
    ----------
    x : dict
        One sample drawn from ``space``. Quantised entries arrive as floats and are
        cast to ``int`` before being handed to LightGBM.

    Returns
    -------
    float
        The lowest mean ``binary_error`` over the boosting rounds of a 5-fold
        ``lgb.cv`` run (at most 5000 rounds, early stopping after 50 rounds
        without improvement). Hyperopt minimises this value.
    """
    params = {
        'bagging_freq': int(x['Bagging_Frequence']),
        'bagging_fraction': x['Bagging_Fraction'],
        'boost': 'gbdt',
        'feature_fraction': x['Feature_Fraction'],
        'max_depth': int(x['Max_Depth']),
        'learning_rate': x['Learning_Rate'],
        'metric': 'binary_error',
        'min_data_in_leaf': int(x['Min_Data_Leaf']),
        'num_leaves': int(x['Maximum_Leaves']),
        'objective': 'binary',
        'reg_alpha': x['Lambda_L1'],
        'reg_lambda': x['Lambda_L2'],
        'boost_from_average': False,
        'verbosity': 1,
        'max_bin': int(x['Maximum_Bins']),
        'min_sum_hessian_in_leaf': x['Min_Sum_Hessian'],
        'is_unbalance': True,
        'min_gain_to_split': x['Min_Gain_Split'],
        'num_threads': 2,
    }
    # max_bin takes effect when a Dataset is binned. A Dataset shared across trials
    # is binned once, so later trials would keep the first trial's bins (or, on
    # newer LightGBM releases, raise because the parameter changed). Building a
    # fresh Dataset per trial makes the sampled max_bin actually apply.
    trial_set = lgb.Dataset(X, label=y)
    cv_results = lgb.cv(params, trial_set, num_boost_round=5000, nfold=5, metrics='binary_error', early_stopping_rounds=50, verbose_eval=5000)
    return np.min(cv_results['binary_error-mean'])

In [16]:
# Run 500 TPE trials minimising obj over `space`. Each trial is a full 5-fold CV;
# the recorded run below took a little over 16 hours.
best = fmin(fn=obj, space=space, algo=tpe.suggest, max_evals=500)

100%|██████████| 500/500 [16:12:57<00:00, 204.46s/trial, best loss: 0.19479915971574513]  


In [17]:
# Show the winning configuration. Quantised parameters come back as floats
# (e.g. 4.0), which is why they are cast with int() before use.
best

{'Bagging_Fraction': 0.9795087092849564,
 'Bagging_Frequence': 4.0,
 'Feature_Fraction': 0.8890312497953411,
 'Lambda_L1': 0.0355809446986378,
 'Lambda_L2': 0.03233190140590459,
 'Learning_Rate': 0.06705864182739209,
 'Max_Depth': 59.0,
 'Maximum_Bins': 28.0,
 'Maximum_Leaves': 93.0,
 'Min_Data_Leaf': 1.0,
 'Min_Gain_Split': 0.0017095753820087405,
 'Min_Sum_Hessian': 0.3665136416815327}

## Validação/Predição

In [30]:
# Load the table to be scored; it is preprocessed like `df` in the next cell and has no target column yet.
respostas = pd.read_csv('respostas.csv')

In [32]:
# Apply the same preprocessing as the training frame: parse the date, derive the
# month name, and make the string columns categorical.
respostas['data'] = pd.to_datetime(respostas['data'], format='%d/%m/%Y')
# Vectorised .dt accessor instead of a per-row lambda; cast to category in the same step.
respostas['mes'] = respostas['data'].dt.month_name().astype('category')
respostas['estado'] = respostas['estado'].astype('category')
# Feature matrix for scoring: drop the row id and the raw date, as was done for X.
resp_X = respostas.drop(['id', 'data'], axis=1)

In [33]:
# LightGBM parameters for the final models, filled in from the hyperopt result.
# Quantised search dimensions are stored as floats in `best`, hence the int() casts.
params = {
    # Row subsampling.
    'bagging_freq': int(best['Bagging_Frequence']),
    'bagging_fraction': best['Bagging_Fraction'],
    'boost': 'gbdt',
    # Column subsampling and tree shape.
    'feature_fraction': best['Feature_Fraction'],
    'max_depth': int(best['Max_Depth']),
    'learning_rate': best['Learning_Rate'],
    'metric': 'binary_error',
    'min_data_in_leaf': int(best['Min_Data_Leaf']),
    'num_leaves': int(best['Maximum_Leaves']),
    'objective': 'binary',
    # Regularisation.
    'reg_alpha': best['Lambda_L1'],
    'reg_lambda': best['Lambda_L2'],
    'boost_from_average': False,
    'verbosity': 1,
    'max_bin': int(best['Maximum_Bins']),
    'min_sum_hessian_in_leaf': best['Min_Sum_Hessian'],
    'is_unbalance': True,
    'min_gain_to_split': best['Min_Gain_Split'],
    'num_threads': 2,
}

In [34]:
n_folds = 5
# Shuffled stratified folds with a fixed seed, so the fold assignment — and every
# out-of-fold metric derived from it — is the same on each run of the notebook.
skf = StratifiedKFold(n_folds, shuffle=True, random_state=42)
# Out-of-fold predicted probabilities for the training rows.
oof = np.zeros(len(X))
# Fold-averaged predicted probabilities for the rows to be scored.
preds_resp = np.zeros(len(resp_X))

In [37]:
# Zero the buffers every time this cell runs: preds_resp is accumulated with +=,
# so re-running the loop without a reset would stack another full set of fold
# averages on top of the previous one.
oof = np.zeros(len(X))
preds_resp = np.zeros(len(resp_X))

# The splitter only needs the number of rows and the labels, so X and y are passed
# directly instead of materialising X.values.
for train_idx, val_idx in skf.split(X, y):
    train_data = lgb.Dataset(X.iloc[train_idx], y.iloc[train_idx])
    val_data = lgb.Dataset(X.iloc[val_idx], y.iloc[val_idx])
    # Up to 5000 rounds, stopping once the validation metric has not improved for 50 rounds.
    lgbm = lgb.train(params, train_data, 5000, [train_data, val_data], early_stopping_rounds=50, verbose_eval=5000)
    # NOTE: the held-out fold also drives early stopping, so scores computed from
    # `oof` are somewhat optimistic.
    oof[val_idx] = lgbm.predict(X.iloc[val_idx], num_iteration = lgbm.best_iteration)
    # Each fold's model contributes 1/n_folds of the final prediction.
    preds_resp += lgbm.predict(resp_X, num_iteration=lgbm.best_iteration) / n_folds

Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[860]	training's binary_error: 0.141094	valid_1's binary_error: 0.206037
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[417]	training's binary_error: 0.178205	valid_1's binary_error: 0.209858
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[683]	training's binary_error: 0.155223	valid_1's binary_error: 0.209858
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[536]	training's binary_error: 0.166711	valid_1's binary_error: 0.212142
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[941]	training's binary_error: 0.133466	valid_1's binary_error: 0.207376


In [38]:
# ROC AUC of the out-of-fold probabilities against the true labels (threshold-independent).
print(roc_auc_score(y, oof))

0.8656035663197458


In [39]:
# Hard 0/1 labels from the out-of-fold probabilities: strictly above 0.5 counts as a fire.
preds = (oof > 0.5).astype(int).tolist()

In [40]:
# Per-class precision, recall and F1 for the thresholded out-of-fold predictions.
print(classification_report(y, preds))

              precision    recall  f1-score   support

           0       0.91      0.80      0.85    188271
           1       0.56      0.76      0.65     63496

   micro avg       0.79      0.79      0.79    251767
   macro avg       0.74      0.78      0.75    251767
weighted avg       0.82      0.79      0.80    251767



In [41]:
# Confusion matrix: rows are the true class, columns the predicted class.
print(confusion_matrix(y, preds))

[[150739  37532]
 [ 15101  48395]]


In [42]:
# Overall accuracy of the thresholded out-of-fold predictions.
print(accuracy_score(y, preds))

0.79094559652377


In [43]:
# Threshold the fold-averaged probabilities at 0.5 (strictly greater counts as a fire).
preds_respB = (preds_resp > 0.5).astype(int).tolist()
respostas['fires'] = preds_respB

# Write only the row id and the predicted label, without the pandas index.
submission = respostas[['id', 'fires']]
submission.to_csv('lightgbm3.csv', index=False)