# CatBoost implementation

#### Murilo Menezes Mendonça

In [138]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno

import tqdm

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from imblearn.over_sampling import ADASYN
from imblearn.under_sampling import AllKNN
from imblearn.over_sampling import SMOTENC
from catboost import CatBoostClassifier
from xgboost import XGBClassifier


In [135]:
# Load the training table; the file's `id` column becomes the row index.
df = pd.read_csv('../data/database_fires.csv', index_col='id')

In [159]:
def _fill_per_group(data, keys, fill):
    """Apply ``fill`` to every ``keys`` group of ``data`` and stack the results in group order."""
    filled = [fill(group) for _, group in tqdm.tqdm(data.groupby(keys))]
    # pd.concat rejects an empty list, so a frame with no groups is returned as-is.
    return pd.concat(filled) if filled else data


def _ffill_then_bfill(group):
    """Order one group by date, then fill gaps forward and any leading gaps backward."""
    return group.sort_values(by='data').ffill().bfill()


def _fill_with_group_median(group):
    """Fill the gaps left in one group with that group's per-column median."""
    # numeric_only=True keeps the datetime and text columns out of the median;
    # without it recent pandas versions raise instead of skipping them.
    return group.fillna(group.median(numeric_only=True))


def handle_nans(data):
    """Flag and impute missing values in the weather table.

    Steps, in order:
      1. Parse the ``data`` column as a ``dd/mm/YYYY`` date.
      2. Append one boolean ``<column>_nan`` indicator per existing column,
         recording where values were missing before any imputation.
      3. Within each ``estacao``, sort by date and forward-fill, then back-fill.
      4. Add ``mes`` (month number taken from ``data``).
      5. Fill what is still missing with the median of the same
         (``estado``, ``mes``) group.
      6. Fill what is still missing with the median of the same ``mes``.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns ``data``, ``estacao`` and ``estado``.
        The frame passed in is left untouched.

    Returns
    -------
    pandas.DataFrame
        A new frame with the indicator columns and ``mes`` added. Rows come
        back ordered by the last grouping (``mes``), not in input order.
        Values can remain missing when a whole month has no observation for
        a column. Rows whose grouping key is missing are dropped, because
        ``groupby`` skips NaN keys by default.
    """
    # assign() builds a new frame, so the caller's object is never modified.
    data = data.assign(data=pd.to_datetime(data['data'], format='%d/%m/%Y'))

    # Indicators are taken before imputation so the model can still see
    # which values were originally absent.
    missing_flags = data.isna().add_suffix('_nan')
    data = pd.concat([data, missing_flags], axis=1)

    data = _fill_per_group(data, 'estacao', _ffill_then_bfill)

    data['mes'] = data['data'].dt.month
    data = _fill_per_group(data, ['estado', 'mes'], _fill_with_group_median)
    data = _fill_per_group(data, 'mes', _fill_with_group_median)
    return data

In [139]:
# Replace the raw frame with the version returned by handle_nans
# (defined in the cell above).
df = handle_nans(df)

100%|██████████| 239/239 [00:00<00:00, 392.90it/s]
100%|██████████| 312/312 [00:04<00:00, 64.96it/s]
100%|██████████| 12/12 [00:01<00:00,  7.86it/s]


In [160]:
# Year and month are stored as text so they can serve as categorical features.
df['ano'] = df['data'].dt.year.astype(str)
df['mes'] = df['mes'].astype(str)

In [169]:
# Temperature range per row: maximum minus minimum.
df['temp_delta'] = df['temp_max'].sub(df['temp_min'])

In [184]:
# Mean precipitation of each station, repeated on every row of that station.
# A single grouped transform replaces one full boolean-mask scan per station.
# NOTE(review): the mean is taken over the whole frame, before the
# train/test split further down — confirm that is intended.
df['prec_med'] = df.groupby('estacao')['precipitacao'].transform('mean')

In [191]:
# Mean wind speed of each station, repeated on every row of that station.
# A single grouped transform replaces one full boolean-mask scan per station.
df['vel_vento_med_all'] = df.groupby('estacao')['vel_vento_med'].transform('mean')

In [193]:
# Mean relative humidity of each station, repeated on every row of that station.
# A single grouped transform replaces one full boolean-mask scan per station.
df['umidade_rel_med_all'] = df.groupby('estacao')['umidade_rel_med'].transform('mean')

In [200]:
# Station codes are converted to text before the feature matrix is built.
df['estacao'] = df['estacao'].astype(str)

# Target vector, and the feature matrix without the date, the target
# and the target's missing-value indicator.
y = df['fires']
X = df.drop(columns=['data', 'fires', 'fires_nan'])

In [201]:
# Hold out 20% of the rows for evaluation, with a fixed random_state.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Positions, within X's columns, of the features passed to CatBoost as categorical.
# NOTE(review): presumably estado, estacao, mes and ano — confirm against X.columns.
cat_features = [0, 1, 23, 24]

train_data, train_labels = X_train, y_train
eval_data = X_test

In [202]:
# Train a 20-iteration, depth-10 CatBoost classifier and predict the held-out rows.
model = CatBoostClassifier(depth=10, learning_rate=0.5, iterations=20)
model.fit(train_data, train_labels, cat_features=cat_features)
preds_class = model.predict(eval_data)

0:	learn: 0.4763649	total: 206ms	remaining: 3.92s
1:	learn: 0.4323271	total: 583ms	remaining: 5.25s
2:	learn: 0.4158709	total: 780ms	remaining: 4.42s
3:	learn: 0.4063971	total: 985ms	remaining: 3.94s
4:	learn: 0.4015067	total: 1.16s	remaining: 3.47s
5:	learn: 0.3970202	total: 1.35s	remaining: 3.15s
6:	learn: 0.3934349	total: 1.54s	remaining: 2.86s
7:	learn: 0.3906310	total: 1.74s	remaining: 2.6s
8:	learn: 0.3882335	total: 1.91s	remaining: 2.33s
9:	learn: 0.3858644	total: 2.1s	remaining: 2.1s
10:	learn: 0.3829002	total: 2.27s	remaining: 1.85s
11:	learn: 0.3805982	total: 2.47s	remaining: 1.64s
12:	learn: 0.3793827	total: 2.63s	remaining: 1.41s
13:	learn: 0.3773018	total: 2.83s	remaining: 1.21s
14:	learn: 0.3755128	total: 2.99s	remaining: 997ms
15:	learn: 0.3743401	total: 3.18s	remaining: 795ms
16:	learn: 0.3732715	total: 3.35s	remaining: 592ms
17:	learn: 0.3720602	total: 3.55s	remaining: 394ms
18:	learn: 0.3706045	total: 3.73s	remaining: 196ms
19:	learn: 0.3696043	total: 3.92s	remaining:

In [203]:
# Share of held-out rows classified correctly; scikit-learn metrics take
# the true labels first and the predictions second.
accuracy_score(y_test, preds_class)

0.8214044564483457

In [198]:
# F1 on the held-out rows; scikit-learn metrics take the true labels first
# and the predictions second.
f1_score(y_test, preds_class)

0.5972627327798968

# Testing

In [95]:
# Load the rows to be predicted; the file's `id` column becomes the row index.
df_test = pd.read_csv('../data/respostas.csv', index_col='id')
df_test['data'] = pd.to_datetime(df_test['data'], format='%d/%m/%Y')
df_test['mes'] = df_test['data'].dt.month

# Names of the weather measurement columns. The fill below is frame-wide,
# as it was before, so it also covers columns that are not in this list.
lista = ['precipitacao',
         'temp_max',
         'temp_min',
         'insolacao',
         'evaporacao_piche',
         'temp_comp_med',
         'umidade_rel_med',
         'vel_vento_med']

# Order the rows by state, then fill gaps from the previous row and any
# leading gaps from the next one. One pass is enough: repeating the same
# frame-wide fill once per listed column changed nothing after the first pass.
df_test = df_test.sort_values(by='estado').ffill().bfill()
df_test['estacao'] = df_test['estacao'].astype(str)

# NOTE(review): the frame passed to model.fit also carries the `_nan`
# indicator columns, `ano`, `temp_delta` and the per-station means, and
# stores `mes` as text. None of that is built here, so this frame does not
# match the training features — confirm which model these cells are meant
# to be used with.
eval_data = df_test.drop(['data'], axis=1)

In [98]:
# Predict with the current `model`; this rebinds `preds_class`.
preds_class = model.predict(eval_data)

In [99]:
# Store the predicted class of each row in a new `fires` column.
df_test['fires'] = preds_class 

In [100]:
# Show the first rows of the prepared frame, including the new `fires` column.
df_test.head()

Unnamed: 0_level_0,estado,estacao,data,precipitacao,temp_max,temp_min,insolacao,evaporacao_piche,temp_comp_med,umidade_rel_med,vel_vento_med,altitude,mes,fires
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
251768,AC,50484,2016-01-27,5.8,34.7,23.7,8.8,2.5,27.84,92.0,1.02888,170.0,1,0
252067,AC,22864,2019-08-13,0.0,35.4,21.6,8.8,2.5,27.48,73.0,0.34296,190.0,8,1
252066,AC,22864,2019-08-12,0.0,34.4,22.1,8.8,2.5,27.26,78.0,0.34296,190.0,8,1
252065,AC,22864,2019-08-11,0.0,34.2,20.9,8.8,2.5,27.3,77.0,0.34296,190.0,8,1
252064,AC,22864,2019-08-10,0.0,35.0,10.9,8.8,2.5,25.38,87.25,0.34296,190.0,8,1


In [101]:
# Write the predictions as two columns: the former index and `fires`.
submission = df_test['fires'].reset_index()
submission.to_csv('../data/submission_6.csv', index=False)

## To-do

- [x] Incluir data com meses do ano

- [x] Balancear variável resposta

- [ ] Tentar incluir estação

- [ ] Tratar NaNs de uma forma melhor

- [ ] Diminuir granularidade de estados para regiões

- [ ] Diminuir granularidade de meses para estações