In [2]:
import re
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib as mpl ##importing chart libraries
from sklearn import metrics 
from sklearn.model_selection import train_test_split
import catboost as cb #importing machine learning libraries
import os
from IPython import embed
from datetime import date
from sklearn.preprocessing import MinMaxScaler
import numpy as np
import statsmodels
import warnings
warnings.filterwarnings('ignore')

In [2]:
base_path = os.getcwd() #declaring predefinitions and constants
selected_columns = ['precipitacao (mm)-1','temperatura (°C)-1','umidade ar (%)-1','t-1','t-2','t-3','densidade_demografica']
target = 'dengue_diagnosis'
df_input = pd.read_csv(base_path+'/dengue_RIO_POA.csv', sep=';')
months = list(range(1, 13))

In [16]:
def prepare_df (ano_teste = 2020, mes_inicio_teste = 1, meses_previsao = 1):
    df_input['data'] = df_input[['ano', 'mes' ]].apply(lambda x: date(x[0], x[1], 1), axis = 1)
    list_data_teste = list()
    for m in list(range(mes_inicio_teste, mes_inicio_teste+3)):
        if m < 13:
            list_data_teste.append(date(ano_teste, m, 1))
        else:
            list_data_teste.append(date(ano_teste+1, m-12, 1))
            
    print(list_data_teste)
    X_train = df_input[~df_input['data'].isin(list_data_teste)]
    Y_train = df_input[~df_input['data'].isin(list_data_teste)]

    X_train = X_train[selected_columns]
    Y_train = Y_train[target]
    print("Len X_TRAIN", len(X_train))
    print("X_TRAIN: ", X_train)
    print("Y_TRAIN: ", Y_train)

    return X_train, Y_train

In [1]:
def base_prepare(df_input, ano_teste = 2021, mes_inicio_teste = 1, meses_previsao = 1):
    df_input['data'] = df_input[['ano', 'mes']].apply(lambda x: date(x[0], x[1], 1), axis = 1)

    list_data_teste = list()
    for m in list(range(mes_inicio_teste, mes_inicio_teste + meses_previsao)):
        if m < 13:
            list_data_teste.append(date(ano_teste, m, 1))
        else:
            list_data_teste.append(date(ano_teste + 1, mes_teste - 12, 1))

    df_teste = df_input[df_input['data'].isin(list_data_teste)]

    df_treino = df_input[~df_input['data'].isin(list_data_teste)]
    print('df_treino: ', df_treino)
    print('df_teste: ', df_teste)
    print("LEN: ", len(df_treino), len(df_teste), len(df_input))

    df_treino.drop(columns = ['data'], inplace = True)
    df_teste.drop(columns = ['data'], inplace = True)

    return df_treino, df_teste

In [5]:
def main_step(ano = 2020, mes_inicio_teste = 1):
    X_train, Y_train = prepare_df(ano_teste = ano, mes_inicio_teste = mes_inicio_teste)

    df_treino, df_teste = base_prepare(df_input, ano_teste = ano, mes_inicio_teste = mes_inicio_teste)
    x_train_sample = X_train.copy()
    y_train_sample = Y_train.copy()
    
    return df_treino, df_teste, x_train_sample, y_train_sample

In [22]:
def main_step_2(df_test, x_train, y_train):
    regressor, r2 = run_catboost(x_train, y_train, grid_search = False, standart = False)
    df_previsao_X, columns = prepare_base_test(df_test)
    df_previsao_X['date'] = df_previsao_X[['ano', 'mes']].apply(lambda x: date(x[0], x[1], 1), axis = 1)
    list_results = list()
    df_result_consolidate = pd.DataFrame(columns = list(df_previsao_X.columns) + [target + "_previsto"])
    df_result_consolidate.drop(columns = ['date'], inplace = True)

    for d in list(df_previsao_X['date'].unique()):
        df_base = df_previsao_X[df_previsao_X['date'] == d]
        df = df_previsao_X[df_previsao_X['date'] == d]
        df.drop(columns = ['date'], inplace = True)

        df_prev = df.drop(columns = ['nome_distrito', 'chave' ,'ano', 'mes', 'cod_distrito'])
        df_prev = df_prev[columns]
        list_results.append(regressor.predict(df_prev))
        
        if target == 'dengue_diagnosis':
            list_results[-1] = [0 if a < 0 else round(a) for a in list_results[-1]]
            
        df_base[target + "_previsto"] = list_results[-1]
        df_result_consolidate = df_result_consolidate.append(df_base)
        
        df_result_consolidate.sort_values(by = ["cod_distrito", "ano", "mes"], inplacce = True)     
        df_test = pd.merge(df_test, df_result_consolidate[["chave", target + '_previsto']], on = "chave", how = "left")

        print(df_test)
        return df_test

        

In [7]:
def build_model_prediction():
    first = True
    for i in range(2017, 2023+1):
        for initial_test_month in months:
            df_train, df_test, x_train, y_train = main_step(i, mes_inicio_teste = initial_test_month)

            df_result = main_step_2(df_test, x_train, y_train)

            variables = ['nome_distrito', 'ano', 'mes_inicial', 'mes', target, target + '_previsto']
            variables.extend(selected_columns)
            df_consolidate = pd.DataFrame(columns = variables)

            df_result['mes_inicial'] = [1] * df_result.shape[0]

            df_consolidate = df_consolidate.append(df_result[variables])
            if first:
                df_consolidate.to_csv(base_path + f"/data/result_prediction_simple.csv", mode = 'w', header = True)
            else:
                df_consolidate.to_csv(base_path + f"/data/result_prediction_simple.csv", mode = 'a', header = False)

            first = False

In [8]:
def prepare_base_test(df_previsao):
    df_previsao_X = df_previsao[selected_columns]
    mes_list = ["mes_{}".format(i) for i in list(df_previsao["mes"].unique())]

    columns, index_time_cols = get_indices_columns_time(df_previsao_X.columns)
    df_previsao_X = df_previsao_X[columns]

    #need these variables to run the prediction
    df_previsao_X['ano'] = [0] * df_previsao_X.shape[0]
    df_previsao_X['mes'] = [0] * df_previsao_X.shape[0]
    df_previsao_X['nome_distrito'] = [0] * df_previsao_X.shape[0]
    df_previsao_X['chave'] = [0] * df_previsao_X.shape[0]
    df_previsao_X['cod_distrtito'] = [0] * df_previsao_X.shape[0]

    df_previsao_X.loc[:,['ano']] = list(df_previsao['ano'])
    df_previsao_X.loc[:,['mes']] = list(df_previsao['mes'])
    df_previsao_X.loc[:,['nome_distrito']] = list(df_previsao['nome_distrito'])
    df_previsao_X.loc[:,['chave']] = list(df_previsao['chave'])
    df_previsao_X.loc[:,['cod_distrito']] = list(df_previsao['cod_distrito'])  

    return df_previsao_X, columns

    

In [9]:
def get_indices_columns_time(columns):
    columns_time = list()
    columns_others = list()

    for c in columns:
        if re.match("t-.", c):
            columns_time.append(c)
        else:
            columns_others.append(c)
            
    if len(columns_time) > 0:

        index_time_cols = (len(columns_others), len(columns_others) + len(columns_time) - 1)
        columns_others = columns_others = columns_others + columns_time
        return columns_others, index_time_cols
    
    else:
        return columns_others, list()

        

In [18]:
def run_catboost(x_train, y_train, grid_search = False, standart = False):

    x_train.to_csv(base_path + f"/x_train.csv")
    y_train.to_csv(base_path + f"/y_train.csv")

    if standart:
        regressor = cb.CatBoostRegressor()
    else:
        regressor = cb.CatBoostRegressor(learning_rate = 0.1, bootstrap_type = 'Bernoulli', grow_policy = 'Lossguide', boosting_type = 'Plain',)

    regressor.fit(x_train, y_train, plot = True)

    return regressor, 0
    

In [23]:
build_model_prediction()



  df_input['data'] = df_input[['ano', 'mes' ]].apply(lambda x: date(x[0], x[1], 1), axis = 1)


[datetime.date(2017, 1, 1), datetime.date(2017, 2, 1), datetime.date(2017, 3, 1)]
Len X_TRAIN 17789
X_TRAIN:         precipitacao (mm)-1  temperatura (°C)-1  umidade ar (%)-1   t-1   t-2  \
0                     25.0              242.00             70.00   5.0   2.0   
1                     26.0              216.00             72.00   9.0   5.0   
2                     28.0               26.00             61.00  47.0   9.0   
3                     26.0              161.00             69.00  69.0  47.0   
4                     25.0               24.00             74.00  96.0  69.0   
...                    ...                 ...               ...   ...   ...   
18315                 89.8               25.47             75.70  13.0   4.0   
18316                 35.2               20.60             73.22  29.0  13.0   
18317                106.8               18.39             80.68  25.0  29.0   
18318                247.6               16.29             80.11   6.0  25.0   
18319     

  df_input['data'] = df_input[['ano', 'mes']].apply(lambda x: date(x[0], x[1], 1), axis = 1)


df_treino:        nome_distrito  dengue_diagnosis   ano  mes                      chave  \
0        AboliÃ§Ã£o               9.0  2012    1   827186535985332280320121   
1        AboliÃ§Ã£o              47.0  2012    2   827186535985332280320122   
2        AboliÃ§Ã£o              69.0  2012    3   827186535985332280320123   
3        AboliÃ§Ã£o              96.0  2012    4   827186535985332280320124   
4        AboliÃ§Ã£o              89.0  2012    5   827186535985332280320125   
...             ...               ...   ...  ...                        ...   
18315           SUL              29.0  2023    4  -486640010877500879920234   
18316           SUL              25.0  2023    5  -486640010877500879920235   
18317           SUL               6.0  2023    6  -486640010877500879920236   
18318           SUL               4.0  2023    7  -486640010877500879920237   
18319           SUL               0.0  2023    8  -486640010877500879920238   

       precipitacao (mm)  temperatura (

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_treino.drop(columns = ['data'], inplace = True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_teste.drop(columns = ['data'], inplace = True)


MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

0:	learn: 75.5138223	total: 2.16ms	remaining: 2.15s
1:	learn: 71.7579669	total: 4.41ms	remaining: 2.2s
2:	learn: 68.1269697	total: 6.87ms	remaining: 2.28s
3:	learn: 64.6809241	total: 8.89ms	remaining: 2.21s
4:	learn: 61.9829290	total: 11.4ms	remaining: 2.27s
5:	learn: 59.0468091	total: 13.7ms	remaining: 2.28s
6:	learn: 56.3424093	total: 16ms	remaining: 2.28s
7:	learn: 54.2720537	total: 18.4ms	remaining: 2.28s
8:	learn: 52.3575368	total: 20.4ms	remaining: 2.25s
9:	learn: 50.1722657	total: 22.6ms	remaining: 2.23s
10:	learn: 47.7716972	total: 24.6ms	remaining: 2.21s
11:	learn: 45.6446458	total: 26.8ms	remaining: 2.21s
12:	learn: 44.2689623	total: 28.9ms	remaining: 2.19s
13:	learn: 43.0856786	total: 30.9ms	remaining: 2.18s
14:	learn: 41.8114503	total: 32.9ms	remaining: 2.16s
15:	learn: 40.6902310	total: 35ms	remaining: 2.15s
16:	learn: 39.5831371	total: 37ms	remaining: 2.14s
17:	learn: 38.5150848	total: 39.1ms	remaining: 2.13s
18:	learn: 37.5850116	total: 41.1ms	remaining: 2.12s
19:	learn:

  df_previsao_X['date'] = df_previsao_X[['ano', 'mes']].apply(lambda x: date(x[0], x[1], 1), axis = 1)


AttributeError: 'DataFrame' object has no attribute 'append'