In [None]:
import os

import numpy as np
import pandas as pd

import plotly.express as px
import plotly.figure_factory as ff
import plotly.graph_objects as go
from plotly.subplots import make_subplots

from sklearn.cluster import KMeans
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import PolynomialFeatures

from boruta import BorutaPy
from tqdm import tqdm

In [None]:
# standard imports
import pandas as pd
from sklearn.model_selection import RepeatedKFold

# some helpful imports from sklearndf
from sklearndf.pipeline import RegressorPipelineDF
from sklearndf.regression import RandomForestRegressorDF

# relevant FACET imports
from facet.data import Sample
from facet.selection import LearnerRanker, LearnerGrid

In [None]:
# Input data locations, relative to the notebook's working directory.
SINAN_PATH      = './../data/raw_data/SINAN_prep_05.csv'
MUNICIPIOS_PATH = './../data/municipios_prep_06.csv'
BOLSA_PATH = './../data/consolidada_bolsafamilia.csv'
INEP_PATH = './../data/consolidada_inep.csv'
ATLAS_PATH = './../data/atlas_desenvolvimento_humano_por_municipio.csv'
OCUPACOES_PATH  = './../data/cbo_ocupacoes.csv'
PNAD_PATH  = './../data/PNAD_consolidado.csv'

# Prefer a token supplied through the MAPBOX_TOKEN environment variable so the
# credential does not have to live in the notebook. The literal is kept only as
# a fallback so existing runs keep working.
# NOTE(review): this token is committed to the file; rotate it and drop the fallback.
MAPBOX_TOKEN    = os.environ.get(
    'MAPBOX_TOKEN',
    'pk.eyJ1IjoibHVjYXNuc2VxIiwiYSI6ImNrb241dHZ0cTBpd2MycW5yMGp2enFtMmkifQ.N6NJGlWhG-iYrIJMQ1MVVw',
)

px.set_mapbox_access_token(MAPBOX_TOKEN)

In [None]:
# Load the municipality table from MUNICIPIOS_PATH; every later cell reads from it.
municipios_df = pd.read_csv(MUNICIPIOS_PATH)

In [None]:
# Preview the first rows to check that the load worked.
municipios_df.head()

In [None]:
# (rows, columns) of the loaded table.
municipios_df.shape

In [None]:
def get_encoder(values):
    '''
    Label-encode a collection of values.

    Parameters
    ----------
    values : array-like
        Values to encode. They are flattened to one dimension first, so an
        ndarray, a list or a pandas Series are all accepted.

    Returns
    -------
    encoded : np.ndarray
        Integer codes produced by the fitted LabelEncoder.
    encoder : LabelEncoder
        The fitted encoder, which can be used to map codes back to labels.
    '''
    encoder = LabelEncoder()
    # np.asarray keeps ndarray inputs unchanged and avoids relying on the input
    # object having its own .ravel() method.
    encoded = encoder.fit_transform(np.asarray(values).ravel())

    return encoded, encoder

In [None]:
# Distribution of the 'pop_2017' column across all municipalities, with a box plot on the margin.
fig = px.histogram(municipios_df, x='pop_2017', marginal="box")
fig.show()

In [None]:
# Keep only rows whose 'pop_2017' lies in [10000, 500000] (both bounds inclusive),
# then plot the distribution of 'denun_relat_2017' for that subset.
df = municipios_df[municipios_df['pop_2017'].between(10000, 500000)]
fig = px.histogram(df, x='denun_relat_2017', marginal="box")
fig.show()

### Pipeline de preparação dos dados de treino

In [None]:
def remove_munic_nan_indexes(dataframe):
    '''
    Drop every row that has a NaN in any column whose name contains 'munic'.

    Parameters
    ----------
    dataframe : pd.DataFrame
        Input table; it is not modified.

    Returns
    -------
    pd.DataFrame
        A new frame holding only the rows where all 'munic' columns are
        non-null. When no column name contains 'munic' there is nothing to
        filter on and a copy of the input is returned.
    '''
    munic_cols = [col for col in dataframe.columns if 'munic' in col]

    if not munic_cols:
        # The original loop never assigned its result in this case and
        # failed with UnboundLocalError.
        return dataframe.copy()

    # One combined mask replaces the chain of per-column filters.
    keep_rows = dataframe[munic_cols].notna().all(axis=1)

    return dataframe[keep_rows]

In [None]:
def visualize_train_result(X_df, Y_df, regressor, original_df, years):
    '''
    Build a prediction-vs-real scatter plot for a fitted regressor.

    Parameters
    ----------
    X_df : pd.DataFrame
        Feature rows. Each index label must be a string of the form
        '<municipality id>_<year>'; both parts are parsed as integers.
    Y_df : pd.Series or pd.DataFrame
        Real target values, aligned row by row with X_df.
    regressor : object
        Fitted estimator exposing predict(X) and score(X, y).
    original_df : pd.DataFrame
        Municipality table with an 'id' column plus 'nome', 'uf', 'regiao'
        and one 'pop_<year>' column for every year present in X_df's index.
    years : list
        Not used by this function; kept so existing calls keep working.

    Returns
    -------
    plotly figure
        Scatter of prediction (x) against real (y), coloured by the signed
        difference, with an identity line added as a second trace.
    '''
    X_values = X_df.values
    Y_values = Y_df.values.reshape(-1, 1)

    prediction = regressor.predict(X_values).reshape(-1,)
    real       = Y_values.reshape(-1,)
    diff       = prediction - real

    # Column order matches the order in which the columns used to be added.
    predictions_df = pd.DataFrame({
        'prediction':   prediction,
        'real':         real,
        'abs_diff':     np.abs(diff),
        'abs_diff_10k': np.abs(diff) * 10000,
        'diff_10k':     diff * 10000,
        'diff':         diff,
        'sqr_diff':     diff ** 2,
    })

    variables           = ['nome', 'uf', 'regiao', 'pop', 'year']
    original_df_indexed = original_df.set_index('id')

    # Split every '<id>_<year>' label into its two integer parts.
    mun_ids, row_years = [], []
    for index in X_df.index:
        index_parts = index.split('_')
        mun_ids.append(int(index_parts[0]))
        row_years.append(int(index_parts[1]))

    # One batched lookup instead of building a full row Series per prediction.
    mun_rows = original_df_indexed.loc[mun_ids]

    for variable in variables:
        if variable == 'pop':
            # Population comes from the column matching each row's own year.
            predictions_df[variable] = [
                original_df_indexed.at[mun_id, f'pop_{year}']
                for mun_id, year in zip(mun_ids, row_years)
            ]
        elif variable == 'year':
            predictions_df[variable] = row_years
        else:
            # to_numpy() avoids index alignment against predictions_df.
            predictions_df[variable] = mun_rows[variable].to_numpy()

    score = regressor.score(X_values, Y_values)
    # RMSE from the differences computed above.
    rmse  = np.sqrt(np.mean(diff ** 2))

    name  = str(regressor).split('(')[0]

    title = f'Regressor: {name} | Score: {round(score, 3)} | RMSE: {round(rmse, 6)}'

    # Marker size grows with the year, rescaled by MinMaxScaler.
    # NOTE(review): with the scaler's default range the earliest year maps to
    # size 0, so those markers may not be visible. Confirm this is intended.
    scaler = MinMaxScaler()
    size   = scaler.fit_transform(predictions_df['year'].values.reshape(-1, 1)).reshape(-1)

    fig = px.scatter(predictions_df, x='prediction', y='real', color='diff', size=size, size_max=7,
                     hover_data=variables + ['abs_diff', 'diff', 'sqr_diff'], title=title)

    max_val = np.max(predictions_df['prediction'].values)
    min_val = np.min(predictions_df['prediction'].values)

    # Identity line (prediction == real) spanning the predicted range.
    # go.Scatter in 'lines' mode replaces the deprecated go.Line.
    reference = np.linspace(min_val*0.9, max_val*1.1)
    fig.add_trace(go.Scatter(x=reference, y=reference, mode='lines'))

    return fig

In [None]:
def get_prediction_df(X_df, Y_df, regressor, original_df, years):
    '''
    Run a fitted regressor and return its predictions with error columns and
    municipality metadata.

    Parameters
    ----------
    X_df : pd.DataFrame
        Feature rows. Each index label must be a string of the form
        '<municipality id>_<year>'; both parts are parsed as integers.
    Y_df : pd.Series or pd.DataFrame
        Real target values, aligned row by row with X_df.
    regressor : object
        Fitted estimator exposing predict(X) and score(X, y).
    original_df : pd.DataFrame
        Municipality table with an 'id' column plus 'nome', 'uf', 'regiao',
        'latitude', 'longitude' and one 'pop_<year>' column for every year
        present in X_df's index.
    years : list
        Not used by this function; kept so existing calls keep working.

    Returns
    -------
    predictions_df : pd.DataFrame
        One row per row of X_df (fresh 0..n-1 index) with the columns
        prediction, real, abs_diff, abs_diff_10k, diff_10k, diff, sqr_diff,
        nome, uf, regiao, pop, year, latitude, longitude.
    score : float
        Value returned by regressor.score on the same data.
    rmse : float
        Root mean squared difference between prediction and real.
    '''
    X_values = X_df.values
    Y_values = Y_df.values.reshape(-1, 1)

    prediction = np.asarray(regressor.predict(X_values)).reshape(-1,)
    real       = Y_values.reshape(-1,)
    diff       = prediction - real

    # Column order matches the order in which the columns used to be added.
    predictions_df = pd.DataFrame({
        'prediction':   prediction,
        'real':         real,
        'abs_diff':     np.abs(diff),
        'abs_diff_10k': np.abs(diff) * 10000,
        'diff_10k':     diff * 10000,
        'diff':         diff,
        'sqr_diff':     diff ** 2,
    })

    variables           = ['nome', 'uf', 'regiao', 'pop', 'year', 'latitude', 'longitude']
    original_df_indexed = original_df.set_index('id')

    # Split every '<id>_<year>' label into its two integer parts.
    mun_ids, row_years = [], []
    for index in X_df.index:
        index_parts = index.split('_')
        mun_ids.append(int(index_parts[0]))
        row_years.append(int(index_parts[1]))

    # One batched lookup instead of building a full row Series per prediction.
    mun_rows = original_df_indexed.loc[mun_ids]

    for variable in variables:
        if variable == 'pop':
            # Population comes from the column matching each row's own year.
            predictions_df[variable] = [
                original_df_indexed.at[mun_id, f'pop_{year}']
                for mun_id, year in zip(mun_ids, row_years)
            ]
        elif variable == 'year':
            predictions_df[variable] = row_years
        else:
            # to_numpy() avoids index alignment against predictions_df.
            predictions_df[variable] = mun_rows[variable].to_numpy()

    score = regressor.score(X_values, Y_values)
    # RMSE from the differences computed above.
    rmse  = np.sqrt(np.mean(diff ** 2))

    return predictions_df, score, rmse

In [None]:
def train_test_selected_data_pipeline(dataframe, years, train_size = 0.7, filter_pop = True, selected_prefix_cols = None,
                             ignore_cols = None, random_state = None):
    '''
    Stack the municipality table into one row per (municipality, year) and
    split it into train and test sets.

    Returns X_df_train, Y_df_train, X_df_test and Y_df_test.

    Parameters
    ----------
    dataframe : pd.DataFrame
        One row per municipality. Must contain the columns 'id', 'sinan_id',
        'nome', 'uf_nome' and 'uf_id'. Any column whose name contains a year
        from 2010 to 2019 is treated as year-dependent.
    years : iterable of int
        Years to stack. Each year contributes one block of rows whose index
        labels are '<id>_<year>'.
    train_size : float
        Fraction of the stacked rows sampled into the train set.
    filter_pop : bool
        When True, a municipality is kept for a given year only if its
        'pop_<year>' value lies in [10000, 500000].
    selected_prefix_cols : list of str or None
        When given, only columns whose name CONTAINS one of these strings are
        kept. None (or an empty list) keeps every column.
    ignore_cols : list of str or None
        Columns dropped before any selection. None means no extra drops.
    random_state : int, np.random.RandomState or None
        Forwarded to DataFrame.sample for the train/test split. The default
        None keeps the previous unseeded behaviour.

    Returns
    -------
    X_df_train, Y_df_train, X_df_test, Y_df_test
        Features exclude 'denun_relat'; targets are the 'denun_relat' column.
        Rows keep the order of the stacked table.
    '''
    ignore_cols = [] if ignore_cols is None else list(ignore_cols)

    values_df = dataframe.set_index('id').drop(columns=['sinan_id', 'nome', 'uf_nome', 'uf_id'])
    values_df = values_df.drop(columns=ignore_cols)

    # No selection given => use all columns.
    if selected_prefix_cols:

        selected_cols = []
        already_selected = set()
        for prefix in selected_prefix_cols:
            for col in values_df.columns:
                # NOTE(review): this is a substring match, not a true prefix
                # match. Confirm that is intended.
                # A column matched by several patterns is kept only once;
                # duplicate columns would break the per-column steps below.
                if prefix in col and col not in already_selected:
                    already_selected.add(col)
                    selected_cols.append(col)

        # Copy so the label-encoding assignment below never writes into a view.
        values_df = values_df[selected_cols].copy()

    # Dropping rows with NaN in the MUNIC columns is currently disabled:
#     values_df = remove_munic_nan_indexes(values_df)

    # Categorical columns are detected by looking at the first value only.
    # NOTE(review): a string column whose first value is NaN would be missed.
    feature_cols = [col for col in values_df.columns
                    if isinstance(values_df[col].values[0], str)]

    for column in feature_cols:
        encoded, _ = get_encoder(values_df[column].values)
        values_df[column] = encoded

    # Columns that vary by year (name contains a year from 2010 to 2019).
    year_tokens  = [str(year) for year in range(2010, 2020)]
    year_columns = [col for col in values_df.columns
                    if any(token in col for token in year_tokens)]
    year_columns_set = set(year_columns)

    estatic_columns = [col for col in values_df.columns if col not in year_columns_set]

    # Strip the last '_'-separated token (the year suffix) from those names.
    renamed_year_columns_map = {col: '_'.join(col.split('_')[:-1]) for col in year_columns}

    min_population = 10000
    max_population = 500000

    year_frames = []

    for year in years:

        # Restrict to the population range for this year.
        if filter_pop:
            population = values_df[f'pop_{year}']
            temp_df = values_df[(population >= min_population) & (population <= max_population)]
        else:
            temp_df = values_df

        # Static columns first, then this year's columns: same column order as
        # before, built in one selection instead of column by column.
        cols_for_year = estatic_columns + [col for col in year_columns if str(year) in col]

        year_df = temp_df[cols_for_year].rename(renamed_year_columns_map, axis=1)

        year_df = year_df.rename(lambda x : f'{x}_{year}', axis=0)

        year_df = year_df.fillna(0)

        year_frames.append(year_df)

    # Single concat: avoids repeated copying and the empty-frame starting point.
    # The index name is cleared to match the table this function used to build.
    df = pd.concat(year_frames).rename_axis(None)

    X_df = df.drop('denun_relat', axis=1)
    Y_df = df['denun_relat']

    # Split into train/test; the boolean mask keeps the original row order.
    train_indexes = X_df.sample(int(len(X_df) * train_size), random_state=random_state).index
    is_train      = X_df.index.isin(train_indexes)

    X_df_train = X_df[is_train]
    Y_df_train = Y_df[is_train]
    X_df_test  = X_df[~is_train]
    Y_df_test  = Y_df[~is_train]

    return X_df_train, Y_df_train, X_df_test, Y_df_test

In [None]:
%%time

# Column-name patterns passed to train_test_selected_data_pipeline as
# `selected_prefix_cols`. The pipeline keeps every column whose name contains
# one of these strings, so year-suffixed variants of each entry are picked up.
# 'denun_relat' must stay in the list: the pipeline uses it as the target column.
cols = ['latitude', 'pnad_espvida', 'pnad_t_freq6a14', 'uf', 'pop',
       'renda_media_sum', 'percentual_agressor_namorado/a',
       'percentual_agressor_cônjuge', 'percentual_agressor_ex-cônjuge',
       'percentual_agressor_ex-namorado/a', 'percentual_agressor_amigo/a',
       'percentual_agressor_próprio/a', 'percentual_agressor_pai',
       'percentual_agressor_mãe', 'percentual_agressor_padrasto',
       'percentual_agressor_filho/a',
       'percentual_agressor_policial/agente da lei',
       'percentual_violence_sexual', 'percentual_violence_psicológica',
       'percentual_escolaridade_ef ii completo',
       'percentual_escolaridade_es completo',
       'percentual_cbo_grupo_técnicos de nivel médio',
       'percentual_cbo_grupo_trabalhadores agropecuários, florestais e da pesca',
       'percentual_sit_conjug_separado/a',
       'percentual_sit_conjug_viúvo/a',
       'percentual_viol_motiv_conflito geracional',
       'percentual_cs_sexo_f', 'percentual_age_faixa_25_39',
       'percentual_age_faixa_40_59', 'denun_relat']


### Random Forest Regressor

#### Pruning

In [None]:
# Build the full stacked table (train_size = 1 puts every row in the train
# split, no population filter) and re-attach the target as a column for FACET.
years = [2013, 2014, 2015, 2016, 2017]

X_df_train, Y_df_train, _, _ = train_test_selected_data_pipeline(
    municipios_df,
    years,
    train_size=1,
    selected_prefix_cols=cols,
    filter_pop=False,
)

# assign() returns a new frame, leaving X_df_train untouched.
df = X_df_train.assign(denun_relat=Y_df_train.values)

df.shape

In [None]:
# Wrap the stacked table in a FACET sample; 'denun_relat' is the target column.
df_sample = Sample(observations=df, target_name="denun_relat")

# Create a (trivial) pipeline holding only a random forest regressor.
rnd_forest_reg = RegressorPipelineDF(
    regressor=RandomForestRegressorDF(n_estimators=200, random_state=42)
)

# Grid of candidate models that "compete" against each other:
# 2 values of min_samples_leaf x 4 values of max_depth = 8 candidates.
rnd_forest_grid = [
    LearnerGrid(
        pipeline=rnd_forest_reg,
        learner_parameters={
            "min_samples_leaf": [5, 8],
            "max_depth": [12, 14, 16, 18],
        }
    ),
]

# Repeated k-fold CV iterator: 5 splits x 10 repeats = 50 fits per candidate.
rkf_cv = RepeatedKFold(n_splits=5, n_repeats=10, random_state=42)

# Rank the candidate models by performance (default is mean CV score - 2*SD).
# NOTE(review): n_jobs=-3 presumably follows the joblib convention of leaving
# two cores free. Confirm against the FACET documentation.
ranker = LearnerRanker(
    grids=rnd_forest_grid, cv=rkf_cv, n_jobs=-3
).fit(sample=df_sample)

# Display the summary report of the ranking.
ranker.summary_report()

### Train

In [None]:
%%time

years = [2013, 2014, 2015, 2016, 2017]

# 60/40 train/test split over the stacked (municipality, year) rows.
X_df_train, Y_df_train, X_df_test, Y_df_test = train_test_selected_data_pipeline(municipios_df, 
                                                                                 years, 
                                                                                 train_size = 0.6, 
                                                                                 selected_prefix_cols = cols, 
                                                                                 filter_pop = False)

X_values_train, Y_value_train = X_df_train.values, Y_df_train.values.reshape(-1,)
X_values_test, Y_values_test  = X_df_test.values, Y_df_test.values.reshape(-1,)

# min_samples_leaf / max_depth are values from the grid searched in the tuning cell.
# NOTE(review): the grid search used n_estimators=200, while this model uses the
# library default. Confirm that is intended.
# random_state is fixed so refitting on the same split gives the same forest.
rf_regressor = RandomForestRegressor(min_samples_leaf=8, max_depth=18, random_state=42)

rf_regressor = rf_regressor.fit(X_values_train, Y_value_train)
# Score on plain arrays, the same input type the model was fitted on.
# Scoring on the DataFrames mixed input types between fit and score.
score     = rf_regressor.score(X_values_test, Y_values_test)

print(f'Train Size: {round(100*len(X_df_train)/(len(municipios_df)*len(years)), 2)}% | {len(X_df_train)}/{len(municipios_df)*len(years)}')
print('Score:', round(score, 4))

In [None]:
# Prediction-vs-real scatter for the held-out test split.
fig = visualize_train_result(X_df_test, Y_df_test, rf_regressor, municipios_df, years)
fig.show()

#### Avaliando Capitais

In [None]:
years = [2013, 2014, 2015, 2016, 2017]

# Names of the state capitals (plus Brasília) used to subset the municipality table.
capitals = ['Rio Branco','Maceió','Macapá','Manaus','Salvador','Fortaleza','Brasília','Vitória',
            'Goiânia','São Luís','Cuiabá','Campo Grande','Belo Horizonte','Belém','João Pessoa',
            'Curitiba','Recife','Teresina','Rio de Janeiro','Natal','Porto Alegre','Porto Velho',
            'Boa Vista','Florianópolis','São Paulo','Aracaju','Palmas']

# NOTE(review): matching on 'nome' alone selects every municipality with one of
# these names. If names repeat across states, non-capitals are included too.
# Verify, and consider matching on (nome, uf) pairs.
mun_data = municipios_df[municipios_df['nome'].isin(capitals)]

# train_size = 1 puts every selected row in the first two outputs; the test
# outputs are discarded.
X_df, Y_df, _, _ = train_test_selected_data_pipeline(mun_data, 
                                                     years, 
                                                     train_size = 1, 
                                                     selected_prefix_cols = cols, 
                                                     filter_pop = False)

In [None]:
# Prediction-vs-real scatter for the rows selected by capital name.
# Metadata is looked up in the full municipios_df.
fig = visualize_train_result(X_df, Y_df, rf_regressor, municipios_df, years)
fig.show()

#### Avaliando base toda

In [None]:
# Rebuild the stacked table for every municipality (no population filter).
# train_size = 1 puts all rows in the first two outputs.
# This cell relies on `years` defined in an earlier cell.
# NOTE: these rows include the ones rf_regressor was trained on.
X_df, Y_df, _, _ = train_test_selected_data_pipeline(municipios_df, 
                                                     years, 
                                                     train_size = 1, 
                                                     selected_prefix_cols = cols, 
                                                     filter_pop = False)

In [None]:
# Prediction-vs-real scatter over the whole stacked table.
fig = visualize_train_result(X_df, Y_df, rf_regressor, municipios_df, years)
fig.show()

#### Mapa de outliers

In [None]:
years = [2013,2014,2015,2016,2017]

# Not used by the visible cells; kept in case another cell refers to it.
frame = []

# Collect one prediction table per year and concatenate once at the end,
# rather than growing a DataFrame inside the loop.
yearly_predictions = []

for year in tqdm(years):
    
    # One year at a time, population-filtered, with every row in the first two outputs.
    X_df, Y_df, _, _ = train_test_selected_data_pipeline(municipios_df, 
                                                         [year], 
                                                         train_size = 1, 
                                                         selected_prefix_cols = cols, 
                                                         filter_pop = True)
    predictions_df, score, rmse = get_prediction_df(X_df, Y_df, rf_regressor, municipios_df, [year])
    
    # Hover label: "<nome>, diff = <rounded diff>, <year>".
    predictions_df['text'] = predictions_df['nome'] + ', diff = ' + round(predictions_df['diff'], 5).astype(str) + ', ' + predictions_df['year'].astype(str)

    yearly_predictions.append(predictions_df)

# Each yearly table keeps its own 0..n-1 index, as before.
df = pd.concat(yearly_predictions) if yearly_predictions else pd.DataFrame()

In [None]:
# Load previously saved predictions, replacing the `df` built by the loop above.
# NOTE(review): no visible cell writes './../data/predictions.csv', so a fresh
# Restart & Run All depends on that file already existing. Verify where it comes from.
df = pd.read_csv('./../data/predictions.csv')

In [None]:
# Keep only rows with population strictly above 10000.
# NOTE(review): the training pipeline's population filter uses >= 10000
# (inclusive), while this one is exclusive. Confirm which bound is intended.
df = df[df['pop'] > 10000]

In [None]:
# Number of animation frames (one per year); not used by the visible cells.
n_frames = len(years)


# Animated hexbin map, one frame per year: each hexagon shows the mean of
# 'diff_10k' for the rows inside it. The colour scale is centred on 0 and
# clipped to [-15, 15].
fig = ff.create_hexbin_mapbox(
    data_frame=df,lat='latitude', lon='longitude', nx_hexagon=90, animation_frame='year', zoom=3.3, 
    center={'lat': -15.778244, 'lon': -47.906081}, range_color=[-15,15],
    # Label typo fixed: "Difererença" -> "Diferença".
    labels={"color": "Diferença por 10k hab.", "frame": "Ano"}, color_continuous_midpoint=0,
    opacity=0.7, color="diff_10k", agg_func=np.mean, color_continuous_scale="Temps_r", width = 650, height=700
)
fig.update_layout(margin=dict(b=0, t=0, l=0, r=0))
# Push the slider and the play/pause buttons down so they clear the map.
fig.layout.sliders[0].pad.t=20
fig.layout.updatemenus[0].pad.t=40
fig.show()