In [None]:
import os

import numpy as np
import pandas as pd

import plotly.figure_factory as ff
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go

from sklearn.preprocessing import LabelEncoder
from sklearn.cluster import KMeans
from sklearn.preprocessing import MinMaxScaler

from tqdm import tqdm

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor

from sklearn.model_selection import train_test_split

In [None]:
# Input datasets, given relative to the notebook's working directory.
SINAN_PATH      = './../data/raw_data/SINAN_prep_05.csv'
MUNICIPIOS_PATH = './../data/municipios_prep_02.csv'
BOLSA_PATH      = './../data/consolidada_bolsafamilia.csv'
INEP_PATH       = './../data/consolidada_inep.csv'
ATLAS_PATH      = './../data/atlas_desenvolvimento_humano_por_municipio.csv'
OCUPACOES_PATH  = './../data/cbo_ocupacoes.csv'

# The MAPBOX_TOKEN environment variable takes precedence, so a private or
# rotated token never has to be written into the notebook itself.
# NOTE(review): the fallback literal is the token that was already committed
# in this cell; consider rotating it and dropping the literal altogether.
MAPBOX_TOKEN = os.environ.get(
    'MAPBOX_TOKEN',
    'pk.eyJ1IjoibHVjYXNuc2VxIiwiYSI6ImNrb241dHZ0cTBpd2MycW5yMGp2enFtMmkifQ.N6NJGlWhG-iYrIJMQ1MVVw',
)

# Register the token with plotly express for its mapbox-based figures.
px.set_mapbox_access_token(MAPBOX_TOKEN)

In [None]:
# Load the municipalities table from MUNICIPIOS_PATH into a DataFrame.
municipios_df = pd.read_csv(MUNICIPIOS_PATH)

In [None]:
# Preview the first rows to check that the file loaded as expected.
municipios_df.head()

In [None]:
def get_encoder(values):
    '''
    Fit a LabelEncoder on `values` and return the encoded labels with it.

    Parameters
    ----------
    values : array-like
        Labels to encode. Any shape is accepted; the input is flattened
        to one dimension before fitting.

    Returns
    -------
    encoded : numpy.ndarray
        1-D array of integer codes, one per input label.
    encoder : LabelEncoder
        The fitted encoder, usable to invert or re-apply the mapping.
    '''
    encoder = LabelEncoder()

    # LabelEncoder works on 1-D label arrays; handing it a column vector
    # makes scikit-learn emit a conversion warning, so flatten explicitly.
    labels  = np.asarray(values).reshape(-1)
    encoded = encoder.fit_transform(labels)

    return encoded, encoder

In [None]:
# Histogram of the `pop_2017` column, with a box plot drawn in the margin.
fig = px.histogram(municipios_df, x='pop_2017', marginal="box")
fig.show()

### Pipeline de preparação dos dados de treino

In [None]:
def train_test_data_pipeline(dataframe, train_size = 0.7, random_state = None,
                             min_population = 10000, max_population = 500000):
    '''
    Prepare the municipalities table for regression and split it.

    Steps, in order: index by 'id', drop identifier columns, keep rows whose
    'pop_2017' lies strictly between the population bounds, label-encode
    'uf' and 'regiao', fill missing values with -1, drop columns whose name
    contains 'icg' or 'comodos', then build the features from every column
    whose name does not contain '2017' and the target from
    'denun_relat_2017'.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain the columns 'id', 'pop_2017', 'uf', 'regiao' and
        'denun_relat_2017'.
    train_size : float
        Fraction of the remaining rows sampled (without replacement) for
        training; the count is truncated with int().
    random_state : int or None
        Seed for the split. When None, NumPy's global random state is used,
        so results vary between runs unless that state was seeded.
    min_population, max_population : number
        Exclusive bounds applied to 'pop_2017'.

    Returns
    -------
    X_df_train, Y_df_train, X_df_test, Y_df_test
        Feature frames and target series, each keeping the row order of the
        filtered input.
    '''

    values_df = dataframe.set_index('id')

    # Drop identifier columns that must not be used as features.
    # errors='ignore' skips the absent ones ('id' is already the index)
    # without hiding unrelated failures the way a bare except would.
    values_df = values_df.drop(
        columns=['id', 'nome', 'uf_nome', 'uf_id', 'sinan_id', 'cluster_id'],
        errors='ignore',
    )

    # Keep the requested population interval. The copy makes the column
    # assignments below write to an independent frame rather than a slice.
    population = values_df['pop_2017']
    values_df  = values_df[(population > min_population) & (population < max_population)].copy()

    # Replace the categorical columns with integer codes.
    for column in ['uf', 'regiao']:
        values_df[column], _ = get_encoder(values_df[column].values)

    # Missing values are flagged with -1.
    values_df = values_df.fillna(-1)

    # Drop the remaining excess columns.
    excess_cols = [col for col in values_df.columns if 'icg' in col or 'comodos' in col]
    values_df   = values_df.drop(columns=excess_cols)

    # Features exclude every 2017 column; the target is the 2017 value.
    cols_2017 = [col for col in values_df.columns if '2017' in col]
    X_df = values_df.drop(columns=cols_2017)
    Y_df = values_df['denun_relat_2017']

    # Sample the training ids, then split with a boolean mask so the row
    # order is preserved and membership is not tested against a Python list.
    rng           = np.random if random_state is None else np.random.RandomState(random_state)
    train_indexes = rng.choice(X_df.index, int(len(X_df)*train_size), replace = False)
    is_train      = X_df.index.isin(train_indexes)

    X_df_train = X_df[is_train]
    Y_df_train = Y_df[is_train]
    X_df_test  = X_df[~is_train]
    Y_df_test  = Y_df[~is_train]

    return X_df_train, Y_df_train, X_df_test, Y_df_test

In [None]:
def visualize_train_result(X_df, Y_df, regressor, original_df):
    '''
    Build a predicted-vs-real scatter plot for a fitted regressor.

    Parameters
    ----------
    X_df : pandas.DataFrame
        Features, indexed by municipality id.
    Y_df : pandas.Series
        Real target values, aligned row by row with `X_df`.
    regressor : fitted estimator
        Must expose `predict` and `score`.
    original_df : pandas.DataFrame
        Source table with an 'id' column plus 'nome', 'uf', 'regiao' and
        'pop_2016', which are shown as hover data.

    Returns
    -------
    plotly figure
        Scatter of prediction (x) against real value (y), coloured by
        absolute error, with a `prediction = real` reference line. The
        title carries the estimator class name and its score.

    Raises
    ------
    KeyError
        If an id in `X_df.index` is missing from `original_df`.
    '''

    X_values = X_df.values
    Y_values = Y_df.values.reshape(-1,)

    prediction = regressor.predict(X_values).reshape(-1,)
    error      = prediction - Y_values

    predictions_df = pd.DataFrame({
        'prediction': prediction,
        'real':       Y_values,
        'abs_diff':   np.abs(error),
        'diff':       error,
        'sqr_diff':   error**2,
    })

    # Look up the descriptive columns for every plotted municipality in one
    # pass. Values are assigned positionally because predictions_df has a
    # fresh 0..n-1 index.
    variables = ['nome', 'uf', 'regiao', 'pop_2016']
    lookup    = original_df.set_index('id').loc[X_df.index, variables]

    for variable in variables:
        predictions_df[variable] = lookup[variable].to_numpy()

    score = regressor.score(X_values, Y_values)
    # The class name is stable even when the estimator has non-default
    # parameters, unlike splitting its repr on '()'.
    name  = type(regressor).__name__

    title = f'Regressor: {name} | score: {round(score, 5)}'

    fig = px.scatter(predictions_df, x='prediction', y='real', color="abs_diff",
                     hover_data=variables + ['abs_diff', 'diff', 'sqr_diff'], title=title)

    # Reference line covering both axes, padded by 10% of the span so it
    # still extends past the data when values are negative.
    low  = float(min(prediction.min(), Y_values.min()))
    high = float(max(prediction.max(), Y_values.max()))
    pad  = 0.1 * (high - low)
    diagonal = np.linspace(low - pad, high + pad)

    # go.Scatter in 'lines' mode replaces the deprecated go.Line.
    fig.add_trace(go.Scatter(x=diagonal, y=diagonal, mode='lines', name='prediction = real'))

    return fig

### Aplicando alguns regressores

#### Linear Regression

In [None]:
%%time

# Draw a fresh random 70/30 split of the municipalities table.
X_df_train, Y_df_train, X_df_test, Y_df_test = train_test_data_pipeline(municipios_df, train_size = 0.7)

# Plain NumPy arrays are used for both fitting and scoring.
X_values_train, Y_value_train = X_df_train.values, Y_df_train.values.reshape(-1,)
X_values_test, Y_values_test  = X_df_test.values, Y_df_test.values.reshape(-1,)

regressor = LinearRegression()

regressor = regressor.fit(X_values_train, Y_value_train)
# Score on the same array representation used for fitting: passing the
# DataFrames here makes scikit-learn warn about feature names the model
# was fitted without.
score     = regressor.score(X_values_test, Y_values_test)

print('Score:', score)

In [None]:
# Plot predicted vs. real values on the test split for the current `regressor`.
fig = visualize_train_result(X_df_test, Y_df_test, regressor, municipios_df)
fig.show()

#### Random Forest Regressor

In [None]:
%%time

# Draw a fresh random 70/30 split of the municipalities table.
X_df_train, Y_df_train, X_df_test, Y_df_test = train_test_data_pipeline(municipios_df, train_size = 0.7)

# Plain NumPy arrays are used for both fitting and scoring.
X_values_train, Y_value_train = X_df_train.values, Y_df_train.values.reshape(-1,)
X_values_test, Y_values_test  = X_df_test.values, Y_df_test.values.reshape(-1,)

regressor = RandomForestRegressor()

regressor = regressor.fit(X_values_train, Y_value_train)
# Score on the same array representation used for fitting: passing the
# DataFrames here makes scikit-learn warn about feature names the model
# was fitted without.
score     = regressor.score(X_values_test, Y_values_test)

print('Score:', score)

In [None]:
# Plot predicted vs. real values on the test split for the current `regressor`.
fig = visualize_train_result(X_df_test, Y_df_test, regressor, municipios_df)
fig.show()

#### Gradient Boosting Regressor

In [None]:
%%time

# Draw a fresh random 70/30 split of the municipalities table.
X_df_train, Y_df_train, X_df_test, Y_df_test = train_test_data_pipeline(municipios_df, train_size = 0.7)

# Plain NumPy arrays are used for both fitting and scoring.
X_values_train, Y_value_train = X_df_train.values, Y_df_train.values.reshape(-1,)
X_values_test, Y_values_test  = X_df_test.values, Y_df_test.values.reshape(-1,)

regressor = GradientBoostingRegressor()

regressor = regressor.fit(X_values_train, Y_value_train)
# Score on the same array representation used for fitting: passing the
# DataFrames here makes scikit-learn warn about feature names the model
# was fitted without.
score     = regressor.score(X_values_test, Y_values_test)

print('Score:', score)

In [None]:
# Plot predicted vs. real values on the test split for the current `regressor`.
fig = visualize_train_result(X_df_test, Y_df_test, regressor, municipios_df)
fig.show()

#### Avaliando Capitais

In [None]:
# Names of the 27 capitals to evaluate (including Brasília).
capitals = ['Rio Branco','Maceió','Macapá','Manaus','Salvador','Fortaleza','Brasília','Vitória',
            'Goiânia','São Luís','Cuiabá','Campo Grande','Belo Horizonte','Belém','João Pessoa',
            'Curitiba','Recife','Teresina','Rio de Janeiro','Natal','Porto Alegre','Porto Velho',
            'Boa Vista','Florianópolis','São Paulo','Aracaju','Palmas']

# Select the rows whose 'nome' is in the list above.
# NOTE(review): matching on the name alone also selects any other
# municipality that shares a capital's name — confirm, and consider
# filtering by state as well.
mun_data = municipios_df[municipios_df['nome'].isin(capitals)]

# train_size = 1 places every remaining row in the "train" outputs; the
# (empty) test outputs are discarded.
# NOTE(review): the pipeline still applies its population filter
# (10000 < pop_2017 < 500000), so capitals outside that interval are
# dropped before plotting — confirm this is intended.
X_df, Y_df, _, _ = train_test_data_pipeline(mun_data, train_size = 1)

# Uses whichever `regressor` is currently in scope from the cells above.
fig = visualize_train_result(X_df, Y_df, regressor, municipios_df)
fig.show()