In [None]:
import os

import numpy as np
import pandas as pd

import plotly.figure_factory as ff
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go

from sklearn.preprocessing import LabelEncoder
from sklearn.cluster import KMeans
from sklearn.preprocessing import MinMaxScaler

from tqdm import tqdm

from sklearn.linear_model import LinearRegression

In [None]:
# Input data locations, relative to the notebook's working directory.
SINAN_PATH      = './../data/raw_data/SINAN_prep_05.csv'
MUNICIPIOS_PATH = './../data/municipios_prep_01.csv'
BOLSA_PATH = './../data/consolidada_bolsafamilia.csv'
INEP_PATH = './../data/consolidada_inep.csv'
ATLAS_PATH = './../data/atlas_desenvolvimento_humano_por_municipio.csv'
OCUPACOES_PATH  = './../data/cbo_ocupacoes.csv'

# Mapbox access token. The MAPBOX_TOKEN environment variable takes precedence
# when it is set to a non-empty value; the literal is only a fallback.
# NOTE(review): a token committed with the notebook is readable by anyone who
# can see the file -- consider rotating it and dropping the fallback.
MAPBOX_TOKEN    = os.environ.get('MAPBOX_TOKEN') or 'pk.eyJ1IjoibHVjYXNuc2VxIiwiYSI6ImNrb241dHZ0cTBpd2MycW5yMGp2enFtMmkifQ.N6NJGlWhG-iYrIJMQ1MVVw'

px.set_mapbox_access_token(MAPBOX_TOKEN)

In [None]:
# Load the municipality table from the CSV file at MUNICIPIOS_PATH.
municipios_df = pd.read_csv(MUNICIPIOS_PATH)

In [None]:
# Preview the first rows of the municipality table.
municipios_df.head()

In [None]:
# Load the INEP table from the CSV file at INEP_PATH.
inep_df = pd.read_csv(INEP_PATH)

In [None]:
# (rows, columns) of the INEP table.
inep_df.shape

In [None]:
# Display the INEP table to inspect its layout.
inep_df

In [None]:
# Load the Bolsa Familia table from the CSV file at BOLSA_PATH.
bolsa_df = pd.read_csv(BOLSA_PATH)

In [None]:
# (rows, columns) of the Bolsa Familia table.
bolsa_df.shape

## Bolsa colunas:

- renda_media_sum
- renda_media_mean
- renda_media_median
- renda_media_std
- numero_comodos_sum
- numero_comodos_mean
- numero_comodos_median
- numero_comodos_std
- numero_comodos_dorm_sum
- numero_comodos_dorm_mean
- numero_comodos_dorm_median
- numero_comodos_dorm_std

- share_agua_canalizada

In [None]:
# Bolsa Familia columns carried over to the municipality table: four summary
# statistics for each of three metrics, plus the piped-water share.
_bolsa_metrics = ('renda_media', 'numero_comodos', 'numero_comodos_dorm')
_bolsa_stats   = ('sum', 'mean', 'median', 'std')

use_cols = [f'{metric}_{stat}' for metric in _bolsa_metrics for stat in _bolsa_stats]
use_cols.append('share_agua_canalizada')

In [None]:
# Copy the Bolsa Familia yearly figures into the municipality table, renaming
# each source column '<year>_<metric>' to '<metric>_<year>'.
df       = bolsa_df.set_index('2018_cd_ibge')
mun_df   = municipios_df.set_index('id')
mun_size = len(mun_df)

for cod_mun in tqdm(df.index):

    # Look the municipality's row up once; the original re-extracted it for
    # every (year, metric) pair, i.e. 65 times per municipality.
    mun_row = df.loc[cod_mun]

    for year in [2013, 2014, 2015, 2016, 2017]:

        for col in use_cols:

            value = mun_row[f'{year}_{col}']

            # Destination name puts the year last: '<metric>_<year>'.
            col_name = f'{col}_{year}'

            mun_df.at[cod_mun, col_name] = value

In [None]:
# (rows, columns) of the INEP table, checked again before selecting its columns.
inep_df.shape

In [None]:
# Display the INEP table again; the next cell reads its rows labelled 1 and 2.
inep_df

In [None]:
# Build cols_map[icg_group][year] -> name of the INEP column to read.
icg_group_values = [f'ICG_{num}' for num in range(1, 7)]

# Every column whose name mentions an ICG group (one entry per matching group).
temp_cols = [
    column
    for column in inep_df.columns
    for group in icg_group_values
    if group in column
]

# Keep only the columns whose cells in the rows labelled 1 and 2 both read 'Total'.
select_cols = [
    column
    for column in temp_cols
    if inep_df.loc[1, column] == 'Total' and inep_df.loc[2, column] == 'Total'
]

cols_map = {}
for icg_group in icg_group_values:

    group_columns = [column for column in select_cols if icg_group in column]

    # The n-th selected column of a group is paired with the n-th year.
    # NOTE(review): this relies on the columns being stored in chronological
    # order -- confirm against the INEP file.
    cols_map[icg_group] = {
        year: group_columns[position]
        for position, year in enumerate([2013, 2014, 2015, 2016, 2017])
    }

cols_map

In [None]:
# Copy the selected INEP "Total" figures into the municipality table as
# 'icg_<n>_total_<year>' columns.
df = inep_df.set_index('COD_MUNICIPIO')

icg_group_values = [f'ICG_{num}' for num in range(1, 7)]
years = [2013, 2014, 2015, 2016, 2017]

for cod_mun in tqdm(df.index):

    # Rows without a municipality code are skipped.
    if np.isnan(cod_mun): continue

    # Look the row up once instead of once per (group, year) pair.
    mun_row = df.loc[cod_mun]

    for icg_group in icg_group_values:

        # Iterate the `years` list defined above rather than a duplicate literal.
        for year in years:

            col = cols_map[icg_group][year]
            value = mun_row[col]

            col_name = f'{icg_group.lower()}_total_{year}'

            mun_df.at[cod_mun, col_name] = value

In [None]:
# List every column of the enriched municipality table (index column included),
# one name per line.
print(*mun_df.reset_index().columns, sep='\n')

In [None]:
# Correlation heatmap of the municipality features.
# Since pandas 2.0, DataFrame.corr() no longer drops non-numeric columns
# silently and raises on them instead, so restrict the frame to numeric/bool
# columns explicitly (the table is expected to carry text columns such as the
# municipality name). This matches what older pandas did implicitly.
numeric_mun_df = mun_df.select_dtypes(include=['number', 'bool'])
px.imshow(numeric_mun_df.corr(), height = 1000)

In [None]:
# Replace municipios_df with the enriched table, moving the index back into a regular column.
municipios_df = mun_df.reset_index()

In [None]:
def get_encoder(values):
    """Label-encode a collection of category labels.

    Parameters
    ----------
    values : array-like
        Labels to encode. Any shape is accepted; the input is flattened to
        one dimension before fitting.

    Returns
    -------
    encoded : numpy.ndarray
        Integer code of each label, in input order.
    encoder : LabelEncoder
        The fitted encoder, usable to map codes back to labels.
    """
    encoder = LabelEncoder()
    # LabelEncoder works on 1-D targets; the previous (-1, 1) column vector
    # only worked through an implicit ravel that emits DataConversionWarning.
    encoded = encoder.fit_transform(np.asarray(values).ravel())

    return encoded, encoder

In [None]:
# Build the feature table used for clustering.
# Identifier and coordinate columns must exist (a missing one raises KeyError).
# The cluster columns only exist once the clustering cell has been run, so
# they are dropped when present. The previous bare `except:` hid every error
# and left 'cluster' in the features when only 'cluster_id' was missing.
non_feature_columns = ['id', 'nome', 'uf_nome', 'uf_id', 'sinan_id', 'latitude', 'longitude']

values_df = (
    municipios_df
    .drop(columns=non_feature_columns)
    .drop(columns=['cluster', 'cluster_id'], errors='ignore')
)

encoders = {}
encodeds = {}
feature_columns = ['uf', 'regiao']

# Add an integer-coded '<column>_ID' version of each categorical column and
# keep the fitted encoder for later decoding.
for column in feature_columns:

    encodeds[column], encoders[column] = get_encoder(values_df[column].values)
    values_df[f'{column}_ID'] = encodeds[column]

# Missing values are replaced by the sentinel -1.
values_df = values_df.fillna(-1)

In [None]:
# Elbow analysis: K-Means inertia for k = 1..9 on the min-max scaled features.
scaler = MinMaxScaler()

# The raw categorical columns are excluded; their '<column>_ID' codes remain.
values = values_df.drop(columns=feature_columns).values
scaled_values = scaler.fit_transform(values)

sse = []
n_clusters_list = list(range(1, 10))

for n_clusters in n_clusters_list:
    # random_state makes the curve reproducible across re-runs; n_init is set
    # explicitly because its default differs between scikit-learn versions.
    km = KMeans(n_clusters=n_clusters, n_init=10, random_state=42)
    km.fit(scaled_values)
    sse.append(km.inertia_)

cluster_eval_df = pd.DataFrame({'n_clusters': n_clusters_list, 'inertia': sse})

fig = px.line(
    cluster_eval_df,
    x='n_clusters', y='inertia',
    title='Cluster evaluation')

fig.show()

In [None]:
# Fit the final 3-cluster K-Means model and label every municipality.
# random_state keeps the cluster numbering stable across re-runs; n_init is
# set explicitly because its default differs between scikit-learn versions.
clusterizer = KMeans(n_clusters = 3, n_init=10, random_state=42)
scaler = MinMaxScaler()

values = values_df.drop(columns=feature_columns).values
scaled_values = scaler.fit_transform(values)

clusterizer = clusterizer.fit(scaled_values)

# 'cluster' holds the integer label; 'cluster_id' is the string form
# 'cluster_<n>' used as a categorical colour in the plots.
municipios_df['cluster'] = clusterizer.predict(scaled_values)
municipios_df['cluster_id'] = 'cluster_' + municipios_df['cluster'].astype(str)

In [None]:
# Columns copied unchanged onto every per-year row of the long table.
remove_columns = ['id','nome','uf_nome','uf_id','uf','regiao','latitude','longitude', 'sinan_id', 'cluster', 'cluster_id']

# Every other column is treated as a per-year feature. A membership filter
# replaces the previous remove()/bare-except loop, which would also have
# swallowed unrelated errors.
keep_columns = [c for c in municipios_df.columns if c not in remove_columns]
    

In [None]:
# Reshape the wide municipality table ('<feature>_<year>' columns) into a long
# table with one row per (municipality, year).
years = [2013, 2014, 2015, 2016, 2017]
num_years = len(years)

# keep_columns holds one entry per (feature, year). Collapse it to the unique
# base names, in order of first appearance, so each feature is built once per
# municipality instead of once per year suffix.
base_columns = list(dict.fromkeys(col.split('_201')[0] for col in keep_columns))

frames = []

for index in tqdm(municipios_df.index):

    mun_data = municipios_df.loc[index]

    record = {'year': years}

    # Identifier/label columns are repeated on every yearly row.
    for col in remove_columns:
        record[col] = [mun_data[col]] * num_years

    # Each feature contributes one value per year.
    for col in base_columns:
        record[col] = [mun_data[f'{col}_{year}'] for year in years]

    temp_df = pd.DataFrame(record)
    frames.append(temp_df)

# Concatenate once at the end; calling pd.concat inside the loop re-copies the
# accumulated frame on every iteration (quadratic in the number of rows).
df = pd.concat(frames) if frames else pd.DataFrame()

In [None]:
# Show the per-year feature columns selected above.
keep_columns

In [None]:
# Pairwise scatter plots of a handful of long-table columns, coloured by cluster.
scatter_dimensions = [
    "icg_2_total",
    "icg_4_total",
    'icg_6_total',
    'renda_media_mean',
    'numero_comodos_dorm_mean',
    'unit_relat',
    'denun_relat',
    'year',
]

fig = px.scatter_matrix(
    df,
    dimensions=scatter_dimensions,
    color="cluster_id",
    height=1000,
)
fig.show()

In [None]:
# Preview the municipality table, which now includes the merged features and cluster labels.
municipios_df.head()

In [None]:
# Unfitted linear regression model.
# NOTE(review): `regressor` is re-created in a later cell before being fitted,
# so this instance appears to be unused.
regressor = LinearRegression()

In [None]:
# Build the regression matrices: predict 'denun_count_2017' from every
# non-2017 column.
# Identifier columns must exist (a missing one raises KeyError). 'cluster_id'
# only exists once the clustering cell has been run, so it is dropped when
# present. This gives the same result as the previous try/except without a
# bare `except:` that would hide unrelated errors.
values_df = (
    municipios_df
    .drop(columns=['id', 'nome', 'uf_nome', 'uf_id', 'sinan_id'])
    .drop(columns=['cluster_id'], errors='ignore')
)
# NOTE(review): the integer 'cluster' column is not dropped, so it stays in X
# when present. It was fitted on features that include the 2017 columns, which
# may leak target information -- confirm this is intended.

encoders = {}
encodeds = {}
feature_columns = ['uf', 'regiao']

# Add integer-coded '<column>_ID' versions of the categorical columns.
for column in feature_columns:

    encodeds[column], encoders[column] = get_encoder(values_df[column].values)
    values_df[f'{column}_ID'] = encodeds[column]

# Missing values become the sentinel -1; the raw categorical columns are then
# removed, leaving only their '_ID' codes.
values_df = values_df.fillna(-1)
values_df = values_df.drop(['uf', 'regiao'], axis = 1)

# Every column mentioning 2017 is excluded from the predictors.
remove_cols = [col for col in values_df.columns if '2017' in col]

X_df = values_df.drop(remove_cols, axis=1)
Y_df = values_df['denun_count_2017']

values_X = X_df.values
values_Y = Y_df.values.reshape(-1, 1)   # column vector, as the scalers require 2-D input

scaler_x      = MinMaxScaler()
scaler_y      = MinMaxScaler()

# Min-max scaled copies of the same matrices.
scaled_X = scaler_x.fit_transform(values_X)
scaled_Y = scaler_y.fit_transform(values_Y)

In [None]:
# Distribution of the regression target (missing values appear as -1 after the fillna above).
px.histogram(values_df['denun_count_2017'])

In [None]:
# Distribution of 'denun_relat_2017' (missing values appear as -1 after the fillna above).
px.histogram(values_df['denun_relat_2017'])

In [None]:
# Fit ordinary least squares on the unscaled matrices. The displayed score is
# R^2 on the same data the model was fitted on, not a held-out estimate.
regressor = LinearRegression().fit(values_X, values_Y)
regressor.score(values_X, values_Y)

In [None]:
# Compare in-sample predictions with the observed target, per municipality.
predictions_df = pd.DataFrame()

predictions_df['prediction'] = regressor.predict(values_X).reshape(-1,)
predictions_df['real']       = values_Y.reshape(-1,)
predictions_df['abs_diff']   = np.abs(predictions_df['prediction'] - predictions_df['real'])
predictions_df['nome']       = municipios_df['nome']
predictions_df['uf']         = municipios_df['uf']
predictions_df['regiao']     = municipios_df['regiao']

fig = px.scatter(predictions_df, x='prediction', y='real', color="abs_diff",
                 hover_data=['nome', 'uf', 'regiao', 'abs_diff'])

# Identity line (prediction == real) spanning the actual data range. The
# previous fixed 0-0.025 span did not follow the scale of the plotted values,
# and go.Line is deprecated in favour of go.Scatter(mode='lines').
axis_min = float(predictions_df[['prediction', 'real']].min().min())
axis_max = float(predictions_df[['prediction', 'real']].max().max())
fig.add_trace(go.Scatter(x=[axis_min, axis_max], y=[axis_min, axis_max],
                         mode='lines', name='prediction = real'))

fig.show()

In [None]:
# Plot the fitted intercept ('bias') and one coefficient per predictor column.
coef_names  = ['bias', *X_df.columns]
# intercept_ is indexed with [0] because the model was fitted on a 2-D target.
coef_values = [regressor.intercept_[0], *regressor.coef_.reshape(-1,)]

coef_df = pd.DataFrame({'name': coef_names, 'value': coef_values})

px.scatter(coef_df, x='name', y='value', color="value", width = 2500)