In [1]:
import os
import numpy as np
import pandas as pd
from tqdm import tqdm
from pathlib import Path
from zipfile import ZipFile
from datatable import dt, f, by
import matplotlib.pyplot as plt
from matplotlib_venn import venn2, venn3
from plotly import express as px, io as pio

pd.options.plotting.backend = 'plotly'
pio.renderers.default = 'plotly_mimetype+notebook_connected'

# Bases

## SINASC

In [2]:
path_sinasc_sus = f'{Path.home()}/Databases/SINASC/sinasc_sus.csv.gzip'
df_sinasc = pd.read_csv(path_sinasc_sus, low_memory=False)
df_sinasc

Unnamed: 0,ano,cnes,hosp_municipio,res_municipio,hosp_regiao_saude,res_regiao_saude,parto_normal,n_gestados,n_pre_natal,idade,nivel_escolaridade,raca_cor,nasc_raca_cor,sexo_fem,nasc_peso,nasc_apgar1,nasc_apgar5
0,2010,2798484,110030,120040,1103,1201,True,1,4,20,3,,Branca,False,3550.0,8.0,9.0
1,2010,5701929,120001,120001,1201,1201,True,1,3,21,4,,Parda,True,3000.0,9.0,10.0
2,2010,5701929,120001,120001,1201,1201,True,1,3,31,3,,Parda,True,3000.0,9.0,10.0
3,2010,5701929,120001,120001,1201,1201,True,1,4,23,3,,Parda,False,3900.0,7.0,9.0
4,2010,5701929,120001,120001,1201,1201,True,1,4,26,4,,Parda,True,3250.0,7.0,10.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22773577,2019,2361787,520110,170730,5211,1710,False,1,3,18,3,Parda,Parda,True,2840.0,8.0,9.0
22773578,2019,2338564,520870,171110,5201,1704,False,1,4,27,4,Parda,Parda,True,3082.0,8.0,9.0
22773579,2019,10537,530010,170240,5301,1709,False,1,3,36,5,,Ignorado,,2870.0,8.0,9.0
22773580,2019,5717515,530010,170610,5301,1710,False,1,3,21,4,Parda,Parda,False,2554.0,8.0,9.0


In [3]:
df_sinasc.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22773582 entries, 0 to 22773581
Data columns (total 17 columns):
 #   Column              Dtype  
---  ------              -----  
 0   ano                 int64  
 1   cnes                int64  
 2   hosp_municipio      int64  
 3   res_municipio       int64  
 4   hosp_regiao_saude   int64  
 5   res_regiao_saude    int64  
 6   parto_normal        bool   
 7   n_gestados          int64  
 8   n_pre_natal         int64  
 9   idade               int64  
 10  nivel_escolaridade  int64  
 11  raca_cor            object 
 12  nasc_raca_cor       object 
 13  sexo_fem            object 
 14  nasc_peso           float64
 15  nasc_apgar1         float64
 16  nasc_apgar5         float64
dtypes: bool(1), float64(3), int64(10), object(3)
memory usage: 2.7+ GB


## Municípios

In [4]:
path_muns = f'{Path.home()}/Databases/MUNICIPIOS/municipios.csv.gzip'
df_muns = pd.read_csv(path_muns)
df_muns

Unnamed: 0,cod_ibge,regiao,uf,cod_uf,nome_uf,municipio,nome,capital,fronteira,amazonia,macroregiao_saude,regiao_saude,microregiao_saude,latitude,longitude,altitude,area
0,110000,Norte,RO,11,RONDONIA,Município ignorado - RO,MUNICIPIO IGNORADO - RO,False,False,False,1100,1100,11000,0.000000,0.000000,0.0,0.000000
1,110001,Norte,RO,11,RONDONIA,Alta Floresta D'Oeste,ALTA FLORESTA D'OESTE,False,True,True,1190,1102,11900,-11.929000,-61.995998,350.0,7066.702148
2,110002,Norte,RO,11,RONDONIA,Ariquemes,ARIQUEMES,False,False,True,1190,1104,11900,-9.913000,-63.041000,142.0,4426.558105
3,110003,Norte,RO,11,RONDONIA,Cabixi,CABIXI,False,True,True,1190,1103,11900,-13.492000,-60.544998,230.0,1314.354980
4,110004,Norte,RO,11,RONDONIA,Cacoal,CACOAL,False,False,True,1190,1102,11900,-11.438000,-61.448002,200.0,3792.637939
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4789,522200,Centro-Oeste,GO,52,GOIAS,Vianópolis,VIANOPOLIS,False,False,False,5201,5202,52900,-16.742001,-48.515999,1002.0,954.278992
4790,522205,Centro-Oeste,GO,52,GOIAS,Vicentinópolis,VICENTINOPOLIS,False,False,False,5201,5202,52900,-17.735001,-49.806000,646.0,737.250977
4791,522220,Centro-Oeste,GO,52,GOIAS,Vila Boa,VILA BOA,False,False,False,5202,5206,52900,-15.038000,-47.058998,0.0,1060.170044
4792,522230,Centro-Oeste,GO,52,GOIAS,Vila Propício,VILA PROPICIO,False,False,False,5203,5212,52900,-15.457000,-48.889000,744.0,2181.574951


In [5]:
df_muns.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4794 entries, 0 to 4793
Data columns (total 17 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   cod_ibge           4794 non-null   int64  
 1   regiao             4794 non-null   object 
 2   uf                 4794 non-null   object 
 3   cod_uf             4794 non-null   int64  
 4   nome_uf            4794 non-null   object 
 5   municipio          4794 non-null   object 
 6   nome               4794 non-null   object 
 7   capital            4794 non-null   bool   
 8   fronteira          4794 non-null   bool   
 9   amazonia           4794 non-null   bool   
 10  macroregiao_saude  4794 non-null   int64  
 11  regiao_saude       4794 non-null   int64  
 12  microregiao_saude  4794 non-null   int64  
 13  latitude           4790 non-null   float64
 14  longitude          4790 non-null   float64
 15  altitude           4790 non-null   float64
 16  area               4790 

## Distâncias

In [6]:
path_dist = f'{Path.home()}/Databases/DISTANCIAS/matriz_distancias.zip'
df_dist = pd.read_csv(path_dist, low_memory=False)
df_dist

Unnamed: 0,origem,destino,distancia,tempo
0,110001,110001,0.000000,0.000000
1,110001,110002,309.050000,6.169056
2,110001,110003,399.499700,6.289056
3,110001,110004,81.201103,1.917750
4,110001,110005,391.704300,6.136361
...,...,...,...,...
31024895,530010,522200,161.728900,2.356000
31024896,530010,522205,382.708800,5.479556
31024897,530010,522220,161.603400,2.248944
31024898,530010,522230,190.000900,3.216083


In [7]:
df_dist.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 31024900 entries, 0 to 31024899
Data columns (total 4 columns):
 #   Column     Dtype  
---  ------     -----  
 0   origem     int64  
 1   destino    int64  
 2   distancia  float64
 3   tempo      float64
dtypes: float64(2), int64(2)
memory usage: 946.8 MB


## Indicador SocioEconômico

In [8]:
path_socioeco = f'{Path.home()}/Databases/HEALTH/health_regions.csv'
df_socioeco = pd.read_csv(path_socioeco)
df_socioeco

Unnamed: 0,UF,Município,Cód IBGE,Cód Região de Saúde,Nome da Região de Saúde,Grupo
0,AC,Acrelândia,120001,12002,Baixo Acre e Purus,2
1,AC,Assis Brasil,120005,12001,Alto Acre,1
2,AC,Brasiléia,120010,12001,Alto Acre,1
3,AC,Bujari,120013,12002,Baixo Acre e Purus,2
4,AC,Capixaba,120017,12002,Baixo Acre e Purus,2
...,...,...,...,...,...,...
5566,TO,Tocantinópolis,172120,17002,Bico do Papagaio,1
5567,TO,Tupirama,172125,17004,Cerrado Tocantins Araguaia,2
5568,TO,Tupiratins,172130,17004,Cerrado Tocantins Araguaia,2
5569,TO,Wanderlândia,172208,17001,Médio Norte Araguaia,3


# Tabela Deslocamento

In [9]:
def merge_lat_lon(df_left, df_right, left_on, right_on):
  prefix = left_on.split('_')[0]
  df = pd.merge(df_left, df_right, how='left', left_on=left_on, right_on=right_on)
  df = df.rename(columns={
    'latitude': f'{prefix}_latitude',
    'longitude': f'{prefix}_longitude'})
  df = df.drop(right_on, axis=1)
  return df

In [10]:
cols = ['res_municipio', 'hosp_municipio', 'res_regiao_saude', 'hosp_regiao_saude', 'cnes']
df_deslc = df_sinasc[cols].groupby(
  cols, as_index=False
).size().sort_values('size', ascending=False, ignore_index=True)
df_deslc = df_deslc.rename(columns={'size': 'nascimentos'})
df_deslc

Unnamed: 0,res_municipio,hosp_municipio,res_regiao_saude,hosp_regiao_saude,cnes,nascimentos
0,130260,130260,1301,1301,3151794,80387
1,350950,350950,3512,3512,2022621,74956
2,500270,500270,5011,5011,9768,70431
3,355030,355030,3501,3501,2079186,68200
4,355030,355030,3501,3501,2077388,62884
...,...,...,...,...,...,...
120213,353540,500270,3516,5011,9768,1
120214,353550,316470,3508,3122,2146525,1
120215,353550,350320,3508,3507,6943284,1
120216,260950,260540,2602,2602,2712024,1


In [11]:
df_deslc['mun_diff'] = df_deslc['res_municipio'] != df_deslc['hosp_municipio']
df_deslc['regsau_diff'] = df_deslc['res_regiao_saude'] != df_deslc['hosp_regiao_saude']
df_deslc

Unnamed: 0,res_municipio,hosp_municipio,res_regiao_saude,hosp_regiao_saude,cnes,nascimentos,mun_diff,regsau_diff
0,130260,130260,1301,1301,3151794,80387,False,False
1,350950,350950,3512,3512,2022621,74956,False,False
2,500270,500270,5011,5011,9768,70431,False,False
3,355030,355030,3501,3501,2079186,68200,False,False
4,355030,355030,3501,3501,2077388,62884,False,False
...,...,...,...,...,...,...,...,...
120213,353540,500270,3516,5011,9768,1,True,True
120214,353550,316470,3508,3122,2146525,1,True,True
120215,353550,350320,3508,3507,6943284,1,True,True
120216,260950,260540,2602,2602,2712024,1,True,False


In [12]:
res_cols = [
  'cod_ibge',
  'regiao',
  'uf',
  'capital',
  'fronteira',
  'amazonia',
  'latitude',
  'longitude',]

hosp_cols = [
  'cod_ibge',
  'latitude',
  'longitude',]

In [13]:
df_deslc = merge_lat_lon(df_deslc, df_muns[res_cols], 'res_municipio', 'cod_ibge')
df_deslc = merge_lat_lon(df_deslc, df_muns[hosp_cols], 'hosp_municipio', 'cod_ibge')
df_deslc

Unnamed: 0,res_municipio,hosp_municipio,res_regiao_saude,hosp_regiao_saude,cnes,nascimentos,mun_diff,regsau_diff,regiao,uf,capital,fronteira,amazonia,res_latitude,res_longitude,hosp_latitude,hosp_longitude
0,130260,130260,1301,1301,3151794,80387,False,False,Norte,AM,True,False,True,-3.102000,-60.025002,-3.102000,-60.025002
1,350950,350950,3512,3512,2022621,74956,False,False,Sudeste,SP,False,False,False,-22.906000,-47.061001,-22.906000,-47.061001
2,500270,500270,5011,5011,9768,70431,False,False,Centro-Oeste,MS,True,False,False,-20.443001,-54.646000,-20.443001,-54.646000
3,355030,355030,3501,3501,2079186,68200,False,False,Sudeste,SP,True,False,False,-23.548000,-46.636002,-23.548000,-46.636002
4,355030,355030,3501,3501,2077388,62884,False,False,Sudeste,SP,True,False,False,-23.548000,-46.636002,-23.548000,-46.636002
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
120218,353540,500270,3516,5011,9768,1,True,True,Sudeste,SP,False,False,False,-21.356001,-51.860001,-20.443001,-54.646000
120219,353550,316470,3508,3122,2146525,1,True,True,Sudeste,SP,False,False,False,-22.413000,-50.576000,-20.917000,-46.991001
120220,353550,350320,3508,3507,6943284,1,True,True,Sudeste,SP,False,False,False,-22.413000,-50.576000,-21.794001,-48.175999
120221,260950,260540,2602,2602,2712024,1,True,False,,,,,,,,,


In [14]:
df_deslc = pd.merge(df_deslc, df_dist, how='left', left_on=['res_municipio', 'hosp_municipio'], right_on=['origem', 'destino'])
df_deslc = df_deslc.drop(['origem', 'destino'], axis=1)
df_deslc

Unnamed: 0,res_municipio,hosp_municipio,res_regiao_saude,hosp_regiao_saude,cnes,nascimentos,mun_diff,regsau_diff,regiao,uf,capital,fronteira,amazonia,res_latitude,res_longitude,hosp_latitude,hosp_longitude,distancia,tempo
0,130260,130260,1301,1301,3151794,80387,False,False,Norte,AM,True,False,True,-3.102000,-60.025002,-3.102000,-60.025002,0.0000,0.000000
1,350950,350950,3512,3512,2022621,74956,False,False,Sudeste,SP,False,False,False,-22.906000,-47.061001,-22.906000,-47.061001,0.0000,0.000000
2,500270,500270,5011,5011,9768,70431,False,False,Centro-Oeste,MS,True,False,False,-20.443001,-54.646000,-20.443001,-54.646000,0.0000,0.000000
3,355030,355030,3501,3501,2079186,68200,False,False,Sudeste,SP,True,False,False,-23.548000,-46.636002,-23.548000,-46.636002,0.0000,0.000000
4,355030,355030,3501,3501,2077388,62884,False,False,Sudeste,SP,True,False,False,-23.548000,-46.636002,-23.548000,-46.636002,0.0000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
120218,353540,500270,3516,5011,9768,1,True,True,Sudeste,SP,False,False,False,-21.356001,-51.860001,-20.443001,-54.646000,420.1754,5.446972
120219,353550,316470,3508,3122,2146525,1,True,True,Sudeste,SP,False,False,False,-22.413000,-50.576000,-20.917000,-46.991001,457.4067,5.878667
120220,353550,350320,3508,3507,6943284,1,True,True,Sudeste,SP,False,False,False,-22.413000,-50.576000,-21.794001,-48.175999,307.4889,4.139444
120221,260950,260540,2602,2602,2712024,1,True,False,,,,,,,,,,44.8369,0.741667


## Tabela Final

In [15]:
path_deslc = f'{Path.home()}/Databases/GESTANTES/deslocamento.csv.gzip'
df_deslc.to_csv(path_deslc, index=False)

In [16]:
pd.read_csv(path_deslc)

Unnamed: 0,res_municipio,hosp_municipio,res_regiao_saude,hosp_regiao_saude,cnes,nascimentos,mun_diff,regsau_diff,regiao,uf,capital,fronteira,amazonia,res_latitude,res_longitude,hosp_latitude,hosp_longitude,distancia,tempo
0,130260,130260,1301,1301,3151794,80387,False,False,Norte,AM,True,False,True,-3.102000,-60.025002,-3.102000,-60.025002,0.0000,0.000000
1,350950,350950,3512,3512,2022621,74956,False,False,Sudeste,SP,False,False,False,-22.906000,-47.061001,-22.906000,-47.061001,0.0000,0.000000
2,500270,500270,5011,5011,9768,70431,False,False,Centro-Oeste,MS,True,False,False,-20.443001,-54.646000,-20.443001,-54.646000,0.0000,0.000000
3,355030,355030,3501,3501,2079186,68200,False,False,Sudeste,SP,True,False,False,-23.548000,-46.636002,-23.548000,-46.636002,0.0000,0.000000
4,355030,355030,3501,3501,2077388,62884,False,False,Sudeste,SP,True,False,False,-23.548000,-46.636002,-23.548000,-46.636002,0.0000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
120218,353540,500270,3516,5011,9768,1,True,True,Sudeste,SP,False,False,False,-21.356001,-51.860001,-20.443001,-54.646000,420.1754,5.446972
120219,353550,316470,3508,3122,2146525,1,True,True,Sudeste,SP,False,False,False,-22.413000,-50.576000,-20.917000,-46.991001,457.4067,5.878667
120220,353550,350320,3508,3507,6943284,1,True,True,Sudeste,SP,False,False,False,-22.413000,-50.576000,-21.794001,-48.175999,307.4889,4.139444
120221,260950,260540,2602,2602,2712024,1,True,False,,,,,,,,,,44.8369,0.741667


In [18]:
df_deslc.to_excel(path_deslc.replace('csv.gzip', 'xlsx'))