In [1]:
import os
import numpy as np
import pandas as pd
from tqdm import tqdm
from pathlib import Path
from zipfile import ZipFile
from datatable import dt, f, by
import matplotlib.pyplot as plt
from matplotlib_venn import venn2, venn3
from plotly import express as px, io as pio

pd.options.plotting.backend = 'plotly'
pio.renderers.default = 'plotly_mimetype+notebook_connected'

# Bases

## SINASC

In [2]:
path_sinasc_sus = f'{Path.home()}/Databases/SINASC/sinasc_sus.csv.gzip'
df_sinasc = pd.read_csv(path_sinasc_sus, low_memory=False)
df_sinasc

Unnamed: 0,ano,cnes,hosp_municipio,res_municipio,hosp_regiao_saude,res_regiao_saude,parto_normal,n_gestados,n_pre_natal,idade,nivel_escolaridade,raca_cor,nasc_raca_cor,sexo_fem,nasc_peso,nasc_apgar1,nasc_apgar5,periodo
0,2010,2798484,110030,120040,1103,1201,True,1,4,20,3,,Branca,False,3550.0,8.0,9.0,antes
1,2010,5701929,120001,120001,1201,1201,True,1,3,21,4,,Parda,True,3000.0,9.0,10.0,antes
2,2010,5701929,120001,120001,1201,1201,True,1,3,31,3,,Parda,True,3000.0,9.0,10.0,antes
3,2010,5701929,120001,120001,1201,1201,True,1,4,23,3,,Parda,False,3900.0,7.0,9.0,antes
4,2010,5701929,120001,120001,1201,1201,True,1,4,26,4,,Parda,True,3250.0,7.0,10.0,antes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22773577,2019,2361787,520110,170730,5211,1710,False,1,3,18,3,Parda,Parda,True,2840.0,8.0,9.0,depois
22773578,2019,2338564,520870,171110,5201,1704,False,1,4,27,4,Parda,Parda,True,3082.0,8.0,9.0,depois
22773579,2019,10537,530010,170240,5301,1709,False,1,3,36,5,,Ignorado,,2870.0,8.0,9.0,depois
22773580,2019,5717515,530010,170610,5301,1710,False,1,3,21,4,Parda,Parda,False,2554.0,8.0,9.0,depois


In [3]:
df_sinasc.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22773582 entries, 0 to 22773581
Data columns (total 18 columns):
 #   Column              Dtype  
---  ------              -----  
 0   ano                 int64  
 1   cnes                int64  
 2   hosp_municipio      int64  
 3   res_municipio       int64  
 4   hosp_regiao_saude   int64  
 5   res_regiao_saude    int64  
 6   parto_normal        bool   
 7   n_gestados          int64  
 8   n_pre_natal         int64  
 9   idade               int64  
 10  nivel_escolaridade  int64  
 11  raca_cor            object 
 12  nasc_raca_cor       object 
 13  sexo_fem            object 
 14  nasc_peso           float64
 15  nasc_apgar1         float64
 16  nasc_apgar5         float64
 17  periodo             object 
dtypes: bool(1), float64(3), int64(10), object(4)
memory usage: 2.9+ GB


## Municípios

In [4]:
path_muns = f'{Path.home()}/Databases/MUNICIPIOS/municipios.csv.gzip'
df_muns = pd.read_csv(path_muns)
df_muns

Unnamed: 0,cod_ibge,regiao,uf,cod_uf,nome_uf,municipio,nome,capital,fronteira,amazonia,macroregiao_saude,regiao_saude,microregiao_saude,latitude,longitude,altitude,area
0,110000,Norte,RO,11,RONDONIA,Município ignorado - RO,MUNICIPIO IGNORADO - RO,False,False,False,1100,1100,11000,0.000000,0.000000,0.0,0.000000
1,110001,Norte,RO,11,RONDONIA,Alta Floresta D'Oeste,ALTA FLORESTA D'OESTE,False,True,True,1190,1102,11900,-11.929000,-61.995998,350.0,7066.702148
2,110002,Norte,RO,11,RONDONIA,Ariquemes,ARIQUEMES,False,False,True,1190,1104,11900,-9.913000,-63.041000,142.0,4426.558105
3,110003,Norte,RO,11,RONDONIA,Cabixi,CABIXI,False,True,True,1190,1103,11900,-13.492000,-60.544998,230.0,1314.354980
4,110004,Norte,RO,11,RONDONIA,Cacoal,CACOAL,False,False,True,1190,1102,11900,-11.438000,-61.448002,200.0,3792.637939
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4789,522200,Centro-Oeste,GO,52,GOIAS,Vianópolis,VIANOPOLIS,False,False,False,5201,5202,52900,-16.742001,-48.515999,1002.0,954.278992
4790,522205,Centro-Oeste,GO,52,GOIAS,Vicentinópolis,VICENTINOPOLIS,False,False,False,5201,5202,52900,-17.735001,-49.806000,646.0,737.250977
4791,522220,Centro-Oeste,GO,52,GOIAS,Vila Boa,VILA BOA,False,False,False,5202,5206,52900,-15.038000,-47.058998,0.0,1060.170044
4792,522230,Centro-Oeste,GO,52,GOIAS,Vila Propício,VILA PROPICIO,False,False,False,5203,5212,52900,-15.457000,-48.889000,744.0,2181.574951


In [5]:
df_muns.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4794 entries, 0 to 4793
Data columns (total 17 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   cod_ibge           4794 non-null   int64  
 1   regiao             4794 non-null   object 
 2   uf                 4794 non-null   object 
 3   cod_uf             4794 non-null   int64  
 4   nome_uf            4794 non-null   object 
 5   municipio          4794 non-null   object 
 6   nome               4794 non-null   object 
 7   capital            4794 non-null   bool   
 8   fronteira          4794 non-null   bool   
 9   amazonia           4794 non-null   bool   
 10  macroregiao_saude  4794 non-null   int64  
 11  regiao_saude       4794 non-null   int64  
 12  microregiao_saude  4794 non-null   int64  
 13  latitude           4790 non-null   float64
 14  longitude          4790 non-null   float64
 15  altitude           4790 non-null   float64
 16  area               4790 

## Distâncias

In [6]:
path_dist = f'{Path.home()}/Databases/DISTANCIAS/matriz_distancias.zip'
df_dist = pd.read_csv(path_dist, low_memory=False)
df_dist

Unnamed: 0,origem,destino,distancia,tempo
0,110001,110001,0.000000,0.000000
1,110001,110002,309.050000,6.169056
2,110001,110003,399.499700,6.289056
3,110001,110004,81.201103,1.917750
4,110001,110005,391.704300,6.136361
...,...,...,...,...
31024895,530010,522200,161.728900,2.356000
31024896,530010,522205,382.708800,5.479556
31024897,530010,522220,161.603400,2.248944
31024898,530010,522230,190.000900,3.216083


In [7]:
df_dist.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 31024900 entries, 0 to 31024899
Data columns (total 4 columns):
 #   Column     Dtype  
---  ------     -----  
 0   origem     int64  
 1   destino    int64  
 2   distancia  float64
 3   tempo      float64
dtypes: float64(2), int64(2)
memory usage: 946.8 MB


## Indicador SocioEconômico

In [8]:
path_socioeco = f'{Path.home()}/Databases/HEALTH/health_regions.csv'
df_socioeco = pd.read_csv(path_socioeco)
df_socioeco

Unnamed: 0,UF,Município,Cód IBGE,Cód Região de Saúde,Nome da Região de Saúde,Grupo
0,AC,Acrelândia,120001,12002,Baixo Acre e Purus,2
1,AC,Assis Brasil,120005,12001,Alto Acre,1
2,AC,Brasiléia,120010,12001,Alto Acre,1
3,AC,Bujari,120013,12002,Baixo Acre e Purus,2
4,AC,Capixaba,120017,12002,Baixo Acre e Purus,2
...,...,...,...,...,...,...
5566,TO,Tocantinópolis,172120,17002,Bico do Papagaio,1
5567,TO,Tupirama,172125,17004,Cerrado Tocantins Araguaia,2
5568,TO,Tupiratins,172130,17004,Cerrado Tocantins Araguaia,2
5569,TO,Wanderlândia,172208,17001,Médio Norte Araguaia,3


# Tabela Deslocamento

In [9]:
def merge_lat_lon(df_left, df_right, left_on, right_on):
  prefix = left_on.split('_')[0]
  df = pd.merge(df_left, df_right, how='left', left_on=left_on, right_on=right_on)
  df = df.rename(columns={
    'latitude': f'{prefix}_latitude',
    'longitude': f'{prefix}_longitude'})
  df = df.drop(right_on, axis=1)
  return df

In [24]:
cols = ['periodo', 'res_municipio', 'hosp_municipio', 'res_regiao_saude', 'hosp_regiao_saude', 'cnes']
df_deslc = df_sinasc[cols]
df_deslc = df_deslc[df_deslc['periodo'] != 'durante']
df_deslc = df_deslc.groupby(
  cols, as_index=False
).size().sort_values('size', ascending=False, ignore_index=True)
df_deslc = df_deslc.rename(columns={'size': 'nascimentos'})
df_deslc

Unnamed: 0,periodo,res_municipio,hosp_municipio,res_regiao_saude,hosp_regiao_saude,cnes,nascimentos
0,antes,130260,130260,1301,1301,3151794,16810
1,depois,140010,140010,1401,1401,2566168,15461
2,depois,500270,500270,5011,5011,9768,14811
3,depois,350950,350950,3512,3512,2022621,14690
4,antes,292740,292740,2901,2901,3956369,14540
...,...,...,...,...,...,...,...
115861,antes,520815,520870,5211,5201,2339196,1
115862,antes,520830,170240,5208,1709,2792451,1
115863,antes,520830,520800,5208,5206,2361477,1
115864,antes,520830,520870,5208,5201,2338424,1


In [25]:
df_deslc['periodo'].value_counts()

depois    58080
antes     57786
Name: periodo, dtype: int64

In [26]:
df_deslc['mun_diff'] = df_deslc['res_municipio'] != df_deslc['hosp_municipio']
df_deslc['regsau_diff'] = df_deslc['res_regiao_saude'] != df_deslc['hosp_regiao_saude']
df_deslc

Unnamed: 0,periodo,res_municipio,hosp_municipio,res_regiao_saude,hosp_regiao_saude,cnes,nascimentos,mun_diff,regsau_diff
0,antes,130260,130260,1301,1301,3151794,16810,False,False
1,depois,140010,140010,1401,1401,2566168,15461,False,False
2,depois,500270,500270,5011,5011,9768,14811,False,False
3,depois,350950,350950,3512,3512,2022621,14690,False,False
4,antes,292740,292740,2901,2901,3956369,14540,False,False
...,...,...,...,...,...,...,...,...,...
115861,antes,520815,520870,5211,5201,2339196,1,True,True
115862,antes,520830,170240,5208,1709,2792451,1,True,True
115863,antes,520830,520800,5208,5206,2361477,1,True,True
115864,antes,520830,520870,5208,5201,2338424,1,True,True


In [27]:
res_cols = [
  'cod_ibge',
  'regiao',
  'uf',
  'capital',
  'fronteira',
  'amazonia',
  'latitude',
  'longitude',]

hosp_cols = [
  'cod_ibge',
  'latitude',
  'longitude',]

In [28]:
df_deslc = merge_lat_lon(df_deslc, df_muns[res_cols], 'res_municipio', 'cod_ibge')
df_deslc = merge_lat_lon(df_deslc, df_muns[hosp_cols], 'hosp_municipio', 'cod_ibge')
df_deslc

Unnamed: 0,periodo,res_municipio,hosp_municipio,res_regiao_saude,hosp_regiao_saude,cnes,nascimentos,mun_diff,regsau_diff,regiao,uf,capital,fronteira,amazonia,res_latitude,res_longitude,hosp_latitude,hosp_longitude
0,antes,130260,130260,1301,1301,3151794,16810,False,False,Norte,AM,True,False,True,-3.102000,-60.025002,-3.102000,-60.025002
1,depois,140010,140010,1401,1401,2566168,15461,False,False,Norte,RR,True,True,True,2.820000,-60.673000,2.820000,-60.673000
2,depois,500270,500270,5011,5011,9768,14811,False,False,Centro-Oeste,MS,True,False,False,-20.443001,-54.646000,-20.443001,-54.646000
3,depois,350950,350950,3512,3512,2022621,14690,False,False,Sudeste,SP,False,False,False,-22.906000,-47.061001,-22.906000,-47.061001
4,antes,292740,292740,2901,2901,3956369,14540,False,False,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
115863,antes,520815,520870,5211,5201,2339196,1,True,True,Centro-Oeste,GO,False,False,False,-16.464001,-48.669998,-16.679001,-49.254002
115864,antes,520830,170240,5208,1709,2792451,1,True,True,Centro-Oeste,GO,False,False,False,-13.295000,-46.393002,-12.931000,-46.938000
115865,antes,520830,520800,5208,5206,2361477,1,True,True,Centro-Oeste,GO,False,False,False,-13.295000,-46.393002,-15.537000,-47.334000
115866,antes,520830,520870,5208,5201,2338424,1,True,True,Centro-Oeste,GO,False,False,False,-13.295000,-46.393002,-16.679001,-49.254002


In [29]:
df_deslc = pd.merge(df_deslc, df_dist, how='left', left_on=['res_municipio', 'hosp_municipio'], right_on=['origem', 'destino'])
df_deslc = df_deslc.drop(['origem', 'destino'], axis=1)
df_deslc

Unnamed: 0,periodo,res_municipio,hosp_municipio,res_regiao_saude,hosp_regiao_saude,cnes,nascimentos,mun_diff,regsau_diff,regiao,uf,capital,fronteira,amazonia,res_latitude,res_longitude,hosp_latitude,hosp_longitude,distancia,tempo
0,antes,130260,130260,1301,1301,3151794,16810,False,False,Norte,AM,True,False,True,-3.102000,-60.025002,-3.102000,-60.025002,0.0000,0.000000
1,depois,140010,140010,1401,1401,2566168,15461,False,False,Norte,RR,True,True,True,2.820000,-60.673000,2.820000,-60.673000,0.0000,0.000000
2,depois,500270,500270,5011,5011,9768,14811,False,False,Centro-Oeste,MS,True,False,False,-20.443001,-54.646000,-20.443001,-54.646000,0.0000,0.000000
3,depois,350950,350950,3512,3512,2022621,14690,False,False,Sudeste,SP,False,False,False,-22.906000,-47.061001,-22.906000,-47.061001,0.0000,0.000000
4,antes,292740,292740,2901,2901,3956369,14540,False,False,,,,,,,,,,0.0000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
115863,antes,520815,520870,5211,5201,2339196,1,True,True,Centro-Oeste,GO,False,False,False,-16.464001,-48.669998,-16.679001,-49.254002,86.2261,1.310972
115864,antes,520830,170240,5208,1709,2792451,1,True,True,Centro-Oeste,GO,False,False,False,-13.295000,-46.393002,-12.931000,-46.938000,94.2209,1.658667
115865,antes,520830,520800,5208,5206,2361477,1,True,True,Centro-Oeste,GO,False,False,False,-13.295000,-46.393002,-15.537000,-47.334000,373.9899,4.629361
115866,antes,520830,520870,5208,5201,2338424,1,True,True,Centro-Oeste,GO,False,False,False,-13.295000,-46.393002,-16.679001,-49.254002,648.9102,8.660694


## Tabela Final

In [30]:
path_deslc = f'{Path.home()}/Databases/GESTANTES/deslocamento.csv.gzip'
df_deslc.to_csv(path_deslc, index=False)

In [31]:
pd.read_csv(path_deslc)

Unnamed: 0,periodo,res_municipio,hosp_municipio,res_regiao_saude,hosp_regiao_saude,cnes,nascimentos,mun_diff,regsau_diff,regiao,uf,capital,fronteira,amazonia,res_latitude,res_longitude,hosp_latitude,hosp_longitude,distancia,tempo
0,antes,130260,130260,1301,1301,3151794,16810,False,False,Norte,AM,True,False,True,-3.102000,-60.025002,-3.102000,-60.025002,0.0000,0.000000
1,depois,140010,140010,1401,1401,2566168,15461,False,False,Norte,RR,True,True,True,2.820000,-60.673000,2.820000,-60.673000,0.0000,0.000000
2,depois,500270,500270,5011,5011,9768,14811,False,False,Centro-Oeste,MS,True,False,False,-20.443001,-54.646000,-20.443001,-54.646000,0.0000,0.000000
3,depois,350950,350950,3512,3512,2022621,14690,False,False,Sudeste,SP,False,False,False,-22.906000,-47.061001,-22.906000,-47.061001,0.0000,0.000000
4,antes,292740,292740,2901,2901,3956369,14540,False,False,,,,,,,,,,0.0000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
115863,antes,520815,520870,5211,5201,2339196,1,True,True,Centro-Oeste,GO,False,False,False,-16.464001,-48.669998,-16.679001,-49.254002,86.2261,1.310972
115864,antes,520830,170240,5208,1709,2792451,1,True,True,Centro-Oeste,GO,False,False,False,-13.295000,-46.393002,-12.931000,-46.938000,94.2209,1.658667
115865,antes,520830,520800,5208,5206,2361477,1,True,True,Centro-Oeste,GO,False,False,False,-13.295000,-46.393002,-15.537000,-47.334000,373.9899,4.629361
115866,antes,520830,520870,5208,5201,2338424,1,True,True,Centro-Oeste,GO,False,False,False,-13.295000,-46.393002,-16.679001,-49.254002,648.9102,8.660694


In [32]:
df_deslc.to_excel(path_deslc.replace('csv.gzip', 'xlsx'))

## RJ

In [33]:
df_deslc_rj = df_deslc[df_deslc['uf'] == 'RJ']
df_deslc_rj

Unnamed: 0,periodo,res_municipio,hosp_municipio,res_regiao_saude,hosp_regiao_saude,cnes,nascimentos,mun_diff,regsau_diff,regiao,uf,capital,fronteira,amazonia,res_latitude,res_longitude,hosp_latitude,hosp_longitude,distancia,tempo
15,antes,330455,330455,3305,3305,2280248,11631,False,False,Sudeste,RJ,True,False,False,-22.903000,-43.208000,-22.903000,-43.208000,0.0000,0.000000
17,depois,330455,330455,3305,3305,2270609,11216,False,False,Sudeste,RJ,True,False,False,-22.903000,-43.208000,-22.903000,-43.208000,0.0000,0.000000
18,antes,330455,330455,3305,3305,2295407,11028,False,False,Sudeste,RJ,True,False,False,-22.903000,-43.208000,-22.903000,-43.208000,0.0000,0.000000
25,depois,330455,330455,3305,3305,2280248,10342,False,False,Sudeste,RJ,True,False,False,-22.903000,-43.208000,-22.903000,-43.208000,0.0000,0.000000
29,antes,330455,330455,3305,3305,2270609,10070,False,False,Sudeste,RJ,True,False,False,-22.903000,-43.208000,-22.903000,-43.208000,0.0000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
92621,antes,330070,330490,3302,3305,2292157,1,True,True,Sudeste,RJ,False,False,False,-22.879000,-42.019001,-22.827000,-43.054001,128.7856,1.643194
92622,antes,330070,330570,3302,3308,2268051,1,True,True,Sudeste,RJ,False,False,False,-22.879000,-42.019001,-22.049999,-42.674999,190.6239,3.129167
92623,antes,330070,330580,3302,3308,2292386,1,True,True,Sudeste,RJ,False,False,False,-22.879000,-42.019001,-22.412001,-42.966000,168.7798,2.469361
92624,antes,330070,330630,3302,3304,25135,1,True,True,Sudeste,RJ,False,False,False,-22.879000,-42.019001,-22.523001,-44.104000,273.5160,3.704111


In [36]:
df_deslc_rj.to_excel(path_deslc.replace('.csv.gzip', '_rj.xlsx'), index=False)

In [35]:
df_deslc_rj['periodo'].value_counts()

antes     1781
depois    1718
Name: periodo, dtype: int64