In [1]:
import os
import numpy as np
import pandas as pd
from tqdm import tqdm
from pathlib import Path
from zipfile import ZipFile
from datatable import dt, f, by
import matplotlib.pyplot as plt
from matplotlib_venn import venn2, venn3
from plotly import express as px, io as pio

# Notebook-wide plotting setup: pandas' `.plot` delegates to plotly, and
# plotly figures are rendered with the combined mimetype/notebook renderer.
pd.set_option('plotting.backend', 'plotly')
pio.renderers.default = 'plotly_mimetype+notebook_connected'

## SINASC com filtros básicos

In [2]:
# Read the SINASC extract from the user's home directory and display it.
# NOTE(review): the file is read without an explicit `compression=` argument,
# so despite the `.gzip` suffix it is presumably a plain CSV — confirm.
path_zip_sinasc = f'{Path.home()}/Databases/SINASC/SINASC.csv.gzip'
df_sinasc = pd.read_csv(filepath_or_buffer=path_zip_sinasc)
df_sinasc

Unnamed: 0,ano,cnes,hosp_municipio,res_municipio,hosp_regiao_saude,res_regiao_saude,tipo_parto,n_gestados,n_pre_natal,idade,nivel_escolaridade,raca_cor,nasc_raca_cor,nasc_sexo,nasc_peso,nasc_apgar1,nasc_apgar5
0,2010,2515598,110002,120040,1104,1201,Cesáreo,1,4,24,4,,Branca,Masculino,3400.0,9.0,10.0
1,2010,3792595,110011,120040,1101,1201,Cesáreo,1,3,22,4,,Parda,Masculino,3950.0,8.0,9.0
2,2010,2798484,110030,120040,1103,1201,Vaginal,1,4,20,3,,Branca,Masculino,3550.0,8.0,9.0
3,2010,5701929,120001,120001,1201,1201,Vaginal,1,3,21,4,,Parda,Feminino,3000.0,9.0,10.0
4,2010,5701929,120001,120001,1201,1201,Vaginal,1,3,31,3,,Parda,Feminino,3000.0,9.0,10.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28197159,2019,3427633,520870,170950,5201,1707,Cesáreo,1,4,29,5,4.0,Parda,Feminino,1700.0,7.0,8.0
28197160,2019,10537,530010,170240,5301,1709,Cesáreo,1,3,36,5,,Ignorado,Ignorado,2870.0,8.0,9.0
28197161,2019,5717515,530010,170610,5301,1710,Cesáreo,1,3,21,4,4.0,Parda,Masculino,2554.0,8.0,9.0
28197162,2019,5717515,530010,172100,5301,1705,Vaginal,1,4,21,4,4.0,Parda,Feminino,2610.0,8.0,9.0


## CNES da base SIH

In [3]:
# Read the per-facility (CNES) record counts derived from the SIH database
# and display them; the table has one `cnes` column and one `count` column.
path_zip_sih = f'{Path.home()}/Databases/SIHSUS/sih_count.csv.gzip'
df_cnes_sih = pd.read_csv(filepath_or_buffer=path_zip_sih)
df_cnes_sih

Unnamed: 0,cnes,count
0,2232,101955
1,3151794,75209
2,2323397,74712
3,2311682,64833
4,26794,63700
...,...,...
4144,7320175,1
4145,7603029,1
4146,7621442,1
4147,7704364,1


## Filtro SUS

In [4]:
# Keep only births whose facility code (CNES) also appears in the SIH table.
# `.copy()` detaches the result from `df_sinasc`, so later column edits do not
# touch (or warn about) the original frame.
df_sinasc_sus = df_sinasc[df_sinasc['cnes'].isin(df_cnes_sih['cnes'])].copy()
df_sinasc_sus

Unnamed: 0,ano,cnes,hosp_municipio,res_municipio,hosp_regiao_saude,res_regiao_saude,tipo_parto,n_gestados,n_pre_natal,idade,nivel_escolaridade,raca_cor,nasc_raca_cor,nasc_sexo,nasc_peso,nasc_apgar1,nasc_apgar5
2,2010,2798484,110030,120040,1103,1201,Vaginal,1,4,20,3,,Branca,Masculino,3550.0,8.0,9.0
3,2010,5701929,120001,120001,1201,1201,Vaginal,1,3,21,4,,Parda,Feminino,3000.0,9.0,10.0
4,2010,5701929,120001,120001,1201,1201,Vaginal,1,3,31,3,,Parda,Feminino,3000.0,9.0,10.0
5,2010,5701929,120001,120001,1201,1201,Vaginal,1,4,23,3,,Parda,Masculino,3900.0,7.0,9.0
6,2010,5701929,120001,120001,1201,1201,Vaginal,1,4,26,4,,Parda,Feminino,3250.0,7.0,10.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28197146,2019,2361787,520110,170730,5211,1710,Cesáreo,1,3,18,3,4.0,Parda,Feminino,2840.0,8.0,9.0
28197149,2019,2338564,520870,171110,5201,1704,Cesáreo,1,4,27,4,4.0,Parda,Feminino,3082.0,8.0,9.0
28197160,2019,10537,530010,170240,5301,1709,Cesáreo,1,3,36,5,,Ignorado,Ignorado,2870.0,8.0,9.0
28197161,2019,5717515,530010,170610,5301,1710,Cesáreo,1,3,21,4,4.0,Parda,Masculino,2554.0,8.0,9.0


In [5]:
# Number of births removed by the SUS filter, computed from the frames
# themselves so the figure stays correct if the input data changes.
len(df_sinasc) - len(df_sinasc_sus)

5423582

In [6]:
# Share of the original births removed by the SUS filter, derived from the
# frames rather than from numbers copied out of earlier outputs.
(len(df_sinasc) - len(df_sinasc_sus)) / len(df_sinasc)

0.1923449464634103

## Editar variáveis

In [7]:
# Print the column dtypes and memory usage of the filtered frame before the
# columns below are recoded.
df_sinasc_sus.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 22773582 entries, 2 to 28197162
Data columns (total 17 columns):
 #   Column              Dtype  
---  ------              -----  
 0   ano                 int64  
 1   cnes                int64  
 2   hosp_municipio      int64  
 3   res_municipio       int64  
 4   hosp_regiao_saude   int64  
 5   res_regiao_saude    int64  
 6   tipo_parto          object 
 7   n_gestados          int64  
 8   n_pre_natal         int64  
 9   idade               int64  
 10  nivel_escolaridade  int64  
 11  raca_cor            float64
 12  nasc_raca_cor       object 
 13  nasc_sexo           object 
 14  nasc_peso           float64
 15  nasc_apgar1         float64
 16  nasc_apgar5         float64
dtypes: float64(4), int64(10), object(3)
memory usage: 3.1+ GB


### `tipo_parto`

In [8]:
# Frequency of each delivery-type label before it is recoded.
df_sinasc_sus['tipo_parto'].value_counts()

Vaginal    11563626
Cesáreo    11209956
Name: tipo_parto, dtype: int64

In [9]:
# Lookup used to recode `tipo_parto` into a boolean:
# vaginal delivery -> True, caesarean delivery -> False.
parto_normal = dict(zip(('Vaginal', 'Cesáreo'), (True, False)))

In [10]:
# Recode the delivery type with the `parto_normal` lookup (True = vaginal,
# False = caesarean) and rename the column to match its new meaning.
#
# Plain column assignment is used instead of `df.loc[:, col] = ...`: since
# pandas 2.0 the `.loc` form writes into the existing array in place and keeps
# its dtype, which would leave this column as `object` rather than `bool`.
df_sinasc_sus['tipo_parto'] = df_sinasc_sus['tipo_parto'].map(parto_normal)
df_sinasc_sus = df_sinasc_sus.rename(columns={'tipo_parto': 'parto_normal'})

### `raca_cor`

In [11]:
# Frequency of each numeric `raca_cor` code before it is mapped to a label
# (missing values are excluded by `value_counts` by default).
df_sinasc_sus['raca_cor'].value_counts()

4.0    11625628
1.0     5830464
2.0     1122356
5.0      137716
3.0       61727
9.0       15388
Name: raca_cor, dtype: int64

In [12]:
# Lookup from the numeric `raca_cor` code to its label.
# Codes 1-5 map to a category name; code 9 maps to NaN (treated as missing).
raca_cor = dict(enumerate(
  ['Branca', 'Preta', 'Amarela', 'Parda', 'Indígena'], start=1))
raca_cor[9] = np.nan

In [13]:
# Replace the numeric `raca_cor` codes with their labels; code 9 and any code
# absent from the lookup become NaN.
#
# The column changes from float64 to strings, so it is replaced wholesale with
# plain column assignment. Writing strings through `df.loc[:, col] = ...` is an
# in-place set of an incompatible dtype on pandas >= 2.0, which recent 2.x
# releases warn about and which is slated to become an error.
df_sinasc_sus['raca_cor'] = df_sinasc_sus['raca_cor'].map(raca_cor)

### `nasc_sexo`

In [14]:
# Frequency of each newborn-sex label before it is recoded.
df_sinasc_sus['nasc_sexo'].value_counts()

Masculino    11666829
Feminino     11102679
Ignorado         4074
Name: nasc_sexo, dtype: int64

In [15]:
# Lookup used to recode `nasc_sexo` into a "is female" flag:
# female -> True, male -> False, ignored/unknown -> NaN (missing).
sexo_nasc = dict(Feminino=True, Masculino=False, Ignorado=np.nan)

In [16]:
# Recode newborn sex with the `sexo_nasc` lookup (True = female, False = male,
# missing for 'Ignorado') and rename the column to `sexo_fem`.
#
# The mapped values are cast to pandas' nullable `boolean` dtype so the column
# is a real boolean with <NA> for unknowns, instead of an `object` column that
# mixes Python bools with float NaN. This matches the nullable `Int64` dtype
# used for the weight and Apgar columns. CSV output is unaffected: values are
# still written as True / False / empty.
df_sinasc_sus['nasc_sexo'] = (
  df_sinasc_sus['nasc_sexo'].map(sexo_nasc).astype('boolean'))
df_sinasc_sus = df_sinasc_sus.rename(columns={'nasc_sexo': 'sexo_fem'})

### `nasc_peso`

In [17]:
# Store birth weight as a nullable integer (`Int64`) so missing weights stay
# missing while present values lose the spurious `.0`.
#
# Plain column assignment is required: since pandas 2.0,
# `df.loc[:, col] = ...` writes into the existing float64 array in place, so
# the `Int64` cast would be silently discarded and the column would stay float.
df_sinasc_sus['nasc_peso'] = df_sinasc_sus['nasc_peso'].astype('Int64')

### `nasc_apgar1`

In [18]:
# Clean the 1-minute Apgar score: the value 99 is treated as missing
# (presumably the "ignored" code — confirm against the SINASC dictionary),
# and the result is stored as a nullable integer (`Int64`).
#
# The sentinel is replaced while the column is still float64, where NaN is the
# native missing value, and only then cast to `Int64`. Plain column assignment
# is used because, since pandas 2.0, `df.loc[:, col] = ...` writes into the
# existing float64 array in place and would discard the `Int64` dtype.
df_sinasc_sus['nasc_apgar1'] = (
  df_sinasc_sus['nasc_apgar1'].replace(99, np.nan).astype('Int64'))

In [19]:
# Frequency of each 1-minute Apgar score after the 99 sentinel was replaced
# with a missing value (missing values are excluded by default).
df_sinasc_sus['nasc_apgar1'].value_counts()

9     11111390
8      7312028
7      1557666
10      834776
6       577964
5       325601
4       197379
3       133466
2        96165
1        69305
0        23272
Name: nasc_apgar1, dtype: Int64

### `nasc_apgar5`

In [20]:
# Clean the 5-minute Apgar score: the value 99 is treated as missing
# (presumably the "ignored" code — confirm against the SINASC dictionary),
# and the result is stored as a nullable integer (`Int64`).
#
# The sentinel is replaced while the column is still float64, where NaN is the
# native missing value, and only then cast to `Int64`. Plain column assignment
# is used because, since pandas 2.0, `df.loc[:, col] = ...` writes into the
# existing float64 array in place and would discard the `Int64` dtype.
df_sinasc_sus['nasc_apgar5'] = (
  df_sinasc_sus['nasc_apgar5'].replace(99, np.nan).astype('Int64'))

In [21]:
# Frequency of each 5-minute Apgar score after the 99 sentinel was replaced
# with a missing value (missing values are excluded by default).
df_sinasc_sus['nasc_apgar5'].value_counts()

10    10244119
9     10074459
8      1357308
7       291837
6       102954
5        56333
1        31028
4        28884
2        18684
3        18642
0        14524
Name: nasc_apgar5, dtype: Int64

## Dataset final

In [22]:
# Display the filtered frame after all recoding steps, as it will be saved.
df_sinasc_sus

Unnamed: 0,ano,cnes,hosp_municipio,res_municipio,hosp_regiao_saude,res_regiao_saude,parto_normal,n_gestados,n_pre_natal,idade,nivel_escolaridade,raca_cor,nasc_raca_cor,sexo_fem,nasc_peso,nasc_apgar1,nasc_apgar5
2,2010,2798484,110030,120040,1103,1201,True,1,4,20,3,,Branca,False,3550,8,9
3,2010,5701929,120001,120001,1201,1201,True,1,3,21,4,,Parda,True,3000,9,10
4,2010,5701929,120001,120001,1201,1201,True,1,3,31,3,,Parda,True,3000,9,10
5,2010,5701929,120001,120001,1201,1201,True,1,4,23,3,,Parda,False,3900,7,9
6,2010,5701929,120001,120001,1201,1201,True,1,4,26,4,,Parda,True,3250,7,10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28197146,2019,2361787,520110,170730,5211,1710,False,1,3,18,3,Parda,Parda,True,2840,8,9
28197149,2019,2338564,520870,171110,5201,1704,False,1,4,27,4,Parda,Parda,True,3082,8,9
28197160,2019,10537,530010,170240,5301,1709,False,1,3,36,5,,Ignorado,,2870,8,9
28197161,2019,5717515,530010,170610,5301,1710,False,1,3,21,4,Parda,Parda,False,2554,8,9


In [23]:
# Persist the recoded SUS-only dataset next to the source SINASC file,
# without writing the row index.
# NOTE(review): no `compression=` is passed and pandas infers compression from
# the file suffix; `.gzip` is presumably not a recognised suffix (`.gz` is),
# so this likely writes an uncompressed CSV. Left as is because the inputs of
# this notebook follow the same naming — confirm before changing.
path_sinasc_sus = f'{Path.home()}/Databases/SINASC/sinasc_sus.csv.gzip'
df_sinasc_sus.to_csv(path_or_buf=path_sinasc_sus, index=False)