In [1]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Knowing the dataset
At first, no cleaning is applied to the dataset; later we apply all the cleaning steps and run a new EDA on the cleaned data.

In [9]:
# Load the raw ("dirty") unified dataset
df = pd.read_csv(filepath_or_buffer='base_suja/base_unificada_suja.csv')

# Lift pandas' display truncation so every row and every column is rendered.
# Caution: with these limits removed, displaying a large frame directly
# renders all of it, so always slice (.head(), filters) before displaying.
pd.options.display.max_rows = None
pd.options.display.max_columns = None

In [13]:
# Link to the data dictionary describing the dataset's variables
url_dicionario = 'https://docs.google.com/spreadsheets/d/1QFy_F2o81ULglNx8knNqg6oI7v3RARaUAaLLwOQr7K4/edit?usp=sharing'

# One-sentence overview of the dataset size, rendered as the cell output
(
    f"The dataset has {df.shape[0]} lines and {df.shape[1]} columns. "
    f"More details can be consulted in the dictionary {url_dicionario}. "
    "Below the type of variables and the 'face' from the dataset"
)

"The dataset has 10905737 lines and 81 columns. More details can be consulted in the dictionary https://docs.google.com/spreadsheets/d/1QFy_F2o81ULglNx8knNqg6oI7v3RARaUAaLLwOQr7K4/edit?usp=sharing. Below the type of variables and the 'face' from the dataset"

In [3]:
# Print a structural summary of the frame: index range, column names and dtypes
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10905737 entries, 0 to 10905736
Data columns (total 81 columns):
 #   Column                 Dtype  
---  ------                 -----  
 0   data_evento            object 
 1   ano_evento             int64  
 2   TIPOBITO               float64
 3   evento_MUNNOMEX        object 
 4   res_MUNNOMEX           object 
 5   evento_CAPITAL         object 
 6   res_CAPITAL            object 
 7   evento_REGIAO          object 
 8   res_REGIAO             object 
 9   evento_SIGLA_UF        object 
 10  res_SIGLA_UF           object 
 11  IDADEMAE               float64
 12  idademae_faixa         object 
 13  ESCMAE2010             float64
 14  escolaridade_mae       object 
 15  OBITOGRAV              float64
 16  GRAVIDEZ               int64  
 17  tipo_gravidez          object 
 18  SEMAGESTAC             float64
 19  idade_gestacao_faixa   object 
 20  SEXO                   int64  
 21  def_sexo               object 
 22  PESO            

In [14]:
# Preview the first rows to see what the raw records look like
df.head()

Unnamed: 0,data_evento,ano_evento,TIPOBITO,evento_MUNNOMEX,res_MUNNOMEX,evento_CAPITAL,res_CAPITAL,evento_REGIAO,res_REGIAO,evento_SIGLA_UF,...,TP_UNID_76,TP_UNID_77,TP_UNID_78,TP_UNID_79,TP_UNID_80,TP_UNID_81,TP_UNID_82,TP_UNID_83,TP_UNID_84,TP_UNID_85
0,2019-01-30,2019,1.0,CATU,ENTRE RIOS,N,N,Nordeste,Nordeste,BA,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1,2019-01-24,2019,1.0,JUAZEIRO DO NORTE,JUAZEIRO DO NORTE,N,N,Nordeste,Nordeste,CE,...,1.0,1.0,0.0,1.0,0.0,2.0,1.0,0.0,0.0,0.0
2,2019-01-29,2019,1.0,LORENA,POTIM,N,N,Sudeste,Sudeste,SP,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2019-01-25,2019,1.0,PARNAIBA,AGUA DOCE DO MARANHAO,N,N,Nordeste,Nordeste,PI,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,2019-02-14,2019,1.0,CAMACARI,CAMACARI,N,N,Nordeste,Nordeste,BA,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


#  Checking data quality

### Duplicates

In [25]:
# Flag each row that exactly repeats an earlier row (the first occurrence is not flagged)
duplicados = df.duplicated(keep='first')

# Number of flagged rows
soma = duplicados.sum()

# Report the absolute count and its share of the whole dataset
(
    f"The dataset has {soma} duplicate rows, "
    f"which represents {round((soma / df.shape[0]) * 100, 2)} %"
)

'The dataset has 2981 duplicate rows, which represents 0.03 %'

In [27]:
# Break the duplicated rows down by source base (FLAG_BASE), event year and
# region of residence, most frequent combinations first
df_duplicados = df.loc[duplicados]
df_duplicados.value_counts(subset=['FLAG_BASE', 'ano_evento', 'res_REGIAO'])

# Reading of the counts: the duplicated rows show no concentration, whether by
# year or by region of residence, that would indicate a structural problem in
# how the data was filled in.

FLAG_BASE  ano_evento  res_REGIAO  
SINASC     2019        Sudeste         422
           2020        Sudeste         397
           2022        Sudeste         359
           2021        Sudeste         352
           2019        Nordeste        188
           2021        Nordeste        186
           2020        Nordeste        166
           2022        Nordeste        137
           2021        Norte            78
           2020        Norte            78
           2019        Norte            73
           2022        Norte            57
           2019        Sul              56
                       Centro-Oeste     55
           2020        Centro-Oeste     51
           2021        Centro-Oeste     50
                       Sul              48
           2020        Sul              46
           2022        Centro-Oeste     42
                       Sul              36
SIM_DOFET  2019        Sudeste          19
           2021        Sudeste          16
                  

### Missing

In [39]:
# Counting missing values, separately for each source base (FLAG_BASE)
def summarize_missing(frame: pd.DataFrame, base: str) -> pd.DataFrame:
    """Build a per-column table of missing values for one source base.

    Parameters
    ----------
    frame : pd.DataFrame
        Rows belonging to a single base.
    base : str
        Label written to the ``BASE`` column of the result.

    Returns
    -------
    pd.DataFrame
        One row per column of ``frame`` with the columns ``Variable``,
        ``Missing Count``, ``Missing Percentage`` (rounded to 2 decimals)
        and ``BASE``.
    """
    missing_count = frame.isnull().sum()  # null values in each column
    missing_percent = ((missing_count / len(frame)) * 100).round(2)  # share of null values
    return (
        pd.DataFrame({'Missing Count': missing_count, 'Missing Percentage': missing_percent})
        .rename_axis('Variable')  # name the index so it becomes the 'Variable' column
        .reset_index()
        .assign(BASE=base)
    )


df_sim_dofet = df[df['FLAG_BASE'] == 'SIM_DOFET']
df_sinasc = df[df['FLAG_BASE'] == 'SINASC']

missing_data_sim_dofet = summarize_missing(df_sim_dofet, 'SIM_DOFET')
missing_data_sinasc = summarize_missing(df_sinasc, 'SINASC')

# Stack both reports; each keeps its own 0..n-1 row labels (column positions)
missing_data = pd.concat([missing_data_sim_dofet, missing_data_sinasc])

# Sort by base and then by the highest missing frequencies
missing_data_sorted = missing_data.sort_values(by=['BASE', 'Missing Count'], ascending=False)

# Notes on the result:
# - Some variables are not filled in at all in the SINASC base (100% missing).
#   They are used for the EDA on SIM_DOFET only.
#   NOTE(review): the original note said these "will not be removed" in the model,
#   but the cleaning step drops these columns - confirm the intended wording.
# - 'OBITOGRAV' is not filled in at the source.
# - For the other variables, the rows with missing values will be removed and
#   the variables themselves are kept in the study.

# Keep every variable with at least one missing value. Filtering on the count
# rather than on the rounded percentage avoids hiding variables whose missing
# share rounds down to 0.00 %.
missing_data_sorted[missing_data_sorted['Missing Count'] > 0]

Unnamed: 0,Variable,Missing Count,Missing Percentage,BASE
2,TIPOBITO,10818314,100.0,SINASC
15,OBITOGRAV,10818314,100.0,SINASC
24,OBITOPARTO,10818314,100.0,SINASC
25,def_obito_parto,10818314,100.0,SINASC
26,CAUSABAS,10818314,100.0,SINASC
27,causabas_capitulo,10818314,100.0,SINASC
28,causabas_categoria,10818314,100.0,SINASC
29,causabas_grupo,10818314,100.0,SINASC
30,causabas_subcategoria,10818314,100.0,SINASC
13,ESCMAE2010,144566,1.34,SINASC


In [None]:
# Checking if there is any pattern in the missing data looking at year and region


# Cleaning the dataset

In [None]:
# Build the cleaned frame in one chain, leaving the raw `df` untouched:
#   1. drop exact duplicate rows (removes 2981 rows)
#   2. drop the columns with a high frequency of missing values
df_limpo = (
    df
    .drop_duplicates()
    .drop(columns=[
        'TIPOBITO',
        'OBITOGRAV',
        'OBITOPARTO',
        'def_obito_parto',
        'CAUSABAS',
        'causabas_capitulo',
        'causabas_categoria',
        'causabas_grupo',
        'causabas_subcategoria',
    ])
)
# 