# Cleaning

In [26]:
# imports 
import seaborn as sns
import pandas as pd
import numpy as np

In [27]:
# Load the semicolon-delimited CSV under ../data/SINAN into `df`.
# NOTE(review): the saved output under this cell looks like the tail of a
# pandas DtypeWarning (mixed-type columns) -- consider passing an explicit
# `dtype=` mapping once the intended column types are confirmed.
df = pd.read_csv(
    '../data/SINAN/SifAquirida_2010_2017_Long.csv',
    sep=';',
)

  interactivity=interactivity, compiler=compiler, result=result)


In [28]:
# Drop the columns that are not carried into the cleaned frame.

# Columns listed by the author as null.
nulls = [
    'in_vincula',
    'id_ocupa_n',
    'codisinf',
    'co_usucad',
    'tpuninot',
]

# Columns listed by the author as ignorable (repetitive or similar).
ignorable = [
    'nu_idade_ant_string',
    'ctrlsifan',
    'id_agravo',
    'tp_sistema',
    'nobaiinf',
    'id_pais',
    'dt_diagnostico',
    # no significant information (such as id, name, birth date)
    'dt_nasc',
]

# `df` itself is left untouched; `df_clean` is a new frame without these columns.
labels = nulls + ignorable
df_clean = df.drop(columns=labels)

After dropping the columns, we must process both numeric and textual values.

In [29]:
# Turn blank cells (empty or whitespace-only strings) into missing values.
#
# The pattern is anchored on purpose: with regex=True and a non-string
# replacement, pandas replaces the *whole* cell whenever the pattern is found
# anywhere in it, so an unanchored '\s+' would also wipe out every value that
# merely contains a space. The raw string avoids the invalid-escape warning
# Python emits for '\s' in a normal string literal.
df_clean = df_clean.replace(r'^\s*$', np.nan, regex=True)

# Sanity check: list the distinct values left in `evolucao` and their counts.
df_clean.evolucao.value_counts()

1    149883
9     67423
3       348
2       121
Name: evolucao, dtype: int64

In [30]:
# Normalise the text columns listed in `categorical` to lower case.
categorical = ['co_usualt']

for category in categorical:
    # Select the column by label and write the lower-cased copy back.
    df_clean[category] = df_clean[category].str.lower()

# Show the five most frequent values after normalisation.
df_clean['co_usualt'].value_counts().head()

paciente    7479
ocupacao    2537
antecede    1999
tratamen    1639
titulaca    1244
Name: co_usualt, dtype: int64

In [31]:
# Print the frame summary (index range, per-column non-null counts and dtypes)
# to see which columns still hold missing values or object dtype.
df_clean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 344358 entries, 0 to 344357
Data columns (total 34 columns):
dt_notific    344355 non-null object
sem_not       344355 non-null float64
nu_ano        342531 non-null float64
anodiag       342531 non-null float64
sg_uf_not     342530 non-null object
id_municip    342531 non-null float64
id_regiona    313840 non-null object
id_unidade    342523 non-null object
UFRES         342531 non-null float64
id_mn_resi    342531 non-null float64
dt_diag       342531 non-null object
sem_pri       342531 non-null float64
nu_idade_n    342531 non-null float64
FXETARIA      342531 non-null float64
SEXO          342531 non-null float64
cs_gestant    342524 non-null object
cs_raca       335034 non-null object
cs_escol_n    316785 non-null object
cs_zona       331169 non-null object
dt_invest     243189 non-null object
classi_fin    329125 non-null object
criterio      231979 non-null object
tpautocto     232044 non-null object
coufinf       147160 non-nul

Before the last preprocessing steps, we need to drop the rows that contain NaN values in the coded columns; otherwise the type conversion and labeling methods won't work.

In [32]:
# Columns that hold numeric codes but were not parsed with a numeric dtype.
not_categorical = ['cs_gestant', 'cs_raca', 'cs_escol_n', 'cs_zona',
                   'tpautocto', 'classi_fin', 'criterio', 'evolucao',
                   'doenca_tra']

# A missing value cannot be cast to int64, so rows lacking any of these codes
# are dropped before the conversion.
df_clean = df_clean.dropna(axis='rows', subset=not_categorical)

# Cast with astype and a per-column mapping, then rebind. Assigning through
# `df_clean.loc[:, cols] = ...` writes the values in place on recent pandas
# versions and keeps the existing (object) dtype, silently losing the cast.
df_clean = df_clean.astype({column: 'int64' for column in not_categorical})

In [33]:
# Print the frame summary again to check the row count and the dtypes of the
# columns converted in the previous cell.
df_clean.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 189875 entries, 2 to 344356
Data columns (total 34 columns):
dt_notific    189875 non-null object
sem_not       189875 non-null float64
nu_ano        189875 non-null float64
anodiag       189875 non-null float64
sg_uf_not     189874 non-null object
id_municip    189875 non-null float64
id_regiona    181584 non-null object
id_unidade    189872 non-null object
UFRES         189875 non-null float64
id_mn_resi    189875 non-null float64
dt_diag       189875 non-null object
sem_pri       189875 non-null float64
nu_idade_n    189875 non-null float64
FXETARIA      189875 non-null float64
SEXO          189875 non-null float64
cs_gestant    189875 non-null int64
cs_raca       189875 non-null int64
cs_escol_n    189875 non-null int64
cs_zona       189875 non-null int64
dt_invest     189874 non-null object
classi_fin    189875 non-null int64
criterio      189875 non-null int64
tpautocto     189875 non-null int64
coufinf       122159 non-null objec

In [35]:
# Replace `dt_obito` with a boolean flag: `obito` is True where `dt_obito`
# held a value and False where it was missing.
# `pop` removes `dt_obito` from the frame in place and hands back its values,
# so the flag is derived and the source column dropped in one step.
df_clean['obito'] = df_clean.pop('dt_obito').notna()

df_clean['obito']

2         False
5         False
7         False
9         False
10        False
          ...  
344350    False
344352    False
344353    False
344354    False
344356    False
Name: obito, Length: 189875, dtype: bool

In [36]:
# Print the frame summary once more, after `dt_obito` was replaced by the
# boolean `obito` column in the previous cell.
df_clean.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 189875 entries, 2 to 344356
Data columns (total 34 columns):
dt_notific    189875 non-null object
sem_not       189875 non-null float64
nu_ano        189875 non-null float64
anodiag       189875 non-null float64
sg_uf_not     189874 non-null object
id_municip    189875 non-null float64
id_regiona    181584 non-null object
id_unidade    189872 non-null object
UFRES         189875 non-null float64
id_mn_resi    189875 non-null float64
dt_diag       189875 non-null object
sem_pri       189875 non-null float64
nu_idade_n    189875 non-null float64
FXETARIA      189875 non-null float64
SEXO          189875 non-null float64
cs_gestant    189875 non-null int64
cs_raca       189875 non-null int64
cs_escol_n    189875 non-null int64
cs_zona       189875 non-null int64
dt_invest     189874 non-null object
classi_fin    189875 non-null int64
criterio      189875 non-null int64
tpautocto     189875 non-null int64
coufinf       122159 non-null objec

## Final preprocessing
