# Sífilis adquirida em Natal

Com base nos passos de limpeza e profiling feitos anteriormente, alguns deles são reproduzidos aqui para tratar e analisar os dados de Natal.


In [1]:
# imports 
import seaborn as sns
import pandas as pd
import numpy as np

from sklearn.preprocessing import LabelEncoder, StandardScaler

In [2]:
# Load the SINAN acquired-syphilis notifications (semicolon-separated file).
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk, so columns do not end up with mixed value types
# and the mixed-type DtypeWarning is not raised.
df = pd.read_csv('../data/SINAN/SifAquirida_2010_2017_Long.csv', sep=';',
                 low_memory=False)

  interactivity=interactivity, compiler=compiler, result=result)


O primeiro passo é filtrar as notificações feitas no município.

In [3]:
# Preview the notifications whose notifying-municipality code is 240810.
df[df['id_municip'] == 240810]

Unnamed: 0,ctrlsifan,dt_diagnostico,id_agravo,dt_notific,sem_not,nu_ano,anodiag,sg_uf_not,id_municip,id_regiona,...,evolucao,dt_obito,dt_encerra,flxrecebi,co_usucad,co_usualt,tp_sistema,tpuninot,nu_idade,nu_idade_ant_string
2247,00008451,9/30/2010,A539,5/10/2011,201119.0,2011.0,2010.0,24,240810.0,1416,...,,,5/10/2011,2.0,,,,,38.0,4038.0
2274,00008478,10/1/2010,A539,12/11/2013,201350.0,2013.0,2010.0,24,240810.0,1416,...,,,2/11/2014,2.0,,,,,41.0,4041.0
2384,00008589,10/11/2010,A539,3/8/2012,201210.0,2012.0,2010.0,24,240810.0,1416,...,,,,2.0,,,,,58.0,4058.0
3126,00009364,11/25/2010,A539,2/3/2011,201105.0,2011.0,2010.0,24,240810.0,1416,...,9,,2/8/2011,2.0,,PACIENTE,1,,28.0,4028.0
3127,00009365,11/25/2010,A539,2/3/2011,201105.0,2011.0,2010.0,24,240810.0,1416,...,1,,2/3/2011,2.0,,,1,,38.0,4038.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
343972,00361684,7/29/2017,A539,8/2/2017,201731.0,2017.0,2017.0,24,240810.0,1416,...,,,,2.0,,,1,,18.0,4018.0
343973,00361685,7/29/2017,A539,8/2/2017,201731.0,2017.0,2017.0,24,240810.0,1416,...,,,,2.0,,,1,,17.0,4017.0
343974,00361686,7/29/2017,A539,8/2/2017,201731.0,2017.0,2017.0,24,240810.0,1416,...,,,,2.0,,,1,,19.0,4019.0
344184,00361900,8/2/2017,A539,8/4/2017,201731.0,2017.0,2017.0,24,240810.0,1416,...,,,,2.0,,,1,,34.0,4034.0


In [4]:
# Build a new dataframe holding only the rows for municipality code 240810.
is_target_municipality = df['id_municip'] == 240810
filtered_df = df[is_target_municipality]

### Limpeza

Seguindo o tratamento feito no notebook 1.1, são retiradas as colunas que não serão úteis.

In [5]:
# Null columns.
nulls = ['in_vincula', 'id_ocupa_n', 'codisinf', 'co_usucad', 'tpuninot']

# Columns whose values repeat or are not useful.
ignorable = [
    'nu_idade_ant_string',
    'id_agravo',
    'copaisinf',
    'tp_sistema',
    'nobaiinf',
    'id_pais',
    'dt_diagnostico',
    # not useful (id, name, date of birth)
    'ctrlsifan',
    # too many null values
    'co_usualt',
]

# Drop both groups at once; drop() returns a new frame, filtered_df is untouched.
labels = [*nulls, *ignorable]
cleaned_df = filtered_df.drop(columns=labels)

In [6]:
# Turn whitespace-only cells into NaN so they are counted as missing values.
# The pattern is a raw string anchored with ^...$: with a non-string
# replacement value pandas swaps the WHOLE cell whenever the regex is found
# anywhere in it, so an unanchored '\s+' would also wipe legitimate text that
# merely contains a space.
cleaned_df = cleaned_df.replace(r'^\s+$', np.nan, regex=True)
cleaned_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1394 entries, 2247 to 344185
Data columns (total 33 columns):
dt_notific    1394 non-null object
sem_not       1394 non-null float64
nu_ano        1394 non-null float64
anodiag       1394 non-null float64
sg_uf_not     1394 non-null object
id_municip    1394 non-null float64
id_regiona    1394 non-null object
id_unidade    1394 non-null object
UFRES         1394 non-null float64
id_mn_resi    1394 non-null float64
dt_diag       1394 non-null object
sem_pri       1394 non-null float64
dt_nasc       1366 non-null object
nu_idade_n    1394 non-null float64
FXETARIA      1394 non-null float64
SEXO          1394 non-null float64
cs_gestant    1394 non-null object
cs_raca       1360 non-null object
cs_escol_n    1303 non-null object
cs_zona       1358 non-null object
dt_invest     815 non-null object
classi_fin    1275 non-null object
criterio      768 non-null object
tpautocto     780 non-null object
coufinf       465 non-null object
comunin

In [7]:
# Collapse the death date into a boolean flag: True when a date is present.
# pop() removes dt_obito from the frame and hands back its values in one step.
cleaned_df['obito'] = cleaned_df.pop('dt_obito').notna()

cleaned_df['obito']

2247      False
2274      False
2384      False
3126      False
3127      False
          ...  
343972    False
343973    False
343974    False
344184    False
344185    False
Name: obito, Length: 1394, dtype: bool

Como a escolaridade é um dado importante, as linhas que não contêm esse dado serão desconsideradas. Os dados de classificação final também são importantes, mas os registros sem classificação serão preenchidos com 9, que é o valor de "ignorado" no formulário.

In [8]:
# Discard, in place, every row that has no schooling value.
required_columns = ['cs_escol_n']
cleaned_df.dropna(axis=0, subset=required_columns, inplace=True)

# Check the result.
cleaned_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1303 entries, 2247 to 344185
Data columns (total 33 columns):
dt_notific    1303 non-null object
sem_not       1303 non-null float64
nu_ano        1303 non-null float64
anodiag       1303 non-null float64
sg_uf_not     1303 non-null object
id_municip    1303 non-null float64
id_regiona    1303 non-null object
id_unidade    1303 non-null object
UFRES         1303 non-null float64
id_mn_resi    1303 non-null float64
dt_diag       1303 non-null object
sem_pri       1303 non-null float64
dt_nasc       1279 non-null object
nu_idade_n    1303 non-null float64
FXETARIA      1303 non-null float64
SEXO          1303 non-null float64
cs_gestant    1303 non-null object
cs_raca       1298 non-null object
cs_escol_n    1303 non-null object
cs_zona       1272 non-null object
dt_invest     763 non-null object
classi_fin    1188 non-null object
criterio      722 non-null object
tpautocto     731 non-null object
coufinf       442 non-null object
comunin

In [9]:
# Columns holding coded values that are converted to numbers below.
# NOTE(review): the name says "not categorical" although these look like coded
# fields; it is kept unchanged in case other cells refer to it.
not_categorical = ['cs_gestant', 'cs_raca', 'cs_escol_n', 'cs_zona',
                   'tpautocto', 'classi_fin', 'criterio', 'evolucao',
                   'doenca_tra', 'sg_uf_not', 'id_regiona', 'id_unidade',
                   'coufinf']

# Missing codes become 9 (the "ignored" value on the notification form) and
# every listed column is cast to float64.
# NOTE(review): 9 is also written into sg_uf_not / id_regiona / id_unidade /
# coufinf, where it may not mean "ignored" - confirm this is intended.
#
# The result is assigned through cleaned_df[cols], which replaces the columns
# and therefore their dtype. Assigning through cleaned_df.loc[:, cols] writes
# into the existing object columns in place on pandas >= 2.0, so the float64
# cast would be silently lost.
cleaned_df[not_categorical] = (
    cleaned_df[not_categorical].fillna(value=9).astype('float64')
)

cleaned_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1303 entries, 2247 to 344185
Data columns (total 33 columns):
dt_notific    1303 non-null object
sem_not       1303 non-null float64
nu_ano        1303 non-null float64
anodiag       1303 non-null float64
sg_uf_not     1303 non-null float64
id_municip    1303 non-null float64
id_regiona    1303 non-null float64
id_unidade    1303 non-null float64
UFRES         1303 non-null float64
id_mn_resi    1303 non-null float64
dt_diag       1303 non-null object
sem_pri       1303 non-null float64
dt_nasc       1279 non-null object
nu_idade_n    1303 non-null float64
FXETARIA      1303 non-null float64
SEXO          1303 non-null float64
cs_gestant    1303 non-null float64
cs_raca       1303 non-null float64
cs_escol_n    1303 non-null float64
cs_zona       1303 non-null float64
dt_invest     763 non-null object
classi_fin    1303 non-null float64
criterio      1303 non-null float64
tpautocto     1303 non-null float64
coufinf       1303 non-null f

In [10]:
# Parse the date columns (month/day/year text) into datetime64 values.
date_columns = ['dt_notific', 'dt_diag', 'dt_encerra', 'dt_invest', 'dt_nasc']
for column in date_columns:
    cleaned_df[column] = pd.to_datetime(cleaned_df[column], format='%m/%d/%Y')

cleaned_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1303 entries, 2247 to 344185
Data columns (total 33 columns):
dt_notific    1303 non-null datetime64[ns]
sem_not       1303 non-null float64
nu_ano        1303 non-null float64
anodiag       1303 non-null float64
sg_uf_not     1303 non-null float64
id_municip    1303 non-null float64
id_regiona    1303 non-null float64
id_unidade    1303 non-null float64
UFRES         1303 non-null float64
id_mn_resi    1303 non-null float64
dt_diag       1303 non-null datetime64[ns]
sem_pri       1303 non-null float64
dt_nasc       1279 non-null datetime64[ns]
nu_idade_n    1303 non-null float64
FXETARIA      1303 non-null float64
SEXO          1303 non-null float64
cs_gestant    1303 non-null float64
cs_raca       1303 non-null float64
cs_escol_n    1303 non-null float64
cs_zona       1303 non-null float64
dt_invest     763 non-null datetime64[ns]
classi_fin    1303 non-null float64
criterio      1303 non-null float64
tpautocto     1303 non-null float

In [11]:
# Recompute the age, in completed years, at the diagnosis date.
# The age is derived from the calendar fields rather than from
# (day difference) // 365: that shortcut ignores leap days and reports one
# year too many in the days just before a birthday.
diag_date = cleaned_df['dt_diag']
birth_date = cleaned_df['dt_nasc']

# True where the birthday has not happened yet in the diagnosis year
# (month*100 + day gives an orderable month-day key). Comparisons against a
# missing birth date are False, and those rows still come out as NaN because
# the year difference itself is NaN.
birthday_pending = (
    (diag_date.dt.month * 100 + diag_date.dt.day)
    < (birth_date.dt.month * 100 + birth_date.dt.day)
)
cleaned_df['idade'] = (
    diag_date.dt.year - birth_date.dt.year - birthday_pending.astype('int64')
)

# The birth date is no longer needed once the age has been derived.
cleaned_df = cleaned_df.drop(columns=['dt_nasc'])

cleaned_df['idade']

2247      38.0
2274      41.0
2384      58.0
3126      28.0
3127      38.0
          ... 
343972    18.0
343973    17.0
343974    19.0
344184    34.0
344185    24.0
Name: idade, Length: 1303, dtype: float64

In [12]:
# Show the diagnosis date next to the reported and the recomputed age.
cleaned_df[['dt_diag', 'nu_idade', 'idade']]

Unnamed: 0,dt_diag,nu_idade,idade
2247,2010-09-30,38.0,38.0
2274,2010-10-01,41.0,41.0
2384,2010-10-11,58.0,58.0
3126,2010-11-25,28.0,28.0
3127,2010-11-25,38.0,38.0
...,...,...,...
343972,2017-07-29,18.0,18.0
343973,2017-07-29,17.0,17.0
343974,2017-07-29,19.0,19.0
344184,2017-08-02,34.0,34.0


In [13]:
# Final check of column dtypes and non-null counts before exporting.
cleaned_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1303 entries, 2247 to 344185
Data columns (total 33 columns):
dt_notific    1303 non-null datetime64[ns]
sem_not       1303 non-null float64
nu_ano        1303 non-null float64
anodiag       1303 non-null float64
sg_uf_not     1303 non-null float64
id_municip    1303 non-null float64
id_regiona    1303 non-null float64
id_unidade    1303 non-null float64
UFRES         1303 non-null float64
id_mn_resi    1303 non-null float64
dt_diag       1303 non-null datetime64[ns]
sem_pri       1303 non-null float64
nu_idade_n    1303 non-null float64
FXETARIA      1303 non-null float64
SEXO          1303 non-null float64
cs_gestant    1303 non-null float64
cs_raca       1303 non-null float64
cs_escol_n    1303 non-null float64
cs_zona       1303 non-null float64
dt_invest     763 non-null datetime64[ns]
classi_fin    1303 non-null float64
criterio      1303 non-null float64
tpautocto     1303 non-null float64
coufinf       1303 non-null float64
comu

In [14]:
# Persist the cleaned data as a semicolon-separated CSV.
# The dataframe index is written too (to_csv default), as an unnamed first column.
output_path = '../data/natal_adquirida.csv'
cleaned_df.to_csv(output_path, sep=';')