# Sífilis adquirida em Natal

Com base nos passos de limpeza e profiling feitos anteriormente, alguns deles serão reproduzidos aqui para tratar e analisar os dados de Natal.


In [1]:
# imports 
import seaborn as sns
import pandas as pd
import numpy as np

from sklearn.preprocessing import LabelEncoder, StandardScaler

In [2]:
# Load the raw notification export (semicolon-separated CSV).
# low_memory=False makes pandas infer every column's dtype from the whole file
# rather than chunk by chunk.
# NOTE(review): the saved output of this cell looks like the tail of a pandas
# DtypeWarning (mixed-type columns), which is what this option addresses —
# an explicit dtype= mapping would be stricter still.
df = pd.read_csv('../data/SINAN/SifAquirida_2010_2017_Long.csv', sep=';',
                 low_memory=False)

  interactivity=interactivity, compiler=compiler, result=result)


O primeiro passo é filtrar as notificações feitas no município.

In [3]:
# Preview the rows whose id_municip equals 240810 (the municipality this
# notebook targets); the result is only displayed, not stored.
df[df['id_municip'] == 240810]

Unnamed: 0,ctrlsifan,dt_diagnostico,id_agravo,dt_notific,sem_not,nu_ano,anodiag,sg_uf_not,id_municip,id_regiona,...,evolucao,dt_obito,dt_encerra,flxrecebi,co_usucad,co_usualt,tp_sistema,tpuninot,nu_idade,nu_idade_ant_string
2247,00008451,9/30/2010,A539,5/10/2011,201119.0,2011.0,2010.0,24,240810.0,1416,...,,,5/10/2011,2.0,,,,,38.0,4038.0
2274,00008478,10/1/2010,A539,12/11/2013,201350.0,2013.0,2010.0,24,240810.0,1416,...,,,2/11/2014,2.0,,,,,41.0,4041.0
2384,00008589,10/11/2010,A539,3/8/2012,201210.0,2012.0,2010.0,24,240810.0,1416,...,,,,2.0,,,,,58.0,4058.0
3126,00009364,11/25/2010,A539,2/3/2011,201105.0,2011.0,2010.0,24,240810.0,1416,...,9,,2/8/2011,2.0,,PACIENTE,1,,28.0,4028.0
3127,00009365,11/25/2010,A539,2/3/2011,201105.0,2011.0,2010.0,24,240810.0,1416,...,1,,2/3/2011,2.0,,,1,,38.0,4038.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
343972,00361684,7/29/2017,A539,8/2/2017,201731.0,2017.0,2017.0,24,240810.0,1416,...,,,,2.0,,,1,,18.0,4018.0
343973,00361685,7/29/2017,A539,8/2/2017,201731.0,2017.0,2017.0,24,240810.0,1416,...,,,,2.0,,,1,,17.0,4017.0
343974,00361686,7/29/2017,A539,8/2/2017,201731.0,2017.0,2017.0,24,240810.0,1416,...,,,,2.0,,,1,,19.0,4019.0
344184,00361900,8/2/2017,A539,8/4/2017,201731.0,2017.0,2017.0,24,240810.0,1416,...,,,,2.0,,,1,,34.0,4034.0


In [4]:
# Working frame used by the cleaning steps that follow.
# NOTE: the municipality filter is disabled for now, so filtered_df is just
# another name for df (every municipality), not a filtered copy.
filtered_df = df  # .query('id_municip == 240810')  # ignored for now

### Limpeza

Seguindo o tratamento feito no notebook 1.1, são retiradas colunas que não serão úteis

In [5]:
# Columns treated as null (empty)
nulls = [
    'in_vincula',
    'id_ocupa_n',
    'codisinf',
    'co_usucad',
    'tpuninot',
]

# Columns whose values just repeat or are otherwise not useful
ignorable = [
    'nu_idade_ant_string',
    'id_agravo',
    'copaisinf',
    'tp_sistema',
    'nobaiinf',
    'id_pais',
    'dt_diagnostico',
    # not useful (ids, names, birth dates)
    'ctrlsifan',
    # too many null values
    'co_usualt',
]

# Drop both groups in one pass; drop() returns a new frame, so filtered_df
# itself is left untouched.
labels = [*nulls, *ignorable]
cleaned_df = filtered_df.drop(columns=labels)

In [6]:
# Turn text cells matching the whitespace pattern into NaN so they are counted
# as missing values.
# The pattern is written as a raw string: a backslash-s inside a normal string
# literal is an invalid escape sequence (DeprecationWarning, and a
# SyntaxWarning on newer Python versions). The pattern text itself is unchanged.
# NOTE(review): the pattern is unanchored, so strings that merely contain
# whitespace appear to be replaced as well — confirm whether r'^\s*$'
# (blank-only cells) was the real intent.
cleaned_df = cleaned_df.replace(r'\s+', np.nan, regex=True)
cleaned_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 344358 entries, 0 to 344357
Data columns (total 33 columns):
dt_notific    344355 non-null object
sem_not       344355 non-null float64
nu_ano        342531 non-null float64
anodiag       342531 non-null float64
sg_uf_not     342530 non-null object
id_municip    342531 non-null float64
id_regiona    313840 non-null object
id_unidade    342523 non-null object
UFRES         342531 non-null float64
id_mn_resi    342531 non-null float64
dt_diag       342531 non-null object
sem_pri       342531 non-null float64
dt_nasc       334125 non-null object
nu_idade_n    342531 non-null float64
FXETARIA      342531 non-null float64
SEXO          342531 non-null float64
cs_gestant    342524 non-null object
cs_raca       335034 non-null object
cs_escol_n    316785 non-null object
cs_zona       331169 non-null object
dt_invest     243189 non-null object
classi_fin    329125 non-null object
criterio      231979 non-null object
tpautocto     232044 non-nul

In [7]:
# Collapse dt_obito into a boolean flag: obito is True wherever dt_obito is
# filled in, and the original column is then discarded.
cleaned_df = (
    cleaned_df
    .assign(obito=cleaned_df['dt_obito'].notna())
    .drop(columns=['dt_obito'])
)

cleaned_df['obito']

0         False
1         False
2         False
3         False
4         False
          ...  
344353    False
344354    False
344355    False
344356    False
344357    False
Name: obito, Length: 344358, dtype: bool

Como a escolaridade é um dado importante, as linhas que não contêm esse dado serão desconsideradas. Os dados de classificação final também são importantes, mas os registros sem classificação serão preenchidos com 9, que é o valor de "ignorado" no formulário.

In [8]:
# Keep only the rows in which cs_escol_n is present
cleaned_df = cleaned_df.dropna(axis='index', subset=['cs_escol_n'])

# Check the result
cleaned_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 316785 entries, 0 to 344357
Data columns (total 33 columns):
dt_notific    316785 non-null object
sem_not       316785 non-null float64
nu_ano        316785 non-null float64
anodiag       316785 non-null float64
sg_uf_not     316784 non-null object
id_municip    316785 non-null float64
id_regiona    290971 non-null object
id_unidade    316778 non-null object
UFRES         316785 non-null float64
id_mn_resi    316785 non-null float64
dt_diag       316785 non-null object
sem_pri       316785 non-null float64
dt_nasc       309481 non-null object
nu_idade_n    316785 non-null float64
FXETARIA      316785 non-null float64
SEXO          316785 non-null float64
cs_gestant    316781 non-null object
cs_raca       315950 non-null object
cs_escol_n    316785 non-null object
cs_zona       307924 non-null object
dt_invest     225868 non-null object
classi_fin    304551 non-null object
criterio      216371 non-null object
tpautocto     216454 non-nul

In [9]:
# Columns that get their gaps filled and are then converted to numbers.
# NOTE(review): despite the name these look like coded (categorical) fields;
# the name is kept because other cells may reference it.
not_categorical = ['cs_gestant', 'cs_raca', 'cs_escol_n', 'cs_zona',
                   'tpautocto', 'classi_fin', 'criterio', 'evolucao',
                   'doenca_tra', 'sg_uf_not', 'id_regiona', 'id_unidade', 'coufinf']

# Missing values become 9 (the form's "ignored" code) and every listed column
# is cast to float64.
# Assigning through cleaned_df[cols] replaces the columns outright, so the new
# dtype is always kept. The earlier `cleaned_df.loc[:, cols] = ...` form writes
# into the existing object columns on pandas >= 2.0, which silently leaves
# them as object.
# NOTE(review): 9 is also written into identifier-like columns (sg_uf_not,
# id_regiona, id_unidade, coufinf) — confirm that this is intended.
cleaned_df[not_categorical] = (
    cleaned_df[not_categorical]
    .fillna(value=9)
    .astype('float64')
)

cleaned_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 316785 entries, 0 to 344357
Data columns (total 33 columns):
dt_notific    316785 non-null object
sem_not       316785 non-null float64
nu_ano        316785 non-null float64
anodiag       316785 non-null float64
sg_uf_not     316785 non-null float64
id_municip    316785 non-null float64
id_regiona    316785 non-null float64
id_unidade    316785 non-null float64
UFRES         316785 non-null float64
id_mn_resi    316785 non-null float64
dt_diag       316785 non-null object
sem_pri       316785 non-null float64
dt_nasc       309481 non-null object
nu_idade_n    316785 non-null float64
FXETARIA      316785 non-null float64
SEXO          316785 non-null float64
cs_gestant    316785 non-null float64
cs_raca       316785 non-null float64
cs_escol_n    316785 non-null float64
cs_zona       316785 non-null float64
dt_invest     225868 non-null object
classi_fin    316785 non-null float64
criterio      316785 non-null float64
tpautocto     31678

In [13]:
# Convert the date fields

# Every date column is parsed with the same month/day/year format.
date_format = '%m/%d/%Y'
date_columns = ['dt_notific', 'dt_diag', 'dt_encerra', 'dt_invest', 'dt_nasc']

# Rewrite '216' as '2016' in dt_invest before parsing (presumably a mistyped
# year that would not match the format otherwise).
# The result is assigned back explicitly: calling replace(..., inplace=True) on
# cleaned_df.dt_invest only works while that Series is a view of the frame and
# becomes a silent no-op under pandas Copy-on-Write.
# NOTE(review): the pattern is unanchored and matches '216' anywhere in the
# string — confirm no valid value contains that digit run.
cleaned_df['dt_invest'] = cleaned_df['dt_invest'].replace('216', '2016', regex=True)

for date_column in date_columns:
    cleaned_df[date_column] = pd.to_datetime(cleaned_df[date_column],
                                             format=date_format)

cleaned_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 316785 entries, 0 to 344357
Data columns (total 33 columns):
dt_notific    316785 non-null datetime64[ns]
sem_not       316785 non-null float64
nu_ano        316785 non-null float64
anodiag       316785 non-null float64
sg_uf_not     316785 non-null float64
id_municip    316785 non-null float64
id_regiona    316785 non-null float64
id_unidade    316785 non-null float64
UFRES         316785 non-null float64
id_mn_resi    316785 non-null float64
dt_diag       316785 non-null datetime64[ns]
sem_pri       316785 non-null float64
dt_nasc       309481 non-null datetime64[ns]
nu_idade_n    316785 non-null float64
FXETARIA      316785 non-null float64
SEXO          316785 non-null float64
cs_gestant    316785 non-null float64
cs_raca       316785 non-null float64
cs_escol_n    316785 non-null float64
cs_zona       316785 non-null float64
dt_invest     225868 non-null datetime64[ns]
classi_fin    316785 non-null float64
criterio      316785 non-

In [14]:
# Recompute the age, in completed years, at the diagnosis date.
# Dividing the day difference by 365 ignores leap days, so it reports one year
# too many for diagnoses made shortly before a birthday. Comparing calendar
# fields avoids that drift.
diag_date = cleaned_df['dt_diag']
birth_date = cleaned_df['dt_nasc']

# True when the birthday has not yet been reached in the diagnosis year.
# Rows with a missing date compare as False and still end up NaN through the
# year subtraction below.
birthday_pending = (
    (diag_date.dt.month < birth_date.dt.month)
    | ((diag_date.dt.month == birth_date.dt.month)
       & (diag_date.dt.day < birth_date.dt.day))
)

# Kept as float64 so rows without a birth date can hold NaN.
cleaned_df['idade'] = (
    (diag_date.dt.year - birth_date.dt.year)
    - birthday_pending.astype('float64')
)

# The birth date is no longer needed once the age is derived.
cleaned_df = cleaned_df.drop(columns=['dt_nasc'])

cleaned_df['idade']

0         16.0
1         44.0
2         24.0
3         51.0
4          NaN
          ... 
344353    19.0
344354    47.0
344355    33.0
344356    30.0
344357    37.0
Name: idade, Length: 316785, dtype: float64

In [15]:
# Show the existing nu_idade column next to the recomputed idade for comparison
cleaned_df[['dt_diag', 'nu_idade', 'idade']]

Unnamed: 0,dt_diag,nu_idade,idade
0,2010-01-01,16.0,16.0
1,2010-01-01,44.0,44.0
2,2010-01-01,24.0,24.0
3,2010-01-01,51.0,51.0
4,2010-01-01,41.0,
...,...,...,...
344353,2017-08-11,19.0,19.0
344354,2017-08-11,47.0,47.0
344355,2017-08-14,33.0,33.0
344356,2017-08-14,30.0,30.0


In [16]:
# Final look at the column dtypes and non-null counts of the cleaned frame
cleaned_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 316785 entries, 0 to 344357
Data columns (total 33 columns):
dt_notific    316785 non-null datetime64[ns]
sem_not       316785 non-null float64
nu_ano        316785 non-null float64
anodiag       316785 non-null float64
sg_uf_not     316785 non-null float64
id_municip    316785 non-null float64
id_regiona    316785 non-null float64
id_unidade    316785 non-null float64
UFRES         316785 non-null float64
id_mn_resi    316785 non-null float64
dt_diag       316785 non-null datetime64[ns]
sem_pri       316785 non-null float64
nu_idade_n    316785 non-null float64
FXETARIA      316785 non-null float64
SEXO          316785 non-null float64
cs_gestant    316785 non-null float64
cs_raca       316785 non-null float64
cs_escol_n    316785 non-null float64
cs_zona       316785 non-null float64
dt_invest     225868 non-null datetime64[ns]
classi_fin    316785 non-null float64
criterio      316785 non-null float64
tpautocto     316785 non-null fl

In [17]:
# Persist the cleaned frame as a semicolon-separated CSV.
# With pandas' default settings the frame's index is written as well, as the
# first (unnamed) column of the file.
cleaned_df.to_csv('../data/adquirida.csv', sep=';')