# Cleaning

In [16]:
# imports 
import seaborn as sns
import pandas as pd
import numpy as np

from sklearn.preprocessing import LabelEncoder, StandardScaler

In [2]:
df = pd.read_csv('../data/SINAN/SifAquirida_2010_2017_Long.csv', sep=';')

  interactivity=interactivity, compiler=compiler, result=result)


In [3]:
# remove irrelevant columns

# null columns
nulls = ['in_vincula', 'id_ocupa_n', 'codisinf', 'co_usucad','tpuninot']

# ignorable (repetitive or something like that)
ignorable = [ 'nu_idade_ant_string', 'ctrlsifan', 'id_agravo', 
             'tp_sistema', 'nobaiinf', 'id_pais', 'dt_diagnostico',
            # no significant information (as id, name, birth date)
             'dt_nasc',
             # many missing values
             'co_usualt',
            ]

labels = nulls + ignorable
df_clean = df.drop(labels=labels, axis='columns')

After droping the columns, we must process both numeric and textual values.

In [4]:
# change empty string to missing values
df_clean.replace('\s+', np.nan, regex=True, inplace=True)

# Asserting it worked
df_clean.evolucao.value_counts()

1    149883
9     67423
3       348
2       121
Name: evolucao, dtype: int64

In [5]:
# categorical data as lowercase
# categorical = ['co_usualt']

# for category in categorical:
#     df_clean[category] = df_clean.iloc[:,df_clean.columns.get_loc(category)].str.lower() 
    
# df_clean.co_usualt.value_counts().head()

In [6]:
df_clean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 344358 entries, 0 to 344357
Data columns (total 33 columns):
dt_notific    344355 non-null object
sem_not       344355 non-null float64
nu_ano        342531 non-null float64
anodiag       342531 non-null float64
sg_uf_not     342530 non-null object
id_municip    342531 non-null float64
id_regiona    313840 non-null object
id_unidade    342523 non-null object
UFRES         342531 non-null float64
id_mn_resi    342531 non-null float64
dt_diag       342531 non-null object
sem_pri       342531 non-null float64
nu_idade_n    342531 non-null float64
FXETARIA      342531 non-null float64
SEXO          342531 non-null float64
cs_gestant    342524 non-null object
cs_raca       335034 non-null object
cs_escol_n    316785 non-null object
cs_zona       331169 non-null object
dt_invest     243189 non-null object
classi_fin    329125 non-null object
criterio      231979 non-null object
tpautocto     232044 non-null object
coufinf       147160 non-nul

Before taking the last steps of preprocessing, we need to erase the nan values. Otherwise the methods of conversion and labeling won't work.

In [7]:
# convert columns that are numeric but have string values
not_categorical = ['cs_gestant', 'cs_raca', 'cs_escol_n', 'cs_zona', 
          'tpautocto', 'classi_fin', 'criterio', 'evolucao', 
          'doenca_tra', 'sg_uf_not', 'id_regiona', 'id_unidade', 'coufinf',
          'comuninf']

df_clean.dropna(inplace=True, axis='rows', subset=not_categorical)

df_clean.loc[:,not_categorical] = df_clean.loc[:,not_categorical].astype('int64')

In [8]:
df_clean.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 117791 entries, 2 to 344352
Data columns (total 33 columns):
dt_notific    117791 non-null object
sem_not       117791 non-null float64
nu_ano        117791 non-null float64
anodiag       117791 non-null float64
sg_uf_not     117791 non-null int64
id_municip    117791 non-null float64
id_regiona    117791 non-null int64
id_unidade    117791 non-null int64
UFRES         117791 non-null float64
id_mn_resi    117791 non-null float64
dt_diag       117791 non-null object
sem_pri       117791 non-null float64
nu_idade_n    117791 non-null float64
FXETARIA      117791 non-null float64
SEXO          117791 non-null float64
cs_gestant    117791 non-null int64
cs_raca       117791 non-null int64
cs_escol_n    117791 non-null int64
cs_zona       117791 non-null int64
dt_invest     117791 non-null object
classi_fin    117791 non-null int64
criterio      117791 non-null int64
tpautocto     117791 non-null int64
coufinf       117791 non-null int64
co

In [9]:
# re-arrange dt_obito
df_clean['obito'] = df_clean.dt_obito.notna()
df_clean.drop(labels=['dt_obito'], axis='columns', inplace=True)

df_clean.obito

2         False
5         False
9         False
10        False
15        False
          ...  
344340    False
344348    False
344349    False
344350    False
344352    False
Name: obito, Length: 117791, dtype: bool

In [10]:
df_clean.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 117791 entries, 2 to 344352
Data columns (total 33 columns):
dt_notific    117791 non-null object
sem_not       117791 non-null float64
nu_ano        117791 non-null float64
anodiag       117791 non-null float64
sg_uf_not     117791 non-null int64
id_municip    117791 non-null float64
id_regiona    117791 non-null int64
id_unidade    117791 non-null int64
UFRES         117791 non-null float64
id_mn_resi    117791 non-null float64
dt_diag       117791 non-null object
sem_pri       117791 non-null float64
nu_idade_n    117791 non-null float64
FXETARIA      117791 non-null float64
SEXO          117791 non-null float64
cs_gestant    117791 non-null int64
cs_raca       117791 non-null int64
cs_escol_n    117791 non-null int64
cs_zona       117791 non-null int64
dt_invest     117791 non-null object
classi_fin    117791 non-null int64
criterio      117791 non-null int64
tpautocto     117791 non-null int64
coufinf       117791 non-null int64
co

## Final preprocessing


In [11]:
# test select columns by type
df_clean.select_dtypes(include='object').columns

Index(['dt_notific', 'dt_diag', 'dt_invest', 'dt_encerra'], dtype='object')

In [12]:
categorical = df_clean.select_dtypes(include='object').columns
df_clean[categorical].fillna(' ', inplace=True)

df_clean.dropna(subset=['dt_invest'], axis='rows', inplace=True)


for category in categorical:
    label_enc = LabelEncoder()
    print(category)
    print(df_clean[category])
    labels = label_enc.fit_transform(df_clean[category])
    df_clean[category] = labels

dt_notific
2          4/29/2010
5         12/16/2010
9          5/25/2011
10         6/16/2011
15         4/26/2012
             ...    
344340     8/10/2017
344348     8/11/2017
344349     8/11/2017
344350     8/11/2017
344352     8/11/2017
Name: dt_notific, Length: 117791, dtype: object
dt_diag
2          1/1/2010
5          1/1/2010
9          1/1/2010
10         1/1/2010
15         1/1/2010
            ...    
344340    8/10/2017
344348    8/11/2017
344349    8/11/2017
344350    8/11/2017
344352    8/11/2017
Name: dt_diag, Length: 117791, dtype: object
dt_invest
2          4/29/2010
5         12/16/2010
9          5/25/2011
10         6/16/2011
15         4/26/2012
             ...    
344340     8/10/2017
344348     8/11/2017
344349     8/11/2017
344350     8/11/2017
344352     8/11/2017
Name: dt_invest, Length: 117791, dtype: object
dt_encerra
2          4/29/2010
5         12/16/2010
9         10/15/2012
10         6/21/2011
15         5/28/2012
             ...    
344340     8

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  **kwargs


In [13]:
# check
df_clean.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 117791 entries, 2 to 344352
Data columns (total 33 columns):
dt_notific    117791 non-null int64
sem_not       117791 non-null float64
nu_ano        117791 non-null float64
anodiag       117791 non-null float64
sg_uf_not     117791 non-null int64
id_municip    117791 non-null float64
id_regiona    117791 non-null int64
id_unidade    117791 non-null int64
UFRES         117791 non-null float64
id_mn_resi    117791 non-null float64
dt_diag       117791 non-null int64
sem_pri       117791 non-null float64
nu_idade_n    117791 non-null float64
FXETARIA      117791 non-null float64
SEXO          117791 non-null float64
cs_gestant    117791 non-null int64
cs_raca       117791 non-null int64
cs_escol_n    117791 non-null int64
cs_zona       117791 non-null int64
dt_invest     117791 non-null int64
classi_fin    117791 non-null int64
criterio      117791 non-null int64
tpautocto     117791 non-null int64
coufinf       117791 non-null int64
copai

Finally, we normalize the data.

In [19]:
scaler = StandardScaler()
data = scaler.fit_transform(df_clean.values)
    
    
# transform to df agagin
data_df = pd.DataFrame(data)
data_df.columns = df_clean.columns
    
# check the result
data_df.head()

Unnamed: 0,dt_notific,sem_not,nu_ano,anodiag,sg_uf_not,id_municip,id_regiona,id_unidade,UFRES,id_mn_resi,...,coufinf,copaisinf,comuninf,co_bainfc,doenca_tra,evolucao,dt_encerra,flxrecebi,nu_idade,obito
0,0.157223,0.01095,-2.785952,-2.689504,-0.666427,-0.692168,-0.140795,-0.512813,-0.665904,-0.692282,...,-0.665417,0.0,-0.691688,-0.094565,3.364852,-0.536474,0.123166,0.021016,-0.789577,-0.034619
1,-0.856621,0.013945,-2.785952,-2.689504,-0.076911,-0.097385,-0.296784,-0.165826,-0.076514,-0.097467,...,-0.07627,0.0,-0.097177,-0.094565,-0.290283,-0.536474,-0.882274,0.021016,1.257864,-0.034619
2,0.417925,0.02039,-2.203431,-2.689504,-0.076911,-0.034217,-0.327338,-0.175799,-0.076514,-0.034296,...,-0.07627,0.0,-0.034038,-0.094565,-0.290283,-0.536474,-1.437727,0.021016,0.267167,-0.034619
3,0.627591,0.020662,-2.203431,-2.689504,-0.076911,-0.037464,-0.330554,-0.168189,-0.076514,-0.037543,...,-0.07627,0.0,-0.037283,-0.088081,-0.290283,1.867194,0.656119,0.021016,1.257864,-0.034619
4,0.128256,0.029103,-1.620911,-2.689504,-0.076911,-0.0689,-0.309649,-0.205294,-0.076514,-0.068981,...,-0.07627,0.0,-0.068706,-0.094565,-0.290283,-0.536474,0.41847,0.021016,-0.92167,-0.034619


There are still 33 columns. probably, some have redundant data that can be identified using correlation

In [20]:
# Compute the correlation matrix
corr = data_df.corr()

for index, row in corr.iterrows():
    for val in row :
        if (val > 0.7) or (val < -0.7):
            indx = row[ row == val].index[0]
            if index != indx:
                print('{}, {} ({})'.format(index, indx, val))

dt_notific, dt_invest (0.9825902797965169)
dt_notific, dt_encerra (0.7734640878897021)
sem_not, sem_pri (0.9999958152539423)
nu_ano, anodiag (0.9769028165313747)
anodiag, nu_ano (0.9769028165313747)
sg_uf_not, id_municip (0.9995685946062094)
sg_uf_not, UFRES (0.9989326505444045)
sg_uf_not, id_mn_resi (0.998491771440503)
sg_uf_not, coufinf (0.9981873878147022)
sg_uf_not, comuninf (0.9976505307234134)
id_municip, sg_uf_not (0.9995685946062094)
id_municip, UFRES (0.998502973063139)
id_municip, id_mn_resi (0.9988894299359049)
id_municip, coufinf (0.9977590780482778)
id_municip, comuninf (0.9980476809747266)
UFRES, sg_uf_not (0.9989326505444045)
UFRES, id_municip (0.998502973063139)
UFRES, id_mn_resi (0.9995679077950754)
UFRES, coufinf (0.999239731267903)
UFRES, comuninf (0.9987115595948387)
id_mn_resi, sg_uf_not (0.998491771440503)
id_mn_resi, id_municip (0.9988894299359049)
id_mn_resi, UFRES (0.9995679077950754)
id_mn_resi, coufinf (0.9988089711695586)
id_mn_resi, comuninf (0.999141337557

In [21]:
to_remove = ['id_mn_resi', 'sg_uf_not', 'id_municip', 'UFRES', 'coufinf', # correlate to 'comuninf'
            'nu_idade', # correlate to nu_idade_n and FXETARIA
             'nu_ano', #correlate to anodiag
             'dt_invest' # correlate to dt_notific
            ]

data_df.drop(labels=to_remove, inplace=True, axis='columns')

data_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 117791 entries, 0 to 117790
Data columns (total 25 columns):
dt_notific    117791 non-null float64
sem_not       117791 non-null float64
anodiag       117791 non-null float64
id_regiona    117791 non-null float64
id_unidade    117791 non-null float64
dt_diag       117791 non-null float64
sem_pri       117791 non-null float64
nu_idade_n    117791 non-null float64
FXETARIA      117791 non-null float64
SEXO          117791 non-null float64
cs_gestant    117791 non-null float64
cs_raca       117791 non-null float64
cs_escol_n    117791 non-null float64
cs_zona       117791 non-null float64
classi_fin    117791 non-null float64
criterio      117791 non-null float64
tpautocto     117791 non-null float64
copaisinf     117791 non-null float64
comuninf      117791 non-null float64
co_bainfc     117791 non-null float64
doenca_tra    117791 non-null float64
evolucao      117791 non-null float64
dt_encerra    117791 non-null float64
flxrecebi     1