## Cleaning

In [1]:
# imports 
import seaborn as sns
import pandas as pd
import numpy as np

from sklearn.preprocessing import LabelEncoder, StandardScaler

In [2]:
df = pd.read_csv('../../data/SINAN/SifCongenita_2007_2017_Long.csv', sep=';')

  interactivity=interactivity, compiler=compiler, result=result)


In [3]:
df['MUNRES'] = pd.to_numeric(df['MUNRES'], errors='coerce')
df.loc[df['MUNRES'] == 240810]

Unnamed: 0,ctrlsifcn,DT_DIAG,DT_NOT,DT_NASC,UFRES,RACA,ANODIAG,ANONOT,SEXO,ESQTT_MAE,DIAG_FINAL,PRE_NATAL,PARCTRAT,ESC_MAE,IDADE_CRI,MOM_DIAG,ID_MAE,FXET_MAE,MUNRES
337,378,1/22/2007,1/23/2007,1/22/2007,24,4,2007,2007,1,2,1,1,9,2,1,2,18,2,240810.0
429,475,1/28/2007,1/29/2007,1/28/2007,24,4,2007,2007,2,2,1,2,2,4,1,2,31,4,240810.0
790,856,2/21/2007,2/23/2007,2/21/2007,24,4,2007,2007,1,3,1,1,2,2,1,2,23,3,240810.0
814,883,2/23/2007,2/23/2007,2/23/2007,24,1,2007,2007,1,9,4,1,9,99,1,2,35,4,240810.0
867,939,2/26/2007,3/2/2007,2/26/2007,24,2,2007,2007,1,2,1,1,1,3,1,1,32,4,240810.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
122388,00145213,4/22/2017,5/10/2017,4/21/2017,24,9,2017,2017,2,9,1,1,2,99,1,1,25,3,240810.0
122410,00145242,4/23/2017,4/24/2017,4/23/2017,24,4,2017,2017,1,2,1,1,9,3,1,2,19,2,240810.0
122433,00145269,4/23/2017,4/27/2017,4/23/2017,24,1,2017,2017,2,2,1,1,2,5,1,1,31,4,240810.0
122650,00145551,4/27/2017,5/15/2017,4/27/2017,24,4,2017,2017,2,3,1,1,9,3,1,1,20,3,240810.0


In [4]:
# remove irrelevant columns
df_clean = df.drop(labels=['UFRES', 'IDADE_CRI', 'FXET_MAE', 'ctrlsifcn'], axis='columns')

In [5]:
# change empty string to missing values
df_clean.replace('\s+', np.nan, regex=True, inplace=True)

In [6]:
df_clean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 125136 entries, 0 to 125135
Data columns (total 15 columns):
DT_DIAG       125136 non-null object
DT_NOT        125136 non-null object
DT_NASC       125135 non-null object
RACA          125136 non-null int64
ANODIAG       125136 non-null int64
ANONOT        125136 non-null int64
SEXO          125136 non-null int64
ESQTT_MAE     125136 non-null int64
DIAG_FINAL    125136 non-null int64
PRE_NATAL     125136 non-null int64
PARCTRAT      125136 non-null int64
ESC_MAE       125136 non-null int64
MOM_DIAG      125136 non-null int64
ID_MAE        125136 non-null int64
MUNRES        125102 non-null float64
dtypes: float64(1), int64(11), object(3)
memory usage: 14.3+ MB


Before taking the last steps of preprocessing, we need to erase the nan values. Otherwise the methods of conversion and labeling won't work.

In [7]:
# convert columns that are numeric but have string values
not_categorical = [
    'RACA', 'ESQTT_MAE', 'DIAG_FINAL', 'PRE_NATAL', 
    'PARCTRAT', 'ESC_MAE', 'MOM_DIAG', 'ID_MAE', 
    'MUNRES'
]

df_clean.dropna(inplace=True, axis='rows', subset=not_categorical)

df_clean.loc[:,not_categorical] = df_clean.loc[:,not_categorical].astype('int64')

In [8]:
df_clean.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 125102 entries, 0 to 125135
Data columns (total 15 columns):
DT_DIAG       125102 non-null object
DT_NOT        125102 non-null object
DT_NASC       125101 non-null object
RACA          125102 non-null int64
ANODIAG       125102 non-null int64
ANONOT        125102 non-null int64
SEXO          125102 non-null int64
ESQTT_MAE     125102 non-null int64
DIAG_FINAL    125102 non-null int64
PRE_NATAL     125102 non-null int64
PARCTRAT      125102 non-null int64
ESC_MAE       125102 non-null int64
MOM_DIAG      125102 non-null int64
ID_MAE        125102 non-null int64
MUNRES        125102 non-null int64
dtypes: int64(12), object(3)
memory usage: 15.3+ MB


In [9]:
# test select columns by type
df_clean.select_dtypes(include='object').columns

Index(['DT_DIAG', 'DT_NOT', 'DT_NASC'], dtype='object')

In [10]:
df_clean['DT_NASC']

0         1/29/1999
1         5/12/2001
2          6/2/2003
3         3/19/2004
4         6/13/2005
            ...    
125131    3/30/2017
125132    6/28/2017
125133    6/23/2017
125134     6/8/2017
125135     4/8/2017
Name: DT_NASC, Length: 125102, dtype: object

In [11]:
categorical = df_clean.select_dtypes(include='object').columns
df_clean[categorical].fillna(' ', inplace=True)

for category in categorical:
    label_enc = LabelEncoder()
    print(category)
    print(df_clean[category])
    labels = label_enc.fit_transform(df_clean[category].astype(str))
    df_clean[category] = labels

DT_DIAG
0         1/29/1999
1         5/12/2001
2          6/2/2003
3         3/29/2004
4         6/13/2005
            ...    
125131    3/31/2017
125132    6/29/2017
125133    6/24/2017
125134     6/8/2017
125135     4/8/2017
Name: DT_DIAG, Length: 125102, dtype: object
DT_NOT
0          5/14/2008
1          5/12/2011
2           6/2/2011
3         10/20/2007
4         10/31/2007
             ...    
125131     3/31/2017
125132     6/30/2017
125133     6/26/2017
125134      7/8/2017
125135      4/8/2017
Name: DT_NOT, Length: 125102, dtype: object
DT_NASC
0         1/29/1999
1         5/12/2001
2          6/2/2003
3         3/19/2004
4         6/13/2005
            ...    
125131    3/30/2017
125132    6/28/2017
125133    6/23/2017
125134     6/8/2017
125135     4/8/2017
Name: DT_NASC, Length: 125102, dtype: object


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  **kwargs


In [12]:
df_clean.to_csv('../../data/congenita.csv', sep=';')