## Cleaning

In [2]:
# imports 
import pandas as pd
import numpy as np

from sklearn.preprocessing import LabelEncoder, StandardScaler

In [3]:
df = pd.read_csv('../../data/SINAN/SifCongenita_2007_2017_Long.csv', sep=';')

  interactivity=interactivity, compiler=compiler, result=result)


In [4]:
df['MUNRES'] = pd.to_numeric(df['MUNRES'], errors='coerce')
df.loc[df['MUNRES'] == 240810]

Unnamed: 0,ctrlsifcn,DT_DIAG,DT_NOT,DT_NASC,UFRES,RACA,ANODIAG,ANONOT,SEXO,ESQTT_MAE,DIAG_FINAL,PRE_NATAL,PARCTRAT,ESC_MAE,IDADE_CRI,MOM_DIAG,ID_MAE,FXET_MAE,MUNRES
337,378,1/22/2007,1/23/2007,1/22/2007,24,4,2007,2007,1,2,1,1,9,2,1,2,18,2,240810.0
429,475,1/28/2007,1/29/2007,1/28/2007,24,4,2007,2007,2,2,1,2,2,4,1,2,31,4,240810.0
790,856,2/21/2007,2/23/2007,2/21/2007,24,4,2007,2007,1,3,1,1,2,2,1,2,23,3,240810.0
814,883,2/23/2007,2/23/2007,2/23/2007,24,1,2007,2007,1,9,4,1,9,99,1,2,35,4,240810.0
867,939,2/26/2007,3/2/2007,2/26/2007,24,2,2007,2007,1,2,1,1,1,3,1,1,32,4,240810.0
880,952,2/27/2007,2/27/2007,2/27/2007,24,9,2007,2007,2,2,1,1,2,4,1,1,32,4,240810.0
918,992,2/28/2007,6/22/2007,2/28/2007,24,4,2007,2007,2,2,1,1,2,2,1,9,26,3,240810.0
955,1030,3/2/2007,3/5/2007,3/2/2007,24,1,2007,2007,2,2,1,2,2,0,1,2,32,4,240810.0
978,1053,3/3/2007,3/15/2007,3/3/2007,24,4,2007,2007,1,2,1,1,2,4,1,1,21,3,240810.0
1046,1127,3/8/2007,3/8/2007,3/8/2007,24,1,2007,2007,2,9,1,1,2,4,1,2,25,3,240810.0


In [5]:
# remove irrelevant columns
df_clean = df.drop(labels=['UFRES', 'IDADE_CRI', 'FXET_MAE', 'ctrlsifcn'], axis='columns')

In [6]:
# change empty string to missing values
df_clean.replace('\s+', np.nan, regex=True, inplace=True)

In [7]:
df_clean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 125136 entries, 0 to 125135
Data columns (total 15 columns):
DT_DIAG       125136 non-null object
DT_NOT        125136 non-null object
DT_NASC       125135 non-null object
RACA          125136 non-null int64
ANODIAG       125136 non-null int64
ANONOT        125136 non-null int64
SEXO          125136 non-null int64
ESQTT_MAE     125136 non-null int64
DIAG_FINAL    125136 non-null int64
PRE_NATAL     125136 non-null int64
PARCTRAT      125136 non-null int64
ESC_MAE       125136 non-null int64
MOM_DIAG      125136 non-null int64
ID_MAE        125136 non-null int64
MUNRES        125102 non-null float64
dtypes: float64(1), int64(11), object(3)
memory usage: 14.3+ MB


Before taking the last steps of preprocessing, we need to erase the nan values. Otherwise the methods of conversion and labeling won't work.

In [8]:
# convert columns that are numeric but have string values
not_categorical = [
    'RACA', 'ESQTT_MAE', 'DIAG_FINAL', 'PRE_NATAL', 
    'PARCTRAT', 'ESC_MAE', 'MOM_DIAG', 'ID_MAE', 
    'MUNRES'
]

df_clean.dropna(inplace=True, axis='rows', subset=not_categorical)

df_clean.loc[:,not_categorical] = df_clean.loc[:,not_categorical].astype('int64')

In [9]:
df_clean.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 125102 entries, 0 to 125135
Data columns (total 15 columns):
DT_DIAG       125102 non-null object
DT_NOT        125102 non-null object
DT_NASC       125101 non-null object
RACA          125102 non-null int64
ANODIAG       125102 non-null int64
ANONOT        125102 non-null int64
SEXO          125102 non-null int64
ESQTT_MAE     125102 non-null int64
DIAG_FINAL    125102 non-null int64
PRE_NATAL     125102 non-null int64
PARCTRAT      125102 non-null int64
ESC_MAE       125102 non-null int64
MOM_DIAG      125102 non-null int64
ID_MAE        125102 non-null int64
MUNRES        125102 non-null int64
dtypes: int64(12), object(3)
memory usage: 15.3+ MB


In [10]:
# test select columns by type
df_clean.select_dtypes(include='object').columns

Index(['DT_DIAG', 'DT_NOT', 'DT_NASC'], dtype='object')

In [11]:
df_clean['DT_NASC']

0          1/29/1999
1          5/12/2001
2           6/2/2003
3          3/19/2004
4          6/13/2005
5          3/30/2004
6           2/3/2006
7          3/22/2006
8          4/10/2006
9          4/24/2006
10         5/13/2006
11          6/8/2006
12         7/21/2006
13         8/20/2006
14         8/24/2006
15         8/31/2006
16         9/25/2006
17         9/29/2006
18         9/30/2006
19         10/4/2006
20        10/15/2006
21        10/18/2006
22        10/19/2006
23        10/18/2006
24        10/22/2006
25        10/30/2006
26         11/1/2006
27         2/23/2007
28        11/10/2006
29        11/17/2006
             ...    
125106     4/15/2017
125107     12/5/2017
125108     12/5/2017
125109     5/28/2017
125110     5/28/2017
125111     6/21/2017
125112     6/28/2017
125113     1/25/2017
125114     1/24/2017
125115     1/18/2017
125116     2/20/2017
125117     2/18/2017
125118     2/20/2011
125119     2/15/2017
125120     2/22/2017
125121     6/16/2017
125122      3

In [12]:
categorical = df_clean.select_dtypes(include='object').columns
df_clean[categorical].fillna(' ', inplace=True)

for category in categorical:
    label_enc = LabelEncoder()
    print(category)
    print(df_clean[category])
    labels = label_enc.fit_transform(df_clean[category].astype(str))
    df_clean[category] = labels

DT_DIAG
0          1/29/1999
1          5/12/2001
2           6/2/2003
3          3/29/2004
4          6/13/2005
5          9/14/2005
6           2/3/2006
7          3/22/2006
8          4/10/2006
9          4/24/2006
10         5/17/2006
11         6/11/2006
12         7/21/2006
13         8/20/2006
14         8/24/2006
15         8/31/2006
16         9/25/2006
17         9/29/2006
18         10/1/2006
19         10/4/2006
20        10/15/2006
21        10/18/2006
22        10/19/2006
23        10/19/2006
24        10/22/2006
25        10/31/2006
26         11/1/2006
27         11/7/2006
28        11/10/2006
29        11/17/2006
             ...    
125106     4/15/2017
125107     5/13/2017
125108     5/13/2017
125109     5/28/2017
125110     5/29/2017
125111     6/21/2017
125112     6/29/2017
125113     1/25/2017
125114     1/24/2017
125115     1/18/2017
125116     2/21/2017
125117     2/19/2017
125118     2/21/2017
125119     2/15/2017
125120     2/22/2017
125121     6/16/2017
12512

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  downcast=downcast, **kwargs)


In [13]:
df_clean.to_csv('../../data/congenita.csv', sep=';')

In [9]:
def first_two(d):                
     return str(d)[:2]

df_clean['uf'] = df_clean.MUNRES.apply(first_two)
df_clean.uf.value_counts().head()

33    21829
35    21116
26     9335
43     9271
23     8955
Name: uf, dtype: int64