## Cleaning

In [1]:
# imports 
import seaborn as sns
import pandas as pd
import numpy as np

from sklearn.preprocessing import LabelEncoder, StandardScaler

In [2]:
df = pd.read_csv('../../data/SINAN/SifCongenita_2007_2017_Long.csv', sep=';')

  interactivity=interactivity, compiler=compiler, result=result)


In [3]:
# remove irrelevant columns
df_clean = df.drop(labels=['UFRES', 'IDADE_CRI', 'FXET_MAE', 'ctrlsifcn'], axis='columns')

In [4]:
# change empty string to missing values
df_clean.replace('\s+', np.nan, regex=True, inplace=True)

In [5]:
df_clean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 125136 entries, 0 to 125135
Data columns (total 15 columns):
DT_DIAG       125136 non-null object
DT_NOT        125136 non-null object
DT_NASC       125135 non-null object
RACA          125136 non-null int64
ANODIAG       125136 non-null int64
ANONOT        125136 non-null int64
SEXO          125136 non-null int64
ESQTT_MAE     125136 non-null int64
DIAG_FINAL    125136 non-null int64
PRE_NATAL     125136 non-null int64
PARCTRAT      125136 non-null int64
ESC_MAE       125136 non-null int64
MOM_DIAG      125136 non-null int64
ID_MAE        125136 non-null int64
MUNRES        125102 non-null object
dtypes: int64(11), object(4)
memory usage: 14.3+ MB


Before taking the last steps of preprocessing, we need to erase the nan values. Otherwise the methods of conversion and labeling won't work.

In [6]:
# convert columns that are numeric but have string values
not_categorical = [
    'RACA', 'ESQTT_MAE', 'DIAG_FINAL', 'PRE_NATAL', 
    'PARCTRAT', 'ESC_MAE', 'MOM_DIAG', 'ID_MAE', 
    'MUNRES'
]

df_clean.dropna(inplace=True, axis='rows', subset=not_categorical)

df_clean.loc[:,not_categorical] = df_clean.loc[:,not_categorical].astype('int64')

In [7]:
df_clean.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 125102 entries, 0 to 125135
Data columns (total 15 columns):
DT_DIAG       125102 non-null object
DT_NOT        125102 non-null object
DT_NASC       125101 non-null object
RACA          125102 non-null int64
ANODIAG       125102 non-null int64
ANONOT        125102 non-null int64
SEXO          125102 non-null int64
ESQTT_MAE     125102 non-null int64
DIAG_FINAL    125102 non-null int64
PRE_NATAL     125102 non-null int64
PARCTRAT      125102 non-null int64
ESC_MAE       125102 non-null int64
MOM_DIAG      125102 non-null int64
ID_MAE        125102 non-null int64
MUNRES        125102 non-null int64
dtypes: int64(12), object(3)
memory usage: 15.3+ MB


In [8]:
# test select columns by type
df_clean.select_dtypes(include='object').columns

Index(['DT_DIAG', 'DT_NOT', 'DT_NASC'], dtype='object')

In [9]:
df_clean['DT_NASC']

0         1/29/1999
1         5/12/2001
2          6/2/2003
3         3/19/2004
4         6/13/2005
            ...    
125131    3/30/2017
125132    6/28/2017
125133    6/23/2017
125134     6/8/2017
125135     4/8/2017
Name: DT_NASC, Length: 125102, dtype: object

In [10]:
categorical = df_clean.select_dtypes(include='object').columns
df_clean[categorical].fillna(' ', inplace=True)

for category in categorical:
    label_enc = LabelEncoder()
    print(category)
    print(df_clean[category])
    labels = label_enc.fit_transform(df_clean[category].astype(str))
    df_clean[category] = labels

DT_DIAG
0         1/29/1999
1         5/12/2001
2          6/2/2003
3         3/29/2004
4         6/13/2005
            ...    
125131    3/31/2017
125132    6/29/2017
125133    6/24/2017
125134     6/8/2017
125135     4/8/2017
Name: DT_DIAG, Length: 125102, dtype: object
DT_NOT
0          5/14/2008
1          5/12/2011
2           6/2/2011
3         10/20/2007
4         10/31/2007
             ...    
125131     3/31/2017
125132     6/30/2017
125133     6/26/2017
125134      7/8/2017
125135      4/8/2017
Name: DT_NOT, Length: 125102, dtype: object
DT_NASC
0         1/29/1999
1         5/12/2001
2          6/2/2003
3         3/19/2004
4         6/13/2005
            ...    
125131    3/30/2017
125132    6/28/2017
125133    6/23/2017
125134     6/8/2017
125135     4/8/2017
Name: DT_NASC, Length: 125102, dtype: object


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  **kwargs


Finally, we normalize the data.


In [11]:
scaler = StandardScaler()
data = scaler.fit_transform(df_clean.values)
    
    
# transform to df agagin
data_df = pd.DataFrame(data)
data_df.columns = df_clean.columns
    
# check the result
data_df.head()

Unnamed: 0,DT_DIAG,DT_NOT,DT_NASC,RACA,ANODIAG,ANONOT,SEXO,ESQTT_MAE,DIAG_FINAL,PRE_NATAL,PARCTRAT,ESC_MAE,MOM_DIAG,ID_MAE,MUNRES
0,-1.563121,0.326454,-1.561054,0.168357,-5.041242,-1.841396,-0.522593,2.674305,3.956686,3.763937,1.789364,-0.696088,4.164893,6.187623,-1.856164
1,0.321478,0.309007,0.322898,-0.715497,-4.326836,-0.767786,-0.003506,-0.483658,-0.291043,-0.348558,-0.49702,-0.626529,-0.561908,-0.172052,0.180272
2,0.712673,0.70019,0.715242,0.168357,-3.612429,-0.767786,-0.003506,-0.03252,2.540776,0.165504,-0.49702,1.57616,4.164893,-0.132956,0.177047
3,-0.108564,-1.356735,-0.212753,-1.157424,-3.255226,-2.199266,-0.522593,-0.934796,3.956686,-0.348558,-0.823646,-0.696088,-0.561908,-0.165536,1.085248
4,0.642204,-1.246543,0.644463,-0.715497,-2.898023,-2.199266,-0.522593,2.674305,-0.291043,-0.348558,1.789364,1.57616,0.619792,-0.185085,0.180272


In [12]:
# Compute the correlation matrix
corr = data_df.corr()

for index, row in corr.iterrows():
    for val in row :
        if (val > 0.7) or (val < -0.7):
            indx = row[ row == val].index[0]
            if index != indx:
                print('{}, {} ({})'.format(index, indx, val))

DT_DIAG, DT_NOT (0.8746020816266783)
DT_DIAG, DT_NASC (0.9733966321276298)
DT_NOT, DT_DIAG (0.8746020816266783)
DT_NOT, DT_NASC (0.8513036411141861)
DT_NASC, DT_DIAG (0.9733966321276298)
DT_NASC, DT_NOT (0.8513036411141861)
ANODIAG, ANONOT (0.9974486348312591)
ANONOT, ANODIAG (0.9974486348312591)
