In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.preprocessing import MinMaxScaler

In [2]:
def parse_age(value):
    """Convert a unit-tagged age code into an age expressed in years.

    The code is expected to hold a number together with one unit letter:
    ``'A'`` leaves the number as-is (years), ``'M'`` divides it by 12
    (months) and ``'D'`` divides it by 365 (days). The letters are checked
    in that order and the first one found decides the unit.

    Parameters
    ----------
    value : str or missing scalar
        Raw age code, e.g. ``'30A'``, ``'6M'`` or ``'15D'``.

    Returns
    -------
    float or int
        The age in years; ``np.nan`` when ``value`` is missing; ``0`` when
        the code carries no recognised unit letter, is not a string, or its
        numeric part cannot be parsed.

    Notes
    -----
    Missing ages are reported as NaN rather than 0 so they cannot be
    mistaken for newborns. Estimators that reject NaN need these values
    imputed or dropped beforehand.
    """
    # NaN never compares equal to anything (itself included), so a
    # ``value == np.nan`` test can never be True; pd.isna is the reliable check.
    if pd.isna(value):
        return np.nan

    try:
        if 'A' in value:
            return float(value.replace('A', ''))
        if 'M' in value:
            return float(value.replace('M', '')) / 12
        if 'D' in value:
            return float(value.replace('D', '')) / 365
    except (TypeError, ValueError, AttributeError):
        # Non-string input or a non-numeric remainder: use the 0 fallback
        # below instead of hiding every possible exception.
        pass

    return 0

In [3]:
def parse_job(value):
    """Strip the ``'X'`` marker from an occupation code.

    Parameters
    ----------
    value : str or missing scalar
        Raw occupation code.

    Returns
    -------
    str or float
        ``value`` with every ``'X'`` removed when it contains one.
        ``np.nan`` when ``value`` is missing, is not a string, or contains
        no ``'X'`` (this last case used to fall off the end of the function
        and yield an implicit ``None``; NaN makes the "missing" result
        explicit and uniform).
    """
    # NaN never compares equal to anything (itself included), so a
    # ``value == np.nan`` test can never be True; pd.isna is the reliable check.
    if pd.isna(value):
        return np.nan

    try:
        if 'X' in value:
            return value.replace('X', '')
    except (TypeError, AttributeError):
        # Non-string input: treated as missing rather than swallowing
        # every possible exception.
        pass

    # NOTE(review): codes without an 'X' are discarded as missing, matching
    # the previous behaviour - confirm this is intended for the data set.
    return np.nan

In [4]:
def parse_week(value):
    """Extract the leading number of a code by dropping its last four characters.

    Parameters
    ----------
    value : str or missing scalar
        Raw code, e.g. ``'482000'``.
        NOTE(review): presumably the four trailing characters are a year and
        the leading part is the week number - confirm against the data
        dictionary.

    Returns
    -------
    float
        The leading part parsed as a float, or ``0.0`` when ``value`` is
        missing or has four characters or fewer.

    Raises
    ------
    ValueError
        If the leading part is present but is not a valid number.
    """
    # NaN never compares equal to anything (itself included), so a
    # ``value == np.nan`` test can never be True. Missing values previously
    # reached 0.0 only because 'nan'[:-4] happens to be empty.
    if pd.isna(value):
        return 0.0

    leading_part = str(value)[:-4]
    return float(leading_part) if leading_part else 0.0

In [5]:
# Load at most the first 100000 rows of the tab-separated case file.
# dtype='unicode' requests a text dtype for every column, so coded values
# are handled as strings in the cells that follow.
df = pd.read_csv('Data/2000-2010/DENGUE_2000_2006.tsv', sep = '\t', nrows=100000, dtype = 'unicode')
# Alternative input kept for reference (comma-separated, cp1252-encoded, 50000 rows):
#df = pd.read_csv('Data/2000-2010/dengue_2001_dataset.csv', sep = ',', nrows=50000, encoding='cp1252', dtype = 'unicode')

In [6]:
# Size of the frame before filtering.
print(df.shape)
# Keep only the rows whose CON_CLASSI is one of the string codes '1'..'5'.
df = df[df['CON_CLASSI'].isin(['1', '2', '3', '4', '5'])]
# Size of the frame after filtering.
print(df.shape)

(100000, 113)
(76769, 113)


In [7]:
# NOTE(review): this expression is not the last line of the cell, so its
# value is discarded and nothing is displayed.
df.columns.values
# Columns used as candidate predictors for the feature-selection step.
hand_picked_features = [
    'CS_RACA', 'CS_ESCOLAR', 'NU_IDADE', 'CS_SEXO', 
    'CS_ZONA', 'ID_MN_RESI', 'DENGUE', 'OCUPACAO', 'SEM_PRI',
    'NU_ANO', 'VACINADO', 'FEBRE', 'DURACAO', 'LACO', 
    'CEFALEIA', 'EXANTEMA', 'DOR', 'PROSTACAO', 'MIALGIA',
    'NAUSEAS', 'ARTRALGIA', 'DIARREIA', 'OUTROS', 'INSUFICIEN',
    'EPISTAXE', 'PETEQUIAS', 'GENGIVO', 'METRO', 'HEMATURA', 
    'SANGRAM', 'ASCITE', 'PLEURAL', 'PERICARDI', 'ABDOMINAL', 
    'HEPATO', 'MIOCARDI', 'HIPOTENSAO', 'CHOQUE', 'MANIFESTA']
# Target column.
label = 'CON_CLASSI'

# Convert the text columns that carry numeric information to float64:
# NU_IDADE through parse_age, NU_ANO by a direct cast.
df['NU_IDADE'] = df['NU_IDADE'].apply(parse_age).astype('float64')
df['NU_ANO'] = df['NU_ANO'].astype('float64')
# Clean OCUPACAO with parse_job, replace missing results with 0, then store
# the column as a categorical.
df['OCUPACAO'] = df['OCUPACAO'].apply(parse_job)
df = df.fillna(value={'OCUPACAO':0})
df['OCUPACAO'] = df['OCUPACAO'].astype('category')
# SEM_PRI is converted to a float through parse_week.
df['SEM_PRI'] = df['SEM_PRI'].apply(parse_week).astype('float64')

# Feature matrix (hand-picked columns) and label vector.
X = df.loc[:, hand_picked_features]
y = df.loc[:, label]

# Disabled experiments kept for reference: work on a 200-row slice and cast
# the label to int64.
# X = X.iloc[2400:2600, :]
# y = y.iloc[2400:2600]
# y = y.fillna(0).astype('int64')

# Disabled export of the selected columns.
# X.to_csv('Data/2000-2010/fs_dengue_2001_dataset.csv', sep = '\t')

In [8]:
# Shape of the feature matrix, shape of the label vector, then the feature
# names - one item per output line.
print(X.shape, y.shape, X.columns.values, sep='\n')

(76769, 39)
(76769,)
['CS_RACA' 'CS_ESCOLAR' 'NU_IDADE' 'CS_SEXO' 'CS_ZONA' 'ID_MN_RESI'
 'DENGUE' 'OCUPACAO' 'SEM_PRI' 'NU_ANO' 'VACINADO' 'FEBRE' 'DURACAO'
 'LACO' 'CEFALEIA' 'EXANTEMA' 'DOR' 'PROSTACAO' 'MIALGIA' 'NAUSEAS'
 'ARTRALGIA' 'DIARREIA' 'OUTROS' 'INSUFICIEN' 'EPISTAXE' 'PETEQUIAS'
 'GENGIVO' 'METRO' 'HEMATURA' 'SANGRAM' 'ASCITE' 'PLEURAL' 'PERICARDI'
 'ABDOMINAL' 'HEPATO' 'MIOCARDI' 'HIPOTENSAO' 'CHOQUE' 'MANIFESTA']


In [9]:
# The three float64 columns of X are rescaled column by column with a
# default MinMaxScaler and written back over the original values.
# NOTE(review): presumably done so every feature is non-negative and on a
# scale comparable with the 0/1 dummy columns before chi2 scoring - confirm.
columns_to_scale = ['NU_IDADE', 'NU_ANO', 'SEM_PRI']
X[columns_to_scale] = MinMaxScaler().fit_transform(X[columns_to_scale])

In [10]:
# One-hot encode the non-numeric columns of X; the numeric columns
# (NU_IDADE, SEM_PRI, NU_ANO) are carried over unchanged.
new_X = pd.get_dummies(X) 

In [11]:
# Disabled debugging output.
#print(new_X.shape)
#print(type(new_X))
# Display the first rows of the one-hot encoded frame.
new_X.head()


Unnamed: 0,NU_IDADE,SEM_PRI,NU_ANO,CS_RACA_1,CS_RACA_2,CS_RACA_3,CS_RACA_4,CS_RACA_9,CS_ESCOLAR_1,CS_ESCOLAR_2,...,MIOCARDI_9,HIPOTENSAO_1,HIPOTENSAO_2,HIPOTENSAO_9,CHOQUE_1,CHOQUE_2,CHOQUE_9,MANIFESTA_1,MANIFESTA_2,MANIFESTA_9
0,0.403509,0.921569,0.0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
1,0.201754,0.27451,0.0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
2,0.175439,0.254902,0.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0.263158,0.705882,0.0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
4,0.070175,0.941176,0.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [12]:
# Disabled export of the encoded frame.
# new_X.to_csv('Data/2000-2010/fs_dengue_00_06_dataset.csv', sep = '\t')
# Score every encoded column against the label with the chi-squared
# statistic. k='all' keeps all columns, so the fit is used only for its
# per-feature scores.
kbest_result = SelectKBest(chi2, k='all').fit(new_X, y)
# Display the fitted selector.
kbest_result

SelectKBest(k='all', score_func=<function chi2 at 0x10e84f0d0>)

In [13]:
# np.nan_to_num replaces NaN scores with 0 (and infinities with large finite
# values) so the statistics below are well defined.
scores = np.nan_to_num(kbest_result.scores_)
print('Quantidade de características: ' + str(scores.size))

# Summary statistics of the chi-squared scores.
min_score, max_score = scores.min(), scores.max()
mean, std = scores.mean(), scores.std()

print('Pontuações [mínima: ' + str(min_score)
      + ', média: ' + str(mean)
      + ', máxima: ' + str(max_score)
      + ', desvio padrão: ' + str(std) + ']')

# Features scoring above the mean are selected; features scoring above 1000
# are additionally recorded as possible outliers.
selected_indices = np.where(scores > mean)
mask = selected_indices[0]
possible_outliers = np.where(scores > 1000)[0]
print('Quantidade de características selecionadas: ' + str(mask.size))
# Display the positions of the selected features.
mask


Quantidade de características: 1076
Pontuações [mínima: 0.0, média: 58.55570115361821, máxima: 14634.974772749887, desvio padrão: 461.39124859453506]
Quantidade de características selecionadas: 169


array([   1,    7,   10,   11,   13,   14,   19,   30,   45,   58,   64,
         71,   96,   97,  103,  115,  116,  128,  152,  153,  171,  188,
        190,  195,  226,  235,  236,  243,  252,  261,  283,  285,  290,
        298,  299,  300,  304,  305,  312,  319,  332,  335,  339,  345,
        347,  355,  356,  362,  364,  369,  370,  371,  375,  378,  379,
        390,  395,  416,  417,  428,  458,  467,  480,  484,  489,  490,
        503,  506,  513,  514,  534,  550,  551,  558,  579,  585,  590,
        608,  612,  613,  618,  621,  627,  632,  642,  645,  661,  666,
        667,  669,  677,  695,  707,  708,  709,  713,  715,  716,  730,
        738,  740,  763,  766,  768,  769,  783,  878,  887,  895,  920,
        971,  973,  974,  975,  977,  978,  979,  981,  982,  995, 1001,
       1002, 1003, 1005, 1006, 1007, 1008, 1009, 1010, 1011, 1012, 1013,
       1014, 1015, 1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023, 1024,
       1031, 1033, 1034, 1035, 1037, 1038, 1039, 10

In [14]:
#print(len(kbest_result.pvalues_))
#kbest_result.pvalues_

In [15]:
# Disabled alternative: take the selected indices from the fitted selector.
# mask = kbest_result.get_support(indices=True)
# print(mask)
# Keep only the encoded columns at the integer positions stored in `mask`.
kbest_df = new_X.iloc[:, mask]
print(kbest_df.shape)
# Display the first rows of the reduced frame.
kbest_df.head()

(76769, 169)


Unnamed: 0,SEM_PRI,CS_RACA_9,CS_ESCOLAR_3,CS_ESCOLAR_4,CS_ESCOLAR_6,CS_ESCOLAR_9,CS_ZONA_2,ID_MN_RESI_1200401,ID_MN_RESI_1302603,ID_MN_RESI_1500800,...,ABDOMINAL_9,HEPATO_1,HEPATO_2,HEPATO_9,MIOCARDI_2,HIPOTENSAO_2,CHOQUE_1,CHOQUE_2,CHOQUE_9,MANIFESTA_2
0,0.921569,0,1,0,0,0,0,0,0,0,...,0,0,1,0,0,0,1,0,0,0
1,0.27451,0,1,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,1,0,0
2,0.254902,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0.705882,0,1,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,1,0,0
4,0.941176,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


Características selecionadas:

In [16]:
# Display the names of the columns kept in kbest_df.
kbest_df.columns.values

array(['SEM_PRI', 'CS_RACA_9', 'CS_ESCOLAR_3', 'CS_ESCOLAR_4',
       'CS_ESCOLAR_6', 'CS_ESCOLAR_9', 'CS_ZONA_2', 'ID_MN_RESI_1200401',
       'ID_MN_RESI_1302603', 'ID_MN_RESI_1500800', 'ID_MN_RESI_1501402',
       'ID_MN_RESI_1502103', 'ID_MN_RESI_1505536', 'ID_MN_RESI_1506138',
       'ID_MN_RESI_1506807', 'ID_MN_RESI_1508084', 'ID_MN_RESI_1508100',
       'ID_MN_RESI_2100055', 'ID_MN_RESI_2103208', 'ID_MN_RESI_2103307',
       'ID_MN_RESI_2105401', 'ID_MN_RESI_2108009', 'ID_MN_RESI_2108207',
       'ID_MN_RESI_2109106', 'ID_MN_RESI_2114007', 'ID_MN_RESI_2302107',
       'ID_MN_RESI_2302206', 'ID_MN_RESI_2303931', 'ID_MN_RESI_2307304',
       'ID_MN_RESI_2311306', 'ID_MN_RESI_2600500', 'ID_MN_RESI_2600708',
       'ID_MN_RESI_2601201', 'ID_MN_RESI_2602001', 'ID_MN_RESI_2602209',
       'ID_MN_RESI_2602308', 'ID_MN_RESI_2602803', 'ID_MN_RESI_2602902',
       'ID_MN_RESI_2603702', 'ID_MN_RESI_2604205', 'ID_MN_RESI_2605608',
       'ID_MN_RESI_2605905', 'ID_MN_RESI_2606408', 'ID_MN_RE

In [17]:
# NOTE(review): this expression is not the last line of the cell, so its
# value is discarded and nothing is displayed.
type(possible_outliers)
# Display the names of the encoded columns at the positions stored in
# possible_outliers.
new_X.iloc[:, possible_outliers].columns.values

array(['ID_MN_RESI_1302603', 'ID_MN_RESI_3147006', 'ID_MN_RESI_3205309',
       'MIALGIA_2'], dtype=object)

In [21]:
# For each of the four dummy columns below, print its distinct values
# together with how often each occurs - one column per output line.
print(
    *(
        np.unique(new_X[column_name], return_counts=True)
        for column_name in (
            'ID_MN_RESI_1302603',
            'ID_MN_RESI_3147006',
            'ID_MN_RESI_3205309',
            'MIALGIA_2',
        )
    ),
    sep='\n'
)

(array([0, 1], dtype=uint8), array([70383,  6386]))
(array([0, 1], dtype=uint8), array([76768,     1]))
(array([0, 1], dtype=uint8), array([74902,  1867]))
(array([0, 1], dtype=uint8), array([61921, 14848]))
