In [1]:
import datetime
import time
import pandas as pd
import numpy as np
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.feature_selection import f_classif
from sklearn.preprocessing import MinMaxScaler

In [2]:
def parse_age(value, default=0):
    """Convert an age code with a unit letter into an age in years.

    The code is expected to be a string containing one unit letter:
    ``'A'`` (value already in years), ``'M'`` (months, divided by 12) or
    ``'D'`` (days, divided by 365). The letters are tested in that order and
    the first one found decides the unit.

    Parameters
    ----------
    value : str or missing
        Raw age code. Anything that is not a string (NaN, None, numbers)
        is treated as missing.
    default : number, optional
        Value returned when the age is missing, has no unit letter, or its
        numeric part cannot be parsed. Defaults to 0, which is what the
        function returned in all of those cases before; pass ``np.nan`` to
        keep missing ages distinguishable from newborns.

    Returns
    -------
    float or type of ``default``
        Age in years, or ``default`` when no age could be extracted.
    """
    # An equality test against np.nan can never succeed (NaN != NaN), so
    # missing values are recognised by type instead: only strings can carry
    # a unit letter.
    if not isinstance(value, str):
        return default

    # (unit letter, how many of that unit make one year)
    for unit, per_year in (('A', 1), ('M', 12), ('D', 365)):
        if unit in value:
            try:
                return float(value.replace(unit, '')) / per_year
            except ValueError:
                # Unit letter present but the remainder is not a number.
                return default

    return default

In [3]:
def parse_job(value):
    """Strip every ``'X'`` from an occupation code.

    Parameters
    ----------
    value : str or missing
        Raw occupation code.

    Returns
    -------
    str or float
        The code with all ``'X'`` characters removed, or ``np.nan`` when the
        input is not a string or contains no ``'X'``.

    Notes
    -----
    Previously a string without ``'X'`` fell off the end of the function and
    returned ``None``, while every other failure returned ``np.nan``. A single
    missing-value marker is used now; ``fillna`` treats both the same way.
    NOTE(review): treating codes without an ``'X'`` as missing mirrors the old
    behaviour - confirm that such codes should not be kept as they are.
    """
    # `value == np.nan` is always False, so missing values are detected by
    # type: only strings can contain the marker.
    if not isinstance(value, str) or 'X' not in value:
        return np.nan

    return value.replace('X', '')

In [4]:
def parse_week(value):
    """Return the number that precedes the last four characters of a code.

    The value is converted to text, its last four characters are dropped and
    what remains is parsed as a float.
    NOTE(review): presumably the dropped suffix is a four-digit year and the
    prefix is the week number - confirm against the data dictionary.

    Parameters
    ----------
    value : str or missing
        Raw code.

    Returns
    -------
    float
        The parsed prefix, or 0.0 when the value is missing or has at most
        four characters.

    Raises
    ------
    ValueError
        If the prefix is not empty and is not a valid number.
    """
    # NaN is not equal to anything, itself included, so comparing with
    # `== np.nan` never detects it; pd.isna does (and also covers None/pd.NA).
    if pd.isna(value):
        return 0.0

    prefix = str(value)[:-4]
    return float(prefix) if prefix else 0.0

In [5]:
# Load up to 1,000,000 rows of the tab-separated dengue file, requesting the
# 'unicode' dtype for every column, and print the elapsed load time.
now = time.time()
df = pd.read_csv(
    'Data/2000-2010/DENGUE_2000_2006.tsv',
    sep='\t',
    nrows=1000000,
    dtype='unicode',
)
#df = pd.read_csv('Data/2000-2010/dengue_2001_dataset.csv', sep = ',', nrows=50000, encoding='cp1252', dtype = 'unicode')
load_seconds = time.time() - now
print('tempo para carregar o dataset = %s' % time.strftime("%H:%M:%S", time.gmtime(load_seconds)))

tempo para carregar o dataset = 00:00:00


In [6]:
# Print the frame's shape, keep only the rows whose CON_CLASSI is one of the
# codes '1'..'5', then print the shape again.
print('Quantidade de características iniciais: {}'.format(df.shape))
classified_rows = df['CON_CLASSI'].isin(['1', '2', '3', '4', '5'])
df = df.loc[classified_rows]
print('Quantidade de características classificadas: {}'.format(df.shape))

Quantidade de características iniciais: (1000, 113)
Quantidade de características classificadas: (1000, 113)


In [7]:
# Pre-processing: parse the age, occupation and week columns into usable
# values, then split the frame into the feature matrix X and the label y.
#
# NOTE: the parsed values overwrite the raw columns of `df`, so this cell must
# run exactly once after the filtering cell. Running it again would feed the
# parsers their own output instead of the raw codes.
now = time.time()
hand_picked_features = [
    'CS_RACA', 'CS_ESCOLAR', 'NU_IDADE', 'CS_SEXO',
    'CS_ZONA', 'ID_MN_RESI', 'DENGUE', 'OCUPACAO', 'SEM_PRI',
    'NU_ANO', 'VACINADO', 'FEBRE', 'DURACAO', 'LACO',
    'CEFALEIA', 'EXANTEMA', 'DOR', 'PROSTACAO', 'MIALGIA',
    'NAUSEAS', 'ARTRALGIA', 'DIARREIA', 'OUTROS', 'INSUFICIEN',
    'EPISTAXE', 'PETEQUIAS', 'GENGIVO', 'METRO', 'HEMATURA',
    'SANGRAM', 'ASCITE', 'PLEURAL', 'PERICARDI', 'ABDOMINAL',
    'HEPATO', 'MIOCARDI', 'HIPOTENSAO', 'CHOQUE', 'MANIFESTA']
label = 'CON_CLASSI'

# Numeric columns: parse the coded strings, then force float64.
df['NU_IDADE'] = df['NU_IDADE'].apply(parse_age).astype('float64')
df['NU_ANO'] = df['NU_ANO'].astype('float64')
df['SEM_PRI'] = df['SEM_PRI'].apply(parse_week).astype('float64')

# Occupation: strip the marker, use 0 for missing codes, store as category.
df['OCUPACAO'] = df['OCUPACAO'].apply(parse_job)
df = df.fillna(value={'OCUPACAO': 0})
df['OCUPACAO'] = df['OCUPACAO'].astype('category')

# Explicit copies: X is modified in place by the scaling cell, and that must
# never write through to (or raise a copy warning about) `df`.
X = df.loc[:, hand_picked_features].copy()
y = df.loc[:, label].copy()

# X = X.iloc[2400:2600, :]
# y = y.iloc[2400:2600]
# y = y.fillna(0).astype('int64')

# X.to_csv('Data/2000-2010/fs_dengue_2001_dataset.csv', sep = '\t')
print('tempo de pré-processamento = ' + time.strftime("%H:%M:%S", time.gmtime(time.time() - now)))

tempo de pré-processamento = 00:00:00


In [8]:
# print(X.shape)
# print(y.shape)
# print(X.columns.values)

In [9]:
# Rescale the three numeric columns of X with MinMaxScaler, overwriting them
# in place, and print how long it took.
now = time.time()
columns_to_scale = ['NU_IDADE', 'NU_ANO', 'SEM_PRI']
scaled_values = MinMaxScaler().fit_transform(X[columns_to_scale])
X[columns_to_scale] = scaled_values
scaling_elapsed = time.strftime("%H:%M:%S", time.gmtime(time.time() - now))
print('tempo para normalizar %s características = %s' % (len(columns_to_scale), scaling_elapsed))

tempo para normalizar 3 características = 00:00:00


In [10]:
# One-hot encode X with pandas.get_dummies and print the elapsed time.
now = time.time()
new_X = pd.get_dummies(X)
ohe_elapsed = time.strftime("%H:%M:%S", time.gmtime(time.time() - now))
print('tempo do OHE = %s' % ohe_elapsed)

tempo do OHE = 00:00:00


In [11]:
# Preview the first five rows of the one-hot encoded frame.
#print(new_X.shape)
#print(type(new_X))
new_X.head(n=5)


Unnamed: 0,NU_IDADE,SEM_PRI,NU_ANO,CS_RACA_9,CS_ESCOLAR_1,CS_ESCOLAR_3,CS_ESCOLAR_4,CS_ESCOLAR_5,CS_ESCOLAR_6,CS_ESCOLAR_9,...,PLEURAL_9,ABDOMINAL_1,ABDOMINAL_2,ABDOMINAL_9,HEPATO_1,HEPATO_2,HEPATO_9,CHOQUE_1,CHOQUE_2,CHOQUE_9
0,0.605249,0.921569,0.0,0,0,1,0,0,0,0,...,0,1,0,0,0,1,0,1,0,0
1,0.302606,0.27451,0.0,0,0,1,0,0,0,0,...,0,1,0,0,0,1,0,0,1,0
2,0.263131,0.254902,0.0,0,0,0,0,0,1,0,...,0,1,0,0,0,0,0,0,0,0
3,0.394715,0.705882,0.0,0,0,1,0,0,0,0,...,0,1,0,0,0,0,1,0,1,0
4,0.105231,0.941176,0.0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0


In [12]:
# Score every encoded feature against the label with the chi-squared test.
# k='all' keeps all features, so this step only computes the scores.
now = time.time()
kbest_result = SelectKBest(score_func=chi2, k='all')
kbest_result.fit(new_X, y)
chi2_elapsed = time.strftime("%H:%M:%S", time.gmtime(time.time() - now))
print('tempo para aplicar o qui-quadrado = %s' % chi2_elapsed)
kbest_result

tempo para aplicar o qui-quadrado = 00:00:00


SelectKBest(k='all', score_func=<function chi2 at 0x115d150d0>)

In [13]:

# Summarise the chi-squared scores (NaN scores are replaced by 0 first) and
# select the features whose score is strictly above the mean score.
scores = np.nan_to_num(kbest_result.scores_)
print('Quantidade de características: %s' % len(scores))

min_score, max_score = scores.min(), scores.max()
mean, std = scores.mean(), scores.std()

print('Pontuações [mínima: %s, média: %s, máxima: %s, desvio padrão: %s]'
      % (min_score, mean, max_score, std))

total_features = new_X.shape[1]
# np.where returns a one-element tuple for a 1-D input; its only entry holds
# the integer positions of the selected features.
selected_indices = np.where(scores > mean)
mask = selected_indices[0]
# Positions of features with a score above 1000, inspected in a later cell.
possible_outliers = np.flatnonzero(scores > 1000)
n_features = len(mask)
print('Quantidade de características selecionadas: %s' % n_features)
print('Quantidade de características abaixo da média: %s' % (total_features - n_features))
print('Quantidade de características acima da média: %s' % n_features)

Quantidade de características: 82
Pontuações [mínima: 0.0, média: 8.079842043087616, máxima: 92.47067813566734, desvio padrão: 18.401590224023092]
Quantidade de características selecionadas: 16
Quantidade de características abaixo da média: 66
Quantidade de características acima da média: 16


In [14]:
#print(len(kbest_result.pvalues_))
#kbest_result.pvalues_

In [15]:
# Keep only the columns selected by the above-mean chi-squared filter.
# `mask` holds integer column positions, so the selection is positional.
# mask = kbest_result.get_support(indices=True)
# print(mask)
kbest_df = new_X.take(mask, axis=1)
#print(kbest_df.shape)
#kbest_df.head()

Características selecionadas:

In [16]:
# Names of the columns kept by the chi-squared filter, as a NumPy array.
np.asarray(kbest_df.columns)

array(['CS_ESCOLAR_6', 'FEBRE_2', 'CEFALEIA_1', 'CEFALEIA_2', 'DOR_1',
       'DOR_2', 'PROSTACAO_1', 'MIALGIA_1', 'MIALGIA_2', 'NAUSEAS_1',
       'ARTRALGIA_1', 'ARTRALGIA_2', 'PETEQUIAS_1', 'PLEURAL_9',
       'ABDOMINAL_1', 'CHOQUE_1'], dtype=object)

In [17]:
# Names of the columns whose chi-squared score exceeded the outlier threshold.
new_X.columns[possible_outliers].values

array([], dtype=object)

In [18]:
# print(np.unique(new_X['ID_MN_RESI_1302603'], return_counts=True))
# print(np.unique(new_X['ID_MN_RESI_3147006'], return_counts=True))
# print(np.unique(new_X['ID_MN_RESI_3205309'], return_counts=True))
# print(np.unique(new_X['MIALGIA_2'], return_counts=True))

# Método RFE

In [19]:
from sklearn.feature_selection import RFE
from sklearn.svm import SVR

In [20]:
# Recursive feature elimination down to `n_features` columns (the number kept
# by the chi-squared filter). A fractional step of 0.10 removes a tenth of
# the features at each iteration.
# NOTE(review): SVR is a regressor, while `y` holds class codes - confirm that
# regressing on the codes is intended rather than using a linear classifier.
now = time.time()
estimator = SVR(kernel="linear")
selector = RFE(
    estimator=estimator,
    n_features_to_select=n_features,
    step=0.10,
    verbose=1,
).fit(new_X, y)
rfe_elapsed = time.strftime("%H:%M:%S", time.gmtime(time.time() - now))
print('tempo para selectionar %s características com o RFE = %s' % (n_features, rfe_elapsed))

Fitting estimator with 82 features.
Fitting estimator with 74 features.
Fitting estimator with 66 features.
Fitting estimator with 58 features.
Fitting estimator with 50 features.
Fitting estimator with 42 features.
Fitting estimator with 34 features.
Fitting estimator with 26 features.
Fitting estimator with 18 features.
tempo para selectionar 16 características com o RFE = 00:00:00


In [21]:
# Names of the columns kept by RFE (selector.support_ is used as a mask over
# the columns).
new_X.columns[selector.support_].values

array(['CS_ESCOLAR_9', 'ID_MN_RESI_1200013', 'ID_MN_RESI_1200708',
       'ID_MN_RESI_2509370', 'FEBRE_1', 'FEBRE_2', 'CEFALEIA_1',
       'CEFALEIA_2', 'CEFALEIA_9', 'EXANTEMA_9', 'DOR_1', 'DOR_2',
       'ASCITE_9', 'PLEURAL_9', 'ABDOMINAL_9', 'CHOQUE_9'], dtype=object)