In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

In [2]:
def parse_age(value):
    """Convert an age code into a number of years.

    The code is a number combined with a unit letter. ``'A'`` values are
    used as-is, ``'M'`` values are divided by 12 and ``'D'`` values by 365,
    so the result is expressed in years (presumably A/M/D stand for
    years/months/days). The letters are tested in that order and the first
    one found in the string decides the unit.

    Parameters
    ----------
    value : object
        Raw cell value, e.g. ``'23A'``, ``'6M'`` or ``'15D'``.

    Returns
    -------
    float
        The age in years, or ``0.0`` when ``value`` is missing (NaN/None),
        is not a string, contains no unit letter, or has a non-numeric
        remainder once the unit letter is removed.
    """
    default = 0.0

    # NaN never compares equal to anything (not even itself), so a missing
    # value cannot be detected with ``== np.nan``. A type check covers NaN,
    # None and every other non-string input in one place.
    if not isinstance(value, str):
        return default

    # (unit letter, how many of that unit make up one year)
    units = (('A', 1), ('M', 12), ('D', 365))
    for letter, per_year in units:
        if letter in value:
            try:
                return float(value.replace(letter, '')) / per_year
            except ValueError:
                # Remainder is not a number (empty or garbled code).
                return default

    return default

In [3]:
def parse_job(value):
    """Strip the ``'X'`` marker from an occupation code.

    Parameters
    ----------
    value : object
        Raw cell value.

    Returns
    -------
    str or float
        ``value`` with every ``'X'`` removed when it is a string that
        contains ``'X'``; ``np.nan`` in every other case (missing values,
        non-string input, and strings without an ``'X'``).
    """
    # ``value == np.nan`` is always False, so missing values have to be
    # recognised by type instead of by comparison.
    if not isinstance(value, str):
        return np.nan

    # Codes without the marker are reported as missing, made explicit here
    # rather than left to an implicit ``None`` return.
    # NOTE(review): confirm that unmarked codes should really be discarded.
    if 'X' not in value:
        return np.nan

    return value.replace('X', '')

In [4]:
# Read the first 3000 rows of the tab-separated 2000-2006 extract,
# requesting a string dtype for every column.
df = pd.read_csv(
    'Data/2000-2010/DENGUE_2000_2006.tsv',
    sep='\t',
    nrows=3000,
    dtype='unicode',
)
# Alternative input (comma-separated, cp1252-encoded 2001 extract):
#df = pd.read_csv('Data/2000-2010/dengue_2001_dataset.csv', sep = ',', nrows=50000, encoding='cp1252', dtype = 'unicode')

In [5]:
# Candidate predictor columns, chosen by hand.
hand_picked_features = [
    'CS_RACA', 'CS_ESCOLAR', 'NU_IDADE', 'CS_SEXO',
    'CS_ZONA', 'ID_MN_RESI', 'DENGUE', 'OCUPACAO',
    'NU_ANO', 'VACINADO', 'FEBRE', 'DURACAO', 'LACO',
    'CEFALEIA', 'EXANTEMA', 'DOR', 'PROSTACAO', 'MIALGIA',
    'NAUSEAS', 'ARTRALGIA', 'DIARREIA', 'OUTROS', 'INSUFICIEN',
    'EPISTAXE', 'PETEQUIAS', 'GENGIVO', 'METRO', 'HEMATURA',
    'SANGRAM', 'ASCITE', 'PLEURAL', 'PERICARDI', 'ABDOMINAL',
    'HEPATO', 'MIOCARDI', 'HIPOTENSAO', 'CHOQUE', 'MANIFESTA']
# Column used as the target.
label = 'CON_CLASSI'

# The conversions below overwrite columns of df in place, so each parser is
# applied only while its column is still unparsed. Without these guards a
# second run of this cell would feed already-converted values back into the
# parsers: parse_age() returns 0 for anything that is not a string, and
# parse_job() yields a missing value for codes that no longer contain 'X',
# so both columns would silently collapse to 0.
if not pd.api.types.is_integer_dtype(df['NU_IDADE']):
    # The int64 cast keeps only the whole-number part of the parsed age.
    df['NU_IDADE'] = df['NU_IDADE'].apply(parse_age).astype('int64')

df['NU_ANO'] = df['NU_ANO'].astype('int64')

if df['OCUPACAO'].dtype.name != 'category':
    df['OCUPACAO'] = df['OCUPACAO'].apply(parse_job)
    # Missing occupations are encoded as 0 before the categorical cast.
    df = df.fillna(value={'OCUPACAO': 0})
    df['OCUPACAO'] = df['OCUPACAO'].astype('category')


X = df.loc[:, hand_picked_features]
y = df.loc[:, label]

# X = X.iloc[2400:2600, :]
# y = y.iloc[2400:2600]
# Rows without a label are assigned class 0.
y = y.fillna(0).astype('int64')

# X.to_csv('Data/2000-2010/fs_dengue_2001_dataset.csv', sep = '\t')

In [6]:
# Dimensions of the feature frame, then of the label series (one per line).
print(X.shape, y.shape, sep='\n')

(3000, 38)
(3000,)


In [7]:
# Expand the non-numeric columns of X into indicator (dummy) columns.
new_X = pd.get_dummies(data=X)

In [8]:
#new_X = new_X.fillna(value={'OCUPACAO':0})
#new_X = new_X.dropna(how='all')

# new_X.to_csv('Data/2000-2010/fs_dengue_00_06_dataset.csv', sep = '\t')

# Score every encoded column against y with the chi-squared statistic and
# fit a selector that keeps the 10 highest-scoring ones.
kbest_result = SelectKBest(score_func=chi2, k=10).fit(new_X, y)
kbest_result

SelectKBest(k=10, score_func=<function chi2 at 0x113c8c0d0>)

In [9]:
# How many columns were scored, followed by the score of each one.
print(kbest_result.scores_.shape[0])
kbest_result.scores_

138


array([3.02774364e+02, 0.00000000e+00, 6.81490385e-01, 2.49956932e+01,
       5.17095551e+00, 2.67268739e+01, 1.04050589e+01, 5.04122904e+01,
       2.27268195e+01, 3.50949265e+00, 1.50000000e+00, 4.58782436e+00,
       8.84673314e+00, 6.09513356e+01, 1.51442308e-01, 4.74358974e-01,
       3.85737179e+00, 3.02884615e-01, 4.46514423e-01, 3.36057692e+00,
       2.20993590e+00, 1.50000000e+00, 9.42307692e-01, 1.06842047e+02,
       4.79585799e+00, 4.74358974e-01, 2.07572115e+00, 1.50000000e+00,
       1.03228022e+00, 1.51442308e-01, 1.50000000e+00, 4.31127820e+01,
       2.15563910e+01, 8.02884615e-01, 3.13005640e+01, 1.51442308e-01,
       2.15563910e+01, 8.02884615e-01, 2.18722879e+02, 8.02884615e-01,
       8.02884615e-01, 1.05281955e+01, 4.31127820e+01, 8.02884615e-01,
       2.15563910e+01, 1.05000000e+01, 2.15563910e+01, 1.50000000e+00,
       8.02884615e-01, 2.06512941e+01, 1.72096588e+01, 4.31127820e+01,
       5.78605769e+00, 1.08923283e+01, 6.00000000e+00, 6.95308858e+00,
      

In [10]:
# How many p-values the fitted selector holds.
print(kbest_result.pvalues_.shape[0])
#kbest_result.pvalues_

138


In [13]:
# Boolean mask over the columns of new_X: True for the columns the fitted
# selector kept.
mask = kbest_result.get_support(indices=False)

# Restrict the encoded frame to the selected columns and display it.
kbest_df = new_X.loc[:, mask]
kbest_df

Unnamed: 0,NU_IDADE,ID_MN_RESI_2702108,ID_MN_RESI_2707701,ID_MN_RESI_2709152,DOR_1,DOR_2,PROSTACAO_1,MIALGIA_1,MIALGIA_2,ARTRALGIA_1
0,46,0,0,0,1,0,1,1,0,1
1,23,0,0,0,1,0,0,1,0,1
2,20,0,0,0,1,0,0,0,1,0
3,30,0,0,0,0,1,0,1,0,0
4,8,0,0,0,0,0,0,0,0,0
5,21,0,0,0,0,1,0,0,1,0
6,24,0,0,0,1,0,1,1,0,1
7,25,0,0,0,1,0,1,1,0,1
8,57,0,0,0,1,0,0,1,0,1
9,25,0,0,0,0,1,1,0,1,0
