In [6]:
import logging
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler
from mpl_toolkits.mplot3d import Axes3D

In [None]:
def get_logger():
    logger = logging.getLogger('pca')
    logger.setLevel(logging.INFO)
    fh = logging.FileHandler('Data/log/pca.log')
    fh.setLevel(logging.INFO)
    formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(name)s - %(message)s')
    fh.setFormatter(formatter)
    logger.addHandler(fh)
    return logger

logger = get_logger()

In [2]:
nrows = 30000
logger.info('reading %s rows from file' % (nrows))
data = pd.read_csv('Data/2000-2010/dengue_2001_dataset_cleaned.csv', sep = ',', nrows=nrows, error_bad_lines=False, dtype='unicode')
logger.info('%s rows read' % (nrows))

  interactivity=interactivity, compiler=compiler, result=result)


In [3]:
categorical_features = [
    'CS_RACA',
    'CS_ESCOLAR',
    'SG_UF_NOT',
    'CS_SEXO',
    'ID_BAIRRO',
    'ID_MN_RESI',
    'CS_ZONA',
    'SG_UF',
    'ID_PAIS',
    'DENGUE',
    'VACINADO',
    'FEBRE',
    'LACO',
    'CEFALEIA',
    'EXANTEMA',
    'DOR',
    'PROSTACAO',
    'MIALGIA',
    'NAUSEAS',
    'ARTRALGIA',
    'DIARREIA',
    'OUTROS',
    'EPISTAXE',
    'PETEQUIAS',
    'GENGIVO',
    'METRO',
    'HEMATURA',
    'SANGRAM',
    'OUTROS_M',
    'ASCITE',
    'PLEURAL',
    'PERICARDI',
    'ABDOMINAL',
    'HEPATO',
    'MIOCARDI',
    'HIPOTENSAO',
    'CHOQUE',
    'MANIFESTA',
    'INSUFICIEN',
    'OUTRO_S',
    'HOSPITALIZ',
    'UF',
    'S1_IGM',
    'S1_IGG',
    'MATERIAL',
    'SORO1',
    'SORO2',
    'TECIDOS',
    'HISTOPA',
    'IMUNOH',
    'RESUL_PCR',
    'AMOS_OUT',
    'RESUL_OUT',
    'CON_CLASSI',
    'CON_CRITER',
    'CON_INF_UF',
    'CON_INF_PA',
    'CON_DOENCA',
    'CON_EVOLUC',
    'ID_MUNICIP',
    'ID_UNIDADE',
    'ID_DG_NOT',
    'ID_EV_NOT',
    'OCUPACAO',
    'SIN_OUT',
    'OUTROS_M_D',
    'MUNICIPIO',
    'CON_INF_MU',
    'NDUPLIC',
    'NU_LOTE',
]

features = list(data.columns.values)
logger.info('encoding %s features' % (len(categorical_features)))

for feature in features:
    if (feature in categorical_features):
        logger.info('encoding %s' % (feature))
        data[feature] = data[feature].astype('category')
        dummies = pd.get_dummies(data[feature], prefix=feature, prefix_sep='_')
        del data[feature]
        data = data.join(dummies)
        logger.info('%s encoded' % (feature))
               

Unnamed: 0,ID_MUNICIP,ID_UNIDADE,NU_NOTIFIC,DT_NOTIFIC,CS_RACA,CS_ESCOLAR,NU_ANO,SEM_NOT,SG_UF_NOT,ID_REGIONA,...,CON_INF_UF,CON_INF_PA,CON_DOENCA,CON_EVOLUC,CON_DT_OBI,CON_DT_ENC,IN_VINCULA,NDUPLIC,IN_AIDS,NU_LOTE
0,1200013,2001071,7876,05/02/01,,3.0,2001,62001,AC,,...,,,,,,22/07/02,,,,2006011.0
1,1200385,2000997,8413,09/01/01,4.0,2.0,2001,22001,AC,,...,AC,800.0,,1.0,,24/07/02,,,,2006013.0
2,1200385,2000997,8424,12/02/01,4.0,4.0,2001,72001,AC,,...,,,,,,24/07/02,,,,2006013.0
3,1200385,2000997,8425,15/02/01,4.0,3.0,2001,72001,AC,,...,AC,800.0,,1.0,,24/07/02,,,,2006013.0
4,1200385,2000997,8433,21/02/01,4.0,6.0,2001,82001,AC,,...,,800.0,,0.0,,24/07/02,,,,2006013.0
5,1200385,2000997,8434,05/03/01,4.0,3.0,2001,102001,AC,,...,,800.0,,,,24/07/02,,,,2006013.0
6,1200385,2000997,8446,13/03/01,4.0,3.0,2001,112001,AC,,...,,,,,,24/07/02,,,,2006013.0
7,1200385,2000997,8447,12/03/01,4.0,3.0,2001,112001,AC,,...,,800.0,,,,24/07/02,,,,2006013.0
8,1200385,2000997,8460,05/04/01,4.0,,2001,142001,AC,,...,,,,,,24/07/02,,,,2006013.0
9,1200385,2000997,8506,11/11/01,4.0,4.0,2001,462001,AC,,...,,,,,,27/04/02,,0.0,,2007002.0


In [4]:
logger.info('scaling data')
data_std = MinMaxScaler().fit_transform(data);
ncomponents = 100
logger.info('running pca with %s components' % (ncomponents))
pca = PCA(n_components=ncomponents)
pca_result = pca.fit_transform(data_std)
pc = pca.components_
ev = pca.explained_variance_
evr = pca.explained_variance_ratio_
logger.info('representing %s of total variance with %s components' % (evr.sum(), ncomponents))
logger.info(evr)
logger.info(pc)

CS_RACA has 7 uniques
CS_ESCOLAR has 8 uniques
SG_UF_NOT has 28 uniques
CS_SEXO has 4 uniques
ID_BAIRRO has 2935 uniques
ID_MN_RESI has 3533 uniques
CS_ZONA has 5 uniques
SG_UF has 31 uniques
ID_PAIS has 9 uniques
DENGUE has 4 uniques
VACINADO has 4 uniques
FEBRE has 4 uniques
LACO has 4 uniques
CEFALEIA has 4 uniques
EXANTEMA has 4 uniques
DOR has 4 uniques
PROSTACAO has 4 uniques
MIALGIA has 4 uniques
NAUSEAS has 4 uniques
ARTRALGIA has 4 uniques
DIARREIA has 4 uniques
OUTROS has 4 uniques
EPISTAXE has 4 uniques
PETEQUIAS has 4 uniques
GENGIVO has 4 uniques
METRO has 4 uniques
HEMATURA has 4 uniques
SANGRAM has 4 uniques
OUTROS_M has 4 uniques
ASCITE has 4 uniques
PLEURAL has 4 uniques
PERICARDI has 4 uniques
ABDOMINAL has 4 uniques
HEPATO has 4 uniques
MIOCARDI has 4 uniques
HIPOTENSAO has 4 uniques
CHOQUE has 4 uniques
MANIFESTA has 4 uniques
INSUFICIEN has 4 uniques
OUTRO_S has 4 uniques
HOSPITALIZ has 4 uniques
UF has 25 uniques
S1_IGM has 5 uniques
S1_IGG has 5 uniques
MATERIAL 

In [8]:
fig = plt.figure()
x = list(range(11))
bars = np.concatenate((evr[:5], evr[74:80]), axis = None)
plt.bar(x, bars)
plt.xticks(x, ("C1", "C2", "C3", "C4", "C5", "C75", "C76", "C77", "C78", "C79", "C80"))
plt.show()

ValueError: could not convert string to float: 'S'