# Agrupamento dos exemplos

In [14]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scipy.stats as stats

In [16]:
df = pd.read_csv('Data/2000-2010/DENGUE_2000_2006.tsv',
                 sep = '\t', dtype = 'unicode')

In [17]:
label_column_name = 'CON_CLASSI'
pos_column_name = 'POSITIVOS'
neg_column_name = 'NEGATIVOS'

Definição da função principal de processamento

In [18]:
def process_data(sample_df, ref_column_name, label_column_name, ):
    df_positive = sample_df[sample_df[label_column_name].isin(['1', '2', '3', '4'])]
    df_negative = sample_df[sample_df[label_column_name] == '5']
    positive_count = df_positive.groupby([ref_column_name])[ref_column_name].count()
    positive_count = positive_count.rename(pos_column_name)
    negative_count = df_negative.groupby([ref_column_name])[ref_column_name].count()
    negative_count = negative_count.rename(neg_column_name)
    df_counts = pd.DataFrame([positive_count, negative_count]).T
    posneg_sum = df_counts.sum(axis = 1)
    group_sum = df_counts.sum(axis = 0)
    df_counts['TOTAL'] = posneg_sum
    df_counts['% POS'] = positive_count.div(posneg_sum)
    df_counts['% NEG'] = negative_count.div(posneg_sum)
    df_counts['% TOTAL_POS'] = positive_count.div(group_sum[pos_column_name])
    df_counts['% TOTAL_NEG'] = negative_count.div(group_sum[neg_column_name])
#     df_counts['% P_TOTAL'] = positive_count.div(group_sum.sum())
#     df_counts['% N_TOTAL'] = negative_count.div(group_sum.sum())
    return df_counts

   ## Agrupamento dos exemplos por grau de escolaridade

In [19]:
ref_column_name = 'CS_ESCOLAR'
sample_df = df.loc[:, [ref_column_name, label_column_name]]
df_counts = process_data(sample_df, ref_column_name, label_column_name)
df_counts

Unnamed: 0_level_0,POSITIVOS,NEGATIVOS,TOTAL,% POS,% NEG,% TOTAL_POS,% TOTAL_NEG
CS_ESCOLAR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,67913,38085,105998,0.640701,0.359299,0.053324,0.054497
2,102381,74741,177122,0.578025,0.421975,0.080387,0.106949
3,371663,201663,573326,0.648258,0.351742,0.291822,0.288566
4,236331,142092,378423,0.624515,0.375485,0.185562,0.203324
5,88278,56444,144722,0.609983,0.390017,0.069314,0.080768
6,83521,60968,144489,0.578044,0.421956,0.065579,0.087241
9,323509,124852,448361,0.721537,0.278463,0.254012,0.178655


## Agrupamento dos exemplos por idade

In [20]:
def parse_age(value):
    result = 0
    nvalue = '0'
    
    if value == np.nan:
        return np.nan
        
    try:
        if 'A' in value:
            result = float(value.replace('A', ''))
#             if n < 10:
#                 result = 10
#             elif n < 20:
#                 result = 20
#             elif n < 30:
#                 result = 30
#             elif n < 40:
#                 result = 40
#             elif n < 50:
#                 result = 50
#             elif n < 60:
#                 result = 60
#             else:
#                 result = 61                
        elif 'M' in value:    
            result = float(value.replace('M', '')) / 12
        elif 'D' in value:    
            result = float(value.replace('M', '')) / 365
        
        return result
    except:
        return result

É necessário agrupar as idades em faixas para melhorar o entendimento

In [21]:
ref_column_name = 'NU_IDADE'
sample_df = df.loc[:, [ref_column_name, label_column_name]]
sample_df['NU_IDADE'] = pd.to_numeric(sample_df[ref_column_name].apply(parse_age))
df_counts = process_data(sample_df, ref_column_name, label_column_name)

conditions = [
    (df_counts.index < 10),
    (df_counts.index >= 10) & (df_counts.index < 20),
    (df_counts.index >= 20) & (df_counts.index < 30),
    (df_counts.index >= 30) & (df_counts.index < 40),
    (df_counts.index >= 40) & (df_counts.index < 50),
    (df_counts.index >= 50) & (df_counts.index < 60),
    (df_counts.index >= 60)
]
#print(df_counts)
choices = ['0 a 10', '10 a 20', '20 a 30', '30 a 40', '40 a 50', '50 a 60', '60+']  
df_counts['FAIXA'] = np.select(conditions, choices, default = 'None')

grp = df_counts.groupby(['FAIXA'])
grp.sum()

Unnamed: 0_level_0,POSITIVOS,NEGATIVOS,TOTAL,% POS,% NEG,% TOTAL_POS,% TOTAL_NEG
FAIXA,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0 a 10,125693.0,88209.0,213902.0,38.374676,18.625324,0.083208,0.111949
10 a 20,268651.0,146610.0,415261.0,6.462629,3.537371,0.177846,0.186067
20 a 30,346607.0,182375.0,528982.0,6.553583,4.446417,0.229453,0.231457
30 a 40,295688.0,147955.0,443643.0,6.666349,3.333651,0.195744,0.187774
40 a 50,222932.0,107548.0,330480.0,6.745698,3.254302,0.14758,0.136492
50 a 60,137730.0,64124.0,201854.0,6.825115,3.174885,0.091177,0.081382
60+,113281.0,51121.0,164402.0,47.006186,30.993814,0.074992,0.064879


Pelo visto pessoas entre 20 e 40 anos são as mais afetadas

In [22]:
# _0_10  = df_counts.loc[0:9.99999, :].sum(axis = 0)
# _10_20 = df_counts.loc[10:19.99999, :].sum(axis = 0)
# _20_30 = df_counts.loc[20:29.99999, :].sum(axis = 0)
# _30_40 = df_counts.loc[30:39.99999, :].sum(axis = 0)
# _40_50 = df_counts.loc[40:49.99999, :].sum(axis = 0)
# _50_60 = df_counts.loc[50:59.99999, :].sum(axis = 0)
# _60_p  = df_counts.loc[60:, :].sum(axis = 0)

# new_df = pd.DataFrame([_0_10, _10_20, _20_30, _30_40, _40_50, _50_60, _60_p])
# new_df

## Agrupamento dos exemplos por sexo

In [23]:
ref_column_name = 'CS_SEXO'
sample_df = df.loc[:, [ref_column_name, label_column_name]]
df_counts = process_data(sample_df, ref_column_name, label_column_name)
df_counts

Unnamed: 0_level_0,POSITIVOS,NEGATIVOS,TOTAL,% POS,% NEG,% TOTAL_POS,% TOTAL_NEG
CS_SEXO,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
F,861576,428330,1289906,0.667937,0.332063,0.570361,0.543606
I,2560,699,3259,0.785517,0.214483,0.001695,0.000887
M,646445,358913,1005358,0.643,0.357,0.427945,0.455507


## Agrupamento dos exemplos por etnia

In [24]:
ref_column_name = 'CS_RACA'
sample_df = df.loc[:, [ref_column_name, label_column_name]]
df_counts = process_data(sample_df, ref_column_name, label_column_name)
df_counts

Unnamed: 0_level_0,POSITIVOS,NEGATIVOS,TOTAL,% POS,% NEG,% TOTAL_POS,% TOTAL_NEG
CS_RACA,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,311395,239780,551175,0.564966,0.435034,0.410836,0.454617
2,43801,32223,76024,0.576147,0.423853,0.057788,0.061094
3,11523,7618,19141,0.602006,0.397994,0.015203,0.014444
4,270352,183190,453542,0.59609,0.40391,0.356686,0.347324
5,1644,1302,2946,0.558045,0.441955,0.002169,0.002469
9,119240,63320,182560,0.653155,0.346845,0.157318,0.120053


## Agrupamento dos exemplos por zona de residência

In [25]:
ref_column_name = 'CS_ZONA'
sample_df = df.loc[:, [ref_column_name, label_column_name]]
df_counts = process_data(sample_df, ref_column_name, label_column_name)
df_counts

Unnamed: 0_level_0,POSITIVOS,NEGATIVOS,TOTAL,% POS,% NEG,% TOTAL_POS,% TOTAL_NEG
CS_ZONA,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,1356791,682418,2039209,0.665352,0.334648,0.933286,0.901832
2,91589,70028,161617,0.566704,0.433296,0.063001,0.092544
3,4257,3520,7777,0.547383,0.452617,0.002928,0.004652
9,1141,736,1877,0.607885,0.392115,0.000785,0.000973


In [26]:
stats.describe(df_counts['TOTAL'])

DescribeResult(nobs=4, minmax=(1877, 2039209), mean=552620.0, variance=987667315876.0, skewness=1.1358944866548513, kurtosis=-0.6821315130065169)

Não é possível afirmar que a zona de residência dos habitantes tenha de fato alguma correlação com a incidência da dengue, porém nota-se uma proporção ligeiramente maior de incidência na zona urbana (0.74 vs 0.67) 

## Agrupamento dos exemplos pela semana dos primeiros sintomas

In [27]:
ref_column_name = 'SEM_PRI'
sample_df = df.loc[:, [ref_column_name, label_column_name]]
df_counts = process_data(sample_df, ref_column_name, label_column_name)
df_counts = df_counts.loc[df_counts['TOTAL'] > 20, :]
df_counts

Unnamed: 0,POSITIVOS,NEGATIVOS,TOTAL,% POS,% NEG,% TOTAL_POS,% TOTAL_NEG
002006,59.0,49.0,108.0,0.546296,0.453704,0.000039,0.000062
010620,19.0,16.0,35.0,0.542857,0.457143,0.000013,0.000020
012000,2923.0,1047.0,3970.0,0.736272,0.263728,0.001935,0.001329
012001,3741.0,1448.0,5189.0,0.720948,0.279052,0.002477,0.001838
012002,5980.0,2303.0,8283.0,0.721961,0.278039,0.003959,0.002923
012003,2616.0,1956.0,4572.0,0.572178,0.427822,0.001732,0.002482
012004,1463.0,1536.0,2999.0,0.487829,0.512171,0.000969,0.001949
012005,1657.0,1284.0,2941.0,0.563414,0.436586,0.001097,0.001630
012006,2419.0,2299.0,4718.0,0.512717,0.487283,0.001601,0.002918
012007,18.0,10.0,28.0,0.642857,0.357143,0.000012,0.000013
