# Agrupamento dos exemplos

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scipy.stats as stats

In [2]:
df = pd.read_csv('Data/2000-2010/DENGUE_2000_2006.tsv',
                 sep = '\t', encoding='cp1252', dtype = 'unicode')

In [3]:
label_column_name = 'CON_CLASSI'
pos_column_name = 'POSITIVOS'
neg_column_name = 'NEGATIVOS'

Definição da função principal de processamento

In [4]:
def process_data(sample_df, ref_column_name, label_column_name, ):
    df_positive = sample_df[sample_df[label_column_name].isin(['1', '2', '3', '4'])]
    df_negative = sample_df[sample_df[label_column_name] == '5']
    positive_count = df_positive.groupby([ref_column_name])[ref_column_name].count()
    positive_count = positive_count.rename(pos_column_name)
    negative_count = df_negative.groupby([ref_column_name])[ref_column_name].count()
    negative_count = negative_count.rename(neg_column_name)
    df_counts = pd.DataFrame([positive_count, negative_count]).T
    posneg_sum = df_counts.sum(axis = 1)
    group_sum = df_counts.sum(axis = 0)
    df_counts['TOTAL'] = posneg_sum
    df_counts['% POS'] = positive_count.div(posneg_sum)
    df_counts['% NEG'] = negative_count.div(posneg_sum)
    df_counts['% TOTAL_POS'] = positive_count.div(group_sum[pos_column_name])
    df_counts['% TOTAL_NEG'] = negative_count.div(group_sum[neg_column_name])
#     df_counts['% P_TOTAL'] = positive_count.div(group_sum.sum())
#     df_counts['% N_TOTAL'] = negative_count.div(group_sum.sum())
    return df_counts

   ## Agrupamento dos exemplos por grau de escolaridade

In [5]:
ref_column_name = 'CS_ESCOLAR'
sample_df = df.loc[:, [ref_column_name, label_column_name]]
df_counts = process_data(sample_df, ref_column_name, label_column_name)
df_counts

Unnamed: 0_level_0,POSITIVOS,NEGATIVOS,TOTAL,% POS,% NEG,% TOTAL_POS,% TOTAL_NEG
CS_ESCOLAR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,15606,5699,21305,0.732504,0.267496,0.063221,0.062177
2,2224,1440,3664,0.606987,0.393013,0.00901,0.015711
3,95355,39023,134378,0.709603,0.290397,0.386289,0.425746
4,42472,16421,58893,0.721172,0.278828,0.172057,0.179155
5,13541,5293,18834,0.718966,0.281034,0.054855,0.057747
6,15039,7349,22388,0.671744,0.328256,0.060924,0.080178
9,62612,16433,79045,0.792106,0.207894,0.253645,0.179286


## Agrupamento dos exemplos por idade

In [6]:
def parse_age(value):
    result = 0
    nvalue = '0'
    
    if value == np.nan:
        return np.nan
        
    try:
        if 'A' in value:
            result = float(value.replace('A', ''))
#             if n < 10:
#                 result = 10
#             elif n < 20:
#                 result = 20
#             elif n < 30:
#                 result = 30
#             elif n < 40:
#                 result = 40
#             elif n < 50:
#                 result = 50
#             elif n < 60:
#                 result = 60
#             else:
#                 result = 61                
        elif 'M' in value:    
            result = float(value.replace('M', '')) / 12
        elif 'D' in value:    
            result = float(value.replace('M', '')) / 365
        
        return result
    except:
        return result

É necessário agrupar as idades em faixas para melhorar o entendimento

In [7]:
ref_column_name = 'NU_IDADE'
sample_df = df.loc[:, [ref_column_name, label_column_name]]
sample_df['NU_IDADE'] = pd.to_numeric(sample_df[ref_column_name].apply(parse_age))
df_counts = process_data(sample_df, ref_column_name, label_column_name)

conditions = [
    (df_counts.index < 10),
    (df_counts.index >= 10) & (df_counts.index < 20),
    (df_counts.index >= 20) & (df_counts.index < 30),
    (df_counts.index >= 30) & (df_counts.index < 40),
    (df_counts.index >= 40) & (df_counts.index < 50),
    (df_counts.index >= 50) & (df_counts.index < 60),
    (df_counts.index >= 60)
]
#print(df_counts)
choices = ['0 a 10', '10 a 20', '20 a 30', '30 a 40', '40 a 50', '50 a 60', '60+']  
df_counts['FAIXA'] = np.select(conditions, choices, default = 'None')

grp = df_counts.groupby(['FAIXA'])
grp.sum()

Unnamed: 0_level_0,POSITIVOS,NEGATIVOS,TOTAL,% POS,% NEG,% TOTAL_POS,% TOTAL_NEG
FAIXA,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0 a 10,21538.0,10122.0,31660.0,20.934093,14.065907,0.075144,0.099642
10 a 20,48855.0,17986.0,66841.0,7.305971,2.694029,0.17045,0.177055
20 a 30,64174.0,23315.0,87489.0,7.336508,2.663492,0.223897,0.229514
30 a 40,58395.0,20499.0,78894.0,7.402713,2.597287,0.203735,0.201794
40 a 50,44228.0,14588.0,58816.0,7.524869,2.475131,0.154307,0.143605
50 a 60,27296.0,8468.0,35764.0,7.638002,2.361998,0.095233,0.08336
60+,22137.0,6606.0,28743.0,35.466599,12.533401,0.077234,0.06503


Pelo visto pessoas entre 20 e 40 anos são as mais afetadas

In [8]:
# _0_10  = df_counts.loc[0:9.99999, :].sum(axis = 0)
# _10_20 = df_counts.loc[10:19.99999, :].sum(axis = 0)
# _20_30 = df_counts.loc[20:29.99999, :].sum(axis = 0)
# _30_40 = df_counts.loc[30:39.99999, :].sum(axis = 0)
# _40_50 = df_counts.loc[40:49.99999, :].sum(axis = 0)
# _50_60 = df_counts.loc[50:59.99999, :].sum(axis = 0)
# _60_p  = df_counts.loc[60:, :].sum(axis = 0)

# new_df = pd.DataFrame([_0_10, _10_20, _20_30, _30_40, _40_50, _50_60, _60_p])
# new_df

## Agrupamento dos exemplos por sexo

In [9]:
ref_column_name = 'CS_SEXO'
sample_df = df.loc[:, [ref_column_name, label_column_name]]
df_counts = process_data(sample_df, ref_column_name, label_column_name)
df_counts

Unnamed: 0_level_0,POSITIVOS,NEGATIVOS,TOTAL,% POS,% NEG,% TOTAL_POS,% TOTAL_NEG
CS_SEXO,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
F,164666,55957,220623,0.746368,0.253632,0.574506,0.550845
I,619,246,865,0.715607,0.284393,0.00216,0.002422
M,121337,45381,166718,0.727798,0.272202,0.423335,0.446734


## Agrupamento dos exemplos por etnia

In [10]:
ref_column_name = 'CS_RACA'
sample_df = df.loc[:, [ref_column_name, label_column_name]]
df_counts = process_data(sample_df, ref_column_name, label_column_name)
df_counts

Unnamed: 0_level_0,POSITIVOS,NEGATIVOS,TOTAL,% POS,% NEG,% TOTAL_POS,% TOTAL_NEG
CS_RACA,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,6828,4398,11226,0.608231,0.391769,0.318233,0.423536
2,917,541,1458,0.628944,0.371056,0.042739,0.052099
3,313,181,494,0.633603,0.366397,0.014588,0.017431
4,8494,3721,12215,0.695375,0.304625,0.39588,0.35834
5,41,38,79,0.518987,0.481013,0.001911,0.003659
9,4863,1505,6368,0.763662,0.236338,0.22665,0.144935


## Agrupamento dos exemplos por zona de residência

In [11]:
ref_column_name = 'CS_ZONA'
sample_df = df.loc[:, [ref_column_name, label_column_name]]
df_counts = process_data(sample_df, ref_column_name, label_column_name)
df_counts

Unnamed: 0_level_0,POSITIVOS,NEGATIVOS,TOTAL,% POS,% NEG,% TOTAL_POS,% TOTAL_NEG
CS_ZONA,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,264644,90953,355597,0.744225,0.255775,0.944766,0.922978
2,15401,7529,22930,0.671653,0.328347,0.054981,0.076403
3,32,38,70,0.457143,0.542857,0.000114,0.000386
9,39,23,62,0.629032,0.370968,0.000139,0.000233


In [12]:
stats.describe(df_counts['TOTAL'])

DescribeResult(nobs=4, minmax=(62, 355597), mean=94664.75, variance=30376453494.25, skewness=1.141664802365721, kurtosis=-0.6772758388124673)

Não é possível afirmar que a zona de residência dos habitantes tenha de fato alguma correlação com a incidência da dengue, porém nota-se uma proporção ligeiramente maior de incidência na zona urbana (0.74 vs 0.67) 

## Agrupamento dos exemplos pela semana dos primeiros sintomas

In [13]:
ref_column_name = 'SEM_PRI'
sample_df = df.loc[:, [ref_column_name, label_column_name]]
df_counts = process_data(sample_df, ref_column_name, label_column_name)
df_counts = df_counts.loc[df_counts['TOTAL'] > 20, :]
df_counts

Unnamed: 0,POSITIVOS,NEGATIVOS,TOTAL,% POS,% NEG,% TOTAL_POS,% TOTAL_NEG
102001,9317.0,3171.0,12488.0,0.746076,0.253924,0.032506,0.031216
112001,9959.0,3589.0,13548.0,0.73509,0.26491,0.034746,0.03533
12001,3715.0,1436.0,5151.0,0.721219,0.278781,0.012961,0.014136
12002,23.0,8.0,31.0,0.741935,0.258065,8e-05,7.9e-05
122001,10701.0,4174.0,14875.0,0.719395,0.280605,0.037335,0.041089
132001,10865.0,4016.0,14881.0,0.730126,0.269874,0.037907,0.039534
142001,12868.0,4831.0,17699.0,0.727047,0.272953,0.044895,0.047557
152001,12144.0,4534.0,16678.0,0.728145,0.271855,0.042369,0.044633
162001,14740.0,5031.0,19771.0,0.745536,0.254464,0.051426,0.049526
172001,15142.0,4480.0,19622.0,0.771685,0.228315,0.052829,0.044101
