# Validação do Sistema
Estes dados de validação aparecem no artigo submetido para SBBD 2013

In [642]:
import pandas as pd
import os
import glob

In [643]:
from scipy import stats
def avaliar_corr(df, coluna_base):
    """Compute the Spearman correlation of every numeric column against a base column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the base column and the columns to compare with it.
    coluna_base : str
        Name of the column used as the first argument of every correlation.

    Returns
    -------
    pandas.DataFrame
        One row per numeric column of ``df`` (the base column included, when
        it is numeric), indexed by column name, with the columns ``r``
        (coefficient rounded to 4 decimals) and ``p`` (p-value as returned by
        ``scipy.stats.spearmanr``).
    """
    resultado = pd.DataFrame(columns=["r", "p"])

    for nome in df.columns:
        serie = df[nome]
        # Non-numeric columns are skipped
        if not pd.api.types.is_numeric_dtype(serie):
            continue
        coeficiente, p_valor = stats.spearmanr(df[coluna_base], serie)
        resultado.loc[nome] = [round(coeficiente, 4), p_valor]

    return resultado

## Carregando datasets

In [644]:
# Dataset of population arrangements and municipalities
df_arr_mun = pd.read_csv("../dados_final/2_dados_sem_enriquecimento/arr_mun.csv")

# Dataset of states
df_estado = pd.read_csv("../dados_final/2_dados_sem_enriquecimento/estado.csv")

# Dataset of probabilities
df_prob = pd.read_csv("../dados_final/4_dados_calculados/arr_calculo_qtd_fluxo.csv")

# Dataset of initial cases (Alpha)
df_alpha = pd.read_csv("data/brazil_covid19.csv")

# Dataset of Gamma cases
# Collect the list of .tsv files. glob returns them in arbitrary order, so the
# list is sorted to make the concatenation order reproducible across runs.
path = "../data_fiocruz"
tsv_files = sorted(glob.glob(os.path.join(path, "*.tsv")))
if not tsv_files:
    # Fail with an explicit message instead of pandas' generic
    # "No objects to concatenate" error.
    raise FileNotFoundError(f"No .tsv files found in {path!r}")
df_list = [pd.read_csv(f, sep='\t') for f in tsv_files]
# Combine everything into a single dataframe
df_gamma = pd.concat(df_list)

### Tratamento de dados

DF de probabilidades

In [645]:
# Add the origin and destination UF to the probabilities dataframe
uf_por_cidade = df_arr_mun[["cod_cidade", "uf"]]

# UF of the origin city
df_prob = (
    df_prob
    .merge(uf_por_cidade, left_on="cod_origem", right_on="cod_cidade")
    .rename(columns={"uf": "uf_origem"})
    .drop(columns="cod_cidade")
)

# UF of the destination city
df_prob = (
    df_prob
    .merge(uf_por_cidade, left_on="cod_destino", right_on="cod_cidade")
    .rename(columns={"uf": "uf_destino"})
    .drop(columns="cod_cidade")
)

DF da Alpha

In [646]:
# Parse the whole column in one vectorized call rather than element by element
df_alpha["date"] = pd.to_datetime(df_alpha["date"])

DF da Gamma

In [647]:
# Convert the date columns to datetime
date_cols = ["Collection date", "Submission date"]
df_gamma[date_cols] = df_gamma[date_cols].apply(pd.to_datetime)
# Drop cases collected before Manaus --> according to Ricardo they were entered incorrectly.
# .copy() makes the result an independent frame, so the column assignments
# below do not write into a slice of the original (SettingWithCopyWarning).
df_gamma = df_gamma[df_gamma["Collection date"] >= "2020-12-03"].copy()

# Split the location into its components.
# n=3 caps the split at four parts and reindex pads any missing trailing part,
# so the assignment always receives exactly four columns.
loc_cols = ['continent', 'country', 'state', 'municipality']
loc_parts = df_gamma['Location'].str.split(' / ', n=3, expand=True).reindex(columns=range(4))
df_gamma[loc_cols] = loc_parts

#Definindo UF do caso
from unicodedata import normalize
import re
def match_codigo(estado):
    """Turn a state name into an accent-free, upper-case key used for matching.

    The text is NFKD-decomposed and every non-ASCII character is dropped,
    the result is upper-cased, only the first line is kept, and each run of
    non-alphanumeric characters is replaced by a single space.

    Parameters
    ----------
    estado : str
        State name to normalize.

    Returns
    -------
    str
        The normalized key. Leading and trailing whitespace is not stripped.
    """
    sem_acentos = normalize('NFKD', estado).encode('ASCII', 'ignore').decode('ASCII')
    primeira_linha = sem_acentos.upper().split("\n")[0]
    return re.sub(r"[^a-zA-Z0-9]+", ' ', primeira_linha)

# The Gamma data uses the English name for DF; switch to the Portuguese name before matching
df_gamma.loc[df_gamma["state"]=="Federal District", "state"] = "Distrito Federal"
# Normalize both name columns to the same key format. Series.map applies the
# function element-wise without building a row object per record, and
# na_action="ignore" passes missing names through instead of raising.
df_gamma["state"] = df_gamma["state"].map(match_codigo, na_action="ignore")
df_estado["nome_uf"] = df_estado["nome_uf"].map(match_codigo, na_action="ignore")
# Default (inner) merge: rows whose state key has no match in df_estado are dropped
df_gamma = df_gamma.merge(df_estado[["nome_uf", "uf"]], left_on="state", right_on="nome_uf").drop(columns=["nome_uf"])

# Cumulative cases per state.
# A stable sort keeps same-day records in their existing relative order, so the
# running count assigned to each record is reproducible.
df_gamma = df_gamma.sort_values(by='Collection date', kind="stable")
df_gamma["case"] = 1
df_gamma["cases"] = df_gamma.groupby("uf")["case"].cumsum()



In [648]:
# Show the four prepared frames, one after the other
display(df_arr_mun, df_prob, df_alpha, df_gamma)

Unnamed: 0,cod_cidade,nome_cidade,populacao_2021,area,latitude,longitude,cod_uf,nome_uf,uf,pais,densidade_2021
0,1100106,Arranjo Populacional Internacional de Guajará-...,46930.0,24856.877,-10.78890,-65.3296,11,Rondônia,RO,BRA,1.888009
1,1100205,Arranjo Populacional de Porto Velho/RO,577020.0,40934.820,-8.76077,-63.8999,11,Rondônia,RO,BRA,14.096068
2,1200054,Arranjo Populacional Internacional de Assis Br...,7649.0,4979.073,-10.92980,-69.5738,12,Acre,AC,BRA,1.536230
3,1200104,Arranjo Populacional Internacional de Cobija/B...,46102.0,5580.848,-10.99500,-68.7497,12,Acre,AC,BRA,8.260752
4,1303007,Arranjo Populacional de Nhamundá/AM,28659.0,25878.709,-2.20793,-56.7112,13,Amazonas,AM,BRA,1.107435
...,...,...,...,...,...,...,...,...,...,...,...
4894,1508407,Xinguara,45416.0,3779.348,-7.09830,-49.9437,15,Pará,PA,BRA,12.016888
4895,2933604,Xique-Xique,46562.0,5079.662,-10.82300,-42.7245,29,Bahia,BA,BRA,9.166358
4896,2517407,Zabelê,2269.0,106.811,-8.07901,-37.1057,25,Paraíba,PB,BRA,21.243130
4897,3557154,Zacarias,2784.0,319.056,-21.05060,-50.0552,35,São Paulo,SP,BRA,8.725741


Unnamed: 0,cod_origem,cod_destino,passageiros_rodov,passageiros_aereo,passageiros_total,prob_rodov,prob_aereo,prob_total,uf_origem,uf_destino
0,1100106,1100205,381.0,0.0,381.0,0.758964,0.000000,0.758964,RO,RO
1,1501402,1100205,0.0,1126.0,1126.0,0.000000,0.000652,0.000611,PA,RO
2,2105302,1100205,5.0,0.0,5.0,0.000080,0.000000,0.000021,MA,RO
3,2111300,1100205,2.0,0.0,2.0,0.000020,0.000000,0.000002,MA,RO
4,2611606,1100205,1.0,54.0,55.0,0.000002,0.000013,0.000012,PE,RO
...,...,...,...,...,...,...,...,...,...,...
118569,3526308,3550001,138.0,0.0,138.0,1.000000,0.000000,1.000000,SP,SP
118570,3550001,3526308,251.0,0.0,251.0,1.000000,0.000000,1.000000,SP,SP
118571,5101605,5107800,48.0,0.0,48.0,1.000000,0.000000,1.000000,MT,MT
118572,5104203,5108105,2.0,0.0,2.0,0.004926,0.000000,0.004926,MT,MT


Unnamed: 0,date,region,state,cases,deaths
0,2020-02-25,Centro-Oeste,DF,0.0,0
1,2020-02-25,Centro-Oeste,GO,0.0,0
2,2020-02-25,Centro-Oeste,MS,0.0,0
3,2020-02-25,Centro-Oeste,MT,0.0,0
4,2020-02-25,Nordeste,AL,0.0,0
...,...,...,...,...,...
12253,2021-05-23,Sudeste,RJ,840480.0,49515
12254,2021-05-23,Sudeste,SP,3188105.0,107614
12255,2021-05-23,Sul,PR,1060683.0,25506
12256,2021-05-23,Sul,RS,1059990.0,27419


Unnamed: 0,Accession ID,Collection date,Submission date,Location,continent,country,state,municipality,uf,case,cases
43945,EPI_ISL_2777382,2020-12-03,2021-07-02,South America / Brazil / Amazonas / Manaus,South America,Brazil,AMAZONAS,Manaus,AM,1,1
44399,EPI_ISL_2777388,2020-12-04,2021-07-02,South America / Brazil / Amazonas / Manaus,South America,Brazil,AMAZONAS,Manaus,AM,1,2
43277,EPI_ISL_833137,2020-12-04,2021-01-17,South America / Brazil / Amazonas / Manaus,South America,Brazil,AMAZONAS,Manaus,AM,1,3
43347,EPI_ISL_1060879,2020-12-06,2021-02-23,South America / Brazil / Amazonas / Manaus,South America,Brazil,AMAZONAS,Manaus,AM,1,4
43943,EPI_ISL_2777384,2020-12-07,2021-07-02,South America / Brazil / Amazonas / Manaus,South America,Brazil,AMAZONAS,Manaus,AM,1,5
...,...,...,...,...,...,...,...,...,...,...,...
28585,EPI_ISL_8721279,2021-12-30,2022-01-14,South America / Brazil / Sao Paulo,South America,Brazil,SAO PAULO,,SP,1,26826
27333,EPI_ISL_8721616,2021-12-30,2022-01-14,South America / Brazil / Sao Paulo,South America,Brazil,SAO PAULO,,SP,1,26827
27312,EPI_ISL_8721650,2021-12-30,2022-01-14,South America / Brazil / Sao Paulo,South America,Brazil,SAO PAULO,,SP,1,26828
28594,EPI_ISL_9303894,2022-01-10,2022-01-31,South America / Brazil / Sao Paulo,South America,Brazil,SAO PAULO,,SP,1,26829


## Risco das cidades

### Variante Alpha

Por cidade 

Por estado

In [649]:
# Leave the state of SP out of the analysis
df_alpha_no_sp = df_alpha.loc[df_alpha["state"].ne("SP")]

Rank por 1 caso

In [650]:
# First record per state (SP excluded) with at least 1 in `cases`, ranked by date
df_more_1 = (
    df_alpha_no_sp[df_alpha_no_sp["cases"] >= 1]
    # Sort explicitly so the first row kept per state is the earliest date,
    # whatever the row order of the input file
    .sort_values("date", kind="stable")
    .drop_duplicates(subset=["state"])
    # assign returns a new frame, so no column is written into a slice.
    # Dense rank: states that share a date share a rank.
    .assign(rank=lambda d: d["date"].rank(method='dense').astype(int))
)
df_more_1

Unnamed: 0,date,region,state,cases,deaths,rank
265,2020-03-05,Sudeste,RJ,1.0,0,1
275,2020-03-06,Nordeste,BA,1.0,0,2
290,2020-03-06,Sudeste,ES,1.0,0,2
297,2020-03-07,Centro-Oeste,DF,1.0,0,3
328,2020-03-08,Nordeste,AL,1.0,0,4
345,2020-03-08,Sudeste,MG,1.0,0,4
403,2020-03-10,Sul,RS,1.0,0,5
441,2020-03-12,Nordeste,PE,2.0,0,6
456,2020-03-12,Sul,PR,6.0,0,6
460,2020-03-13,Centro-Oeste,GO,3.0,0,7


Rank por 10 casos

In [651]:
#Rank com desempate --> deu mesma correlacao
# df_more_10 = df_alpha_no_sp[df_alpha_no_sp["cases"]>=10]
# df_more_10 = df_more_10[~df_more_10.duplicated(subset=["state"])]
# df_more_10['rank_date']=df_more_10["date"].rank(method='dense',ascending=False).astype(int)
# df_more_10["rank"] = df_more_10[["rank_date","cases"]].apply(tuple,axis=1)\
#              .rank(method='dense',ascending=False).astype(int)



In [652]:
# Rank without tie-breaking on the date (dense rank: equal dates share a rank)
df_more_10 = (
    df_alpha_no_sp[df_alpha_no_sp["cases"] >= 10]
    # Sort explicitly so the first row kept per state is the earliest date,
    # whatever the row order of the input file
    .sort_values("date", kind="stable")
    .drop_duplicates(subset=["state"])
    # assign returns a new frame, so no column is written into a slice
    .assign(rank=lambda d: d["date"].rank(method='dense').astype(int))
)
df_more_10


Unnamed: 0,date,region,state,cases,deaths,rank
427,2020-03-11,Sudeste,RJ,13.0,0,1
540,2020-03-16,Centro-Oeste,DF,13.0,0,2
576,2020-03-17,Nordeste,PE,16.0,0,3
592,2020-03-17,Sul,RS,10.0,0,3
615,2020-03-18,Sudeste,MG,15.0,0,4
618,2020-03-18,Sul,PR,13.0,0,4
620,2020-03-18,Sul,SC,10.0,0,4
622,2020-03-19,Centro-Oeste,GO,12.0,0,5
626,2020-03-19,Nordeste,BA,30.0,0,5
627,2020-03-19,Nordeste,CE,20.0,0,5


Cálculo da probabilidade de viagem por estado a partir do estado de SP

In [653]:
# Share of the passengers leaving SP that goes to each destination state
origem_sp = df_prob["uf_origem"] == "SP"
df_prob_sp = df_prob[origem_sp]
total_pass_sp = df_prob_sp["passageiros_total"].sum()

# Passenger totals per destination UF, kept as a one-column frame
df_prob_sp_uf = (
    df_prob_sp
    .groupby(["uf_destino"])["passageiros_total"]
    .sum()
    .to_frame()
)
df_prob_sp_uf["prob_viagem_sp"] = df_prob_sp_uf["passageiros_total"].div(total_pass_sp)
df_prob_sp_uf


Unnamed: 0_level_0,passageiros_total,prob_viagem_sp
uf_destino,Unnamed: 1_level_1,Unnamed: 2_level_1
AC,178.0,7e-06
AL,37529.0,0.0014
AM,84663.0,0.003159
AP,1052.0,3.9e-05
BA,1337603.0,0.049916
CE,218485.0,0.008153
DF,1524573.0,0.056894
ES,508938.0,0.018992
GO,807744.0,0.030143
MA,13519.0,0.000504


Calcular correlacao com os casos de COVID da alpha

In [654]:
# Join each rank table with the SP travel probabilities.
# "uf_destino" is the index name of df_prob_sp_uf; the default join is inner.
chaves_merge = dict(left_on="state", right_on="uf_destino")
df_alpha_uf_1 = pd.merge(df_more_1, df_prob_sp_uf, **chaves_merge)
df_alpha_uf_10 = pd.merge(df_more_10, df_prob_sp_uf, **chaves_merge)

In [655]:
# Inspect the merged frame for the 1-case threshold (rank plus SP travel probability)
df_alpha_uf_1

Unnamed: 0,date,region,state,cases,deaths,rank,passageiros_total,prob_viagem_sp
0,2020-03-05,Sudeste,RJ,1.0,0,1,4018677.0,0.149968
1,2020-03-06,Nordeste,BA,1.0,0,2,1337603.0,0.049916
2,2020-03-06,Sudeste,ES,1.0,0,2,508938.0,0.018992
3,2020-03-07,Centro-Oeste,DF,1.0,0,3,1524573.0,0.056894
4,2020-03-08,Nordeste,AL,1.0,0,4,37529.0,0.0014
5,2020-03-08,Sudeste,MG,1.0,0,4,2071848.0,0.077317
6,2020-03-10,Sul,RS,1.0,0,5,1508926.0,0.05631
7,2020-03-12,Nordeste,PE,2.0,0,6,647366.0,0.024158
8,2020-03-12,Sul,PR,6.0,0,6,1662952.0,0.062058
9,2020-03-13,Centro-Oeste,GO,3.0,0,7,807744.0,0.030143


Avaliando correlacao com rank a partir de 1 caso

In [656]:
# Spearman correlation between the date rank (1-case threshold) and the
# probability of travel from SP; the "rank" row is the base column against itself
avaliar_corr(df_alpha_uf_1[["rank","prob_viagem_sp"]], "rank")

Unnamed: 0,r,p
rank,1.0,2.315232e-189
prob_viagem_sp,-0.7534,8.904206e-06


In [657]:
# Inspect the merged frame for the 10-case threshold (rank plus SP travel probability)
df_alpha_uf_10

Unnamed: 0,date,region,state,cases,deaths,rank,passageiros_total,prob_viagem_sp
0,2020-03-11,Sudeste,RJ,13.0,0,1,4018677.0,0.149968
1,2020-03-16,Centro-Oeste,DF,13.0,0,2,1524573.0,0.056894
2,2020-03-17,Nordeste,PE,16.0,0,3,647366.0,0.024158
3,2020-03-17,Sul,RS,10.0,0,3,1508926.0,0.05631
4,2020-03-18,Sudeste,MG,15.0,0,4,2071848.0,0.077317
5,2020-03-18,Sul,PR,13.0,0,4,1662952.0,0.062058
6,2020-03-18,Sul,SC,10.0,0,4,1265519.0,0.047226
7,2020-03-19,Centro-Oeste,GO,12.0,0,5,807744.0,0.030143
8,2020-03-19,Nordeste,BA,30.0,0,5,1337603.0,0.049916
9,2020-03-19,Nordeste,CE,20.0,0,5,218485.0,0.008153


Avaliando correlacao com rank a partir de 10 casos

In [658]:
# Spearman correlation between the date rank (10-case threshold) and the
# probability of travel from SP; the "rank" row is the base column against itself
avaliar_corr(df_alpha_uf_10[["rank","prob_viagem_sp"]], "rank")

Unnamed: 0,r,p
rank,1.0,2.315232e-189
prob_viagem_sp,-0.8603,1.75676e-08


### Variante Gamma

Por cidade

Por estado

In [659]:
# Leave the state of AM out of the analysis
df_gamma_no_am = df_gamma.loc[df_gamma["uf"].ne("AM")]

In [660]:
# Rank without tie-breaking on the date (dense rank: equal dates share a rank)
df_more_10 = (
    df_gamma_no_am[df_gamma_no_am["cases"] >= 10]
    # Sort explicitly so the first row kept per state is the earliest
    # collection date, without relying on an earlier cell having sorted the frame
    .sort_values("Collection date", kind="stable")
    .drop_duplicates(subset=["state"])
    # assign returns a new frame, so no column is written into a slice
    .assign(rank=lambda d: d["Collection date"].rank(method='dense').astype(int))
)
df_more_10

Unnamed: 0,Accession ID,Collection date,Submission date,Location,continent,country,state,municipality,uf,case,cases,rank
49299,EPI_ISL_11496650,2021-01-01,2022-03-28,South America / Brazil / Para / Primavera,South America,Brazil,PARA,Primavera,PA,1,10,1
32613,EPI_ISL_1086048,2021-01-14,2021-02-26,South America / Brazil / Sao Paulo / Jau,South America,Brazil,SAO PAULO,Jau,SP,1,10,2
50373,EPI_ISL_2245177,2021-01-18,2021-05-24,South America / Brazil / Goias / Aparecida de ...,South America,Brazil,GOIAS,Aparecida de Goiania,GO,1,10,3
40501,EPI_ISL_2661915,2021-01-21,2021-06-24,South America / Brazil / Ceara / Fortaleza,South America,Brazil,CEARA,Fortaleza,CE,1,10,4
5391,EPI_ISL_1067735,2021-01-23,2021-02-24,South America / Brazil / Bahia,South America,Brazil,BAHIA,,BA,1,10,5
12040,EPI_ISL_1213192,2021-01-26,2021-03-11,South America / Brazil / Rio Grande do Norte /...,South America,Brazil,RIO GRANDE DO NORTE,Natal,RN,1,10,6
48062,EPI_ISL_1533992,2021-01-27,2021-04-10,South America / Brazil / Santa Catarina / Flor...,South America,Brazil,SANTA CATARINA,Florianopolis,SC,1,10,7
48886,EPI_ISL_2645659,2021-01-27,2021-06-22,South America / Brazil / Alagoas / MACEIO,South America,Brazil,ALAGOAS,MACEIO,AL,1,10,7
53274,EPI_ISL_2245111,2021-01-28,2021-05-24,South America / Brazil / Roraima / Boa Vista,South America,Brazil,RORAIMA,Boa Vista,RR,1,10,8
45711,EPI_ISL_2245062,2021-01-30,2021-05-24,South America / Brazil / Acre / Rodrigues Alves,South America,Brazil,ACRE,Rodrigues Alves,AC,1,10,9


Cálculo da probabilidade de viagem por estado a partir do estado de AM

In [661]:
# Share of the passengers leaving AM that goes to each destination state
origem_am = df_prob["uf_origem"] == "AM"
df_prob_am = df_prob[origem_am]
total_pass_am = df_prob_am["passageiros_total"].sum()

# Passenger totals per destination UF, kept as a one-column frame
df_prob_am_uf = (
    df_prob_am
    .groupby(["uf_destino"])["passageiros_total"]
    .sum()
    .to_frame()
)
df_prob_am_uf["prob_viagem_am"] = df_prob_am_uf["passageiros_total"].div(total_pass_am)
df_prob_am_uf

Unnamed: 0_level_0,passageiros_total,prob_viagem_am
uf_destino,Unnamed: 1_level_1,Unnamed: 2_level_1
AC,23992.0,0.018305
AL,5.0,4e-06
AM,255089.0,0.194625
AP,133.0,0.000101
BA,352.0,0.000269
CE,111381.0,0.08498
DF,282918.0,0.215857
MG,2716.0,0.002072
MS,41.0,3.1e-05
MT,50.0,3.8e-05


In [662]:
df_prob_am_uf.shape
# This means that 7 states are left out of the count because there is no direct flow from Manaus to those states, at least in our data

(20, 2)

Calcular correlacao com os casos de COVID da gamma

In [663]:
# Join the Gamma rank table with the AM travel probabilities.
# "uf_destino" is the index name of df_prob_am_uf; the default join is inner.
df_gamma_uf_10 = pd.merge(
    df_more_10,
    df_prob_am_uf,
    left_on="uf",
    right_on="uf_destino",
)

In [664]:
# Inspect the merged Gamma frame (rank plus AM travel probability)
df_gamma_uf_10

Unnamed: 0,Accession ID,Collection date,Submission date,Location,continent,country,state,municipality,uf,case,cases,rank,passageiros_total,prob_viagem_am
0,EPI_ISL_11496650,2021-01-01,2022-03-28,South America / Brazil / Para / Primavera,South America,Brazil,PARA,Primavera,PA,1,10,1,192514.0,0.146882
1,EPI_ISL_1086048,2021-01-14,2021-02-26,South America / Brazil / Sao Paulo / Jau,South America,Brazil,SAO PAULO,Jau,SP,1,10,2,87117.0,0.066467
2,EPI_ISL_2661915,2021-01-21,2021-06-24,South America / Brazil / Ceara / Fortaleza,South America,Brazil,CEARA,Fortaleza,CE,1,10,4,111381.0,0.08498
3,EPI_ISL_1067735,2021-01-23,2021-02-24,South America / Brazil / Bahia,South America,Brazil,BAHIA,,BA,1,10,5,352.0,0.000269
4,EPI_ISL_1533992,2021-01-27,2021-04-10,South America / Brazil / Santa Catarina / Flor...,South America,Brazil,SANTA CATARINA,Florianopolis,SC,1,10,7,943.0,0.000719
5,EPI_ISL_2645659,2021-01-27,2021-06-22,South America / Brazil / Alagoas / MACEIO,South America,Brazil,ALAGOAS,MACEIO,AL,1,10,7,5.0,4e-06
6,EPI_ISL_2245111,2021-01-28,2021-05-24,South America / Brazil / Roraima / Boa Vista,South America,Brazil,RORAIMA,Boa Vista,RR,1,10,8,87002.0,0.06638
7,EPI_ISL_2245062,2021-01-30,2021-05-24,South America / Brazil / Acre / Rodrigues Alves,South America,Brazil,ACRE,Rodrigues Alves,AC,1,10,9,23992.0,0.018305
8,EPI_ISL_4880353,2021-01-31,2021-10-05,South America / Brazil / Parana / Almirante Ta...,South America,Brazil,PARANA,Almirante Tamandare,PR,1,10,10,33.0,2.5e-05
9,EPI_ISL_3048764,2021-01-31,2021-07-23,South America / Brazil / Rio Grande do Sul / P...,South America,Brazil,RIO GRANDE DO SUL,Porto Alegre,RS,1,10,10,17289.0,0.013191


Avaliando correlacao com rank a partir de 10 casos

In [665]:
# Spearman correlation between the date rank (10-case threshold) and the
# probability of travel from AM; the "rank" row is the base column against itself
avaliar_corr(df_gamma_uf_10[["rank","prob_viagem_am"]], "rank")

Unnamed: 0,r,p
rank,1.0,1.6790919999999998e-134
prob_viagem_am,-0.1425,0.5604743


## Rotas Mais prováveis