# Cálculo de probabilidade de fluxo

Autora: Mariama Oliveira
 
Objetivo: Calcular a probabilidade de deslocamento de uma cidade para outra e a quantidade de passageiros.

In [1]:
import pandas as pd

In [2]:
# Municipality reference table; its cod_mun / uf columns are used in the merges further down.
df_municipio = pd.read_csv("../data/integrado/municipio.csv")

# Reading the flow files: air flows, road flows and the road-flow predictions
# (presumably produced by a regressor in another notebook — confirm).
df_aereo = pd.read_csv("../data/integrado/fluxo_transporte/aereos_final.csv")
df_rodoviario_pred = pd.read_csv("predicao_rodov.csv")
df_rodoviario = pd.read_csv("../data/integrado/fluxo_transporte/rodoviarios_final.csv")

In [3]:
# Removing trips that involve cities outside Brazil.
# Each endpoint is formatted as "MUNICIPALITY/UF", with the state code after the last "/".
# Splitting from the right with n=1 yields at most two pieces, so a municipality name that
# itself contains "/" no longer breaks the two-column assignment.
df_rodoviario[['mun_origem','estado_origem']] = df_rodoviario.origem.str.rsplit("/", n=1, expand=True)
df_rodoviario[['mun_destino','estado_destino']] = df_rodoviario.destino.str.rsplit("/", n=1, expand=True)
# Keep only rows whose origin and destination both end in a two-character code;
# rows with a missing code compare as False and are dropped as well.
df_rodoviario = df_rodoviario[(df_rodoviario["estado_origem"].str.len()==2) & (df_rodoviario["estado_destino"].str.len()==2) ]

In [4]:
# Drop trips whose origin or destination municipality code is missing.
# .copy() detaches the result from the unfiltered frame, so the column assignments made
# on it afterwards do not trigger pandas' SettingWithCopyWarning.
df_rodoviario = df_rodoviario[(df_rodoviario["cod_destino"].notna()) & (df_rodoviario["cod_origem"].notna())].copy()


In [5]:
# Fixing input formats: the municipality codes are used as merge keys below, so cast them
# to integers. astype() returns a new frame, which avoids assigning columns on a filtered
# slice (SettingWithCopyWarning) and attribute-style column assignment.
df_aereo = df_aereo.astype({"cod_origem": int, "cod_destino": int})
df_rodoviario = df_rodoviario.astype({"cod_origem": int, "cod_destino": int})

# Give the predicted road flows the same column name as the observed ones.
df_rodoviario_pred = df_rodoviario_pred.rename(columns={"passageiros_rodov_pred": "passageiros_rodov"})

In [6]:
# Inspect the monthly air-flow records (pandas shows only the first and last rows).
df_aereo

Unnamed: 0,Origem,Destino,Mes,passageiros,cod_origem,cod_destino,Ano
0,AGUA BOA/MT,CONFRESA/MT,12,2.0,5100201,5103353,2019
1,AGUA BOA/MT,SAO FELIX DO ARAGUAIA/MT,3,6.0,5100201,5107859,2019
2,AGUA BOA/MT,SAO FELIX DO ARAGUAIA/MT,4,0.0,5100201,5107859,2019
3,AGUA BOA/MT,SAO FELIX DO ARAGUAIA/MT,5,5.0,5100201,5107859,2019
4,AGUA BOA/MT,SAO FELIX DO ARAGUAIA/MT,6,16.0,5100201,5107859,2019
...,...,...,...,...,...,...,...
11803,VITORIA/ES,UNA/BA,3,0.0,3205309,2932507,2019
11804,VITORIA/ES,VITORIA/ES,1,0.0,3205309,3205309,2019
11805,VITORIA/ES,VITORIA/ES,4,0.0,3205309,3205309,2019
11806,VITORIA/ES,VITORIA/ES,5,0.0,3205309,3205309,2019


In [7]:
# Inspect the monthly road-ticket records after the Brazil-only and missing-code filters.
df_rodoviario

Unnamed: 0,origem,destino,MesViagem,QuantidaDeBilhetes,cod_origem,cod_destino,mun_origem,estado_origem,mun_destino,estado_destino
0,ABADIA DOS DOURADOS/MG,CAMPINAS/SP,05/2019,1,3100104,3509502,ABADIA DOS DOURADOS,MG,CAMPINAS,SP
1,ABADIA DOS DOURADOS/MG,SAO PAULO/SP,07/2019,1,3100104,3550308,ABADIA DOS DOURADOS,MG,SAO PAULO,SP
2,ABADIANIA/GO,ALEXANIA/GO,01/2019,65,5200100,5200308,ABADIANIA,GO,ALEXANIA,GO
3,ABADIANIA/GO,ALEXANIA/GO,02/2019,41,5200100,5200308,ABADIANIA,GO,ALEXANIA,GO
4,ABADIANIA/GO,ALEXANIA/GO,03/2019,67,5200100,5200308,ABADIANIA,GO,ALEXANIA,GO
...,...,...,...,...,...,...,...,...,...,...
275534,ZORTEA/SC,BELO HORIZONTE/MG,11/2019,1,4219853,3106200,ZORTEA,SC,BELO HORIZONTE,MG
275535,ZORTEA/SC,BELO HORIZONTE/MG,12/2019,22,4219853,3106200,ZORTEA,SC,BELO HORIZONTE,MG
275536,ZORTEA/SC,FLORIANOPOLIS/SC,01/2020,21,4219853,4205407,ZORTEA,SC,FLORIANOPOLIS,SC
275537,ZORTEA/SC,FLORIANOPOLIS/SC,11/2019,39,4219853,4205407,ZORTEA,SC,FLORIANOPOLIS,SC


In [8]:
# Passenger totals per (origin, destination) pair, summed over every month present in the
# input (the road data shown above also contains 01/2020 rows).
df_total_aereo = (
    df_aereo.groupby(['cod_origem', 'cod_destino'])['passageiros']
    .sum()
    .rename("passageiros_aereo")
    .reset_index()
)

df_total_rodov = (
    df_rodoviario.groupby(['cod_origem', 'cod_destino'])['QuantidaDeBilhetes']
    .sum()
    .rename("passageiros_rodov")
    .reset_index()
)

In [9]:
# Inspect the road totals per origin/destination pair.
df_total_rodov

Unnamed: 0,cod_origem,cod_destino,passageiros_rodov
0,1100023,1100304,7
1,1100023,1200401,31
2,1100023,3119401,3
3,1100023,3127701,152
4,1100023,3131307,212
...,...,...,...
36913,5300108,5221197,5
36914,5300108,5221601,25
36915,5300108,5222005,291
36916,5300108,5222302,15


In [10]:
# Attach the state (UF) of the origin and of the destination municipality to every observed
# road flow. Both joins are inner joins, so a pair whose code is absent from df_municipio
# is dropped.
df_municipio_dest = df_municipio.add_suffix('_dest')
df_municipio_ori = df_municipio.add_suffix('_ori')

df_total_rodov_uf = (
    df_total_rodov
    .merge(
        df_municipio_ori[["cod_mun_ori", "uf_ori"]],
        how='inner',
        left_on="cod_origem",
        right_on="cod_mun_ori",
    )
    .merge(
        df_municipio_dest[["cod_mun_dest", "uf_dest"]],
        how='inner',
        left_on="cod_destino",
        right_on="cod_mun_dest",
    )
)

In [11]:
# Discard flows whose origin and destination are in the same state; only inter-state pairs remain.
df_total_rodov_uf = df_total_rodov_uf.loc[lambda fluxos: fluxos["uf_ori"] != fluxos["uf_dest"]]

In [12]:
# Inspect the remaining inter-state road flows with their origin/destination UFs.
df_total_rodov_uf

Unnamed: 0,cod_origem,cod_destino,passageiros_rodov,cod_mun_ori,uf_ori,cod_mun_dest,uf_dest
1,2800308,1100304,1,2800308,SE,1100304,RO
2,3119401,1100304,26,3119401,MG,1100304,RO
3,3127701,1100304,205,3127701,MG,1100304,RO
4,3131307,1100304,105,3131307,MG,1100304,RO
5,3134202,1100304,76,3134202,MG,1100304,RO
...,...,...,...,...,...,...,...
36913,5300108,4319158,96,5300108,DF,4319158,RS
36914,5300108,5213053,151,5300108,DF,5213053,GO
36915,5300108,5215306,291,5300108,DF,5215306,GO
36916,5300108,5221197,5,5300108,DF,5221197,GO


In [13]:
# Anti-join: keep the predicted road flows whose (destination, origin) pair is NOT present
# among the observed inter-state flows. The indicator column `_merge` marks matched pairs
# as 'both'; every other row is kept.
df_rodov_pred_diff = (
    df_rodoviario_pred
    .merge(
        df_total_rodov_uf[["cod_destino", "cod_origem"]],
        on=["cod_destino", "cod_origem"],
        how='left',
        indicator=True,
    )
    .loc[lambda marcado: marcado['_merge'] != 'both']
)

In [14]:
# Observed inter-state flows stacked with the predicted flows for every other pair,
# reduced to the key columns and the passenger count.
df_rodov_corrigido = pd.concat([df_total_rodov_uf, df_rodov_pred_diff])[
    ["cod_origem", "cod_destino", "passageiros_rodov"]
]

In [15]:
# Inspect the air totals per origin/destination pair.
df_total_aereo

Unnamed: 0,cod_origem,cod_destino,passageiros_aereo
0,1100049,5002704,107.0
1,1100049,5108402,30864.0
2,1100122,3509502,87.0
3,1100122,5002704,165.0
4,1100122,5108402,34977.0
...,...,...,...
1730,5300108,4314902,298553.0
1731,5300108,5002704,88331.0
1732,5300108,5108402,221055.0
1733,5300108,5208707,124936.0


In [16]:
# Inspect the combined (observed + predicted) road flows.
df_rodov_corrigido

Unnamed: 0,cod_origem,cod_destino,passageiros_rodov
1,2800308,1100304,1
2,3119401,1100304,26
3,3127701,1100304,205
4,3131307,1100304,105
5,3134202,1100304,76
...,...,...,...
110081,5104203,5108105,2
110082,3550001,3526308,251
110083,3526308,3550001,138
110084,5107800,5101605,7


In [17]:
# Joining air and road flows in the same DataFrame. The outer join keeps pairs that exist
# in only one of the two modes; their missing passenger count is replaced by 0 before the
# two modes are added up.
df_total = (
    df_rodov_corrigido
    .merge(df_total_aereo, how='outer', on=["cod_origem", "cod_destino"])
    .fillna({"passageiros_rodov": 0, "passageiros_aereo": 0})
    .assign(
        passageiros_total=lambda fluxos: fluxos['passageiros_rodov'] + fluxos['passageiros_aereo']
    )
)
df_total

Unnamed: 0,cod_origem,cod_destino,passageiros_rodov,passageiros_aereo,passageiros_total
0,2800308,1100304,1.0,0.0,1.0
1,3119401,1100304,26.0,0.0,26.0
2,3127701,1100304,205.0,0.0,205.0
3,3131307,1100304,105.0,0.0,105.0
4,3134202,1100304,76.0,0.0,76.0
...,...,...,...,...,...
124211,5300108,3548906,0.0,0.0,0.0
124212,5300108,3549904,0.0,0.0,0.0
124213,5300108,4125506,0.0,205731.0,205731.0
124214,5300108,4211306,0.0,491.0,491.0


## Colocando dados relativos a Arranjo Populacional

In [18]:
# Arranjo populacional lookup table: one row per municipality (cod_mun) with the arranjo
# it belongs to (cod_arranjo), as shown in the preview below.
df_arranjo = pd.read_csv("../data/integrado/arranjo.csv")
df_arranjo.head()

Unnamed: 0,cod_mun,nome_municipio,cod_arranjo,nome_arranjo,tipo_arranjo
0,3500105,Adamantina,3500105,Arranjo Populacional de Adamantina - Lucélia/SP,Arranjo populacional
1,4100202,Adrianópolis,4100202,Arranjo Populacional de Adrianópolis/PR - Ribe...,Arranjo populacional
2,3500501,Águas de Lindóia,3500501,Arranjo Populacional de Águas de Lindóia/SP,Arranjo populacional
3,3101508,Além Paraíba,3101508,Arranjo Populacional de Além Paraíba/MG - Sapu...,Arranjo populacional
4,5100300,Alto Araguaia,5100300,Arranjo Populacional de Alto Araguaia/MT,Arranjo populacional


In [19]:
# Merging the flow table with the arranjo table.
# First merge: look up the arranjo of each flow's ORIGIN. how='right' keeps every flow row;
# origins that are not listed in df_arranjo get NaN in cod_mun / cod_arranjo.
df_merge = df_arranjo[["cod_mun", "cod_arranjo"]].merge(df_total, 
                how='right', 
                left_on='cod_mun', 
                right_on='cod_origem')

# Second merge: look up the arranjo of each flow's DESTINATION. Both inputs now carry
# cod_mun / cod_arranjo, so pandas appends its default suffixes: the *_x columns (left
# input) hold the destination lookup, the *_y columns (right input) hold the origin lookup.
df_merge_2 = df_arranjo[["cod_mun", "cod_arranjo"]].merge(df_merge, 
                how='right', 
                left_on='cod_mun', 
                right_on='cod_destino')


In [20]:
# Inspect the flows with both arranjo lookups attached (NaN = municipality not in any arranjo).
df_merge_2

Unnamed: 0,cod_mun_x,cod_arranjo_x,cod_mun_y,cod_arranjo_y,cod_origem,cod_destino,passageiros_rodov,passageiros_aereo,passageiros_total
0,,,2800308.0,2800308.0,2800308,1100304,1.0,0.0,1.0
1,,,,,3119401,1100304,26.0,0.0,26.0
2,,,,,3127701,1100304,205.0,0.0,205.0
3,,,3131307.0,3131307.0,3131307,1100304,105.0,0.0,105.0
4,,,,,3134202,1100304,76.0,0.0,76.0
...,...,...,...,...,...,...,...,...,...
124211,3548906.0,3548906.0,5300108.0,5300108.0,5300108,3548906,0.0,0.0,0.0
124212,3549904.0,3549904.0,5300108.0,5300108.0,5300108,3549904,0.0,0.0,0.0
124213,,,5300108.0,5300108.0,5300108,4125506,0.0,205731.0,205731.0
124214,,,5300108.0,5300108.0,5300108,4211306,0.0,491.0,491.0


In [21]:
def _agrega_fluxo(df_fluxo, col_origem, col_destino):
    """Sum the passenger columns per (origin, destination) pair.

    ``col_origem`` / ``col_destino`` name the columns that identify each end of the flow
    (a municipality code or an arranjo code). In the result they are renamed to
    ``cod_origem`` / ``cod_destino`` so that all aggregations share one schema.
    """
    colunas_passageiros = ["passageiros_rodov", "passageiros_aereo", "passageiros_total"]
    return (
        df_fluxo.groupby([col_origem, col_destino])[colunas_passageiros]
        .sum()
        .reset_index()
        .rename(columns={col_origem: "cod_origem", col_destino: "cod_destino"})
    )


# In df_merge_2, cod_arranjo_y is the arranjo of the origin and cod_arranjo_x the arranjo of
# the destination; NaN means the municipality does not belong to any arranjo.
origem_em_arranjo = df_merge_2.cod_arranjo_y.notna()
destino_em_arranjo = df_merge_2.cod_arranjo_x.notna()

# Arranjo to arranjo
df_arr_arr = df_merge_2[destino_em_arranjo & origem_em_arranjo]
df_arr_arr_agg = _agrega_fluxo(df_arr_arr, "cod_arranjo_y", "cod_arranjo_x")

# Arranjo to municipality
df_arr_mun = df_merge_2[origem_em_arranjo & ~destino_em_arranjo]
df_arr_mun_agg = _agrega_fluxo(df_arr_mun, "cod_arranjo_y", "cod_destino")

# Municipality to arranjo
df_mun_arr = df_merge_2[~origem_em_arranjo & destino_em_arranjo]
df_mun_arr_agg = _agrega_fluxo(df_mun_arr, "cod_origem", "cod_arranjo_x")

# Municipality to municipality
df_mun_mun = df_merge_2[~origem_em_arranjo & ~destino_em_arranjo]
df_mun_mun_agg = _agrega_fluxo(df_mun_mun, "cod_origem", "cod_destino")

In [22]:
# Concatenating the four aggregations (arranjo or municipality on each end of the flow)
# into a single table with a fresh 0..n-1 index.
df_total_arr = pd.concat([df_arr_arr_agg, df_arr_mun_agg,df_mun_arr_agg, df_mun_mun_agg], ignore_index=True)

In [23]:
# The arranjo codes came out of the merges as floats (they had to hold NaN); cast both
# key columns back to integers.
df_total_arr = df_total_arr.astype({"cod_destino": int, "cod_origem": int})

In [24]:
# Flow table whose endpoints are arranjo codes where available and municipality codes otherwise.
df_total_arr

Unnamed: 0,cod_origem,cod_destino,passageiros_rodov,passageiros_aereo,passageiros_total
0,1100106,1100205,381.0,0.0,381.0
1,1100205,1100106,763.0,0.0,763.0
2,1100205,1100205,0.0,5.0,5.0
3,1100205,2105302,6.0,0.0,6.0
4,1100205,2111300,10.0,0.0,10.0
...,...,...,...,...,...
124211,5222203,5218300,44.0,0.0,44.0
124212,5222302,5201108,19.0,0.0,19.0
124213,5222302,5205513,18.0,0.0,18.0
124214,5222302,5205802,10.0,0.0,10.0


## Calculando a probabilidade do fluxo

In [25]:
# Total outbound flow per origin, used as the denominators of the probabilities:
# once for the municipality-level table and once for the arranjo/municipality table.
# NOTE(review): these totals still include zero flows and origin == destination flows,
# which are only removed after the probabilities are computed — confirm this is intended.
df_fluxo_total = df_total.groupby("cod_origem", as_index=False)[
    ["passageiros_rodov", "passageiros_aereo", "passageiros_total"]
].sum()
df_fluxo_total_arr = df_total_arr.groupby("cod_origem", as_index=False)[
    ["passageiros_rodov", "passageiros_aereo", "passageiros_total"]
].sum()

In [26]:
# Inspect the outbound totals per origin municipality.
df_fluxo_total

Unnamed: 0,cod_origem,passageiros_rodov,passageiros_aereo,passageiros_total
0,1100015,869.0,0.0,869.0
1,1100023,13953.0,0.0,13953.0
2,1100031,31.0,0.0,31.0
3,1100049,12640.0,30971.0,43611.0
4,1100056,1110.0,0.0,1110.0
...,...,...,...,...
4968,5222005,2714.0,0.0,2714.0
4969,5222054,513.0,0.0,513.0
4970,5222203,338.0,0.0,338.0
4971,5222302,251.0,0.0,251.0


In [27]:
#Funcao que calcula a probabilidade do fluxo de cada cidade
def calculo_probabilidade(row, df_fluxo_total):
    """Return the share of an origin's outbound flow carried by one origin-destination pair.

    Parameters
    ----------
    row : pandas.Series or mapping
        One flow record. Must provide ``cod_origem``, ``passageiros_rodov``,
        ``passageiros_aereo`` and ``passageiros_total``.
    df_fluxo_total : pandas.DataFrame
        Outbound totals per origin, with the same four columns. Only the first row
        matching the record's ``cod_origem`` is used.

    Returns
    -------
    pandas.Series
        ``prob_rodov``, ``prob_aereo`` and ``prob_total``: the record's passenger count
        divided by the origin's total for the same mode, or 0 when that total is 0.

    Raises
    ------
    IndexError
        If ``df_fluxo_total`` has no row for the record's origin.
    """
    cod_cidade_origem = row["cod_origem"]

    # Outbound totals of the origin city.
    fluxo_cidade = df_fluxo_total[df_fluxo_total["cod_origem"] == cod_cidade_origem]
    if fluxo_cidade.empty:
        raise IndexError(
            f"no outbound totals found for cod_origem={cod_cidade_origem!r}"
        )
    fluxo_cidade = fluxo_cidade.iloc[0]

    def _proporcao(coluna):
        # An origin with no outbound flow in this mode gets probability 0
        # instead of a division by zero.
        fluxo_saida = fluxo_cidade[coluna]
        return 0 if fluxo_saida == 0 else row[coluna] / fluxo_saida

    return pd.Series(
        [
            _proporcao("passageiros_rodov"),
            _proporcao("passageiros_aereo"),
            _proporcao("passageiros_total"),
        ],
        index=['prob_rodov', 'prob_aereo', 'prob_total'],
    )

In [28]:
# Probabilities computed row by row at municipality level, then appended as extra columns.
# Each call re-filters the totals frame, so this cell is slow on large tables.
df_prob = df_total.apply(calculo_probabilidade, axis=1, args=(df_fluxo_total,))
df_final = pd.concat([df_total, df_prob], axis=1)

# Same computation for the arranjo/municipality table.
df_prob_arr = df_total_arr.apply(calculo_probabilidade, axis=1, args=(df_fluxo_total_arr,))
df_final_arr = pd.concat([df_total_arr, df_prob_arr], axis=1)

In [29]:
# Spot-check a single origin/destination pair.
df_final.query("cod_origem == 2606002 and cod_destino == 2611606")

Unnamed: 0,cod_origem,cod_destino,passageiros_rodov,passageiros_aereo,passageiros_total,prob_rodov,prob_aereo,prob_total
37560,2606002,2611606,4349.0,0.0,4349.0,0.091792,0.0,0.091792


In [33]:
# Inspect the arranjo/municipality flows with their probabilities.
df_final_arr

Unnamed: 0,cod_origem,cod_destino,passageiros_rodov,passageiros_aereo,passageiros_total,prob_rodov,prob_aereo,prob_total
0,1100106,1100205,381.0,0.0,381.0,0.758964,0.000000,0.758964
1,1100205,1100106,763.0,0.0,763.0,0.017988,0.000000,0.001860
2,1100205,1100205,0.0,5.0,5.0,0.000000,0.000014,0.000012
3,1100205,2105302,6.0,0.0,6.0,0.000141,0.000000,0.000015
4,1100205,2111300,10.0,0.0,10.0,0.000236,0.000000,0.000024
...,...,...,...,...,...,...,...,...
124211,5222203,5218300,44.0,0.0,44.0,0.130178,0.000000,0.130178
124212,5222302,5201108,19.0,0.0,19.0,0.075697,0.000000,0.075697
124213,5222302,5205513,18.0,0.0,18.0,0.071713,0.000000,0.071713
124214,5222302,5205802,10.0,0.0,10.0,0.039841,0.000000,0.039841


In [38]:
# Keep only the flows that carried at least one passenger (passageiros_total > 0).
df_final = df_final.query("passageiros_total > 0")
df_final_arr = df_final_arr.query("passageiros_total > 0")


## Salvando dados em .csv

In [39]:
# Drop the flows whose origin and destination codes are the same.
df_final = df_final.query("cod_origem != cod_destino")
df_final_arr = df_final_arr.query("cod_origem != cod_destino")

In [40]:
# Inspect the municipality-level result that will be written to disk.
df_final

Unnamed: 0,cod_origem,cod_destino,passageiros_rodov,passageiros_aereo,passageiros_total,prob_rodov,prob_aereo,prob_total
0,2800308,1100304,1.0,0.0,1.0,0.000004,0.000000,0.000001
1,3119401,1100304,26.0,0.0,26.0,0.003275,0.000000,0.003275
2,3127701,1100304,205.0,0.0,205.0,0.001359,0.000000,0.001148
3,3131307,1100304,105.0,0.0,105.0,0.000529,0.000000,0.000529
4,3134202,1100304,76.0,0.0,76.0,0.001804,0.000000,0.001804
...,...,...,...,...,...,...,...,...
124208,5300108,3117876,0.0,477592.0,477592.0,0.000000,0.058936,0.054009
124209,5300108,3205309,0.0,81471.0,81471.0,0.000000,0.010054,0.009213
124210,5300108,3518800,0.0,694548.0,694548.0,0.000000,0.085709,0.078543
124213,5300108,4125506,0.0,205731.0,205731.0,0.000000,0.025388,0.023265


In [None]:
# Saving the results under ../data/calculado (row index omitted): the municipality-level
# flows and the arranjo/municipality-level flows.
df_final.to_csv("../data/calculado/calculo_qtd_fluxo.csv", index=False)
df_final_arr.to_csv("../data/calculado/arr_calculo_qtd_fluxo.csv", index=False)

## Análise de inconsistências

In [None]:
# Raw ANAC statistics in JSON-lines format. Read relative to the notebook's working
# directory instead of an absolute path inside one user's home directory, so the cell can
# run on other machines.
# NOTE(review): assumes the notebook runs from notebooks_calculo_probabilidades/, where the
# original absolute path pointed. A copy may also live under ../data/anac/ (possibly
# needing encoding='latin1') — confirm which one is canonical.
df_aereo_original = pd.read_json("Dados_Estatisticos_2011_a_2020.json", lines=True)

In [None]:
# Pre-processed 2019 air-transport CSV supplied by Natália.
df_aereo_bruto = pd.read_csv("../data/anac/transporte_aereo_2019.csv")

In [None]:
# Paid passengers on records whose origin and destination municipality are the same.
df_aereo_bruto.loc[
    df_aereo_bruto["Município_Destino"] == df_aereo_bruto["Município_Origem"],
    "PASSAGEIROS_PAGOS",
].sum()

2890.0

In [None]:
# Spot-check a single origin/destination pair while investigating inconsistencies.
df_final.query("cod_origem == 2611606 and cod_destino == 2507507")



Unnamed: 0,cod_origem,cod_destino,passageiros_rodov,passageiros_aereo,passageiros_total,prob_rodov,prob_aereo,prob_total
7044,2611606,2507507,117995.0,0.0,117995.0,0.315628,0.0,0.026314
