In [1]:
import sys
import json
import pickle
import numpy as np
import pandas as pd
import googlemaps as gmaps

from os import listdir, walk
from tqdm import tqdm
from pathlib import Path
from datetime import datetime
from datatable import dt, f, by

from plotly import express as px, io as pio
pd.options.plotting.backend = 'plotly'
pio.renderers.default = 'plotly_mimetype+notebook_connected'

from utils import get_path, data_load
sys.path.insert(0, '../')
from secret import API_KEY

# Descrição

Get the travel distance and time for a matrix of origins and destinations.

[Documentação](https://developers.google.com/maps/documentation/distance-matrix/start#maps_http_distancematrix_start-py)

[Repositório API Python](https://github.com/googlemaps/google-maps-services-python)

# Fontes de dados

## Municípios

### Carregando tabela

In [2]:
path_munic = get_path('LOCALIDADES', 'municipios.csv.gzip')
df_munic = dt.fread(path_munic)
df_munic

Unnamed: 0_level_0,cod_municipio,nome_municipio,capital,uf,cod_uf,nome_uf,regiao,latitude,longitude
Unnamed: 0_level_1,▪▪▪▪,▪▪▪▪,▪,▪▪▪▪,▪▪▪▪,▪▪▪▪,▪▪▪▪,▪▪▪▪▪▪▪▪,▪▪▪▪▪▪▪▪
0,520005,Abadia de Goiás,0,GO,52,Goiás,Centro-Oeste,−16.7573,−49.4412
1,310010,Abadia dos Dourados,0,MG,31,Minas Gerais,Sudeste,−18.4831,−47.3916
2,520010,Abadiânia,0,GO,52,Goiás,Centro-Oeste,−16.197,−48.7057
3,310020,Abaeté,0,MG,31,Minas Gerais,Sudeste,−19.1551,−45.4444
4,150010,Abaetetuba,0,PA,15,Pará,Norte,−1.72183,−48.8788
5,230010,Abaiara,0,CE,23,Ceará,Nordeste,−7.34588,−39.0416
6,290010,Abaíra,0,BA,29,Bahia,Nordeste,−13.2488,−41.6619
7,290020,Abaré,0,BA,29,Bahia,Nordeste,−8.72073,−39.1162
8,410010,Abatiá,0,PR,41,Paraná,Sul,−23.3049,−50.3133
9,420005,Abdon Batista,0,SC,42,Santa Catarina,Sul,−27.6126,−51.0233


## SIH

### Colunas

In [3]:
cols_sih = {
  'cod_municipio': 'origem',
  'hosp_cod_municipio': 'destino',
}

### Carregando tabela

In [4]:
path_sih = get_path('SIH', 'sih.jay')
df_sih = dt.fread(path_sih, columns=cols_sih)
df_sih.names = cols_sih
df_sih = df_sih[:, list(cols_sih.values())]
df_sih = df_sih[f.origem != f.destino, :]
df_sih['count'] = 1
df_sih = df_sih[:, dt.sum(f.count), by('origem', 'destino')]
df_sih = df_sih.sort(-f.count)
df_sih

Unnamed: 0_level_0,origem,destino,count
Unnamed: 0_level_1,▪▪▪▪,▪▪▪▪,▪▪▪▪▪▪▪▪
0,510840,510340,22807
1,315460,310620,17147
2,260790,261160,16012
3,521250,530010,15461
4,280480,280030,15090
5,330350,330285,14875
6,230370,230440,14545
7,320500,320530,14543
8,150080,150140,14521
9,432300,431490,14273


# Adicinar colunas

## Latitude & Longitude

In [5]:
def get_latlon_table_by_ref(df_munic, ref):
  cols_latlon = ['cod_municipio', 'latitude', 'longitude']
  df_latlon = df_munic[:, cols_latlon]
  df_latlon.names = {
    'cod_municipio': ref,
    'latitude': f'{ref}_latitude',
    'longitude': f'{ref}_longitude',
  }
  df_latlon.key = ref
  return df_latlon

## Parâmetros

In [6]:
def params_in_cols(df):
  dfs = list()
  for mode in ['driving', 'transit']: # driving | transit | bicycling | walking
    for hour in [6, 12, 18]: # [6, 12, 18] | [7, 13, 19]
      for traffic_model in ['best_guess', 'optimistic', 'pessimistic']: # best_guess | optimistic | pessimistic
        if mode != 'driving' and traffic_model != 'best_guess':
          continue
        df_ = df.copy()
        df_['mode'] = mode
        df_['traffic_model'] = traffic_model
        df_['hour'] = hour
        dfs.append(df_)
  df_param = pd.concat(dfs)
  df_param = df_param.sort_values(by=[
    'origem', 'destino', 'mode', 'traffic_model', 'hour'])
  df_param = df_param.reset_index(drop=True)
  return df_param

# Modelo de Coleta

## Local pra armazenar arquivo

In [7]:
def get_backup_path(data):
  infos = [
    f'{data["hour"]}h',
    data['mode'],
    data['traffic_model'],
    str(data['origem']),
    str(data['destino'])]
  fname = '-'.join(infos)
  dir = '/'.join(infos[:-1])
  path_dir = get_path('LOCALIDADES', f'google_maps/{dir}')
  Path(path_dir).mkdir(parents=True, exist_ok=True)
  return path_dir + f'/{fname}.json'

## Requisição para API

In [8]:
def get_travel_info(row):
  departure_time = datetime(year=2022, month=11, day=7, hour=row['hour'])
  info = row.copy()
  info['departure_time'] = str(departure_time)
  res = gmaps.distance_matrix(
    origins = [(row['origem_latitude'], row['origem_longitude'])],
    destinations = [(row['destino_latitude'], row['destino_longitude'])],
    mode = row['mode'],
    traffic_model = row['traffic_model'],
    departure_time = departure_time,
    language = 'pt-BR',
    units = 'metric',)
  info['origin_addresses'] = res['origin_addresses'][0]
  info['destination_addresses'] = res['destination_addresses'][0]
  elem = res['rows'][0]['elements'][0]
  info['status'] = elem['status']
  for col in list(elem):
    if col == 'status':
      continue
    for i_type in ['text', 'value']:
      col_name = f'{col} ({i_type})'
      if elem['status'] == 'OK':
        info[col_name] = elem[col][i_type]
      else:
        info[col_name] = ''
  return info

## Coleta das informações

In [9]:
def request_travel_infos(df):
  for row in tqdm(df.to_dict(orient='records')):
    fpath = get_backup_path(row)
    if Path(fpath).is_file():
      continue
    info = get_travel_info(row)
    with open(fpath, 'w') as fp:
      json.dump(info, fp)

# Gerando tabela

Tabela com informações a serem enviadas para a API do Google Maps.

In [10]:
df_latlon_orig = get_latlon_table_by_ref(df_munic, 'origem')
df_latlon_dest = get_latlon_table_by_ref(df_munic, 'destino')
df_route = df_sih[:, :, dt.join(df_latlon_orig)]
df_route = df_route[:, :, dt.join(df_latlon_dest)]
df_route = params_in_cols(df_route.to_pandas())
df_route

Unnamed: 0,origem,destino,count,origem_latitude,origem_longitude,destino_latitude,destino_longitude,mode,traffic_model,hour
0,110001,110002,2,-11.9283,-61.9953,-9.90571,-63.0325,driving,best_guess,6
1,110001,110002,2,-11.9283,-61.9953,-9.90571,-63.0325,driving,best_guess,12
2,110001,110002,2,-11.9283,-61.9953,-9.90571,-63.0325,driving,best_guess,18
3,110001,110002,2,-11.9283,-61.9953,-9.90571,-63.0325,driving,optimistic,6
4,110001,110002,2,-11.9283,-61.9953,-9.90571,-63.0325,driving,optimistic,12
...,...,...,...,...,...,...,...,...,...,...
822871,530010,522200,2,-15.7795,-47.9297,-16.74050,-48.5159,driving,pessimistic,12
822872,530010,522200,2,-15.7795,-47.9297,-16.74050,-48.5159,driving,pessimistic,18
822873,530010,522200,2,-15.7795,-47.9297,-16.74050,-48.5159,transit,best_guess,6
822874,530010,522200,2,-15.7795,-47.9297,-16.74050,-48.5159,transit,best_guess,12


In [11]:
df_route_sorted = df_route.sort_values('count', ascending=False)
df_route_sorted

Unnamed: 0,origem,destino,count,origem_latitude,origem_longitude,destino_latitude,destino_longitude,mode,traffic_model,hour
786107,510840,510340,22807,-15.64580,-56.1322,-15.60100,-56.0974,transit,best_guess,18
786106,510840,510340,22807,-15.64580,-56.1322,-15.60100,-56.0974,transit,best_guess,12
786105,510840,510340,22807,-15.64580,-56.1322,-15.60100,-56.0974,transit,best_guess,6
786104,510840,510340,22807,-15.64580,-56.1322,-15.60100,-56.0974,driving,pessimistic,18
786103,510840,510340,22807,-15.64580,-56.1322,-15.60100,-56.0974,driving,pessimistic,12
...,...,...,...,...,...,...,...,...,...,...
247865,260730,270430,1,-7.64505,-40.1476,-9.66599,-35.7350,driving,optimistic,18
247866,260730,270430,1,-7.64505,-40.1476,-9.66599,-35.7350,driving,pessimistic,6
247867,260730,270430,1,-7.64505,-40.1476,-9.66599,-35.7350,driving,pessimistic,12
247868,260730,270430,1,-7.64505,-40.1476,-9.66599,-35.7350,driving,pessimistic,18


# Coleta dos dados

## Seleção parâmetros

In [12]:
def select_params(
    df,
    mode, # driving | transit | bicycling | walking
    traffic_model, # best_guess | optimistic | pessimistic
    hour, # [6, 12, 18] | [7, 13, 19]
  ):
  df_param = df[
    (
      df['mode'] == mode
    ) & (
      df['traffic_model'] == traffic_model
    ) & (
      df['hour'] == hour
    )
  ].sort_values(by='count', ascending=False)
  return df_param

## Filtros

### Intra RJ

`origem` e `destino` no estado do RJ.

In [13]:
origem_rj = df_route['origem'].astype(str).str.startswith('33')
destino_rj = df_route['destino'].astype(str).str.startswith('33')
df_route_rj = df_route[origem_rj & destino_rj]
df_route_rj

Unnamed: 0,origem,destino,count,origem_latitude,origem_longitude,destino_latitude,destino_longitude,mode,traffic_model,hour
497964,330010,330023,1,-23.0011,-44.3196,-22.7528,-41.8846,driving,best_guess,6
497965,330010,330023,1,-23.0011,-44.3196,-22.7528,-41.8846,driving,best_guess,12
497966,330010,330023,1,-23.0011,-44.3196,-22.7528,-41.8846,driving,best_guess,18
497967,330010,330023,1,-23.0011,-44.3196,-22.7528,-41.8846,driving,optimistic,6
497968,330010,330023,1,-23.0011,-44.3196,-22.7528,-41.8846,driving,optimistic,12
...,...,...,...,...,...,...,...,...,...,...
519643,330630,330620,1,-22.5202,-44.0996,-22.4059,-43.6686,driving,pessimistic,12
519644,330630,330620,1,-22.5202,-44.0996,-22.4059,-43.6686,driving,pessimistic,18
519645,330630,330620,1,-22.5202,-44.0996,-22.4059,-43.6686,transit,best_guess,6
519646,330630,330620,1,-22.5202,-44.0996,-22.4059,-43.6686,transit,best_guess,12


## Consulta API

In [14]:
gmaps = gmaps.Client(key=API_KEY)

In [15]:
# df_param = select_params(df_route_rj, 'driving', 'pessimistic', 18)
# df_param = select_params(df_route_rj, 'transit', 'best_guess', 6)
# df_param

Unnamed: 0,origem,destino,count,origem_latitude,origem_longitude,destino_latitude,destino_longitude,mode,traffic_model,hour
508941,330350,330285,14875,-22.7556,-43.4603,-22.8028,-43.4601,transit,best_guess,6
499917,330045,330285,11761,-22.7640,-43.3992,-22.8028,-43.4601,transit,best_guess,6
508809,330350,330045,11658,-22.7556,-43.4603,-22.7640,-43.3992,transit,best_guess,6
515865,330490,330330,7848,-22.8268,-43.0634,-22.8832,-43.1034,transit,best_guess,6
500049,330045,330510,7263,-22.7640,-43.3992,-22.8058,-43.3729,transit,best_guess,6
...,...,...,...,...,...,...,...,...,...,...
507537,330310,330090,1,-21.0390,-41.9697,-21.5691,-41.9187,transit,best_guess,6
507549,330310,330170,1,-21.0390,-41.9697,-22.7858,-43.3049,transit,best_guess,6
507561,330310,330205,1,-21.0390,-41.9697,-21.4296,-41.7014,transit,best_guess,6
507597,330310,330490,1,-21.0390,-41.9697,-22.8268,-43.0634,transit,best_guess,6


In [None]:
df_param = select_params(df_route_sorted, 'driving', 'pessimistic', 18)
request_travel_infos(df_param)

In [None]:
df_param = select_params(df_route_sorted, 'transit', 'best_guess', 6)
request_travel_infos(df_param)

RJ driving best_guess 6h

100%|██████████| 1316/1316 [04:07<00:00,  5.32it/s]

RJ driving best_guess 12h

100%|██████████| 1316/1316 [04:28<00:00,  4.91it/s]

RJ driving best_guess 18h

100%|██████████| 1316/1316 [04:29<00:00,  4.89it/s]

RJ driving pessimistic 18h

100%|██████████| 1316/1316 [04:26<00:00,  5.12it/s]

RJ transit best_guess 12h

100%|██████████| 1316/1316 [03:20<00:00,  6.56it/s]

# Resultado da coleta

## Lendo arquivos salvos

In [17]:
def read_data_to_df(path_dir, cols_order=None):
  files = list()
  for (dirpath, _, filenames) in walk(path_dir):
    files.extend([dirpath + '/' + fl for fl in filenames if 'json' in fl])
  infos = list()
  for file in files:
    with open(file) as fl:
      d = json.load(fl)
      infos.append(d)
  df = pd.DataFrame(infos)
  cols_order = df.columns if cols_order is None else cols_order
  return df[cols_order]

## Ordenando colunas

In [18]:
cols_order = [
  'status', 'origem', 'destino', 'origin_addresses', 'destination_addresses',
  'origem_latitude', 'origem_longitude', 'destino_latitude', 'destino_longitude',
  'hour', 'departure_time', 'mode', 'traffic_model',
  'distance (value)', 'distance (text)',
  'duration (value)', 'duration (text)',
  'duration_in_traffic (value)', 'duration_in_traffic (text)',
  'count',
]

## Tabela resultante

In [19]:
path_gmaps = get_path('LOCALIDADES', f'google_maps/')
df_files = read_data_to_df(path_gmaps, cols_order=cols_order)
df_files

Unnamed: 0,status,origem,destino,origin_addresses,destination_addresses,origem_latitude,origem_longitude,destino_latitude,destino_longitude,hour,departure_time,mode,traffic_model,distance (value),distance (text),duration (value),duration (text),duration_in_traffic (value),duration_in_traffic (text),count
0,OK,330330,330490,"R. Maria Bregua, 37 - São Lourenço, Niterói - ...","R. Temistocles de Almeida, 134 - Camarão, São ...",-22.8832,-43.1034,-22.8268,-43.0634,6,2022-11-07 06:00:00,transit,best_guess,9900.0,"9,9 km",2271.0,38 minutos,,,204
1,OK,330330,330185,"R. Maria Bregua, 37 - São Lourenço, Niterói - ...","R. João da Silva Maia, 14 - Parque Santo Anton...",-22.8832,-43.1034,-22.5347,-42.9895,6,2022-11-07 06:00:00,transit,best_guess,63545.0,"63,5 km",12388.0,3 horas 26 minutos,,,2
2,OK,330330,330023,"R. Maria Bregua, 37 - São Lourenço, Niterói - ...","Av. José Bento Ribeiro Dantas, 842, Búzios - R...",-22.8832,-43.1034,-22.7528,-41.8846,6,2022-11-07 06:00:00,transit,best_guess,171243.0,171 km,28870.0,8 horas 1 minuto,,,1
3,OK,330330,330170,"R. Maria Bregua, 37 - São Lourenço, Niterói - ...","R. Ana Neri, 18 - Jardim Vinte e Cinco de Agos...",-22.8832,-43.1034,-22.7858,-43.3049,6,2022-11-07 06:00:00,transit,best_guess,30933.0,"30,9 km",4995.0,1 hora 23 minutos,,,25
4,ZERO_RESULTS,330330,330130,"-22.8832,-43.1034","-22.4812,-42.2066",-22.8832,-43.1034,-22.4812,-42.2066,6,2022-11-07 06:00:00,transit,best_guess,,,,,,,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7891,OK,330380,330130,"R. Manoel Tôrres, 24 - Parque Imperial, Paraty...","R. Jonas Nunes, 200 - Nossa Sra. Saude, Casimi...",-23.2221,-44.7175,-22.4812,-42.2066,18,2022-11-07 18:00:00,driving,pessimistic,371952.0,372 km,19391.0,5 horas 23 minutos,23155.0,6 horas 26 minutos,1
7892,OK,330380,330040,"R. Manoel Tôrres, 24 - Parque Imperial, Paraty...","Av. Jansen de Melo, nº15 - Centro, Barra Mansa...",-23.2221,-44.7175,-22.5481,-44.1752,18,2022-11-07 18:00:00,driving,pessimistic,164056.0,164 km,10235.0,2 horas 51 minutos,11574.0,3 horas 13 minutos,1
7893,OK,330380,330420,"R. Manoel Tôrres, 24 - Parque Imperial, Paraty...","R. do Rosário, 701-723 - Centro, Resende - RJ,...",-23.2221,-44.7175,-22.4705,-44.4509,18,2022-11-07 18:00:00,driving,pessimistic,189799.0,190 km,11218.0,3 horas 7 minutos,12689.0,3 horas 31 minutos,5
7894,OK,330380,330400,"R. Manoel Tôrres, 24 - Parque Imperial, Paraty...","R. Cap. Manoel Tôrres, 830 - Quatro de Abril, ...",-23.2221,-44.7175,-22.6215,-43.9081,18,2022-11-07 18:00:00,driving,pessimistic,164591.0,165 km,10281.0,2 horas 51 minutos,11477.0,3 horas 11 minutos,2


### Exportando XLSX

In [20]:
path_routes = get_path('GESTANTES', f'gmaps/routes.xlsx')
df_files.to_excel(path_routes, index=False)

## 

# Investigação

In [21]:
df_files.columns

Index(['status', 'origem', 'destino', 'origin_addresses',
       'destination_addresses', 'origem_latitude', 'origem_longitude',
       'destino_latitude', 'destino_longitude', 'hour', 'departure_time',
       'mode', 'traffic_model', 'distance (value)', 'distance (text)',
       'duration (value)', 'duration (text)', 'duration_in_traffic (value)',
       'duration_in_traffic (text)', 'count'],
      dtype='object')

In [22]:
cols_inv = [
  'status', 'count',
  'origem', 'destino',
  'hour', 'mode', 'traffic_model',
  'distance (value)',
  'duration (value)', 'duration_in_traffic (value)',
]

In [23]:
df_inv = df_files[cols_inv].sort_values(
  by=['origem', 'destino', 'hour', 'mode', 'traffic_model', 'distance (value)']
).reset_index(drop=True)
df_inv

Unnamed: 0,status,count,origem,destino,hour,mode,traffic_model,distance (value),duration (value),duration_in_traffic (value)
0,OK,1,330010,330023,6,driving,best_guess,330424.0,18528.0,18628.0
1,ZERO_RESULTS,1,330010,330023,6,transit,best_guess,,,
2,OK,1,330010,330023,12,driving,best_guess,317920.0,18249.0,18718.0
3,OK,1,330010,330023,12,transit,best_guess,312980.0,49509.0,
4,OK,1,330010,330023,18,driving,best_guess,317920.0,18249.0,18389.0
...,...,...,...,...,...,...,...,...,...,...
7891,OK,1,330630,330620,6,transit,best_guess,49690.0,6327.0,
7892,OK,1,330630,330620,12,driving,best_guess,55708.0,3646.0,3501.0
7893,OK,1,330630,330620,12,transit,best_guess,65604.0,9417.0,
7894,OK,1,330630,330620,18,driving,best_guess,55708.0,3646.0,3908.0


## driving pessimistic in traffic at 6pm x transit at 12pm

### Consultar duração

In [24]:
def get_duration(df, origem, destino, mode, traffic_model, hour, in_traffic=True):
  col_idx = int(in_traffic)
  col_duration = ['duration (value)', 'duration_in_traffic (value)'][col_idx]
  duration = df[
    (
      df['origem'] == origem
    ) & (
      df['destino'] == destino
    ) & (
      df['mode'] == mode
    ) & (
      df['traffic_model'] == traffic_model
    ) & (
      df['hour'] == hour
    )
  ]
  duration = list(duration[col_duration])[0]
  return duration

### Comparação

In [25]:
def hours_comparison(df):
  cols_route = ['origem', 'destino', 'distance (value)']
  df_route = df[cols_route].groupby(cols_route, as_index=False).sum()
  ratios = list()
  for row in df_route.to_dict(orient='records'):
    try:
      driving = get_duration(df, row['origem'], row['destino'], 'driving', 'pessimistic', 18, in_traffic=True)
      transit = get_duration(df, row['origem'], row['destino'], 'transit', 'best_guess', 12, in_traffic=False)
      ratio = (transit - driving) / driving
    except:
      ratio = np.nan
    ratios.append(ratio)
  df_route['ratio'] = ratios
  df_route = df_route.sort_values(cols_route).reset_index(drop=True)
  return df_route.dropna()

In [26]:
df_vs = hours_comparison(df_inv.dropna(subset=['duration (value)']))
df_vs

Unnamed: 0,origem,destino,distance (value),ratio
0,330010,330023,312980.0,1.269285
1,330010,330023,317920.0,1.269285
2,330010,330023,330424.0,1.269285
4,330010,330045,114022.0,0.963642
5,330010,330045,139606.0,0.963642
...,...,...,...,...
2676,330630,330610,52169.0,0.010966
2677,330630,330610,67130.0,0.010966
2678,330630,330620,49690.0,0.847920
2679,330630,330620,55708.0,0.847920


### Histograma

In [27]:
px.histogram(df_vs, x='ratio')

### Density contour

In [28]:
fig = px.density_contour(df_vs, x='ratio', y='distance (value)', marginal_x="histogram", marginal_y="histogram") #log_x=True, log_y=True) # , opacity=.25
fig.show()

## % diferença com tráfego

### Tabela com atraso percentual

In [29]:
traffic = 'duration_in_traffic (value)'
normal = 'duration (value)'
df_inv['late_pct'] = (df_inv[traffic] - df_inv[normal]) / df_inv[normal]
df_inv

Unnamed: 0,status,count,origem,destino,hour,mode,traffic_model,distance (value),duration (value),duration_in_traffic (value),late_pct
0,OK,1,330010,330023,6,driving,best_guess,330424.0,18528.0,18628.0,0.005397
1,ZERO_RESULTS,1,330010,330023,6,transit,best_guess,,,,
2,OK,1,330010,330023,12,driving,best_guess,317920.0,18249.0,18718.0,0.025700
3,OK,1,330010,330023,12,transit,best_guess,312980.0,49509.0,,
4,OK,1,330010,330023,18,driving,best_guess,317920.0,18249.0,18389.0,0.007672
...,...,...,...,...,...,...,...,...,...,...,...
7891,OK,1,330630,330620,6,transit,best_guess,49690.0,6327.0,,
7892,OK,1,330630,330620,12,driving,best_guess,55708.0,3646.0,3501.0,-0.039770
7893,OK,1,330630,330620,12,transit,best_guess,65604.0,9417.0,,
7894,OK,1,330630,330620,18,driving,best_guess,55708.0,3646.0,3908.0,0.071860


### Histograma

`traffic = normal + normal * late_pct`

In [30]:
px.histogram(df_inv, x='late_pct', color='hour', barmode='overlay', opacity=.5)

## Comparação entre horas

### Consultar duração

In [31]:
def get_duration(df, origem, destino, hour, col):
  return list(df[
    (
      df['origem'] == origem
    ) & (
      df['destino'] == destino
    ) & (
      df['hour'] == hour
    )
  ][col])[0]

### Tabela de comparação entre horários

In [32]:
def hours_comparison(df, i=0):
  cols_route = ['origem', 'destino']
  cols_duration = ['duration_in_traffic (value)', 'duration (value)']
  # hours = list(df_inv['hour'].unique())
  df_route = df[cols_route].groupby(cols_route, as_index=False).sum()
  df_route_soon = df_route.copy()
  df_route_late = df_route.copy()
  df_route_soon['time'] = 'soon (6am)'
  df_route_late['time'] = 'late (6pm)'
  col_val_soon = list()
  col_val_late = list()
  for row in df_route.to_dict(orient='records'):
    dur_6am = get_duration(df, row['origem'], row['destino'], 6, cols_duration[i])
    dur_12h = get_duration(df, row['origem'], row['destino'], 12, cols_duration[i])
    dur_6pm = get_duration(df, row['origem'], row['destino'], 18, cols_duration[i])
    soon = (dur_12h - dur_6am) / dur_12h
    late = (dur_6pm - dur_12h) / dur_12h
    col_val_soon.append(soon)
    col_val_late.append(late)
  df_route_soon['diff_perct'] = col_val_soon
  df_route_late['diff_perct'] = col_val_late
  df_comp = pd.concat([df_route_soon, df_route_late])
  df_comp = df_comp.sort_values(cols_route).reset_index(drop=True)
  return df_comp

In [33]:
df_comp_traffic = hours_comparison(df_inv, 0)
df_comp_normal = hours_comparison(df_inv, 1)

### Histograma (tráfego)

`6(a|p)m = 12pm * diff_perct`

In [34]:
px.histogram(df_comp_traffic, x='diff_perct', color='time', barmode='overlay', opacity=.5)

### Histograma (normal)

`6(a|p)m = 12pm + 12pm * diff_perct`

In [35]:
px.histogram(df_comp_normal, x='diff_perct', color='time', barmode='overlay', opacity=.5)

## Comparação com OSM

### Lendo tabela OSM

In [36]:
path_dist = get_path('DISTANCIAS', 'distancias.jay')
df_osm = dt.fread(path_dist)
df_osm = df_osm.to_pandas()
df_osm

Unnamed: 0,origem,destino,distancia,tempo
0,110001,110001,0.000000,0.000000
1,110001,110002,309.050000,6.169056
2,110001,110003,399.499700,6.289056
3,110001,110004,81.201103,1.917750
4,110001,110005,391.704300,6.136361
...,...,...,...,...
31024895,530010,522200,161.728900,2.356000
31024896,530010,522205,382.708800,5.479556
31024897,530010,522220,161.603400,2.248944
31024898,530010,522230,190.000900,3.216083


### Mesclando tabelas

Normalizando o tempo entre elas (ambas em segundos).

In [37]:
df_dist = pd.merge(df_inv, df_osm, how='left', on=['origem', 'destino'])
df_dist['tempo'] = df_dist['tempo'] * 60 * 60
df_dist

Unnamed: 0,status,count,origem,destino,hour,mode,traffic_model,distance (value),duration (value),duration_in_traffic (value),late_pct,distancia,tempo
0,OK,1,330010,330023,6,driving,best_guess,330424.0,18528.0,18628.0,0.005397,317.3177,15241.9
1,ZERO_RESULTS,1,330010,330023,6,transit,best_guess,,,,,317.3177,15241.9
2,OK,1,330010,330023,12,driving,best_guess,317920.0,18249.0,18718.0,0.025700,317.3177,15241.9
3,OK,1,330010,330023,12,transit,best_guess,312980.0,49509.0,,,317.3177,15241.9
4,OK,1,330010,330023,18,driving,best_guess,317920.0,18249.0,18389.0,0.007672,317.3177,15241.9
...,...,...,...,...,...,...,...,...,...,...,...,...,...
7891,OK,1,330630,330620,6,transit,best_guess,49690.0,6327.0,,,55.0684,3300.1
7892,OK,1,330630,330620,12,driving,best_guess,55708.0,3646.0,3501.0,-0.039770,55.0684,3300.1
7893,OK,1,330630,330620,12,transit,best_guess,65604.0,9417.0,,,55.0684,3300.1
7894,OK,1,330630,330620,18,driving,best_guess,55708.0,3646.0,3908.0,0.071860,55.0684,3300.1


### Coluna da diferença percentual

OSM em relação ao GMAPS.

In [38]:
reference = ['duration_in_traffic (value)', 'duration (value)']
compare = 'tempo'
df_dist['diff_pct_traffic'] = (df_dist[reference[0]] - df_dist[compare]) / df_dist[compare]
df_dist['diff_pct_normal'] = (df_dist[reference[1]] - df_dist[compare]) / df_dist[compare]
df_dist

Unnamed: 0,status,count,origem,destino,hour,mode,traffic_model,distance (value),duration (value),duration_in_traffic (value),late_pct,distancia,tempo,diff_pct_traffic,diff_pct_normal
0,OK,1,330010,330023,6,driving,best_guess,330424.0,18528.0,18628.0,0.005397,317.3177,15241.9,0.222157,0.215596
1,ZERO_RESULTS,1,330010,330023,6,transit,best_guess,,,,,317.3177,15241.9,,
2,OK,1,330010,330023,12,driving,best_guess,317920.0,18249.0,18718.0,0.025700,317.3177,15241.9,0.228062,0.197292
3,OK,1,330010,330023,12,transit,best_guess,312980.0,49509.0,,,317.3177,15241.9,,2.248217
4,OK,1,330010,330023,18,driving,best_guess,317920.0,18249.0,18389.0,0.007672,317.3177,15241.9,0.206477,0.197292
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7891,OK,1,330630,330620,6,transit,best_guess,49690.0,6327.0,,,55.0684,3300.1,,0.917215
7892,OK,1,330630,330620,12,driving,best_guess,55708.0,3646.0,3501.0,-0.039770,55.0684,3300.1,0.060877,0.104815
7893,OK,1,330630,330620,12,transit,best_guess,65604.0,9417.0,,,55.0684,3300.1,,1.853550
7894,OK,1,330630,330620,18,driving,best_guess,55708.0,3646.0,3908.0,0.071860,55.0684,3300.1,0.184207,0.104815


### Histograma (tráfego)

`gmaps = osm + osm * diff_pct_traffic`

In [39]:
px.histogram(df_dist, x='diff_pct_traffic')

### Histograma (normal)

`gmaps = osm + osm * diff_pct_normal`

In [40]:
px.histogram(df_dist, x='diff_pct_normal')

# Exportar tabela

In [91]:
cols_exp = [
  'origem', 'destino',
  'mode', 'traffic_model', 'hour',
  'distance (value)', 'duration (value)', 'duration_in_traffic (value)'
]

In [92]:
df_exp = df_inv[cols_exp]
df_exp = df_exp[(
    (df_exp['mode'] == 'driving') & (df_exp['traffic_model'] == 'pessimistic') & (df_exp['hour'] == 18)
  ) | (
    (df_exp['mode'] == 'transit') & (df_exp['traffic_model'] == 'best_guess') & (df_exp['hour'] == 6)
)]
df_exp = df_exp.fillna(0)
df_exp

Unnamed: 0,origem,destino,mode,traffic_model,hour,distance (value),duration (value),duration_in_traffic (value)
1,330010,330023,transit,best_guess,6,0.0,0.0,0.0
5,330010,330023,driving,pessimistic,18,317920.0,18249.0,21817.0
7,330010,330040,transit,best_guess,6,0.0,0.0,0.0
11,330010,330040,driving,pessimistic,18,96636.0,7113.0,8110.0
13,330010,330045,transit,best_guess,6,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...
7883,330630,330490,driving,pessimistic,18,147459.0,8169.0,10417.0
7885,330630,330610,transit,best_guess,6,52143.0,6164.0,0.0
7889,330630,330610,driving,pessimistic,18,67130.0,4727.0,6110.0
7891,330630,330620,transit,best_guess,6,49690.0,6327.0,0.0


In [93]:
durations = list()
for dur, dur_trf in zip(df_exp['duration (value)'], df_exp['duration_in_traffic (value)']):
  durations.append(max(dur, dur_trf))
distances = df_exp['distance (value)']

In [94]:
df_exp = df_exp.loc[:, df_exp.columns[:-3]]
df_exp['duration'] = pd.Series(durations) / 60
df_exp['distance'] = pd.Series(distances) / 1000
df_exp

Unnamed: 0,origem,destino,mode,traffic_model,hour,duration,distance
1,330010,330023,transit,best_guess,6,363.616667,0.000
5,330010,330023,driving,pessimistic,18,167.316667,317.920
7,330010,330040,transit,best_guess,6,242.583333,0.000
11,330010,330040,driving,pessimistic,18,178.166667,96.636
13,330010,330045,transit,best_guess,6,115.983333,0.000
...,...,...,...,...,...,...,...
7883,330630,330490,driving,pessimistic,18,,147.459
7885,330630,330610,transit,best_guess,6,,52.143
7889,330630,330610,driving,pessimistic,18,,67.130
7891,330630,330620,transit,best_guess,6,,49.690


In [95]:
path_exp = get_path('GESTANTES', f'gmaps/route_exp.csv.gzip')
df_exp.to_csv(path_exp, index=False)