In [1]:
import sys
import json
import pickle
import numpy as np
import pandas as pd
import googlemaps as gmaps

from os import listdir, walk
from tqdm import tqdm
from pathlib import Path
from datetime import datetime
from datatable import dt, f, by

from plotly import express as px, io as pio
pd.options.plotting.backend = 'plotly'
pio.renderers.default = 'plotly_mimetype+notebook_connected'

from utils import get_path, data_load
sys.path.insert(0, '../')
from secret import API_KEY

# Descrição

Get the travel distance and time for a matrix of origins and destinations.

[Documentação](https://developers.google.com/maps/documentation/distance-matrix/start#maps_http_distancematrix_start-py)

[Repositório API Python](https://github.com/googlemaps/google-maps-services-python)

# Fontes de dados

## Municípios

### Carregando tabela

In [2]:
path_munic = get_path('LOCALIDADES', 'municipios.csv.gzip')
df_munic = dt.fread(path_munic)
df_munic

Unnamed: 0_level_0,cod_municipio,nome_municipio,capital,uf,cod_uf,nome_uf,regiao,latitude,longitude
Unnamed: 0_level_1,▪▪▪▪,▪▪▪▪,▪,▪▪▪▪,▪▪▪▪,▪▪▪▪,▪▪▪▪,▪▪▪▪▪▪▪▪,▪▪▪▪▪▪▪▪
0,520005,Abadia de Goiás,0,GO,52,Goiás,Centro-Oeste,−16.7573,−49.4412
1,310010,Abadia dos Dourados,0,MG,31,Minas Gerais,Sudeste,−18.4831,−47.3916
2,520010,Abadiânia,0,GO,52,Goiás,Centro-Oeste,−16.197,−48.7057
3,310020,Abaeté,0,MG,31,Minas Gerais,Sudeste,−19.1551,−45.4444
4,150010,Abaetetuba,0,PA,15,Pará,Norte,−1.72183,−48.8788
5,230010,Abaiara,0,CE,23,Ceará,Nordeste,−7.34588,−39.0416
6,290010,Abaíra,0,BA,29,Bahia,Nordeste,−13.2488,−41.6619
7,290020,Abaré,0,BA,29,Bahia,Nordeste,−8.72073,−39.1162
8,410010,Abatiá,0,PR,41,Paraná,Sul,−23.3049,−50.3133
9,420005,Abdon Batista,0,SC,42,Santa Catarina,Sul,−27.6126,−51.0233


## SIH

### Colunas

In [3]:
cols_sih = {
  'cod_municipio': 'origem',
  'hosp_cod_municipio': 'destino',
}

### Carregando tabela

In [4]:
path_sih = get_path('SIH', 'sih.jay')
df_sih = dt.fread(path_sih, columns=cols_sih)
df_sih.names = cols_sih
df_sih = df_sih[:, list(cols_sih.values())]
df_sih = df_sih[f.origem != f.destino, :]
df_sih['count'] = 1
df_sih = df_sih[:, dt.sum(f.count), by('origem', 'destino')]
df_sih = df_sih.sort(-f.count)
df_sih

Unnamed: 0_level_0,origem,destino,count
Unnamed: 0_level_1,▪▪▪▪,▪▪▪▪,▪▪▪▪▪▪▪▪
0,510840,510340,22807
1,315460,310620,17147
2,260790,261160,16012
3,521250,530010,15461
4,280480,280030,15090
5,330350,330285,14875
6,230370,230440,14545
7,320500,320530,14543
8,150080,150140,14521
9,432300,431490,14273


# Adicinar colunas

## Latitude & Longitude

In [5]:
def get_latlon_table_by_ref(df_munic, ref):
  cols_latlon = ['cod_municipio', 'latitude', 'longitude']
  df_latlon = df_munic[:, cols_latlon]
  df_latlon.names = {
    'cod_municipio': ref,
    'latitude': f'{ref}_latitude',
    'longitude': f'{ref}_longitude',
  }
  df_latlon.key = ref
  return df_latlon

## Parâmetros

In [6]:
def params_in_cols(df):
  dfs = list()
  for mode in ['driving', 'transit']: # driving | transit | bicycling | walking
    for hour in [6, 12, 18]: # [6, 12, 18] | [7, 13, 19]
      for traffic_model in ['best_guess', 'optimistic', 'pessimistic']: # best_guess | optimistic | pessimistic
        if mode != 'driving' and traffic_model != 'best_guess':
          continue
        df_ = df.copy()
        df_['mode'] = mode
        df_['traffic_model'] = traffic_model
        df_['hour'] = hour
        dfs.append(df_)
  df_param = pd.concat(dfs)
  df_param = df_param.sort_values(by=[
    'origem', 'destino', 'mode', 'traffic_model', 'hour'])
  df_param = df_param.reset_index(drop=True)
  return df_param

# Modelo de Coleta

## Local pra armazenar arquivo

In [7]:
def get_backup_path(data):
  infos = [
    f'{data["hour"]}h',
    data['mode'],
    data['traffic_model'],
    str(data['origem']),
    str(data['destino'])]
  fname = '-'.join(infos)
  dir = '/'.join(infos[:-1])
  path_dir = get_path('LOCALIDADES', f'google_maps/{dir}')
  Path(path_dir).mkdir(parents=True, exist_ok=True)
  return path_dir + f'/{fname}.json'

## Requisição para API

In [8]:
def get_travel_info(row):
  departure_time = datetime(year=2022, month=11, day=7, hour=row['hour'])
  info = row.copy()
  info['departure_time'] = str(departure_time)
  res = gmaps.distance_matrix(
    origins = [(row['origem_latitude'], row['origem_longitude'])],
    destinations = [(row['destino_latitude'], row['destino_longitude'])],
    mode = row['mode'],
    traffic_model = row['traffic_model'],
    departure_time = departure_time,
    language = 'pt-BR',
    units = 'metric',)
  info['origin_addresses'] = res['origin_addresses'][0]
  info['destination_addresses'] = res['destination_addresses'][0]
  elem = res['rows'][0]['elements'][0]
  info['status'] = elem['status']
  for col in list(elem):
    if col == 'status':
      continue
    for i_type in ['text', 'value']:
      col_name = f'{col} ({i_type})'
      if elem['status'] == 'OK':
        info[col_name] = elem[col][i_type]
      else:
        info[col_name] = ''
  return info

## Coleta das informações

In [9]:
def request_travel_infos(df):
  for row in tqdm(df.to_dict(orient='records')):
    fpath = get_backup_path(row)
    if Path(fpath).is_file():
      continue
    info = get_travel_info(row)
    with open(fpath, 'w') as fp:
      json.dump(info, fp)

# Gerando tabela

Tabela com informações a serem enviadas para a API do Google Maps.

In [10]:
df_latlon_orig = get_latlon_table_by_ref(df_munic, 'origem')
df_latlon_dest = get_latlon_table_by_ref(df_munic, 'destino')
df_route = df_sih[:, :, dt.join(df_latlon_orig)]
df_route = df_sih[:, :, dt.join(df_latlon_dest)]
df_route = params_in_cols(df_route.to_pandas())
df_route

Unnamed: 0,origem,destino,count,destino_latitude,destino_longitude,mode,traffic_model,hour
0,110001,110002,2,-9.90571,-63.0325,driving,best_guess,6
1,110001,110002,2,-9.90571,-63.0325,driving,best_guess,12
2,110001,110002,2,-9.90571,-63.0325,driving,best_guess,18
3,110001,110002,2,-9.90571,-63.0325,driving,optimistic,6
4,110001,110002,2,-9.90571,-63.0325,driving,optimistic,12
...,...,...,...,...,...,...,...,...
822871,530010,522200,2,-16.74050,-48.5159,driving,pessimistic,12
822872,530010,522200,2,-16.74050,-48.5159,driving,pessimistic,18
822873,530010,522200,2,-16.74050,-48.5159,transit,best_guess,6
822874,530010,522200,2,-16.74050,-48.5159,transit,best_guess,12


# Coleta dos dados

## Seleção parâmetros

In [11]:
def select_params(
    df,
    mode, # driving | transit | bicycling | walking
    traffic_model, # best_guess | optimistic | pessimistic
    hour, # [6, 12, 18] | [7, 13, 19]
  ):
  df_param = df[
    (
      df['mode'] == mode
    ) & (
      df['traffic_model'] == traffic_model
    ) & (
      df['hour'] == hour
    )
  ].sort_values(by='count', ascending=False)
  return df_param

## Filtros

### Intra RJ

`origem` e `destino` no estado do RJ.

In [12]:
origem_rj = df_route['origem'].astype(str).str.startswith('33')
destino_rj = df_route['destino'].astype(str).str.startswith('33')
df_route_rj = df_route[origem_rj & destino_rj]
df_route_rj

Unnamed: 0,origem,destino,count,destino_latitude,destino_longitude,mode,traffic_model,hour
497964,330010,330023,1,-22.7528,-41.8846,driving,best_guess,6
497965,330010,330023,1,-22.7528,-41.8846,driving,best_guess,12
497966,330010,330023,1,-22.7528,-41.8846,driving,best_guess,18
497967,330010,330023,1,-22.7528,-41.8846,driving,optimistic,6
497968,330010,330023,1,-22.7528,-41.8846,driving,optimistic,12
...,...,...,...,...,...,...,...,...
519643,330630,330620,1,-22.4059,-43.6686,driving,pessimistic,12
519644,330630,330620,1,-22.4059,-43.6686,driving,pessimistic,18
519645,330630,330620,1,-22.4059,-43.6686,transit,best_guess,6
519646,330630,330620,1,-22.4059,-43.6686,transit,best_guess,12


## Consulta API

In [13]:
gmaps = gmaps.Client(key=API_KEY)

In [14]:
df_param = select_params(df_route_rj, 'driving', 'best_guess', 12)
df_param

Unnamed: 0,origem,destino,count,destino_latitude,destino_longitude,mode,traffic_model,hour
508933,330350,330285,14875,-22.8028,-43.4601,driving,best_guess,12
499909,330045,330285,11761,-22.8028,-43.4601,driving,best_guess,12
508801,330350,330045,11658,-22.7640,-43.3992,driving,best_guess,12
515857,330490,330330,7848,-22.8832,-43.1034,driving,best_guess,12
500041,330045,330510,7263,-22.8058,-43.3729,driving,best_guess,12
...,...,...,...,...,...,...,...,...
507529,330310,330090,1,-21.5691,-41.9187,driving,best_guess,12
507541,330310,330170,1,-22.7858,-43.3049,driving,best_guess,12
507553,330310,330205,1,-21.4296,-41.7014,driving,best_guess,12
507589,330310,330490,1,-22.8268,-43.0634,driving,best_guess,12


In [15]:
request_travel_infos(df_param)

100%|██████████| 1316/1316 [00:00<00:00, 26491.19it/s]


RJ driving best_guess 6h

100%|██████████| 1316/1316 [04:07<00:00,  5.32it/s]

RJ driving best_guess 12h

100%|██████████| 1316/1316 [04:28<00:00,  4.91it/s]

RJ driving best_guess 18h

100%|██████████| 1316/1316 [04:29<00:00,  4.89it/s]

# Resultado da coleta

## Lendo arquivos salvos

In [16]:
def read_data_to_df(path_dir, cols_order=None):
  files = list()
  for (dirpath, _, filenames) in walk(path_dir):
    files.extend([dirpath + '/' + fl for fl in filenames if 'json' in fl])
  infos = list()
  for file in files:
    with open(file) as fl:
      d = json.load(fl)
      infos.append(d)
  df = pd.DataFrame(infos)
  cols_order = df.columns if cols_order is None else cols_order
  return df[cols_order]

## Ordenando colunas

In [17]:
cols_order = [
  'status', 'origem', 'destino', 'origin_addresses', 'destination_addresses',
  'origem_latitude', 'origem_longitude', 'destino_latitude', 'destino_longitude',
  'hour', 'departure_time', 'mode', 'traffic_model',
  'distance (value)', 'distance (text)',
  'duration (value)', 'duration (text)',
  'duration_in_traffic (value)', 'duration_in_traffic (text)',
  'count',
]

## Tabela resultante

In [18]:
path_gmaps = get_path('LOCALIDADES', f'google_maps/')
df_files = read_data_to_df(path_gmaps, cols_order=cols_order)
df_files

Unnamed: 0,status,origem,destino,origin_addresses,destination_addresses,origem_latitude,origem_longitude,destino_latitude,destino_longitude,hour,departure_time,mode,traffic_model,distance (value),distance (text),duration (value),duration (text),duration_in_traffic (value),duration_in_traffic (text),count
0,OK,330330,330430,"R. Maria Bregua, 37 - São Lourenço, Niterói - ...","rua luiz guimaraes, 190 - Centro, Rio Bonito -...",-22.8832,-43.1034,-22.7181,-42.6276,6,2022-11-07 06:00:00,driving,best_guess,61850,"61,8 km",2995,50 minutos,2838,47 minutos,2
1,OK,330330,330580,"R. Maria Bregua, 37 - São Lourenço, Niterói - ...","R. Wancler Fonseca, 153 - Agriões, Teresópolis...",-22.8832,-43.1034,-22.4165,-42.9752,6,2022-11-07 06:00:00,driving,best_guess,81728,"81,7 km",5582,1 hora 33 minutos,5180,1 hora 26 minutos,2
2,OK,330330,330040,"R. Maria Bregua, 37 - São Lourenço, Niterói - ...","Av. Jansen de Melo, nº15 - Centro, Barra Mansa...",-22.8832,-43.1034,-22.5481,-44.1752,6,2022-11-07 06:00:00,driving,best_guess,140548,141 km,7014,1 hora 57 minutos,6746,1 hora 52 minutos,2
3,OK,330330,330390,"R. Maria Bregua, 37 - São Lourenço, Niterói - ...","R. Cel. Land, 11 - Valparaíso, Petrópolis - RJ...",-22.8832,-43.1034,-22.5200,-43.1926,6,2022-11-07 06:00:00,driving,best_guess,72202,"72,2 km",4564,1 hora 16 minutos,4300,1 hora 12 minutos,1
4,OK,330330,330190,"R. Maria Bregua, 37 - São Lourenço, Niterói - ...","R. Quarenta Tres Lto Ampliacao Ant D, 181 - Am...",-22.8832,-43.1034,-22.7565,-42.8639,6,2022-11-07 06:00:00,driving,best_guess,34104,"34,1 km",2066,34 minutos,1938,32 minutos,8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3943,OK,330380,330040,"R. Manoel Tôrres, 24 - Parque Imperial, Paraty...","Av. Jansen de Melo, nº15 - Centro, Barra Mansa...",-23.2221,-44.7175,-22.5481,-44.1752,18,2022-11-07 18:00:00,driving,best_guess,164056,164 km,10235,2 horas 51 minutos,10702,2 horas 58 minutos,1
3944,OK,330380,330130,"R. Manoel Tôrres, 24 - Parque Imperial, Paraty...","R. Jonas Nunes, 200 - Nossa Sra. Saude, Casimi...",-23.2221,-44.7175,-22.4812,-42.2066,18,2022-11-07 18:00:00,driving,best_guess,371952,372 km,19391,5 horas 23 minutos,19776,5 horas 30 minutos,1
3945,OK,330380,330330,"R. Manoel Tôrres, 24 - Parque Imperial, Paraty...","R. Maria Bregua, 37 - São Lourenço, Niterói - ...",-23.2221,-44.7175,-22.8832,-43.1034,18,2022-11-07 18:00:00,driving,best_guess,256180,256 km,14230,3 horas 57 minutos,14562,4 horas 3 minutos,1
3946,OK,330380,330510,"R. Manoel Tôrres, 24 - Parque Imperial, Paraty...","R. Profa. Francisca César, 165 - Centro, São J...",-23.2221,-44.7175,-22.8058,-43.3729,18,2022-11-07 18:00:00,driving,best_guess,234709,235 km,12897,3 horas 35 minutos,13299,3 horas 42 minutos,1


# Investigação

## Colunas a investigar

In [19]:
cols_inv = [
  'origem', 'destino', 'hour',
  'duration (value)',
  'duration_in_traffic (value)',
]

In [20]:
df_inv = df_files[cols_inv].sort_values(
  by=['origem', 'destino', 'hour']
).reset_index(drop=True)
df_inv

Unnamed: 0,origem,destino,hour,duration (value),duration_in_traffic (value)
0,330010,330023,6,18528,18628
1,330010,330023,12,18249,18718
2,330010,330023,18,18249,18389
3,330010,330040,6,7113,6948
4,330010,330040,12,7113,7143
...,...,...,...,...,...
3943,330630,330610,12,4727,4575
3944,330630,330610,18,4727,4842
3945,330630,330620,6,3646,3572
3946,330630,330620,12,3646,3501


## % diferença com tráfego

### Tabela com atraso percentual

In [21]:
traffic = 'duration_in_traffic (value)'
normal = 'duration (value)'
df_inv['late_pct'] = (df_inv[traffic] - df_inv[normal]) / df_inv[normal]
df_inv

Unnamed: 0,origem,destino,hour,duration (value),duration_in_traffic (value),late_pct
0,330010,330023,6,18528,18628,0.005397
1,330010,330023,12,18249,18718,0.025700
2,330010,330023,18,18249,18389,0.007672
3,330010,330040,6,7113,6948,-0.023197
4,330010,330040,12,7113,7143,0.004218
...,...,...,...,...,...,...
3943,330630,330610,12,4727,4575,-0.032156
3944,330630,330610,18,4727,4842,0.024328
3945,330630,330620,6,3646,3572,-0.020296
3946,330630,330620,12,3646,3501,-0.039770


### Histograma

In [22]:
px.histogram(df_inv, x='late_pct', color='hour', barmode='overlay', opacity=.5)

## Comparação entre horas

### Consultar duração

In [23]:
def get_duration(df, origem, destino, hour, col):
  return list(df[
    (
      df['origem'] == origem
    ) & (
      df['destino'] == destino
    ) & (
      df['hour'] == hour
    )
  ][col])[0]

### Tabela de comparação entre horários

In [24]:
def hours_comparison(df):
  cols_route = ['origem', 'destino']
  cols_duration = ['duration (value)', 'duration_in_traffic (value)']
  # hours = list(df_inv['hour'].unique())
  df_route = df[cols_route].groupby(cols_route, as_index=False).sum()
  df_route_soon = df_route.copy()
  df_route_late = df_route.copy()
  df_route_soon['time'] = 'soon (6am)'
  df_route_late['time'] = 'late (6pm)'
  col_val_soon = list()
  col_val_late = list()
  for row in df_route.to_dict(orient='records'):
    dur_6am = get_duration(df, row['origem'], row['destino'], 6, cols_duration[0])
    dur_12h = get_duration(df, row['origem'], row['destino'], 12, cols_duration[0])
    dur_6pm = get_duration(df, row['origem'], row['destino'], 18, cols_duration[0])
    soon = (dur_12h - dur_6am) / dur_12h
    late = (dur_6pm - dur_12h) / dur_12h
    col_val_soon.append(soon)
    col_val_late.append(late)
  df_route_soon['diff_perct'] = col_val_soon
  df_route_late['diff_perct'] = col_val_late
  df_comp = pd.concat([df_route_soon, df_route_late])
  df_comp = df_comp.sort_values(cols_route).reset_index(drop=True)
  return df_comp

In [25]:
df_comp = hours_comparison(df_inv)
df_comp

Unnamed: 0,origem,destino,time,diff_perct
0,330010,330023,soon (6am),-0.015289
1,330010,330023,late (6pm),0.000000
2,330010,330040,soon (6am),0.000000
3,330010,330040,late (6pm),0.000000
4,330010,330045,soon (6am),0.000000
...,...,...,...,...
2627,330630,330490,late (6pm),0.000000
2628,330630,330610,soon (6am),0.000000
2629,330630,330610,late (6pm),0.000000
2630,330630,330620,soon (6am),0.000000


### Histograma

In [26]:
px.histogram(df_comp, x='diff_perct', color='time', barmode='overlay', opacity=.5)

## Comparação com OSM

### Lendo tabela OSM

In [27]:
path_dist = get_path('DISTANCIAS', 'distancias.jay')
df_osm = dt.fread(path_dist)
df_osm = df_osm.to_pandas()
df_osm

Unnamed: 0,origem,destino,distancia,tempo
0,110001,110001,0.000000,0.000000
1,110001,110002,309.050000,6.169056
2,110001,110003,399.499700,6.289056
3,110001,110004,81.201103,1.917750
4,110001,110005,391.704300,6.136361
...,...,...,...,...
31024895,530010,522200,161.728900,2.356000
31024896,530010,522205,382.708800,5.479556
31024897,530010,522220,161.603400,2.248944
31024898,530010,522230,190.000900,3.216083


### Mesclando tabelas

Normalizando o tempo entre elas (ambas em segundos).

In [28]:
df_dist = pd.merge(df_inv, df_osm, how='left', on=['origem', 'destino'])
df_dist['tempo'] = df_dist['tempo'] * 60 * 60
df_dist

Unnamed: 0,origem,destino,hour,duration (value),duration_in_traffic (value),late_pct,distancia,tempo
0,330010,330023,6,18528,18628,0.005397,317.3177,15241.9
1,330010,330023,12,18249,18718,0.025700,317.3177,15241.9
2,330010,330023,18,18249,18389,0.007672,317.3177,15241.9
3,330010,330040,6,7113,6948,-0.023197,95.8852,6939.5
4,330010,330040,12,7113,7143,0.004218,95.8852,6939.5
...,...,...,...,...,...,...,...,...
3943,330630,330610,12,4727,4575,-0.032156,66.0632,4166.0
3944,330630,330610,18,4727,4842,0.024328,66.0632,4166.0
3945,330630,330620,6,3646,3572,-0.020296,55.0684,3300.1
3946,330630,330620,12,3646,3501,-0.039770,55.0684,3300.1


### Coluna da diferença percentual

In [30]:
reference = ['duration_in_traffic (value)', 'duration (value)']
compare = 'tempo'
df_dist['diff_pct_traffic'] = (df_dist[reference[0]] - df_dist[compare]) / df_dist[reference[0]]
df_dist['diff_pct_normal'] = (df_dist[reference[1]] - df_dist[compare]) / df_dist[reference[1]]
df_dist

Unnamed: 0,origem,destino,hour,duration (value),duration_in_traffic (value),late_pct,distancia,tempo,diff_pct_traffic,diff_pct_normal
0,330010,330023,6,18528,18628,0.005397,317.3177,15241.9,0.181775,0.177359
1,330010,330023,12,18249,18718,0.025700,317.3177,15241.9,0.185709,0.164782
2,330010,330023,18,18249,18389,0.007672,317.3177,15241.9,0.171140,0.164782
3,330010,330040,6,7113,6948,-0.023197,95.8852,6939.5,0.001223,0.024392
4,330010,330040,12,7113,7143,0.004218,95.8852,6939.5,0.028489,0.024392
...,...,...,...,...,...,...,...,...,...,...
3943,330630,330610,12,4727,4575,-0.032156,66.0632,4166.0,0.089399,0.118680
3944,330630,330610,18,4727,4842,0.024328,66.0632,4166.0,0.139612,0.118680
3945,330630,330620,6,3646,3572,-0.020296,55.0684,3300.1,0.076120,0.094871
3946,330630,330620,12,3646,3501,-0.039770,55.0684,3300.1,0.057384,0.094871


### Histograma (tráfego)

In [31]:
px.histogram(df_dist, x='diff_pct_traffic', color='hour', barmode='overlay', opacity=.5)

### Histograma (normal)

In [32]:
px.histogram(df_dist, x='diff_pct_normal', color='hour', barmode='overlay', opacity=.5)