In [1]:
import sys
import numpy as np
import pandas as pd

from datatable import dt, f, by

from plotly import express as px, io as pio
pd.options.plotting.backend = 'plotly'
pio.renderers.default = 'plotly_mimetype+notebook_connected'

from utils import get_path, data_load
sys.path.insert(0, '../')
from secret import API_KEY

# Fontes de Dados

## Rotas SIH

### Colunas

In [2]:
# Rename map applied to the SIH table: source column name -> name used in this notebook.
cols_sih = dict(
  cod_municipio='origem',
  hosp_cod_municipio='destino',
)

### Carregando tabela

In [3]:
# Build the SIH route table: one row per (origem, destino) pair with its number of records.
path_sih = get_path('SIH', 'sih.jay')
# NOTE(review): the rename mapping is passed to fread AND re-applied through `.names`
# on the next line - presumably fread does not apply `columns=` when opening a .jay
# file; confirm before simplifying either line.
df_sih = dt.fread(path_sih, columns=cols_sih)
df_sih.names = cols_sih
# Keep only the two renamed columns.
df_sih = df_sih[:, list(cols_sih.values())]
# Drop rows whose origin and destination codes are equal.
df_sih = df_sih[f.origem != f.destino, :]
# Constant 1 per row, so the grouped sum below is the number of rows per pair.
df_sih['count'] = 1
df_sih = df_sih[:, dt.sum(f.count), by('origem', 'destino')]
# Most frequent routes first.
df_sih = df_sih.sort(-f.count)
df_sih

Unnamed: 0_level_0,origem,destino,count
Unnamed: 0_level_1,▪▪▪▪,▪▪▪▪,▪▪▪▪▪▪▪▪
0,510840,510340,22807
1,315460,310620,17147
2,260790,261160,16012
3,521250,530010,15461
4,280480,280030,15090
5,330350,330285,14875
6,230370,230440,14545
7,320500,320530,14543
8,150080,150140,14521
9,432300,431490,14273


## Rotas OSM

### Carregando tabela

In [4]:
# Load the OSM route table (columns shown in the output: origem, destino,
# distancia (km), tempo (min), count).
# The file name used to be an f-string without placeholders; a plain literal
# yields exactly the same string.
# NOTE(review): pandas only infers compression from extensions such as '.gz',
# not '.gzip'. The load works as-is, so the file is presumably stored
# uncompressed (or get_path resolves it differently) - confirm.
path_osm = get_path('DISTANCIAS', 'deslocamentos.csv.gzip')
df_osm = pd.read_csv(path_osm)
df_osm

Unnamed: 0,origem,destino,distancia (km),tempo (min),count
0,355030,355030,0.0000,0.000000,789070
1,330455,330455,0.0000,0.000000,358134
2,130260,130260,0.0000,0.000000,296660
3,530010,530010,0.0000,0.000000,288921
4,230440,230440,0.0000,0.000000,196047
...,...,...,...,...,...
71833,530010,521308,376.3507,360.035000,1
71834,530010,521770,328.3734,283.283333,1
71835,530010,522000,153.6553,146.226667,1
71836,530010,522020,515.6676,411.118333,1


## Rotas Gmaps

### Colunas

In [5]:
# Columns kept from the Gmaps routes table: the route key columns first,
# followed by the numeric "(value)" measurements.
cols_gmaps = [
  *('origem', 'destino', 'count'),
  *('distance (value)', 'duration (value)', 'duration_in_traffic (value)'),
]

### Carregando tabela

In [6]:
# Load the raw Gmaps responses: one row per requested route and
# (hour, mode, traffic_model) combination, including failed requests
# (e.g. status ZERO_RESULTS, whose measurement columns are empty).
# The file name used to be an f-string without placeholders; a plain literal
# yields exactly the same string.
path_routes = get_path('GESTANTES', 'gmaps/routes.csv.gzip')
df_files = pd.read_csv(path_routes)
df_files

Unnamed: 0,status,origem,destino,origin_addresses,destination_addresses,origem_latitude,origem_longitude,destino_latitude,destino_longitude,hour,departure_time,mode,traffic_model,distance (value),distance (text),duration (value),duration (text),duration_in_traffic (value),duration_in_traffic (text),count
0,ZERO_RESULTS,292070.0,291360.0,"-14.1035,-39.0137","-14.793,-39.046",-14.1035,-39.0137,-14.7930,-39.0460,6.0,2022-11-07 06:00:00,transit,best_guess,,,,,,,678.0
1,ZERO_RESULTS,290515.0,293330.0,"-14.3347,-40.9175","-14.8615,-40.8442",-14.3347,-40.9175,-14.8615,-40.8442,6.0,2022-11-07 06:00:00,transit,best_guess,,,,,,,883.0
2,OK,330330.0,330490.0,"R. Maria Bregua, 37 - São Lourenço, Niterói - ...","R. Temistocles de Almeida, 134 - Camarão, São ...",-22.8832,-43.1034,-22.8268,-43.0634,6.0,2022-11-07 06:00:00,transit,best_guess,9900.0,"9,9 km",2271.0,38 minutos,,,204.0
3,OK,330330.0,330185.0,"R. Maria Bregua, 37 - São Lourenço, Niterói - ...","R. João da Silva Maia, 14 - Parque Santo Anton...",-22.8832,-43.1034,-22.5347,-42.9895,6.0,2022-11-07 06:00:00,transit,best_guess,63545.0,"63,5 km",12388.0,3 horas 26 minutos,,,2.0
4,OK,330330.0,330023.0,"R. Maria Bregua, 37 - São Lourenço, Niterói - ...","Av. José Bento Ribeiro Dantas, 842, Búzios - R...",-22.8832,-43.1034,-22.7528,-41.8846,6.0,2022-11-07 06:00:00,transit,best_guess,171243.0,171 km,28870.0,8 horas 1 minuto,,,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
65933,OK,291810.0,280290.0,"R. Delmiro Gouveia, 109 - Centro, Jeremoabo - ...","R. Cap. José Ferreira Neto, 244 - Centro, Itab...",-10.0685,-38.3471,-10.6826,-37.4273,18.0,2022-11-07 18:00:00,driving,pessimistic,144835.0,145 km,7519.0,2 horas 5 minutos,7908.0,2 horas 12 minutos,6.0
65934,OK,431075.0,431310.0,"Av. Gen. Osório, 20, Ivorá - RS, 98160-000, Br...","R. Raimundo Aléssio, 325 - Nova Palma, RS, 972...",-29.5232,-53.5842,-29.4710,-53.4689,18.0,2022-11-07 18:00:00,driving,pessimistic,19878.0,"19,9 km",2253.0,38 minutos,2253.0,38 minutos,3.0
65935,OK,431075.0,431490.0,"Av. Gen. Osório, 20, Ivorá - RS, 98160-000, Br...","R. Casemiro de Abreu, 193 - Boa Vista, Porto A...",-29.5232,-53.5842,-30.0318,-51.2065,18.0,2022-11-07 18:00:00,driving,pessimistic,285093.0,285 km,14776.0,4 horas 6 minutos,16624.0,4 horas 37 minutos,4.0
65936,OK,431075.0,431690.0,"Av. Gen. Osório, 20, Ivorá - RS, 98160-000, Br...","R. Conde de Porto Alegre, 365 - Bonfim, Santa ...",-29.5232,-53.5842,-29.6868,-53.8149,18.0,2022-11-07 18:00:00,driving,pessimistic,48429.0,"48,4 km",3566.0,59 minutos,4101.0,1 hora 8 minutos,53.0


### Selecionando

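Keep only successful (`OK`) requests for the `driving` mode with the `pessimistic` traffic model and an 18:00 (6 pm) departure.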

In [7]:
# One boolean mask per selection criterion.
status = df_files['status'].eq('OK')
mode = df_files['mode'].eq('driving')
traffic_model = df_files['traffic_model'].eq('pessimistic')
hour = df_files['hour'].eq(18)

# Filter, order by descending count, renumber the rows and keep the Gmaps columns.
df_gmaps = (
    df_files.loc[status & mode & traffic_model & hour]
    .sort_values(by='count', ascending=False)
    .reset_index(drop=True)
    .loc[:, cols_gmaps]
)
df_gmaps

Unnamed: 0,origem,destino,count,distance (value),duration (value),duration_in_traffic (value)
0,510840.0,510340.0,22807.0,7555.0,980.0,2041.0
1,315460.0,310620.0,17147.0,42332.0,3223.0,4959.0
2,260790.0,261160.0,16012.0,21010.0,2819.0,4635.0
3,521250.0,530010.0,15461.0,60888.0,3756.0,5001.0
4,280480.0,280030.0,15090.0,16221.0,1388.0,1889.0
...,...,...,...,...,...,...
57271,261020.0,260230.0,1.0,59017.0,5159.0,5393.0
57272,261020.0,260120.0,1.0,169995.0,9306.0,10312.0
57273,261020.0,260080.0,1.0,44086.0,2976.0,3216.0
57274,261020.0,261150.0,1.0,19470.0,1083.0,1188.0


# Gerando tabela

In [8]:
# Convert the datatable Frame to pandas exactly once. Without the guard,
# re-running this cell fails because a pandas DataFrame has no `.to_pandas()`.
if not isinstance(df_sih, pd.DataFrame):
    df_sih = df_sih.to_pandas()

# Attach OSM and Gmaps measurements to the SIH routes; each merge is an inner
# join on every SIH column (origem, destino, count).
df_sih_osm = pd.merge(df_sih, df_osm, on=list(df_sih.columns))
df_sih_gmaps = pd.merge(df_sih, df_gmaps, on=list(df_sih.columns))

# Routes that have both an OSM and a Gmaps measurement.
df_rotas = pd.merge(df_sih_osm, df_sih_gmaps, on=['origem', 'destino', 'count'])
df_rotas

Unnamed: 0,origem,destino,count,distancia (km),tempo (min),distance (value),duration (value),duration_in_traffic (value)
0,510840,510340,22807,7.4376,7.796667,7555.0,980.0,2041.0
1,315460,310620,17147,33.3064,32.638333,42332.0,3223.0,4959.0
2,260790,261160,16012,23.8586,26.508333,21010.0,2819.0,4635.0
3,521250,530010,15461,59.4445,68.103333,60888.0,3756.0,5001.0
4,280480,280030,15090,13.7860,17.200000,16221.0,1388.0,1889.0
...,...,...,...,...,...,...,...,...
57271,520465,520110,1,409.2583,307.260000,409137.0,19775.0,21747.0
57272,520470,170950,1,308.5311,238.916667,308501.0,14906.0,15955.0
57273,520470,172100,1,528.4934,402.926667,525439.0,25366.0,26778.0
57274,520470,520620,1,398.8699,327.613333,423046.0,20266.0,23104.0


In [9]:
# Gmaps "(value)" columns are in metres and seconds (compare them with the
# "(text)" columns of the raw routes table); convert to km and minutes so they
# are comparable with the OSM columns `distancia (km)` and `tempo (min)`.
#
# The frame is first rebuilt from the untouched merge inputs so that re-running
# this cell never divides already-converted values a second time. Columns
# derived from df_rotas in later cells must be recomputed after a re-run.
df_rotas = pd.merge(df_sih_osm, df_sih_gmaps, on=['origem', 'destino', 'count'])
df_rotas['distance (value)'] = df_rotas['distance (value)'] / 1000  # m -> km
df_rotas['duration (value)'] = df_rotas['duration (value)'] / 60  # s -> min
df_rotas['duration_in_traffic (value)'] = df_rotas['duration_in_traffic (value)'] / 60  # s -> min
df_rotas

Unnamed: 0,origem,destino,count,distancia (km),tempo (min),distance (value),duration (value),duration_in_traffic (value)
0,510840,510340,22807,7.4376,7.796667,7.555,16.333333,34.016667
1,315460,310620,17147,33.3064,32.638333,42.332,53.716667,82.650000
2,260790,261160,16012,23.8586,26.508333,21.010,46.983333,77.250000
3,521250,530010,15461,59.4445,68.103333,60.888,62.600000,83.350000
4,280480,280030,15090,13.7860,17.200000,16.221,23.133333,31.483333
...,...,...,...,...,...,...,...,...
57271,520465,520110,1,409.2583,307.260000,409.137,329.583333,362.450000
57272,520470,170950,1,308.5311,238.916667,308.501,248.433333,265.916667
57273,520470,172100,1,528.4934,402.926667,525.439,422.766667,446.300000
57274,520470,520620,1,398.8699,327.613333,423.046,337.766667,385.066667


In [10]:
# Relative difference of each Gmaps measurement with respect to OSM:
# (gmaps - osm) / osm. Positive values mean Gmaps reports more than OSM.
# The original cell computed and assigned `tempo_diff` twice; the duplicate
# statements are removed, the resulting columns are unchanged.

# Distance
dist_gmaps = df_rotas['distance (value)']
dist_osm = df_rotas['distancia (km)']
dist_diff = (dist_gmaps - dist_osm) / dist_osm

# Travel time, without and with traffic
tempo_gmaps = df_rotas['duration (value)']
tempo_traffic_gmaps = df_rotas['duration_in_traffic (value)']
tempo_osm = df_rotas['tempo (min)']
tempo_diff = (tempo_gmaps - tempo_osm) / tempo_osm
tempo_traffic_diff = (tempo_traffic_gmaps - tempo_osm) / tempo_osm

df_rotas['dist_diff'] = dist_diff
df_rotas['tempo_diff'] = tempo_diff
df_rotas['tempo_traffic_diff'] = tempo_traffic_diff
df_rotas

Unnamed: 0,origem,destino,count,distancia (km),tempo (min),distance (value),duration (value),duration_in_traffic (value),dist_diff,tempo_diff,tempo_traffic_diff
0,510840,510340,22807,7.4376,7.796667,7.555,16.333333,34.016667,0.015785,1.094912,3.362976
1,315460,310620,17147,33.3064,32.638333,42.332,53.716667,82.650000,0.270987,0.645815,1.532298
2,260790,261160,16012,23.8586,26.508333,21.010,46.983333,77.250000,-0.119395,0.772399,1.914178
3,521250,530010,15461,59.4445,68.103333,60.888,62.600000,83.350000,0.024283,-0.080809,0.223875
4,280480,280030,15090,13.7860,17.200000,16.221,23.133333,31.483333,0.176628,0.344961,0.830426
...,...,...,...,...,...,...,...,...,...,...,...
57271,520465,520110,1,409.2583,307.260000,409.137,329.583333,362.450000,-0.000296,0.072653,0.179620
57272,520470,170950,1,308.5311,238.916667,308.501,248.433333,265.916667,-0.000098,0.039833,0.113010
57273,520470,172100,1,528.4934,402.926667,525.439,422.766667,446.300000,-0.005779,0.049240,0.107646
57274,520470,520620,1,398.8699,327.613333,423.046,337.766667,385.066667,0.060611,0.030992,0.175369


# Investigação

## Completude

In [11]:
# Left-join the Gmaps measurements onto every SIH/OSM route, then split the
# result by whether a Gmaps distance is present.
routes_all = df_sih_osm.merge(
    df_sih_gmaps,
    on=['origem', 'destino', 'count'],
    how='left',
)
gmaps_missing = routes_all['distance (value)'].isna()
df_routes_valid = routes_all.loc[~gmaps_missing]
df_routes_nan = routes_all.loc[gmaps_missing]
df_routes_nan

Unnamed: 0,origem,destino,count,distancia (km),tempo (min),distance (value),duration (value),duration_in_traffic (value)
256,150030,160030,2165,168.825961,4598.445000,,,
276,150030,160060,2056,200.480983,4579.221667,,,
533,130050,130340,1392,50.406600,574.661667,,,
643,120035,120020,1236,72.945900,123.543333,,,
739,150450,150180,1120,16.330900,33.276667,,,
...,...,...,...,...,...,...,...,...
68568,530010,521308,1,376.350700,360.035000,,,
68569,530010,521770,1,328.373400,283.283333,,,
68570,530010,522000,1,153.655300,146.226667,,,
68571,530010,522020,1,515.667600,411.118333,,,


In [18]:
# Routes without Gmaps data whose count is greater than 1.
df_routes_nan.loc[df_routes_nan['count'].gt(1)]

Unnamed: 0,origem,destino,count,distancia (km),tempo (min),distance (value),duration (value),duration_in_traffic (value)
256,150030,160030,2165,168.825961,4598.445000,,,
276,150030,160060,2056,200.480983,4579.221667,,,
533,130050,130340,1392,50.406600,574.661667,,,
643,120035,120020,1236,72.945900,123.543333,,,
739,150450,150180,1120,16.330900,33.276667,,,
...,...,...,...,...,...,...,...,...
32387,311110,150530,2,3015.015000,3378.401667,,,
35840,432230,160060,2,4054.317700,7655.155000,,,
36044,510385,510760,2,459.886500,559.865000,,,
36045,510385,510790,2,474.956300,544.650000,,,


In [12]:
# Summed `count` of the routes without (nan) and with (valid) Gmaps data.
nan = df_routes_nan.loc[:, 'count'].sum()
valid = df_routes_valid.loc[:, 'count'].sum()

### Faltantes

In [13]:
# Bar chart comparing the summed count of routes without vs. with Gmaps data.
totals = {'nan': nan, 'valid': valid}
px.bar(x=list(totals.keys()), y=list(totals.values()))

In [14]:
# Distribution of `count` over the routes that have no Gmaps data.
# (Removed a leftover commented-out `range_x` argument copied from another plot.)
px.histogram(df_routes_nan, x='count')

## Diferença entre OSM e Gmaps

### Distância

In [15]:
# Histogram of the relative distance difference; x-axis view limited to [-0.3, 0.45].
px.histogram(
    data_frame=df_rotas,
    x='dist_diff',
    range_x=[-0.3, 0.45],
)

### Tempo (padrão)

In [16]:
# Histogram of the relative travel-time difference (no traffic); x-axis view limited to [-0.75, 1].
px.histogram(
    data_frame=df_rotas,
    x='tempo_diff',
    range_x=[-0.75, 1],
)

### Tempo (tráfego)

In [17]:
# Histogram of the relative travel-time difference (with traffic); x-axis view limited to [-0.6, 1.5].
px.histogram(
    data_frame=df_rotas,
    x='tempo_traffic_diff',
    range_x=[-0.6, 1.5],
)