# mask ParsedTaxiData

In [1]:
import os
import time
import pandas as pd
import numpy as np
import argparse
from datetime import datetime as dt
from datetime import date, timedelta
import networkx as nx
from utils import time_difference
from road_graph import get_road_list, road_graph


def read_GPS_dataset(date_range=['20160325', '20160325'], in_path='data/ParsedTaxiData_%s.csv', test_mode=False):
    """Load the per-day parsed taxi GPS CSV files for an inclusive date range.

    Args:
        date_range: Two-element sequence ``[start_date, end_date]`` of
            ``'YYYYMMDD'`` strings; both ends are inclusive.
        in_path: Path template with one ``%s`` placeholder that is filled
            with each ``'YYYYMMDD'`` date string in turn.
        test_mode: When truthy, only the first 100000 rows of every daily
            file are read.

    Returns:
        pandas.DataFrame with the twelve columns named in ``column_names``.
        Exact duplicate rows are dropped within each daily file, and the
        daily frames are concatenated in date order. Each day's own row
        index is kept (the result is not re-indexed).

    Raises:
        ValueError: If ``end_date`` is earlier than ``start_date``.
    """
    print('Reading GPS dataset')
    start_date, end_date = date_range
    start_date, end_date = dt.strptime(start_date, '%Y%m%d'), dt.strptime(end_date, '%Y%m%d')
    n_days = (end_date - start_date).days + 1
    if n_days < 1:
        # Fail early with a clear message instead of pd.concat's
        # "No objects to concatenate".
        raise ValueError('date_range end %s is earlier than start %s' % (date_range[1], date_range[0]))
    date_list = [(start_date + timedelta(offset)).strftime('%Y%m%d') for offset in range(n_days)]
    column_names = ['vehicle_id', 'lon', 'lat', 'speed', 'direction', 'status', 'time',
                    'closest_road_id', 'matched_lon', 'matched_lat', 'matched_road_id', 'matched_road_name']
    # nrows=None reads the whole file; test mode caps every daily file.
    nrows = 100000 if test_mode else None
    # The input files are read without a header row (header=None).
    df_list = [pd.read_csv(in_path % day, header=None, names=column_names, nrows=nrows).drop_duplicates()
               for day in date_list]
    return pd.concat(df_list)

In [2]:
# Parameter settings for this run.

# Date window as 'YYYYMMDD' strings; a single day is processed here.
# ('20160325' was the previously used day.)
start_date = end_date = '20160314'

# Thresholds for trajectory extraction.
time_gap, stay_duration, speed_limit = 10, 2, 120

# Input / output locations (the GPS path keeps a %s slot for the date).
GPS_path = 'data/ParsedTaxiData_%s_v0.csv'
road_list_path = 'data/road_list.csv'
graph_path = 'data/road_graph.gml'
trajectory_path = 'data/recovered_trajectory_df_%s_%s.csv' % (start_date, end_date)

In [3]:
# Obtain the road list and the road graph through the project helpers.
# No road_df is supplied and update=False; the saved output below
# ("Road list exists" / "Graph exists") suggests existing files at the
# given paths are reused -- presumably a cache hit; confirm in road_graph.py.
road_list = get_road_list(
    road_df=None,
    out_path=road_list_path,
    update=False,
)
G = road_graph(
    road_df=None,
    out_path=graph_path,
    update=False,
)

Road list exists
Graph exists


In [4]:
# Read the raw GPS records for the configured window, swap two vehicle IDs
# for placeholder strings, and keep only the three columns written out later.
df = (
    read_GPS_dataset(date_range=[start_date, end_date], in_path=GPS_path, test_mode=0)
    .replace({'vehicle_id': {'SHA9148Z': 'XXXXXXX', 'SHC1124J': 'YYYYYYY'}})
    .loc[:, ['vehicle_id', 'time', 'matched_road_id']]
)

Reading GPS dataset


In [5]:
# Preview the frame; at this point it holds only vehicle_id, time and
# matched_road_id.
df

Unnamed: 0,vehicle_id,time,matched_road_id
0,XXXXXXX,14/03/2016 00:00:00,103047123
1,XXXXXXX,14/03/2016 00:00:05,103063511
2,XXXXXXX,14/03/2016 00:00:10,104004183
3,XXXXXXX,14/03/2016 00:00:20,103055323
4,XXXXXXX,14/03/2016 00:00:26,103055319
5,XXXXXXX,14/03/2016 00:00:35,103055311
6,XXXXXXX,14/03/2016 00:00:41,103007624
7,XXXXXXX,14/03/2016 00:00:50,103113967
8,XXXXXXX,14/03/2016 00:00:56,103113987
9,XXXXXXX,14/03/2016 00:01:05,103113963


In [6]:
# Persist the masked frame. to_csv writes a header row by default, the index
# is dropped (index=False), and only the three retained columns are written.
# NOTE(review): read_GPS_dataset parses its inputs with header=None and twelve
# column names -- confirm that whatever consumes this file expects the
# headered, three-column layout. The date in the path is hard-coded.
df.to_csv('data/ParsedTaxiData_20160314.csv', index=False)

# trajectory transition

In [8]:
import pickle as pkl

In [11]:
# Load the pickled trajectory-transition object for 20160314.
# NOTE(review): pickle.load can execute arbitrary code while deserialising;
# only open pickle files that were produced locally or come from a trusted source.
with open('data/trajectory_transition_20160314_20160314.pkl', 'rb') as f:
    trajectory_transition = pkl.load(f)

In [12]:
# Inspect the dimensions of the loaded object (saved output: three axes,
# the last two of equal size).
trajectory_transition.shape

(96, 2404, 2404)

# RF

In [16]:
# Load the pickled predictions stored under the RF result file name and show
# their shape.
# NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
# The path points outside this notebook's directory ('../trajectory/...').
with open('../trajectory/result/RF_5hop_100estimator_Y_pred.pkl', 'rb') as f:
    RF_result= pkl.load(f)
RF_result.shape

(1340, 2404)

In [17]:
# 1340 is the first dimension of RF_result shown above. Dividing by 4 and by
# 24 presumably converts 15-minute intervals into days -- TODO confirm the
# interval length against the RF pipeline.
1340/4/24

13.958333333333334

# TrGNN

In [20]:
import pandas as pd
import time
from datetime import date, timedelta
from datetime import datetime as dt
import os
import folium
from utils import *
import random
import numpy as np
from math import radians, degrees, sin, cos, asin, acos, sqrt
import pickle as pkl
import networkx as nx
from metrics import *
from trajectory_transition import extract_trajectory_transition
from road_graph import extract_road_adj
from model import *
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from sklearn.preprocessing import StandardScaler
import argparse

In [23]:
model_name = 'TrGNN'
dataset = 'demo'
model_path = ''  # path of a pre-trained checkpoint; '' means start from scratch
start_time = time.time()

# Device (resolved first so a checkpoint can be mapped onto it while loading)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Model and log
models = {'TrGNN':Model_TrGNN, 'TrGNN-':Model_GNN}
model = models[model_name]()
if model_path == '': # if no pre-trained model path
    # Fresh run: identify it by model name and the integer start timestamp.
    prefix = '%s_%s'%(model_name, int(start_time))
    checkpoint_epoch = -1
elif os.path.isfile(model_path):
    # map_location lets a checkpoint saved on a GPU be restored on a CPU-only host.
    model.load_state_dict(torch.load(model_path, map_location=device))
    # Checkpoints are named '<prefix>_<N>epoch.cpt' (see the template below).
    # Parse the file name only, so a directory component such as 'model/'
    # is not duplicated in the rebuilt model/log paths.
    checkpoint_name = os.path.basename(model_path)
    prefix = '_'.join(checkpoint_name.split('_')[:2])
    checkpoint_epoch = int(checkpoint_name.split('_')[-1][:-len('epoch.cpt')])
else:
    # Previously this fell through and failed later with a NameError on `prefix`.
    raise FileNotFoundError('Pre-trained model not found: %s' % model_path)
# From here on model_path is a template; the remaining '%d' slot takes the epoch number.
model_path = 'model/%s_%sepoch.cpt'%(prefix, '%d')
log_path = 'log/%s.log'%prefix


model = model.to(device)
print_log(device, log_path)


# Dataset
# 'sg_expressway_4weeks', 'sg_expressway_8weeks'
road_adj = extract_road_adj() # directed adj

# Daily flow calibration switch. The demo explicitly disables it.
# NOTE(review): the other datasets never assigned `calibrate`, so `if calibrate:`
# raised NameError on a fresh kernel. Enabling calibration for them is an
# assumption -- confirm against the training script.
calibrate = dataset != 'demo'

# Period used to build the trajectory transition (train + validation only).
if dataset == 'demo':
    start_date, end_date = '20160314', '20160314'
elif dataset == 'sg_expressway_8weeks':
    start_date, end_date = '20160314', '20160424' # train period + validation period
else:
    start_date, end_date = '20160401', '20160421' # train period + validation period
trajectory_transition = extract_trajectory_transition(start_date, end_date)
# smoothing with binary road_adj, in case no historical flow is recorded.
road_adj_mask = np.zeros(road_adj.shape)
road_adj_mask[road_adj > 0] = 1
np.fill_diagonal(road_adj_mask, 0) # no self-transition is added by the smoothing
for i in range(len(trajectory_transition)):
    trajectory_transition[i] = trajectory_transition[i] + road_adj_mask

# Period of the flow data (train + validation + test); this rebinds
# start_date / end_date to the longer window.
if dataset == 'demo':
    start_date, end_date = '20160314', '20160314'
elif dataset == 'sg_expressway_8weeks':
    start_date, end_date = '20160314', '20160508' # train (5 weeks) + validation (1 week) + test (2 weeks)
else:
    start_date, end_date = '20160401', '20160428' # train + validation + test
dates = date_range(start_date, end_date)
# One flow file per day, stacked row-wise; the first CSV column is the index.
flow_df = pd.concat([pd.read_csv('data/flow_%s_%s.csv'%(day, day), index_col=0) for day in dates])
# Column labels are read as strings; convert them to integer road ids.
flow_df.columns = pd.Index(int(road_id) for road_id in flow_df.columns)
# flow calibration on a daily basis
if calibrate:
    print_log('Calibrating flow...', log_path)
    trajectory_metadata = pd.read_csv('data/trajectory_metadata.csv') # read trajectory metadata
    # Ratio of the first row's vehicle count to each row's count, repeated 96
    # times per row -- presumably one metadata row per day and 96 intervals
    # per day; TODO confirm against trajectory_metadata.csv.
    multipliers = np.repeat(np.array(trajectory_metadata['vehicles'][0] / trajectory_metadata['vehicles']), 96)
    multipliers[multipliers==np.inf]=0 # rows with zero vehicles get multiplier 0
    flow_df = flow_df.mul(multipliers, axis=0)
print_log(flow_df.shape, log_path)
print_log('Total flow: %d'%(flow_df.sum().sum()), log_path)

Road adj exists
Total file exists


In [29]:
# Positional train / validation / test indices per dataset, plus the
# day indices (relative to the dataset's first day) that are working days.
if dataset == 'demo': # 20160314
    indices = {'train': list(range(56)), # first 14 hours
               'val': list(range(56, 68)), # next 3 hours
               'test': list(range(68, 92))} # last 6 hours
    weekdays = np.array([0]) # day 0 (i.e. 20160314) is a weekday
elif dataset == 'sg_expressway_8weeks': # version 20160314-20160508
    indices = {'train': list(range(3220)), # first 5 weeks 20160314-20160417 (24-1)*(60/15)*35
               'val': list(range(3220, 3864)), # 6th week 20160418-20160424 (24-1)*(60/15)*7
               'test': list(range(3864, 5152))} # 7th-8th weeks 20160425-20160508 (24-1)*(60/15)*14
    # indices of weekdays (exclude weekends and PHs)
    weekdays = np.array([0, 1, 2, 3, 4,
                         7, 8, 9, 10, # PH: 25th March, Friday (day 11)
                         14, 15, 16, 17, 18,
                         21, 22, 23, 24, 25,
                         28, 29, 30, 31, 32,
                         35, 36, 37, 38, 39,
                         42, 43, 44, 45, 46,
                         50, 51, 52, 53]) # PH: 2nd May, Monday (day 49)
else: # version 20160401-20160428
    indices = {'train': list(range(1288)), # first two weeks (24-1)*(60/15)*14
               'val': list(range(1288, 1932)), # third week (24-1)*(60/15)*7
               'test': list(range(1932, 2576))} # fourth week (24-1)*(60/15)*7
    # Previously `weekdays` was left undefined (or stale) for this dataset.
    # 20160401 is a Friday, so day 0 is a weekday, followed by Mon-Fri of each
    # later week, ending on Thursday 20160428.
    # NOTE(review): assumes no public holiday falls in this window -- confirm.
    weekdays = np.array([0,
                         3, 4, 5, 6, 7,
                         10, 11, 12, 13, 14,
                         17, 18, 19, 20, 21,
                         24, 25, 26, 27])

In [30]:
# Show the dimensions of the flow table (rows x road-id columns).
print(flow_df.shape)

(96, 2404)


In [31]:
# Confirm which dataset configuration is currently active.
dataset

'demo'

In [32]:
# Train indices followed by validation indices (plain list concatenation).
indices['train'] + indices['val']

[0,
 1,
 2,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 11,
 12,
 13,
 14,
 15,
 16,
 17,
 18,
 19,
 20,
 21,
 22,
 23,
 24,
 25,
 26,
 27,
 28,
 29,
 30,
 31,
 32,
 33,
 34,
 35,
 36,
 37,
 38,
 39,
 40,
 41,
 42,
 43,
 44,
 45,
 46,
 47,
 48,
 49,
 50,
 51,
 52,
 53,
 54,
 55,
 56,
 57,
 58,
 59,
 60,
 61,
 62,
 63,
 64,
 65,
 66,
 67]

In [33]:
# Rows of flow_df selected by position with the train + validation indices,
# returned as a NumPy array.
flow_df.iloc[indices['train'] + indices['val']].values

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [34]:
# Load the pickled predictions saved under a TrGNN run name and show their shape.
# NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
# The saved output has 24 rows, the same count as the demo 'test' indices
# (range(68, 92)) -- presumably one row per test interval; confirm.
with open('result/TrGNN_1596949116_0epoch_Y_pred.pkl', 'rb') as f:
    Y_pred = pkl.load(f)
Y_pred.shape

(24, 2404)