## Adding features to Trajectories

Features used here:

['lat','lng','instante','rota','velocidade','posicao','viaje','matricula_id','lat_uber','lng_uber','label']

Adding features:

* Acceleration
* Bearing
* Weekdays
* Trajectory id
* Point id
* Statistic features
* Flag (is_noise ? 0 - Not noise, 1 - Noise)


**The main goal here is to generate the data that feeds the PAC method**

In [1]:
import os.path
import sys  
from sentences import Sentences
sys.path.insert(0, '/home/mobility/michael/segmentation/its_research')
from labeling.labels import Labels
import pandas as pd
import numpy as np
from tqdm import tqdm
import h5py

### Loading data

In [None]:
# Input locations, relative to the notebook's working directory.
GTFS_STOP_PATH = '../dublin/dublin/bus_stops.csv'
SEMAFORO_PATH = '../dublin/dublin/traffic_signals.csv'
DATA = '../data/sentences_dublin_labeled.npy'

# Bus-stop table first, traffic-signal table second (same read order as before).
stops, trfl = (
    pd.read_csv(csv_path) for csv_path in (GTFS_STOP_PATH, SEMAFORO_PATH)
)

# allow_pickle=True unpickles arbitrary Python objects stored in the file:
# only load .npy files that come from a trusted source.
dt = np.load(DATA, allow_pickle=True)

## Adding other_stop label

Here, we also encode the labels as:
* 0 - bus_stop
* 1 - in_route
* 2 - other_stop
* 3 - traffic signals

In [5]:
# Column order of every trajectory point; positional indexes used further
# down in the notebook (e.g. [3] for 'rota', [5] for 'posicao') follow it.
features = [
    'lat',
    'lng',
    'instante',
    'rota',
    'velocidade',
    'posicao',
    'viaje',
    'matricula_id',
    'lat_uber',
    'lng_uber',
    'label',
]
sentences = Sentences(features)

# NOTE(review): the meaning of 20 and 30 is not visible here — confirm
# against the Labels constructor.
labels = Labels(20, 30, stops, trfl)

# Both return values are discarded, so these calls presumably update `dt`
# in place — TODO confirm.
labels.add_other_stop_label(dt)
sentences.label_encoder(dt)

## Quick analysis

In [None]:
### Number of points per trajectory: mean, std, min and max
average_size_trajectory = [len(trajectory) for trajectory in dt]
print(f'mean:{np.mean(average_size_trajectory)}\t std:{np.std(average_size_trajectory)}\t min:{np.min(average_size_trajectory)}\tmax:{np.max(average_size_trajectory)}')

In [None]:
### Distance per trajectory: mean, std, min and max
# Reads column 5 ('posicao' in the feature list) of each trajectory's last
# point; presumably the accumulated distance — TODO confirm.
average_size_distance = [trajectory[-1][5] for trajectory in dt]
print(f'mean:{np.mean(average_size_distance)}\t std:{np.std(average_size_distance)}\t min:{np.min(average_size_distance)}\tmax:{np.max(average_size_distance)}')

In [None]:
# Count trajectories per route. The route is read from column 3 ('rota' in
# the feature list) of each trajectory's first point.
qtd_routes = {}
for trajectory in dt:
    route = trajectory[0][3]
    qtd_routes[route] = qtd_routes.get(route, 0) + 1

trajectories_per_route = list(qtd_routes.values())

print(f'qnt of distinct routes: {len(qtd_routes)}')
# On ties, max/min return the first route in insertion order.
print(f'routes with grater num of traj: {max(qtd_routes, key=qtd_routes.get)}')
print(f'routes with lesser num of traj: {min(qtd_routes, key=qtd_routes.get)}')
print(f'mean: {np.mean(trajectories_per_route)}')
print(f'std: {np.std(trajectories_per_route)}')

### Adding features

In [None]:
# Runs Sentences.add_features over `dt`. The return value is discarded, so
# the call presumably updates `dt` in place — TODO confirm.
sentences.add_features(dt)

In [15]:
# np.save('dt_features.npy',dt)
# dt = np.load('dt_features.npy', allow_pickle=True)

In [16]:
# Apply Sentences.complete_trajectory to every trajectory with a second
# argument of 96 and rebind `dt` to the resulting list.
# NOTE(review): 96 is presumably the target trajectory length — confirm.
dt = [sentences.complete_trajectory(trajectory, 96) for trajectory in dt]

### Removing false labels

In [None]:
# Removes other_stop labels that are false positives, e.g. in the sequence
# bus_stop > other_stop > bus_stop.
# NOTE(review): the meaning of the 2.0 and 30 arguments is not visible here,
# and the return value is discarded (presumably in-place) — confirm against
# Labels.get_false_labels.
labels.get_false_labels(dt, 2.0, 30)

## Selecting only important features

Here, we also add the trajectory id and the point id.

In [17]:
# The output of Sentences.select_features is bound to a new name,
# `selected_dt`; `dt` itself is not rebound here.
selected_dt = sentences.select_features(dt)

17701it [00:22, 788.34it/s]


### Adding noise id

In [18]:
# The second argument is an empty set.
# NOTE(review): presumably the ids to flag as noise — confirm against
# Sentences.add_id_noise. The return value is discarded.
sentences.add_id_noise(selected_dt, set())

17701it [00:01, 15445.84it/s]


## Converting time to seconds

In [19]:
# NOTE(review): judging by the method name the timestamps end up in seconds —
# confirm the unit against Sentences.get_time_in_seconds.
dt_with_new_time = sentences.get_time_in_seconds(selected_dt)

100%|██████████| 17701/17701 [01:46<00:00, 166.62it/s]


## Putting pad

### Optional: Adding zero padding to the beginning of each trajectory and repeating the last points

Without padding, the statistics function does not take the initial points into account.


This is important when we need to generate embeddings to model STOD, because all trajectory points are needed.

In [20]:
# The data is converted to a NumPy array before being handed to
# Sentences.padding.
# NOTE(review): 16 is presumably the padding size — confirm.
dt_with_padding = sentences.padding(16, np.array(dt_with_new_time))

### Adding statistics features

In [26]:
# Slow step: the recorded run shown below this cell took about 42 minutes
# for 17701 items. The result is kept in `final_data`.
final_data = sentences.put_statistics_metrics_with_padding(np.array(dt_with_padding))

100%|██████████| 17701/17701 [42:30<00:00,  6.94it/s]  


## Saving data

In [None]:
def save_h5py(final_list_0, final_list_1, final_list_2, final_list_3,
              final_list_4, final_list_5, final_list_6,
              path='../data/pac_data/data_to_pac.hdf5'):
    """Write the seven arrays to one HDF5 file, each as a gzip-compressed dataset.

    The file at ``path`` is opened in "w" mode, so an existing file is
    replaced. Positional arguments map to dataset names as follows:

    * ``final_list_0`` -> ``list_window_x_b``
    * ``final_list_1`` -> ``list_window_x_a``
    * ``final_list_2`` -> ``list_x_c_queries``
    * ``final_list_3`` -> ``list_window_x_before_stats``
    * ``final_list_4`` -> ``list_window_x_after_stats``
    * ``final_list_5`` -> ``list_y_queries``
    * ``final_list_6`` -> ``list_window_y``

    Args:
        final_list_0 .. final_list_6: data accepted by
            ``h5py.File.create_dataset(data=...)``.
        path: destination file; defaults to the location that used to be
            hard-coded, so existing callers are unaffected.
    """
    datasets = (
        ("list_window_x_b", final_list_0),
        ("list_window_x_a", final_list_1),
        ("list_x_c_queries", final_list_2),
        ("list_window_x_before_stats", final_list_3),
        ("list_window_x_after_stats", final_list_4),
        ("list_y_queries", final_list_5),
        ("list_window_y", final_list_6),
    )
    # Leaving the `with` block closes the file, so no explicit
    # flush()/close() is needed.
    with h5py.File(path, "w") as hdf5_store:
        for dataset_name, values in datasets:
            hdf5_store.create_dataset(dataset_name, data=values, compression="gzip")

In [None]:
# NOTE(review): this cell used to read from `sentences_to_feed_model`, a name
# that is never assigned in the visible notebook (NameError on a clean
# top-to-bottom run). `final_data`, the output of the statistics step, is
# presumably the intended source — confirm that it holds the seven arrays in
# this order before relying on the saved file.
save_h5py(final_data[0],
          final_data[1],
          final_data[2],
          final_data[3],
          final_data[4],
          final_data[5],
          final_data[6])