In [1]:
from matplotlib import pyplot as plt
import numpy as np
import math

import pandas as pd

# Read the first few raw lines of the file. Each line is one trajectory with a
# variable number of comma-separated fields, so the file is not rectangular
# and every line is split manually below.
# NOTE: recent pandas releases raise ValueError for read_csv(..., sep='\n')
# (a line terminator is not accepted as a separator), hence the plain read.
with open("./taxi_dataset/training_data_simple.csv") as csv_file:
    non_blank_lines = (line.rstrip("\r\n") for line in csv_file if line.strip())
    # zip() stops after 5 lines, mirroring the former nrows=5.
    first_lines = [line for _, line in zip(range(5), non_blank_lines)]

# Single-column frame (column 0) holding one raw text line per row.
df_rows = pd.DataFrame({0: first_lines})
# expand=True pads shorter rows with missing values so the frame is rectangular.
df = df_rows[0].str.split(',', expand=True)
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,663,664,665,666,667,668,669,670,671,672
0,1372638303,-8.587116,41.162427,-8.586198,41.162112,-8.585982,41.161338,-8.585037,41.160024,-8.584146,...,,,,,,,,,,
1,1372646650,-8.6103,41.153688,-8.610336,41.153679,-8.610192,41.154039,-8.609985,41.154759,-8.609931,...,,,,,,,,,,
2,1372645583,-8.591301,41.156163,-8.592651,41.156163,-8.594415,41.156298,-8.594451,41.156316,-8.59446,...,,,,,,,,,,
3,1372651427,-8.610291,41.153625,-8.61093,41.153643,-8.611965,41.153859,-8.611938,41.154651,-8.611884,...,,,,,,,,,,
4,1372652856,-8.61372,41.148414,-8.614287,41.148414,-8.614431,41.148387,-8.614845,41.147451,-8.614872,...,,,,,,,,,,


In [2]:
# Keep the start time plus the first three (lon, lat) samples of each row.
df = df.iloc[:7, :7]
# Labels: "start_time", then lon_k / lat_k for samples k = 1..3.
df.columns = ["start_time"] + [
    "{}_{}".format(axis, sample) for sample in (1, 2, 3) for axis in ("lon", "lat")
]
df

Unnamed: 0,start_time,lon_1,lat_1,lon_2,lat_2,lon_3,lat_3
0,1372638303,-8.587116,41.162427,-8.586198,41.162112,-8.585982,41.161338
1,1372646650,-8.6103,41.153688,-8.610336,41.153679,-8.610192,41.154039
2,1372645583,-8.591301,41.156163,-8.592651,41.156163,-8.594415,41.156298
3,1372651427,-8.610291,41.153625,-8.61093,41.153643,-8.611965,41.153859
4,1372652856,-8.61372,41.148414,-8.614287,41.148414,-8.614431,41.148387


In [16]:
from operator import attrgetter

class Trajectory:
    """A taxi trajectory: a start timestamp followed by (lon, lat) samples.

    Built from one table row laid out as
    ``[start_time, lon_1, lat_1, lon_2, lat_2, ...]``. Sample ``k`` (0-based)
    is given the timestamp ``start_time + k * SAMPLE_INTERVAL``.
    """

    SAMPLE_INTERVAL = 15  # seconds added per sample when assigning timestamps

    def __init__(self, id, df_row):
        """Build the point list and the lat/lon bounding box from ``df_row``.

        Args:
            id: Identifier stored on the trajectory (``__str__`` formats it as
                an integer).
            df_row: Iterable row (e.g. a pandas Series or a list) holding the
                start timestamp followed by alternating longitude/latitude
                values. Reading stops at the first missing coordinate pair
                (None, NaN or blank string), i.e. at trailing padding.
        """
        self.id = id
        # Materialise the row once so values are always read by position;
        # integer keys on a label-indexed Series are not reliably positional.
        values = list(df_row)
        self.first_timestamp = int(values[0])

        # populate points array
        self.points = []
        # Stop one short of the end so an unpaired trailing value is ignored.
        for sample_no, lon_idx in enumerate(range(1, len(values) - 1, 2)):
            lon, lat = values[lon_idx], values[lon_idx + 1]
            if self._is_missing(lon) or self._is_missing(lat):
                break
            this_timestamp = self.first_timestamp + sample_no * self.SAMPLE_INTERVAL
            self.points.append(Point(time=this_timestamp, lon=lon, lat=lat))

        # find trajectory min and max lat, lon (NaN when there are no points)
        nan = float("nan")
        lats = [pt.lat for pt in self.points]
        lons = [pt.lon for pt in self.points]
        self.min_lat = min(lats, default=nan)
        self.max_lat = max(lats, default=nan)
        self.min_lon = min(lons, default=nan)
        self.max_lon = max(lons, default=nan)

    @staticmethod
    def _is_missing(value):
        """Return True for padding values: None, NaN-like or a blank string."""
        if value is None:
            return True
        if isinstance(value, str):
            return not value.strip()
        try:
            # NaN is the only float that compares unequal to itself.
            return bool(value != value)
        except (TypeError, ValueError):
            # Values whose comparison has no truth value are treated as missing.
            return True

    def get_points_info(self):
        """Print every point on its own line, prefixed by its 1-based position."""
        for position, point in enumerate(self.points, start=1):
            print(position, point)

    def __str__(self):
        """Two-line summary: id, start time and point count, then lat/lon ranges."""
        return ("\n{:10s}: [ID: {:5d}, Time_First: {:10d}, Points: {}] "
                "\n{:10s}  [LAT_range: ({:5f}, {:5f}), LON_range: ({:5f}, {:5f})]"
                .format("Trajectory", self.id, self.first_timestamp, len(self.points),
                        "", self.min_lat, self.max_lat, self.min_lon, self.max_lon))

    def __repr__(self):
        """Same text as ``__str__`` so containers print readable summaries."""
        return str(self)
        
        

class Point:
    """A single timestamped longitude/latitude sample of a trajectory."""

    def __init__(self, time, lon, lat):
        """Store the sample; ``lon`` and ``lat`` are converted with ``float()``."""
        self.timestamp = time
        self.lon = float(lon)
        self.lat = float(lat)

        self.is_truth = True        # True if point coordinate is truth value
        self.prev_pt_time = -1      # initialised to -1; not updated by this class
        self.next_pt_time = -1      # initialised to -1; not updated by this class

    def clear_coor(self):
        """Blank the coordinates (sentinel -1) and mark the point as non-truth."""
        self.is_truth = False
        # Float sentinel so lon/lat keep a single type throughout.
        self.lon = -1.0
        self.lat = -1.0

    def set_prediction(self, pred_pt_lst):
        """Overwrite the coordinates with a predicted ``[time, lon, lat]`` triple.

        The prediction is applied only when its time equals this point's
        timestamp; otherwise an error line is printed and the point is left
        unchanged. ``is_truth`` is not modified either way.
        """
        pred_time, pred_lon, pred_lat = pred_pt_lst
        if self.timestamp == pred_time:
            self.lon = float(pred_lon)
            self.lat = float(pred_lat)
        else:
            print("ERROR: time mismatch")

    def __str__(self):
        """One-line description with timestamp, coordinates and truth flag."""
        return ("{:10s}: [Timestamp: {:10d}, Longitude: {:9f}, Latitude: {:9f}, Truth: {}]"
                .format("Point", self.timestamp, self.lon, self.lat, self.is_truth))

    def __repr__(self):
        """Same text as ``__str__`` so containers print readable points."""
        return str(self)

In [17]:
# Build one Trajectory per table row; ids are the 0-based row positions.
taxi_trajectories = []
for counter, (index, row) in enumerate(df.iterrows()):
    taxi_trajectories.append(Trajectory(counter, row))
counter = len(taxi_trajectories)    # total number of trajectories built

# Show each trajectory summary, then its points, then a separating blank line.
for traj in taxi_trajectories:
    print(traj)
    traj.get_points_info()
    print()


Trajectory: [ID:     0, Time_First: 1372638303, Points: 3] 
            [LAT_range: (41.161338, 41.162427), LON_range: (-8.587116, -8.585982])
1 Point     : [Timestamp: 1372638303, Longitude: -8.587116, Latitude: 41.162427, Truth: True]
2 Point     : [Timestamp: 1372638318, Longitude: -8.586198, Latitude: 41.162112, Truth: True]
3 Point     : [Timestamp: 1372638333, Longitude: -8.585982, Latitude: 41.161338, Truth: True]


Trajectory: [ID:     1, Time_First: 1372646650, Points: 3] 
            [LAT_range: (41.153679, 41.154039), LON_range: (-8.610336, -8.610192])
1 Point     : [Timestamp: 1372646650, Longitude: -8.610300, Latitude: 41.153688, Truth: True]
2 Point     : [Timestamp: 1372646665, Longitude: -8.610336, Latitude: 41.153679, Truth: True]
3 Point     : [Timestamp: 1372646680, Longitude: -8.610192, Latitude: 41.154039, Truth: True]


Trajectory: [ID:     2, Time_First: 1372645583, Points: 3] 
            [LAT_range: (41.156163, 41.156298), LON_range: (-8.594415, -8.591301])
1 

In [5]:
# One entry per trajectory, each referencing that trajectory's own point list.
grid_obj = [traj.points for traj in taxi_trajectories]

print(pd.DataFrame(grid_obj))

                                                   0  \
0  Point     : [Timestamp: 1372638303, Longitude:...   
1  Point     : [Timestamp: 1372646650, Longitude:...   
2  Point     : [Timestamp: 1372645583, Longitude:...   
3  Point     : [Timestamp: 1372651427, Longitude:...   
4  Point     : [Timestamp: 1372652856, Longitude:...   

                                                   1  \
0  Point     : [Timestamp: 1372638318, Longitude:...   
1  Point     : [Timestamp: 1372646665, Longitude:...   
2  Point     : [Timestamp: 1372645598, Longitude:...   
3  Point     : [Timestamp: 1372651442, Longitude:...   
4  Point     : [Timestamp: 1372652871, Longitude:...   

                                                   2  
0  Point     : [Timestamp: 1372638333, Longitude:...  
1  Point     : [Timestamp: 1372646680, Longitude:...  
2  Point     : [Timestamp: 1372645613, Longitude:...  
3  Point     : [Timestamp: 1372651457, Longitude:...  
4  Point     : [Timestamp: 1372652886, Longitude:..

In [6]:
# dtype=object stores the Point instances as-is and also accepts trajectories
# of different lengths (recent NumPy versions refuse to build a ragged array
# unless dtype=object is requested explicitly).
grid_obj = np.array(grid_obj, dtype=object)

# create copy of truth values
# NOTE: the snapshot must be taken before any point is blanked below. The
# array holds references to the trajectories' own Point objects, so running
# this cell a second time would snapshot already-blanked points; rebuild the
# trajectories first when re-running.
import copy
truth_obj = copy.deepcopy(grid_obj)

# creating test data: blank the second point of every trajectory
for traj in grid_obj:
    # A blanked point can only be recovered when truth points exist on both
    # sides of it, so trajectories with fewer than three points are left intact.
    if len(traj) >= 3:
        traj[1].clear_coor()

pd.options.display.max_rows = 4000
pd.options.display.max_seq_items = 4000

print(grid_obj)

[[Point     : [Timestamp: 1372638303, Longitude: -8.587116, Latitude: 41.162427, Truth: True]
  Point     : [Timestamp: 1372638318, Longitude: -1.000000, Latitude: -1.000000, Truth: False]
  Point     : [Timestamp: 1372638333, Longitude: -8.585982, Latitude: 41.161338, Truth: True]]
 [Point     : [Timestamp: 1372646650, Longitude: -8.610300, Latitude: 41.153688, Truth: True]
  Point     : [Timestamp: 1372646665, Longitude: -1.000000, Latitude: -1.000000, Truth: False]
  Point     : [Timestamp: 1372646680, Longitude: -8.610192, Latitude: 41.154039, Truth: True]]
 [Point     : [Timestamp: 1372645583, Longitude: -8.591301, Latitude: 41.156163, Truth: True]
  Point     : [Timestamp: 1372645598, Longitude: -1.000000, Latitude: -1.000000, Truth: False]
  Point     : [Timestamp: 1372645613, Longitude: -8.594415, Latitude: 41.156298, Truth: True]]
 [Point     : [Timestamp: 1372651427, Longitude: -8.610291, Latitude: 41.153625, Truth: True]
  Point     : [Timestamp: 1372651442, Longitude: -1.00

In [7]:
# Point* estimate_linear(Point pt_prev, Point pt_next)
def estimate_linear(pt_prev, pt_next, interval=15, verbose=True):
    """Linearly interpolate the samples missing between two known points.

    Args:
        pt_prev: Last known point before the gap; must expose ``timestamp``,
            ``lon`` and ``lat``.
        pt_next: First known point after the gap; same attributes.
        interval: Seconds between consecutive samples (default 15).
        verbose: When True (default) print the gap size and the predictions.

    Returns:
        A list of ``[timestamp, lon, lat]`` lists, one per missing sample, in
        chronological order. Empty when no whole sample fits between the two
        points (including identical or out-of-order timestamps).
    """
    num_pt_between = (pt_next.timestamp - pt_prev.timestamp) // interval - 1
    if verbose:
        print("num_pts:", num_pt_between)

    pt_between_arr = []
    # Guarding on a positive count also avoids dividing by zero when the two
    # timestamps are less than one interval apart.
    if num_pt_between > 0:
        steps = num_pt_between + 1
        lon_interval = (pt_next.lon - pt_prev.lon) / steps
        lat_interval = (pt_next.lat - pt_prev.lat) / steps
        for k in range(1, steps):
            # Offset from the anchor rather than accumulating, so rounding
            # error does not grow along long gaps.
            pt_between_arr.append([
                pt_prev.timestamp + k * interval,
                pt_prev.lon + k * lon_interval,
                pt_prev.lat + k * lat_interval,
            ])

    if verbose:
        print(pt_between_arr, '\n')
    return pt_between_arr

In [8]:
# recovering trajectory

num_missing_data = 0    # number of non-truth points met while scanning

for traj in grid_obj:
    last_truth_pt = None    # most recent truth point seen in this trajectory
    gap_start = None        # index of the first point of the open gap, if any

    for idx, curr_pt in enumerate(traj):
        if not curr_pt.is_truth:
            # inside a streak of non-truth points
            num_missing_data += 1
            if gap_start is None:
                gap_start = idx
            continue

        # Truth point: close the open gap, if any. A gap at the very start of
        # a trajectory has no left anchor and is left unpredicted.
        if gap_start is not None and last_truth_pt is not None:
            predicted_pts = estimate_linear(last_truth_pt, curr_pt)
            # zip() confines the writes to the gap itself even if the number
            # of predictions differs from the number of blanked points.
            for gap_idx, pred in zip(range(gap_start, idx), predicted_pts):
                traj[gap_idx].set_prediction(pred)

        # Advance the left anchor so a later gap in the same trajectory is
        # interpolated from this point, not from the first truth point.
        gap_start = None
        last_truth_pt = curr_pt

print(grid_obj)

ltt: 1372638303
ntt: 1372638333
spp: 1
num_pts: 1
[[1372638318, -8.586549, 41.161882500000004]] 

ltt: 1372646650
ntt: 1372646680
spp: 1
num_pts: 1
[[1372646665, -8.610246, 41.1538635]] 

ltt: 1372645583
ntt: 1372645613
spp: 1
num_pts: 1
[[1372645598, -8.592858, 41.1562305]] 

ltt: 1372651427
ntt: 1372651457
spp: 1
num_pts: 1
[[1372651442, -8.611128, 41.153741999999994]] 

ltt: 1372652856
ntt: 1372652886
spp: 1
num_pts: 1
[[1372652871, -8.6140755, 41.1484005]] 

[[Point     : [Timestamp: 1372638303, Longitude: -8.587116, Latitude: 41.162427, Truth: True]
  Point     : [Timestamp: 1372638318, Longitude: -8.586549, Latitude: 41.161883, Truth: False]
  Point     : [Timestamp: 1372638333, Longitude: -8.585982, Latitude: 41.161338, Truth: True]]
 [Point     : [Timestamp: 1372646650, Longitude: -8.610300, Latitude: 41.153688, Truth: True]
  Point     : [Timestamp: 1372646665, Longitude: -8.610246, Latitude: 41.153863, Truth: False]
  Point     : [Timestamp: 1372646680, Longitude: -8.610192, 

In [9]:
# Deep copy taken before the test points were blanked; serves as the reference data.
print(truth_obj)

[[Point     : [Timestamp: 1372638303, Longitude: -8.587116, Latitude: 41.162427, Truth: True]
  Point     : [Timestamp: 1372638318, Longitude: -8.586198, Latitude: 41.162112, Truth: True]
  Point     : [Timestamp: 1372638333, Longitude: -8.585982, Latitude: 41.161338, Truth: True]]
 [Point     : [Timestamp: 1372646650, Longitude: -8.610300, Latitude: 41.153688, Truth: True]
  Point     : [Timestamp: 1372646665, Longitude: -8.610336, Latitude: 41.153679, Truth: True]
  Point     : [Timestamp: 1372646680, Longitude: -8.610192, Latitude: 41.154039, Truth: True]]
 [Point     : [Timestamp: 1372645583, Longitude: -8.591301, Latitude: 41.156163, Truth: True]
  Point     : [Timestamp: 1372645598, Longitude: -8.592651, Latitude: 41.156163, Truth: True]
  Point     : [Timestamp: 1372645613, Longitude: -8.594415, Latitude: 41.156298, Truth: True]]
 [Point     : [Timestamp: 1372651427, Longitude: -8.610291, Latitude: 41.153625, Truth: True]
  Point     : [Timestamp: 1372651442, Longitude: -8.61093

In [10]:
def cal_dis(lat_1, lon_1, lat_2, lon_2, radius=6378.137):
    """Great-circle (haversine) distance between two lat/lon positions.

    Args:
        lat_1, lon_1: First position, in degrees.
        lat_2, lon_2: Second position, in degrees.
        radius: Sphere radius; the result is expressed in the same unit.
            Defaults to 6378.137 (presumably Earth's equatorial radius in km).

    Returns:
        The distance along the sphere surface, as a NumPy float.
    """
    # degrees -> radians
    lon_1 = lon_1 * math.pi / 180
    lat_1 = lat_1 * math.pi / 180
    lon_2 = lon_2 * math.pi / 180
    lat_2 = lat_2 * math.pi / 180

    sin_half_dlat = np.sin(abs(lat_1 - lat_2) / 2)
    sin_half_dlon = np.sin(abs(lon_1 - lon_2) / 2)
    h = (sin_half_dlat * sin_half_dlat
         + np.cos(lat_1) * np.cos(lat_2) * sin_half_dlon * sin_half_dlon)
    # Rounding can push h marginally outside [0, 1] (e.g. near-antipodal
    # points), which would make sqrt/arcsin return NaN; clamp it first.
    h = np.clip(h, 0.0, 1.0)
    return 2 * radius * np.arcsin(np.sqrt(h))

def calc_mean_abs_error(data1, data2, denom=None):
    """Mean distance error between two equally shaped trajectory sets.

    The ``cal_dis`` distances between corresponding points are summed over all
    trajectories and divided by ``denom``.

    Args:
        data1, data2: Sequences of trajectories; each trajectory is a sequence
            of points exposing ``lat`` and ``lon``.
        denom: Number of predicted points to average over. When omitted it is
            derived from the data at call time: the number of positions where
            either point has ``is_truth`` set to False. (A definition-time
            default would silently freeze whatever a global counter held when
            this cell was run.)

    Returns:
        The mean error, or -1 when the two datasets (or any pair of
        corresponding trajectories) differ in length.

    Raises:
        ZeroDivisionError: If ``denom`` is, or resolves to, zero.
    """
    if len(data1) != len(data2):
        return -1

    total_error = 0
    non_truth_count = 0
    for traj1, traj2 in zip(data1, data2):
        if len(traj1) != len(traj2):
            return -1

        for pt1, pt2 in zip(traj1, traj2):
            total_error += cal_dis(pt1.lat, pt1.lon, pt2.lat, pt2.lon)
            # Points without an is_truth attribute count as truth values.
            if not (getattr(pt1, "is_truth", True) and getattr(pt2, "is_truth", True)):
                non_truth_count += 1

    if denom is None:
        denom = non_truth_count
    if denom == 0:
        raise ZeroDivisionError("no predicted points to average the error over")

    return total_error / denom
    

# Mean error (in cal_dis distance units) of the recovered points against the untouched truth copy.
calc_mean_abs_error(grid_obj, truth_obj)

0.02349254551101152