In [25]:
from matplotlib import pyplot as plt
import numpy as np
import math

import pandas as pd

DATA_PATH = "./taxi_dataset/training_data.csv"
N_ROWS = 2  # number of trajectories (file lines) to load

# Every line of the file is one trajectory with a variable number of
# comma-separated fields, so the lines are read whole and split afterwards.
# Reading them with plain file I/O replaces read_csv(sep='\n'), which recent
# pandas releases reject with a ValueError ("Specified \n as separator ...").
raw_lines = []
with open(DATA_PATH) as csv_file:
    for line in csv_file:
        line = line.rstrip("\r\n")
        if not line:
            continue  # read_csv skipped blank lines by default; keep doing so
        raw_lines.append(line)
        if len(raw_lines) == N_ROWS:
            break

# Same shape as before: a one-column frame (column label 0) of raw lines ...
df_rows = pd.DataFrame({0: raw_lines})
# ... expanded into one column per comma-separated field (missing fields -> None).
df_raw = df_rows[0].str.split(',', expand=True)
df_raw

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,223,224,225,226,227,228,229,230,231,232
0,1372638303,-8.587116,41.162427,-8.586198,41.162112,-8.585982,41.161338,-8.585037,41.160024,-8.584146,...,-8.587134,41.162436,-8.587125,41.162436,-8.587125,41.162427,-8.587134,41.162427,-8.587143,41.162436
1,1372646650,-8.6103,41.153688,-8.610336,41.153679,-8.610192,41.154039,-8.609985,41.154759,-8.609931,...,,,,,,,,,,


In [26]:
# Keep only the first 21 columns: column 0 (read as the start timestamp by
# Trajectory) followed by 20 coordinate values, i.e. 10 (lon, lat) pairs per row.
df = df_raw.iloc[:,:21]
# Optional column naming, currently disabled:
# df.columns = ["start_time", "lon_1", "lat_1", "lon_2", "lat_2", "lon_3", "lat_3"]
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,11,12,13,14,15,16,17,18,19,20
0,1372638303,-8.587116,41.162427,-8.586198,41.162112,-8.585982,41.161338,-8.585037,41.160024,-8.584146,...,-8.583138,41.160204,-8.582175,41.160789,-8.582022,41.162364,-8.583849,41.163192,-8.586189,41.163273
1,1372646650,-8.6103,41.153688,-8.610336,41.153679,-8.610192,41.154039,-8.609985,41.154759,-8.609931,...,-8.610327,41.153625,-8.610669,41.153688,-8.611974,41.153724,-8.611938,41.155452,-8.611344,41.157162


In [27]:
from operator import attrgetter

class Trajectory:
    """One taxi trip: a start timestamp plus an ordered list of `Point`s.

    A row is laid out as ``[timestamp, lon_1, lat_1, lon_2, lat_2, ...]``.
    Consecutive points are stamped `SAMPLE_INTERVAL` apart (presumably
    seconds, given the Unix-style timestamps -- confirm against the dataset).
    """

    # Time step added to the timestamp for every successive (lon, lat) pair.
    SAMPLE_INTERVAL = 15

    def __init__(self, id, df_row):
        """Build the trajectory from one row of the raw frame.

        Args:
            id: identifier stored on the trajectory (formatted as an int in __str__).
            df_row: 1-D sequence (pandas Series / ndarray) of field values;
                element 0 is the start timestamp, followed by lon/lat pairs.
                Parsing stops at the first pair with a missing coordinate.
        """
        # Work on positional values so the parsing does not depend on the
        # row's index labels.
        values = list(df_row)

        self.id = id
        self.first_timestamp = int(values[0])

        # populate points array; the first pair is always expected to exist
        self.points = [Point(time=self.first_timestamp, lon=values[1], lat=values[2])]

        # Stop at len - 1 so that values[i + 1] always exists: a dangling
        # longitude without a latitude (even-sized row) is ignored instead of
        # raising an out-of-bounds error.
        for i in range(3, len(values) - 1, 2):
            lon, lat = values[i], values[i + 1]
            if self._is_missing(lon) or self._is_missing(lat):
                break

            # i = 3, 5, 7, ... -> offsets of 1, 2, 3, ... intervals
            this_timestamp = self.first_timestamp + (i // 2) * self.SAMPLE_INTERVAL
            self.points.append(Point(time=this_timestamp, lon=lon, lat=lat))

        # find trajectory min and max lat, lon, UNUSED in linear implementation
        lats = [point.lat for point in self.points]
        lons = [point.lon for point in self.points]
        self.min_lat, self.max_lat = min(lats), max(lats)
        self.min_lon, self.max_lon = min(lons), max(lons)

    @staticmethod
    def _is_missing(value):
        """Return True for the placeholders of an absent field: None, '' or NaN."""
        if value is None:
            return True
        if isinstance(value, float):
            return math.isnan(value)
        return value == ''

    def get_points_info(self):
        """Print every point of the trajectory, numbered from 1."""
        for i, point in enumerate(self.points, start=1):
            print(i, point)

    def __str__(self):
        return("\n{:10s}: [ID: {:5d}, Time_First: {:10d}, Points: {}] \n{:10s}  [LAT_range: ({:5f}, {:5f}), LON_range: ({:5f}, {:5f}])".\
                    format("Trajectory", self.id, self.first_timestamp, len(self.points),\
                            "", self.min_lat, self.max_lat, self.min_lon, self.max_lon))

    def __repr__(self):
        # Same text as __str__ so containers of trajectories print readably.
        return str(self)
        
        

class Point:
    """A single timestamped coordinate of a trajectory.

    `is_truth` is True while the coordinates are the original values; it is
    switched to False by `clear_coor` and is not restored by `set_prediction`.
    """

    def __init__(self, time, lon, lat):
        # Bookkeeping fields start at their defaults.
        self.is_truth = True        # True if point coordinate is truth value
        self.prev_pt_time = -1
        self.next_pt_time = -1

        self.timestamp = time
        self.lon = float(lon)
        self.lat = float(lat)

    def clear_coor(self):
        """Blank out the coordinates (set to -1) and mark the point as non-truth."""
        self.lon = -1
        self.lat = -1
        self.is_truth = False

    def set_prediction(self, pred_pt_lst):
        """Store predicted coordinates given as ``[time, lon, lat]``.

        The prediction is applied only when its time equals this point's
        timestamp; otherwise a diagnostic is printed and nothing changes.
        """
        print("setting prediction")
        pred_time, pred_lon, pred_lat = pred_pt_lst

        if not (self.timestamp == pred_time):
            print("ERORR: time mismatch")
            print("actual t :", self.timestamp)
            print("predicted:", pred_time)
            print("\n")
            return

        self.lon = pred_lon
        self.lat = pred_lat

    def __str__(self):
        template = "{:10s}: [Timestamp: {:10d}, Longitude: {:9f}, Latitude: {:9f}, Truth: {}]"
        return template.format("Point", self.timestamp, self.lon, self.lat, self.is_truth)

    def __repr__(self):
        # Delegate so lists/arrays of points show the readable form.
        return self.__str__()

In [28]:
# One Trajectory per row of df, with ids numbered 0, 1, 2, ... in row order.
taxi_trajectories = [
    Trajectory(traj_id, row)
    for traj_id, (_, row) in enumerate(df.iterrows())
]

# for traj in taxi_trajectories:
#     print(traj)
#     traj.get_points_info()
#     print()

In [29]:
# Collect each trajectory's list of Point objects: one inner list per trajectory.
grid_obj = [trajectory.points for trajectory in taxi_trajectories]

print(pd.DataFrame(grid_obj))

                                                   0  \
0  Point     : [Timestamp: 1372638303, Longitude:...   
1  Point     : [Timestamp: 1372646650, Longitude:...   

                                                   1  \
0  Point     : [Timestamp: 1372638318, Longitude:...   
1  Point     : [Timestamp: 1372646665, Longitude:...   

                                                   2  \
0  Point     : [Timestamp: 1372638333, Longitude:...   
1  Point     : [Timestamp: 1372646680, Longitude:...   

                                                   3  \
0  Point     : [Timestamp: 1372638348, Longitude:...   
1  Point     : [Timestamp: 1372646695, Longitude:...   

                                                   4  \
0  Point     : [Timestamp: 1372638363, Longitude:...   
1  Point     : [Timestamp: 1372646710, Longitude:...   

                                                   5  \
0  Point     : [Timestamp: 1372638378, Longitude:...   
1  Point     : [Timestamp: 1372646725, Long

In [30]:
# Convert the list of per-trajectory Point lists into an object-dtype NumPy array.
# NOTE(review): this rebinds grid_obj, so re-running the cell wraps the array again.
grid_obj = np.array(grid_obj, dtype=object)

# create copy of truth values
# The deep copy duplicates every Point, so clearing or predicting coordinates in
# grid_obj afterwards leaves truth_obj untouched. This cell therefore has to run
# before any points are cleared.
import copy
truth_obj = copy.deepcopy(grid_obj)

In [31]:
# creating test data
import random

def random_clear_traj(grid_obj, sampling_rate, use_seed=False):
    """Randomly clear interior points of every trajectory, in place.

    For each trajectory, ``int(sampling_rate * len(traj))`` points are chosen
    among the interior indices (first and last point are never touched) and
    blanked with ``clear_coor()``.  The count is capped at the number of
    interior points, so high rates or very short trajectories clear every
    interior point instead of making ``random.sample`` raise ValueError.

    Args:
        grid_obj: iterable of trajectories, each an indexable sequence of
            objects exposing ``clear_coor()``.
        sampling_rate: fraction of each trajectory's points to clear.
        use_seed: when exactly True, seed ``random`` with 1 for repeatability.
    """
    if use_seed is True:
        random.seed(1)

    for traj in grid_obj:
        # Candidate indices exclude the endpoints; empty for len(traj) < 3.
        interior = range(1, len(traj) - 1)
        num_to_clear = min(int(sampling_rate * len(traj)), len(interior))
        for i in random.sample(interior, num_to_clear):
            traj[i].clear_coor()

def random_clear_total(grid_obj, sampling_rate, use_seed=False):
    """Clear a random share of all interior points across the whole grid, in place.

    The first and last point of every trajectory are excluded from the pool
    (the linear recovery needs them as anchors).  From the pooled interior
    points, ``int(sampling_rate * pool_size)`` are picked and blanked with
    ``clear_coor()``.  With ``use_seed`` exactly True, ``random`` is seeded
    with 1 before sampling.
    """
    # Pool every interior point; endpoints of each trajectory stay out.
    candidates = []
    for traj in grid_obj:
        print("new traj")
        candidates.extend(traj[1:-1])

    num_total_points = len(candidates)
    print("num total:", num_total_points)
    num_test_points = int(sampling_rate * num_total_points)
    print("num test:", num_test_points)

    if use_seed is True:
        random.seed(1)

    chosen = random.sample(range(num_total_points), num_test_points)
    for position in chosen:
        candidates[position].clear_coor()



# Blank out 40% of the pooled interior points of grid_obj (in place); the fixed
# seed makes the selection repeatable.
random_clear_total(grid_obj, 0.4, use_seed=True)

new traj
new traj
num total: 16
num test: 6


In [32]:
# Point* estimate_linear(Point pt_prev, Point pt_next)
def estimate_linear(pt_prev, pt_next, interval=15):
    """Linearly interpolate the points missing between two known points.

    Args:
        pt_prev: earlier known point, exposing ``timestamp``, ``lon``, ``lat``.
        pt_next: later known point with the same attributes.
        interval: time step between consecutive samples (default 15, the step
            used when trajectories are built).

    Returns:
        A list of ``[timestamp, lon, lat]`` lists, one per sample strictly
        between the two points, in time order.  The list is empty when the
        points are at most one interval apart (including identical or
        reversed timestamps), where the previous version could divide by zero.

    Raises:
        ValueError: if ``interval`` is not positive.
    """
    if interval <= 0:
        raise ValueError("interval must be positive")

    # Number of interval-sized steps from pt_prev to pt_next.
    num_steps = (pt_next.timestamp - pt_prev.timestamp) // interval
    if num_steps <= 1:
        # Nothing lies between the two points.
        return []

    lon_interval = (pt_next.lon - pt_prev.lon) / num_steps
    lat_interval = (pt_next.lat - pt_prev.lat) / num_steps

    pt_between_arr = []
    this_t = pt_prev.timestamp
    this_lon = pt_prev.lon
    this_lat = pt_prev.lat

    # Walk forward one step at a time (same accumulation order as before, so
    # results are numerically identical for valid inputs).
    for _ in range(num_steps - 1):
        this_t += interval
        this_lon = this_lon + lon_interval
        this_lat = this_lat + lat_interval
        pt_between_arr.append([this_t, this_lon, this_lat])

    return pt_between_arr

In [39]:
# recovering trajectory

# Walk every trajectory once. Whenever a run of cleared (non-truth) points is
# closed by a truth point, interpolate between the truth points on either side
# of the run and write the predictions into the cleared points.
# NOTE(review): a run that starts at the first point or reaches the end of a
# trajectory is not handled here; the clearing helpers never clear endpoints.
for traj in grid_obj:
    last_truth_pt = None            # most recent truth point seen
    next_truth_pt = None            # truth point that closes the current gap
    to_predict = False              # True when current pt is not truth value

    for index, curr_pt in enumerate(traj):
        print("new pt", curr_pt)

        if to_predict is False:
            if curr_pt.is_truth is True:
                # all good, truth
                last_truth_pt = curr_pt
                continue

            # encountered point with non-truth value: a gap starts here
            to_predict = True
            print("set predict True")
            num_missing_data = 1
            continue

        print("in else")
        if curr_pt.is_truth is False:
            # still inside the streak of non-truth points
            num_missing_data += 1
            print("inc nmd:", num_missing_data)
            continue

        # found truth point: the gap is closed, fill it
        to_predict = False
        next_truth_pt = curr_pt

        predicted_pts = estimate_linear(last_truth_pt, next_truth_pt)
        for predicted in predicted_pts:
            # index - num_missing_data walks from the first cleared point of
            # the gap up to the one just before curr_pt.
            traj[index - num_missing_data].set_prediction(predicted)
            num_missing_data -= 1

        last_truth_pt = curr_pt

new pt Point     : [Timestamp: 1372638303, Longitude: -8.587116, Latitude: 41.162427, Truth: True]
new pt Point     : [Timestamp: 1372638318, Longitude: -8.586198, Latitude: 41.162112, Truth: True]
new pt Point     : [Timestamp: 1372638333, Longitude: -1.000000, Latitude: -1.000000, Truth: False]
set predict True
new pt Point     : [Timestamp: 1372638348, Longitude: -8.585037, Latitude: 41.160024, Truth: True]
in else
new pt Point     : [Timestamp: 1372638363, Longitude: -8.584146, Latitude: 41.159358, Truth: True]
new pt Point     : [Timestamp: 1372638378, Longitude: -1.000000, Latitude: -1.000000, Truth: False]
set predict True
new pt Point     : [Timestamp: 1372638393, Longitude: -8.582175, Latitude: 41.160789, Truth: True]
in else
new pt Point     : [Timestamp: 1372638408, Longitude: -8.582022, Latitude: 41.162364, Truth: True]
new pt Point     : [Timestamp: 1372638423, Longitude: -8.583849, Latitude: 41.163192, Truth: True]
new pt Point     : [Timestamp: 1372638438, Longitude: -8.

In [34]:
# print(grid_obj)
# Show the first 10 points of the first trajectory in the working grid
# (cleared points appear with -1 coordinates and Truth: False).
for i in range(10):
    print(grid_obj[0][i])

Point     : [Timestamp: 1372638303, Longitude: -8.587116, Latitude: 41.162427, Truth: True]
Point     : [Timestamp: 1372638318, Longitude: -8.586198, Latitude: 41.162112, Truth: True]
Point     : [Timestamp: 1372638333, Longitude: -1.000000, Latitude: -1.000000, Truth: False]
Point     : [Timestamp: 1372638348, Longitude: -8.585037, Latitude: 41.160024, Truth: True]
Point     : [Timestamp: 1372638363, Longitude: -8.584146, Latitude: 41.159358, Truth: True]
Point     : [Timestamp: 1372638378, Longitude: -1.000000, Latitude: -1.000000, Truth: False]
Point     : [Timestamp: 1372638393, Longitude: -8.582175, Latitude: 41.160789, Truth: True]
Point     : [Timestamp: 1372638408, Longitude: -8.582022, Latitude: 41.162364, Truth: True]
Point     : [Timestamp: 1372638423, Longitude: -8.583849, Latitude: 41.163192, Truth: True]
Point     : [Timestamp: 1372638438, Longitude: -8.586189, Latitude: 41.163273, Truth: True]


In [35]:
# print(truth_obj)
# Show the first 10 points of the first trajectory in the untouched truth copy,
# for side-by-side comparison with grid_obj.
for i in range(10):
    print(truth_obj[0][i])

Point     : [Timestamp: 1372638303, Longitude: -8.587116, Latitude: 41.162427, Truth: True]
Point     : [Timestamp: 1372638318, Longitude: -8.586198, Latitude: 41.162112, Truth: True]
Point     : [Timestamp: 1372638333, Longitude: -8.585982, Latitude: 41.161338, Truth: True]
Point     : [Timestamp: 1372638348, Longitude: -8.585037, Latitude: 41.160024, Truth: True]
Point     : [Timestamp: 1372638363, Longitude: -8.584146, Latitude: 41.159358, Truth: True]
Point     : [Timestamp: 1372638378, Longitude: -8.583138, Latitude: 41.160204, Truth: True]
Point     : [Timestamp: 1372638393, Longitude: -8.582175, Latitude: 41.160789, Truth: True]
Point     : [Timestamp: 1372638408, Longitude: -8.582022, Latitude: 41.162364, Truth: True]
Point     : [Timestamp: 1372638423, Longitude: -8.583849, Latitude: 41.163192, Truth: True]
Point     : [Timestamp: 1372638438, Longitude: -8.586189, Latitude: 41.163273, Truth: True]


In [22]:
def cal_dis(lat_1,lon_1,lat_2,lon_2):
    """Haversine distance between two (lat, lon) positions given in degrees.

    The result is scaled by a sphere radius of 6378.137 (Earth's equatorial
    radius in kilometres, so the distance is presumably in km).
    """
    # Degrees -> radians for all four inputs.
    phi_1 = lat_1 * math.pi / 180
    phi_2 = lat_2 * math.pi / 180
    lam_1 = lon_1 * math.pi / 180
    lam_2 = lon_2 * math.pi / 180

    # Sines of the half angular differences.
    sin_half_dlat = np.sin(abs(phi_1 - phi_2) / 2)
    sin_half_dlon = np.sin(abs(lam_1 - lam_2) / 2)

    haversine = (sin_half_dlat * sin_half_dlat
                 + np.cos(phi_1) * np.cos(phi_2) * sin_half_dlon * sin_half_dlon)
    return 2 * 6378.137 * np.arcsin(np.sqrt(haversine))

def calc_mean_abs_error(data1, data2):
    """Mean distance between corresponding points that were predicted.

    A point pair counts as "predicted" when the two points disagree on
    ``is_truth`` (one dataset holds the truth value, the other a recovered
    one).  For those pairs the ``cal_dis`` distance is accumulated and
    averaged.

    Args:
        data1, data2: equally shaped sequences of trajectories, each a
            sequence of points exposing ``is_truth``, ``lat`` and ``lon``.

    Returns:
        The mean of the distances, ``0.0`` when no pair was predicted
        (previously a ZeroDivisionError), or ``-1`` when the datasets or any
        pair of trajectories differ in length.
    """
    if len(data1) != len(data2):
        return -1

    total_error = 0
    num_predicted = 0

    for traj1, traj2 in zip(data1, data2):
        if len(traj1) != len(traj2):
            return -1

        for pt1, pt2 in zip(traj1, traj2):
            if pt1.is_truth != pt2.is_truth:
                total_error += cal_dis(pt1.lat, pt1.lon, pt2.lat, pt2.lon)
                num_predicted += 1

    print("num_predicted:", num_predicted)
    if num_predicted == 0:
        # Nothing was predicted, so there is no error to average.
        return 0.0
    return total_error / num_predicted

# Score the recovered grid against the untouched truth copy; only points whose
# is_truth flag differs between the two (i.e. the cleared ones) contribute.
calc_mean_abs_error(grid_obj, truth_obj)

num_predicted: 6


0.03868550888294009