In [1]:
from matplotlib import pyplot as plt
import numpy as np
import math

import pandas as pd

df_rows = pd.read_csv("./taxi_dataset/training_data_simple.csv", sep='\n', header=None, nrows=2)
df_raw = df_rows[0].str.split(',', expand=True)
df_raw

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,663,664,665,666,667,668,669,670,671,672
0,1372638303,-8.587116,41.162427,-8.586198,41.162112,-8.585982,41.161338,-8.585037,41.160024,-8.584146,...,,,,,,,,,,


In [45]:
df = df_raw.iloc[:7,:27]
# df.columns = ["start_time", "lon_1", "lat_1", "lon_2", "lat_2", "lon_3", "lat_3"]
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,41,42,43,44,45,46,47,48,49,50
0,1372638303,-8.587116,41.162427,-8.586198,41.162112,-8.585982,41.161338,-8.585037,41.160024,-8.584146,...,-8.586702,41.169411,-8.586729,41.169402,-8.586702,41.169339,-8.586702,41.169294,-8.586756,41.169303


In [53]:
def cal_dis(lat_1,lon_1,lat_2,lon_2):
    lon_1 = lon_1 * math.pi / 180
    lat_1 = lat_1 * math.pi / 180
    lon_2 = lon_2 * math.pi / 180
    lat_2 = lat_2 * math.pi / 180
    a = abs(lat_1 - lat_2)
    b = abs(lon_1 - lon_2)
    d = 2 * 6378.137 * np.arcsin(
        np.sqrt(np.sin(a / 2) * np.sin(a / 2) + np.cos(lat_1) * np.cos(lat_2) * np.sin(b / 2) * np.sin(b / 2)))
    return d

In [54]:
from operator import attrgetter

class Trajectory:
    def __init__(self, id, df_row):
        self.id = id
        self.first_timestamp = int(df_row[0])
        
        # populate points array
        self.points = []
        self.points.append(Point(time=int(self.first_timestamp), lon=df_row[1], lat=df_row[2]))
        for i in range(3, df_row.size, 2):
            if df_row[i] is None or df_row[i]=='':
                break
            
            this_timestamp = int(self.first_timestamp) + (i//2)*15
            self.points.append(Point(time=this_timestamp, lon=df_row[i], lat=df_row[i+1]))

        # find trajectory min and max lat, lon
        self.min_lat = (min(self.points,key=attrgetter('lat')).lat)
        self.max_lat = (max(self.points,key=attrgetter('lat')).lat)
        self.min_lon = (min(self.points,key=attrgetter('lon')).lon)
        self.max_lon = (max(self.points,key=attrgetter('lon')).lon)

        self.radius_of_gyration = -1        # default value -1 since equation is sqrt, making -1 impossible
        self.entropy = -1                   # default value -1 since equation never returns -1

    def get_points_info(self):
        i = 1
        for point in self.points:
            print(i, point)
            i+=1

    def calc_radius_of_gyration(self):
        point_center_lat = np.mean([point.lat for point in self.points])
        point_center_lon = np.mean([point.lon for point in self.points])

        temp_sum_rog = 0
        for point in self.points:
            temp_sum_rog += cal_dis(lat_1=point.lat, lon_1=point.lon, lat_2=point_center_lat, lon_2=point_center_lon)

        m = len(self.points)
        self.radius_of_gyration = math.sqrt(temp_sum_rog / m)
        
    def calc_entropy(self):
        lat_midpoint = self.min_lat + (self.max_lat - self.min_lat)/2
        lon_midpoint = self.min_lon + (self.max_lon - self.min_lon)/2

        length = cal_dis(lat_1=self.min_lat, lon_1=lon_midpoint, lat_2=self.max_lat, lon_2=lon_midpoint)
        width  = cal_dis(lat_1=lat_midpoint, lon_1=self.min_lon, lat_2=lat_midpoint, lon_2=self.max_lon)
        # print(length, width)

        UNIT_CELL_SIZE = 0.1        # for 100 x 100 metres cell size, can try 1km x 1km too
        length_size = math.ceil(length / UNIT_CELL_SIZE) + 1
        width_size  = math.ceil(width  / UNIT_CELL_SIZE) + 1
        # print("dim: {}x{}".format(length_size,width_size))

        count_grid = [ [0]*width_size for i in range(length_size)]
        for point in self.points:
            this_lat = point.lat
            this_lon = point.lon
            # print(point)

            lat_mid = this_lat + (this_lat - self.min_lat)/2
            lon_mid = this_lon + (this_lon - self.min_lon)/2

            lat_dis = cal_dis(lat_1=self.min_lat, lon_1=lon_mid, lat_2=this_lat, lon_2=lon_mid)
            lon_dis = cal_dis(lat_1=lat_mid, lon_1=self.min_lon, lat_2=lat_mid, lon_2=this_lon)

            x = lat_dis / UNIT_CELL_SIZE
            y = lon_dis / UNIT_CELL_SIZE
            x, y = round(x), round(y)
            # print("x:({}), y:({})".format(x,y))
            count_grid[x][y] += 1
        
        # print(pd.DataFrame(count_grid))

        m = len(self.points)
        temp_ent_sum = 0
        for x in range(len(count_grid)):
            for y in range(len(count_grid[x])):
                cell_count = count_grid[x][y]
                if cell_count == 0:
                    # temp_ent_sum += 0
                    continue
                else:
                    percent_i = cell_count / m                              # p(i)
                    temp_ent_sum += (percent_i) * math.log2(percent_i)      # summation
        
        self.entropy = -1 * temp_ent_sum
        

    def __str__(self):
        return("\n{:10s}: [ID: {:5d}, Time_First: {:10d}, Points: {}] \n{:10s}  [LAT_range: ({:5f}, {:5f}), LON_range: ({:5f}, {:5f}])".\
                    format("Trajectory", self.id, self.first_timestamp, len(self.points),\
                            "", self.min_lat, self.max_lat, self.min_lon, self.max_lon))
            
    def __repr__(self):
        # print("\nin __repr__, calling __str__")
        return str(self)


        
        

class Point:
    def __init__(self, time, lon, lat):
        self.timestamp = time
        self.lon = float(lon)
        self.lat = float(lat)
        
        self.is_truth = True        # True if point coordinate is truth value
        self.prev_pt_time = -1
        self.next_pt_time = -1

    def set_truth_false(self):
        self.is_truth = False

    def set_prediction(self, pred_pt_lst):
        pred_time, pred_lon, pred_lat = pred_pt_lst
        if self.timestamp == pred_time:
            self.lon = pred_lon
            self.lat = pred_lat
        else:
            print("ERORR: time mismatch")
            print("actual t :", self.timestamp)
            print("predicted:", pred_time)
            print("\n")

    def __str__(self):
        return ("{:10s}: [Timestamp: {:10d}, Longitude: {:9f}, Latitude: {:9f}, Truth: {}]"\
            .format("Point", self.timestamp, self.lon, self.lat, self.is_truth))

    def __repr__(self):
        # print("\nin __repr__, calling __str__")
        return str(self)

In [55]:
taxi_trajectories = []

counter = 0
for index, row in df.iterrows():
    taxi_trajectories.append(Trajectory(counter, row))
    counter += 1

taxi_trajectories[0].calc_radius_of_gyration()
print("ROG:", taxi_trajectories[0].radius_of_gyration)

taxi_trajectories[0].calc_entropy()
print("Ent:", taxi_trajectories[0].entropy)

ROG: 0.6383871750173397
Ent: 3.232487689168953


In [56]:
global_min_lat = (min(taxi_trajectories, key=attrgetter('min_lat')).min_lat)
global_max_lat = (max(taxi_trajectories, key=attrgetter('max_lat')).max_lat)
global_min_lon = (min(taxi_trajectories, key=attrgetter('min_lon')).min_lon)
global_max_lon = (max(taxi_trajectories, key=attrgetter('max_lon')).max_lon)

print("global_min_lat:", global_min_lat)
print("global_max_lat:", global_max_lat)
print("global_min_lon:", global_min_lon)
print("global_max_lon:", global_max_lon)

global_lat_midpoint = global_min_lat + (global_max_lat - global_min_lat)/2
global_lon_midpoint = global_min_lon + (global_max_lon - global_min_lon)/2

length = cal_dis(lat_1=global_min_lat, lon_1=global_lon_midpoint, lat_2=global_max_lat, lon_2=global_lon_midpoint)
width  = cal_dis(lat_1=global_lat_midpoint, lon_1=global_min_lon, lat_2=global_lat_midpoint, lon_2=global_max_lon)

print("length:", length)
print("width:", width)
# print(max(taxi_trajectories[0].points,key=attrgetter('lat')))
# print(taxi_trajectories[0].max_lat)

global_min_lat: 41.159358
global_max_lat: 41.169429
global_min_lon: -8.58906
global_max_lon: -8.582022
length: 1.121098591779083
width: 0.5898125251027557


In [57]:
UNIT_CELL_SIZE = 0.1        # for 100 x 100 metres cell size, can try 1km x 1km too
length_size = math.ceil(length / UNIT_CELL_SIZE) + 1
width_size  = math.ceil(width  / UNIT_CELL_SIZE) + 1

point_grid = [ [[[]]]*width_size for i in range(length_size)]   # create a 2d array of lists of points (3D array of points)
count_grid = [ [0]*width_size for i in range(length_size)]

for traj in taxi_trajectories:
    for point in traj.points:
        this_lat = point.lat
        this_lon = point.lon
        # print(point)

        lat_mid = this_lat + (this_lat - global_min_lat)/2
        lon_mid = this_lon + (this_lon - global_min_lon)/2

        lat_dis = cal_dis(lat_1=global_min_lat, lon_1=lon_mid, lat_2=this_lat, lon_2=lon_mid)
        lon_dis = cal_dis(lat_1=lat_mid, lon_1=global_min_lon, lat_2=lat_mid, lon_2=this_lon)

        x = lat_dis / UNIT_CELL_SIZE
        y = lon_dis / UNIT_CELL_SIZE
        x, y = round(x), round(y)
        # print("x:({}), y:({})".format(x,y))

        point_grid[x][y].append(point)
        count_grid[x][y] += 1
        # print()

# print(pd.DataFrame(count_grid))

In [58]:
# finding stationary taxis (investigate excess number of point in a cell), turn out to be stationary taxis
for i in range(len(count_grid)):
    for j in range(len(count_grid[i])):
        if count_grid[i][j] > 10:
            print(i,j)

In [59]:
import random
def random_clear_total(percent_testing_data, use_seed=False):
    # randomly clear points in whole grid_obj, except for first and last in each trajectory for linear implementation

    # append every point from grid_obj into ls_points, except first and last in traj
    ls_points = []
    for traj in taxi_trajectories:
        ls_points.extend(traj.points[1:len(traj.points)-1])

    num_total_points = len(ls_points)
    print("num total:", num_total_points)
    num_test_points = int(percent_testing_data * num_total_points)
    print("num test:", num_test_points)

    if use_seed is True:
        random.seed(1)

    random_test_index = random.sample(range(num_total_points), num_test_points)
    # print(sorted(random_test_index))

    [ls_points[i].set_truth_false() for i in random_test_index]

random_clear_total(0.5)

num total: 23
num test: 11


In [61]:
for traj in taxi_trajectories:
    traj.get_points_info()

1 Point     : [Timestamp: 1372638303, Longitude: -8.587116, Latitude: 41.162427, Truth: True]
2 Point     : [Timestamp: 1372638318, Longitude: -8.586198, Latitude: 41.162112, Truth: True]
3 Point     : [Timestamp: 1372638333, Longitude: -8.585982, Latitude: 41.161338, Truth: False]
4 Point     : [Timestamp: 1372638348, Longitude: -8.585037, Latitude: 41.160024, Truth: False]
5 Point     : [Timestamp: 1372638363, Longitude: -8.584146, Latitude: 41.159358, Truth: False]
6 Point     : [Timestamp: 1372638378, Longitude: -8.583138, Latitude: 41.160204, Truth: False]
7 Point     : [Timestamp: 1372638393, Longitude: -8.582175, Latitude: 41.160789, Truth: False]
8 Point     : [Timestamp: 1372638408, Longitude: -8.582022, Latitude: 41.162364, Truth: False]
9 Point     : [Timestamp: 1372638423, Longitude: -8.583849, Latitude: 41.163192, Truth: True]
10 Point     : [Timestamp: 1372638438, Longitude: -8.586189, Latitude: 41.163273, Truth: False]
11 Point     : [Timestamp: 1372638453, Longitude: -8

In [86]:
features_ls = []

for traj in taxi_trajectories:
    traj_points = traj.points
    last_truth_pt = traj_points[0]
    next_truth_pt = traj_points[len(traj_points)-1]
    to_predict = False              # True when current pt is not truth value

    for curr_pt in traj_points:
        if to_predict is False:
            if curr_pt.is_truth is True:
                # all good, truth
                last_truth_pt = curr_pt
            else:
                # encountered point with non-truth value
                to_predict = True
                num_missing_data = 1
                features_ls.append([last_truth_pt.lon, last_truth_pt.lat, last_truth_pt.timestamp, curr_pt.timestamp, "target"])
        
        else:
            # in a streak of non-truth points
            if curr_pt.is_truth is False:
                # curr_pt.prev_pt_time = last_truth_pt.timestamp
                num_missing_data += 1
                features_ls.append([last_truth_pt.lon, last_truth_pt.lat, last_truth_pt.timestamp, curr_pt.timestamp, "target"])

            else:
                # found truth point
                to_predict = False
                next_truth_pt = curr_pt

                # print("ltt:", last_truth_pt.timestamp)
                # print("ntt:", next_truth_pt.timestamp)

                # [features_ls[i][-1:-1][next_truth_pt.lon, next_truth_pt.lat, next_truth_pt.timestamp, traj.radius_of_gyration, traj.entropy])\
                #     for i in range(len(features_ls)-1, len(features_ls)-num_missing_data-1, -1)]

                print(num_missing_data)
                curr_len = len(features_ls)
                for i in range(curr_len-1, curr_len-num_missing_data-1, -1):
                    print(features_ls[i])
                    features_ls[i][-1:-1] = [next_truth_pt.lon, next_truth_pt.lat, next_truth_pt.timestamp, traj.radius_of_gyration, traj.entropy]
                    print(features_ls[i])
                    print()

                last_truth_pt = curr_pt

6
[-8.586198, 41.162112, 1372638318, 1372638408, 'target']
[-8.586198, 41.162112, 1372638318, 1372638408, -8.583849, 41.163192, 1372638423, 0.6383871750173397, 3.232487689168953, 'target']

[-8.586198, 41.162112, 1372638318, 1372638393, 'target']
[-8.586198, 41.162112, 1372638318, 1372638393, -8.583849, 41.163192, 1372638423, 0.6383871750173397, 3.232487689168953, 'target']

[-8.586198, 41.162112, 1372638318, 1372638378, 'target']
[-8.586198, 41.162112, 1372638318, 1372638378, -8.583849, 41.163192, 1372638423, 0.6383871750173397, 3.232487689168953, 'target']

[-8.586198, 41.162112, 1372638318, 1372638363, 'target']
[-8.586198, 41.162112, 1372638318, 1372638363, -8.583849, 41.163192, 1372638423, 0.6383871750173397, 3.232487689168953, 'target']

[-8.586198, 41.162112, 1372638318, 1372638348, 'target']
[-8.586198, 41.162112, 1372638318, 1372638348, -8.583849, 41.163192, 1372638423, 0.6383871750173397, 3.232487689168953, 'target']

[-8.586198, 41.162112, 1372638318, 1372638333, 'target']
[

In [90]:
# for i in range(len(features_ls)):
#     print(features_ls[i])

df = pd.DataFrame(features_ls)
df.columns = ["prev lon", "prev lat", "prev t", "curr t", "next lon", "next lat", "next t", "ROG", "Ent", "Target"]
df

Unnamed: 0,prev lon,prev lat,prev t,curr t,next lon,next lat,next t,ROG,Ent,Target
0,-8.586198,41.162112,1372638318,1372638333,-8.583849,41.163192,1372638423,0.638387,3.232488,target
1,-8.586198,41.162112,1372638318,1372638348,-8.583849,41.163192,1372638423,0.638387,3.232488,target
2,-8.586198,41.162112,1372638318,1372638363,-8.583849,41.163192,1372638423,0.638387,3.232488,target
3,-8.586198,41.162112,1372638318,1372638378,-8.583849,41.163192,1372638423,0.638387,3.232488,target
4,-8.586198,41.162112,1372638318,1372638393,-8.583849,41.163192,1372638423,0.638387,3.232488,target
5,-8.586198,41.162112,1372638318,1372638408,-8.583849,41.163192,1372638423,0.638387,3.232488,target
6,-8.583849,41.163192,1372638423,1372638438,-8.589051,41.164254,1372638468,0.638387,3.232488,target
7,-8.583849,41.163192,1372638423,1372638453,-8.589051,41.164254,1372638468,0.638387,3.232488,target
8,-8.589051,41.164254,1372638468,1372638483,-8.589006,41.164929,1372638498,0.638387,3.232488,target
9,-8.587089,41.169114,1372638543,1372638558,-8.586747,41.169357,1372638573,0.638387,3.232488,target
