In [1]:
from matplotlib import pyplot as plt
import numpy as np
import math

import pandas as pd

# Each line of the CSV is one trajectory: a start timestamp followed by a
# variable number of lon/lat values.  Because rows have different lengths the
# file is read line by line and split on commas afterwards.
#
# NOTE: the previous `pd.read_csv(..., sep='\n')` trick is rejected by recent
# pandas releases (the line terminator may not be used as the separator), so
# the lines are read with plain file I/O instead.
TRAINING_CSV = "./taxi_dataset/training_data_simple.csv"
N_ROWS = 1      # number of trajectories (non-empty lines) to load

raw_lines = []
with open(TRAINING_CSV, encoding="utf-8") as csv_file:
    for raw_line in csv_file:
        raw_line = raw_line.rstrip("\r\n")
        if not raw_line:
            continue            # skip blank lines, as read_csv did
        raw_lines.append(raw_line)
        if len(raw_lines) >= N_ROWS:
            break

# Same shape as before: a single column (label 0) holding one raw line per row.
df_rows = pd.DataFrame({0: raw_lines})
df_raw = df_rows[0].str.split(',', expand=True)
df_raw

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,663,664,665,666,667,668,669,670,671,672
0,1372638303,-8.587116,41.162427,-8.586198,41.162112,-8.585982,41.161338,-8.585037,41.160024,-8.584146,...,,,,,,,,,,


In [16]:
# Work on a small slice of the raw table: at most the first 7 rows and the
# first 7 columns.
# Intended column names (not applied yet):
#   ["start_time", "lon_1", "lat_1", "lon_2", "lat_2", "lon_3", "lat_3"]
N_SUBSET_ROWS = 7
N_SUBSET_COLS = 7
df = df_raw.iloc[:N_SUBSET_ROWS, :N_SUBSET_COLS]
df

Unnamed: 0,0,1,2,3,4,5,6
0,1372638303,-8.587116,41.162427,-8.586198,41.162112,-8.585982,41.161338


In [17]:
def cal_dis(lat_1, lon_1, lat_2, lon_2, radius_km=6378.137):
    """Return the great-circle (haversine) distance between two coordinates.

    Args:
        lat_1, lon_1: latitude / longitude of the first location, in degrees.
        lat_2, lon_2: latitude / longitude of the second location, in degrees.
        radius_km: sphere radius used for the conversion to a distance.  The
            default is the value the notebook has always used (Earth's
            equatorial radius in kilometres).

    Returns:
        The distance along the sphere, in the unit of ``radius_km``.
        NumPy functions are used throughout, so the result is a NumPy scalar
        for scalar inputs.
    """
    # Degrees -> radians.
    lon_1 = lon_1 * math.pi / 180
    lat_1 = lat_1 * math.pi / 180
    lon_2 = lon_2 * math.pi / 180
    lat_2 = lat_2 * math.pi / 180

    sin_half_dlat = np.sin(abs(lat_1 - lat_2) / 2)
    sin_half_dlon = np.sin(abs(lon_1 - lon_2) / 2)
    haversine = (sin_half_dlat * sin_half_dlat
                 + np.cos(lat_1) * np.cos(lat_2) * sin_half_dlon * sin_half_dlon)

    # Floating-point rounding can push the term marginally outside [0, 1]
    # (e.g. for near-antipodal locations), which would make arcsin return NaN.
    haversine = np.clip(haversine, 0.0, 1.0)

    return 2 * radius_km * np.arcsin(np.sqrt(haversine))

In [20]:
from operator import attrgetter

class Trajectory:
    """One trajectory: an ordered list of ``Point`` objects plus summary statistics.

    Attributes:
        id: identifier supplied by the caller.
        first_timestamp: timestamp of the first point (column 0 of the row).
        points: list of ``Point`` objects in recording order.
        min_lat, max_lat, min_lon, max_lon: bounding box of ``points``.
        radius_of_gyration: -1 until ``calc_radius_of_gyration`` is called.
        entropy: -1 until ``calc_entropy`` is called.
    """

    # Timestamp step between two consecutive points
    # (presumably seconds between GPS samples -- TODO confirm against the dataset).
    SAMPLE_INTERVAL = 15

    @staticmethod
    def _is_missing(value):
        """Return True for the placeholders used for an absent coordinate."""
        if value is None or value == '':
            return True
        return isinstance(value, float) and math.isnan(value)

    def __init__(self, id, df_row):
        """Build the trajectory from one table row.

        Args:
            id: identifier stored on the trajectory.
            df_row: row laid out as ``[timestamp, lon, lat, lon, lat, ...]``;
                it must support integer indexing and expose ``.size``.
                Trailing cells may be None / '' / NaN for shorter trajectories.
        """
        self.id = id
        self.first_timestamp = int(df_row[0])

        # populate points array
        self.points = [Point(time=self.first_timestamp, lon=df_row[1], lat=df_row[2])]
        # Coordinates come in (lon, lat) pairs starting at column 3.  Stopping at
        # size - 1 guarantees that df_row[i + 1] exists even when the row ends
        # with a dangling longitude.
        for i in range(3, df_row.size - 1, 2):
            lon, lat = df_row[i], df_row[i + 1]
            if self._is_missing(lon) or self._is_missing(lat):
                break

            this_timestamp = self.first_timestamp + (i // 2) * self.SAMPLE_INTERVAL
            self.points.append(Point(time=this_timestamp, lon=lon, lat=lat))

        # find trajectory min and max lat, lon
        self.min_lat = min(point.lat for point in self.points)
        self.max_lat = max(point.lat for point in self.points)
        self.min_lon = min(point.lon for point in self.points)
        self.max_lon = max(point.lon for point in self.points)

        self.radius_of_gyration = -1        # default value -1 since equation is sqrt, making -1 impossible
        self.entropy = -1                   # default value -1 since equation never returns -1

    def get_points_info(self):
        """Print every point, numbered from 1."""
        for number, point in enumerate(self.points, start=1):
            print(number, point)

    def calc_radius_of_gyration(self):
        """Compute and store ``self.radius_of_gyration``.

        The value is sqrt(1/m * sum(d_i ** 2)), where d_i is the ``cal_dis``
        distance between point i and the mean lat/lon of all m points.
        """
        point_center_lat = np.mean([point.lat for point in self.points])
        point_center_lon = np.mean([point.lon for point in self.points])

        squared_dist_sum = 0.0
        for point in self.points:
            dist = cal_dis(lat_1=point.lat, lon_1=point.lon,
                           lat_2=point_center_lat, lon_2=point_center_lon)
            # The definition averages *squared* distances; summing the raw
            # distances (as done previously) yields a different quantity.
            squared_dist_sum += dist ** 2

        m = len(self.points)
        self.radius_of_gyration = math.sqrt(squared_dist_sum / m)

    def calc_entropy(self, unit_cell_size=0.1):
        """Compute and store the Shannon entropy (base 2) of the visited grid cells.

        The bounding box of the trajectory is divided into square cells, every
        point is assigned to the nearest cell index, and the entropy of the
        resulting cell-occupancy distribution is stored in ``self.entropy``.

        Args:
            unit_cell_size: edge length of a cell in the unit returned by
                ``cal_dis``.  The default 0.1 corresponds to the 100 x 100
                metres cells used so far; 1.0 would give 1 km x 1 km.
        """
        lat_midpoint = self.min_lat + (self.max_lat - self.min_lat) / 2
        lon_midpoint = self.min_lon + (self.max_lon - self.min_lon) / 2

        # Extent of the bounding box, each side measured along its centre line.
        length = cal_dis(lat_1=self.min_lat, lon_1=lon_midpoint, lat_2=self.max_lat, lon_2=lon_midpoint)
        width = cal_dis(lat_1=lat_midpoint, lon_1=self.min_lon, lat_2=lat_midpoint, lon_2=self.max_lon)

        # +1 because indices are rounded to the nearest cell, so the far edge
        # of the box can land on index ceil(extent / cell).
        length_size = math.ceil(length / unit_cell_size) + 1
        width_size = math.ceil(width / unit_cell_size) + 1

        count_grid = [[0] * width_size for _ in range(length_size)]
        for point in self.points:
            this_lat = point.lat
            this_lon = point.lon

            # Midpoint between the box origin and this point (the earlier
            # `this + (this - min) / 2` was not a midpoint).
            lat_mid = self.min_lat + (this_lat - self.min_lat) / 2
            lon_mid = self.min_lon + (this_lon - self.min_lon) / 2

            lat_dis = cal_dis(lat_1=self.min_lat, lon_1=lon_mid, lat_2=this_lat, lon_2=lon_mid)
            lon_dis = cal_dis(lat_1=lat_mid, lon_1=self.min_lon, lat_2=lat_mid, lon_2=this_lon)

            # The east-west distance is measured at a slightly different
            # latitude than `width`, so clamp to keep the index inside the grid.
            x = min(int(round(lat_dis / unit_cell_size)), length_size - 1)
            y = min(int(round(lon_dis / unit_cell_size)), width_size - 1)
            count_grid[x][y] += 1

        m = len(self.points)
        temp_ent_sum = 0
        for grid_row in count_grid:
            for cell_count in grid_row:
                if cell_count == 0:
                    continue                                        # empty cells contribute 0
                percent_i = cell_count / m                          # p(i)
                temp_ent_sum += percent_i * math.log2(percent_i)    # summation

        self.entropy = -1 * temp_ent_sum

    def __str__(self):
        """Two-line summary: id / first timestamp / point count, then the bounding box."""
        template = ("\n{:10s}: [ID: {:5d}, Time_First: {:10d}, Points: {}] "
                    "\n{:10s}  [LAT_range: ({:5f}, {:5f}), LON_range: ({:5f}, {:5f})]")
        return template.format("Trajectory", self.id, self.first_timestamp, len(self.points),
                               "", self.min_lat, self.max_lat, self.min_lon, self.max_lon)

    def __repr__(self):
        """Use the same text as ``__str__`` so lists of trajectories print readably."""
        return str(self)


        
        

class Point:
    """A single timestamped coordinate of a trajectory.

    Attributes:
        timestamp: time value of the sample, stored as given.
        lon, lat: coordinates, converted to float on construction.
        is_truth: True while the coordinates are the original (truth) values.
        prev_pt_time, next_pt_time: initialised to -1.
    """

    def __init__(self, time, lon, lat):
        self.timestamp = time
        self.lon, self.lat = float(lon), float(lat)

        # True if point coordinate is truth value
        self.is_truth = True
        self.prev_pt_time = self.next_pt_time = -1

    def clear_coor(self):
        """Mark the point as no longer truth and overwrite both coordinates with -1."""
        self.is_truth = False
        self.lon = self.lat = -1

    def set_prediction(self, pred_pt_lst):
        """Store predicted coordinates if the prediction targets this timestamp.

        Args:
            pred_pt_lst: iterable of exactly three items ``(time, lon, lat)``.

        On a timestamp mismatch nothing is stored and a diagnostic is printed.
        """
        pred_time, pred_lon, pred_lat = pred_pt_lst
        same_time = self.timestamp == pred_time
        if not same_time:
            print("ERORR: time mismatch")
            print("actual t :", self.timestamp)
            print("predicted:", pred_time)
            print("\n")
            return

        self.lon = pred_lon
        self.lat = pred_lat

    def __str__(self):
        template = "{:10s}: [Timestamp: {:10d}, Longitude: {:9f}, Latitude: {:9f}, Truth: {}]"
        return template.format("Point", self.timestamp, self.lon, self.lat, self.is_truth)

    def __repr__(self):
        # Same text as __str__ so containers of points print readably.
        return str(self)

In [21]:
# Turn every row of the working table into a Trajectory; ids are the
# 0-based position of the row.
taxi_trajectories = [
    Trajectory(position, row)
    for position, (_, row) in enumerate(df.iterrows())
]
counter = len(taxi_trajectories)

# Sanity check of both mobility metrics on the first trajectory.
first_trajectory = taxi_trajectories[0]

first_trajectory.calc_radius_of_gyration()
print("ROG:", first_trajectory.radius_of_gyration)

first_trajectory.calc_entropy()
print("Ent:", first_trajectory.entropy)

1.584962500721156


In [35]:
# Bounding box over all trajectories, taken from their per-trajectory extremes.
global_min_lat = min(traj.min_lat for traj in taxi_trajectories)
global_max_lat = max(traj.max_lat for traj in taxi_trajectories)
global_min_lon = min(traj.min_lon for traj in taxi_trajectories)
global_max_lon = max(traj.max_lon for traj in taxi_trajectories)

print("global_min_lat:", global_min_lat)
print("global_max_lat:", global_max_lat)
print("global_min_lon:", global_min_lon)
print("global_max_lon:", global_max_lon)

# Centre lines of the box; each side is measured along the opposite centre line.
half_lat_span = (global_max_lat - global_min_lat)/2
half_lon_span = (global_max_lon - global_min_lon)/2
global_lat_midpoint = global_min_lat + half_lat_span
global_lon_midpoint = global_min_lon + half_lon_span

# north-south extent
length = cal_dis(
    lat_1=global_min_lat, lon_1=global_lon_midpoint,
    lat_2=global_max_lat, lon_2=global_lon_midpoint,
)
# east-west extent
width = cal_dis(
    lat_1=global_lat_midpoint, lon_1=global_min_lon,
    lat_2=global_lat_midpoint, lon_2=global_max_lon,
)

print("length:", length)
print("width:", width)

global_min_lat: 41.144913
global_max_lat: 41.178789
global_min_lon: -8.655048
global_max_lon: -8.578701
length: 3.77105907011222
width: 7.956479951658587


In [46]:
UNIT_CELL_SIZE = 0.1        # for 100 x 100 metres cell size, can try 1km x 1km too

# +1 because indices are rounded to the nearest cell, so the far edge of the
# bounding box can land on index ceil(extent / cell).
length_size = math.ceil(length / UNIT_CELL_SIZE) + 1
width_size  = math.ceil(width  / UNIT_CELL_SIZE) + 1

# 2D grid where every cell owns its *own* list of points.
# (`[[[]]] * width_size` made all columns of a row share one list object, and
# that list started out containing a stray empty list.)
point_grid = [[[] for _ in range(width_size)] for _ in range(length_size)]
count_grid = [[0] * width_size for _ in range(length_size)]

for traj in taxi_trajectories:
    for point in traj.points:
        this_lat = point.lat
        this_lon = point.lon

        # Midpoint between the grid origin and this point (the earlier
        # `this + (this - min) / 2` was not a midpoint).
        lat_mid = global_min_lat + (this_lat - global_min_lat)/2
        lon_mid = global_min_lon + (this_lon - global_min_lon)/2

        # north-south / east-west offset of the point from the grid origin
        lat_dis = cal_dis(lat_1=global_min_lat, lon_1=lon_mid, lat_2=this_lat, lon_2=lon_mid)
        lon_dis = cal_dis(lat_1=lat_mid, lon_1=global_min_lon, lat_2=lat_mid, lon_2=this_lon)

        # The east-west offset is measured at a slightly different latitude
        # than `width`, so clamp to keep the index inside the grid.
        x = min(int(round(lat_dis / UNIT_CELL_SIZE)), length_size - 1)
        y = min(int(round(lon_dis / UNIT_CELL_SIZE)), width_size - 1)

        point_grid[x][y].append(point)
        count_grid[x][y] += 1

In [43]:
# def condition(x): return x > 50
# output = [idx for idx, element in enumerate(count_grid) if condition(element)]
# print(output)

# Look for cells holding an excess number of points; on investigation these
# turned out to be stationary taxis.
DENSE_CELL_THRESHOLD = 10
for i, grid_row in enumerate(count_grid):
    for j, cell_count in enumerate(grid_row):
        if cell_count > DENSE_CELL_THRESHOLD:
            print(i,j)

2 16
14 28
16 23
20 19
20 27
20 28
21 9
21 27
27 29


In [None]:
def calc_rf_features(num_distinct_locations, ):
    m = num_distinct_locations
    
    radius_of_gyration = math.sqrt(1/m * )