# Data preprocessing and trajectory analysis

In [1]:
library(data.table) # Fread
library(dplyr)
library(geosphere) # Calculate vehicle distance
library(lubridate)# Date column extraction
library(robfilter)# Smooth the data
library(ggplot2) # Plot
library(ggpubr) # Combine different plots
library(ggmap) # plot points on the map
library(RColorBrewer)

"package 'data.table' was built under R version 3.6.3"
"package 'dplyr' was built under R version 3.6.3"

Attaching package: 'dplyr'


The following objects are masked from 'package:data.table':

    between, first, last


The following objects are masked from 'package:stats':

    filter, lag


The following objects are masked from 'package:base':

    intersect, setdiff, setequal, union


"package 'geosphere' was built under R version 3.6.3"

Attaching package: 'lubridate'


The following objects are masked from 'package:data.table':

    hour, isoweek, mday, minute, month, quarter, second, wday, week,
    yday, year


The following object is masked from 'package:base':

    date


"package 'robfilter' was built under R version 3.6.3"
Loading required package: robustbase

"package 'robustbase' was built under R version 3.6.3"
Loading required package: MASS

"package 'MASS' was built under R version 3.6.3"

Attaching package: 'MASS'


The following object is masked from 'package:dplyr

# Input location data

In [2]:
# Read data from folder
# Suggestion: process one year at a time. One year's table has 94363502 rows (green)
# Two-digit year(s) pasted into the raw file names by the reader functions
YEARLIST = c('19')
# Two-digit month(s) pasted into the raw file names (note the lowercase "l" in the variable name)
MONTHlIST = c("04") # FOR FULL TABLE
# Folder holding the raw vehicle-location CSV files; file names are pasted directly onto this prefix
DISTANCE_FILEPATH = "../../data/raw/vehicle-location/"

In [3]:
# Derive calendar columns from the transaction timestamp.
#   df: table with a `trxtime` column that day()/month()/year() can read
# Returns df with `day`, `month` and `year` columns added (or overwritten).
add_dd_mm_yy_cols = function(df) {
    stamps <- df$trxtime
    df$day <- day(stamps)
    df$month <- month(stamps)
    df$year <- year(stamps)
    df
}
# Read heavy rail location raw data
#   year:  two-digit year string(s), e.g. "19"
#   month: two-digit month string(s), e.g. "04"
# Returns one data.table containing the rows of every requested
# (year, month) file, with day/month/year columns added by
# add_dd_mm_yy_cols().
get_heavy_rail_trajectories = function(year, month){
    # Build one path per (year, month) pair. The previous version pasted the
    # whole vectors with collapse = "", which glued several file names into a
    # single invalid path as soon as more than one year or month was passed.
    periods = expand.grid(month = month, year = year, stringsAsFactors = FALSE)
    file_names = paste("heavyrail", "trajectories", periods$month, periods$year, ".csv", sep = "-")
    paths = paste(DISTANCE_FILEPATH, file_names, sep = "")
    # Read every file and stack them; for a single file this is just that table
    dh = rbindlist(lapply(paths, fread), use.names = TRUE, fill = TRUE)
    dh = add_dd_mm_yy_cols(dh)
    return(dh)
}
# Read light rail location raw data
#   year:  two-digit year string(s), e.g. "19"
#   month: two-digit month string(s), e.g. "04"
# Returns one data.table containing the rows of every requested
# (year, month) file, with day/month/year columns added by
# add_dd_mm_yy_cols().
get_light_rail_trajectories = function(year, month){
    # Build one path per (year, month) pair. The previous version pasted the
    # whole vectors with collapse = "", which glued several file names into a
    # single invalid path as soon as more than one year or month was passed.
    periods = expand.grid(month = month, year = year, stringsAsFactors = FALSE)
    file_names = paste("lightrail", "trajectories", periods$month, periods$year, ".csv", sep = "-")
    paths = paste(DISTANCE_FILEPATH, file_names, sep = "")
    # Read every file and stack them; for a single file this is just that table
    dg = rbindlist(lapply(paths, fread), use.names = TRUE, fill = TRUE)
    dg = add_dd_mm_yy_cols(dg)
    return(dg)
}

In [4]:
# Get the raw table
# Load the heavy-rail and light-rail tables for the configured year and month
df_heavyrail = get_heavy_rail_trajectories(YEARLIST,MONTHlIST)
df_lightrail = get_light_rail_trajectories(YEARLIST,MONTHlIST)

# Table subset by trajectory index

In [5]:
# Keep only the rows of one day (light-rail column layout).
#   df:    data.table with a `day` column plus trainid, vehicleid, routeid
#   dayid: value of `day` to select
# Returns the matching rows keyed by trainid, vehicleid, routeid.
get_day_trajectories = function(df, dayid){
    rows_for_day <- df[day == dayid, .SD, keyby = .(trainid, vehicleid, routeid)]
    rows_for_day
}
# Build the trajectory index table (light-rail column layout).
#   day_df: one day's data.table
# Returns the distinct combinations of trainid, vehicleid, routeid, car1,
# car2 and car3, in that column order.
get_unique_trajectory_indices = function(day_df) {
    id_columns <- day_df[, .(trainid, vehicleid, routeid, car1, car2, car3)]
    unique(id_columns)
}
# Pull a single light-rail trajectory out of one day's table.
#   day_df:        one day's data.table
#   traj_index_df: index table; columns are read by position
#                  (1 trainid, 2 vehicleid, 4 car1, 5 car2, 6 car3)
#   index:         row of traj_index_df to extract
# Returns the matching rows ordered by trxtime, one row per distinct trxtime,
# with a POSIXct `time` column added.
# Side effect: sets the global option `tz` to "America/New_York".
extract_unique_trajectory_light = function(day_df, traj_index_df, index){
    # NA never compares equal with ==, so missing car ids are temporarily
    # replaced by a marker value on both sides of the comparison
    na_marker <- 9999999
    for (car_col in c("car2", "car3")) {
        day_df[[car_col]][is.na(day_df[[car_col]])] <- na_marker
        traj_index_df[[car_col]][is.na(traj_index_df[[car_col]])] <- na_marker
    }
    # Identifier values of the requested trajectory (routeid is not matched)
    target_train <- traj_index_df[index, 1][[1]]
    target_vehicle <- traj_index_df[index, 2][[1]]
    target_car1 <- traj_index_df[index, 4][[1]]
    target_car2 <- traj_index_df[index, 5][[1]]
    target_car3 <- traj_index_df[index, 6][[1]]
    matched_rows <- day_df[trainid == target_train &
                           vehicleid == target_vehicle &
                           car1 == target_car1 &
                           car2 == target_car2 &
                           car3 == target_car3,]
    time_ordered <- matched_rows[order(trxtime)]
    # Keep the first observation when several share the same timestamp
    deduped <- time_ordered[, .SD[1], by = trxtime]
    # Turn the marker back into NA
    for (car_col in c("car2", "car3")) {
        deduped[[car_col]][deduped[[car_col]] == na_marker] <- NA
    }
    # Convert trxtime to a timestamp
    options(tz = "America/New_York")
    deduped$time <- as.POSIXct(deduped$trxtime, tz = getOption("tz"))
    deduped
}
# Pull a single heavy-rail trajectory out of one day's table.
#   day_df:        one day's data.table
#   traj_index_df: index table; columns are read by position
#                  (1 trainid, 2 vehicleid, 3 lineid, 4 heavyrailbranchid, 5 tripid)
#   index:         row of traj_index_df to extract
# Returns the matching rows ordered by trxtime, one row per distinct trxtime,
# with a POSIXct `time` column added.
# Side effect: sets the global option `tz` to "America/New_York".
extract_unique_trajectory_heavy = function(day_df, traj_index_df, index){
    # NA never compares equal with ==, so missing ids are temporarily
    # replaced by a marker value on both sides of the comparison
    na_marker <- 9999999
    nullable_cols <- c("heavyrailbranchid", "tripid", "vehicleid")
    for (id_col in nullable_cols) {
        day_df[[id_col]][is.na(day_df[[id_col]])] <- na_marker
        traj_index_df[[id_col]][is.na(traj_index_df[[id_col]])] <- na_marker
    }
    # Identifier values of the requested trajectory
    target_train <- traj_index_df[index, 1][[1]]
    target_vehicle <- traj_index_df[index, 2][[1]]
    target_line <- traj_index_df[index, 3][[1]]
    target_branch <- traj_index_df[index, 4][[1]]
    target_trip <- traj_index_df[index, 5][[1]]
    matched_rows <- day_df[trainid == target_train &
                           vehicleid == target_vehicle &
                           lineid == target_line &
                           heavyrailbranchid == target_branch &
                           tripid == target_trip,]
    time_ordered <- matched_rows[order(trxtime)]
    # Keep the first observation when several share the same timestamp
    deduped <- time_ordered[, .SD[1], by = trxtime]
    # Turn the marker back into NA
    for (id_col in nullable_cols) {
        deduped[[id_col]][deduped[[id_col]] == na_marker] <- NA
    }
    # Convert trxtime to a timestamp
    options(tz = "America/New_York")
    deduped$time <- as.POSIXct(deduped$trxtime, tz = getOption("tz"))
    deduped
}

In [6]:
# Select one light-rail trajectory for a given day.
#   df:    light-rail table
#   day:   day value passed to get_day_trajectories()
#   index: row of the trajectory index table to extract
# Returns a list with `clean_trajectory` (the extracted rows) and `index`
# (the full trajectory index table for that day).
light_subset = function(df,day,index){
    rows_for_day <- get_day_trajectories(df, day)
    index_table <- get_unique_trajectory_indices(rows_for_day)
    list(
        clean_trajectory = extract_unique_trajectory_light(rows_for_day, index_table, index),
        index = index_table
    )
}


In [6]:
# Keep only the rows of one day (heavy-rail column layout) and report how
# many observations were found.
#   df:    data.table with a `day` column plus lineid and trainid
#   dayid: value of `day` to select
# Returns the matching rows keyed by lineid, trainid.
# NOTE(review): this definition reuses the name of the earlier light-rail
# version, so whichever cell ran last wins — confirm the intended run order.
get_day_trajectories = function(df, dayid){
    rows_for_day <- df[day == dayid, .SD, keyby = .(lineid, trainid)]
    print(paste("Number of observations", nrow(rows_for_day), "on day", dayid))
    rows_for_day
}

# Build the trajectory index table (heavy-rail column layout) and report
# how many distinct trajectories were found.
#   day_df: one day's data.table
# Returns the distinct combinations of trainid, vehicleid, lineid,
# heavyrailbranchid and tripid, in that column order.
get_unique_trajectory_indices = function(day_df) {
    id_columns <- day_df[, .(trainid, vehicleid, lineid, heavyrailbranchid, tripid)]
    distinct_ids <- unique(id_columns)
    print(paste("Number of unique trajectories extracted: ", nrow(distinct_ids)))
    distinct_ids
}

# Pull a single trajectory out of one day's table (heavy-rail columns).
#   day_df:        one day's data.table
#   traj_index_df: index table; columns are read by position
#                  (1 trainid, 2 vehicleid, 3 lineid, 4 heavyrailbranchid, 5 tripid)
#   index:         row of traj_index_df to extract
# Returns the matching rows ordered by trxtime, one row per distinct trxtime,
# with a POSIXct `time` column added.
# Side effect: sets the global option `tz` to "America/New_York".
extract_unique_trajectory = function(day_df, traj_index_df, index){
    # NA never compares equal with ==, so missing ids are temporarily
    # replaced by a marker value on both sides of the comparison
    na_marker <- 9999999
    nullable_cols <- c("heavyrailbranchid", "tripid", "vehicleid")
    for (id_col in nullable_cols) {
        day_df[[id_col]][is.na(day_df[[id_col]])] <- na_marker
        traj_index_df[[id_col]][is.na(traj_index_df[[id_col]])] <- na_marker
    }
    # Identifier values of the requested trajectory
    target_train <- traj_index_df[index, 1][[1]]
    target_vehicle <- traj_index_df[index, 2][[1]]
    target_line <- traj_index_df[index, 3][[1]]
    target_branch <- traj_index_df[index, 4][[1]]
    target_trip <- traj_index_df[index, 5][[1]]
    matched_rows <- day_df[trainid == target_train &
                           vehicleid == target_vehicle &
                           lineid == target_line &
                           heavyrailbranchid == target_branch &
                           tripid == target_trip,]
    time_ordered <- matched_rows[order(trxtime)]
    # Keep the first observation when several share the same timestamp
    deduped <- time_ordered[, .SD[1], by = trxtime]
    # Turn the marker back into NA
    for (id_col in nullable_cols) {
        deduped[[id_col]][deduped[[id_col]] == na_marker] <- NA
    }
    # Convert trxtime to a timestamp
    options(tz = "America/New_York")
    deduped$time <- as.POSIXct(deduped$trxtime, tz = getOption("tz"))
    deduped
}

In [36]:
# Select one heavy-rail trajectory for a given day.
#   df:    heavy-rail table
#   day:   day value passed to get_day_trajectories()
#   index: row of the trajectory index table to extract
# Returns a list with `clean_trajectory` (the extracted rows) and `index`
# (the full trajectory index table for that day).
heavy_subset = function(df,day,index){
    rows_for_day <- get_day_trajectories(df, day)
    index_table <- get_unique_trajectory_indices(rows_for_day)
    list(
        clean_trajectory = extract_unique_trajectory_heavy(rows_for_day, index_table, index),
        index = index_table
    )
}


In [7]:
# Test trajectory: row 7 of the trajectory index table for day 2 of the light-rail table
df_test = light_subset(df_lightrail,2,7)$clean_trajectory

In [None]:
# Process the first few trajectories of one day and stack the results.
#   month_df: table passed to get_day_trajectories()
#   dd:       day value to select
# Returns the row-bound processed trajectories, each tagged with a `trajid`
# column holding its row number in the index table, or NULL when the day has
# no trajectories.
# NOTE(review): compute_speed_acceleration() reads `interval_seconds`;
# presumably preprocess_data() provides it — confirm.
compute_day_trajectories = function(month_df, dd) {
    df_dd = get_day_trajectories(month_df, dd)
    traj_indices_dd = get_unique_trajectory_indices(df_dd)
    print(head(traj_indices_dd))
    num_traj = nrow(traj_indices_dd)
    # Ideally this should cover the whole sequence; for now at most 5.
    # seq_len(min(...)) stays valid for fewer than 5 (or zero) trajectories,
    # where seq(num_traj)[1:5] produced NA or 0 indices.
    traj_ids = seq_len(min(num_traj, 5))
    processed = vector("list", length(traj_ids))
    for (tt in traj_ids) {
        traj = extract_unique_trajectory(df_dd, traj_indices_dd, tt)
        traj$trajid = tt # add a new column
        # Plain nested calls instead of %<>%, which needs magrittr attached
        traj = compute_speed_acceleration(compute_distance(preprocess_data(traj)))
        processed[[tt]] = traj
    }
    if (length(processed) == 0) {
        return(NULL)
    }
    # Bind once at the end instead of growing the table inside the loop
    return(do.call(rbind, processed))
}


In [26]:
# Case 1: no filtering. Computes time intervals, distances, speed and
# acceleration and cumulative totals, plots the trajectory, and returns the
# enriched table.
case_1 = function(clean_trajectory){
    enriched <- clean_trajectory %>%
        compute_time_interval() %>%
        compute_distance() %>%
        compute_speed_acceleration() %>%
        compute_cumulative_time_distance()
    trajectory_plot(enriched)
    enriched
}

In [9]:
# Case 2: smooth the per-step distance with a median filter before deriving
# speed and acceleration.
#   clean_trajectory: trajectory table
#   w:                window width passed to robreg.filter()
# Plots the trajectory and returns the enriched table.
case_2 = function(clean_trajectory,w){
    clean_trajectory <- clean_trajectory %>%
        compute_time_interval() %>%
        compute_distance()
    # Replace the raw distances with the "MED" level of the filter output
    clean_trajectory$dist_meters <- robreg.filter(clean_trajectory$dist_meters, width = w, online = FALSE, method= "MED")$level$MED
    clean_trajectory <- clean_trajectory %>%
        compute_speed_acceleration() %>%
        compute_cumulative_time_distance()
    trajectory_plot(clean_trajectory)
    clean_trajectory
}

In [22]:
# Case 3: keep only observations whose time interval is below 2500 seconds,
# then recompute intervals, distances, speed and acceleration and cumulative
# totals. Plots the trajectory and returns the enriched table.
case_3 = function(clean_trajectory){
    timed <- compute_time_interval(data.table(clean_trajectory))
    kept <- timed[interval_seconds < 2500]
    enriched <- kept %>%
        compute_time_interval() %>%
        compute_distance() %>%
        compute_speed_acceleration() %>%
        compute_cumulative_time_distance()
    trajectory_plot(enriched)
    enriched
}

In [11]:
# Case 4: keep only observations whose time interval exceeds 5 seconds, then
# smooth the per-step distance with a median filter before deriving speed
# and acceleration.
#   clean_trajectory: trajectory table
#   w:                window width passed to robreg.filter()
# Plots the trajectory and returns the enriched table.
case_4 = function(clean_trajectory,w){
    clean_trajectory <- compute_time_interval(data.table(clean_trajectory))
    clean_trajectory <- clean_trajectory[interval_seconds > 5]
    clean_trajectory <- clean_trajectory %>%
        compute_time_interval() %>%
        compute_distance()
    # Replace the raw distances with the "MED" level of the filter output
    clean_trajectory$dist_meters <- robreg.filter(clean_trajectory$dist_meters, width = w, online = FALSE, method= "MED")$level$MED
    clean_trajectory <- clean_trajectory %>%
        compute_speed_acceleration() %>%
        compute_cumulative_time_distance()
    trajectory_plot(clean_trajectory)
    clean_trajectory
}

In [8]:
# Case 5: drop short-interval observations, derive the kinematic columns,
# then drop outlier speeds and accelerations.
# Plots the trajectory and returns the filtered table.
case_5 = function(clean_trajectory){
    timed <- compute_time_interval(data.table(clean_trajectory))
    # Keep observations more than 1 second after the previous one
    spaced <- timed[interval_seconds > 1]
    enriched <- spaced %>%
        compute_time_interval() %>%
        compute_distance() %>%
        compute_speed_acceleration() %>%
        compute_cumulative_time_distance()
    # Keep speeds below 120 km/h and accelerations strictly between -6 and 6 m/s^2
    plausible <- enriched[speed_kph < 120][accel_mps2 > -6 & accel_mps2 < 6 ]
    # The derived columns are not recomputed after this filtering; the
    # original cell had that recalculation commented out.
    trajectory_plot(plausible)
    plausible
}

In [17]:
# Run case 5 on the test trajectory
# NOTE(review): the result is stored as df_test_case1 although it comes from case_5() — confirm the intended name
df_test_case1 = case_5(df_test)

[1] "Line: Green|Trainid: 10004|Vehicleid:   3600|Car1id: 3600|Car2id: 3855|Car3id: NA|Date: 4-2"


Source : http://tile.stamen.com/terrain/14/4949/6059.png

Source : http://tile.stamen.com/terrain/14/4950/6059.png

Source : http://tile.stamen.com/terrain/14/4951/6059.png

Source : http://tile.stamen.com/terrain/14/4952/6059.png

Source : http://tile.stamen.com/terrain/14/4953/6059.png

Source : http://tile.stamen.com/terrain/14/4954/6059.png

Source : http://tile.stamen.com/terrain/14/4955/6059.png

Source : http://tile.stamen.com/terrain/14/4956/6059.png

Source : http://tile.stamen.com/terrain/14/4957/6059.png

Source : http://tile.stamen.com/terrain/14/4958/6059.png

Source : http://tile.stamen.com/terrain/14/4949/6060.png

Source : http://tile.stamen.com/terrain/14/4950/6060.png

Source : http://tile.stamen.com/terrain/14/4951/6060.png

Source : http://tile.stamen.com/terrain/14/4952/6060.png

Source : http://tile.stamen.com/terrain/14/4953/6060.png

Source : http://tile.stamen.com/terrain/14/4954/6060.png

Source : http://tile.stamen.com/terrain/14/4955/6060.png

Source : http:

NULL


In [41]:
# case 1 table
# NOTE(review): df_test_2_case1 is not assigned in any visible cell; presumably the result of case_1() on df_test_2 — confirm
df_test_2_case1

trxtime,trainid,vehicleid,routeid,counter,lineid,lat,lon,servicetypeid,sourceid,...,month,year,time,interval_seconds,dist_meters,speed_mps,speed_kph,accel_mps2,cumdist_km,cumtime_hrs
<chr>,<int>,<chr>,<int>,<int>,<int>,<dbl>,<dbl>,<int>,<int>,...,<dbl>,<dbl>,<dttm>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
2019-04-02 08:11:49,10004,3600,852,62364,4,42.33596,-71.24962,1,1,...,4,2019,2019-04-02 08:11:49,,,,,,0.00000000,0.000000000
2019-04-02 08:11:55,10004,3600,852,62386,4,42.33567,-71.24901,1,1,...,4,2019,2019-04-02 08:11:55,6,59.62573,9.937622,35.775439,,0.05962573,0.001666667
2019-04-02 08:11:57,10004,3600,852,62393,4,42.33535,-71.24840,1,1,...,4,2019,2019-04-02 08:11:57,2,61.86077,30.930384,111.349383,10.4963811,0.12148650,0.002222222
2019-04-02 08:12:01,10004,3600,852,62406,4,42.33506,-71.24777,1,1,...,4,2019,2019-04-02 08:12:01,4,60.99345,15.248363,54.894108,-3.9205052,0.18247995,0.003333333
2019-04-02 08:12:07,10004,3600,852,62426,4,42.33475,-71.24716,1,1,...,4,2019,2019-04-02 08:12:07,6,61.18063,10.196772,36.708380,-0.8419319,0.24366059,0.005000000
2019-04-02 08:12:13,10004,3600,852,62455,4,42.33411,-71.24592,1,1,...,4,2019,2019-04-02 08:12:13,6,124.22606,20.704343,74.535635,1.7512618,0.36788664,0.006666667
2019-04-02 08:12:15,10004,3600,852,62469,4,42.33382,-71.24534,1,1,...,4,2019,2019-04-02 08:12:15,2,57.18007,28.590033,102.924118,3.9428449,0.42506671,0.007222222
2019-04-02 08:12:19,10004,3600,852,62499,4,42.33348,-71.24470,1,1,...,4,2019,2019-04-02 08:12:19,4,65.17261,16.293153,58.655351,-3.0742199,0.49023932,0.008333333
2019-04-02 08:12:25,10004,3600,852,62530,4,42.33322,-71.24409,1,1,...,4,2019,2019-04-02 08:12:25,6,58.28943,9.714904,34.973656,-1.0963748,0.54852875,0.010000000
2019-04-02 08:12:31,10004,3600,852,62559,4,42.33285,-71.24337,1,1,...,4,2019,2019-04-02 08:12:31,6,72.00499,12.000832,43.202997,0.3809880,0.62053374,0.011666667


In [40]:
# Case 4 with a filter window width of 9
# NOTE(review): df_test_2 is not assigned in any visible cell — confirm where it is created
df_test_2_case4_9 = case_4(df_test_2,9)

1 out of 1018 time series values in clean_trajectory$dist_meters are missing. 
[1] "Line: Green|Trainid: 10004|Vehicleid:   3600|Routeid: 852|Car1: 3600|Car2: 3855|Car3: NA|Date: 4-2"


`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

"Removed 4 rows containing non-finite values (stat_bin)."
"Removed 1 rows containing missing values (geom_text)."
"Removed 4 rows containing missing values (geom_point)."
file saved to ../../figures/case4_t2_9.png



NULL


In [51]:
# Summarise the eight case results (case 1, case 2 and case 4 at widths 5/7/9, case 3) in one table
d_compare_table = compare_table(df_test_2_case1,df_test_2_case2_5,df_test_2_case2_7,df_test_2_case2_9,df_test_2_case3,df_test_2_case4_5,df_test_2_case4_7,df_test_2_case4_9)

# Functions used for trajectory investigation

In [16]:
# Plot one trajectory: time-series facets, histograms and a map, combined
# into a single figure that is also exported as a PNG.
#   df:       trajectory data.table with time, month, day, lon, lat and the
#             derived columns dist_meters, cumdist_km, interval_seconds,
#             cumtime_hrs, speed_kph and accel_mps2
#   filename: path of the exported PNG (defaults to the previously
#             hard-coded location)
# Returns the combined figure invisibly; it is also printed.
# Side effects: writes `filename`, downloads map tiles, and sets the
# repr.plot.width / repr.plot.height options.
trajectory_plot = function(df, filename = "../../figures/case_green.png"){
    # title() here is the notebook's own helper that builds the label string
    p_title = title(df)
    # plot histogram for speed and acceleration
    clean_hist = melt(df, id.vars = c("time","month","day") ,
                                 measure.vars = c("dist_meters","interval_seconds",
                                                  "speed_kph","accel_mps2"))
    # Name new labels for hist plot
    levels(clean_hist$variable) = c("Distance (meters)", "Time interval (s)",
                                     "Speed (km/h)","Acceleration (meters/s^2)")
    p_hist = ggplot(clean_hist,aes(x = value)) +
    geom_histogram(color="black", fill="lightblue") +
    facet_wrap(~variable,ncol = 2 , scales = "free") +
    labs(title = "(b) Histograms") +
    theme(strip.text = element_text(size = rel(2)),
          # title
          axis.title.x = element_blank(),
          axis.title.y = element_blank(),
          title= element_text(size = 20),
          # axis label
          axis.text.x = element_text(size = 15),
          axis.text.y = element_text(size = 15),
          # space between facet plot
          panel.spacing = unit(2, "lines"))
    # plot time series
    clean_trajectory_melt = melt(df, id.vars = c("time","month","day") ,
                                 measure.vars = c("dist_meters","cumdist_km","interval_seconds",
                                                  "cumtime_hrs","speed_kph","accel_mps2"))
    # Name new labels for facet plot
    levels(clean_trajectory_melt$variable) = c("Distance (meters)","Cumulative distance (km)",
                                               "Time interval (s)","Cumulative time (hrs)","Speed (km/h)",
                                               "Acceleration (meters/s^2)")
    # generate the facet plot
    p_main = ggplot(clean_trajectory_melt,aes(x = time,y = value)) +
    geom_point(colour="darkorange") +
    facet_wrap(~variable,ncol = 2 , scales = "free_y") +
    labs(title = "(a) Time series ", x = "Time") +
    theme(strip.text = element_text(size = rel(2)),
          axis.title.x = element_blank(),
          axis.title.y = element_blank(),
          title= element_text(size = 20),
          axis.text.x = element_text(size = 15),
          axis.text.y = element_text(size = 15),
          panel.spacing = unit(2, "lines"))
    # trajectory map
    # get map tiles covering the bounding box of the trajectory
    # NOTE(review): the notebook output shows tiles fetched from
    # tile.stamen.com; confirm that endpoint is still served.
    ma <- get_stamenmap(bbox = c(left = min(df$lon), bottom = min(df$lat),
                                  right = max(df$lon), top = max(df$lat)),
                         zoom = 14)
    # about 5 evenly spaced time breaks for the colour legend
    map_label = pretty(df$time, 5)
    # plot trajectory map; points are coloured by time and speeds above
    # 160 km/h are labelled with their rounded value
    p_map = ggmap(ma, darken = c(0.6, "white")) +
    geom_point(data = df, aes(x = lon, y = lat, color = as.numeric(time), alpha = 0.7), size = 4, shape = 16) +
    geom_text(data = df, aes(label = ifelse(speed_kph > 160,round(speed_kph,0),'')),hjust=0,vjust=0,size = 4) +
    scale_color_gradient(low="red", breaks = as.integer(map_label), labels = format(map_label, "%H:%M") ) +
    labs(title = "(c) Map", x = "Lon", y = "Lat", color = "Time") +
    theme(axis.title.x = element_text(size = 20),
          axis.title.y = element_text(size = 20),
          title= element_text(size = 20),
          axis.text.x = element_text(size = 15,angle = 90),
          axis.text.y = element_text(size = 15),
          legend.text = element_text(size = 15),
         plot.caption = element_text(hjust = 0,margin = unit(c(-15,0,0,0), "mm"))) +
    guides(alpha = "none", size = "none")
    p1 = ggarrange(p_hist, p_map, nrow = 2, ncol = 1,heights = c(15,10),widths = c(20,50))
    # title strip for the combined figure
    tgrob <- text_grob(p_title,size = 30)
    plot_0 <- as_ggplot(tgrob) + theme(plot.margin = margin(0,3,0,0, "cm"))
    options(repr.plot.width = 20, repr.plot.height = 10)
    p_plot = ggarrange(p_main, p1, nrow = 1, ncol = 2, widths = c(20,15))
    p = ggarrange(plot_0,p_plot,nrow = 2, ncol = 1,heights = c(5,20))
    # Export separately: piping into ggexport() made `p` hold ggexport's
    # return value, so print(p) showed NULL instead of the figure.
    ggexport(p, filename = filename, width = 1500, height = 1000)
    print(p)
    return(invisible(p))
}

In [10]:
# compute time interval
#   d: table with a `trxtime` column readable by difftime(), in time order
# Returns d with a numeric `interval_seconds` column: seconds elapsed since
# the previous row, NA for the first row.
compute_time_interval <- function(d) {
    n <- nrow(d)
    # Numeric NA so the column type is the same for any number of rows
    interval_seconds <- rep(NA_real_, n)
    if (n >= 2) {
        # Row i minus row i - 1. Negative indexing replaces `1:n-1`, which
        # parses as (1:n) - 1 and only worked because index 0 is dropped.
        interval_seconds[-1] <- as.numeric(difftime(d$trxtime[-1], d$trxtime[-n], units = "secs"))
    }
    d$interval_seconds <- interval_seconds
    return(d)
}

In [11]:
# compute vehicle distance
#   d: table with `lon` and `lat` columns, in time order
# Returns d with a numeric `dist_meters` column: Haversine distance from the
# previous row's position, NA for the first row.
compute_distance <- function(d) {
    n <- nrow(d)
    # Numeric NA so the column type is the same for any number of rows
    dist_meters <- rep(NA_real_, n)
    if (n >= 2) {
        # Pair each position with the one before it. Negative indexing
        # replaces `1:n-1`, which parses as (1:n) - 1 and only worked because
        # index 0 is dropped.
        previous_pos <- cbind(d$lon[-n], d$lat[-n])
        current_pos <- cbind(d$lon[-1], d$lat[-1])
        dist_meters[-1] <- distHaversine(previous_pos, current_pos)
    }
    d$dist_meters <- dist_meters
    return(d)
}

In [12]:
# compute speed and acceleration
#   d: table with `dist_meters` and `interval_seconds` columns
# Returns d with numeric columns `speed_mps`, `speed_kph` and `accel_mps2`.
# The first row is always NA; acceleration is also NA on the second row
# because there is no earlier speed to difference against.
compute_speed_acceleration <- function(d) {
    n <- nrow(d)
    # Numeric NA so the column types are the same for any number of rows
    speed_mps <- rep(NA_real_, n)
    accel_mps2 <- rep(NA_real_, n)
    if (n >= 2) {
        rows <- 2:n
        speed_mps[rows] <- d$dist_meters[rows] / d$interval_seconds[rows]
        # Change in speed since the previous row over the elapsed time.
        # `rows - 1` replaces `1:n-1`, which parses as (1:n) - 1 and only
        # worked because index 0 is dropped.
        accel_mps2[rows] <- (speed_mps[rows] - speed_mps[rows - 1]) / d$interval_seconds[rows]
    }
    d$speed_mps <- speed_mps
    # Convert speed to kph
    d$speed_kph <- speed_mps * 3.6
    d$accel_mps2 <- accel_mps2
    return(d)
}

In [13]:
# Calculate the cumulative dist and time
#   d: table with `dist_meters` and `interval_seconds` columns
# Returns d with `cumdist_km` (running distance in km) and `cumtime_hrs`
# (running time in hours) added; all other columns are left untouched.
compute_cumulative_time_distance = function(d){
    # cumsum() has no na.rm argument, so missing steps count as 0. Only the
    # two columns being summed are touched; the previous version zero-filled
    # NAs in every column of a full copy of the table.
    zero_if_na = function(x) {
        x[is.na(x)] <- 0
        x
    }
    d$cumdist_km = cumsum(zero_if_na(d$dist_meters)) / 1000
    d$cumtime_hrs = cumsum(zero_if_na(d$interval_seconds)) / 3600
    return(d)
}

In [42]:
# Preview the first rows of the test trajectory
head(df_test)

trxtime,trainid,vehicleid,routeid,counter,lineid,lat,lon,servicetypeid,sourceid,...,car3,speed,heading,rcvtime,lastavi,inserted,day,month,year,time
<chr>,<int>,<chr>,<int>,<int>,<int>,<dbl>,<dbl>,<int>,<int>,...,<dbl>,<dbl>,<dbl>,<chr>,<int>,<chr>,<int>,<dbl>,<dbl>,<dttm>
2019-04-02 08:11:49,10004,3600,852,62364,4,42.33596,-71.24962,1,1,...,,28.11,123.68,2019-04-02 08:11:50,34,2019-04-19 15:25:58,2,4,2019,2019-04-02 08:11:49
2019-04-02 08:11:55,10004,3600,852,62386,4,42.33567,-71.24901,1,1,...,,32.56,123.76,2019-04-02 08:11:55,34,2019-04-19 15:25:58,2,4,2019,2019-04-02 08:11:55
2019-04-02 08:11:57,10004,3600,852,62393,4,42.33535,-71.2484,1,1,...,,33.19,123.92,2019-04-02 08:11:58,34,2019-04-19 15:25:58,2,4,2019,2019-04-02 08:11:57
2019-04-02 08:12:01,10004,3600,852,62406,4,42.33506,-71.24777,1,1,...,,33.5,124.01,2019-04-02 08:12:01,34,2019-04-19 15:25:58,2,4,2019,2019-04-02 08:12:01
2019-04-02 08:12:07,10004,3600,852,62426,4,42.33475,-71.24716,1,1,...,,33.55,123.87,2019-04-02 08:12:07,34,2019-04-19 15:25:58,2,4,2019,2019-04-02 08:12:07
2019-04-02 08:12:13,10004,3600,852,62455,4,42.33411,-71.24592,1,1,...,,33.87,124.25,2019-04-02 08:12:13,34,2019-04-19 15:25:58,2,4,2019,2019-04-02 08:12:13


In [15]:
# Build (and print) the title string for a light-rail trajectory plot.
#   clean_trajectory: trajectory table with trainid, vehicleid, car1, car2,
#                     car3, month and day columns
# Returns the "|"-separated label; the line name is fixed to "Green" and the
# route id is left out.
title = function (clean_trajectory){
    # "<prefix> <distinct values of the column>"
    tag <- function(prefix, values) paste(prefix, unique(values))
    date_part <- paste(unique(clean_trajectory$month), unique(clean_trajectory$day), sep = "-")
    df_title <- paste(
        paste("Line:","Green"),
        tag("Trainid:", clean_trajectory$trainid),
        tag("Vehicleid:", clean_trajectory$vehicleid),
        tag("Car1id:", clean_trajectory$car1),
        tag("Car2id:", clean_trajectory$car2),
        tag("Car3id:", clean_trajectory$car3),
        paste("Date:", date_part),
        sep = "|"
    )
    print(df_title)
    df_title
}

In [19]:
# Build (and print) the title string for a heavy-rail trajectory plot.
#   clean_trajectory: trajectory table with lineid, trainid, vehicleid,
#                     heavyrailbranchid, tripid, month and day columns
# Returns the "|"-separated label. Line ids 1, 2 and 3 are shown as Red,
# Blue and Orange; any other id is shown as its raw value.
title = function (clean_trajectory){
    line_names = c("1" = "Red", "2" = "Blue", "3" = "Orange")
    line_ids = as.character(unique(clean_trajectory$lineid))
    line_label = unname(line_names[line_ids])
    # Fall back to the raw id for unmapped lines; the previous if/else chain
    # left `linetype` undefined in that case and the function errored.
    unmapped = is.na(line_label)
    line_label[unmapped] = line_ids[unmapped]
    linetype = paste("Line:", line_label)
    trainid = paste("Trainid:",unique(clean_trajectory$trainid))
    vehicleid = paste("Vehicleid:",unique(clean_trajectory$vehicleid))
    branchid = paste("Branchid:",unique(clean_trajectory$heavyrailbranchid))
    tripid = paste("Tripid:",unique(clean_trajectory$tripid))
    month = unique(clean_trajectory$month)
    day = unique(clean_trajectory$day)
    date = paste("Date:",paste(month , day , sep = "-" ))
    df_title = paste(linetype,trainid,vehicleid,branchid,tripid,date,sep = "|")
    print(df_title)
    return(df_title)
}

In [235]:
# Round the ten statistic columns (columns 2 to 11) to two decimals
# NOTE(review): comparison_table is not assigned in any visible cell; presumably a compare_table() result — confirm
comparison_table[,2:11] = round(comparison_table[,2:11],2)

In [48]:
# Summarise eight processed trajectories side by side.
# Each argument is a trajectory table with speed_kph, accel_mps2, dist_meters
# and interval_seconds columns. Returns a data.frame with one row per case
# holding the median / max / min / mean of speed and acceleration, the total
# distance in km and the total time in hours (NA values ignored throughout).
compare_table = function(case_1_test,case_2_test_5,case_2_test_7,case_2_test_9,case_3_test,case_4_test_5,case_4_test_7,case_4_test_9){
    runs <- list(case_1_test, case_2_test_5, case_2_test_7, case_2_test_9,
                 case_3_test, case_4_test_5, case_4_test_7, case_4_test_9)
    # Apply one statistic to one column of every run, in argument order
    per_run <- function(stat, column) {
        vapply(runs, function(run) stat(run[[column]], na.rm = TRUE), numeric(1), USE.NAMES = FALSE)
    }
    comparison_table <- data.frame(
        case = c("case 1","case 2_5","case 2_7","case 2_9","case 3","case 4_5","case 4_7","case 4_9"),
        speed_median = per_run(median, "speed_kph"),
        speed_max = per_run(max, "speed_kph"),
        speed_min = per_run(min, "speed_kph"),
        speed_mean = per_run(mean, "speed_kph"),
        acceleration_median = per_run(median, "accel_mps2"),
        acceleration_max = per_run(max, "accel_mps2"),
        acceleration_min = per_run(min, "accel_mps2"),
        acceleration_mean = per_run(mean, "accel_mps2"),
        total_distance_km = per_run(sum, "dist_meters") / 1000,
        total_time_hrs = per_run(sum, "interval_seconds") / 3600
    )
    comparison_table
}

In [353]:
# Run distance_filter on clean_trajectory and keep the result as case_3_test
case_3_test <- distance_filter(clean_trajectory)

1 out of 201 time series values in df$dist_meters are missing. 


In [361]:
# Show the comparison table (auto-printed as the cell result)
comparison_table

case,speed_median,speed_max,speed_min,speed_mean,acceleration_median,acceleration_max,acceleration_min,acceleration_mean,total_distance,total_time
<fct>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
case 1,37.56048,235.8641,0.0,58.50301,-0.05806465,59.974425,-10.62974,4.1240532,15542.19,1795
case 2,37.18789,114.83827,0.0,43.94307,-0.04400116,4.654204,-3.580161,0.0842309,15478.92,1789
case 3,37.64274,265.06719,4.395646,56.51924,0.0,70.260843,-10.385708,4.685201,14038.47,1795
case 4,41.48645,82.40222,4.395646,41.58672,0.0,4.879983,-1.907459,0.1193497,14593.23,1789


## Data preprocessing

In [6]:
# Data preprocessing
# Clean raw vehicle-location records before trajectory analysis.
#
# Args:
#   df: table of location records with columns trxtime, trainid, lat and lon.
# Returns:
#   df with day/month/year columns added, rows with a missing or zero
#   coordinate removed, sorted by trainid then trxtime, and reduced to one
#   row per (trxtime, trainid) pair.
# Side effect:
#   prints the share of rows in which lon or lat equals 0.
preprocess_data = function(df){
    # Add time column to do analysis by random time scale
    df$day = day(df$trxtime)
    df$month = month(df$trxtime)
    df$year = year(df$trxtime)
    # Share of rows with a zero coordinate; comparisons that evaluate to NA
    # are not counted, so missing coordinates cannot inflate the rate
    df_zero_rate = sum(df$lon == 0 | df$lat == 0, na.rm = TRUE)/nrow(df)
    print(df_zero_rate)
    # Keep a row only when BOTH coordinates are present and non-zero.
    # The is.na() terms come first so the mask never contains NA
    # (FALSE & NA is FALSE), which keeps the subsetting free of all-NA rows.
    valid_position = !is.na(df$lon) & !is.na(df$lat) & df$lon != 0 & df$lat != 0
    df = df[valid_position,]
    df = df[order(df$trainid, df$trxtime),]
    df = distinct(df, trxtime, trainid, .keep_all = TRUE) # Remove the duplicated time record
    return(df)
}

In [7]:
# Replace df with its preprocessed version
df <- preprocess_data(df)

[1] 0.0008356593


## Code that may or may not be used (this part will be deleted once this script is finished)

In [None]:
# # aggregate_trajectory_table
# aggregate_line_trajectories = function(year, month){
#     assign("dg", fread(paste(DISTANCE_FILEPATH, paste(paste(c("lightrail","trajectories",month, year),collapse = "-"),".csv",sep = ""), sep = "")))
#     assign("dh", fread(paste(DISTANCE_FILEPATH, paste(paste(c("heavyrail","trajectories",month, year),collapse = "-"),".csv",sep = ""), sep = "")))
#     # Combine the original tables to a single one for analysis
#     dg = subset(dg, select = c(trxtime, trainid, lineid, lat, lon))
#     dh = subset(dh, select = c(trxtime, trainid, lineid, lat, lon))
#     df = rbind(dg, dh)
#     return(df)
# }

In [None]:
# aggregate_trajectory_table
# Read the light rail and heavy rail trajectory files for one month and
# stack them into a single table.
#   year, month: values pasted into the two file names
# Returns the row-bound table restricted to trxtime, trainid, lineid, lat, lon.
aggregate_line_trajectories <- function(year, month){
    light_file <- paste("lightrail", "trajectories", month, year, ".csv", sep = "-", collapse = "")
    heavy_file <- paste("heavyrail", "trajectories", month, year, ".csv", sep = "-", collapse = "")
    light <- fread(paste(DISTANCE_FILEPATH, light_file, sep=""))
    heavy <- fread(paste(DISTANCE_FILEPATH, heavy_file, sep=""))
    # Keep the same five columns from both tables so they can be row-bound
    light <- subset(light, select = c(trxtime, trainid, lineid, lat, lon))
    heavy <- subset(heavy, select = c(trxtime, trainid, lineid, lat, lon))
    rbind(light, heavy)
}

In [None]:
# Fixed file path
# Read the heavy rail trajectory file from DISTANCE_FILEPATH into df.
# The folder prefix is pasted exactly once in front of the file name.
# NOTE(review): `month` and `year` must be defined as values (e.g. "04", "19")
# before this cell runs; otherwise they resolve to the lubridate functions of
# the same name and paste() fails - confirm where they are meant to come from.
df <- fread(paste(DISTANCE_FILEPATH, paste("heavyrail", "trajectories", month, year, ".csv", sep = "-", collapse = ""), sep=""))

In [None]:
# Derive calendar columns from the record timestamp.
#   df: table with a trxtime column
# Returns df with day, month and year columns computed from trxtime.
add_dd_mm_yy_cols <- function(df) {
    stamps <- df$trxtime
    df$day <- day(stamps)
    df$month <- month(stamps)
    df$year <- year(stamps)
    df
}
# Read one month of heavy rail trajectories and add calendar columns.
#   year, month: values pasted into the file name
# Returns the table read from DISTANCE_FILEPATH after add_dd_mm_yy_cols()
# has been applied to it.
get_heavy_rail_trajectories = function(year, month){
    # The folder prefix is pasted exactly once in front of the file name
    df = fread(paste(DISTANCE_FILEPATH, paste("heavyrail", "trajectories", month, year, ".csv", sep = "-", collapse = ""), sep=""))
    df = add_dd_mm_yy_cols(df)
    return(df)
}

# Read one month of light rail trajectories and add calendar columns.
#   year, month: values pasted into the file name
# Returns the table read from DISTANCE_FILEPATH after add_dd_mm_yy_cols()
# has been applied to it.
get_light_rail_trajectories <- function(year, month){
    file_name <- paste("lightrail", "trajectories", month, year, ".csv", sep = "-", collapse = "")
    light_rail <- fread(paste(DISTANCE_FILEPATH, file_name, sep=""))
    add_dd_mm_yy_cols(light_rail)
}