# Data preprocessing and model preparation

In [5]:
library(data.table) #fread
library(dplyr)
library(geosphere) # Calculate vehicle distance
library(lubridate)# Date column extraction

"package 'data.table' was built under R version 3.6.3"

Attaching package: 'dplyr'


The following objects are masked from 'package:data.table':

    between, first, last


The following objects are masked from 'package:stats':

    filter, lag


The following objects are masked from 'package:base':

    intersect, setdiff, setequal, union


"package 'geosphere' was built under R version 3.6.3"

Attaching package: 'lubridate'


The following objects are masked from 'package:data.table':

    hour, isoweek, mday, minute, month, quarter, second, wday, week,
    yday, year


The following object is masked from 'package:base':

    date




In [6]:
# adjust the R limit
memory.limit(size=249200)

### Input location data and data preprocessing

In [7]:
# Read data from folder
# Suggestion: Just do one year at one time. One year's table has 94363502 rows(green)
yearlist = c('19')
monthlist = c("01", "02", "03", "04", "05", "06", "07", "08", "09", "10", "11", "12") #FOR FULL TABLE
distance_filepath = "../../data/raw/vehicle-location/"
# aggregrate_trajectory_table
trajectory_table_combination = function(yearlist,monthlist){
    for (year in yearlist) {
        for (month in monthlist) {
        assign(paste("dg", year, month, sep=""), fread(paste(distance_filepath, paste("lightrail", "trajectories", month, year, ".csv", sep = "-", collapse = ""), sep="")))
        assign(paste("dh", year, month, sep=""), fread(paste(distance_filepath, paste("heavyrail", "trajectories", month, year, ".csv", sep = "-", collapse = ""), sep="")))
        }
    }
   # Combine the original tables to a single one for analysis
   dg = rbind(dg1901,dg1902,dg1903,dg1904,dg1905,dg1906,dg1907,dg1908,dg1909,dg1910,dg1911,dg1912)
   dh = rbind(dh1901,dh1902,dh1903,dh1904,dh1905,dh1906,dh1907,dh1908,dh1909,dh1910,dh1911,dh1912)
   dg = subset(dg,select = c(trxtime,trainid,lineid,lat,lon))
   dh = subset(dh,select = c(trxtime,trainid,lineid,lat,lon))
   df = rbind(dg,dh)
   return(df)
}

In [8]:
df_trajectory_table = trajectory_table_combination(yearlist,monthlist)

In [9]:
head(df_trajectory_table)

trxtime,trainid,lineid,lat,lon
<chr>,<int>,<int>,<dbl>,<dbl>
2019-01-01 23:59:59,10415,4,42.31891,-71.21649
2019-01-01 23:59:42,10018,4,42.35164,-71.07172
2019-01-01 23:59:26,10262,4,42.34763,-71.10144
2019-01-01 23:59:23,10172,4,42.35927,-71.05945
2019-01-01 23:59:47,10444,4,42.35301,-71.06454
2019-01-01 23:59:16,10075,4,42.35986,-71.05891


### Experimental section

In [10]:
d_test = df_trajectory_table[1:1000,]

In [46]:
# Data preprocessing
preprocess_data = function(df){
    # Add time column to do analysis by random time scale
     df$day = day(df$trxtime)
     df$month = month(df$trxtime)
     df$year = year(df$trxtime)
    # check the na values ratio
     df_na = df[(is.na(df$lon)) | (is.na(df$lat)),]
     df_na_rate = nrow(df_na)/nrow(df)
    # remove the rows with values of lat and lon are 0
     df = df[!(is.na(df$lon)) | !(is.na(df$lat)),] 
     df = df[order(df$trainid, df$trxtime),]
     df = distinct(df,trxtime, trainid, .keep_all = TRUE) # Remove the duplicated time record
    return(df)
}

In [47]:
preprocess_data(d_test)

ERROR: Error in return(df, df_na_rate): multi-argument returns are not permitted


In [None]:
# All speed are mph
# Green line
## Branch main 8.6
## Branch B 7.1
## Branch C 7.9
## Branch D 18.7
## Branch E 7.1
# Red line 
## A 16.7
## B 20.1
# Orange line 18.3
# Blue line 18.6
# Ref: https://www.fixmbta.com/green-line-upgrades

In [31]:
head(d_test)

trxtime,trainid,lineid,lat,lon
<chr>,<int>,<int>,<dbl>,<dbl>
2019-01-01 23:59:59,10415,4,42.31891,-71.21649
2019-01-01 23:59:42,10018,4,42.35164,-71.07172
2019-01-01 23:59:26,10262,4,42.34763,-71.10144
2019-01-01 23:59:23,10172,4,42.35927,-71.05945
2019-01-01 23:59:47,10444,4,42.35301,-71.06454
2019-01-01 23:59:16,10075,4,42.35986,-71.05891


### Vehicle distance,speed, acceleration calculation and correction

In [126]:
# Correct table computation
correct_compute_variables <- function(d) {
    d$dist_meters = NA
    d$interval_seconds = NA
    d$speed_mps = NA
    d$speed_kph = NA
    d$accel_mps2 = NA
    n <- nrow(d)
    if (n >= 2) {
        # Compute interval distance using Haversine function
        d$dist_meters[2:n] = distHaversine(cbind(d$lon[1:n-1],d$lat[1:n-1]),cbind(d$lon[2:n],d$lat[2:n]))
        # Compute time interval
        d$interval_seconds[2:n] = as.numeric(difftime(d$trxtime[2:n], d$trxtime[1:n-1], units = "secs"))
        # Compute speed in meters per second
        d$speed_mps[2:n] = d$dist_meters[2:n] / d$interval_seconds[2:n]
        d$speed_kph[2:n] = d$speed_mps[2:n]*3.6
        d$accel_mps2[2:n] = (d$speed_mps[2:n] - d$speed_mps[1:n-1])/d$interval_seconds[2:n]
         # revise the incorrect values 
        for(k in 2:n) { 
            if (!(is.na(d$speed_kph[k]))) {
                if (d$speed_kph[k] > 128 ){
                # the reference link is here:
                # https://www.masslive.com/worcester/2019/09/electrified-faster-and-more-trains-7-ways-the-mbtas-commuter-rail-service-could-be-a-lot-better.html
                     d$speed_kph[k] = d$speed_kph[k-1]
                     d$speed_mps[k] = d$speed_kph[k] / 3.6
                     d$accel_mps2[k] = (d$speed_mps[k]-d$speed_mps[k-1]) / d$interval_seconds[k]
                     d$dist_meters[k] = d$speed_mps[k] * d$interval_seconds[k]  
                 }
             }
         } 
     }
    return(d
}

In [125]:
# Process calculation for four lines
results.df = data.frame() # empty dataframe
compute_trajectory = function(data){
 for(i in unique(data$day)) { 
    if (i <= 31) { # define when the loop will end
        data.day <- data[data$day == i, ]
        for (j in unique(data.day$trainid)) {# Put each train in one loop in a subset
            data.day.train = data.day[data.day$trainid == j, ]        
            results <- compute_variables(data.day.train) # Process data with only one row separately        
            results.df <- rbind(results.df, results)
         # put all the results in one table
          }
       }
    }
    return(results.df)
 # Save the results for one year
# write.csv(x=results.df,file.path("../../data/tidy/", paste("rail_distances",yearlist,monthlist,".csv",sep = "-",collapse = "")))
}

### Summarize the excessive value

In [138]:
# For making sure the data structure is same, please generate the df_trajectory_table firstly
d_excessive_speed = d_test[0,]
d_excessive_speed$dist_meters = NA
d_excessive_speed$interval_seconds = NA
d_excessive_speed$speed_mps = NA
d_excessive_speed$speed_kph = NA
d_excessive_speed$accel_mps2 = NA
# Function to compute distances (D), speeds (S) and acceleration (A) in meters, meters per second, km per hour and m s^-2
excessive_speed <- function(d) {
    d$dist_meters = NA
    d$interval_seconds = NA
    d$speed_mps = NA
    d$speed_kph = NA
    d$accel_mps2 = NA
    n <- nrow(d)
    if (n >= 2) {
        # Compute interval distance using Haversine function
        d$dist_meters[2:n] = distHaversine(cbind(d$lon[1:n-1],d$lat[1:n-1]),cbind(d$lon[2:n],d$lat[2:n]))
        # Compute time interval
        d$interval_seconds[2:n] = as.numeric(difftime(d$trxtime[2:n], d$trxtime[1:n-1], units = "secs"))
        # Compute speed in meters per second
        d$speed_mps[2:n] = d$dist_meters[2:n] / d$interval_seconds[2:n]
        d$speed_kph[2:n] = d$speed_mps[2:n]*3.6
        d$accel_mps2[2:n] = (d$speed_mps[2:n] - d$speed_mps[1:n-1])/d$interval_seconds[2:n]
         # revise the incorrect values 
        for(k in 2:n) { 
            if (!(is.na(d$speed_kph[k]))) {
                if (d$speed_kph[k] > 128 ){
                    # put all excessive values to one single table
                    d_excessive_speed = rbind(d_excessive_speed,as.vector(d[k]))
                # the reference link is here:
                # https://www.masslive.com/worcester/2019/09/electrified-faster-and-more-trains-7-ways-the-mbtas-commuter-rail-service-could-be-a-lot-better.html
                     d$speed_kph[k] = d$speed_kph[k-1]
                     d$speed_mps[k] = d$speed_kph[k] / 3.6
                     d$accel_mps2[k] = (d$speed_mps[k]-d$speed_mps[k-1]) / d$interval_seconds[k]
                     d$dist_meters[k] = d$speed_mps[k] * d$interval_seconds[k]  
                 }
             }
         } 
     }
    return(list(d,d_excessive_speed))
}

In [139]:
# Process calculation for four lines
results.df = data.frame() # empty dataframe
excessive_speed_table = function(data){
 for(i in unique(data$day)) { 
    if (i <= 31) { # define when the loop will end
        data.day <- data[data$day == i, ]
        for (j in unique(data.day$trainid)) {# Put each train in one loop in a subset
            data.day.train = data.day[data.day$trainid == j, ]       
            results <- excessive_speed(data.day.train)
            # Process data with only one row separately        
            results.df <- rbind(results.df, results)
         # put all the results in one table
          }
       }
    }
    return(results.df)
 # Save the results for one year
# write.csv(x=results.df,file.path("../../data/tidy/", paste("rail_distances",yearlist,monthlist,".csv",sep = "-",collapse = "")))
}

In [131]:
d_test

trxtime,trainid,lineid,lat,lon,day,month,year
<chr>,<int>,<int>,<dbl>,<dbl>,<int>,<dbl>,<dbl>
2019-01-01 03:00:21,10012,4,42.33728,-71.25358,1,1,2019
2019-01-01 03:02:09,10012,4,42.33738,-71.25332,1,1,2019
2019-01-01 23:59:42,10018,4,42.35164,-71.07172,1,1,2019
2019-01-01 03:00:20,10045,4,42.34002,-71.16640,1,1,2019
2019-01-01 03:00:37,10045,4,42.34027,-71.16688,1,1,2019
2019-01-01 03:01:33,10049,4,42.35164,-71.07172,1,1,2019
2019-01-01 03:02:20,10049,4,42.35018,-71.07710,1,1,2019
2019-01-01 03:03:46,10049,4,42.34996,-71.07792,1,1,2019
2019-01-01 03:01:27,10070,4,42.34791,-71.08631,1,1,2019
2019-01-01 03:02:07,10070,4,42.34809,-71.08845,1,1,2019


In [132]:
# Generate the summary table
d_correct = compute_trajectory(d_test)
d_excessive_speed_table = excessive_speed_table(d_test)

In [135]:
d_correct = na.omit(d_correct)

In [136]:
# Generate the summary table
summary_computation = data.frame(d_excessive_speed_rate = nrow(d_excessive_speed_table)/nrow(d_correct),excessive_speed_mean = mean(d_excessive_speed_table$speed_kph),excessive_speed_max = max(d_excessive_speed_table$speed_kph),excessive_speed_min = min(d_excessive_speed_table$speed_kph),max_correct = max(d_correct$speed_kph),
                         min_correct = min(d_correct$speed_kph),mean_correct = mean(d_correct$speed_kph))

In [137]:
summary_computation

d_excessive_speed_rate,excessive_speed_mean,excessive_speed_max,excessive_speed_min,max_correct,min_correct,mean_correct
<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
0.003759398,212.3691,212.3691,212.3691,121.8748,0,27.97609


### Experimental section

In [None]:
#  # Count the row numbers which lat and lon are NA
#      df_na = df[(is.na(data$lon)) | (is.na(data$lat)),]
#      number_na = nrow(df_na)

In [None]:
# # Process value correction
#         for(k in 2:n) { 
#                 # revise the incorrect values 
#                 if (!(is.na(d$speed_kph[k]))) {
#                     if (d$speed_kph[k] > 128 ){
#                        # the reference link is here:
#                        # https://www.masslive.com/worcester/2019/09/electrified-faster-and-more-trains-7-ways-the-mbtas-commuter-rail-service-could-be-a-lot-better.html
#                          if(k==3){d$speed_kph[k] = d$speed_kph[k+1]}
#                          else{
#                         d$speed_kph[k] = (d$speed_kph[k-1] + d$speed_kph[k+1] )/2
#                         d$speed_mps[k] = d$speed_kph[k] / 3.6
#                         d$accel_mps2[k] = (d$speed_mps[k]-d$speed_mps[k-1]) / d$interval_seconds[k]
#                         d$dist_meters[k] = d$speed_mps[k] * d$interval_seconds[k]}
#                         else if (k == nrows(d)){d$speed_kph[k] = d$speed_kph[k-1]}
#                     }
#                 }
#     }        