# Data preprocessing and model preparation

In [1]:
library(data.table) #fread
library(dplyr)
library(geosphere) # Calculate vehicle distance
library(lubridate)# Date column extraction

"package 'data.table' was built under R version 3.6.3"

Attaching package: 'dplyr'


The following objects are masked from 'package:data.table':

    between, first, last


The following objects are masked from 'package:stats':

    filter, lag


The following objects are masked from 'package:base':

    intersect, setdiff, setequal, union


"package 'geosphere' was built under R version 3.6.3"


In [2]:
# Raise R's memory ceiling for the large trajectory tables.
# memory.limit() only has an effect on Windows builds of R older than 4.2.0;
# on other platforms, and on R >= 4.2.0, it is a stub that just emits a
# warning, so the call is skipped there.
if (.Platform$OS.type == "windows" && getRversion() < "4.2.0") {
    memory.limit(size = 249200)
}

### Input location data and data preprocessing

In [3]:
# Read data from folder
# Suggestion: process one year at a time. One year's table has 94363502 rows(green)
yearlist <- c('19')
monthlist <- c("01", "02", "03", "04", "05", "06", "07", "08", "09", "10", "11", "12") #FOR FULL TABLE
distance_filepath <- "../../data/raw/vehicle-location/"

# Read the monthly light-rail and heavy-rail trajectory CSVs and stack them
# into a single table.
#
# Arguments:
#   yearlist       character vector of two-digit years, e.g. c('19')
#   monthlist      character vector of two-digit months, e.g. c("01", "02")
#   lightrail_path folder (with trailing slash) holding the light-rail files;
#                  defaults to `distance_filepath`
#   heavyrail_path folder (with trailing slash) holding the heavy-rail files;
#                  defaults to `distance_filepath`
#
# Returns a data.table with the columns trxtime, trainid, lineid, lat, lon:
# all light-rail rows first (year by year, month by month), then all
# heavy-rail rows in the same order.
#
# The previous version read its folders from two variables that are not
# defined anywhere in this notebook and combined a hard-coded list of 2019
# tables, so it ignored its own arguments. The folders are now parameters and
# the tables are collected for exactly the requested years and months.
trajectory_table_combination <- function(yearlist, monthlist,
                                         lightrail_path = distance_filepath,
                                         heavyrail_path = distance_filepath) {
    stopifnot(length(yearlist) > 0, length(monthlist) > 0)

    keep_cols <- c("trxtime", "trainid", "lineid", "lat", "lon")

    # Month varies fastest, matching the original year-outer / month-inner loop.
    periods <- expand.grid(month = monthlist, year = yearlist,
                           stringsAsFactors = FALSE)

    # Read one monthly file and keep only the columns used downstream, so the
    # unused columns never have to be held in memory for the whole year.
    read_one <- function(folder, prefix, month, year) {
        # NOTE(review): this pattern yields names such as
        # "lightrail-trajectories-01-19-.csv" (with a "-" before ".csv");
        # it is kept exactly as before -- confirm against the files on disk.
        file_name <- paste(prefix, "trajectories", month, year, ".csv", sep = "-", collapse = "")
        fread(paste(folder, file_name, sep = ""))[, keep_cols, with = FALSE]
    }

    # Read every requested month for one rail type and stack the tables.
    read_all <- function(folder, prefix) {
        tables <- lapply(seq_len(nrow(periods)), function(i) {
            read_one(folder, prefix, periods$month[i], periods$year[i])
        })
        rbindlist(tables)
    }

    dg <- read_all(lightrail_path, "lightrail")
    dh <- read_all(heavyrail_path, "heavyrail")
    rbind(dg, dh)
}

In [19]:
# Data preprocessing
# Clean a raw trajectory table before distance/speed calculation.
#
# Arguments:
#   df  table with at least the columns trxtime, trainid, lat and lon
#
# Returns `df` with
#   * day, month and year columns derived from trxtime,
#   * rows with a missing lat or lon removed,
#   * rows sorted by trainid, then trxtime,
#   * only the first row kept for each (trxtime, trainid) pair.
# The number of rows dropped for missing coordinates is reported with
# message().
data_pre <- function(df) {
    # Add calendar columns so later steps can group on any time scale
    df$day <- day(df$trxtime)
    df$month <- month(df$trxtime)
    df$year <- year(df$trxtime)

    # A position is unusable as soon as either coordinate is missing
    missing_position <- is.na(df$lon) | is.na(df$lat)
    number_na <- sum(missing_position)
    message("data_pre: dropping ", number_na, " row(s) with missing lat/lon")
    df <- df[!missing_position, ]

    # Sort each train's records chronologically, then drop repeated timestamps
    df <- df[order(df$trainid, df$trxtime), ]
    df <- distinct(df, trxtime, trainid, .keep_all = TRUE)
    df
}

In [None]:
# Build the combined trajectory table if no earlier cell has created it,
# then clean it.
if (!exists("df_trajectory_table")) {
    df_trajectory_table <- trajectory_table_combination(yearlist, monthlist)
}
df_trajectory_table <- data_pre(df_trajectory_table)

In [None]:
# All speeds are in mph
# Green line
## Branch main 8.6
## Branch B 7.1
## Branch C 7.9
## Branch D 18.7
## Branch E 7.1
# Red line 
## A 16.7
## B 20.1
# Orange line 18.3
# Blue line 18.6
# Ref: https://www.fixmbta.com/green-line-upgrades

### Vehicle distance, speed, and acceleration calculation

In [None]:
# Compute distance (D), speed (S) and acceleration (A) between consecutive
# records of one vehicle.
#
# Arguments:
#   d                table for a single train, already sorted by time, with
#                    the columns lat, lon, a timestamp column (`trxtime`, or
#                    `time` if `trxtime` is absent) and optionally lineid
#   speed_threshold  speed (compared against speed_kph) above which a value is
#                    treated as an outlier. When NULL the original defaults
#                    are used: 18.7 if the first lineid is 4, otherwise 20.1.
#
# Returns `d` with five added numeric columns:
#   dist_meters       Haversine distance from the previous record (m)
#   interval_seconds  time since the previous record (s)
#   speed_mps         dist_meters / interval_seconds (m/s)
#   speed_kph         speed_mps * 3.6 (km/h)
#   accel_mps2        change in speed_mps since the previous record divided
#                     by interval_seconds (m/s^2)
# The first row has no previous record, so all five values are NA there;
# tables with fewer than two rows get NA-only columns.
#
# Outlier correction: rows are visited in order, and a speed above the
# threshold is replaced by the mean of its available neighbours (the previous
# row's already-corrected speed and the next row's raw speed). The first
# computed row therefore takes the next value and the last row takes the
# previous one; if no neighbour is available the speed becomes NA. Distance
# is recomputed from the corrected speed, and acceleration is derived once
# from the final speeds so it never refers to a replaced value.
#
# NOTE(review): the default thresholds 18.7 / 20.1 equal the mph figures
# listed in the speed reference comments of this notebook but are compared
# against km/h here -- confirm the intended unit.
computeDSA <- function(d, speed_threshold = NULL) {
    n <- nrow(d)
    dist_meters <- rep(NA_real_, n)
    interval_seconds <- rep(NA_real_, n)
    speed_mps <- rep(NA_real_, n)
    speed_kph <- rep(NA_real_, n)
    accel_mps2 <- rep(NA_real_, n)

    if (n >= 2) {
        prev_idx <- seq_len(n - 1)
        curr_idx <- prev_idx + 1L

        # Upstream steps name the timestamp column `trxtime`; accept `time`
        # as a fallback for tables that were renamed elsewhere.
        time_col <- if ("trxtime" %in% names(d)) "trxtime" else "time"
        timestamps <- d[[time_col]]
        if (is.null(timestamps)) {
            stop("computeDSA: `d` needs a `trxtime` (or `time`) column", call. = FALSE)
        }

        # Compute interval distance using Haversine function
        dist_meters[curr_idx] <- distHaversine(
            cbind(d$lon[prev_idx], d$lat[prev_idx]),
            cbind(d$lon[curr_idx], d$lat[curr_idx])
        )
        # Compute time interval
        interval_seconds[curr_idx] <- as.numeric(
            difftime(timestamps[curr_idx], timestamps[prev_idx], units = "secs")
        )
        # Compute speed in meters per second and km per hour
        speed_mps[curr_idx] <- dist_meters[curr_idx] / interval_seconds[curr_idx]
        speed_kph <- speed_mps * 3.6

        # Define the speed threshold (the table is expected to hold one line,
        # so the first lineid decides)
        if (is.null(speed_threshold)) {
            speed_threshold <- if (isTRUE(d$lineid[1] == 4)) 18.7 else 20.1
        }

        # Revise the incorrect values
        for (k in curr_idx) {
            if (!is.na(speed_kph[k]) && speed_kph[k] > speed_threshold) {
                neighbours <- c(speed_kph[k - 1], if (k < n) speed_kph[k + 1])
                neighbours <- neighbours[!is.na(neighbours)]
                speed_kph[k] <- if (length(neighbours) > 0) mean(neighbours) else NA_real_
                speed_mps[k] <- speed_kph[k] / 3.6
                dist_meters[k] <- speed_mps[k] * interval_seconds[k]
            }
        }

        accel_mps2[curr_idx] <- (speed_mps[curr_idx] - speed_mps[prev_idx]) /
            interval_seconds[curr_idx]
    }

    d$dist_meters <- dist_meters
    d$interval_seconds <- interval_seconds
    d$speed_mps <- speed_mps
    d$speed_kph <- speed_kph
    d$accel_mps2 <- accel_mps2
    d
}

In [None]:
# Process calculation for four lines
#
# Run computeDSA() separately for every train on every calendar day and stack
# the results.
#
# Arguments:
#   data  trajectory table with a trainid column and a day-of-month column
#         named `Day` or `day`; when `year` and `month` columns are present
#         they are used as well, so the same day number in different months
#         is never merged into one group
#
# Returns one table holding the computeDSA() output of every (day, train)
# group, in order of first appearance; an empty data.frame if there are no
# groups.
#
# The previous version appended to a local copy of a global data frame and
# returned nothing, so its results were lost; the combined table is now the
# return value.
Month_calculation <- function(data) {
    day_col <- if ("Day" %in% names(data)) "Day" else "day"
    if (!(day_col %in% names(data))) {
        stop("Month_calculation: `data` needs a `Day` or `day` column", call. = FALSE)
    }

    day_values <- data[[day_col]]
    group_key <- day_values
    if (all(c("year", "month") %in% names(data))) {
        group_key <- paste(data[["year"]], data[["month"]], day_values, sep = "-")
        group_key[is.na(day_values)] <- NA  # rows without a day belong to no group
    }

    # Row indices per calendar day, in order of first appearance
    rows_by_day <- split(seq_len(nrow(data)),
                         factor(group_key, levels = unique(group_key)))

    # Collect the pieces in a list and bind once, instead of rbind-ing inside
    # the loop (which copies the growing table on every iteration)
    pieces <- list()
    for (day_rows in rows_by_day) {
        train_ids <- data$trainid[day_rows]
        rows_by_train <- split(day_rows,
                               factor(train_ids, levels = unique(train_ids)))
        for (train_rows in rows_by_train) {
            pieces[[length(pieces) + 1L]] <- computeDSA(data[train_rows, ])
        }
    }

    if (length(pieces) == 0) {
        return(data.frame())
    }
    do.call(rbind, pieces)
}

# Compute and save the results for one year
results.df <- Month_calculation(df_trajectory_table)
write.csv(x = results.df, file.path("../../data/tidy/", paste("rail_distances_", yearlist, ".csv", sep = "", collapse = "")))

In [None]:
# old methods
#  for(k in 2:n) { 
#                 # revise the incorrect values 
#                 if (!(is.na(d$speed_kph[k]))) {
#                     if(d$line == 1)
#                     if (d$speed_kph[k] > #speed_threshold ) {
#                         d$speed_kph[k] = d$speed_kph[k-1]
#                         d$speed_mps[k] = d$speed_kph[k] / 3.6
#                         d$accel_mps2[k] = (d$speed_mps[k]-d$speed_mps[k-1]) / d$interval_seconds[k]
#                         d$dist_meters[k] = d$speed_mps[k] * d$interval_seconds[k] 
#                     }