# Data preprocessing and model preparation

In [1]:
library(data.table) #fread
library(lubridate)
library(dplyr)
library(geosphere) # Calculate vehicle distance
library(reshape2)
library(scales)
library(stringr)
library(ggplot2)
library(tidyverse)

"package 'data.table' was built under R version 3.6.3"

Attaching package: 'lubridate'


The following objects are masked from 'package:data.table':

    hour, isoweek, mday, minute, month, quarter, second, wday, week,
    yday, year


The following object is masked from 'package:base':

    date



Attaching package: 'dplyr'


The following objects are masked from 'package:lubridate':

    intersect, setdiff, union


The following objects are masked from 'package:data.table':

    between, first, last


The following objects are masked from 'package:stats':

    filter, lag


The following objects are masked from 'package:base':

    intersect, setdiff, setequal, union


"package 'geosphere' was built under R version 3.6.3"


In [None]:
# Adjust the R memory limit.
# memory.limit() only has an effect on Windows and was made defunct in R 4.2.0,
# so the call is skipped everywhere else instead of raising a warning.
if (.Platform$OS.type == "windows" && getRversion() < "4.2.0") {
  memory.limit(size = 249200)
}

### Load the location data and run the distance/speed calculation

In [None]:
# Choose the time and line which will be investigated.
# month and year are zero-padded strings because they are pasted into file
# names below; the numeric literal 03 is just 3 and would produce "203"
# instead of "2003".
month <- "03"
year <- "20"
rail <- "heavy"

In [2]:
# Load the raw light-rail trajectory export into `df`.
df <- read.csv(file = "../../data/raw/vehicle-location/lightrail-trajectories-09-20-.csv")

In [3]:
# Show the first rows of the raw table to check the columns that were read
head(df)

Unnamed: 0_level_0,counter,trxtime,lineid,trainid,lat,lon,servicetypeid,routeid,vehicleid,sourceid,car1,car2,car3,speed,heading,rcvtime,lastavi,inserted
Unnamed: 0_level_1,<int>,<fct>,<int>,<int>,<dbl>,<dbl>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<dbl>,<dbl>,<fct>,<int>,<fct>
1,28,2020-09-01 23:59:52,4,10042,42.35986,-71.05891,1,881,3845,0,3845,3607.0,,,,2020-09-01 00:00:26,76,2020-09-01 11:00:20
2,344,2020-09-01 23:57:50,4,10156,42.34885,-71.09469,1,831,3817,0,3817,,,,,2020-09-01 00:03:30,16,2020-09-01 11:00:20
3,5853,2020-09-01 03:02:56,4,10196,42.34027,-71.16688,1,813,3622,1,3622,3816.0,,0.0,338.29,2020-09-01 03:02:56,13,2020-09-02 11:00:19
4,5854,2020-09-01 03:03:49,4,10196,42.341,-71.16742,1,813,3622,1,3622,3816.0,,0.0,350.83,2020-09-01 03:03:49,13,2020-09-02 11:00:19
5,5855,2020-09-01 03:25:23,4,10217,42.33689,-71.25384,1,852,3914,1,3914,3915.0,,0.28,329.94,2020-09-01 03:25:23,35,2020-09-02 11:00:19
6,5856,2020-09-01 03:58:20,4,10099,42.33692,-71.2515,1,0,3834,1,3834,,,13.89,124.1,2020-09-01 03:58:20,0,2020-09-02 11:00:19


In [9]:
# Pull the reported speed column out of the raw table
df_speed <- df[["speed"]]

In [12]:
# Drop the missing speed readings
df_speed <- stats::na.omit(object = df_speed)

In [15]:
# Largest reported speed among the non-missing readings
max(df_speed)

In [None]:
# Read the raw location file for the selected rail type and period. The file
# name is "ttr.<rail>location<year><month>" with no extension. Fields equal to
# "0" are read as NA (na.strings).
df <- read.csv(
  paste0("../../data/raw/vehicle-location/", "ttr.", rail, "location", year, month),
  header = TRUE,
  na.strings = "0"
)
df$Day <- day(df$trxtime)

# Split the heavy rail table by lineid (1 -> dr, 2 -> db, 3 -> do); any other
# rail type is kept as a single table dg.
# The `else` has to sit on the same line as the closing brace: at top level R
# finishes the `if` statement at the end of the line otherwise.
if (rail == "heavy") {
  dr <- subset(df, df$lineid == 1)
  db <- subset(df, df$lineid == 2)
  do <- subset(df, df$lineid == 3)
} else {
  dg <- df
}
# Data preprocessing
# Reduce a raw location table to the columns used by the distance calculation.
#
# df: data frame with columns trxtime, Day, trainid, routeid, lat and lon.
# Returns a data frame with columns time, Day, trainid, route, lat, lon that
#   - has no row with a missing lat or lon,
#   - is ordered by trainid and then time,
#   - keeps only the first row for each (time, trainid, route) combination.
data_pre <- function(df) {
  data <- data.frame(
    time = df$trxtime, Day = df$Day, trainid = df$trainid,
    route = df$routeid, lat = df$lat, lon = df$lon
  )
  # A position is only usable when BOTH coordinates are present. The previous
  # `|` kept rows where just one of them was missing.
  data <- data[!is.na(data$lon) & !is.na(data$lat), ]
  data <- data[order(data$trainid, data$time), ]
  # Remove the duplicated time records (first occurrence wins)
  data <- data[!duplicated(data[c("time", "trainid", "route")]), ]
  rownames(data) <- NULL
  data
}

In [None]:
# All speeds are in mph
# Green line
## Branch main 8.6
## Branch B 7.1
## Branch C 7.9
## Branch D 18.7
## Branch E 7.1
# Red line 
## A 16.7
## B 20.1
# Orange line 18.3
# Blue line 18.6
# Ref: https://www.fixmbta.com/green-line-upgrades

In [None]:
# Get ready for calculation
# Only the tables that exist for the selected rail type are preprocessed:
# dr/db/do for heavy rail, dg otherwise. Touching the others would fail with
# "object not found".
if (rail == "heavy") {
  dr <- data_pre(dr)
  db <- data_pre(db)
  do <- data_pre(do)
} else {
  dg <- data_pre(dg)
  dg$line <- 4
}

In [None]:
# Compute distance (D), speed (S) and acceleration (A) between consecutive
# records of one train.
#
# d: data frame with columns time, lat and lon, already in time order.
# Returns d with five extra columns; row k describes the step from row k-1 to
# row k, so the first row is always NA:
#   dist_meters       Haversine distance in meters
#   interval_seconds  elapsed time in seconds
#   speed_mps         dist_meters / interval_seconds
#   speed_kph         speed_mps * 3.6
#   accel_mps2        change of speed_mps divided by interval_seconds
# Steps faster than 70 km/h are treated as position errors: the speed of the
# previous step is carried forward and the distance is rebuilt from it.
computeDSA <- function(d) {
  n <- nrow(d)
  # rep() keeps this valid for a 0-row input as well
  d$dist_meters <- rep(NA_real_, n)
  d$interval_seconds <- rep(NA_real_, n)
  d$speed_mps <- rep(NA_real_, n)
  d$speed_kph <- rep(NA_real_, n)
  d$accel_mps2 <- rep(NA_real_, n)

  if (n >= 2) {
    # `1:n-1` parses as (1:n) - 1; the parentheses make the intent explicit
    prev <- 1:(n - 1)
    curr <- 2:n

    # Compute interval distance using Haversine function
    d$dist_meters[curr] <- distHaversine(
      cbind(d$lon[prev], d$lat[prev]),
      cbind(d$lon[curr], d$lat[curr])
    )
    # Compute time interval
    d$interval_seconds[curr] <- as.numeric(
      difftime(d$time[curr], d$time[prev], units = "secs")
    )
    # Compute speed in meters per second and km per hour
    d$speed_mps[curr] <- d$dist_meters[curr] / d$interval_seconds[curr]
    d$speed_kph[curr] <- d$speed_mps[curr] * 3.6

    # Revise the incorrect values. which() returns ascending indices and skips
    # NA, so a run of spikes carries forward the last accepted speed.
    for (k in which(d$speed_kph > 70)) {
      d$speed_kph[k] <- d$speed_kph[k - 1]
      d$speed_mps[k] <- d$speed_kph[k] / 3.6
      d$dist_meters[k] <- d$speed_mps[k] * d$interval_seconds[k]
    }

    # Acceleration is derived after the correction so that the row following a
    # spike does not keep a value computed from the discarded speed.
    d$accel_mps2[curr] <-
      (d$speed_mps[curr] - d$speed_mps[prev]) / d$interval_seconds[curr]
  }
  d
}

In [None]:
# Process calculation for four lines
# Run computeDSA() separately for every train on every day of one table.
#
# data: preprocessed table with columns Day and trainid plus the columns
#       computeDSA() needs.
# Returns one data frame with the computeDSA() output of every (Day, trainid)
# group stacked in the order the groups are met, or an empty data frame when
# there is nothing to process. Rows with a missing Day or trainid are skipped.
Month_calculation <- function(data) {
  days <- unique(data$Day)
  days <- days[!is.na(days) & days <= 31]
  pieces <- list()
  for (i in days) {
    # which() drops NA comparisons instead of producing all-NA rows
    data.day <- data[which(data$Day == i), ]
    trains <- unique(data.day$trainid)
    trains <- trains[!is.na(trains)]
    for (j in trains) {
      # Each train is processed on its own so that no distance is computed
      # between records of different trains
      data.day.train <- data.day[which(data.day$trainid == j), ]
      pieces[[length(pieces) + 1L]] <- computeDSA(data.day.train)
    }
  }
  if (length(pieces) == 0L) {
    return(data.frame())
  }
  # Bind once at the end instead of growing a data frame inside the loop
  do.call(rbind, pieces)
}

# The function has to be called and its result kept; writing the untouched
# empty data frame produced an empty file. One file is written per line, named
# "<rail>rail<colour>_distances_<year><month>.csv" for heavy rail
# (red/blue/orange) and "<rail>rail_distances_<year><month>.csv" otherwise,
# which is the naming the monthly reader further down expects.
if (rail == "heavy") {
  line_tables <- list(dr, db, do)
  line_suffix <- c("red", "blue", "orange")
} else {
  line_tables <- list(dg)
  line_suffix <- ""
}
for (idx in seq_along(line_tables)) {
  results.df <- Month_calculation(line_tables[[idx]])
  write.csv(
    x = results.df,
    file = file.path(
      "../../data/tidy/Calculated Table",
      paste0(rail, "rail", line_suffix[idx], "_distances_", year, month, ".csv")
    )
  )
}

### Combine the four lines into one table ready for the model

In [4]:
# Read distance calculation data
# For every year/month one table per line is loaded into a variable named
# <prefix><year><month>, e.g. dr1901, with prefixes dr, db, do and dg.
yearlist <- c("19")
monthlist <- c("01", "02", "03", "04", "05", "06", "07", "08", "09", "10", "11", "12") # FOR FULL TABLE
# Both paths use the same directory spelling as the write step
# ("Calculated Table"); a differently-cased name is a different directory on a
# case-sensitive file system.
heavyrail_distance_filepath <- "../../data/tidy/Calculated Table/"
lightrail_distance_filepath <- "../../data/tidy/Calculated Table/"
# variable prefix -> file name stem
line_files <- c(
  dr = "heavyrailred",
  db = "heavyrailblue",
  do = "heavyrailorange",
  dg = "lightrail"
)
for (year in yearlist) {
  for (month in monthlist) {
    for (prefix in names(line_files)) {
      line_dir <- if (prefix == "dg") lightrail_distance_filepath else heavyrail_distance_filepath
      assign(
        paste0(prefix, year, month),
        fread(paste0(line_dir, line_files[[prefix]], "_distances_", year, month, ".csv"))
      )
    }
  }
}

In [8]:
# Stack the twelve 2019 monthly tables of each line into one yearly table.
# The monthly tables are looked up by name (dg1901 ... dg1912, and likewise for
# dr, do and db) and bound in calendar order.
dg <- do.call(rbind, lapply(sprintf("dg19%02d", 1:12), get))
dr <- do.call(rbind, lapply(sprintf("dr19%02d", 1:12), get))
do <- do.call(rbind, lapply(sprintf("do19%02d", 1:12), get))
db <- do.call(rbind, lapply(sprintf("db19%02d", 1:12), get))

In [None]:
# Add columns for further analysis
# df: table with a `time` column.
# Returns df with three extra columns, Month, Day and Hour, taken from `time`.
df_adddate <- function(df) {
  df$Month <- month(df$time)
  df$Day <- day(df$time)
  df$Hour <- hour(df$time)
  # Return the table itself: without this line the function returned only the
  # value of the last assignment (the Hour vector) and the new columns were lost.
  df
}

In [None]:
# Stack the tables of the four lines into one interval table.
# dg, dr, do, db: per-line tables. The `route` column is dropped from dg before
# binding; the other three tables are used as they are.
# Returns the row-bound table in the order dg, dr, do, db.
df_combine = function(dg,dr,do,db){
  green_without_route <- subset(dg, select = -route)
  rbind(green_without_route, dr, do, db)
}
# interval_df only exists inside df_combine(), so the combined table has to be
# built here before it can be written.
interval_df <- df_combine(dg, dr, do, db)
write.csv(interval_df, paste0("../../data/tidy/", "train-trajectories-", year, ".csv"))

### Trajectory table calculation

In [None]:
# Monthly summary of one line's interval table.
# db: table with columns Month, dist, timeinterval and kph.
# Returns a data frame with one row per Month and the columns
#   "Monthly distance"  sum of dist multiplied by 0.000621371
#   "operating time"    sum of timeinterval
#   "average speed"     mean of kph
# Missing values are ignored in all three statistics.
table_calculation = function(db){
  # Aggregate one column by month with the given statistic
  by_month <- function(values, stat) {
    aggregate(as.numeric(values), by = list(Month = db$Month), FUN = stat, na.rm = TRUE)
  }
  operating_time <- by_month(db$timeinterval, sum)
  average_speed <- by_month(db$kph, mean)

  Calculation_table <- by_month(db$dist, sum)
  names(Calculation_table)[match("x", names(Calculation_table))] <- "Monthly distance"
  Calculation_table[["Monthly distance"]] <- Calculation_table[["Monthly distance"]] * 0.000621371
  Calculation_table[["operating time"]] <- operating_time$x
  Calculation_table[["average speed"]] <- average_speed$x
  Calculation_table
}

### Plot the trajectory

In [None]:
# Plot the trajectories of a set of trains on one day and save three PNG files
# (cumulative distance, speed, per-interval distance).
# Name the output files after the line you want to investigate.
#
# db:            interval table with columns time, dist, trainid, kph, Month, Day
# sel_month:     month to plot
# sel_day:       day of the month to plot
# trainid_range: c(lowest, highest) trainid to include; apply range() to the
#                trainid column first to find suitable thresholds
# Returns (invisibly) the table that was plotted, including the helper columns
# hms and cumdist.
Trajectory_analysis <- function(db, sel_month = 1, sel_day = 10,
                                trainid_range = c(1415219066, 1415222776)) {
  meters_to_miles <- 0.000621371
  kph_to_mph <- 0.621371
  # One spelling for all three outputs ("../results" and "../Results" are
  # different directories on a case-sensitive file system)
  plot_dir <- "../Results/Train movement analysis plots"

  # Choose one day and the requested trains
  btrainset <- subset(
    db,
    db$Month == sel_month & db$Day == sel_day &
      db$trainid >= trainid_range[1] & db$trainid <= trainid_range[2],
    select = c(time, dist, trainid, kph)
  )
  btrainset <- as.data.frame(btrainset)
  btrainset$kph <- as.numeric(as.character(btrainset$kph))
  btrainset$dist <- as.numeric(as.character(btrainset$dist))

  # Reduce the timestamp to a time of day for the x axis. The clock string is
  # parsed as UTC because date_format() labels in UTC by default; parsing in
  # the local zone would shift every axis label by the UTC offset.
  btrainset$hms <- strftime(btrainset$time, format = "%H:%M:%OS")
  btrainset$hms <- as.POSIXct(btrainset$hms, format = "%H:%M:%OS", tz = "UTC")

  # For the cumulative distance calculation NA is set to 0. Only the two
  # measurement columns are touched, so a missing time is not turned into the
  # epoch.
  btrainset$dist[is.na(btrainset$dist)] <- 0
  btrainset$kph[is.na(btrainset$kph)] <- 0
  btrainset <- btrainset %>%
    group_by(trainid) %>%
    mutate(cumdist = cumsum(dist)) %>%
    ungroup()

  # Write one plot to a PNG file. A ggplot object is only drawn when it is
  # printed, which does not happen automatically inside a function; on.exit
  # closes the device even when drawing fails.
  save_plot <- function(file_name, plot, height) {
    png(file.path(plot_dir, file_name), height = height, width = 7000, res = 360)
    on.exit(dev.off(), add = TRUE)
    print(plot)
  }

  large_text <- theme(
    axis.text = element_text(size = 12),
    axis.title.x = element_text(size = 20),
    axis.title.y = element_text(size = 20),
    title = element_text(size = 20),
    axis.text.x = element_text(angle = 90, hjust = 1)
  )

  # Cumulative distance plot
  trajectory_plot <-
    ggplot(btrainset, aes(x = hms, y = cumdist * meters_to_miles, color = factor(trainid))) +
    geom_line(size = 1, alpha = 0.7) +
    large_text +
    # ylim(0, 1000) +
    scale_x_datetime(breaks = "1 hour", labels = date_format("%H:%M")) +
    labs(title = "Blue line distance trajectory", y = "Distance (mile)", x = "Time", col = "Trainid") +
    guides(color = "none")
  save_plot("Blue line trajectory.png", trajectory_plot, height = 1500)

  # Speed plot
  speed_plot <-
    ggplot(btrainset, aes(x = hms, y = kph * kph_to_mph, color = factor(trainid))) +
    geom_line(size = 0.6, alpha = 0.4) +
    large_text +
    # ylim(0, 70) +
    scale_x_datetime(breaks = "1 hour", labels = date_format("%H:%M")) +
    labs(title = "Blue line speed plot", y = "Speed (mph)", x = "Time", col = "Trainid") +
    guides(color = "none")
  save_plot("Blue line Speed(2019).png", speed_plot, height = 1500)

  # Distance plot: drawn in miles to match the axis label (the line layer used
  # to plot the raw dist values)
  distance_plot <-
    ggplot(btrainset, aes(x = hms, y = dist * meters_to_miles, color = factor(trainid))) +
    geom_line(size = 0.6, alpha = 0.4) +
    theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
    # ylim(0, 2500) +
    scale_x_datetime(breaks = "1 hour", labels = date_format("%H:%M")) +
    labs(title = "Blue line vehicle distance", y = "Distance (mile)", x = "Time", col = "Trainid") +
    guides(color = "none")
  save_plot("Blue line dist(2019).png", distance_plot, height = 1000)

  invisible(btrainset)
}