# Data preprocessing and model preparation
## Vehicle distance, acceleration, and speed calculation

In [None]:
library(data.table) #fread
library(lubridate)
library(dplyr)
library(geosphere) # Calculate vehicle distance

In [None]:
# Raise the R memory ceiling for this session.
# memory.limit() only ever had an effect on Windows and has been a defunct
# stub (warning + Inf) since R 4.2.0, so it is only called where it still
# does something; everywhere else this cell is a no-op.
if (.Platform$OS.type == "windows" && getRversion() < "4.2.0") {
  memory.limit(size = 249200)
}

### Load the location data and run the calculation

In [None]:
# Choose the time period and rail type to investigate.
# month and year are kept as zero-padded strings because they are pasted
# straight into file names: a bare 03 is the number 3 and would paste as "3",
# producing e.g. "203" instead of "2003".
month <- "03"
year <- "20"
rail <- "heavy"

In [None]:
# Read the raw location table for the selected rail type, year and month.
# The file name is "ttr.<rail>location<year><month>" inside the raw folder.
raw_path <- paste0("../../data/raw/Orginal  table/",
                   "ttr.", rail, "location", year, month)
# na.strings = "0" turns every field that is exactly 0 into NA (in all
# columns), which is what lets the preprocessing step drop zero coordinates.
df <- read.csv(raw_path, header = TRUE, na.strings = "0")
df$Day <- day(df$trxtime)

# Split the heavy rail table into one table per line id; any other rail type
# is kept as a single table.
# NOTE(review): dr/db/do presumably mean red/blue/orange -- confirm that line
# ids 1/2/3 map to those lines.
if (rail == "heavy") {
  dr <- subset(df, df$lineid == 1)
  db <- subset(df, df$lineid == 2)
  do <- subset(df, df$lineid == 3)
} else {
  dg <- df
}
# Data preprocessing
#
# Build the trimmed, ordered, de-duplicated record table used by the
# distance/speed calculation.
#
# Args:
#   df: data frame with columns trxtime, Day, trainid, routeid, lat and lon.
#
# Returns:
#   A data frame with columns time, Day, trainid, route, lat and lon that
#   contains only rows with both coordinates present, ordered by trainid and
#   then time, with a single row per (time, trainid, route) combination.
data_pre <- function(df) {
  data <- data.frame(time = df$trxtime, Day = df$Day, trainid = df$trainid,
                     route = df$routeid, lat = df$lat, lon = df$lon)
  # A usable position needs BOTH coordinates. The earlier `|` form only
  # dropped rows where lat and lon were missing at the same time.
  has_position <- !is.na(data$lon) & !is.na(data$lat)
  data <- data[has_position, ]
  data <- data[order(data$trainid, data$time), ]
  # Keep the first record for each (time, trainid, route) combination.
  data <- distinct(data, time, trainid, route, .keep_all = TRUE)
  data
}

In [None]:
# Get ready for calculation: preprocess only the tables that exist for the
# selected rail type (dr/db/do are created for "heavy", dg otherwise), so the
# cell no longer fails on the tables that were never created.
if (rail == "heavy") {
  dr <- data_pre(dr)
  db <- data_pre(db)
  do <- data_pre(do)
} else {
  dg <- data_pre(dg)
}

In [None]:
# Compute distances (D), speeds (S) and acceleration (A) between consecutive
# records of one track, in meters, meters per second, km per hour and m s^-2.
#
# Args:
#   d:       data frame with columns time, lat and lon; rows are expected to
#            be in chronological order (each row is compared with the row
#            directly above it).
#   line:    value written to the `line` column of every row. Defaults to 4,
#            the value that used to be hard-coded.
#   max_kph: speeds above this value (km/h) are treated as incorrect and
#            replaced. Defaults to 70, the value that used to be hard-coded.
#
# Returns:
#   `d` with the added columns dist, timeinterval, mps, kph, acc and line.
#   The first row has no predecessor, so its derived values are NA. Inputs
#   with zero or one row are returned with NA-filled derived columns.
computeDSA <- function(d, line = 4, max_kph = 70) {
  n <- nrow(d)
  dist <- rep(NA_real_, n)
  timeinterval <- rep(NA_real_, n)
  mps <- rep(NA_real_, n)
  kph <- rep(NA_real_, n)
  acc <- rep(NA_real_, n)

  if (n >= 2) {
    # Explicit predecessor/current indices. The earlier `1:n-1` parses as
    # `(1:n) - 1` and only worked because R silently drops a 0 index.
    prev <- seq_len(n - 1)
    curr <- prev + 1L

    dist[curr] <- distHaversine(cbind(d$lon[prev], d$lat[prev]),
                                cbind(d$lon[curr], d$lat[curr]))
    timeinterval[curr] <- as.numeric(difftime(d$time[curr], d$time[prev],
                                              units = "secs"))
    mps[curr] <- dist[curr] / timeinterval[curr]
    kph[curr] <- mps[curr] * 3.6
    acc[curr] <- (mps[curr] - mps[prev]) / timeinterval[curr]

    # Revise the incorrect values. This has to run in row order: a replaced
    # speed is itself the source for the next row's replacement.
    # NOTE(review): the acceleration of the row AFTER a corrected row is not
    # recomputed and still reflects the uncorrected speed.
    for (k in curr) {
      if (!is.na(kph[k]) && kph[k] > max_kph) {
        kph[k] <- kph[k - 1]
        mps[k] <- kph[k] / 3.6
        acc[k] <- (mps[k] - mps[k - 1]) / timeinterval[k]
        dist[k] <- mps[k] * timeinterval[k]
      }
    }
  }

  d$dist <- dist
  d$timeinterval <- timeinterval
  d$mps <- mps
  d$kph <- kph
  d$acc <- acc
  d$line <- rep(line, n)
  d
}

In [None]:
# Process calculation for four lines
#
# Month_calculation runs computeDSA() separately for every (Day, trainid)
# group of one preprocessed table and stacks the results.
#
# Args:
#   data: data frame with at least the columns Day and trainid, plus whatever
#         computeDSA() needs.
#
# Returns:
#   One data frame holding the computeDSA() output of every group, in order
#   of first appearance of day and train; an empty data frame when there is
#   nothing to process.
#
# The earlier version appended to `results.df` inside the function, which only
# changed a local copy, returned nothing and was never called, so the CSV
# written below was always empty.
Month_calculation <- function(data) {
  pieces <- list() # collected per-group results; bound once at the end
  for (i in unique(data$Day)) {
    if (!is.na(i) && i <= 31) { # Change this to 15 so we get half the month
      # which() drops rows whose Day is NA instead of yielding all-NA rows
      data.day <- data[which(data$Day == i), ]
      for (j in unique(data.day$trainid)) { # one subset per train
        data.day.train <- data.day[which(data.day$trainid == j), ]
        pieces[[length(pieces) + 1L]] <- computeDSA(data.day.train)
      }
    }
  }
  if (length(pieces) == 0L) {
    return(data.frame())
  }
  do.call(rbind, pieces)
}

# Run the calculation for every table that exists for the selected rail type
# and put all the results in one table.
# NOTE(review): the combine step further down reads one file per heavy line
# (heavyrailred_/heavyrailblue_/heavyrailorange_distances_*) from a different
# folder -- confirm how this single output file is meant to feed those.
line_tables <- if (rail == "heavy") list(dr, db, do) else list(dg)
results.df <- do.call(rbind, lapply(line_tables, Month_calculation))
write.csv(x = results.df,
          file.path("../../data/tidy/Calculated Table",
                    paste(rail, "rail_distances_",
                          paste(year, month, sep = "", collapse = ""),
                          ".csv", sep = "", collapse = "")))

### Combine the four lines into one table, ready for the model

In [4]:
# Load the monthly distance-calculation tables.
# Every file is read with fread() and stored in a global variable named
# <tag><year><month> (e.g. dr1901), where the tag identifies the line.
yearlist <- c('19')
monthlist <- c("01", "02", "03", "04", "05", "06", "07", "08", "09", "10", "11", "12") # FOR FULL TABLE
heavyrail_distance_filepath <- "../../data/tidy/Heavy line movement/Calculated table/"
lightrail_distance_filepath <- "../../data/tidy/Green line movement/Calculated Table/"

# Per tag: the folder the files live in and the file-name prefix.
monthly_sources <- list(
  dr = c(heavyrail_distance_filepath, "heavyrailred_distances_"),
  db = c(heavyrail_distance_filepath, "heavyrailblue_distances_"),
  do = c(heavyrail_distance_filepath, "heavyrailorange_distances_"),
  dg = c(lightrail_distance_filepath, "lightrail_distances_")
)

for (year in yearlist) {
  for (month in monthlist) {
    period <- paste0(year, month)
    for (tag in names(monthly_sources)) {
      location <- monthly_sources[[tag]]
      csv_path <- paste0(location[1], location[2], period, ".csv")
      assign(paste0(tag, period), fread(csv_path))
    }
  }
}

In [8]:
# For each line, stack its twelve 2019 monthly tables (<tag>1901 ... <tag>1912)
# into one yearly table stored under the bare tag (dg, dr, do, db) and write
# that table to its yearly CSV.
yearly_outputs <- c(
  dg = '../../data/tidy/green-vehicle-trajectories-2019.csv',
  dr = '../../data/tidy/red-vehicle-trajectories-2019.csv',
  do = '../../data/tidy/orange-vehicle-trajectories-2019.csv',
  db = '../../data/tidy/blue-vehicle-trajectories-2019.csv'
)
for (line_tag in names(yearly_outputs)) {
  # unname() so the variable names are not passed to rbind as argument names
  monthly_tables <- unname(mget(sprintf("%s19%02d", line_tag, 1:12)))
  assign(line_tag, do.call(rbind, monthly_tables))
  write.csv(get(line_tag), yearly_outputs[[line_tag]])
}

In [None]:
# Add the Month, Day and Hour columns, each derived from the record's `time`
# value, to all four line tables.
add_time_parts <- function(tbl) {
  tbl$Month <- month(tbl$time)
  tbl$Day <- day(tbl$time)
  tbl$Hour <- hour(tbl$time)
  tbl
}
dg <- add_time_parts(dg)
do <- add_time_parts(do)
dr <- add_time_parts(dr)
db <- add_time_parts(db)

In [None]:
# Remove the route column from the Green line table so that its columns line
# up with the other three line tables when all four are combined.
# NOTE(review): this assumes dr, do and db have no route column at this point
# -- confirm against the files read above. subset() stops with an error if dg
# has no column named route.
dg = subset(dg,select= -route)

In [None]:
# Stack the interval data of all four lines (green, red, orange, blue order)
interval_df <- do.call(rbind, list(dg, dr, do, db))

In [None]:
# Save the combined table; the file name carries the current value of `year`.
# NOTE(review): at this point `year` is whatever the read loop left behind
# (a two-digit string), unlike the "-2019" per-line files -- confirm intended.
write.csv(
  interval_df,
  paste0("../../data/tidy/", "train-trajectories-", year, ".csv", collapse = "")
)