# Data preprocessing and model preparation

In [1]:
library(data.table) #fread
library(dplyr)
library(geosphere) # Calculate vehicle distance
library(lubridate)# Date column extraction

"package 'data.table' was built under R version 3.6.3"

Attaching package: 'dplyr'


The following objects are masked from 'package:data.table':

    between, first, last


The following objects are masked from 'package:stats':

    filter, lag


The following objects are masked from 'package:base':

    intersect, setdiff, setequal, union


"package 'geosphere' was built under R version 3.6.3"

Attaching package: 'lubridate'


The following objects are masked from 'package:data.table':

    hour, isoweek, mday, minute, month, quarter, second, wday, week,
    yday, year


The following object is masked from 'package:base':

    date




In [2]:
# adjust the R limit
memory.limit(size=249200)

## Input location data and data preprocessing

In [4]:
# Read data from folder
# Suggestion: Just do one year at one time. One year's table has 94363502 rows(green)
YEARLIST = c('19')
MONTHlIST = c("04") #FOR FULL TABLE
DISTANCE_FILEPATH = "../../data/raw/vehicle-location/"

In [3]:
# aggregrate_trajectory_table
aggregate_line_trajectories = function(year, month){
    assign("dg", fread(paste(DISTANCE_FILEPATH, paste("lightrail", "trajectories", month, year, ".csv", sep = "-", collapse = ""), sep="")))
    assign("dh", fread(paste(DISTANCE_FILEPATH, paste("heavyrail", "trajectories", month, year, ".csv", sep = "-", collapse = ""), sep="")))
    # Combine the original tables to a single one for analysis
    dg = subset(dg, select = c(trxtime, trainid, lineid, lat, lon))
    dh = subset(dh, select = c(trxtime, trainid, lineid, lat, lon))
    df = rbind(dg, dh)
    return(df)
}

In [347]:
d_test

trxtime,trainid,lineid,lat,lon,day,month,year
<chr>,<int>,<int>,<dbl>,<dbl>,<int>,<dbl>,<dbl>
2019-01-01 04:00:00,10001,4,42.33701,-71.25349,1,1,2019
2019-01-01 04:42:35,10001,4,42.33613,-71.25391,1,1,2019
2019-01-01 04:45:05,10001,4,42.33603,-71.25436,1,1,2019
2019-01-01 05:49:31,10001,4,42.33613,-71.25391,1,1,2019
2019-01-01 05:49:42,10001,4,42.33635,-71.25379,1,1,2019
2019-01-01 06:19:33,10001,4,42.33658,-71.25367,1,1,2019
2019-01-01 06:35:24,10001,4,42.33680,-71.25352,1,1,2019
2019-01-01 06:35:36,10001,4,42.33696,-71.25329,1,1,2019
2019-01-01 06:39:05,10001,4,42.33633,-71.25019,1,1,2019
2019-01-01 06:49:43,10001,4,42.33689,-71.25384,1,1,2019


## Data preprocessing

In [7]:
# Data preprocessing
preprocess_data = function(df){
    # Add time column to do analysis by random time scale
    df$day = day(df$trxtime)
    df$month = month(df$trxtime)
    df$year = year(df$trxtime)
    # check the na values ratio
    df_na = df[(is.na(df$lon)) | (is.na(df$lat)),]
    df_na_rate = nrow(df_na)/nrow(df)
    print(df_na_rate)
    # remove the rows with values of lat and lon are 0
    df = df[!(is.na(df$lon)) | !(is.na(df$lat)),] 
    df = df[order(df$trainid, df$trxtime),]
    df = distinct(df, trxtime, trainid, .keep_all = TRUE) # Remove the duplicated time record
    return(df)
}

## Vehicle distance,speed, acceleration calculation 
### Spurious values statistics
### Value correction

In [350]:
# Function to compute distances (D), speeds (S) and acceleration (A) in meters, meters per second, km per hour and m s^-2
compute_trajectories <- function(d) {
    d = data.frame(d)
    d$dist_meters = NA
    d$dist_meters = as.numeric(d$dist_meters)
    d$interval_seconds = NA
    d$interval_seconds = as.numeric(d$interval_seconds)
    d$speed_mps = NA
    d$speed_mps = as.numeric(d$speed_mps)
    d$speed_kph = NA
    d$speed_kph = as.numeric(d$speed_kph)
    d$accel_mps2 = NA
    d$accel_mps2 = as.numeric(d$accel_mps2)
    n <- nrow(d)
    if (n >= 2) {
        # Compute interval distance using Haversine function
        d$dist_meters[2:n] = distHaversine(cbind(d$lon[1:n-1],d$lat[1:n-1]),cbind(d$lon[2:n],d$lat[2:n]))
        # Compute time interval
        d$interval_seconds[2:n] = as.numeric(difftime(d$trxtime[2:n], d$trxtime[1:n-1], units = "secs"))
        # Compute speed in meters per second
        d$speed_mps[2:n] = d$dist_meters[2:n] / d$interval_seconds[2:n]
        # Convert speed to kph
        d$speed_kph[2:n] = d$speed_mps[2:n] * 3.6
        # Compute accelerations
        d$accel_mps2[2:n] = (d$speed_mps[2:n] - d$speed_mps[1:n-1])/d$interval_seconds[2:n]
        # Spurious values statistics
        diagnostics = {}
        index_excessive_speeds = as.numeric(row.names(d[(d$speed_kph > 128) & (!is.na(d$speed_kph)),]))
        # Original data metrics calculation
         diagnostics$mean_original_speed_kph = mean(as.numeric(d[, "speed_kph"]),na.rm=TRUE)
         diagnostics$min_original_speed_kph = min(as.numeric(d[, "speed_kph"]),na.rm=TRUE)
         diagnostics$max_original_speed_kph = max(as.numeric(d[, "speed_kph"]),na.rm=TRUE)
         diagnostics$num_original_speed_kph = length(as.numeric(d[, "speed_kph"]))
        # Excessive data metrics calculation
         diagnostics$mean_excess_speed_kph = mean(as.numeric(d[index_excessive_speeds, "speed_kph"]))
         diagnostics$min_excess_speed_kph = min(d[index_excessive_speeds, "speed_kph"]) 
         diagnostics$max_excess_speed_kph = max(as.numeric(d[index_excessive_speeds, "speed_kph"])) 
         diagnostics$num_excess_speed_kph = length(d[index_excessive_speeds, "speed_kph"]) 
         diagnostics$prop_excess_speed_kph = round(100*diagnostics$num_excess_speed/n, 2)
        # Correct the excessive speed values and recalculate acceleration and distance
        d[index_excessive_speeds, c('speed_kph',"speed_mps")] = d[index_excessive_speeds - 1 , c('speed_kph',"speed_mps")]
        d[index_excessive_speeds, "dist_meters"] = d[index_excessive_speeds, "speed_mps"] * d[index_excessive_speeds,"interval_seconds"]
        d[index_excessive_speeds, "accel_mps2"] = (d[index_excessive_speeds, "speed_mps"] 
                                                  - d[index_excessive_speeds - 1, "speed_mps"]) / d[index_excessive_speeds,"interval_seconds"]
        # Corrected data metrics calculation
         diagnostics$mean_corrected_speed_kph = mean(as.numeric(d[, "speed_kph"]),na.rm=TRUE)
         diagnostics$min_corrected_speed_kph = min(as.numeric(d[, "speed_kph"]),na.rm=TRUE)
         diagnostics$max_corrected_speed_kph = max(as.numeric(d[, "speed_kph"]),na.rm=TRUE)
         diagnostics$num_corrected_speed_kph = length(as.numeric(d[, "speed_kph"]))
    }
    return(list(data = d ,diag_metrics = diagnostics))   
}

In [349]:
# Process calculation 
process_month_trajectory = function(data){
    results_df = data.frame() # empty dataframe
    diagnostics_df = data.frame()
    for(i in unique(data$day)) { 
        data_day <- data[data$day == i, ]
        # Put each train in one loop in a subset
        for (j in unique(data_day$trainid)) {
            data_day_train = data_day[data_day$trainid == j, ]        
            trajectory_and_diagnostics <- compute_trajectories(data_day_train)  
            trajectories = trajectory_and_diagnostics$data
            diagnostics = trajectory_and_diagnostics$diag_metrics          
            results_df <- rbind(results_df, trajectories)
            diagnostics_df <- rbind(diagnostics_df, diagnostics) 
        }
    }
    #
    # write.csv(x = results_df, file.path("../../data/tidy/", paste("trajectory", yy, mm, ".csv", sep = "-", collapse = "")))
    #  write.csv(x = diagnostics_df, file.path("../../data/tidy/", paste("trajectory-diagnostics" , yy, mm, ".csv", sep = "-",collapse = "")))
    # Uncomment the beneath line and check the computation result
     return(list(result = results_df, result_diag = diagnostics_df ))
 }

In [351]:
process_month_trajectory(d_test)

trxtime,trainid,lineid,lat,lon,day,month,year,dist_meters,interval_seconds,speed_mps,speed_kph,accel_mps2
<chr>,<int>,<int>,<dbl>,<dbl>,<int>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
2019-01-01 04:00:00,10001,4,42.33701,-71.25349,1,1,2019,,,,,
2019-01-01 04:42:35,10001,4,42.33613,-71.25391,1,1,2019,104.64023,2555,0.040955079,0.14743828,
2019-01-01 04:45:05,10001,4,42.33603,-71.25436,1,1,2019,38.35157,150,0.255677118,0.92043762,1.431480e-03
2019-01-01 05:49:31,10001,4,42.33613,-71.25391,1,1,2019,38.35157,3866,0.009920219,0.03571279,-6.356878e-05
2019-01-01 05:49:42,10001,4,42.33635,-71.25379,1,1,2019,26.23150,11,2.384681862,8.58485470,2.158874e-01
2019-01-01 06:19:33,10001,4,42.33658,-71.25367,1,1,2019,28.08000,1791,0.015678392,0.05644221,-1.322727e-03
2019-01-01 06:35:24,10001,4,42.33680,-71.25352,1,1,2019,26.95307,951,0.028341823,0.10203056,1.331591e-05
2019-01-01 06:35:36,10001,4,42.33696,-71.25329,1,1,2019,26.68781,12,2.223984489,8.00634416,1.829702e-01
2019-01-01 06:39:05,10001,4,42.33633,-71.25019,1,1,2019,264.84088,209,1.267181256,4.56185252,-4.578006e-03
2019-01-01 06:49:43,10001,4,42.33689,-71.25384,1,1,2019,307.32665,638,0.481703216,1.73413158,-1.231157e-03

Unnamed: 0_level_0,mean_original_speed_kph,min_original_speed_kph,max_original_speed_kph,num_original_speed_kph,mean_excess_speed_kph,min_excess_speed_kph,max_excess_speed_kph,num_excess_speed_kph,prop_excess_speed_kph,mean_corrected_speed_kph,min_corrected_speed_kph,max_corrected_speed_kph,num_corrected_speed_kph
Unnamed: 0_level_1,<dbl>,<dbl>,<dbl>,<int>,<dbl>,<dbl>,<dbl>,<int>,<dbl>,<dbl>,<dbl>,<dbl>,<int>
1,76.10174,0,9783.897,1385,1779.5986,212.3691,9783.897,30,2.17,48.51156,0,9783.8969,1385
2,102.39596,0,3432.583,124,512.7405,144.7819,3432.583,16,12.9,55.56925,0,939.4685,124
3,54.94424,0,3590.263,491,917.6562,137.5164,3590.263,9,1.83,38.49006,0,109.589,491


In [None]:
# Generate the final table
main = function(YEARLIST, MONTHLIST) {
    for (y in YEARLIST) {
        for (m in MONTHLIST) {
            df_agg = aggregate_line_trajectories(y, m)
            df_agg = preprocess_data(df_agg)
            process_month_trajectory(df_agg, y, m)            
        }
    }
}