In [17]:
# Clear workspace
rm(list=ls())

In [18]:
# Garabage collect to help prevent memory issues
gc()

Unnamed: 0,used,(Mb),gc trigger,(Mb).1,max used,(Mb).2
Ncells,2470878,132.0,3978168,212.5,3978168,212.5
Vcells,4292266,32.8,14865554,113.5,14865291,113.5


In [19]:
# Install/load libraries
library(tidyverse) # Sagemaker has
install.packages("timetk") # Sagemaker doesn't have
library(timetk)
library(lubridate) # Sagemaker has
install.packages("ggridges") # Sagemaker doesn't have
library(ggridges)

Updating HTML index of packages in '.Library'

Making 'packages.html' ...
 done

Updating HTML index of packages in '.Library'

Making 'packages.html' ...
 done



In [20]:
# Read in all files from the Unseen Sensor - Intermediate directory 
# and create a list of data frames 
fnames <- list.files("Data/Unseen Sensor/Intermediate/", pattern="*.csv", full.names=TRUE)
total_df_list <- lapply(fnames, read_csv)

[1mRows: [22m[34m35044[39m [1mColumns: [22m[34m10[39m
[36m──[39m [1mColumn specification[22m [36m────────────────────────────────────────────────────────[39m
[1mDelimiter:[22m ","
[31mchr[39m  (1): site_name
[32mdbl[39m  (5): day_of_week, day_of_year, interval_of_day, avg_mph, total_volume
[33mlgl[39m  (2): missing_speed, missing_volume
[34mdttm[39m (1): timestamp
[34mdate[39m (1): date

[36mℹ[39m Use `spec()` to retrieve the full column specification for this data.
[36mℹ[39m Specify the column types or set `show_col_types = FALSE` to quiet this message.
[1mRows: [22m[34m35044[39m [1mColumns: [22m[34m10[39m
[36m──[39m [1mColumn specification[22m [36m────────────────────────────────────────────────────────[39m
[1mDelimiter:[22m ","
[31mchr[39m  (1): site_name
[32mdbl[39m  (5): day_of_week, day_of_year, interval_of_day, avg_mph, total_volume
[33mlgl[39m  (2): missing_speed, missing_volume
[34mdttm[39m (1): timestamp
[34mdate[39m (1)

In [21]:
# Sanity check the lengths of list
length(total_df_list)

# Data Missingness

## Percent Missing per Sensor

In [22]:
# Compute the percentage of missingness for each time series
total_missing <- lapply(total_df_list, function(x) 100*sum(x$missing_volume)/nrow(x))  

In [23]:
mean(unlist(total_missing))

# Impute Missing Value Using Temporal Medians Technique

In [24]:
# Ensure all data frames are in proper chronological order
total_df_list <- lapply(total_df_list, function(x) x %>% arrange(timestamp))                          

# Manual Seasonal Imputation Across Full Data Set

In [25]:
man_seas_int <- function(df) {
    # Based on the above results, it makes the most sense to proceed with the modified temporal medians
    # approach for imputation. In this function, we take in a data frame, find the missing values based
    # on a missing_volume boolean column, and impute them using the same modified temporal medians approach
    
    # Set volume where missing to NA
    df <- df %>% mutate(total_volume=ifelse(missing_volume, NA, total_volume))
    
    # Create the list of indexes with missing volume
    missing_vol_list <- which(df$missing_volume==TRUE)
    
    # Initialize an empty list for imputation
    bf <- c()
    
    # For each index
    for (ind in missing_vol_list) {
        
        # Grab the day of year and interval of day
        doy <- df[ind, 4]$day_of_year
        iod <- df[ind, 6]$interval_of_day
        
        # If the day of year is less than 15, grab the most recent 7 days
        if (doy < 15) {
            doy_list <- c(doy-7, doy-6, doy-5, doy-4, doy-3, doy-2, doy-1)
        }
        
        # Else, grab up to the most recent 7 weeks with the same day of week
        else {
            doy_list <- c(doy-49, doy-42, doy-35, doy-28, doy-21, doy-14, doy-7) 
        }
        
        # Get the volume values corresponding to the proper day of year(s) and interval of day
        bf_values <- na.omit((df %>% 
                              arrange(desc(timestamp)) %>% 
                              filter(day_of_year %in% doy_list) %>% 
                              filter(interval_of_day==iod))$total_volume)   
        
        n <- length(bf_values)
        
#         weights <- n:1/(sum(1:n))
        
#         bf_value <- round(weighted.mean(bf_values,
#                                         weights,
#                                         na.rm=T))

        # Compute the median of those volumes
        bf_value <- median(bf_values)
        
        # Append to list
        bf <- c(bf, bf_value)
    }    
    
    # Replace the missing volume values in the df with the imputed values
    df$total_volume <- replace(df$total_volume, missing_vol_list, bf)
    
    # Return the df with imputed values
    df
}

In [26]:
# Impute each df using the modified temporal medians method explored above
total_df_list_int <- lapply(total_df_list, man_seas_int)

### Explore Data Set Lengths to Look for Issues (Like Daylight Savings)

In [27]:
# Length should be 35040 - lengths slightly above this indicate daylight savings is causing multiple entries 
# for one timestamp
lapply(total_df_list_int, function(x) nrow(x))

In [28]:
# Check the max interval of day value - if there are none above 95, then we can simply groupby timestamp to
# eliminate daylight savings issues 
# When doing so, we will avg the speed and take the max of the total volume for that timestamp - this is abritrary
# and other methods could be used, but with so few data points, it is unlikely to matter
lapply(total_df_list_int, function(x) max(x$interval_of_day))

In [29]:
# The timsteamps with issues indicate daylight savings is the likely culprit
lapply(total_df_list_int, function(x) (x %>%  
                                         group_by(timestamp) %>% 
                                         summarise(n=n()) %>% 
                                         ungroup() %>%
                                         arrange(desc(n)) %>%
                                         filter(n>1))$timestamp)

[[1]]
[1] "2019-10-27 01:14:00 UTC" "2019-10-27 01:29:00 UTC"
[3] "2019-10-27 01:44:00 UTC" "2019-10-27 01:59:00 UTC"

[[2]]
[1] "2019-10-27 01:14:00 UTC" "2019-10-27 01:29:00 UTC"
[3] "2019-10-27 01:44:00 UTC" "2019-10-27 01:59:00 UTC"

[[3]]
[1] "2019-10-27 01:14:00 UTC" "2019-10-27 01:29:00 UTC"
[3] "2019-10-27 01:44:00 UTC" "2019-10-27 01:59:00 UTC"

[[4]]
[1] "2019-10-27 01:14:00 UTC" "2019-10-27 01:29:00 UTC"
[3] "2019-10-27 01:44:00 UTC" "2019-10-27 01:59:00 UTC"


In [30]:
# For each data frame, deal with daylight savings issues by computing the max volume of the duplicate start times
total_df_list_int <- lapply(total_df_list_int, 
                              function(x) x <- x %>% 
                                                group_by(site_name, 
                                                         day_of_week, 
                                                         date, 
                                                         day_of_year, 
                                                         timestamp, 
                                                         interval_of_day
                                                        ) %>%
                                                summarise(avg_mph=mean(avg_mph),
                                                          total_volume=max(total_volume),
                                                          missing_speed=max(missing_speed),
                                                          missing_volume=max(missing_volume)
                                                         ) %>%
                                               ungroup() %>%
                                               mutate(missing_speed=ifelse(missing_speed==1, TRUE, FALSE),
                                                      missing_volume=ifelse(missing_volume==1, TRUE, FALSE)
                                                     ))

[1m[22m`summarise()` has grouped output by 'site_name', 'day_of_week', 'date',
'day_of_year', 'timestamp'. You can override using the `.groups` argument.
[1m[22m`summarise()` has grouped output by 'site_name', 'day_of_week', 'date',
'day_of_year', 'timestamp'. You can override using the `.groups` argument.
[1m[22m`summarise()` has grouped output by 'site_name', 'day_of_week', 'date',
'day_of_year', 'timestamp'. You can override using the `.groups` argument.
[1m[22m`summarise()` has grouped output by 'site_name', 'day_of_week', 'date',
'day_of_year', 'timestamp'. You can override using the `.groups` argument.


In [31]:
# Check lengths again - if they are all 35040, we can write to file
lapply(total_df_list_int, function(x) nrow(x))

### Write to Files

In [32]:
# Write each sensor to a file in the Processed sub-folder of the Data directory
for (i in 1:length(total_df_list_int)) {
    write.csv(total_df_list_int[[i]], 
              str_replace_all(fnames[[i]], "Intermediate", "Processed"),
              row.names=FALSE)
}