In [1]:
# Empty the work space: remove every object currently defined in the
# environment this cell runs in, so later cells start from a clean state
rm(list=ls())

In [2]:
# Uncomment and run to install packages if needed
# install.packages("tidyverse")
# install.packages("cluster")
# install.packages("tsfeatures")
# install.packages("Rcatch22")
# install.packages("tseries")
# install.packages("factoextra")
# install.packages("forecast")
# install.packages("dtwclust")

In [3]:
# Load libraries
library(tidyverse)
library(cluster)
library(tsfeatures)
library(Rcatch22)
library(tseries)
library(factoextra)
library(forecast)
library(dtwclust)

“‘timedatectl’ indicates the non-existent timezone name ‘n/a’”
“Your system is mis-configured: ‘/etc/localtime’ is not a symlink”
“It is strongly recommended to set envionment variable TZ to ‘Etc/UCT’ (or equivalent)”
── [1mAttaching packages[22m ─────────────────────────────────────── tidyverse 1.3.1 ──

[32m✔[39m [34mggplot2[39m 3.3.6     [32m✔[39m [34mpurrr  [39m 0.3.4
[32m✔[39m [34mtibble [39m 3.1.7     [32m✔[39m [34mdplyr  [39m 1.0.9
[32m✔[39m [34mtidyr  [39m 1.2.0     [32m✔[39m [34mstringr[39m 1.4.0
[32m✔[39m [34mreadr  [39m 2.1.2     [32m✔[39m [34mforcats[39m 0.5.1

── [1mConflicts[22m ────────────────────────────────────────── tidyverse_conflicts() ──
[31m✖[39m [34mdplyr[39m::[32mfilter()[39m masks [34mstats[39m::filter()
[31m✖[39m [34mdplyr[39m::[32mlag()[39m    masks [34mstats[39m::lag()

Registered S3 method overwritten by 'quantmod':
  method            from
  as.zoo.data.frame zoo 

Welcome! Want to learn more? See two fa

In [4]:
# Create the folder that this notebook's results are saved in.
# dir.exists() tests specifically for a directory (file.exists() is also TRUE
# for a regular file with that name), and showWarnings = FALSE keeps
# dir.create() quiet if the folder appears between the check and the creation.
res_folder1 <- "Results/Unseen Sensor/"
if (!dir.exists(res_folder1)) {
    dir.create(res_folder1, recursive = TRUE, showWarnings = FALSE)
}

# Read In Data

In [5]:
# Read every processed unseen-sensor CSV file into a list of data frames.
# `pattern` is a regular expression, not a shell glob: "\\.csv$" matches only
# file names that end in ".csv". The glob-style "*.csv" is not anchored and
# its dot matches any character, so it could pick up unintended files.
fnames <- list.files("Data/Unseen Sensor/Processed/",
                     pattern = "\\.csv$",
                     full.names = TRUE)
total_df_list <- lapply(fnames, read_csv)

[1mRows: [22m[34m35040[39m [1mColumns: [22m[34m10[39m
[36m──[39m [1mColumn specification[22m [36m────────────────────────────────────────────────────────[39m
[1mDelimiter:[22m ","
[31mchr[39m  (1): site_name
[32mdbl[39m  (5): day_of_week, day_of_year, interval_of_day, avg_mph, total_volume
[33mlgl[39m  (2): missing_speed, missing_volume
[34mdttm[39m (1): timestamp
[34mdate[39m (1): date

[36mℹ[39m Use `spec()` to retrieve the full column specification for this data.
[36mℹ[39m Specify the column types or set `show_col_types = FALSE` to quiet this message.
[1mRows: [22m[34m35040[39m [1mColumns: [22m[34m10[39m
[36m──[39m [1mColumn specification[22m [36m────────────────────────────────────────────────────────[39m
[1mDelimiter:[22m ","
[31mchr[39m  (1): site_name
[32mdbl[39m  (5): day_of_week, day_of_year, interval_of_day, avg_mph, total_volume
[33mlgl[39m  (2): missing_speed, missing_volume
[34mdttm[39m (1): timestamp
[34mdate[39m (1)

In [6]:
# Sanity check: number of data frames in the list (one per file name in fnames)
length(total_df_list)

In [7]:
# Sort the rows of every data frame chronologically by its timestamp column
total_df_list <- lapply(total_df_list, function(df) arrange(df, timestamp))

In [8]:
# Randomly draw, for each data frame, the row at which its 12-week sample starts

# Fix the seed so that the draw is reproducible
set.seed(12345)

# Candidate start rows run from 1 to (96*365) - (96*12*7), so that a window of
# 96*7*12 rows starting there still fits inside 96*365 rows.
# Draw without replacement so that no two time series share a start row.
starting_points <- sample(seq_len((96 * 365) - (96 * 12 * 7)),
                          size = length(total_df_list),
                          replace = FALSE)

# Each window covers 96*7*12 rows (12 weeks), so it ends that many rows,
# minus one, after its start row
ending_points <- (starting_points - 1) + (96 * 7 * 12)

In [9]:
# Save the sampling start and end points to a csv for use in other notebooks.
# row.names is spelled FALSE rather than F, because F is an ordinary variable
# that can be reassigned.
write.csv(data.frame(start = starting_points, end = ending_points),
          "start_end_points_unseen.csv",
          row.names = FALSE)

In [10]:
# Number the rows of every data frame in a column called 'rn'; the sampled
# windows are later cut out by comparing 'rn' with starting_points/ending_points
total_df_list <- lapply(total_df_list, function(df) mutate(df, rn = row_number()))

In [11]:
# Cut the 12-week sample out of each data frame: keep the rows whose row number
# lies between that data frame's start and end point (both inclusive).
# seq_along() is used instead of 1:length() so that an empty list gives an
# empty result rather than iterating over c(1, 0).
total_df_list_samples <- lapply(seq_along(total_df_list),
                                function(i) total_df_list[[i]] %>%
                                    filter(rn >= starting_points[i],
                                           rn <= ending_points[i]))

In [12]:
# Label every row of each sample as "train", "val" or "test". Row numbers are
# restarted at 1 within the sample; the first 8 weeks of rows are train, the
# following 2 weeks are val and the remaining rows are test. Depending on the
# modeling task, train and val may both be used for training.
total_df_list_samples <- lapply(total_df_list_samples, function(df) {
    # Number of rows in one week of data
    rows_per_week <- 96 * 7
    df %>%
        mutate(rn = row_number(),
               train_val_test = ifelse(rn <= rows_per_week * 8,
                                       "train",
                                       ifelse(rn <= rows_per_week * 10,
                                              "val",
                                              "test")))
})

In [13]:
# Keep only the train and validation rows of each sample (everything not labelled "test")
train_val_samples <- lapply(total_df_list_samples,
                            function(df) filter(df, train_val_test != "test"))

In [14]:
# Keep only the rows labelled "train" in each sample
train_samples <- lapply(total_df_list_samples,
                        function(df) filter(df, train_val_test == "train"))

In [15]:
# Keep only the rows labelled "test" in each sample
test_samples <- lapply(total_df_list_samples,
                       function(df) filter(df, train_val_test == "test"))

In [16]:
# Turn the target field (total_volume) of each train+val data frame into an
# msts object with seasonal periods of 24*4 and 24*4*7 observations
train_val_samples_ts <- lapply(train_val_samples, function(df) {
    msts(df$total_volume, seasonal.periods = c(24 * 4, 24 * 4 * 7))
})

In [17]:
# Turn the target field (total_volume) of each train data frame into an
# msts object with seasonal periods of 24*4 and 24*4*7 observations
train_samples_ts <- lapply(train_samples, function(df) {
    msts(df$total_volume, seasonal.periods = c(24 * 4, 24 * 4 * 7))
})

In [18]:
# Turn the target field (total_volume) of each test data frame into an
# msts object with seasonal periods of 24*4 and 24*4*7 observations
test_samples_ts <- lapply(test_samples, function(df) {
    msts(df$total_volume, seasonal.periods = c(24 * 4, 24 * 4 * 7))
})

# Random Cluster Assignments

In [19]:
# Upper bound of the random cluster labels: the next cell draws each label
# from 1:total_clust
# NOTE(review): this was originally described as the "total number of time
# series in the data set" - confirm whether the intended value is the number
# of clusters or the number of series
total_clust <- 4

In [20]:
# Assign each time series to a random cluster

# For every training series the seed is reset to the index of that series and
# a single label is drawn from 1:total_clust, so a given index always receives
# the same label.
# vapply() over seq_along() replaces a loop that grew the result with c() and
# iterated over 1:length(); for an empty list this now yields an empty integer
# vector instead of looping over c(1, 0).
rand_clust_assign <- vapply(seq_along(train_samples_ts), function(ts_no) {
    # Set a seed
    set.seed(ts_no)
    # Draw one cluster label
    sample(seq_len(total_clust), 1)
}, integer(1))

In [21]:
# Print the random cluster assignments (one label per training time series)
rand_clust_assign

# Catch22 Based Feature Computation

In [22]:
# Compute the Catch22 features of every training time series with catch22_all
# and reshape each result to wide format: one column per feature name, holding
# that feature's value
train_catch22_feat <- lapply(train_samples_ts, function(series) {
    catch22_all(series) %>%
        pivot_wider(names_from = names, values_from = values)
})

“As of 0.1.14 the feature 'CO_f1ecac' returns a double instead of int


In [23]:
# Stack the per-series feature rows into a single data frame and drop the two
# features that were not used previously (in notebook 7) due to low variance
train_catch22_feat_df <- do.call(rbind, train_catch22_feat) %>%
    select(-CO_FirstMin_ac, -PD_PeriodicityWang_th0_01)
train_catch22_feat_df

DN_HistogramMode_5,DN_HistogramMode_10,CO_f1ecac,CO_HistogramAMI_even_2_5,CO_trev_1_num,MD_hrv_classic_pnn40,SB_BinaryStats_mean_longstretch1,SB_TransitionMatrix_3ac_sumdiagcov,CO_Embed2_Dist_tau_d_expfit_meandiff,IN_AutoMutualInfoStats_40_gaussian_fmmi,FC_LocalSimple_mean1_tauresrat,DN_OutlierInclude_p_001_mdrmd,DN_OutlierInclude_n_001_mdrmd,SP_Summaries_welch_rect_area_5_1,SB_BinaryStats_diff_longstretch0,SB_MotifThree_quantile_hh,SC_FluctAnal_2_rsrangefit_50_1_logi_prop_r1,SC_FluctAnal_2_dfa_50_1_2_logi_prop_r1,SP_Summaries_welch_rect_centroid,FC_LocalSimple_mean3_stderr
<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
-0.772714,-1.036745,15.8332,0.6770578,0.002529417,0.7529302,56,0.010604141,0.4183068,23,0.04166667,0.02139137,-0.0167410714,0.9845822,16,1.489225,0.4285714,0.3469388,0.06519418,0.2659134
-0.9985984,-1.195337,18.41606,0.7576623,0.010028003,0.7962791,59,0.006165495,0.3933663,23,0.04,0.07831101,-0.001860119,0.9662259,14,1.573515,0.4285714,0.3469388,0.06519418,0.2833783
-1.2165974,-1.413858,13.28107,0.7864801,0.007527869,0.7672558,61,0.006949684,0.5943701,18,0.52631579,0.07477679,0.0005580357,0.9804185,15,1.442368,0.4081633,0.3265306,0.06596117,0.3063198
-1.1381137,-1.313092,16.05872,0.8409939,0.006194725,0.679814,57,0.021630571,0.6649224,22,0.41666667,-0.01199777,0.0013020833,0.986233,22,1.438768,0.4285714,0.3469388,0.06519418,0.25739


## Read in Catch22 Feature Values for Original Data

In [24]:
# Read in the catch22 features that were computed on the original training data
catch22_orig <- read_csv("Results/Clustering/KMeans/catch22_feat_df.csv")
# The first column is just an indexer, so drop it
catch22_orig <- catch22_orig[, -1]

[1m[22mNew names:
[36m•[39m `` -> `...1`
[1mRows: [22m[34m76[39m [1mColumns: [22m[34m21[39m
[36m──[39m [1mColumn specification[22m [36m────────────────────────────────────────────────────────[39m
[1mDelimiter:[22m ","
[32mdbl[39m (21): ...1, DN_HistogramMode_5, DN_HistogramMode_10, CO_f1ecac, CO_Histo...

[36mℹ[39m Use `spec()` to retrieve the full column specification for this data.
[36mℹ[39m Specify the column types or set `show_col_types = FALSE` to quiet this message.


### Scale New Data with Original Scaler Values

In [26]:
# Create a new data frame to hold the scaled features of the new data; it
# starts as a copy of the unscaled features and is overwritten column by
# column in the scaling loop
train_catch22_feat_df_scaled <- train_catch22_feat_df

In [27]:
# Min-max scale every feature of the new data with the minimum and maximum of
# the same feature on the original data.
# Features are matched by column name rather than by column position, so a
# difference in column order between the two tables cannot silently scale a
# feature with another feature's range; a missing feature stops the run.
stopifnot(all(names(train_catch22_feat_df_scaled) %in% names(catch22_orig)))
for (feat in names(train_catch22_feat_df_scaled)) {
    # Compute the min of the feature on the original data
    min_n <- min(catch22_orig[[feat]])
    # Compute the max of the feature on the original data
    max_n <- max(catch22_orig[[feat]])
    # Scale the feature of the new data using the min and max of the original
    # data (values outside the original range end up outside [0, 1])
    train_catch22_feat_df_scaled[[feat]] <-
        (train_catch22_feat_df_scaled[[feat]] - min_n) / (max_n - min_n)
}

In [28]:
# Add row names "Obs1", "Obs2", ... to the scaled features of the new series.
# The tibble is first converted to a base data frame, because setting row
# names on a tibble is deprecated, and the labels are generated from the
# number of rows instead of being hard-coded for exactly four series.
train_catch22_feat_df_scaled <- as.data.frame(train_catch22_feat_df_scaled)
row.names(train_catch22_feat_df_scaled) <-
    paste0("Obs", seq_len(nrow(train_catch22_feat_df_scaled)))

“Setting row names on a tibble is deprecated.”


## Load Original KMeans Cluster Assignments

In [30]:
# Load the original cluster assignments from the catch22 features
# (the file has an .RData extension but is read with readRDS, i.e. it is
# expected to hold a single serialized object)
catch22_kmeans_orig <- readRDS("Results/Clustering/KMeans/kmeans_catch22.RData")

In [32]:
# Grab the cluster centers and label them "Cent1", "Cent2", ... by row.
# The labels are generated from the number of centers instead of being
# hard-coded for exactly five clusters.
catch22_kmeans_orig_cent <- catch22_kmeans_orig$centers
row.names(catch22_kmeans_orig_cent) <-
    paste0("Cent", seq_len(nrow(catch22_kmeans_orig_cent)))

In [33]:
# Print the labelled cluster centers (one row per center, one column per feature)
catch22_kmeans_orig_cent

Unnamed: 0,DN_HistogramMode_5,DN_HistogramMode_10,CO_f1ecac,CO_HistogramAMI_even_2_5,CO_trev_1_num,MD_hrv_classic_pnn40,SB_BinaryStats_mean_longstretch1,SB_TransitionMatrix_3ac_sumdiagcov,CO_Embed2_Dist_tau_d_expfit_meandiff,IN_AutoMutualInfoStats_40_gaussian_fmmi,FC_LocalSimple_mean1_tauresrat,DN_OutlierInclude_p_001_mdrmd,DN_OutlierInclude_n_001_mdrmd,SP_Summaries_welch_rect_area_5_1,SB_BinaryStats_diff_longstretch0,SB_MotifThree_quantile_hh,SC_FluctAnal_2_rsrangefit_50_1_logi_prop_r1,SC_FluctAnal_2_dfa_50_1_2_logi_prop_r1,SP_Summaries_welch_rect_centroid,FC_LocalSimple_mean3_stderr
Cent1,0.1205032,0.1533184,0.72953216,0.7629098,0.658623,0.4591116,0.7712131,0.3627919,0.32567876,0.7422037,0.008641279,0.3796412,0.5928346,0.8530092,0.37451737,0.2909256,0.5641892,0.6801802,0.002861685,0.265547
Cent2,0.4332344,0.4905539,0.06031402,0.2785821,0.6434783,0.2719642,0.2635659,0.2972055,0.40113183,0.1538462,0.686238532,0.3958622,0.5912265,0.9190752,0.36507937,0.3559023,0.2083333,0.4444444,0.674509804,0.5081063
Cent3,0.287268,0.2605563,0.76679408,0.1267128,0.446417,0.7315333,0.7209302,0.2055811,0.08752028,0.8461538,0.005229358,0.403021,0.4931694,0.2313457,0.05952381,0.7287692,0.75,0.7083333,0.005882353,0.8871061
Cent4,0.1103231,0.1907726,0.57469973,0.8759798,0.6794695,0.280526,0.7174419,0.2498391,0.52647008,0.6692308,0.650918384,0.3378944,0.6045537,0.9692444,0.53809524,0.1620051,0.55625,0.7666667,0.007058824,0.1936715
Cent5,0.8504611,0.4590276,0.61804173,0.7512288,0.6923811,0.5798359,0.8856589,0.1236231,0.22103202,0.6410256,0.081161009,0.3496446,0.5954766,0.7642556,0.4484127,0.4138313,0.4270833,0.5,0.008823529,0.4036425


In [34]:
# Compute the Euclidean distance between each of our new time series and each
# cluster center.
# dist() is run on the centers stacked on top of the new series, so the part of
# interest is: rows = new series (after the centers), columns = centers. The
# indices are derived from the row counts instead of being hard-coded.
catch22_n_cent <- nrow(catch22_kmeans_orig_cent)
catch22_n_obs <- nrow(train_catch22_feat_df_scaled)
as.matrix(dist(rbind(catch22_kmeans_orig_cent, train_catch22_feat_df_scaled)))[
    catch22_n_cent + seq_len(catch22_n_obs),
    seq_len(catch22_n_cent),
    drop = FALSE]

Unnamed: 0,Cent1,Cent2,Cent3,Cent4,Cent5
Obs1,0.2979139,1.617716,1.398811,0.7771913,0.9603046
Obs2,0.4376365,1.840651,1.301638,0.9600075,1.0159148
Obs3,1.1496276,1.409613,1.907151,0.6156497,1.4387502
Obs4,1.0779367,1.750264,2.073672,0.6435282,1.5740351


In [35]:
# Assign each new time series to the closest center.
# The assignment is computed from the distances instead of being transcribed
# by hand from the printed matrix (for the distances printed above this gives
# 1, 1, 4, 4). Ties go to the lower-numbered center.
catch22_dist_to_cent <- as.matrix(
    dist(rbind(catch22_kmeans_orig_cent, train_catch22_feat_df_scaled))
)[nrow(catch22_kmeans_orig_cent) + seq_len(nrow(train_catch22_feat_df_scaled)),
  seq_len(nrow(catch22_kmeans_orig_cent)),
  drop = FALSE]
# as.numeric() also drops the names, so the vector does not carry row labels
# into the data frame built from it later
catch22_clust_assign <- as.numeric(apply(catch22_dist_to_cent, 1, which.min))

# TSFeat Based Feature Computation

In [36]:
# Compute the basic (default) set of tsfeatures for every training time series
# Set scale to FALSE as to not scale the time series before computing the features
ts_feat_list <- tsfeatures(train_samples_ts, scale=FALSE)

In [38]:
# Drop the features with very low variation (these were chosen in notebook 7)
ts_feat_list <- select(ts_feat_list,
                       -frequency,
                       -nperiods,
                       -seasonal_period1,
                       -seasonal_period2)

## Read in Original TSFeat Features

In [39]:
# Read in the TS features that were computed on the original training data
tsfeat_orig <- read_csv("Results/Clustering/KMeans/tsfeat_feat_df.csv")
# Drop the first column of the file
tsfeat_orig <- tsfeat_orig[, -1]

[1m[22mNew names:
[36m•[39m `` -> `...1`
[1mRows: [22m[34m76[39m [1mColumns: [22m[34m21[39m
[36m──[39m [1mColumn specification[22m [36m────────────────────────────────────────────────────────[39m
[1mDelimiter:[22m ","
[32mdbl[39m (21): ...1, trend, spike, linearity, curvature, e_acf1, e_acf10, seasona...

[36mℹ[39m Use `spec()` to retrieve the full column specification for this data.
[36mℹ[39m Specify the column types or set `show_col_types = FALSE` to quiet this message.


### Scale with Original Data

In [41]:
# Create a df to hold the scaled ts features; it starts as a copy of the
# unscaled features and is overwritten column by column in the scaling loop
ts_feat_list_df_scaled <- ts_feat_list

In [42]:
# We use a min max scaling method, with the original data used for scaling the
# new data.
# Features are matched by column name rather than by column position, so a
# difference in column order between the two tables cannot silently scale a
# feature with another feature's range; a missing feature stops the run.
stopifnot(all(names(ts_feat_list_df_scaled) %in% names(tsfeat_orig)))
for (feat in names(ts_feat_list_df_scaled)) {
    # Compute the min and max of the feature from the original data
    min_n <- min(tsfeat_orig[[feat]])
    max_n <- max(tsfeat_orig[[feat]])
    # Scale the new data using the min and max from the original data
    # (values outside the original range end up outside [0, 1])
    ts_feat_list_df_scaled[[feat]] <-
        (ts_feat_list_df_scaled[[feat]] - min_n) / (max_n - min_n)
}

In [43]:
# Add row names "Obs1", "Obs2", ... to the scaled features of the new series.
# The tibble is first converted to a base data frame, because setting row
# names on a tibble is deprecated, and the labels are generated from the
# number of rows instead of being hard-coded for exactly four series.
ts_feat_list_df_scaled <- as.data.frame(ts_feat_list_df_scaled)
row.names(ts_feat_list_df_scaled) <-
    paste0("Obs", seq_len(nrow(ts_feat_list_df_scaled)))

“Setting row names on a tibble is deprecated.”


## Load Original KMeans Cluster Assignments

In [45]:
# Read in the original cluster assignments for the tsfeatures-based clustering
# (the file has an .RData extension but is read with readRDS, i.e. it is
# expected to hold a single serialized object)
tsfeat_kmeans_orig <- readRDS("Results/Clustering/KMeans/kmeans_tsfeat.RData")

In [47]:
# Grab the cluster centers and label them "Cent1", "Cent2", ... by row.
# The labels are generated from the number of centers instead of being
# hard-coded for exactly two clusters.
tsfeat_kmeans_orig_cent <- tsfeat_kmeans_orig$centers
row.names(tsfeat_kmeans_orig_cent) <-
    paste0("Cent", seq_len(nrow(tsfeat_kmeans_orig_cent)))

In [49]:
# Compute the Euclidean distance from each new observation to each center.
# dist() is run on the centers stacked on top of the new series, so the part of
# interest is: rows = new series (after the centers), columns = centers. The
# indices are derived from the row counts instead of being hard-coded.
tsfeat_n_cent <- nrow(tsfeat_kmeans_orig_cent)
tsfeat_n_obs <- nrow(ts_feat_list_df_scaled)
as.matrix(dist(rbind(tsfeat_kmeans_orig_cent, ts_feat_list_df_scaled)))[
    tsfeat_n_cent + seq_len(tsfeat_n_obs),
    seq_len(tsfeat_n_cent),
    drop = FALSE]

Unnamed: 0,Cent1,Cent2
Obs1,0.6146474,1.024686
Obs2,1.1243401,1.154617
Obs3,0.9115104,1.362195
Obs4,1.3177884,1.706205


In [50]:
# Create the cluster assignments for the new data: each new series goes to the
# center it is closest to.
# The assignment is computed from the distances instead of being transcribed
# by hand from the printed matrix (for the distances printed above this gives
# 1, 1, 1, 1). Ties go to the lower-numbered center.
tsfeat_dist_to_cent <- as.matrix(
    dist(rbind(tsfeat_kmeans_orig_cent, ts_feat_list_df_scaled))
)[nrow(tsfeat_kmeans_orig_cent) + seq_len(nrow(ts_feat_list_df_scaled)),
  seq_len(nrow(tsfeat_kmeans_orig_cent)),
  drop = FALSE]
# as.numeric() also drops the names, so the vector does not carry row labels
# into the data frame built from it later
tsfeat_clust_assign <- as.numeric(apply(tsfeat_dist_to_cent, 1, which.min))

# DTW Clustering

## Read in Original Clustering Results

In [51]:
# Read in the original cluster results for k medoids dtw clusters
# (the file has an .RData extension but is read with readRDS, i.e. it is
# expected to hold a single serialized object)
dtw_orig <- readRDS("Results/Clustering/DTW/dtw_clust.RData")

In [52]:
# Print a summary of the loaded DTW clustering object
dtw_orig

partitional clustering with 2 clusters
Using dtw_basic distance
Using pam centroids

Time required for analysis:
    user   system  elapsed 
2264.315    1.126  145.180 

Cluster sizes with average intra-cluster distance:

  size  av_dist
1   25 437202.5
2   51 200953.7

In [53]:
# Extract the centroids from the clusters (the `centroids` slot of the
# clustering object); they are indexed below as dtw_orig_cent[[1]] and
# dtw_orig_cent[[2]]
dtw_orig_cent <- dtw_orig@centroids

### Read in Original Data to Determine which TS are the Centroids

In [54]:
# Read in all files from the Data/Processed directory - these are for the
# original sensors, not the unseen sensors.
# `pattern` is a regular expression, not a shell glob: "\\.csv$" matches only
# file names that end in ".csv". The glob-style "*.csv" is not anchored and
# its dot matches any character, so it could pick up unintended files.
england_fnames <- list.files("Data/Processed/Highways_England/",
                             pattern = "\\.csv$",
                             full.names = TRUE)
england_df_list <- lapply(england_fnames, read_csv)

portland_fnames <- list.files("Data/Processed/Portland/",
                              pattern = "\\.csv$",
                              full.names = TRUE)
portland_df_list <- lapply(portland_fnames, read_csv)

utah_fnames <- list.files("Data/Processed/Utah/",
                          pattern = "\\.csv$",
                          full.names = TRUE)
utah_df_list <- lapply(utah_fnames, read_csv)

[1mRows: [22m[34m35040[39m [1mColumns: [22m[34m10[39m
[36m──[39m [1mColumn specification[22m [36m────────────────────────────────────────────────────────[39m
[1mDelimiter:[22m ","
[31mchr[39m  (1): site_name
[32mdbl[39m  (5): day_of_week, day_of_year, interval_of_day, avg_mph, total_volume
[33mlgl[39m  (2): missing_speed, missing_volume
[34mdttm[39m (1): timestamp
[34mdate[39m (1): date

[36mℹ[39m Use `spec()` to retrieve the full column specification for this data.
[36mℹ[39m Specify the column types or set `show_col_types = FALSE` to quiet this message.
[1mRows: [22m[34m35040[39m [1mColumns: [22m[34m10[39m
[36m──[39m [1mColumn specification[22m [36m────────────────────────────────────────────────────────[39m
[1mDelimiter:[22m ","
[31mchr[39m  (1): site_name
[32mdbl[39m  (5): day_of_week, day_of_year, interval_of_day, avg_mph, total_volume
[33mlgl[39m  (2): missing_speed, missing_volume
[34mdttm[39m (1): timestamp
[34mdate[39m (1)

[1mRows: [22m[34m35040[39m [1mColumns: [22m[34m10[39m
[36m──[39m [1mColumn specification[22m [36m────────────────────────────────────────────────────────[39m
[1mDelimiter:[22m ","
[31mchr[39m  (1): site_name
[32mdbl[39m  (5): day_of_week, day_of_year, interval_of_day, avg_mph, total_volume
[33mlgl[39m  (2): missing_speed, missing_volume
[34mdttm[39m (1): timestamp
[34mdate[39m (1): date

[36mℹ[39m Use `spec()` to retrieve the full column specification for this data.
[36mℹ[39m Specify the column types or set `show_col_types = FALSE` to quiet this message.
[1mRows: [22m[34m35040[39m [1mColumns: [22m[34m10[39m
[36m──[39m [1mColumn specification[22m [36m────────────────────────────────────────────────────────[39m
[1mDelimiter:[22m ","
[31mchr[39m  (1): site_name
[32mdbl[39m  (5): day_of_week, day_of_year, interval_of_day, avg_mph, total_volume
[33mlgl[39m  (2): missing_speed, missing_volume
[34mdttm[39m (1): timestamp
[34mdate[39m (1)

[1mRows: [22m[34m35040[39m [1mColumns: [22m[34m10[39m
[36m──[39m [1mColumn specification[22m [36m────────────────────────────────────────────────────────[39m
[1mDelimiter:[22m ","
[31mchr[39m  (1): site_name
[32mdbl[39m  (5): day_of_week, day_of_year, interval_of_day, avg_mph, total_volume
[33mlgl[39m  (2): missing_speed, missing_volume
[34mdttm[39m (1): timestamp
[34mdate[39m (1): date

[36mℹ[39m Use `spec()` to retrieve the full column specification for this data.
[36mℹ[39m Specify the column types or set `show_col_types = FALSE` to quiet this message.
[1mRows: [22m[34m35040[39m [1mColumns: [22m[34m10[39m
[36m──[39m [1mColumn specification[22m [36m────────────────────────────────────────────────────────[39m
[1mDelimiter:[22m ","
[31mchr[39m  (1): site_name
[32mdbl[39m  (5): day_of_week, day_of_year, interval_of_day, avg_mph, total_volume
[33mlgl[39m  (2): missing_speed, missing_volume
[34mdttm[39m (1): timestamp
[34mdate[39m (1)

[1mRows: [22m[34m35040[39m [1mColumns: [22m[34m10[39m
[36m──[39m [1mColumn specification[22m [36m────────────────────────────────────────────────────────[39m
[1mDelimiter:[22m ","
[31mchr[39m  (1): site_name
[32mdbl[39m  (5): day_of_week, day_of_year, interval_of_day, avg_mph, total_volume
[33mlgl[39m  (2): missing_speed, missing_volume
[34mdttm[39m (1): timestamp
[34mdate[39m (1): date

[36mℹ[39m Use `spec()` to retrieve the full column specification for this data.
[36mℹ[39m Specify the column types or set `show_col_types = FALSE` to quiet this message.
[1mRows: [22m[34m35040[39m [1mColumns: [22m[34m10[39m
[36m──[39m [1mColumn specification[22m [36m────────────────────────────────────────────────────────[39m
[1mDelimiter:[22m ","
[31mchr[39m  (1): site_name
[32mdbl[39m  (5): day_of_week, day_of_year, interval_of_day, avg_mph, total_volume
[33mlgl[39m  (2): missing_speed, missing_volume
[34mdttm[39m (1): timestamp
[34mdate[39m (1)

[1mRows: [22m[34m35040[39m [1mColumns: [22m[34m10[39m
[36m──[39m [1mColumn specification[22m [36m────────────────────────────────────────────────────────[39m
[1mDelimiter:[22m ","
[31mchr[39m  (1): site_name
[32mdbl[39m  (5): day_of_week, day_of_year, interval_of_day, avg_mph, total_volume
[33mlgl[39m  (2): missing_speed, missing_volume
[34mdttm[39m (1): timestamp
[34mdate[39m (1): date

[36mℹ[39m Use `spec()` to retrieve the full column specification for this data.
[36mℹ[39m Specify the column types or set `show_col_types = FALSE` to quiet this message.
[1mRows: [22m[34m35040[39m [1mColumns: [22m[34m10[39m
[36m──[39m [1mColumn specification[22m [36m────────────────────────────────────────────────────────[39m
[1mDelimiter:[22m ","
[31mchr[39m  (1): site_name
[32mdbl[39m  (5): day_of_week, day_of_year, interval_of_day, avg_mph, total_volume
[33mlgl[39m  (2): missing_speed, missing_volume
[34mdttm[39m (1): timestamp
[34mdate[39m (1)

[1mRows: [22m[34m35040[39m [1mColumns: [22m[34m10[39m
[36m──[39m [1mColumn specification[22m [36m────────────────────────────────────────────────────────[39m
[1mDelimiter:[22m ","
[31mchr[39m  (1): site_name
[32mdbl[39m  (5): day_of_week, day_of_year, interval_of_day, avg_mph, total_volume
[33mlgl[39m  (2): missing_speed, missing_volume
[34mdttm[39m (1): timestamp
[34mdate[39m (1): date

[36mℹ[39m Use `spec()` to retrieve the full column specification for this data.
[36mℹ[39m Specify the column types or set `show_col_types = FALSE` to quiet this message.
[1mRows: [22m[34m35040[39m [1mColumns: [22m[34m10[39m
[36m──[39m [1mColumn specification[22m [36m────────────────────────────────────────────────────────[39m
[1mDelimiter:[22m ","
[31mchr[39m  (1): site_name
[32mdbl[39m  (5): day_of_week, day_of_year, interval_of_day, avg_mph, total_volume
[33mlgl[39m  (2): missing_speed, missing_volume
[34mdttm[39m (1): timestamp
[34mdate[39m (1)

In [55]:
# Sort the rows of every original data frame chronologically by timestamp
england_df_list <- lapply(england_df_list, function(df) arrange(df, timestamp))
portland_df_list <- lapply(portland_df_list, function(df) arrange(df, timestamp))
utah_df_list <- lapply(utah_df_list, function(df) arrange(df, timestamp))

In [56]:
# Combine the three lists into one: England first, then Portland, then Utah
total_df_list_orig <- c(england_df_list, portland_df_list, utah_df_list)

In [57]:
# Read in the start and end points of each original time series from csv.
# Note that this overwrites the starting_points / ending_points that were
# drawn for the unseen sensors earlier in this notebook.
start_end_orig <- read_csv("start_end_points.csv")
starting_points <- start_end_orig[["start"]]
ending_points <- start_end_orig[["end"]]

[1mRows: [22m[34m76[39m [1mColumns: [22m[34m2[39m
[36m──[39m [1mColumn specification[22m [36m────────────────────────────────────────────────────────[39m
[1mDelimiter:[22m ","
[32mdbl[39m (2): start, end

[36mℹ[39m Use `spec()` to retrieve the full column specification for this data.
[36mℹ[39m Specify the column types or set `show_col_types = FALSE` to quiet this message.


In [58]:
# Number the rows of every original data frame in a column called 'rn'; the
# sampled windows are cut out by comparing 'rn' with starting_points/ending_points
total_df_list_orig <- lapply(total_df_list_orig,
                             function(df) mutate(df, rn = row_number()))

In [59]:
# Cut the 12-week sample out of each original data frame: keep the rows whose
# row number lies between that data frame's start and end point (both
# inclusive).
# seq_along() is used instead of 1:length() so that an empty list gives an
# empty result rather than iterating over c(1, 0).
total_df_list_orig_samples <- lapply(seq_along(total_df_list_orig),
                                     function(i) total_df_list_orig[[i]] %>%
                                         filter(rn >= starting_points[i],
                                                rn <= ending_points[i]))

In [60]:
# Label every row of each original sample as "train", "val" or "test". Row
# numbers are restarted at 1 within the sample; the first 8 weeks of rows are
# train, the following 2 weeks are val and the remaining rows are test.
# Depending on the modeling task, train and val may both be used for training.
total_df_list_orig_samples <- lapply(total_df_list_orig_samples, function(df) {
    # Number of rows in one week of data
    rows_per_week <- 96 * 7
    df %>%
        mutate(rn = row_number(),
               train_val_test = ifelse(rn <= rows_per_week * 8,
                                       "train",
                                       ifelse(rn <= rows_per_week * 10,
                                              "val",
                                              "test")))
})

In [61]:
# Keep only the rows labelled "train" in each original sample
train_samples_orig <- lapply(total_df_list_orig_samples,
                             function(df) filter(df, train_val_test == "train"))

In [62]:
# Turn the target field (total_volume) of each original train data frame into
# an msts object with seasonal periods of 24*4 and 24*4*7 observations
train_samples_ts_orig <- lapply(train_samples_orig, function(df) {
    msts(df$total_volume, seasonal.periods = c(24 * 4, 24 * 4 * 7))
})

In [63]:
# Find which original training series are the two DTW cluster centroids.

# Return the index of the series in `series_list` whose values all equal those
# of `centroid`.
#   series_list: list of numeric time series
#   centroid:    the time series to look for
# Stops with an error when no series matches, rather than leaving the index
# undefined. When several series match, the last one is returned (the
# original loop kept overwriting earlier matches).
find_centroid_index <- function(series_list, centroid) {
    is_match <- vapply(series_list, function(s) {
        # Series of different length cannot be equal; checking the length
        # first also avoids comparing with recycled values. The values are
        # compared as plain numeric vectors, and isTRUE() treats a comparison
        # involving NA as "no match".
        length(s) == length(centroid) &&
            isTRUE(all(as.numeric(s) == as.numeric(centroid)))
    }, logical(1))
    if (!any(is_match)) {
        stop("No time series matches the given centroid", call. = FALSE)
    }
    max(which(is_match))
}

# Index of the series that is the centroid of cluster 1 and of cluster 2
centroid_1 <- find_centroid_index(train_samples_ts_orig, dtw_orig_cent[[1]])
centroid_2 <- find_centroid_index(train_samples_ts_orig, dtw_orig_cent[[2]])

In [64]:
# Print the index (within train_samples_ts_orig) of the centroid of cluster 1
centroid_1

In [65]:
# Print the index (within train_samples_ts_orig) of the centroid of cluster 2
centroid_2

## Compute Distance to Centroids

In [66]:
# Compute the DTW distance of each new time series to the cluster centroid for both of the original clusters
# We use the dtw_basic function to match the way distance was computed when the clusters were created
# (dtw_basic is called with its default arguments; the result for new series i
# and centroid j is stored in dist_obs_i_cent_j)

# New series 1
dist_obs_1_cent_1 <- dtw_basic(train_samples_ts[[1]], train_samples_ts_orig[[centroid_1]])
dist_obs_1_cent_2 <- dtw_basic(train_samples_ts[[1]], train_samples_ts_orig[[centroid_2]])

# New series 2
dist_obs_2_cent_1 <- dtw_basic(train_samples_ts[[2]], train_samples_ts_orig[[centroid_1]])
dist_obs_2_cent_2 <- dtw_basic(train_samples_ts[[2]], train_samples_ts_orig[[centroid_2]])

# New series 3
dist_obs_3_cent_1 <- dtw_basic(train_samples_ts[[3]], train_samples_ts_orig[[centroid_1]])
dist_obs_3_cent_2 <- dtw_basic(train_samples_ts[[3]], train_samples_ts_orig[[centroid_2]])

# New series 4
dist_obs_4_cent_1 <- dtw_basic(train_samples_ts[[4]], train_samples_ts_orig[[centroid_1]])
dist_obs_4_cent_2 <- dtw_basic(train_samples_ts[[4]], train_samples_ts_orig[[centroid_2]])

### Observation 1

In [67]:
# Print the DTW distance from new series 1 to the centroid of cluster 1
dist_obs_1_cent_1

In [68]:
# Print the DTW distance from new series 1 to the centroid of cluster 2
dist_obs_1_cent_2

In [69]:
# Smaller of the two distances for new series 1 (distance to its nearest centroid)
min(dist_obs_1_cent_1, dist_obs_1_cent_2)

### Observation 2

In [70]:
# Print the DTW distance from new series 2 to the centroid of cluster 1
dist_obs_2_cent_1

In [71]:
# Print the DTW distance from new series 2 to the centroid of cluster 2
dist_obs_2_cent_2

In [72]:
# Smaller of the two distances for new series 2 (distance to its nearest centroid)
min(dist_obs_2_cent_1, dist_obs_2_cent_2)

### Observation 3

In [73]:
# Print the DTW distance from new series 3 to the centroid of cluster 1
dist_obs_3_cent_1

In [74]:
# Print the DTW distance from new series 3 to the centroid of cluster 2
dist_obs_3_cent_2

In [75]:
# Smaller of the two distances for new series 3 (distance to its nearest centroid)
min(dist_obs_3_cent_1, dist_obs_3_cent_2)

### Observation 4

In [76]:
# Print the DTW distance from new series 4 to the centroid of cluster 1
dist_obs_4_cent_1

In [77]:
# Print the DTW distance from new series 4 to the centroid of cluster 2
dist_obs_4_cent_2

In [78]:
# Smaller of the two distances for new series 4 (distance to its nearest centroid)
min(dist_obs_4_cent_1, dist_obs_4_cent_2)

In [79]:
# Create the cluster assignments: each new time series goes to whichever
# centroid is closer (cluster 1 when the two distances are tied).
# The assignment is computed from the distances instead of being transcribed
# by hand from the printed results.
dtw_dist_cent_1 <- c(dist_obs_1_cent_1, dist_obs_2_cent_1,
                     dist_obs_3_cent_1, dist_obs_4_cent_1)
dtw_dist_cent_2 <- c(dist_obs_1_cent_2, dist_obs_2_cent_2,
                     dist_obs_3_cent_2, dist_obs_4_cent_2)
dtw_clust_assign <- ifelse(dtw_dist_cent_1 <= dtw_dist_cent_2, 1, 2)

# Save Results

In [80]:
# Collect the cluster assignments in one data frame: one row per new time
# series, one column per clustering method
new_clust_assign_df <- data.frame(
    rand = rand_clust_assign,
    catch22 = catch22_clust_assign,
    tsfeat = tsfeat_clust_assign,
    dtw = dtw_clust_assign
)

In [81]:
# Save the data frame to a csv file inside the results folder.
# The path is built from res_folder1 (the folder created at the top of this
# notebook) instead of repeating the folder name; res_folder1 already ends in
# "/", so the resulting path is "Results/Unseen Sensor/clust_assign.csv".
write.csv(new_clust_assign_df, paste0(res_folder1, "clust_assign.csv"))