In [1]:
# Remove every object returned by ls() so the notebook starts from an empty workspace.
# NOTE(review): this also deletes anything else defined in the session running this notebook.
rm(list=ls())

In [2]:
# install.packages("tidyverse") # sagemaker has
library(tidyverse)

# install.packages("cluster") # sagemaker has
library(cluster)

# install.packages("tsfeatures") # sagemaker doesn't have
library(tsfeatures)

# install.packages("Rcatch22") # sagemaker doesn't have
library(Rcatch22)

# install.packages("tseries") # sagemaker doesn't have
library(tseries)

# install.packages("factoextra") # sagemaker doesn't have
library(factoextra)

# install.packages("forecast") # sagemaker doesn't have
library(forecast)

# install.packages("dtwclust") # sagemaker doesn't have
library(dtwclust)

“‘timedatectl’ indicates the non-existent timezone name ‘n/a’”
“Your system is mis-configured: ‘/etc/localtime’ is not a symlink”
“It is strongly recommended to set envionment variable TZ to ‘Etc/UCT’ (or equivalent)”
── [1mAttaching packages[22m ─────────────────────────────────────── tidyverse 1.3.1 ──

[32m✔[39m [34mggplot2[39m 3.3.5     [32m✔[39m [34mpurrr  [39m 0.3.4
[32m✔[39m [34mtibble [39m 3.1.6     [32m✔[39m [34mdplyr  [39m 1.0.8
[32m✔[39m [34mtidyr  [39m 1.2.0     [32m✔[39m [34mstringr[39m 1.4.0
[32m✔[39m [34mreadr  [39m 2.1.2     [32m✔[39m [34mforcats[39m 0.5.1

── [1mConflicts[22m ────────────────────────────────────────── tidyverse_conflicts() ──
[31m✖[39m [34mdplyr[39m::[32mfilter()[39m masks [34mstats[39m::filter()
[31m✖[39m [34mdplyr[39m::[32mlag()[39m    masks [34mstats[39m::lag()

Registered S3 method overwritten by 'quantmod':
  method            from
  as.zoo.data.frame zoo 

Welcome! Want to learn more? See two fa

# Read In Data

In [3]:
# List the processed CSV files for the unseen sensors.
# `pattern` is a regular expression, not a shell glob, so match a literal ".csv" suffix;
# the previous "*.csv" would also have picked up names that merely contain "csv".
fnames <- list.files("Data/Unseen Sensor/Processed/", pattern="\\.csv$", full.names=TRUE)
# Read each file into its own data frame (one list element per file)
total_df_list <- lapply(fnames, read_csv)

[1mRows: [22m[34m35040[39m [1mColumns: [22m[34m10[39m
[36m──[39m [1mColumn specification[22m [36m────────────────────────────────────────────────────────[39m
[1mDelimiter:[22m ","
[31mchr[39m  (1): site_name
[32mdbl[39m  (5): day_of_week, day_of_year, interval_of_day, avg_mph, total_volume
[33mlgl[39m  (2): missing_speed, missing_volume
[34mdttm[39m (1): timestamp
[34mdate[39m (1): date

[36mℹ[39m Use `spec()` to retrieve the full column specification for this data.
[36mℹ[39m Specify the column types or set `show_col_types = FALSE` to quiet this message.
[1mRows: [22m[34m35040[39m [1mColumns: [22m[34m10[39m
[36m──[39m [1mColumn specification[22m [36m────────────────────────────────────────────────────────[39m
[1mDelimiter:[22m ","
[31mchr[39m  (1): site_name
[32mdbl[39m  (5): day_of_week, day_of_year, interval_of_day, avg_mph, total_volume
[33mlgl[39m  (2): missing_speed, missing_volume
[34mdttm[39m (1): timestamp
[34mdate[39m (1)

In [4]:
# Sanity check: number of data frames that were read in (one list element per CSV file)
length(total_df_list)

In [5]:
# Put the rows of every data frame in ascending timestamp order
total_df_list <- lapply(total_df_list, function(df) arrange(df, timestamp))

In [6]:
# Draw a random starting row for the 12-week window that is cut out of each data frame

# Fix the RNG seed so the draw is reproducible
set.seed(12345)

# Draw without replacement so that every time series gets a different starting row.
# The upper bound (96*365)-(96*12*7) leaves room for a complete 12-week window after any start
starting_points <- sample.int((96*365)-(96*12*7),
                              size=length(total_df_list),
                              replace=FALSE)

# Last row of each window: 12 weeks (96*7*12 rows) counted from and including the starting row
ending_points <- (starting_points - 1) + (96*7*12)

In [7]:
# Number the rows of every data frame in a column called 'rn'; the starting_points and
# ending_points windows are expressed in terms of this row index
total_df_list <- lapply(total_df_list, function(df) mutate(df, rn = row_number()))

In [8]:
# Cut the 12-week sample out of every data frame: keep the rows whose 'rn' lies between that
# frame's starting and ending point (both inclusive). These samples are used for modeling and testing.
# seq_along() replaces 1:length() so that an empty list gives an empty result instead of
# trying to index elements 1 and 0
total_df_list_samples <- lapply(seq_along(total_df_list),
                                function(i) total_df_list[[i]] %>%
                                    filter(rn >= starting_points[i],
                                           rn <= ending_points[i]))

In [9]:
# Label every row of each 12-week sample as train, val or test in a 'train_val_test' column.
# 'rn' is first renumbered within the sample; the first 8 weeks are train, the following 2 weeks
# are val and the last 2 weeks are test. Depending on the modeling task, train and val may both
# be used for training
total_df_list_samples <- lapply(total_df_list_samples, function(df) {
    df %>%
        mutate(rn = row_number(),
               train_val_test = case_when(rn <= (96*7*8) ~ "train",
                                          rn <= (96*7*10) ~ "val",
                                          TRUE ~ "test"))
})

In [10]:
# Keep everything except the test rows of each sample (i.e. the train and val rows)
train_val_samples <- lapply(total_df_list_samples,
                            function(df) filter(df, train_val_test != "test"))

In [11]:
# Keep only the train rows of each sample
train_samples <- lapply(total_df_list_samples,
                        function(df) filter(df, train_val_test == "train"))

In [12]:
# Keep only the test rows of each sample
test_samples <- lapply(total_df_list_samples,
                       function(df) filter(df, train_val_test == "test"))

In [13]:
# Turn the total_volume column of each train+val data frame into an msts object with two
# seasonal periods: 24*4 and 24*4*7 observations
train_val_samples_ts <- lapply(train_val_samples, function(df) {
    msts(df$total_volume, seasonal.periods=c(24*4, 24*4*7))
})

In [14]:
# Turn the total_volume column of each train data frame into an msts object with two
# seasonal periods: 24*4 and 24*4*7 observations
train_samples_ts <- lapply(train_samples, function(df) {
    msts(df$total_volume, seasonal.periods=c(24*4, 24*4*7))
})

In [15]:
# Turn the total_volume column of each test data frame into an msts object with two
# seasonal periods: 24*4 and 24*4*7 observations
test_samples_ts <- lapply(test_samples, function(df) {
    msts(df$total_volume, seasonal.periods=c(24*4, 24*4*7))
})

# Random Cluster Assignments

In [16]:
# Upper bound for the random cluster labels: the next cell draws one label from 1:total_clust
# for every series.
# NOTE(review): the original comment described this as the total number of time series in the
# data set - confirm which meaning is intended
total_clust <- 4

In [17]:
# Baseline: give every training series a random cluster label between 1 and total_clust.
# The seed is reset to the series index before each draw, so a series' label depends only on
# its position in the list.
# vapply() over seq_along() replaces growing a vector inside a 1:length() loop, which would
# have iterated over 1 and 0 for an empty list
rand_clust_assign <- vapply(seq_along(train_samples_ts), function(ts_no) {
    set.seed(ts_no)
    sample.int(total_clust, 1L)
}, integer(1))

In [18]:
rand_clust_assign

# Catch22 Based Feature Computation

In [19]:
# Compute the Catch22 features of every training series with catch22_all() and reshape the
# long (names, values) result into a single-row data frame with one column per feature
train_catch22_feat <- lapply(train_samples_ts, function(series) {
    long_feat <- catch22_all(series)
    pivot_wider(long_feat, names_from=names, values_from=values)
})

“As of 0.1.14 the feature 'CO_f1ecac' returns a double instead of int


In [20]:
# Stack the per-series feature rows into one table and drop two of the Catch22 features.
# NOTE(review): presumably these two are dropped to match the feature set of the original
# clustering - confirm
train_catch22_feat_df <- select(do.call("rbind", train_catch22_feat),
                                -c('CO_FirstMin_ac', 'PD_PeriodicityWang_th0_01'))
# Display the resulting feature table
train_catch22_feat_df

DN_HistogramMode_5,DN_HistogramMode_10,CO_f1ecac,CO_HistogramAMI_even_2_5,CO_trev_1_num,MD_hrv_classic_pnn40,SB_BinaryStats_mean_longstretch1,SB_TransitionMatrix_3ac_sumdiagcov,CO_Embed2_Dist_tau_d_expfit_meandiff,IN_AutoMutualInfoStats_40_gaussian_fmmi,FC_LocalSimple_mean1_tauresrat,DN_OutlierInclude_p_001_mdrmd,DN_OutlierInclude_n_001_mdrmd,SP_Summaries_welch_rect_area_5_1,SB_BinaryStats_diff_longstretch0,SB_MotifThree_quantile_hh,SC_FluctAnal_2_rsrangefit_50_1_logi_prop_r1,SC_FluctAnal_2_dfa_50_1_2_logi_prop_r1,SP_Summaries_welch_rect_centroid,FC_LocalSimple_mean3_stderr
<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
-0.772714,-1.036745,15.8332,0.6770578,0.002529417,0.7529302,56,0.010604141,0.4183068,23,0.04166667,0.02139137,-0.0167410714,0.9845822,16,1.489225,0.4285714,0.3469388,0.06519418,0.2659134
-0.9985984,-1.195337,18.41606,0.7576623,0.010028003,0.7962791,59,0.006165495,0.3933663,23,0.04,0.07831101,-0.001860119,0.9662259,14,1.573515,0.4285714,0.3469388,0.06519418,0.2833783
-1.2165974,-1.413858,13.28107,0.7864801,0.007527869,0.7672558,61,0.006949684,0.5943701,18,0.52631579,0.07477679,0.0005580357,0.9804185,15,1.442368,0.4081633,0.3265306,0.06596117,0.3063198
-1.1381137,-1.313092,16.05872,0.8409939,0.006194725,0.679814,57,0.021630571,0.6649224,22,0.41666667,-0.01199777,0.0013020833,0.986233,22,1.438768,0.4285714,0.3469388,0.06519418,0.25739


## Read in Catch22 Feature Values for Original Data

In [21]:
# Load the Catch22 feature table of the original data and drop its first column
# (an unnamed column that read_csv labels `...1`)
catch22_orig <- read_csv("Results/Clustering/KMeans/catch22_feat_df.csv") %>%
    select(-1)

[1m[22mNew names:
[36m•[39m `` -> `...1`
[1mRows: [22m[34m76[39m [1mColumns: [22m[34m21[39m
[36m──[39m [1mColumn specification[22m [36m────────────────────────────────────────────────────────[39m
[1mDelimiter:[22m ","
[32mdbl[39m (21): ...1, DN_HistogramMode_5, DN_HistogramMode_10, CO_f1ecac, CO_Histo...

[36mℹ[39m Use `spec()` to retrieve the full column specification for this data.
[36mℹ[39m Specify the column types or set `show_col_types = FALSE` to quiet this message.


In [22]:
head(catch22_orig)

DN_HistogramMode_5,DN_HistogramMode_10,CO_f1ecac,CO_HistogramAMI_even_2_5,CO_trev_1_num,MD_hrv_classic_pnn40,SB_BinaryStats_mean_longstretch1,SB_TransitionMatrix_3ac_sumdiagcov,CO_Embed2_Dist_tau_d_expfit_meandiff,IN_AutoMutualInfoStats_40_gaussian_fmmi,FC_LocalSimple_mean1_tauresrat,DN_OutlierInclude_p_001_mdrmd,DN_OutlierInclude_n_001_mdrmd,SP_Summaries_welch_rect_area_5_1,SB_BinaryStats_diff_longstretch0,SB_MotifThree_quantile_hh,SC_FluctAnal_2_rsrangefit_50_1_logi_prop_r1,SC_FluctAnal_2_dfa_50_1_2_logi_prop_r1,SP_Summaries_welch_rect_centroid,FC_LocalSimple_mean3_stderr
<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
-0.8241913,-1.074272,13.07819,0.7280227,0.0004981483,0.7266977,56,0.005869537,0.5152812,20,0.38095238,0.012741815,-0.022693452,0.987313,19,1.478522,0.4081633,0.3469388,0.06596117,0.2855363
-1.0418788,-1.261398,15.21823,0.7337985,0.0041718968,0.7942326,61,0.001731474,0.4839471,22,0.04347826,0.019717262,0.002046131,0.9784537,13,1.504746,0.4489796,0.3265306,0.06596117,0.2974666
-0.8426436,-1.071948,16.4904,0.7523444,-0.0013779902,0.728186,59,0.016181035,0.4878535,23,0.04166667,-0.003348214,-0.018973214,0.9842704,15,1.515349,0.4081633,0.3469388,0.06519418,0.2538126
-0.8638443,-1.082631,14.80505,0.7816261,0.0027961909,0.6716279,56,0.005071623,0.4337241,21,0.43478261,-0.03218006,0.019717262,0.9871433,20,1.431369,0.4285714,0.3469388,0.06596117,0.2678449
-0.9568506,-1.136406,18.01842,0.8454878,0.0006537947,0.6874419,54,0.002624842,0.8154355,23,0.88,-0.011904762,0.007068452,0.9923321,18,1.40981,0.4285714,0.3265306,0.06519418,0.2070834
-1.0182364,-1.220609,17.61995,0.8269746,0.0042333027,0.7337674,61,0.00182441,0.3391418,23,0.04,-0.026041667,0.017671131,0.9882751,23,1.439875,0.4285714,0.3469388,0.06519418,0.2261327


### Scale New Data with Original Scaler Values

In [23]:
# Start the scaled table as a copy so the unscaled features in train_catch22_feat_df are kept
train_catch22_feat_df_scaled <- train_catch22_feat_df

In [24]:
# Min-max scale every new feature with the minimum and maximum of the same feature in the
# original data (catch22_orig), so new and original observations share one scale.
# Features are paired by column name instead of by position, so a difference in column order
# between the two tables can no longer apply the wrong scaler silently
stopifnot(all(names(train_catch22_feat_df_scaled) %in% names(catch22_orig)))
for (feat in names(train_catch22_feat_df_scaled)) {
    min_n <- min(catch22_orig[[feat]])
    max_n <- max(catch22_orig[[feat]])
    train_catch22_feat_df_scaled[[feat]] <- (train_catch22_feat_df_scaled[[feat]] - min_n)/(max_n - min_n)
}

In [26]:
# Label the observations Obs1, Obs2, ... so they can be told apart in the distance table.
# Setting row names on a tibble is deprecated, so convert to a plain data frame first; the
# labels are generated from the row count instead of being hard-coded for four rows
train_catch22_feat_df_scaled <- as.data.frame(train_catch22_feat_df_scaled)
row.names(train_catch22_feat_df_scaled) <- paste0("Obs", seq_len(nrow(train_catch22_feat_df_scaled)))

“Setting row names on a tibble is deprecated.”


In [27]:
train_catch22_feat_df_scaled

Unnamed: 0_level_0,DN_HistogramMode_5,DN_HistogramMode_10,CO_f1ecac,CO_HistogramAMI_even_2_5,CO_trev_1_num,MD_hrv_classic_pnn40,SB_BinaryStats_mean_longstretch1,SB_TransitionMatrix_3ac_sumdiagcov,CO_Embed2_Dist_tau_d_expfit_meandiff,IN_AutoMutualInfoStats_40_gaussian_fmmi,FC_LocalSimple_mean1_tauresrat,DN_OutlierInclude_p_001_mdrmd,DN_OutlierInclude_n_001_mdrmd,SP_Summaries_welch_rect_area_5_1,SB_BinaryStats_diff_longstretch0,SB_MotifThree_quantile_hh,SC_FluctAnal_2_rsrangefit_50_1_logi_prop_r1,SC_FluctAnal_2_dfa_50_1_2_logi_prop_r1,SP_Summaries_welch_rect_centroid,FC_LocalSimple_mean3_stderr
Unnamed: 0_level_1,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
Obs1,0.19399609,0.18380286,0.3976808,0.7606972,0.02414153,0.6395958,0.673913,0.1262771,0.3267886,0.5526316,0.02019194,0.4354647,0.6488925,0.9391341,0.4761905,0.4798353,0.625,0.6666667,0.02298851,0.1945025
Obs2,0.10632181,0.12125817,0.563503,0.8512591,0.03147398,0.7267964,0.7391304,0.07168181,0.305871,0.5526316,0.01824401,0.5082006,0.7145201,0.7949669,0.3809524,0.6321259,0.625,0.6666667,0.02298851,0.2522444
Obs3,0.02170813,0.03507919,0.2338317,0.8836369,0.02902924,0.6684132,0.7826087,0.08132733,0.474453,0.4210526,0.58662906,0.5036843,0.7251846,0.9064332,0.4285714,0.3951763,0.5,0.5555556,0.03448276,0.3280931
Obs4,0.05217063,0.07481877,0.4121594,0.944885,0.02772563,0.492515,0.6956522,0.26190205,0.5336251,0.5263158,0.45847586,0.3927977,0.728466,0.9520989,0.7619048,0.3886728,0.625,0.6666667,0.02298851,0.1663225


## Load Original KMeans Cluster Assignments

In [28]:
# Load the clustering object fitted on the Catch22 features of the original data; its
# $cluster and $centers components are used below.
# NOTE(review): the file has an .RData extension but is read with readRDS(), so it is
# presumably a single object written with saveRDS() - confirm
catch22_kmeans_orig <- readRDS("Results/Clustering/KMeans/kmeans_catch22.RData")

In [29]:
table(catch22_kmeans_orig$cluster)


 1  2  3 
52  9 15 

In [30]:
# Extract the cluster centroids (one row per cluster) and label the rows Cent1, Cent2, ...
# The labels are generated from the number of centroids rather than hard-coded for three
catch22_kmeans_orig_cent <- catch22_kmeans_orig$centers
row.names(catch22_kmeans_orig_cent) <- paste0("Cent", seq_len(nrow(catch22_kmeans_orig_cent)))

In [31]:
catch22_kmeans_orig_cent

Unnamed: 0,DN_HistogramMode_5,DN_HistogramMode_10,CO_f1ecac,CO_HistogramAMI_even_2_5,CO_trev_1_num,MD_hrv_classic_pnn40,SB_BinaryStats_mean_longstretch1,SB_TransitionMatrix_3ac_sumdiagcov,CO_Embed2_Dist_tau_d_expfit_meandiff,IN_AutoMutualInfoStats_40_gaussian_fmmi,FC_LocalSimple_mean1_tauresrat,DN_OutlierInclude_p_001_mdrmd,DN_OutlierInclude_n_001_mdrmd,SP_Summaries_welch_rect_area_5_1,SB_BinaryStats_diff_longstretch0,SB_MotifThree_quantile_hh,SC_FluctAnal_2_rsrangefit_50_1_logi_prop_r1,SC_FluctAnal_2_dfa_50_1_2_logi_prop_r1,SP_Summaries_welch_rect_centroid,FC_LocalSimple_mean3_stderr
Cent1,0.1262016,0.1576786,0.4256936,0.8661123,0.02365651,0.5966145,0.7178094,0.12274245,0.4430532,0.5394737,0.2082906,0.4299245,0.7087067,0.9182354,0.4844322,0.4483842,0.5817308,0.6282051,0.02630416,0.208431
Cent2,0.6832152,0.7525918,0.2837998,0.7432119,0.02467471,0.6420908,0.6642512,0.05759401,0.3577419,0.4590643,0.30219245,0.4218498,0.7238629,0.8692001,0.4761905,0.5455393,0.4166667,0.4938272,0.24648787,0.3401096
Cent3,0.3724058,0.2298912,0.390747,0.5867222,0.08717953,0.7977046,0.7652174,0.07337093,0.2327335,0.5,0.02090625,0.4654782,0.6596937,0.5637666,0.2666667,0.6810106,0.55,0.4074074,0.02988506,0.5742908


In [33]:
# Pairwise distances (dist() defaults) between the stacked centroids and new observations.
# Rows 1-3 of the stacked table are the centroids and rows 4-7 the observations, so
# [4:7, 1:3] keeps only the observation-to-centroid distances
as.matrix(dist(rbind(catch22_kmeans_orig_cent, train_catch22_feat_df_scaled)))[4:7, 1:3]

Unnamed: 0,Cent1,Cent2,Cent3
Obs1,0.2805229,0.9108395,0.747938
Obs2,0.4089266,1.0425763,0.6706
Obs3,0.5200594,1.0838326,0.9975239
Obs4,0.4571881,1.1433028,1.1857291


In [34]:
# Assign every new observation to its nearest Catch22 centroid.
# The assignment is computed from the observation-to-centroid distances instead of being
# typed in by hand from the printed distance table, so it stays correct if the data changes.
# For the distances shown above this yields 1, 1, 1, 1
catch22_clust_assign <- local({
    n_cent <- nrow(catch22_kmeans_orig_cent)
    all_dist <- as.matrix(dist(rbind(catch22_kmeans_orig_cent, train_catch22_feat_df_scaled)))
    # Rows after the centroids are the observations; the first n_cent columns are the centroids
    obs_to_cent <- all_dist[-seq_len(n_cent), seq_len(n_cent), drop=FALSE]
    unname(apply(obs_to_cent, 1, which.min))
})

# TSFeat Based Feature Computation

In [35]:
# Compute the default set of tsfeatures for every training series.
# scale=FALSE so that the series are not scaled before the features are computed
ts_feat_list <- tsfeatures(train_samples_ts, scale=FALSE)

In [36]:
# Sanity check and inspect the summary
head(ts_feat_list)

frequency,nperiods,seasonal_period1,seasonal_period2,trend,spike,linearity,curvature,e_acf1,e_acf10,⋯,trough1,trough2,entropy,x_acf1,x_acf10,diff1_acf1,diff1_acf10,diff2_acf1,diff2_acf10,seas_acf1
<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,⋯,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
672,2,96,672,0.06110177,0.14095648,125.4385,163.26377,0.5938258,1.6655134,⋯,67,469,0.6207966,0.9771548,6.773005,-0.09996172,0.1278525,-0.6633647,0.5084676,0.8386451
672,2,96,672,0.0382038,0.01261537,156.6864,-55.46971,0.280905,0.4961752,⋯,83,299,0.5584588,0.9545693,7.398597,-0.4142187,0.19337,-0.6814363,0.5102978,0.8301033
672,2,96,672,0.04619973,2.3975955,492.9979,-275.78563,0.7666151,2.3408486,⋯,91,10,0.6984848,0.9754772,6.032898,0.14112589,0.1056201,-0.5399201,0.3020198,0.8206214
672,2,96,672,0.04285456,38.39305242,-103.9417,369.49427,0.7217817,1.6138534,⋯,3,500,0.6246713,0.9833059,6.925825,0.20475658,0.1525677,-0.4673074,0.2371622,0.8477111


In [37]:
# Drop the features with very low variation (they take the same value for every series)
ts_feat_list <- select(ts_feat_list,
                       -c(frequency, nperiods, seasonal_period1, seasonal_period2))

## Read in Original TSFeat Features

In [38]:
# Load the tsfeatures table of the original data and drop its first column
# (an unnamed column that read_csv labels `...1`)
tsfeat_orig <- read_csv("Results/Clustering/KMeans/tsfeat_feat_df.csv") %>%
    select(-1)

[1m[22mNew names:
[36m•[39m `` -> `...1`
[1mRows: [22m[34m76[39m [1mColumns: [22m[34m21[39m
[36m──[39m [1mColumn specification[22m [36m────────────────────────────────────────────────────────[39m
[1mDelimiter:[22m ","
[32mdbl[39m (21): ...1, trend, spike, linearity, curvature, e_acf1, e_acf10, seasona...

[36mℹ[39m Use `spec()` to retrieve the full column specification for this data.
[36mℹ[39m Specify the column types or set `show_col_types = FALSE` to quiet this message.


In [39]:
head(tsfeat_orig)

trend,spike,linearity,curvature,e_acf1,e_acf10,seasonal_strength1,seasonal_strength2,peak1,peak2,trough1,trough2,entropy,x_acf1,x_acf10,diff1_acf1,diff1_acf10,diff2_acf1,diff2_acf10,seas_acf1
<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
0.11873802,0.06236472,94.4668,293.7841,0.3946537,0.6452961,0.9835165,0.9349537,36,36,11,609,0.6782436,0.9786684,6.071303,0.09955463,0.22923628,-0.6079489,0.3943768,0.8540087
0.06271599,0.04976898,47.65461,156.29639,0.4293463,0.397322,0.9710983,0.8199788,59,349,5,500,0.6536231,0.9701665,6.374971,-0.09903519,0.07000481,-0.6268296,0.4218419,0.8425866
0.03671964,0.15966406,-131.59435,59.98838,0.6731399,1.8163552,0.9656386,0.7743938,94,182,28,282,0.6275385,0.9787455,7.026316,-0.05976278,0.09157036,-0.603806,0.381703,0.8231161
0.08678362,3.59837926,-665.51716,-40.91422,0.4426123,0.4621727,0.9815013,0.9112818,78,463,56,173,0.6499062,0.9801411,6.558471,0.04096415,0.16818353,-0.6292484,0.4389144,0.8525582
0.18428037,0.21968109,-432.29089,420.92339,0.4280693,0.5778573,0.9894138,0.915057,69,531,31,435,0.5140402,0.9878141,7.655158,0.01152159,0.24774998,-0.6334273,0.4354897,0.8565159
0.18287555,0.79600912,-630.6672,52.72775,0.5222042,0.7548893,0.9805565,0.8862463,27,612,64,88,0.5680445,0.9834137,7.483912,-0.06527288,0.13048659,-0.6088053,0.3851671,0.8532984


### Scale with Original Data

In [40]:
# The features are scaled to the unit interval to be consistent with how the Catch22 features
# were used. Start from a copy so the unscaled values in ts_feat_list are kept
ts_feat_list_df_scaled <- ts_feat_list

In [41]:
# Min-max scale every new feature with the minimum and range of the same feature in the
# original data (tsfeat_orig), so new and original observations share one scale.
# Features are paired by column name instead of by position, so a difference in column order
# between the two tables can no longer apply the wrong scaler silently
stopifnot(all(names(ts_feat_list_df_scaled) %in% names(tsfeat_orig)))
for (feat in names(ts_feat_list_df_scaled)) {
    min_n <- min(tsfeat_orig[[feat]])
    max_n <- max(tsfeat_orig[[feat]])
    ts_feat_list_df_scaled[[feat]] <- (ts_feat_list_df_scaled[[feat]] - min_n)/(max_n - min_n)
}

In [42]:
# Label the observations Obs1, Obs2, ... so they can be told apart in the distance table.
# Setting row names on a tibble is deprecated, so convert to a plain data frame first; the
# labels are generated from the row count instead of being hard-coded for four rows
ts_feat_list_df_scaled <- as.data.frame(ts_feat_list_df_scaled)
row.names(ts_feat_list_df_scaled) <- paste0("Obs", seq_len(nrow(ts_feat_list_df_scaled)))

“Setting row names on a tibble is deprecated.”


In [43]:
ts_feat_list_df_scaled

Unnamed: 0_level_0,trend,spike,linearity,curvature,e_acf1,e_acf10,seasonal_strength1,seasonal_strength2,peak1,peak2,trough1,trough2,entropy,x_acf1,x_acf10,diff1_acf1,diff1_acf10,diff2_acf1,diff2_acf10,seas_acf1
Unnamed: 0_level_1,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
Obs1,0.0653162,0.0015432465,0.5488178,0.3851651,0.607383,0.27477371,0.8834392,0.80181,0.2315789,0.4593373,0.69473684,0.70543807,0.5073676,0.9363689,0.6715841,0.48118392,0.2280875,0.04191902,0.74266625,0.9420798
Obs2,0.0357313,0.0001374648,0.5591864,0.3324083,0.1910468,0.06688443,0.8021373,0.6412603,0.2736842,0.2213855,0.86315789,0.44864048,0.2706641,0.8015453,0.8749556,0.08622823,0.3730314,-0.01893713,0.74819554,0.9227002
Obs3,0.04606229,0.0262612959,0.6707799,0.2792699,0.8372766,0.39483729,0.7623252,0.7048907,0.5263158,0.4472892,0.94736842,0.01208459,0.8023578,0.9263545,0.4309851,0.78418094,0.1789029,0.4576182,0.11893797,0.9011879
Obs4,0.04174023,0.4205367967,0.4727058,0.4349062,0.7776264,0.26558939,0.8909112,0.8139608,0.6105263,0.1716867,0.02105263,0.75226586,0.5220801,0.9730882,0.7212637,0.86415148,0.2827647,0.70214136,-0.07701204,0.9626485


## Load Original KMeans Cluster Assignments

In [44]:
# Load the clustering object fitted on the tsfeatures of the original data; its $cluster and
# $centers components are used below.
# NOTE(review): the file has an .RData extension but is read with readRDS(), so it is
# presumably a single object written with saveRDS() - confirm
tsfeat_kmeans_orig <- readRDS("Results/Clustering/KMeans/kmeans_tsfeat.RData")

In [45]:
table(tsfeat_kmeans_orig$cluster)


 1  2  3 
31 38  7 

In [46]:
# Extract the cluster centroids (one row per cluster) and label the rows Cent1, Cent2, ...
# The labels are generated from the number of centroids rather than hard-coded for three
tsfeat_kmeans_orig_cent <- tsfeat_kmeans_orig$centers
row.names(tsfeat_kmeans_orig_cent) <- paste0("Cent", seq_len(nrow(tsfeat_kmeans_orig_cent)))

In [47]:
tsfeat_kmeans_orig_cent

Unnamed: 0,trend,spike,linearity,curvature,e_acf1,e_acf10,seasonal_strength1,seasonal_strength2,peak1,peak2,trough1,trough2,entropy,x_acf1,x_acf10,diff1_acf1,diff1_acf10,diff2_acf1,diff2_acf10,seas_acf1
Cent1,0.1329332,0.04719421,0.4877631,0.3553232,0.5628416,0.230522,0.8388081,0.7356241,0.7154499,0.5897785,0.2400679,0.5099893,0.5585438,0.9076115,0.6259838,0.5578621,0.2628959,0.28160866,0.3754016,0.921031
Cent2,0.1754622,0.01292286,0.5058397,0.3087586,0.540371,0.2097876,0.8072581,0.7124736,0.2941828,0.4440393,0.7933518,0.5166561,0.5954719,0.8811117,0.6232551,0.5048795,0.248105,0.27915207,0.3804945,0.9090777
Cent3,0.4084417,0.14341019,0.6062414,0.5007258,0.4824639,0.3854115,0.2265813,0.1725436,0.5533835,0.5686317,0.4646617,0.3321105,0.7278745,0.4816146,0.5407494,0.1252272,0.3514962,0.07324611,0.6744657,0.4869311


In [48]:
# Pairwise distances (dist() defaults) between the stacked centroids and new observations.
# Rows 1-3 of the stacked table are the centroids and rows 4-7 the observations, so
# [4:7, 1:3] keeps only the observation-to-centroid distances
as.matrix(dist(rbind(tsfeat_kmeans_orig_cent, ts_feat_list_df_scaled)))[4:7, 1:3]

Unnamed: 0,Cent1,Cent2,Cent3
Obs1,0.8503253,0.5427822,1.394589
Obs2,1.2391158,0.9057814,1.388316
Obs3,1.1017591,0.8755676,1.597775
Obs4,0.9975524,1.284049,1.943081


In [49]:
# Assign every new observation to its nearest tsfeatures centroid.
# The assignment is computed from the observation-to-centroid distances instead of being
# typed in by hand from the printed distance table, so it stays correct if the data changes.
# For the distances shown above this yields 2, 2, 2, 1
tsfeat_clust_assign <- local({
    n_cent <- nrow(tsfeat_kmeans_orig_cent)
    all_dist <- as.matrix(dist(rbind(tsfeat_kmeans_orig_cent, ts_feat_list_df_scaled)))
    # Rows after the centroids are the observations; the first n_cent columns are the centroids
    obs_to_cent <- all_dist[-seq_len(n_cent), seq_len(n_cent), drop=FALSE]
    unname(apply(obs_to_cent, 1, which.min))
})

# DTW Clustering

## Read in Original Clustering Results

In [50]:
# Load the DTW clustering object fitted on the original data.
# NOTE(review): the file has an .RData extension but is read with readRDS(), so it is
# presumably a single object written with saveRDS() - confirm
dtw_orig <- readRDS("Results/Clustering/DTW/dtw_clust.RData")

In [51]:
dtw_orig

partitional clustering with 2 clusters
Using dtw_basic distance
Using pam centroids

Time required for analysis:
    user   system  elapsed 
2129.618    0.251  135.383 

Cluster sizes with average intra-cluster distance:

  size  av_dist
1   25 444242.1
2   51 198995.3

In [52]:
# Pull the centroids out of the clustering object (one list element per cluster). The printed
# summary reports pam centroids, and the cells below look for them among the original series
dtw_orig_cent <- dtw_orig@centroids

### Read in Original Data to Determine which TS are the Centroids

In [53]:
# Read in every processed CSV of the three original data sets (Highways England, Portland, Utah).
# `pattern` is a regular expression, not a shell glob, so match a literal ".csv" suffix;
# the previous "*.csv" would also have picked up names that merely contain "csv"
england_fnames <- list.files("Data/Processed/Highways_England/", pattern="\\.csv$", full.names=TRUE)
england_df_list <- lapply(england_fnames, read_csv)

portland_fnames <- list.files("Data/Processed/Portland/", pattern="\\.csv$", full.names=TRUE)
portland_df_list <- lapply(portland_fnames, read_csv)

utah_fnames <- list.files("Data/Processed/Utah/", pattern="\\.csv$", full.names=TRUE)
utah_df_list <- lapply(utah_fnames, read_csv)

[1mRows: [22m[34m35040[39m [1mColumns: [22m[34m10[39m
[36m──[39m [1mColumn specification[22m [36m────────────────────────────────────────────────────────[39m
[1mDelimiter:[22m ","
[31mchr[39m  (1): site_name
[32mdbl[39m  (5): day_of_week, day_of_year, interval_of_day, avg_mph, total_volume
[33mlgl[39m  (2): missing_speed, missing_volume
[34mdttm[39m (1): timestamp
[34mdate[39m (1): date

[36mℹ[39m Use `spec()` to retrieve the full column specification for this data.
[36mℹ[39m Specify the column types or set `show_col_types = FALSE` to quiet this message.
[1mRows: [22m[34m35040[39m [1mColumns: [22m[34m10[39m
[36m──[39m [1mColumn specification[22m [36m────────────────────────────────────────────────────────[39m
[1mDelimiter:[22m ","
[31mchr[39m  (1): site_name
[32mdbl[39m  (5): day_of_week, day_of_year, interval_of_day, avg_mph, total_volume
[33mlgl[39m  (2): missing_speed, missing_volume
[34mdttm[39m (1): timestamp
[34mdate[39m (1)

[1mRows: [22m[34m35040[39m [1mColumns: [22m[34m10[39m
[36m──[39m [1mColumn specification[22m [36m────────────────────────────────────────────────────────[39m
[1mDelimiter:[22m ","
[31mchr[39m  (1): site_name
[32mdbl[39m  (5): day_of_week, day_of_year, interval_of_day, avg_mph, total_volume
[33mlgl[39m  (2): missing_speed, missing_volume
[34mdttm[39m (1): timestamp
[34mdate[39m (1): date

[36mℹ[39m Use `spec()` to retrieve the full column specification for this data.
[36mℹ[39m Specify the column types or set `show_col_types = FALSE` to quiet this message.
[1mRows: [22m[34m35040[39m [1mColumns: [22m[34m10[39m
[36m──[39m [1mColumn specification[22m [36m────────────────────────────────────────────────────────[39m
[1mDelimiter:[22m ","
[31mchr[39m  (1): site_name
[32mdbl[39m  (5): day_of_week, day_of_year, interval_of_day, avg_mph, total_volume
[33mlgl[39m  (2): missing_speed, missing_volume
[34mdttm[39m (1): timestamp
[34mdate[39m (1)

[1mRows: [22m[34m35040[39m [1mColumns: [22m[34m10[39m
[36m──[39m [1mColumn specification[22m [36m────────────────────────────────────────────────────────[39m
[1mDelimiter:[22m ","
[31mchr[39m  (1): site_name
[32mdbl[39m  (5): day_of_week, day_of_year, interval_of_day, avg_mph, total_volume
[33mlgl[39m  (2): missing_speed, missing_volume
[34mdttm[39m (1): timestamp
[34mdate[39m (1): date

[36mℹ[39m Use `spec()` to retrieve the full column specification for this data.
[36mℹ[39m Specify the column types or set `show_col_types = FALSE` to quiet this message.
[1mRows: [22m[34m35040[39m [1mColumns: [22m[34m10[39m
[36m──[39m [1mColumn specification[22m [36m────────────────────────────────────────────────────────[39m
[1mDelimiter:[22m ","
[31mchr[39m  (1): site_name
[32mdbl[39m  (5): day_of_week, day_of_year, interval_of_day, avg_mph, total_volume
[33mlgl[39m  (2): missing_speed, missing_volume
[34mdttm[39m (1): timestamp
[34mdate[39m (1)

[1mRows: [22m[34m35040[39m [1mColumns: [22m[34m10[39m
[36m──[39m [1mColumn specification[22m [36m────────────────────────────────────────────────────────[39m
[1mDelimiter:[22m ","
[31mchr[39m  (1): site_name
[32mdbl[39m  (5): day_of_week, day_of_year, interval_of_day, avg_mph, total_volume
[33mlgl[39m  (2): missing_speed, missing_volume
[34mdttm[39m (1): timestamp
[34mdate[39m (1): date

[36mℹ[39m Use `spec()` to retrieve the full column specification for this data.
[36mℹ[39m Specify the column types or set `show_col_types = FALSE` to quiet this message.
[1mRows: [22m[34m35040[39m [1mColumns: [22m[34m10[39m
[36m──[39m [1mColumn specification[22m [36m────────────────────────────────────────────────────────[39m
[1mDelimiter:[22m ","
[31mchr[39m  (1): site_name
[32mdbl[39m  (5): day_of_week, day_of_year, interval_of_day, avg_mph, total_volume
[33mlgl[39m  (2): missing_speed, missing_volume
[34mdttm[39m (1): timestamp
[34mdate[39m (1)

[1mRows: [22m[34m35040[39m [1mColumns: [22m[34m10[39m
[36m──[39m [1mColumn specification[22m [36m────────────────────────────────────────────────────────[39m
[1mDelimiter:[22m ","
[31mchr[39m  (1): site_name
[32mdbl[39m  (5): day_of_week, day_of_year, interval_of_day, avg_mph, total_volume
[33mlgl[39m  (2): missing_speed, missing_volume
[34mdttm[39m (1): timestamp
[34mdate[39m (1): date

[36mℹ[39m Use `spec()` to retrieve the full column specification for this data.
[36mℹ[39m Specify the column types or set `show_col_types = FALSE` to quiet this message.
[1mRows: [22m[34m35040[39m [1mColumns: [22m[34m10[39m
[36m──[39m [1mColumn specification[22m [36m────────────────────────────────────────────────────────[39m
[1mDelimiter:[22m ","
[31mchr[39m  (1): site_name
[32mdbl[39m  (5): day_of_week, day_of_year, interval_of_day, avg_mph, total_volume
[33mlgl[39m  (2): missing_speed, missing_volume
[34mdttm[39m (1): timestamp
[34mdate[39m (1)

[1mRows: [22m[34m35040[39m [1mColumns: [22m[34m10[39m
[36m──[39m [1mColumn specification[22m [36m────────────────────────────────────────────────────────[39m
[1mDelimiter:[22m ","
[31mchr[39m  (1): site_name
[32mdbl[39m  (5): day_of_week, day_of_year, interval_of_day, avg_mph, total_volume
[33mlgl[39m  (2): missing_speed, missing_volume
[34mdttm[39m (1): timestamp
[34mdate[39m (1): date

[36mℹ[39m Use `spec()` to retrieve the full column specification for this data.
[36mℹ[39m Specify the column types or set `show_col_types = FALSE` to quiet this message.
[1mRows: [22m[34m35040[39m [1mColumns: [22m[34m10[39m
[36m──[39m [1mColumn specification[22m [36m────────────────────────────────────────────────────────[39m
[1mDelimiter:[22m ","
[31mchr[39m  (1): site_name
[32mdbl[39m  (5): day_of_week, day_of_year, interval_of_day, avg_mph, total_volume
[33mlgl[39m  (2): missing_speed, missing_volume
[34mdttm[39m (1): timestamp
[34mdate[39m (1)

In [54]:
# Sort the rows of every original data frame into chronological (ascending timestamp) order
sort_by_timestamp <- function(df) arrange(df, timestamp)
england_df_list <- lapply(england_df_list, sort_by_timestamp)
portland_df_list <- lapply(portland_df_list, sort_by_timestamp)
utah_df_list <- lapply(utah_df_list, sort_by_timestamp)

In [55]:
# Concatenate the three lists into one: England first, then Portland, then Utah
total_df_list_orig <- c(england_df_list, portland_df_list, utah_df_list)

In [56]:
# Draw a random starting row for the 12-week window that is cut out of each original data frame.
# Note that this overwrites the starting_points / ending_points drawn earlier in the notebook

# Fix the RNG seed so the draw is reproducible
set.seed(54321)

# Draw without replacement so that every time series gets a different starting row.
# The upper bound (96*365)-(96*12*7) leaves room for a complete 12-week window after any start
starting_points <- sample.int((96*365)-(96*12*7),
                              size=length(total_df_list_orig),
                              replace=FALSE)

# Last row of each window: 12 weeks (96*7*12 rows) counted from and including the starting row
ending_points <- (starting_points - 1) + (96*7*12)

In [57]:
# Number the rows of every data frame in a column called 'rn'; the starting_points and
# ending_points windows are expressed in terms of this row index
total_df_list_orig <- lapply(total_df_list_orig, function(df) mutate(df, rn = row_number()))

In [58]:
# Cut the 12-week sample out of every original data frame: keep the rows whose 'rn' lies between
# that frame's starting and ending point (both inclusive).
# seq_along() replaces 1:length() so that an empty list gives an empty result instead of
# trying to index elements 1 and 0
total_df_list_orig_samples <- lapply(seq_along(total_df_list_orig),
                                function(i) total_df_list_orig[[i]] %>%
                                    filter(rn >= starting_points[i],
                                           rn <= ending_points[i]))

In [59]:
# Label every row of each 12-week sample as train, val or test in a 'train_val_test' column.
# 'rn' is first renumbered within the sample; the first 8 weeks are train, the following 2 weeks
# are val and the last 2 weeks are test. Depending on the modeling task, train and val may both
# be used for training
total_df_list_orig_samples <- lapply(total_df_list_orig_samples, function(df) {
    df %>%
        mutate(rn = row_number(),
               train_val_test = case_when(rn <= (96*7*8) ~ "train",
                                          rn <= (96*7*10) ~ "val",
                                          TRUE ~ "test"))
})

In [60]:
# Keep only the train rows of each original sample
train_samples_orig <- lapply(total_df_list_orig_samples,
                             function(df) filter(df, train_val_test == "train"))

In [61]:
# Turn the total_volume column of each original train data frame into an msts object with two
# seasonal periods: 24*4 and 24*4*7 observations
train_samples_ts_orig <- lapply(train_samples_orig, function(df) {
    msts(df$total_volume, seasonal.periods=c(24*4, 24*4*7))
})

In [62]:
# Find which of the original training series are the two DTW centroids, by looking for the
# series whose values are identical to each stored centroid.

# Index of the series in `candidates` whose values all equal `centroid`.
# Series of a different length never match (no recycling); if several series match, the last
# one is returned, as in the original loop. Stops with an error when nothing matches, instead
# of leaving the result undefined.
find_centroid_index <- function(centroid, candidates) {
    centroid_values <- as.numeric(centroid)
    is_match <- vapply(candidates, function(series) {
        length(series) == length(centroid_values) &&
            isTRUE(all(as.numeric(series) == centroid_values))
    }, logical(1))
    hits <- which(is_match)
    if (length(hits) == 0) {
        stop("No training series is identical to the DTW centroid", call.=FALSE)
    }
    hits[length(hits)]
}

centroid_1 <- find_centroid_index(dtw_orig_cent[[1]], train_samples_ts_orig)
centroid_2 <- find_centroid_index(dtw_orig_cent[[2]], train_samples_ts_orig)

In [63]:
centroid_1

In [64]:
centroid_2

## Compute Distance to Centroids

In [65]:
# DTW distance (dtw_basic with its default arguments) from each of the four new training series
# to the two centroid series of the original clustering. Naming: dist_obs_<series>_cent_<cluster>

# Series 1
dist_obs_1_cent_1 <- dtw_basic(train_samples_ts[[1]], train_samples_ts_orig[[centroid_1]])
dist_obs_1_cent_2 <- dtw_basic(train_samples_ts[[1]], train_samples_ts_orig[[centroid_2]])

# Series 2
dist_obs_2_cent_1 <- dtw_basic(train_samples_ts[[2]], train_samples_ts_orig[[centroid_1]])
dist_obs_2_cent_2 <- dtw_basic(train_samples_ts[[2]], train_samples_ts_orig[[centroid_2]])

# Series 3
dist_obs_3_cent_1 <- dtw_basic(train_samples_ts[[3]], train_samples_ts_orig[[centroid_1]])
dist_obs_3_cent_2 <- dtw_basic(train_samples_ts[[3]], train_samples_ts_orig[[centroid_2]])

# Series 4
dist_obs_4_cent_1 <- dtw_basic(train_samples_ts[[4]], train_samples_ts_orig[[centroid_1]])
dist_obs_4_cent_2 <- dtw_basic(train_samples_ts[[4]], train_samples_ts_orig[[centroid_2]])

### Observation 1

In [261]:
dist_obs_1_cent_1

In [262]:
dist_obs_1_cent_2

In [265]:
min(dist_obs_1_cent_1, dist_obs_1_cent_2)

### Observation 2

In [263]:
dist_obs_2_cent_1

In [264]:
dist_obs_2_cent_2

In [266]:
min(dist_obs_2_cent_1, dist_obs_2_cent_2)

### Observation 3

In [66]:
dist_obs_3_cent_1

In [67]:
dist_obs_3_cent_2

In [68]:
min(dist_obs_3_cent_1, dist_obs_3_cent_2)

### Observation 4

In [69]:
dist_obs_4_cent_1

In [70]:
dist_obs_4_cent_2

In [71]:
min(dist_obs_4_cent_1, dist_obs_4_cent_2)

In [72]:
# Assign every new series to the DTW cluster whose centroid is closest.
# The assignment is derived from the distances computed above instead of being typed in by
# hand, so it cannot drift from them.
# NOTE(review): this was previously hard-coded as c(2, 1, 1, 1) - confirm the computed
# result agrees
dtw_clust_assign <- c(which.min(c(dist_obs_1_cent_1, dist_obs_1_cent_2)),
                      which.min(c(dist_obs_2_cent_1, dist_obs_2_cent_2)),
                      which.min(c(dist_obs_3_cent_1, dist_obs_3_cent_2)),
                      which.min(c(dist_obs_4_cent_1, dist_obs_4_cent_2)))

# Save Results

In [73]:
# Collect the cluster assignments of the new series in one table: one row per series and one
# column per method (random baseline, Catch22 k-means, tsfeatures k-means, DTW)
new_clust_assign_df <- data.frame(rand=rand_clust_assign,
                                  catch22=catch22_clust_assign,
                                  tsfeat=tsfeat_clust_assign,
                                  dtw=dtw_clust_assign)

In [77]:
# Save the assignments to disk. write.csv() keeps its default row.names=TRUE here, so the file
# starts with an unnamed row-index column
write.csv(new_clust_assign_df, "Results/Unseen Sensor/clust_assign.csv")