In [1]:
# Remove every object from the current environment so the notebook starts from a clean slate.
# NOTE(review): this only clears variables; loaded packages and options are left as they are, so a
# fresh R session is the more reliable way to get a clean run
rm(list=ls())

In [2]:
# Uncomment and run to install packages if needed
# install.packages("tidyverse")
# install.packages("cluster")
# install.packages("tsfeatures")
# install.packages("Rcatch22")
# install.packages("tseries")
# install.packages("factoextra")
# install.packages("forecast")
# install.packages("dtwclust")

In [3]:
library(tidyverse)
library(cluster)
library(tsfeatures)
library(Rcatch22)
library(tseries)
library(factoextra)
library(forecast)
library(dtwclust)

“‘timedatectl’ indicates the non-existent timezone name ‘n/a’”
“Your system is mis-configured: ‘/etc/localtime’ is not a symlink”
“It is strongly recommended to set envionment variable TZ to ‘Etc/UCT’ (or equivalent)”
── [1mAttaching packages[22m ─────────────────────────────────────── tidyverse 1.3.1 ──

[32m✔[39m [34mggplot2[39m 3.3.6     [32m✔[39m [34mpurrr  [39m 0.3.4
[32m✔[39m [34mtibble [39m 3.1.7     [32m✔[39m [34mdplyr  [39m 1.0.9
[32m✔[39m [34mtidyr  [39m 1.2.0     [32m✔[39m [34mstringr[39m 1.4.0
[32m✔[39m [34mreadr  [39m 2.1.2     [32m✔[39m [34mforcats[39m 0.5.1

── [1mConflicts[22m ────────────────────────────────────────── tidyverse_conflicts() ──
[31m✖[39m [34mdplyr[39m::[32mfilter()[39m masks [34mstats[39m::filter()
[31m✖[39m [34mdplyr[39m::[32mlag()[39m    masks [34mstats[39m::lag()

Registered S3 method overwritten by 'quantmod':
  method            from
  as.zoo.data.frame zoo 

Welcome! Want to learn more? See two fa

In [4]:
# Create the folder that results are saved in; nothing happens if it is already there.
# dir.exists() is used so that the check is specifically for a directory rather than any file
res_folder1 <- "Results/Unseen Sensor/"
if (!dir.exists(res_folder1)) {
    # recursive=TRUE also creates the parent "Results" folder when it is missing
    dir.create(res_folder1, recursive=TRUE)
}

# Read In Data

In [5]:
# Collect the paths of all processed CSV files for the unseen sensors. list.files() treats `pattern`
# as a regular expression (not a glob), so match a literal ".csv" at the end of the file name
fnames <- list.files("Data/Unseen Sensor/Processed/", pattern="\\.csv$", full.names=TRUE)

# Read each file into its own data frame, giving a list with one element per sensor file
total_df_list <- lapply(fnames, read_csv)

[1mRows: [22m[34m35040[39m [1mColumns: [22m[34m10[39m
[36m──[39m [1mColumn specification[22m [36m────────────────────────────────────────────────────────[39m
[1mDelimiter:[22m ","
[31mchr[39m  (1): site_name
[32mdbl[39m  (5): day_of_week, day_of_year, interval_of_day, avg_mph, total_volume
[33mlgl[39m  (2): missing_speed, missing_volume
[34mdttm[39m (1): timestamp
[34mdate[39m (1): date

[36mℹ[39m Use `spec()` to retrieve the full column specification for this data.
[36mℹ[39m Specify the column types or set `show_col_types = FALSE` to quiet this message.
[1mRows: [22m[34m35040[39m [1mColumns: [22m[34m10[39m
[36m──[39m [1mColumn specification[22m [36m────────────────────────────────────────────────────────[39m
[1mDelimiter:[22m ","
[31mchr[39m  (1): site_name
[32mdbl[39m  (5): day_of_week, day_of_year, interval_of_day, avg_mph, total_volume
[33mlgl[39m  (2): missing_speed, missing_volume
[34mdttm[39m (1): timestamp
[34mdate[39m (1)

In [6]:
# Sanity check: the number of data frames that were read in (one per CSV file found)
length(total_df_list)

In [7]:
# Sort the rows of every data frame by timestamp so that row order is chronological
total_df_list <- lapply(total_df_list, function(df) arrange(df, timestamp))

In [8]:
# Draw one random starting row per data frame; each start marks the beginning of that series'
# 12-week sample window

# Fix the RNG seed so that the draw is reproducible
set.seed(12345)

# Draw without replacement so that no two series share a starting row. The upper bound is
# 96 rows per day * 365 days, less one 12-week window (96*12*7 rows), so a window never runs
# past row 96*365
starting_points <- sample.int((96*365) - (96*12*7),
                              size=length(total_df_list),
                              replace=FALSE)

# A window spans 12 weeks of rows, so its inclusive last row is start + window length - 1
ending_points <- (starting_points - 1) + (96*7*12)

In [9]:
# Persist the sampled window boundaries (one row per series) so the same windows can be reused.
# row.names is spelled out as FALSE: the shorthand F is an ordinary variable that can be reassigned
write.csv(data.frame(start=starting_points, end=ending_points), "start_end_points_unseen.csv", row.names=FALSE)

In [10]:
# Number the rows of every data frame in a column called 'rn'. The window boundaries held in
# starting_points / ending_points are row numbers, so 'rn' is what the sampling step filters on
total_df_list <- lapply(total_df_list, function(df) mutate(df, rn = row_number()))

In [11]:
# Cut each data frame down to its own 12-week window: keep the rows whose row number lies between
# the series' starting point and ending point (both inclusive). These samples are what is used
# for modeling and testing.
# A missing boundary would silently produce an empty sample, so check the lengths line up first
stopifnot(length(starting_points) == length(total_df_list),
          length(ending_points) == length(total_df_list))
# seq_along() (rather than 1:length()) keeps the result empty when the list is empty
total_df_list_samples <- lapply(seq_along(total_df_list),
                                function(idx) total_df_list[[idx]] %>%
                                    filter(rn >= starting_points[idx],
                                           rn <= ending_points[idx]))

In [12]:
# Label every row of each 12-week sample with the split it belongs to: rows in the first 8 weeks
# are "train", the following 2 weeks are "val" and the last 2 weeks are "test". rn is renumbered
# from 1 within each sample first so that the week boundaries line up. Depending on the modeling
# task, train and val may both end up being used for training
total_df_list_samples <- lapply(total_df_list_samples, function(df) {
    mutate(df,
           rn = row_number(),
           train_val_test = ifelse(rn <= (96*7*8),
                                   "train",
                                   ifelse(rn <= (96*7*10), "val", "test")))
})

In [13]:
# Per series, keep the train and val rows together (i.e. everything that is not labelled "test")
train_val_samples <- lapply(total_df_list_samples,
                            function(df) filter(df, train_val_test != "test"))

In [14]:
# Per series, keep only the rows labelled "train"
train_samples <- lapply(total_df_list_samples,
                        function(df) filter(df, train_val_test == "train"))

In [15]:
# Per series, keep only the rows labelled "test"
test_samples <- lapply(total_df_list_samples,
                       function(df) filter(df, train_val_test == "test"))

In [16]:
# Turn the train+val data frames into msts objects built from the target field (total_volume).
# Two seasonal periods are declared: 24*4 rows (one day) and 24*4*7 rows (one week)
train_val_samples_ts <- lapply(
    train_val_samples,
    function(df) msts(df$total_volume, seasonal.periods=c(24*4, 24*4*7))
)

In [17]:
# Turn the train-only data frames into msts objects built from the target field (total_volume).
# Two seasonal periods are declared: 24*4 rows (one day) and 24*4*7 rows (one week)
train_samples_ts <- lapply(
    train_samples,
    function(df) msts(df$total_volume, seasonal.periods=c(24*4, 24*4*7))
)

In [18]:
# Turn the test data frames into msts objects built from the target field (total_volume).
# Two seasonal periods are declared: 24*4 rows (one day) and 24*4*7 rows (one week)
test_samples_ts <- lapply(
    test_samples,
    function(df) msts(df$total_volume, seasonal.periods=c(24*4, 24*4*7))
)

# Random Cluster Assignments

In [19]:
# Upper bound for the random cluster labels: the random assignment below samples a label from
# 1:total_clust for each series.
# NOTE(review): this was originally described as the total number of time series in the data set;
# confirm whether it is meant to be a cluster count instead
total_clust <- 4

In [20]:
# Baseline: give every training series a random cluster label between 1 and total_clust.
# The RNG is re-seeded with the series index before each draw, so the label of series k depends
# only on k and not on how many series come before it.
# vapply over seq_along() avoids growing a vector inside a loop and yields an empty integer
# vector (instead of iterating over c(1, 0)) when there are no series
rand_clust_assign <- vapply(
    seq_along(train_samples_ts),
    function(ts_no) {
        set.seed(ts_no)
        sample(seq_len(total_clust), 1)
    },
    integer(1)
)

In [21]:
# Display the random cluster label drawn for each series
rand_clust_assign

# Catch22 Based Feature Computation

In [22]:
# Compute the Catch22 features of every training series with catch22_all(), then reshape the
# long names/values result into a single wide row (one column per feature). The outcome is a
# list holding one single-row data frame per series
train_catch22_feat <- lapply(train_samples_ts, function(series) {
    pivot_wider(catch22_all(series), names_from=names, values_from=values)
})

“As of 0.1.14 the feature 'CO_f1ecac' returns a double instead of int


In [23]:
# Stack the per-series feature rows into one data frame (one row per series) and drop two of the
# Catch22 features.
# NOTE(review): presumably these two are dropped because the original feature table does not
# contain them either - confirm against the original clustering notebook
train_catch22_feat_df <- bind_rows(train_catch22_feat) %>%
    select(-all_of(c('CO_FirstMin_ac', 'PD_PeriodicityWang_th0_01')))
train_catch22_feat_df

DN_HistogramMode_5,DN_HistogramMode_10,CO_f1ecac,CO_HistogramAMI_even_2_5,CO_trev_1_num,MD_hrv_classic_pnn40,SB_BinaryStats_mean_longstretch1,SB_TransitionMatrix_3ac_sumdiagcov,CO_Embed2_Dist_tau_d_expfit_meandiff,IN_AutoMutualInfoStats_40_gaussian_fmmi,FC_LocalSimple_mean1_tauresrat,DN_OutlierInclude_p_001_mdrmd,DN_OutlierInclude_n_001_mdrmd,SP_Summaries_welch_rect_area_5_1,SB_BinaryStats_diff_longstretch0,SB_MotifThree_quantile_hh,SC_FluctAnal_2_rsrangefit_50_1_logi_prop_r1,SC_FluctAnal_2_dfa_50_1_2_logi_prop_r1,SP_Summaries_welch_rect_centroid,FC_LocalSimple_mean3_stderr
<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
-0.772714,-1.036745,15.8332,0.6770578,0.002529417,0.7529302,56,0.010604141,0.4183068,23,0.04166667,0.02139137,-0.0167410714,0.9845822,16,1.489225,0.4285714,0.3469388,0.06519418,0.2659134
-0.9985984,-1.195337,18.41606,0.7576623,0.010028003,0.7962791,59,0.006165495,0.3933663,23,0.04,0.07831101,-0.001860119,0.9662259,14,1.573515,0.4285714,0.3469388,0.06519418,0.2833783
-1.2165974,-1.413858,13.28107,0.7864801,0.007527869,0.7672558,61,0.006949684,0.5943701,18,0.52631579,0.07477679,0.0005580357,0.9804185,15,1.442368,0.4081633,0.3265306,0.06596117,0.3063198
-1.1381137,-1.313092,16.05872,0.8409939,0.006194725,0.679814,57,0.021630571,0.6649224,22,0.41666667,-0.01199777,0.0013020833,0.986233,22,1.438768,0.4285714,0.3469388,0.06519418,0.25739


## Read in Catch22 Feature Values for Original Data

In [24]:
# Load the Catch22 feature table of the original data and drop its first column, which is the
# unnamed row index that was written out with the CSV
catch22_orig <- read_csv("Results/Clustering/KMeans/catch22_feat_df.csv") %>%
    select(-1)

[1m[22mNew names:
[36m•[39m `` -> `...1`
[1mRows: [22m[34m76[39m [1mColumns: [22m[34m21[39m
[36m──[39m [1mColumn specification[22m [36m────────────────────────────────────────────────────────[39m
[1mDelimiter:[22m ","
[32mdbl[39m (21): ...1, DN_HistogramMode_5, DN_HistogramMode_10, CO_f1ecac, CO_Histo...

[36mℹ[39m Use `spec()` to retrieve the full column specification for this data.
[36mℹ[39m Specify the column types or set `show_col_types = FALSE` to quiet this message.


In [25]:
# Preview the first rows of the original Catch22 feature table
head(catch22_orig)

DN_HistogramMode_5,DN_HistogramMode_10,CO_f1ecac,CO_HistogramAMI_even_2_5,CO_trev_1_num,MD_hrv_classic_pnn40,SB_BinaryStats_mean_longstretch1,SB_TransitionMatrix_3ac_sumdiagcov,CO_Embed2_Dist_tau_d_expfit_meandiff,IN_AutoMutualInfoStats_40_gaussian_fmmi,FC_LocalSimple_mean1_tauresrat,DN_OutlierInclude_p_001_mdrmd,DN_OutlierInclude_n_001_mdrmd,SP_Summaries_welch_rect_area_5_1,SB_BinaryStats_diff_longstretch0,SB_MotifThree_quantile_hh,SC_FluctAnal_2_rsrangefit_50_1_logi_prop_r1,SC_FluctAnal_2_dfa_50_1_2_logi_prop_r1,SP_Summaries_welch_rect_centroid,FC_LocalSimple_mean3_stderr
<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
-0.9218226,-1.14847,13.82386,0.770366,0.0023764235,0.7365581,55,0.007384888,0.6402525,20,0.36363636,0.03218006,-0.024925595,0.9887043,15,1.472521,0.4081633,0.3265306,0.06596117,0.2699762
-1.0160128,-1.230913,16.25543,0.7362338,-0.0015322646,0.7856744,61,0.023279508,0.5069481,22,0.04166667,0.04389881,-0.001302083,0.9768737,15,1.51534,0.4489796,0.3469388,0.06519418,0.2904009
-0.9109971,-1.118617,16.70035,0.7820858,0.0021162446,0.7389767,60,0.007085068,0.3769874,22,0.04166667,0.03218006,-0.021763393,0.9825842,20,1.526881,0.4285714,0.3673469,0.06519418,0.2591757
-0.8799034,-1.093104,14.82659,0.7899539,0.0028058063,0.6803721,55,0.002621771,0.4395587,21,0.43478261,-0.05747768,0.007068452,0.9892416,18,1.430336,0.4285714,0.3469388,0.06596117,0.2620395
-0.9038058,-1.100202,17.63207,0.8039572,0.0041442076,0.6682791,54,0.003598341,0.5337112,23,0.52,0.0202753,-0.017578125,0.9910389,22,1.41746,0.4285714,0.3469388,0.06519418,0.2157951
-0.9946967,-1.207999,17.64684,0.814826,0.0003571444,0.7205581,64,0.004398774,0.6793088,23,0.04,0.02994792,-0.022321429,0.990121,18,1.428618,0.4285714,0.3469388,0.06519418,0.2194032


### Scale New Data with Original Scaler Values

In [26]:
# Work on a copy so that the unscaled Catch22 features stay available
train_catch22_feat_df_scaled <- train_catch22_feat_df

In [27]:
# Min-max scale every feature of the new series using the minimum and range of the ORIGINAL data,
# so the new series live on the same scale the original clustering was fitted on. Values of the
# new series can therefore fall outside [0, 1].
# Columns are paired by name rather than by position, so a different column order in the two
# tables cannot silently scale a feature with another feature's range
stopifnot(all(names(train_catch22_feat_df_scaled) %in% names(catch22_orig)))
for (feat in names(train_catch22_feat_df_scaled)) {
    min_n <- min(catch22_orig[[feat]])
    max_n <- max(catch22_orig[[feat]])
    train_catch22_feat_df_scaled[[feat]] <- (train_catch22_feat_df_scaled[[feat]] - min_n)/(max_n - min_n)
}

In [28]:
# Label the rows Obs1, Obs2, ... so the new series are easy to pick out of the distance matrix.
# Setting row names on a tibble is deprecated, so convert to a plain data frame first; the labels
# are generated from the number of rows instead of being typed out
train_catch22_feat_df_scaled <- as.data.frame(train_catch22_feat_df_scaled)
row.names(train_catch22_feat_df_scaled) <- paste0("Obs", seq_len(nrow(train_catch22_feat_df_scaled)))

“Setting row names on a tibble is deprecated.”


In [29]:
# Display the scaled Catch22 features of the new series
train_catch22_feat_df_scaled

Unnamed: 0_level_0,DN_HistogramMode_5,DN_HistogramMode_10,CO_f1ecac,CO_HistogramAMI_even_2_5,CO_trev_1_num,MD_hrv_classic_pnn40,SB_BinaryStats_mean_longstretch1,SB_TransitionMatrix_3ac_sumdiagcov,CO_Embed2_Dist_tau_d_expfit_meandiff,IN_AutoMutualInfoStats_40_gaussian_fmmi,FC_LocalSimple_mean1_tauresrat,DN_OutlierInclude_p_001_mdrmd,DN_OutlierInclude_n_001_mdrmd,SP_Summaries_welch_rect_area_5_1,SB_BinaryStats_diff_longstretch0,SB_MotifThree_quantile_hh,SC_FluctAnal_2_rsrangefit_50_1_logi_prop_r1,SC_FluctAnal_2_dfa_50_1_2_logi_prop_r1,SP_Summaries_welch_rect_centroid,FC_LocalSimple_mean3_stderr
Unnamed: 0_level_1,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
Obs1,0.1844253358,0.19055719,0.6403775,0.6711171,0.6641338,0.4118635,0.6976744,0.3677587,0.2542529,0.7692308,0.009174312,0.3569695,0.5564663,0.9421265,0.4761905,0.251734,0.625,0.8333333,6.386055e-16,0.1979918
Obs2,0.0901531502,0.12782326,0.9095301,0.8123209,0.8695996,0.5422496,0.7674419,0.1980535,0.2239404,0.7692308,0.00587156,0.4100208,0.629326,0.7775925,0.3809524,0.4649984,0.625,0.8333333,6.386055e-16,0.2669864
Obs3,-0.0008281171,0.04138353,0.3744273,0.8628041,0.8010944,0.4549524,0.8139535,0.2280358,0.4682397,0.3846154,0.969579913,0.4067268,0.6411658,0.9048058,0.4285714,0.1331794,0.5,0.6666667,0.01176471,0.3576165
Obs4,0.0319268339,0.08124335,0.6638783,0.9583019,0.7645655,0.1919418,0.7209302,0.7893383,0.5539886,0.6923077,0.752293578,0.3258495,0.6448087,0.9569228,0.7619048,0.1240721,0.625,0.8333333,6.386055e-16,0.1643201


## Load Original KMeans Clustering (Catch22 Features)

In [30]:
# Load the k-means result that was fitted on the original Catch22 features. The file carries an
# .RData extension but is read with readRDS(), i.e. it holds a single serialized object
catch22_kmeans_orig <- readRDS("Results/Clustering/KMeans/kmeans_catch22.RData")

In [31]:
# Count how many of the original series fall into each Catch22 k-means cluster
table(catch22_kmeans_orig$cluster)


 1  2  3  4  5 
37  3  4 20 12 

In [32]:
# Pull the cluster centers (one row per cluster) out of the k-means result and label the rows
# Cent1, Cent2, ... The labels are generated from the number of centers so they stay correct if
# the model was fitted with a different number of clusters
catch22_kmeans_orig_cent <- catch22_kmeans_orig$centers
rownames(catch22_kmeans_orig_cent) <- paste0("Cent", seq_len(nrow(catch22_kmeans_orig_cent)))

In [33]:
# Display the Catch22 cluster centers
catch22_kmeans_orig_cent

Unnamed: 0,DN_HistogramMode_5,DN_HistogramMode_10,CO_f1ecac,CO_HistogramAMI_even_2_5,CO_trev_1_num,MD_hrv_classic_pnn40,SB_BinaryStats_mean_longstretch1,SB_TransitionMatrix_3ac_sumdiagcov,CO_Embed2_Dist_tau_d_expfit_meandiff,IN_AutoMutualInfoStats_40_gaussian_fmmi,FC_LocalSimple_mean1_tauresrat,DN_OutlierInclude_p_001_mdrmd,DN_OutlierInclude_n_001_mdrmd,SP_Summaries_welch_rect_area_5_1,SB_BinaryStats_diff_longstretch0,SB_MotifThree_quantile_hh,SC_FluctAnal_2_rsrangefit_50_1_logi_prop_r1,SC_FluctAnal_2_dfa_50_1_2_logi_prop_r1,SP_Summaries_welch_rect_centroid,FC_LocalSimple_mean3_stderr
Cent1,0.1205032,0.1533184,0.72953216,0.7629098,0.658623,0.4591116,0.7712131,0.3627919,0.32567876,0.7422037,0.008641279,0.3796412,0.5928346,0.8530092,0.37451737,0.2909256,0.5641892,0.6801802,0.002861685,0.265547
Cent2,0.4332344,0.4905539,0.06031402,0.2785821,0.6434783,0.2719642,0.2635659,0.2972055,0.40113183,0.1538462,0.686238532,0.3958622,0.5912265,0.9190752,0.36507937,0.3559023,0.2083333,0.4444444,0.674509804,0.5081063
Cent3,0.287268,0.2605563,0.76679408,0.1267128,0.446417,0.7315333,0.7209302,0.2055811,0.08752028,0.8461538,0.005229358,0.403021,0.4931694,0.2313457,0.05952381,0.7287692,0.75,0.7083333,0.005882353,0.8871061
Cent4,0.1103231,0.1907726,0.57469973,0.8759798,0.6794695,0.280526,0.7174419,0.2498391,0.52647008,0.6692308,0.650918384,0.3378944,0.6045537,0.9692444,0.53809524,0.1620051,0.55625,0.7666667,0.007058824,0.1936715
Cent5,0.8504611,0.4590276,0.61804173,0.7512288,0.6923811,0.5798359,0.8856589,0.1236231,0.22103202,0.6410256,0.081161009,0.3496446,0.5954766,0.7642556,0.4484127,0.4138313,0.4270833,0.5,0.008823529,0.4036425


In [34]:
# Distance from every new series (rows) to every original cluster center (columns).
# The centers are stacked on top of the new series, all pairwise distances are computed with
# dist() (Euclidean by default), and only the new-series-by-center corner is kept. The row and
# column indices are derived from the table sizes instead of being hard-coded
as.matrix(dist(rbind(catch22_kmeans_orig_cent, train_catch22_feat_df_scaled)))[
    nrow(catch22_kmeans_orig_cent) + seq_len(nrow(train_catch22_feat_df_scaled)),
    seq_len(nrow(catch22_kmeans_orig_cent)),
    drop=FALSE
]

Unnamed: 0,Cent1,Cent2,Cent3,Cent4,Cent5
Obs1,0.2979139,1.617716,1.398811,0.7771913,0.9603046
Obs2,0.4376365,1.840651,1.301638,0.9600075,1.0159148
Obs3,1.1496276,1.409613,1.907151,0.6156497,1.4387502
Obs4,1.0779367,1.750264,2.073672,0.6435282,1.5740351


In [35]:
# Assign every new series to the Catch22 cluster whose center is closest to it.
# The assignment is computed from the series-to-center distances rather than typed in by hand, so
# it cannot go stale when the data or the model changes. Row i of the centers belongs to cluster
# i, so the index of the smallest distance in a row is the cluster label
catch22_clust_assign <- as.numeric(apply(
    as.matrix(dist(rbind(catch22_kmeans_orig_cent, train_catch22_feat_df_scaled)))[
        nrow(catch22_kmeans_orig_cent) + seq_len(nrow(train_catch22_feat_df_scaled)),
        seq_len(nrow(catch22_kmeans_orig_cent)),
        drop=FALSE
    ],
    1,
    which.min
))

# TSFeat Based Feature Computation

In [36]:
# Compute the default set of tsfeatures for every training series; the result has one row per
# series and one column per feature.
# scale=FALSE so that the series are not scaled before the features are computed
ts_feat_list <- tsfeatures(train_samples_ts, scale=FALSE)

In [37]:
# Sanity check: preview the first rows of the computed feature table
head(ts_feat_list)

frequency,nperiods,seasonal_period1,seasonal_period2,trend,spike,linearity,curvature,e_acf1,e_acf10,⋯,trough1,trough2,entropy,x_acf1,x_acf10,diff1_acf1,diff1_acf10,diff2_acf1,diff2_acf10,seas_acf1
<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,⋯,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
672,2,96,672,0.06110177,0.14095648,125.4385,163.26377,0.5938258,1.6655134,⋯,67,469,0.6207966,0.9771548,6.773005,-0.09996172,0.1278525,-0.6633647,0.5084676,0.8386451
672,2,96,672,0.0382038,0.01261537,156.6864,-55.46971,0.280905,0.4961752,⋯,83,299,0.5584588,0.9545693,7.398597,-0.4142187,0.19337,-0.6814363,0.5102978,0.8301033
672,2,96,672,0.04619973,2.3975955,492.9979,-275.78563,0.7666151,2.3408486,⋯,91,10,0.6984848,0.9754772,6.032898,0.14112589,0.1056201,-0.5399201,0.3020198,0.8206214
672,2,96,672,0.04285456,38.39305242,-103.9417,369.49427,0.7217817,1.6138534,⋯,3,500,0.6246713,0.9833059,6.925825,0.20475658,0.1525677,-0.4673074,0.2371622,0.8477111


In [38]:
# Drop the four features describing the declared seasonality (frequency, number of periods and
# the two period lengths); they carry very little variation across series
ts_feat_list <- select(ts_feat_list,
                       -c(frequency, nperiods, seasonal_period1, seasonal_period2))

## Read in Original TSFeat Features

In [39]:
# Load the tsfeatures table of the original data and drop its first column, which is the unnamed
# row index that was written out with the CSV
tsfeat_orig <- read_csv("Results/Clustering/KMeans/tsfeat_feat_df.csv") %>%
    select(-1)

[1m[22mNew names:
[36m•[39m `` -> `...1`
[1mRows: [22m[34m76[39m [1mColumns: [22m[34m21[39m
[36m──[39m [1mColumn specification[22m [36m────────────────────────────────────────────────────────[39m
[1mDelimiter:[22m ","
[32mdbl[39m (21): ...1, trend, spike, linearity, curvature, e_acf1, e_acf10, seasona...

[36mℹ[39m Use `spec()` to retrieve the full column specification for this data.
[36mℹ[39m Specify the column types or set `show_col_types = FALSE` to quiet this message.


In [40]:
# Preview the first rows of the original tsfeatures table
head(tsfeat_orig)

trend,spike,linearity,curvature,e_acf1,e_acf10,seasonal_strength1,seasonal_strength2,peak1,peak2,trough1,trough2,entropy,x_acf1,x_acf10,diff1_acf1,diff1_acf10,diff2_acf1,diff2_acf10,seas_acf1
<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
0.04524545,1.787166,401.41853,-70.576,0.8260487,3.9629383,0.945386,0.8000433,70,550,44,357,0.6668819,0.9805245,6.359669,0.06238217,0.23763458,-0.6324652,0.4378136,0.8142246
0.03397202,0.1025198,51.03854,65.12821,0.4686588,0.6192512,0.9650589,0.7962833,39,516,79,2,0.6373192,0.969149,6.66614,-0.142832055,0.05915397,-0.6091203,0.386845,0.8416136
0.07414295,0.145313,109.4305,289.22413,0.4314831,0.3724309,0.9776093,0.8448882,89,269,31,375,0.6242515,0.9766606,7.098263,-0.101996517,0.07086743,-0.5910288,0.3578037,0.8420068
0.04656582,2.8257168,-300.58885,14.81884,0.5182204,0.6470061,0.9837552,0.9206374,22,309,95,20,0.6476451,0.9825183,6.572835,0.123384277,0.23594217,-0.6188357,0.423704,0.8603121
0.32395416,0.2786917,691.13302,-57.02537,0.4035557,0.6414066,0.988521,0.9232613,92,56,35,633,0.5452204,0.9863194,7.482607,0.01530968,0.21314835,-0.6136182,0.391117,0.8586035
0.08106612,3.2968714,146.63715,576.99568,0.7109443,2.1812341,0.9725913,0.8454157,89,98,35,245,0.5721557,0.9856025,7.505656,-0.001428731,0.17599982,-0.6095278,0.3944185,0.8349258


### Scale with Original Data

In [41]:
# Start from a copy of the tsfeatures table; the scaling loop below rescales it column by column
# with the min and range of the original data, consistent with how the Catch22 features are
# handled. The unscaled features stay available in ts_feat_list
ts_feat_list_df_scaled <- ts_feat_list

In [42]:
# Min-max scale every feature of the new series using the minimum and range of the ORIGINAL data,
# so the new series live on the same scale the original clustering was fitted on. Values of the
# new series can therefore fall outside [0, 1].
# Columns are paired by name rather than by position, so a different column order in the two
# tables cannot silently scale a feature with another feature's range
stopifnot(all(names(ts_feat_list_df_scaled) %in% names(tsfeat_orig)))
for (feat in names(ts_feat_list_df_scaled)) {
    min_n <- min(tsfeat_orig[[feat]])
    max_n <- max(tsfeat_orig[[feat]])
    ts_feat_list_df_scaled[[feat]] <- (ts_feat_list_df_scaled[[feat]] - min_n)/(max_n - min_n)
}

In [43]:
# Label the rows Obs1, Obs2, ... so the new series are easy to pick out of the distance matrix.
# Setting row names on a tibble is deprecated, so convert to a plain data frame first; the labels
# are generated from the number of rows instead of being typed out
ts_feat_list_df_scaled <- as.data.frame(ts_feat_list_df_scaled)
row.names(ts_feat_list_df_scaled) <- paste0("Obs", seq_len(nrow(ts_feat_list_df_scaled)))

“Setting row names on a tibble is deprecated.”


In [44]:
# Display the scaled tsfeatures of the new series
ts_feat_list_df_scaled

Unnamed: 0_level_0,trend,spike,linearity,curvature,e_acf1,e_acf10,seasonal_strength1,seasonal_strength2,peak1,peak2,trough1,trough2,entropy,x_acf1,x_acf10,diff1_acf1,diff1_acf10,diff2_acf1,diff2_acf10,seas_acf1
Unnamed: 0_level_1,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
Obs1,0.06233511,0.0034034896,0.5048055,0.7181229,0.6336328,0.39990931,0.8804181,0.8225735,0.2065217,0.4656489,0.6914894,0.72179289,0.5349825,0.9307917,0.669318,0.48619493,0.2093163,0.03140924,0.79418203,0.8876968
Obs2,0.02750633,0.0003024711,0.5148559,0.5980033,0.1910264,0.09447648,0.7825859,0.6600485,0.25,0.2244275,0.8617021,0.45904173,0.3099194,0.7602332,0.8738767,0.09518644,0.3555746,-0.03179954,0.79987112,0.8530753
Obs3,0.03966847,0.0579291185,0.6230246,0.4770147,0.8780324,0.57630782,0.7346792,0.7244618,0.5108696,0.4534351,0.9468085,0.01236476,0.8154659,0.918123,0.4273146,0.78616378,0.1596857,0.4631789,0.15242633,0.8146437
Obs4,0.03458033,0.9276627805,0.4310293,0.8313764,0.8146183,0.38641562,0.8894093,0.8348738,0.5978261,0.1740458,0.0106383,0.76970634,0.5489715,0.9772433,0.7192876,0.8653351,0.2644894,0.71715507,-0.04918716,0.9244427


## Load Original KMeans Clustering (TSFeat Features)

In [45]:
# Load the k-means result that was fitted on the original tsfeatures. The file carries an .RData
# extension but is read with readRDS(), i.e. it holds a single serialized object
tsfeat_kmeans_orig <- readRDS("Results/Clustering/KMeans/kmeans_tsfeat.RData")

In [46]:
# Count how many of the original series fall into each tsfeatures k-means cluster
table(tsfeat_kmeans_orig$cluster)


 1  2 
52 24 

In [47]:
# Pull the cluster centers (one row per cluster) out of the k-means result and label the rows
# Cent1, Cent2, ... The labels are generated from the number of centers so they stay correct if
# the model was fitted with a different number of clusters
tsfeat_kmeans_orig_cent <- tsfeat_kmeans_orig$centers
rownames(tsfeat_kmeans_orig_cent) <- paste0("Cent", seq_len(nrow(tsfeat_kmeans_orig_cent)))

In [48]:
# Display the tsfeatures cluster centers
tsfeat_kmeans_orig_cent

Unnamed: 0,trend,spike,linearity,curvature,e_acf1,e_acf10,seasonal_strength1,seasonal_strength2,peak1,peak2,trough1,trough2,entropy,x_acf1,x_acf10,diff1_acf1,diff1_acf10,diff2_acf1,diff2_acf10,seas_acf1
Cent1,0.161743,0.08396044,0.4923891,0.6090045,0.6700144,0.4268872,0.8340064,0.7667364,0.423704,0.4883147,0.5636252,0.5637855,0.5746055,0.9233,0.6568774,0.6013646,0.2704353,0.3143367,0.402547,0.8588529
Cent2,0.3660031,0.02269512,0.4921098,0.6086331,0.4558023,0.2797747,0.5743514,0.5253412,0.6752717,0.5428753,0.3156028,0.5334235,0.6921544,0.6539098,0.5554224,0.2550749,0.1811398,0.1502343,0.5486908,0.6198044


In [49]:
# Distance from every new series (rows) to every original cluster center (columns).
# The centers are stacked on top of the new series, all pairwise distances are computed with
# dist() (Euclidean by default), and only the new-series-by-center corner is kept. The row and
# column indices are derived from the table sizes instead of being hard-coded
as.matrix(dist(rbind(tsfeat_kmeans_orig_cent, ts_feat_list_df_scaled)))[
    nrow(tsfeat_kmeans_orig_cent) + seq_len(nrow(ts_feat_list_df_scaled)),
    seq_len(nrow(tsfeat_kmeans_orig_cent)),
    drop=FALSE
]

Unnamed: 0,Cent1,Cent2
Obs1,0.6146474,1.024686
Obs2,1.1243401,1.154617
Obs3,0.9115104,1.362195
Obs4,1.3177884,1.706205


In [50]:
# Assign every new series to the tsfeatures cluster whose center is closest to it.
# The assignment is computed from the series-to-center distances rather than typed in by hand, so
# it cannot go stale when the data or the model changes. Row i of the centers belongs to cluster
# i, so the index of the smallest distance in a row is the cluster label
tsfeat_clust_assign <- as.numeric(apply(
    as.matrix(dist(rbind(tsfeat_kmeans_orig_cent, ts_feat_list_df_scaled)))[
        nrow(tsfeat_kmeans_orig_cent) + seq_len(nrow(ts_feat_list_df_scaled)),
        seq_len(nrow(tsfeat_kmeans_orig_cent)),
        drop=FALSE
    ],
    1,
    which.min
))

# DTW Clustering

## Read in Original Clustering Results

In [51]:
# Load the DTW clustering result that was fitted on the original data. The file carries an
# .RData extension but is read with readRDS(), i.e. it holds a single serialized object
dtw_orig <- readRDS("Results/Clustering/DTW/dtw_clust.RData")

In [52]:
# Print the summary of the loaded DTW clustering
dtw_orig

partitional clustering with 2 clusters
Using dtw_basic distance
Using pam centroids

Time required for analysis:
    user   system  elapsed 
2264.315    1.126  145.180 

Cluster sizes with average intra-cluster distance:

  size  av_dist
1   25 437202.5
2   51 200953.7

In [53]:
# Extract the cluster centroids from the centroids slot of the clustering object; they are
# indexed per cluster with [[ ]] further down
dtw_orig_cent <- dtw_orig@centroids

### Read in Original Data to Determine which TS are the Centroids

In [54]:
# Read in all processed CSV files of the original data, one list of data frames per region.
# list.files() treats `pattern` as a regular expression (not a glob), so match a literal ".csv"
# at the end of the file name
england_fnames <- list.files("Data/Processed/Highways_England/", pattern="\\.csv$", full.names=TRUE)
england_df_list <- lapply(england_fnames, read_csv)

portland_fnames <- list.files("Data/Processed/Portland/", pattern="\\.csv$", full.names=TRUE)
portland_df_list <- lapply(portland_fnames, read_csv)

utah_fnames <- list.files("Data/Processed/Utah/", pattern="\\.csv$", full.names=TRUE)
utah_df_list <- lapply(utah_fnames, read_csv)

[1mRows: [22m[34m35040[39m [1mColumns: [22m[34m10[39m
[36m──[39m [1mColumn specification[22m [36m────────────────────────────────────────────────────────[39m
[1mDelimiter:[22m ","
[31mchr[39m  (1): site_name
[32mdbl[39m  (5): day_of_week, day_of_year, interval_of_day, avg_mph, total_volume
[33mlgl[39m  (2): missing_speed, missing_volume
[34mdttm[39m (1): timestamp
[34mdate[39m (1): date

[36mℹ[39m Use `spec()` to retrieve the full column specification for this data.
[36mℹ[39m Specify the column types or set `show_col_types = FALSE` to quiet this message.
[1mRows: [22m[34m35040[39m [1mColumns: [22m[34m10[39m
[36m──[39m [1mColumn specification[22m [36m────────────────────────────────────────────────────────[39m
[1mDelimiter:[22m ","
[31mchr[39m  (1): site_name
[32mdbl[39m  (5): day_of_week, day_of_year, interval_of_day, avg_mph, total_volume
[33mlgl[39m  (2): missing_speed, missing_volume
[34mdttm[39m (1): timestamp
[34mdate[39m (1)

[1mRows: [22m[34m35040[39m [1mColumns: [22m[34m10[39m
[36m──[39m [1mColumn specification[22m [36m────────────────────────────────────────────────────────[39m
[1mDelimiter:[22m ","
[31mchr[39m  (1): site_name
[32mdbl[39m  (5): day_of_week, day_of_year, interval_of_day, avg_mph, total_volume
[33mlgl[39m  (2): missing_speed, missing_volume
[34mdttm[39m (1): timestamp
[34mdate[39m (1): date

[36mℹ[39m Use `spec()` to retrieve the full column specification for this data.
[36mℹ[39m Specify the column types or set `show_col_types = FALSE` to quiet this message.
[1mRows: [22m[34m35040[39m [1mColumns: [22m[34m10[39m
[36m──[39m [1mColumn specification[22m [36m────────────────────────────────────────────────────────[39m
[1mDelimiter:[22m ","
[31mchr[39m  (1): site_name
[32mdbl[39m  (5): day_of_week, day_of_year, interval_of_day, avg_mph, total_volume
[33mlgl[39m  (2): missing_speed, missing_volume
[34mdttm[39m (1): timestamp
[34mdate[39m (1)

[1mRows: [22m[34m35040[39m [1mColumns: [22m[34m10[39m
[36m──[39m [1mColumn specification[22m [36m────────────────────────────────────────────────────────[39m
[1mDelimiter:[22m ","
[31mchr[39m  (1): site_name
[32mdbl[39m  (5): day_of_week, day_of_year, interval_of_day, avg_mph, total_volume
[33mlgl[39m  (2): missing_speed, missing_volume
[34mdttm[39m (1): timestamp
[34mdate[39m (1): date

[36mℹ[39m Use `spec()` to retrieve the full column specification for this data.
[36mℹ[39m Specify the column types or set `show_col_types = FALSE` to quiet this message.
[1mRows: [22m[34m35040[39m [1mColumns: [22m[34m10[39m
[36m──[39m [1mColumn specification[22m [36m────────────────────────────────────────────────────────[39m
[1mDelimiter:[22m ","
[31mchr[39m  (1): site_name
[32mdbl[39m  (5): day_of_week, day_of_year, interval_of_day, avg_mph, total_volume
[33mlgl[39m  (2): missing_speed, missing_volume
[34mdttm[39m (1): timestamp
[34mdate[39m (1)

[1mRows: [22m[34m35040[39m [1mColumns: [22m[34m10[39m
[36m──[39m [1mColumn specification[22m [36m────────────────────────────────────────────────────────[39m
[1mDelimiter:[22m ","
[31mchr[39m  (1): site_name
[32mdbl[39m  (5): day_of_week, day_of_year, interval_of_day, avg_mph, total_volume
[33mlgl[39m  (2): missing_speed, missing_volume
[34mdttm[39m (1): timestamp
[34mdate[39m (1): date

[36mℹ[39m Use `spec()` to retrieve the full column specification for this data.
[36mℹ[39m Specify the column types or set `show_col_types = FALSE` to quiet this message.
[1mRows: [22m[34m35040[39m [1mColumns: [22m[34m10[39m
[36m──[39m [1mColumn specification[22m [36m────────────────────────────────────────────────────────[39m
[1mDelimiter:[22m ","
[31mchr[39m  (1): site_name
[32mdbl[39m  (5): day_of_week, day_of_year, interval_of_day, avg_mph, total_volume
[33mlgl[39m  (2): missing_speed, missing_volume
[34mdttm[39m (1): timestamp
[34mdate[39m (1)

[1mRows: [22m[34m35040[39m [1mColumns: [22m[34m10[39m
[36m──[39m [1mColumn specification[22m [36m────────────────────────────────────────────────────────[39m
[1mDelimiter:[22m ","
[31mchr[39m  (1): site_name
[32mdbl[39m  (5): day_of_week, day_of_year, interval_of_day, avg_mph, total_volume
[33mlgl[39m  (2): missing_speed, missing_volume
[34mdttm[39m (1): timestamp
[34mdate[39m (1): date

[36mℹ[39m Use `spec()` to retrieve the full column specification for this data.
[36mℹ[39m Specify the column types or set `show_col_types = FALSE` to quiet this message.
[1mRows: [22m[34m35040[39m [1mColumns: [22m[34m10[39m
[36m──[39m [1mColumn specification[22m [36m────────────────────────────────────────────────────────[39m
[1mDelimiter:[22m ","
[31mchr[39m  (1): site_name
[32mdbl[39m  (5): day_of_week, day_of_year, interval_of_day, avg_mph, total_volume
[33mlgl[39m  (2): missing_speed, missing_volume
[34mdttm[39m (1): timestamp
[34mdate[39m (1)

[1mRows: [22m[34m35040[39m [1mColumns: [22m[34m10[39m
[36m──[39m [1mColumn specification[22m [36m────────────────────────────────────────────────────────[39m
[1mDelimiter:[22m ","
[31mchr[39m  (1): site_name
[32mdbl[39m  (5): day_of_week, day_of_year, interval_of_day, avg_mph, total_volume
[33mlgl[39m  (2): missing_speed, missing_volume
[34mdttm[39m (1): timestamp
[34mdate[39m (1): date

[36mℹ[39m Use `spec()` to retrieve the full column specification for this data.
[36mℹ[39m Specify the column types or set `show_col_types = FALSE` to quiet this message.
[1mRows: [22m[34m35040[39m [1mColumns: [22m[34m10[39m
[36m──[39m [1mColumn specification[22m [36m────────────────────────────────────────────────────────[39m
[1mDelimiter:[22m ","
[31mchr[39m  (1): site_name
[32mdbl[39m  (5): day_of_week, day_of_year, interval_of_day, avg_mph, total_volume
[33mlgl[39m  (2): missing_speed, missing_volume
[34mdttm[39m (1): timestamp
[34mdate[39m (1)

In [55]:
# Sort the rows of every data frame by timestamp so that row order is chronological
england_df_list <- lapply(england_df_list, function(df) arrange(df, timestamp))
portland_df_list <- lapply(portland_df_list, function(df) arrange(df, timestamp))
utah_df_list <- lapply(utah_df_list, function(df) arrange(df, timestamp))

In [56]:
# Concatenate the three regional lists into a single list, in the order England, Portland, Utah
total_df_list_orig <- c(england_df_list, portland_df_list, utah_df_list)

In [57]:
# Load the sample-window boundaries that were saved for the original series.
# Note that this overwrites starting_points / ending_points, which up to here held the
# boundaries drawn for the unseen sensors
start_end_orig <- read_csv("start_end_points.csv")
starting_points <- start_end_orig[["start"]]
ending_points <- start_end_orig[["end"]]

[1mRows: [22m[34m76[39m [1mColumns: [22m[34m2[39m
[36m──[39m [1mColumn specification[22m [36m────────────────────────────────────────────────────────[39m
[1mDelimiter:[22m ","
[32mdbl[39m (2): start, end

[36mℹ[39m Use `spec()` to retrieve the full column specification for this data.
[36mℹ[39m Specify the column types or set `show_col_types = FALSE` to quiet this message.


In [58]:
# Number the rows of every original data frame in a column called 'rn'. The window boundaries in
# starting_points / ending_points are row numbers, so 'rn' is what the sampling step filters on
total_df_list_orig <- lapply(total_df_list_orig, function(df) mutate(df, rn = row_number()))

In [59]:
# Cut each original data frame down to its own 12-week window: keep the rows whose row number lies
# between the series' starting point and ending point (both inclusive).
# The boundaries come from a CSV and are matched to the series by position, so make sure there is
# exactly one pair per series; a missing boundary would silently produce an empty sample
stopifnot(length(starting_points) == length(total_df_list_orig),
          length(ending_points) == length(total_df_list_orig))
# seq_along() (rather than 1:length()) keeps the result empty when the list is empty
total_df_list_orig_samples <- lapply(seq_along(total_df_list_orig),
                                     function(idx) total_df_list_orig[[idx]] %>%
                                         filter(rn >= starting_points[idx],
                                                rn <= ending_points[idx]))

In [60]:
# Label every row of each 12-week sample with the split it belongs to: rows in the first 8 weeks
# are "train", the following 2 weeks are "val" and the last 2 weeks are "test". rn is renumbered
# from 1 within each sample first so that the week boundaries line up. Depending on the modeling
# task, train and val may both end up being used for training
total_df_list_orig_samples <- lapply(total_df_list_orig_samples, function(df) {
    mutate(df,
           rn = row_number(),
           train_val_test = ifelse(rn <= (96*7*8),
                                   "train",
                                   ifelse(rn <= (96*7*10), "val", "test")))
})

In [61]:
# Per original series, keep only the rows labelled "train"
train_samples_orig <- lapply(total_df_list_orig_samples,
                             function(df) filter(df, train_val_test == "train"))

In [62]:
# Turn the original train-only data frames into msts objects built from the target field
# (total_volume). Two seasonal periods are declared: 24*4 rows (one day) and 24*4*7 rows (one week)
train_samples_ts_orig <- lapply(
    train_samples_orig,
    function(df) msts(df$total_volume, seasonal.periods=c(24*4, 24*4*7))
)

In [63]:
# Find which of the original training series are the two DTW centroids: centroid_k is set to the
# position of the series whose values are identical to the k-th stored centroid (if several
# series match, the last one wins).
# identical() on the raw values gives a plain TRUE/FALSE even when the lengths differ or values
# are missing, where an element-wise == inside if() would fail or misbehave
for (i in seq_along(train_samples_ts_orig)) {
    candidate <- as.numeric(train_samples_ts_orig[[i]])
    if (identical(candidate, as.numeric(dtw_orig_cent[[1]]))) {
        centroid_1 <- i
    }
    if (identical(candidate, as.numeric(dtw_orig_cent[[2]]))) {
        centroid_2 <- i
    }
}

# Fail right here with a clear message if a centroid could not be matched, instead of hitting an
# "object not found" error further down
if (!exists("centroid_1", inherits=FALSE) || !exists("centroid_2", inherits=FALSE)) {
    stop("Could not match both DTW centroids to an original training series", call.=FALSE)
}

In [64]:
# Position of the cluster-1 centroid within the original training series
centroid_1

In [65]:
# Position of the cluster-2 centroid within the original training series
centroid_2

## Compute Distance to Centroids

In [66]:
# DTW distance from each of the four new training series to each of the two centroid series,
# computed with dtw_basic() and its default settings.
# NOTE(review): presumably the defaults match the settings the original clustering was run
# with - confirm against the original DTW notebook

# New series 1
dist_obs_1_cent_1 <- dtw_basic(x=train_samples_ts[[1]], y=train_samples_ts_orig[[centroid_1]])
dist_obs_1_cent_2 <- dtw_basic(x=train_samples_ts[[1]], y=train_samples_ts_orig[[centroid_2]])

# New series 2
dist_obs_2_cent_1 <- dtw_basic(x=train_samples_ts[[2]], y=train_samples_ts_orig[[centroid_1]])
dist_obs_2_cent_2 <- dtw_basic(x=train_samples_ts[[2]], y=train_samples_ts_orig[[centroid_2]])

# New series 3
dist_obs_3_cent_1 <- dtw_basic(x=train_samples_ts[[3]], y=train_samples_ts_orig[[centroid_1]])
dist_obs_3_cent_2 <- dtw_basic(x=train_samples_ts[[3]], y=train_samples_ts_orig[[centroid_2]])

# New series 4
dist_obs_4_cent_1 <- dtw_basic(x=train_samples_ts[[4]], y=train_samples_ts_orig[[centroid_1]])
dist_obs_4_cent_2 <- dtw_basic(x=train_samples_ts[[4]], y=train_samples_ts_orig[[centroid_2]])

### Observation 1

In [67]:
# DTW distance between new series 1 and the cluster-1 centroid
dist_obs_1_cent_1

In [68]:
# DTW distance between new series 1 and the cluster-2 centroid
dist_obs_1_cent_2

In [69]:
# The smaller of the two distances identifies the closer centroid for new series 1
min(dist_obs_1_cent_1, dist_obs_1_cent_2)

### Observation 2

In [70]:
# DTW distance between new series 2 and the cluster-1 centroid
dist_obs_2_cent_1

In [71]:
# DTW distance between new series 2 and the cluster-2 centroid
dist_obs_2_cent_2

In [72]:
# The smaller of the two distances identifies the closer centroid for new series 2
min(dist_obs_2_cent_1, dist_obs_2_cent_2)

### Observation 3

In [73]:
# DTW distance between new series 3 and the cluster-1 centroid
dist_obs_3_cent_1

In [74]:
# DTW distance between new series 3 and the cluster-2 centroid
dist_obs_3_cent_2

In [75]:
# The smaller of the two distances identifies the closer centroid for new series 3
min(dist_obs_3_cent_1, dist_obs_3_cent_2)

### Observation 4

In [76]:
# DTW distance between new series 4 and the cluster-1 centroid
dist_obs_4_cent_1

In [77]:
# DTW distance between new series 4 and the cluster-2 centroid
dist_obs_4_cent_2

In [78]:
# The smaller of the two distances identifies the closer centroid for new series 4
min(dist_obs_4_cent_1, dist_obs_4_cent_2)

In [79]:
# DTW cluster label for each of the four new series, entered by hand.
# NOTE(review): presumably read off the per-series distances computed above (closer centroid
# wins) - re-check these values whenever the data, the sample windows or the clustering change
dtw_clust_assign <- c(2, 2, 1, 1)

# Save Results

In [80]:
# Collect the cluster labels from all four approaches in one table: one row per new series, one
# column per approach (random baseline, Catch22 k-means, tsfeatures k-means, DTW)
new_clust_assign_df <- data.frame(rand=rand_clust_assign,
                                  catch22=catch22_clust_assign,
                                  tsfeat=tsfeat_clust_assign,
                                  dtw=dtw_clust_assign)

In [81]:
# Save the cluster assignments into the results folder created at the top of the notebook.
# The path is built from res_folder1 (which already ends in "/") so that the folder that gets
# created and the folder that gets written to cannot drift apart
write.csv(new_clust_assign_df, paste0(res_folder1, "clust_assign.csv"))