In [1]:
import os
import warnings
warnings.filterwarnings('ignore')

import numpy as np

from glob import glob
from pathlib import Path
from rfcm import RFCM

# Interval value embedded in the feature directory names that the clustering
# cell scans (timeseries_feature/interval_{INTERVAL}_<direction>*/*).
# NOTE(review): the unit of the interval is not visible in this notebook — confirm.
INTERVAL = 5

In [2]:
def dropna(nparray):
    """Remove NaN entries from an array, recursing into nested arrays.

    Parameters
    ----------
    nparray : np.ndarray (or sequence of np.ndarray)
        If its first element is itself an ``np.ndarray`` the function is
        applied to every element in turn; otherwise NaN values are removed
        with a boolean mask.

    Returns
    -------
    np.ndarray
        * empty input is returned as an (empty) array, unchanged;
        * when every cleaned row ends up with the same shape, a regular
          stacked array;
        * when rows end up with different shapes (different NaN counts),
          a 1-D ``dtype=object`` array holding one cleaned array per row.
    """
    # Nothing to index or filter; avoids IndexError on nparray[0].
    if len(nparray) == 0:
        return np.asarray(nparray)

    if isinstance(nparray[0], np.ndarray):
        rows = [dropna(row) for row in nparray]
        if len({row.shape for row in rows}) == 1:
            return np.array(rows)
        # Ragged result: np.array(rows) raises ValueError on NumPy >= 1.24,
        # so build the object array explicitly, one cleaned row per slot.
        ragged = np.empty(len(rows), dtype=object)
        for index, row in enumerate(rows):
            ragged[index] = row
        return ragged

    return nparray[~np.isnan(nparray)]

In [3]:
# For every feature directory of the selected direction(s): load the stored
# dataset, strip NaNs, fit RFCM and save the labels next to the input as
# "rfcm_label.npy". Directories that already have labels are skipped, so the
# cell can be re-run after an interruption.
for direction in ["src"]:
    # Path() normalises the separators for the current OS before globbing.
    pattern = str(Path(f'timeseries_feature/interval_{INTERVAL}_{direction}*/*'))
    # glob does not guarantee an order; sorting keeps the run and its log reproducible.
    for dirname in sorted(glob(pattern)):
        print("Dir: ", dirname)
        # The pattern also matches plain files; only directories hold datasets.
        if not os.path.isdir(dirname):
            continue
        label_path = os.path.join(dirname, "rfcm_label.npy")
        # Labels already written by an earlier run.
        if os.path.exists(label_path):
            continue
        dataset_path = os.path.join(dirname, "pyts_dataset.npy")
        # A missing input should not abort the remaining directories.
        if not os.path.isfile(dataset_path):
            print("\tSkipped: pyts_dataset.npy not found")
            continue
        pyts_dataset = dropna(np.load(dataset_path))
        print("\tPyts dataset shape: ", pyts_dataset.shape)
        model = RFCM(n_clusters=10, max_iter=10, random_state=0, n_jobs=4, epsilon=1e-3)
        model.fit(pyts_dataset)
        y_pred = model.labels_
        np.save(label_path, y_pred)
        print("\tRFCM label shape: ", y_pred.shape)
        # Drop the references before the next directory is loaded.
        del pyts_dataset, model, y_pred

Dir:  timeseries_feature\interval_5_src_feature\bytes-bytes_packets
Dir:  timeseries_feature\interval_5_src_feature\bytes-flows
Dir:  timeseries_feature\interval_5_src_feature\bytes-flows_(bytes_packets)
Dir:  timeseries_feature\interval_5_src_feature\bytes-nDstIP
Dir:  timeseries_feature\interval_5_src_feature\bytes-nDstPort
Dir:  timeseries_feature\interval_5_src_feature\bytes-nSrcPort
Dir:  timeseries_feature\interval_5_src_feature\bytes_packets-flows_(bytes_packets)
Dir:  timeseries_feature\interval_5_src_feature\bytes_packets-nDstIP
Dir:  timeseries_feature\interval_5_src_feature\bytes_packets-nDstPort
Dir:  timeseries_feature\interval_5_src_feature\bytes_packets-nSrcPort
Dir:  timeseries_feature\interval_5_src_feature\flows-bytes_packets
Dir:  timeseries_feature\interval_5_src_feature\flows-flows_(bytes_packets)
Dir:  timeseries_feature\interval_5_src_feature\flows-nDstIP
Dir:  timeseries_feature\interval_5_src_feature\flows-nDstPort
Dir:  timeseries_feature\interval_5_src_featur

In [4]:
# class EmptyClusterError(Exception):
#     def __init__(self, message=""):
#         super().__init__()
#         self.message = message

#     def __str__(self):
#         if len(self.message) > 0:
#             suffix = " (%s)" % self.message
#         else:
#             suffix = ""
#         return "Cluster assignments lead to at least one empty cluster" + \
#                suffix

In [5]:
# class TimeSeriesCentroidBasedClusteringMixin:
#     """Mixin class for centroid-based clustering of time series."""
#     def _post_fit(self, X_fitted, centroids, inertia):
#         if np.isfinite(inertia) and (centroids is not None):
#             self.cluster_centers_ = centroids
#             self._assign(X_fitted)
#             self._X_fit = X_fitted
#             self.inertia_ = inertia
#         else:
#             self._X_fit = None

In [6]:
# def _check_initial_guess(init, n_clusters):
#     if hasattr(init, '__array__'):
#         assert init.shape[0] == n_clusters, \
#             "Initial guess index array must contain {} samples," \
#             " {} given".format(n_clusters, init.shape[0])

In [7]:
# def _k_init_metric(X, n_clusters, cdist_metric, random_state, n_local_trials=None):
#     n_samples, n_timestamps, n_features = X.shape

#     centers = np.empty((n_clusters, n_timestamps, n_features), dtype=X.dtype)

#     # Set the number of local seeding trials if none is given
#     if n_local_trials is None:
#         # This is what Arthur/Vassilvitskii tried, but did not report
#         # specific results for other than mentioning in the conclusion
#         # that it helped.
#         n_local_trials = 2 + int(np.log(n_clusters))

#     # Pick first center randomly
#     center_id = random_state.randint(n_samples)
#     centers[0] = X[center_id]

#     # Initialize list of closest distances and calculate current potential
#     closest_dist_sq = cdist_metric(centers[0, np.newaxis], X) ** 2
#     current_pot = closest_dist_sq.sum()

#     # Pick the remaining n_clusters-1 points
#     for c in range(1, n_clusters):
#         # Choose center candidates by sampling with probability proportional
#         # to the squared distance to the closest existing center
#         rand_vals = random_state.random_sample(n_local_trials) * current_pot
#         candidate_ids = np.searchsorted(stable_cumsum(closest_dist_sq), rand_vals)
#         # XXX: numerical imprecision can result in a candidate_id out of range
#         np.clip(candidate_ids, None, closest_dist_sq.size - 1, out=candidate_ids)

#         # Compute distances to center candidates
#         distance_to_candidates = cdist_metric(X[candidate_ids], X) ** 2

#         # update closest distances squared and potential for each candidate
#         np.minimum(closest_dist_sq, distance_to_candidates, out=distance_to_candidates)
#         candidates_pot = distance_to_candidates.sum(axis=1)

#         # Decide which candidate is the best
#         best_candidate = np.argmin(candidates_pot)
#         current_pot = candidates_pot[best_candidate]
#         closest_dist_sq = distance_to_candidates[best_candidate]
#         best_candidate = candidate_ids[best_candidate]

#         # Permanently add best center candidate found in local tries
#         centers[c] = X[best_candidate]

#     return centers

In [8]:
# class TimeSeriesRFCM(TransformerMixin, ClusterMixin, TimeSeriesCentroidBasedClusteringMixin, BaseModelPackage, TimeSeriesBaseEstimator):
#     def __init__(self, n_clusters=3, max_iter=50, tol=1e-6, n_init=1, metric="softdtw", max_iter_barycenter=100, metric_params=None, n_jobs=None, dtw_inertia=False, verbose=0, random_state=None, init='RFCM') -> None:
#         super().__init__()
#         self.n_clusters = n_clusters
#         self.max_iter = max_iter
#         self.tol = tol
#         self.n_init = n_init
#         self.metric = metric
#         self.max_iter_barycenter = max_iter_barycenter
#         self.metric_params = metric_params
#         self.n_jobs = n_jobs
#         self.dtw_inertia = dtw_inertia
#         self.verbose = verbose
#         self.random_state = random_state
#         self.init = init
        
#     def _is_fitted(self):
#         check_is_fitted(self, ["cluster_centers_"])
#         return True
    
#     def _get_metric_params(self):
#         if self.metric_params is None:
#             metric_params = {}
#         else:
#             metric_params = self.metric_params.copy()
#         if "n_jobs" in metric_params:
#             del metric_params["n_jobs"]
#         return metric_params
    
#     def _fit_one_init(self, X, x_squared_norms, rs):
#         metric_params = self._get_metric_params()
#         n_ts, sz, d = X.shape
#         if hasattr(self.init, '__array__'):
#             self.cluster_centers_ = self.init.copy()
#         elif isinstance(self.init, str) and self.init == 'RFCM':
#             if self.metric == "softdtw":
#                 def metric_fun(x, y):
#                     return cdist_soft_dtw(x, y, **metric_params)
#             else:
#                 raise ValueError("Incorrect metric: %s (should be one of 'softdtw')" % self.metric)
#             self.cluster_centers_ = _k_init_metric(X, self.n_clusters, cdist_metric=metric_fun, random_state=rs)
#             for i in range(self.n_clusters):
#                 self.cluster_centers_[i] = X[rs.randint(n_ts)]
        
#     def _transform(self, X):
#         metric_params = self._get_metric_params()
#         if self.metric == "softdtw":
#             return cdist_soft_dtw(X, self.cluster_centers_, **metric_params)
#         else:
#             raise ValueError("Incorrect metric: %s (should be one of 'softdtw')" % self.metric)
        
#     def _update_centroids(self, X):
#         metric_params = self._get_metric_params()
#         for k in 
    
#     def fit(self, X, y=None):
#         X = check_array(X, allow_nd=True, force_all_finite='allow-nan')
        
#         if hasattr(self.init, '__array__'):
#             X = check_dims(X, X_fit_dims=self.init.shape, extend=True, check_n_features_only=(self.metric != "euclidean"))
            
#         self.labels_ = None
#         self.inertia_ = np.inf
#         self.cluster_centers_ = None
#         self._X_fit = None
#         self._squared_inertia = True

#         self.n_iter_ = 0

#         max_attempts = max(self.n_init, 10)

#         X_ = to_time_series_dataset(X)
#         rs = check_random_state(self.random_state)
        
#         x_squared_norms = None
#         _check_initial_guess(self.init, self.n_clusters)
        
#         best_correct_centroids = None
#         min_inertia = np.inf
#         n_successful = 0
#         n_attempts = 0
#         while n_successful < self.n_init and n_attempts < max_attempts:
#             try:
#                 if self.verbose and self.n_init > 1:
#                     print("Init %d" % (n_successful + 1))
#                 n_attempts += 1
#                 self._fit_one_init(X_, x_squared_norms, rs)
#                 if self.inertia_ < min_inertia:
#                     best_correct_centroids = self.cluster_centers_.copy()
#                     min_inertia = self.inertia_
#                     self.n_iter_ = self._iter
#                 n_successful += 1
#             except EmptyClusterError:
#                 if self.verbose:
#                     print("Resumed because of empty cluster")
#         self._post_fit(X_, best_correct_centroids, min_inertia)
#         return 