# Motifs clustering

Companion notebook of 2_Motifs_analysis. After exporting motifs to csv.zip files, proceed here for clustering and visualization.

## Import libraries

In [1]:
import torch
from torch.utils.data import DataLoader
from torchvision import transforms
import numpy as np
import pandas as pd
from load_data import DataProcesser
from results_model import top_confidence_perclass, least_correlated_set
from pattern_utils import extend_segments, create_cam, longest_segments, extract_pattern
from class_dataset import myDataset, ToTensor, RandomCrop
from dtaidistance import dtw, clustering
from models import ConvNetCam
from skimage.filters import threshold_li, threshold_mean
import os
from itertools import chain
from tqdm import tqdm
import subprocess

In [2]:
# --- Input data -------------------------------------------------------------
# Alternative datasets, kept for quick switching:
# data_file = '/home/marc/Dropbox/Work/TSclass_GF/data/ErkAkt_6GF_len240_repl2_trim100.zip'
# data_file = '/home/marc/Dropbox/CNN_paper_MarcAntoine/forPaper/data_analysis/synthetic_len750_univariate_classAB.zip'
data_file = '/home/marc/Dropbox/CNN_paper_MarcAntoine/forPaper/data_analysis/synthetic_len750.zip'

# --- Pattern length bounds --------------------------------------------------
min_len_patt = 5
max_len_patt = 400  # length to divide by nchannel

# --- DTW options (forwarded to the R distance script) -----------------------
center_patt = False
normalize_dtw = True

# --- Which pattern sets to process ------------------------------------------
export_perClass = True
export_allPooled = True

# --- Output formats of the distance matrices --------------------------------
save_csv = True
save_rds = True

# --- Load the dataset and derive class labels / measurement variables -------
data = DataProcesser(data_file)
# Class labels are read from the second column of the `classes` table.
classes = tuple(data.classes.iloc[:, 1])

meas_var = None  # Set to None for auto detection
if meas_var is None:
    # Auto detection: use the groups reported by the data processor.
    meas_var = data.detect_groups_times()['groups']

## Build distance matrix with DTW

This is done in R with the implementation of the *parallelDist* package. It is very efficient and has support for multivariate cases.

The distance matrices can be written either as a compressed csv (square matrix, lower triangle and diagonal set to Inf), as an rds R object which contains an R distance object, or as both. The latter is very useful to resume clustering directly in R: just load the distance object with readRDS().

In [3]:
def _as_r_flag(value):
    """Return the "T"/"F" flag string handed to the R script for `value`.

    An already converted "F" is kept as "F". Without this, re-running the
    cell would turn "F" into "T", because a non-empty string is truthy.
    """
    if isinstance(value, str) and value == "F":
        return "F"
    return "T" if value else "F"


# Same names are reassigned (bool -> "T"/"F"), but the conversion is now
# idempotent, so the cell can safely be executed more than once.
center_patt = _as_r_flag(center_patt)
normalize_dtw = _as_r_flag(normalize_dtw)
save_csv = _as_r_flag(save_csv)
save_rds = _as_r_flag(save_rds)

_DTW_SCRIPT = '/home/marc/Dropbox/Work/TSclass_GF/dtw_multivar_distmat.R'


def _run_dtw_distmat(patt_file, dist_prefix, colid):
    """Launch the R script that builds a DTW distance matrix.

    Parameters
    ----------
    patt_file : str
        Path of the exported patterns file (passed as ``-i``).
    dist_prefix : str
        Output path for the distance matrix (passed as ``-o``).
    colid : str
        Value passed as ``--colid`` ("NULL" or a column name).

    Returns
    -------
    int or None
        Exit status of Rscript, or None if nothing was run (no measurement
        variable, or Rscript could not be started).
    """
    # The original code only ran the script for one or more variables.
    if len(meas_var) < 1:
        return None
    # An argument list (no shell) keeps paths containing spaces or shell
    # metacharacters intact.
    cmd = [
        'Rscript', '--vanilla', _DTW_SCRIPT,
        '-i', patt_file,
        '-o', dist_prefix,
        '-l', str(max_len_patt),
        '-n', str(len(meas_var)),
        '--norm', normalize_dtw,
        '--center', center_patt,
        '--colid', colid,
        '--csv', save_csv,
        '--rds', save_rds,
    ]
    try:
        returncode = subprocess.call(cmd)
    except OSError as err:
        # Keep the best-effort behaviour, but make the failure visible.
        print('WARNING: could not launch Rscript: {}'.format(err))
        return None
    if returncode != 0:
        print('WARNING: Rscript exited with status {} for {}'.format(returncode, patt_file))
    return returncode


if export_perClass:
    for classe in classes:
        print('Building distance matrix for class: {} \n'.format(classe))
        fout_patt = '/home/marc/Dropbox/Work/TSclass_GF/Notebooks/output/' + '_'.join(meas_var) + '/local_patterns/patt_uncorr_{}.csv.gz'.format(classe)
        fout_dist = '/home/marc/Dropbox/Work/TSclass_GF/Notebooks/output/' + '_'.join(meas_var) + '/local_patterns/uncorr_dist_norm_{}'.format(classe)
        _run_dtw_distmat(fout_patt, fout_dist, "NULL")

if export_allPooled:
    print('Building distance matrix for pooled data.')
    fout_patt = '/home/marc/Dropbox/Work/TSclass_GF/Notebooks/output/' + '_'.join(meas_var) + '/local_patterns/patt_uncorr_allPooled.csv.gz'
    fout_dist = '/home/marc/Dropbox/Work/TSclass_GF/Notebooks/output/' + '_'.join(meas_var) + '/local_patterns/uncorr_dist_norm_allPooled'
    _run_dtw_distmat(fout_patt, fout_dist, "pattID")

Building distance matrix for class: A 

Building distance matrix for class: B 

Building distance matrix for class: C 

Building distance matrix for class: D 

Building distance matrix for pooled data.


## Cluster, generate report with results

This will use the distance matrix generated in the previous section to perform hierarchical clustering. Medoids from each cluster are reported along with a random sample of each cluster.

- nclust: int, number of clusters.
- nmedoid: int, number of medoids to plot per cluster.
- nseries: int, number of series to plot from each cluster (chosen randomly).
- linkage: str, one of ["ward.D", "ward.D2", "single", "complete", "average", "mcquitty", "median", "centroid"]. Linkage for hierarchical clustering. Ward and average seem to be advisable defaults. More details on the help page for hierarchical clustering with R: https://stat.ethz.ch/R-manual/R-devel/library/stats/html/hclust.html

In [4]:
nclust = 4
nmedoid = 3
nseries = 16
linkage = "ward.D2"

assert linkage in ["ward.D", "ward.D2", "single", "complete", "average", "mcquitty", "median", "centroid"]

_CLUSTERING_SCRIPT = '/home/marc/Dropbox/Work/TSclass_GF/pattern_clustering.R'


def _run_pattern_clustering(dist_file, patt_file, plot_file, colid):
    """Launch the R script that clusters patterns and writes the report.

    Parameters
    ----------
    dist_file : str
        Path of the distance matrix file (passed as ``-d``).
    patt_file : str
        Path of the exported patterns file (passed as ``-p``).
    plot_file : str
        Path of the PDF report to write (passed as ``-o``).
    colid : str
        Value passed as ``--colid`` ("NULL" or a column name).

    Returns
    -------
    int or None
        Exit status of Rscript, or None if Rscript could not be started.
    """
    # An argument list (no shell) keeps paths containing spaces or shell
    # metacharacters intact; the original shell string left them unquoted.
    cmd = [
        'Rscript', '--vanilla', _CLUSTERING_SCRIPT,
        '-d', dist_file,
        '-p', patt_file,
        '-o', plot_file,
        '-l', str(max_len_patt),
        '-n', str(len(meas_var)),
        '-c', str(nclust),
        '-m', str(nmedoid),
        '-t', str(nseries),
        '--colid', colid,
        '--linkage', linkage,
    ]
    try:
        returncode = subprocess.call(cmd)
    except OSError as err:
        # Keep the best-effort behaviour, but make the failure visible.
        print('WARNING: could not launch Rscript: {}'.format(err))
        return None
    if returncode != 0:
        print('WARNING: Rscript exited with status {} for {}'.format(returncode, patt_file))
    return returncode


# NOTE(review): the distance matrices are read from '*.csv.gz' files, which
# are presumably the ones written by the distance-matrix step when csv export
# is enabled -- confirm against the R scripts.
if export_perClass:
    for classe in classes:
        print('Cluster patterns for class: {} \n'.format(classe))
        fout_patt = '/home/marc/Dropbox/Work/TSclass_GF/Notebooks/output/' + '_'.join(meas_var) + '/local_patterns/patt_uncorr_{}.csv.gz'.format(classe)
        fout_dist = '/home/marc/Dropbox/Work/TSclass_GF/Notebooks/output/' + '_'.join(meas_var) + '/local_patterns/uncorr_dist_norm_{}.csv.gz'.format(classe)
        fout_plot = '/home/marc/Dropbox/Work/TSclass_GF/Notebooks/output/' + '_'.join(meas_var) + '/local_patterns/uncorr_pattPlot_{}.pdf'.format(classe)
        _run_pattern_clustering(fout_dist, fout_patt, fout_plot, "NULL")

if export_allPooled:
    print('Cluster patterns for pooled data.')
    fout_patt = '/home/marc/Dropbox/Work/TSclass_GF/Notebooks/output/' + '_'.join(meas_var) + '/local_patterns/patt_uncorr_allPooled.csv.gz'
    fout_dist = '/home/marc/Dropbox/Work/TSclass_GF/Notebooks/output/' + '_'.join(meas_var) + '/local_patterns/uncorr_dist_norm_allPooled.csv.gz'
    fout_plot = '/home/marc/Dropbox/Work/TSclass_GF/Notebooks/output/' + '_'.join(meas_var) + '/local_patterns/uncorr_pattPlot_allPooled.pdf'
    _run_pattern_clustering(fout_dist, fout_patt, fout_plot, "pattID")
print('Clustering done.')

Cluster patterns for class: A 

Cluster patterns for class: B 

Cluster patterns for class: C 

Cluster patterns for class: D 

Cluster patterns for pooled data.
Clustering done.
