# Motifs clustering

Companion notebook of 2_Motifs_analysis. After exporting motifs to csv.zip files, proceed here for clustering and visualization.

## Import libraries

In [1]:
import torch
from torch.utils.data import DataLoader
from torchvision import transforms
import numpy as np
import pandas as pd
from load_data import DataProcesser
from results_model import top_confidence_perclass, least_correlated_set
from pattern_utils import extend_segments, create_cam, longest_segments, extract_pattern
from class_dataset import myDataset, ToTensor, RandomCrop
from dtaidistance import dtw, clustering
from models import ConvNetCam
from skimage.filters import threshold_li, threshold_mean
import os
from itertools import chain
from tqdm import tqdm
import subprocess

In [2]:
# Notebook configuration: input dataset, pattern-length limits, DTW options and
# the switches selecting which pattern sets are processed by the cells below.

# Zipped dataset handed to DataProcesser (absolute, machine-specific path).
data_file = '/home/marc/Dropbox/Work/TSclass_GF/data/ErkAkt_6GF_len240_repl2_trim100.zip'
data = DataProcesser(data_file)
# Class labels taken from the second column of `data.classes`; the per-class
# steps below loop over this tuple.
classes = tuple(data.classes.iloc[:, 1])
# Measurement channels: joined with '_' to form the output sub-directory name,
# and their count is passed to the R scripts as the number of channels (`-n`).
meas_var = ['ERK', 'AKT']

# NOTE(review): min_len_patt is not referenced by the cells shown here --
# presumably kept in sync with the motif-extraction notebook; confirm.
min_len_patt = 40
max_len_patt = 400  # passed to the R scripts as `-l`; length to divide by nchannel

# Forwarded to the DTW distance script as `--center` and `--norm` respectively.
center_patt = False
normalize_dtw = True

# Switches: process the per-class pattern files and/or the pooled pattern file.
export_perClass = True
export_allPooled = True

## Build distance matrix with DTW

The distance matrix is computed in R (called below through `Rscript`), using the DTW implementation of the *parallelDist* package. It is very efficient and supports multivariate series.

In [3]:
def _build_dtw_distmat(patt_file, dist_file, colid):
    """Run the DTW distance-matrix R script on one pattern file.

    The command is passed to ``subprocess.call`` as an argument list (no shell),
    so paths containing spaces or shell metacharacters are handled safely.

    Parameters
    ----------
    patt_file : str
        Path of the exported pattern file, passed as ``-i``.
    dist_file : str
        Output path for the distance matrix, passed as ``-o``.
    colid : str
        Value passed as ``--colid`` ("NULL" for per-class files, "pattID" for
        the pooled file).

    Returns
    -------
    int or None
        Exit status of ``Rscript``; None if the script was not launched.
    """
    n_channels = len(meas_var)
    if n_channels < 1:
        # The original cell launched nothing when no channel was selected.
        print('No measurement variable selected, distance matrix skipped.')
        return None

    cmd = [
        'Rscript', '--vanilla',
        '/home/marc/Dropbox/Work/TSclass_GF/dtw_multivar_distmat.R',
        '-i', patt_file,
        '-o', dist_file,
        # Always the undivided integer length: the single-channel per-class
        # branch used to pass max_len_patt/len(meas_var), i.e. the float "400.0".
        '-l', str(max_len_patt),
        '-n', str(n_channels),
        '--norm', str(normalize_dtw),
        '--center', str(center_patt),
        '--colid', colid,
    ]
    try:
        status = subprocess.call(cmd)
    except OSError as err:
        # E.g. Rscript is not installed / not on PATH; report and keep going.
        print('WARNING: could not launch Rscript: {}'.format(err))
        return None
    if status != 0:
        # Surface failures of the R script instead of silently ignoring them.
        print('WARNING: Rscript exited with status {} for {}'.format(status, patt_file))
    return status


# Directory holding the exported patterns; distance matrices are written next to them.
_patt_dir = '/home/marc/Dropbox/Work/TSclass_GF/Notebooks/output/' + '_'.join(meas_var) + '/local_patterns/'

if export_perClass:
    for classe in classes:
        print('Building distance matrix for class: {} \n'.format(classe))
        fout_patt = _patt_dir + 'patt_uncorr_{}.csv.gz'.format(classe)
        fout_dist = _patt_dir + 'uncorr_dist_norm_{}.csv.gz'.format(classe)
        _build_dtw_distmat(fout_patt, fout_dist, "NULL")

if export_allPooled:
    print('Building distance matrix for pooled data.')
    fout_patt = _patt_dir + 'patt_uncorr_allPooled.csv.gz'
    fout_dist = _patt_dir + 'uncorr_dist_norm_allPooled.csv.gz'
    _build_dtw_distmat(fout_patt, fout_dist, "pattID")

Building distance matrix for class: BTC 

Building distance matrix for class: CTR 

Building distance matrix for class: EGF 

Building distance matrix for class: EPR 

Building distance matrix for class: HGF 

Building distance matrix for class: HRG 

Building distance matrix for class: IGF 

Building distance matrix for pooled data.


## Cluster, generate report with results

This will use the distance matrix generated in the previous section to perform hierarchical clustering. Medoids from each cluster are reported along with a random sample of each cluster.

- nclust: int, number of clusters.
- nmedoid: int, number of medoids to plot per cluster.
- nseries: int, number of series to plot from each cluster (chosen randomly).

In [4]:
nclust = 4    # number of clusters
nmedoid = 3   # number of medoids to plot per cluster
nseries = 16  # number of series to plot from each cluster


def _cluster_patterns(dist_file, patt_file, plot_file):
    """Run the pattern-clustering R script for one distance matrix.

    The command is passed to ``subprocess.call`` as an argument list (no shell),
    so the unquoted paths of the original command can no longer be split or
    interpreted by a shell.

    Parameters
    ----------
    dist_file : str
        Path of the distance matrix, passed as ``-d``.
    patt_file : str
        Path of the exported pattern file, passed as ``-p``.
    plot_file : str
        Path of the PDF report, passed as ``-o``.

    Returns
    -------
    int or None
        Exit status of ``Rscript``; None if it could not be launched.
    """
    cmd = [
        'Rscript', '--vanilla',
        '/home/marc/Dropbox/Work/TSclass_GF/pattern_clustering.R',
        '-d', dist_file,
        '-p', patt_file,
        '-o', plot_file,
        '-l', str(max_len_patt),
        '-n', str(len(meas_var)),
        # Kept in the attached short-option form used by the original command.
        '-c{}'.format(nclust),
        '-m', str(nmedoid),
        '-t', str(nseries),
    ]
    try:
        status = subprocess.call(cmd)
    except OSError as err:
        # E.g. Rscript is not installed / not on PATH; report and keep going.
        print('WARNING: could not launch Rscript: {}'.format(err))
        return None
    if status != 0:
        # Surface failures of the R script instead of silently ignoring them.
        print('WARNING: Rscript exited with status {} for {}'.format(status, dist_file))
    return status


# Directory holding the pattern files, distance matrices and PDF reports.
_patt_dir = '/home/marc/Dropbox/Work/TSclass_GF/Notebooks/output/' + '_'.join(meas_var) + '/local_patterns/'

# NOTE(review): the distance files are read with a doubled '.csv.gz.csv.gz'
# suffix, while the DTW step is given an output name ending in a single
# '.csv.gz' -- presumably the R script appends its own extension; confirm.
if export_perClass:
    for classe in classes:
        print('Cluster patterns for class: {} \n'.format(classe))
        fout_patt = _patt_dir + 'patt_uncorr_{}.csv.gz'.format(classe)
        fout_dist = _patt_dir + 'uncorr_dist_norm_{}.csv.gz.csv.gz'.format(classe)
        fout_plot = _patt_dir + 'uncorr_pattPlot_{}.pdf'.format(classe)
        _cluster_patterns(fout_dist, fout_patt, fout_plot)

if export_allPooled:
    fout_patt = _patt_dir + 'patt_uncorr_allPooled.csv.gz'
    fout_dist = _patt_dir + 'uncorr_dist_norm_allPooled.csv.gz.csv.gz'
    # No `.format(classe)` here: the name has no placeholder, and `classe` is
    # undefined when the per-class loop above did not run (NameError).
    fout_plot = _patt_dir + 'uncorr_pattPlot_allPooled.pdf'
    _cluster_patterns(fout_dist, fout_patt, fout_plot)
print('Clustering done.')

Cluster patterns for class: BTC 

Cluster patterns for class: CTR 

Cluster patterns for class: EGF 

Cluster patterns for class: EPR 

Cluster patterns for class: HGF 

Cluster patterns for class: HRG 

Cluster patterns for class: IGF 

Clustering done.
