## Segments and events selection

In [20]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.neighbors import KernelDensity

import transport_signal_processing as tsp
from data_manager import DataManager

import mylib as my

ModuleNotFoundError: No module named 'mylib'

In [21]:
# --- configuration ---
path = "*"             # pattern handed to the database lookup
level = 1              # a segment is kept when its 'selected' value exceeds level - 1
selected_only = True   # forwarded to the tsp loaders in the cells below

# database connector
sigman = DataManager('database')

# per-segment info records matching the pattern
sinfo_l = sigman.load_info(path, 's*')

# tabulate the records and keep only the segments at or above the selection level
df = pd.DataFrame(sinfo_l).loc[lambda frame: frame['selected'] > (level - 1)]
df

Unnamed: 0,pore,temperature,voltage,analyte,buffer,channel,id,sid,segment_range,segment_duration,MODIFIED,mI_open,sI_open,N_events,N_cores,N_reduced,selected,ratio_sel
0,K238A,25,100,AA00300AA,LiCl,4,1-0,0,"[0, 4246287]",42.46287,2022-03-10_09:58:55,44.726295,3.441116,2176.0,1001.0,1001.0,1.0,0.825175
1,K238A,25,100,AA00300AA,LiCl,4,2-0,0,"[0, 5121195]",51.21195,2022-03-10_09:58:43,43.902667,3.574460,2219.0,1075.0,1075.0,1.0,0.817674
2,K238A,25,100,AA00300AA,LiCl,4,2-0,1,"[5185984, 8182573]",29.96589,2022-03-10_09:58:43,44.055105,3.615654,1333.0,647.0,647.0,1.0,0.799073
3,K238A,25,100,AA00300AA,LiCl,4,2-0,2,"[8434663, 15932687]",74.98024,2022-03-10_09:58:55,43.880660,3.592745,3801.0,1799.0,1799.0,1.0,0.831017
8,K238A,25,100,AA00300AA,LiCl,2,3-0,4,"[25490928, 37941599]",124.50671,2022-03-10_09:58:43,41.274651,3.066304,4435.0,2356.0,2356.0,1.0,0.938455
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
872,K238A,15,100,AA00300AA,KCl,4,6,0,"[0, 18476959]",184.76959,2022-03-10_09:58:39,67.498344,3.551380,3543.0,1183.0,1183.0,1.0,0.770076
873,K238A,15,100,AA00300AA,KCl,4,6,1,"[18478922, 35137084]",166.58162,2022-03-10_09:58:39,67.301492,3.524443,2308.0,855.0,855.0,1.0,0.767251
874,K238A,15,100,AA00300AA,KCl,4,6,2,"[35187376, 113683647]",784.96271,2022-03-10_09:58:53,67.802013,3.509333,14926.0,5144.0,5144.0,1.0,0.769440
875,K238A,5,100,AA00300AA,KCl,4,6-0,0,"[0, 90978255]",909.78255,2022-03-10_09:58:54,53.412203,3.143489,12633.0,3148.0,3148.0,1.0,0.563532


### Consistency check

In [22]:
# column used to group the segments
# NOTE(review): the recorded run of this cell stopped with KeyError: 'polymer_name'
# and the displayed segment table has no such column -- confirm the column names
# used here (including the ones in `keys`) against the database schema.
key_sel = 'polymer_name'

# distinct values of the grouping column
key_sels = np.unique(df[key_sel].values)

for key in key_sels:
    # rows of this group; copied so a column can be added without touching df
    dfs = df.loc[df[key_sel] == key].copy()

    # one identifier per measurement: the identifying columns joined with '_'
    keys = ['pore', 'polymer_name', 'temperature', 'voltage', 'channel', 'rid']
    dfs['uid'] = dfs[keys].astype(str).apply('_'.join, axis=1)

    # statistics grouped by that identifier
    stats_dict = tsp.utils.load_stats_for_key(
        sigman, dfs.to_dict('records'), 'uid', selected_only=selected_only
    )

    # keep the first two columns of every array; per the original author's notes
    # each array is one experiment with one row per event, and those columns are
    # (event duration, average relative current)
    stats_l = [stats[:, :2] for stats in stats_dict.values()]

    # combined statistical divergence between the measurements
    D = tsp.stats.divergence_matrix(stats_l)

    # heat map of the divergence matrix, rows labelled with the identifiers
    plt.figure()
    plt.imshow(D, cmap='jet', origin='lower')
    plt.colorbar(label='divergence')
    plt.yticks(np.arange(D.shape[0]), list(stats_dict))
    plt.title(key)
    plt.show()

KeyError: 'polymer_name'

### Experiments comparison

In [23]:
# parameter
# NOTE(review): the recorded run of this cell stopped with KeyError: 'polymer_name'
# -- confirm the grouping column against the database schema.
key_sel = "polymer_name"

# statistics grouped by the selected key
stats_dict = tsp.utils.load_stats_for_key(
    sigman, df.to_dict('records'), key_sel, selected_only=selected_only
)

# one scatter plot per group: column 1 on the x axis, column 0 on a log y axis
for key, stats in stats_dict.items():
    print(stats.shape)

    fig, ax = plt.subplots(figsize=(6, 5))
    ax.semilogy(stats[:, 1], stats[:, 0], '.', ms=1.0)
    ax.set_xlim(-3.0, 92.0)
    ax.set_ylim(0.2, 100)
    ax.set_xlabel('average relative current [%]')
    ax.set_ylabel('dwell time [ms]')
    ax.set_title(key)
    fig.tight_layout()
    plt.show()

KeyError: 'polymer_name'

In [24]:
# parameter
# NOTE(review): the recorded run of this cell stopped with KeyError: 'polymer_name'
# -- confirm the grouping column against the database schema.
key_sel = "polymer_name"

# statistics grouped by the selected key (one array per group, one row per event)
stats_dict = tsp.utils.load_stats_for_key(
    sigman, df.to_dict('records'), key_sel, selected_only=selected_only
)

# column 1 of every group's array is what gets histogrammed
labels = list(stats_dict)
currents = [stats_dict[name][:, 1] for name in labels]

# overlaid step histograms, one per group
plt.figure()
plt.hist(currents, bins=100, range=(-3.0, 92.0), histtype='step', label=labels)
plt.xlim(-3.0, 92.0)
plt.legend(loc='best')
plt.xlabel('average relative current [%]')
plt.ylabel('count')
plt.show()

KeyError: 'polymer_name'

### Events analysis

In [25]:
# parameters
# NOTE(review): the recorded run of this cell stopped with KeyError: 'polymer_name'
# -- confirm the grouping column against the database schema.
key_sel = 'polymer_name'

# find keys
key_sels = np.unique(df[key_sel].values)

# core events of every group, keyed by the group's value in `key_sel`
events_dict = {}
for key in key_sels:
    # subset
    dfs = df[df[key_sel] == key].copy()

    # load core events
    # The orient must be spelled 'records': the abbreviated 'record' was
    # deprecated and is rejected by pandas 2.x, and every other cell of this
    # notebook already uses the full name.
    events_dict[key] = tsp.utils.load_core_events(sigman, dfs.to_dict('records'), selected_only=selected_only)

KeyError: 'polymer_name'

In [26]:
# parameters
N_interp = 200   # number of points every event is resampled to
N_clst = 40      # number of K-means clusters
ncol = 4         # layout of the cluster overview grid
nrow = 10

# The overview grid shows one panel per cluster; checked up front so a
# mismatch is reported before any clustering work is done.
assert ncol*nrow == N_clst

# Restricted to a single key; iterate over `events_dict` instead to process
# every group.
for key in ['pure103140asynB8']:
    print("="*62)
    print(key)
    
    # per the original author's notes: events_dict[key] is a list of signals,
    # each an array whose columns are (time, relative current)
    print('len(events_dict[key]) = ', len(events_dict[key]))
    print('events_dict[key][0].shape = ', events_dict[key][0].shape)
    print('events_dict[key][0][:5, :] :\n', events_dict[key][0][:5, :])
    
    # resample column 1 of every event to N_interp points so that all events
    # can be stacked into one 2-D array
    X = np.array([tsp.signals.downsample(x[:,1], N_interp) for x in events_dict[key]])

    # clustering
    # The fit was commented out, which left `clst` undefined on a fresh kernel
    # (the cell only worked with a leftover object from an earlier run). The
    # seed is fixed so the cluster numbering is reproducible between runs.
    clst = KMeans(N_clst, random_state=0)
    clst.fit(X)

    # get labels
    y = clst.labels_
    print('labels:', y[:5])

    # split by clusters
    # Indexed by cluster number (not by the labels that happen to occur) so
    # that X_l[k] always lines up with the k-th row of the cluster centers.
    ic_l = [np.where(y == i)[0] for i in range(N_clst)]
    X_l = [X[ids] for ids in ic_l]
    
    # X_l[k] is an (N_subgroup x N_interp) array holding the events assigned
    # to cluster k
    print('len(X_l) = ', len(X_l))
    print('X_l[0].shape = ', X_l[0].shape)

    # get cluster centers: (N_clst x N_interp), drawn in red below
    Xc = clst.cluster_centers_
    
    print('Xc.shape = ', Xc.shape)
    
    # scatter of the event statistics, one colour per cluster
    # NOTE(review): this indexes stats_dict[key] with positions taken from
    # events_dict[key]; it assumes both hold the same events in the same
    # order -- TODO confirm.
    plt.figure(figsize=(8,4))
    for ids in ic_l:
        plt.semilogy(stats_dict[key][ids,1], stats_dict[key][ids,0], '.', ms=1.0)

    plt.xlim(0.0, 92.0)
    plt.ylim(0.1, 100.0)
    plt.xlabel('average relative current [%]')
    plt.ylabel('dwell time [ms]')
    plt.show()

    # define x axis points (relative position within the resampled event)
    t = np.linspace(0.0, 1.0, N_interp)

    # overview grid: every member of a cluster in faint black, its center in red
    plt.figure(figsize=(4*ncol,2*nrow))
    for i in range(nrow):
        for j in range(ncol):
            # get 1d index
            k = ncol*i+j
            # subplot
            plt.subplot(nrow, ncol, k+1)
            for x in X_l[k]:
                plt.plot(t, x, 'k-', alpha=0.025)
            plt.plot(t, Xc[k], 'r-')
            plt.xlim(0.0, 1.0)
            # keep the tick marks but hide their labels
            plt.xticks([0.0, 0.25, 0.5, 0.75, 1.0], ['', '', '', '', ''])
            # title: share of all events in this cluster, and the cluster index
            plt.title('{:.2f}%; {:d}'.format(1e2*X_l[k].shape[0]/X.shape[0], k))

    plt.tight_layout()
    plt.show()

pure103140asynB8


NameError: name 'events_dict' is not defined

### Classification

In [27]:
# parameter
# NOTE(review): the recorded run of this cell stopped with KeyError: 'polymer_name'
# -- confirm the grouping column against the database schema.
key_sel = "polymer_name"

# load stats grouped by selected key
stats_dict = tsp.utils.load_stats_for_key(sigman, df.to_dict('records'), key_sel, selected_only=selected_only)

# normalised histogram of column 1 over the full range, one curve per group
plt.figure()
plt.hist([stats_dict[key][:,1] for key in stats_dict], range=(-3.0, 92.0), bins=100, density=True, histtype='step', label=list(stats_dict))
plt.xlim(-3.0, 92.0)
plt.legend(loc='best')
plt.xlabel('average relative current [%]')
# density=True normalises the histogram, so the axis shows a density, not a count
plt.ylabel('density')
plt.show()

# same histogram restricted to the low-current range
plt.figure()
plt.hist([stats_dict[key][:,1] for key in stats_dict], range=(-3.0, 20.0), bins=50, density=True, histtype='step', label=list(stats_dict))
plt.xlim(-3.0, 20.0)
plt.legend(loc='best')
plt.xlabel('average relative current [%]')
plt.ylabel('density')
plt.show()

# probability: per group, the fraction of events whose column-1 value is below 20
p = np.array([np.mean(stats_dict[key][:,1] < 20.0) for key in stats_dict])

# bar chart of that fraction, with the rounded value printed above each bar
plt.figure()
plt.bar(np.arange(p.shape[0]), p)
for i in range(p.shape[0]):
    plt.text(i, p[i]+5e-3, str(np.round(p[i],2)), horizontalalignment='center')
plt.xticks(np.arange(p.shape[0]), list(stats_dict))
plt.ylabel('fraction of events below 20 % average relative current')
plt.show()

KeyError: 'polymer_name'

In [28]:
# print a separator line followed by the key, for each listed key
for key in ('pure103140asynB8',):
    print(62 * "=")
    print(key)

pure103140asynB8
