In [1]:
import os
import json
import numpy as np
import torch as pt
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm
from scipy.signal import bessel, sosfiltfilt

import transport_signal_processing as tsp
from data_manager import DataManager

In [2]:
# destination folder for every file this notebook writes
output_dir = "datasets/100mV_0-90rI_bg-ps"

# create the folder (and any missing parents); an existing one is reused as-is
os.makedirs(name=output_dir, exist_ok=True)

In [3]:
# selection parameters
path = "*100mV*"        # pattern handed to sigman.load_info
level = 1               # segments are kept when 'selected' > level - 1
selected_only = True    # forwarded to the core-event loader in a later cell

# setup database connector
sigman = DataManager('database')

# load segments informations
sinfo_l = sigman.load_info(path, 's*')

# one dataframe row per segment info record
df = pd.DataFrame(sinfo_l)


def _join_fields(row, fields):
    """Return the string form of row[field] for each field, joined with '-'."""
    return '-'.join([str(row[name]) for name in fields])


# condition key
exp_keys = ['pore', 'voltage', 'analyte']
df['condition'] = df.apply(_join_fields, axis=1, args=(exp_keys,))

# experience id
exp_keys = ['pore', 'analyte', 'voltage', 'temperature', 'channel', 'replica']
df['uid'] = df.apply(_join_fields, axis=1, args=(exp_keys,))

# manual filter -- alternative analyte pairs used previously:
#   ['AYEMPSEE', 'GYQDYEPEA']
#   ['AYEMPSEE', 'psAYEMPSEE']
analyte_mask = df['analyte'].isin(['FLasynGluC', 'psAYEMPSEE'])
df = df[analyte_mask]

# keep only selected segments
selected_mask = df['selected'].gt(level - 1)
df = df[selected_mask]
df

Unnamed: 0,pore,analyte,voltage,temperature,channel,replica,part,sid,segment_range,segment_duration,MODIFIED,mI_open,sI_open,N_events,N_cores,N_reduced,selected,ratio_sel,condition,uid
137,K238A,FLasynGluC,100,25,3,1,0,0,"[0, 27029796]",270.29796,2022-10-05_17:24:41,87.677444,2.503898,5255,725.0,725.0,1.0,0.997241,K238A-100-FLasynGluC,K238A-FLasynGluC-100-25-003-01
138,K238A,FLasynGluC,100,25,3,1,0,1,"[27075059, 29999999]",29.2494,2022-10-05_17:24:41,87.963698,2.491037,684,100.0,100.0,1.0,1.0,K238A-100-FLasynGluC,K238A-FLasynGluC-100-25-003-01
139,K238A,FLasynGluC,100,25,3,0,1,0,"[0, 30099999]",300.99999,2022-10-05_17:24:41,87.517957,2.595546,4094,647.0,647.0,1.0,0.996909,K238A-100-FLasynGluC,K238A-FLasynGluC-100-25-003-00
140,K238A,FLasynGluC,100,25,3,0,2,0,"[0, 13466591]",134.66591,2022-10-05_17:24:41,87.562158,2.52596,2132,333.0,333.0,1.0,1.0,K238A-100-FLasynGluC,K238A-FLasynGluC-100-25-003-00
142,K238A,FLasynGluC,100,25,3,1,2,0,"[0, 14036351]",140.36351,2022-10-05_17:24:41,88.720042,2.474224,2859,406.0,406.0,1.0,0.982759,K238A-100-FLasynGluC,K238A-FLasynGluC-100-25-003-01
143,K238A,FLasynGluC,100,25,4,7,1,0,"[0, 29999999]",299.99999,2022-10-05_17:24:41,98.285206,2.570736,6678,917.0,917.0,1.0,0.995638,K238A-100-FLasynGluC,K238A-FLasynGluC-100-25-004-07
144,K238A,FLasynGluC,100,25,3,2,1,0,"[0, 9923641]",99.23641,2022-10-05_17:24:41,89.443573,2.444805,1375,192.0,192.0,1.0,0.994792,K238A-100-FLasynGluC,K238A-FLasynGluC-100-25-003-02
145,K238A,FLasynGluC,100,25,3,2,1,1,"[9965124, 25228671]",152.63547,2022-10-05_17:24:41,89.762495,2.497561,3137,426.0,426.0,1.0,0.995305,K238A-100-FLasynGluC,K238A-FLasynGluC-100-25-003-02
146,K238A,FLasynGluC,100,25,1,3,1,0,"[0, 30099999]",300.99999,2022-10-05_17:24:41,94.986527,1.863453,10708,3144.0,3144.0,1.0,0.998728,K238A-100-FLasynGluC,K238A-FLasynGluC-100-25-001-03
147,K238A,FLasynGluC,100,25,4,7,2,0,"[0, 5418751]",54.18751,2022-10-05_17:24:41,98.432336,2.548964,1182,152.0,152.0,1.0,1.0,K238A-100-FLasynGluC,K238A-FLasynGluC-100-25-004-07


In [4]:
# sanity check: distinct analytes left in df
analytes = df['analyte']
analytes.unique()

array(['FLasynGluC', 'psAYEMPSEE'], dtype=object)

In [5]:
def remove_tails(y, sigma_tol=0.0):
    """Trim leading and trailing samples that are not below a threshold.

    The threshold is ``mean(y) + sigma_tol * std(y)``, with both statistics
    computed over every element of `y`. The result spans from the first to
    the last index (along axis 0) at which a value is strictly below that
    threshold, both ends included.

    Parameters
    ----------
    y : numpy.ndarray
        Samples to trim. For a multi-dimensional array the trimming is
        applied along the first axis.
    sigma_tol : float, optional
        Number of standard deviations added to the mean to form the
        threshold.

    Returns
    -------
    numpy.ndarray
        Slice of `y` along its first axis. It is empty when no value lies
        below the threshold (e.g. a constant signal with ``sigma_tol=0``).
    """
    y_thr = np.mean(y) + sigma_tol * np.std(y)

    # indices along axis 0 of every value below the threshold, in ascending order
    below = np.where(y < y_thr)[0]

    # nothing below the threshold: return an empty slice rather than
    # failing on the [0, -1] lookup
    if below.size == 0:
        return y[:0]

    i0, i1 = below[0], below[-1]

    # i1 is the last index to keep, so the slice end must be i1 + 1
    return y[i0:i1 + 1]

def events_to_features(events, min_output_size=16, sigma_tol=0.0):
    """Turn raw events into feature arrays with column 0 rebased to zero.

    Each kept event is copied and its first column is shifted so that it
    starts at 0 (column 0 is presumably the time axis -- confirm against
    the event loader). The input arrays are never modified.

    Parameters
    ----------
    events : iterable of 2-D numpy.ndarray
        Events to convert.
    min_output_size : int, optional
        Events with this many rows or fewer are discarded.
    sigma_tol : float, optional
        Currently unused; retained so existing calls keep working.

    Returns
    -------
    list of numpy.ndarray
        One shifted copy per event that passed the size checks, in input order.
    """
    features = []
    for event in events:
        # events with at most one row are skipped outright
        if len(event) <= 1:
            continue

        # work on a copy so the caller's array is left untouched
        shifted = event.copy()
        origin = shifted[0, 0]
        shifted[:, 0] = shifted[:, 0] - origin

        # too short to be useful
        if len(shifted) <= min_output_size:
            continue

        features.append(shifted)
    return features

In [6]:
# parameters
N_min = 100  # an experiment is stored only if it yields more than N_min feature arrays

# find keys
ckey_sels = np.unique(df["condition"].values)

# extract features and labels from events
# data[label] maps experiment uid -> [features, labels]; label is the
# position of the condition in the sorted ckey_sels array
data = []
for label, ckey in enumerate(ckey_sels):
    # add entry
    per_uid = {}
    data.append(per_uid)

    # condition subset
    dfs = df[df["condition"] == ckey].copy()

    # group by experiment id
    for ukey in tqdm(np.unique(dfs["uid"].values)):
        segments = dfs[dfs["uid"] == ukey].to_dict('records')

        # load core events
        events = tsp.utils.load_core_events(sigman, segments, selected_only=selected_only)

        # extract features
        x_l = events_to_features(events)

        # report and skip experiments without enough events
        if len(x_l) <= N_min:
            print(ukey, len(x_l))
            continue

        # one label per feature array (no per-experiment standardisation is applied)
        y_l = [np.array(label) for _ in x_l]

        # store data
        per_uid[ukey] = [x_l, y_l]

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:02<00:00,  2.63it/s]
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:02<00:00,  1.85it/s]


In [7]:
# split train / test
# parameters
test_fraction = 0.2  # share of experiments held out per condition
split_seed = 0       # fixed seed: the split is identical on every Restart & Run All

# dedicated generator so the split does not depend on (or disturb)
# numpy's global random state
split_rng = np.random.default_rng(split_seed)

# one {'train': [...], 'test': [...]} dict of experiment uids per entry of data
ukeys_l = []
for i in range(len(data)):
    keys = np.array(list(data[i]))

    # hold out at least one experiment; note that a condition with a single
    # experiment therefore ends up with an empty training list
    n = max(1, int(np.ceil(test_fraction*len(keys))))
    split_rng.shuffle(keys)

    # tolist() yields plain Python strings for the JSON dump
    ukeys_l.append({
        'train': keys[n:].tolist(),
        'test': keys[:n].tolist(),
    })

# save split (context manager guarantees the file is flushed and closed)
with open(os.path.join(output_dir, "keys.json"), 'w') as fs:
    json.dump(ukeys_l, fs)

In [8]:
def _as_object_array(items):
    """Pack `items` into a 1-D object array holding one element per item.

    np.array(items, dtype=object) only gives that layout when the items have
    different lengths; a single item, or items that all share one shape, are
    merged into a multi-dimensional object array instead. Filling a
    pre-allocated array element by element keeps the layout independent of
    the data.
    """
    packed = np.empty(len(items), dtype=np.dtype('object'))
    for idx, item in enumerate(items):
        packed[idx] = item
    return packed


# pack results: concatenate features / labels of every experiment of every
# entry of data, following the train / test key lists in ukeys_l
X_tr_l, y_tr_l = [], []
X_te_l, y_te_l = [], []
for features, ukeys in zip(data, ukeys_l):
    for ukey in ukeys['train']:
        X_l, y_l = features[ukey]
        X_tr_l.extend(X_l)
        y_tr_l.extend(y_l)

    for ukey in ukeys['test']:
        X_l, y_l = features[ukey]
        X_te_l.extend(X_l)
        y_te_l.extend(y_l)

# save datasets
for fname, X_all, y_all in (
    ("train_dataset.npz", X_tr_l, y_tr_l),
    ("test_dataset.npz", X_te_l, y_te_l),
):
    np.savez(
        os.path.join(output_dir, fname),
        # X: always a 1-D object array with one event array per element
        X=_as_object_array(X_all),
        # y: packed exactly as before (1-D object array of labels)
        y=np.array(y_all, dtype=np.dtype('object')),
    )