In [None]:
import numpy as np
import pyemma
from tqdm.notebook import tqdm
import mdtraj
import itertools
import h5py
from glob import glob

In [None]:
# file paths
topfile = 'setup/hsynapto.pdb'
# glob() returns its matches in arbitrary, OS-dependent order; sort them so the
# trajectory index assigned to each file is the same on every run
syt_files = sorted(glob('0cal_dyn*.1/hsynapto-protein-stride10.xtc'))

outfile = 'syt_0cal_internal1by1_stride100.hdf5'

In [None]:
# build the pyemma featurizer from the topology file
feat = pyemma.coordinates.featurizer(topfile)

# features: minimal distances between pairs of residues.
# The first and last 3 residues are left out of the selection, and
# excluded_neighbors=5 is passed so that pairs of residues close in sequence
# are not generated.
n_res = feat.topology.n_residues
inner_residues = np.arange(3, n_res - 3)
pairs = feat.pairs(inner_residues, excluded_neighbors=5)
feat.add_residue_mindist(residue_pairs=pairs)

In [None]:
# set up the data source that reads the trajectory files through the
# featurizer (the chunked iterator itself is created in the next cell)
data_source = pyemma.coordinates.source(syt_files, features=feat)

In [None]:
### process data with featurizer and write to disk

# The trajectory files being loaded were already written with a stride of 10,
# so the stride applied here multiplies with it: loading with stride 10 gives
# a total stride of 100 (compare the name in `outfile`).
load_stride = 10

it = data_source.iterator(stride=load_stride, chunk=1000)

with h5py.File(outfile, "w") as f:
    dset = None       # dataset of the trajectory currently being written
    write_pos = 0     # row of `dset` at which the next chunk is stored
    last_trajid = -1
    for trajid, chunk in tqdm(it, total=it.n_chunks):

        if last_trajid < trajid:
            # First chunk of a new trajectory: flush the finished dataset and
            # create the next one, named after the trajectory's parent
            # directory and sized for the strided trajectory length.
            if dset is not None:
                dset.flush()
            n_frames = data_source.trajectory_length(trajid, stride=load_stride)
            dset = f.create_dataset(syt_files[trajid].split('/')[-2],
                                    shape=(n_frames, feat.dimension()),
                                    dtype=np.float32)
            write_pos = 0
            last_trajid = trajid

        # Chunks of one trajectory are yielded in order, so they are appended
        # at a running offset. Using the chunk's own length means the final,
        # shorter chunk needs no special casing and the write position does
        # not depend on the iterator's internal state.
        n_rows = len(chunk)
        dset[write_pos:write_pos + n_rows] = chunk
        write_pos += n_rows