# Dataset Example

## File Structure

In [13]:
# dataset_dir = '/home/rchaves/ssh_dirs/mn5/res/others/agus_MoDeL-CNS'
# dataset_dir = '/home/rchaves/ssh_dirs/irbcluster/scratch/model-cns/'
# dataset_dir = '/home/rchaves/repo/MDDB/workflow/test/data/input/dataset/'
dataset_dir = '/home/rchaves/ssh_dirs/mn5/ruben/model/'

# YAML use for the configuration of the dataset and the automatic inputs.yaml generation.
# project_directories outside the dataset directory are not allowed
dataset_yaml_path = dataset_dir + "dataset.yaml"
print(dataset_yaml_path)
!cat {dataset_yaml_path}

/home/rchaves/ssh_dirs/mn5/ruben/model/dataset.yaml
projects: 
    - '*'  # (matches dirs starting with a digit and all their subfolders)
ignore:
    - scripts # (ignore 'scripts' folders, it applies after resolving project directories)


In [14]:
inputs_template = dataset_dir + "inputs_template.yaml"
print(inputs_template)
!cat {inputs_template}

/home/rchaves/ssh_dirs/mn5/ruben/model/inputs_template.yaml
name: "{{ title }} ({{ DIR }})"
description: "{{ title }} (1 ms)"
authors: Agustín García
groups: IRB Barcelona, Orozco lab
citation: null
thanks: null
contact: agustin.garcia@irbbarcelona.org
type: trajectory
program: GROMACS
version: 2025.2
license: This trajectory dataset is released under a Creative Commons Attribution 4.0 International Public License
linkcense: tps://creativecommons.org/licenses/by/4.0/
method: Classical MD
accession: null
links: 
- name: Structural data source
  url: https://memprotmd.bioch.ox.ac.uk/_ref/PDB/{{ DIR }}
pdb_ids:
  - {{ DIR }}
forced_references: null
framestep: 0.01
timestep: 2
ensemble: NPT
ff: 53A6 GROMOS
wat: TIP3P
boxtype: Cubic
mds:
  - mdir: replica_1
mdref: 0
interactions: null
pbc_selection: auto
collections: mcns
chainnames: null
membranes: null
customs: null
multimeric: null
trjType: large
bucket: 8d3eha
temp: 310
ligands: null


In [15]:
job_template = dataset_dir + "job_template.sh"
print(job_template)
!cat {job_template}

/home/rchaves/ssh_dirs/mn5/ruben/model/job_template.sh
#!/bin/bash
#SBATCH --account=irb95
#SBATCH --job-name={{DIR}}_mddb
#SBATCH --output=mwf_%j.out
#SBATCH --error=mwf_%j.err
#SBATCH --qos=gp_resc
#SBATCH --ntasks=1
#SBATCH --cpus-per-task=8
#SBATCH --mail-type=END,FAIL
#SBATCH --time=24:00:00
#SBATCH --mail-user=ruben.chaves@irbbarcelona.org

set -x 

module purge
module load singularity

# 1) Setup in MN5
#singularity run -H $PWD -C ../mddb_wf.sif mwf run -top res.tpr -md replica_1 fitted.xtc -i setup -m stabonds intrajrity
#singularity run -H $PWD -C ../mddb_wf.sif mwf run -i inchikeys reframe -ow

# 2) Referencias con sshfs
#rm ./replica_1/.trajectory.xtc_offsets.lock
#mwf run -i network meta stopology -m stabonds intrajrity
#mwf run -i protmap -m stabonds intrajrity

# 3) Resto en MN5
singularity run -H $PWD -C ../mddb_wf.sif mwf run -e network protmap clusters -m stabonds intrajrity


## Python

In [16]:
%load_ext autoreload
%autoreload 2
from mddb_workflow.core.dataset import Dataset


dt = Dataset(dataset_yaml_path)
# Print the project directories to verify they are correct
dt.project_directories[:5]

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


['/home/rchaves/ssh_dirs/mn5/ruben/model/6kuy',
 '/home/rchaves/ssh_dirs/mn5/ruben/model/7cmu',
 '/home/rchaves/ssh_dirs/mn5/ruben/model/6wjc',
 '/home/rchaves/ssh_dirs/mn5/ruben/model/7e2y',
 '/home/rchaves/ssh_dirs/mn5/ruben/model/7e2z']

In [17]:
# Information at dt.status
dt.display_status_with_links()

Unnamed: 0_level_0,state,message,last_modified,group,log_file,err_file
rel_path,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
6gdg,error,Running BioBB LiPyphilic ZPositions,10:20:35 02/12/25,2,mwf_33119397.out,mwf_33119397.err
6j8h,error,Running BioBB LiPyphilic ZPositions,10:37:14 02/12/25,2,mwf_33119399.out,mwf_33119399.err
6jzh,error,[92m-> Running task protmap (Protein residues mapping)[0m,09:51:51 02/12/25,1,mwf_33119391.out,mwf_33119391.err
6k42,error,Running BioBB LiPyphilic ZPositions,10:20:54 02/12/25,2,mwf_33119398.out,mwf_33119398.err
6kux,error,[92m-> Running task linter (Membrane lipid-protein interactions analysis)[0m,10:11:12 02/12/25,0,mwf_33119404.out,mwf_33119404.err
6kuy,error,[92m-> Running task linter (Membrane lipid-protein interactions analysis)[0m,10:11:23 02/12/25,0,mwf_33119386.out,mwf_33119386.err
6ni3,error,Running BioBB LiPyphilic ZPositions,10:20:39 02/12/25,2,mwf_33119394.out,mwf_33119394.err
6nt3,error,[92m-> Running task protmap (Protein residues mapping)[0m,09:51:53 02/12/25,1,mwf_33119395.out,mwf_33119395.err
6oik,error,[92m-> Running task linter (Membrane lipid-protein interactions analysis)[0m,10:20:27 02/12/25,0,mwf_33119396.out,mwf_33119396.err
6ps5,error,[92m-> Running task protmap (Protein residues mapping)[0m,09:51:52 02/12/25,1,mwf_33119403.out,mwf_33119403.err


In [46]:
import requests

def obtener_titulo(DIR):
    url = f"https://data.rcsb.org/rest/v1/core/entry/{DIR}"
    r = requests.get(url)
    if r.status_code == 200:
        data = r.json()
        return data.get('struct', {}).get('title', '').strip()
    else:
        raise ValueError(f"No se pudo obtener título para PDB {DIR}")

dt.generate_inputs_yaml(inputs_template, obtener_titulo)

Skipping existing /home/rchaves/ssh_dirs/mn5/ruben/model/6kuy/inputs.yaml
Skipping existing /home/rchaves/ssh_dirs/mn5/ruben/model/7cmu/inputs.yaml
Skipping existing /home/rchaves/ssh_dirs/mn5/ruben/model/6wjc/inputs.yaml
Skipping existing /home/rchaves/ssh_dirs/mn5/ruben/model/7e2y/inputs.yaml
Skipping existing /home/rchaves/ssh_dirs/mn5/ruben/model/7e2z/inputs.yaml
Skipping existing /home/rchaves/ssh_dirs/mn5/ruben/model/6jzh/inputs.yaml
Skipping existing /home/rchaves/ssh_dirs/mn5/ruben/model/7dtd/inputs.yaml
Skipping existing /home/rchaves/ssh_dirs/mn5/ruben/model/7bz2/inputs.yaml
Skipping existing /home/rchaves/ssh_dirs/mn5/ruben/model/6ni3/inputs.yaml
Skipping existing /home/rchaves/ssh_dirs/mn5/ruben/model/6nt3/inputs.yaml
Skipping existing /home/rchaves/ssh_dirs/mn5/ruben/model/6ps7/inputs.yaml
Skipping existing /home/rchaves/ssh_dirs/mn5/ruben/model/6oik/inputs.yaml
Skipping existing /home/rchaves/ssh_dirs/mn5/ruben/model/6gdg/inputs.yaml
Skipping existing /home/rchaves/ssh_di

In [70]:
dt.show_groups()

Unnamed: 0_level_0,message,count
group,Unnamed: 1_level_1,Unnamed: 2_level_1
0,[92m-> Running task protmap (Protein residues...,4
1,InputError: You must provide a .jpg file name!,16


In [7]:
# To launch the workflow with SLURM
dt.launch_workflow(
    #include_groups=[0],
    slurm=True,
    job_template=job_template,
    debug=True,
    )

cd /home/rchaves/ssh_dirs/mn5/ruben/model/6kuy
sbatch --output=logs/mwf_%j.out --error=logs/mwf_%j.err mwf_slurm_job.sh 
cd /home/rchaves/ssh_dirs/mn5/ruben/model/7cmu
sbatch --output=logs/mwf_%j.out --error=logs/mwf_%j.err mwf_slurm_job.sh 
cd /home/rchaves/ssh_dirs/mn5/ruben/model/6wjc
sbatch --output=logs/mwf_%j.out --error=logs/mwf_%j.err mwf_slurm_job.sh 
cd /home/rchaves/ssh_dirs/mn5/ruben/model/7e2y
sbatch --output=logs/mwf_%j.out --error=logs/mwf_%j.err mwf_slurm_job.sh 
cd /home/rchaves/ssh_dirs/mn5/ruben/model/7e2z
sbatch --output=logs/mwf_%j.out --error=logs/mwf_%j.err mwf_slurm_job.sh 
cd /home/rchaves/ssh_dirs/mn5/ruben/model/6jzh
sbatch --output=logs/mwf_%j.out --error=logs/mwf_%j.err mwf_slurm_job.sh 
cd /home/rchaves/ssh_dirs/mn5/ruben/model/7dtd
sbatch --output=logs/mwf_%j.out --error=logs/mwf_%j.err mwf_slurm_job.sh 
cd /home/rchaves/ssh_dirs/mn5/ruben/model/7bz2
sbatch --output=logs/mwf_%j.out --error=logs/mwf_%j.err mwf_slurm_job.sh 
cd /home/rchaves/ssh_dirs/mn5/ru

## Command Line

In [None]:
!mwf dataset groups {dataset_yaml_path}

Project groups based on status messages:

Group 0:
Message: Done!
Projects:
  - 6gdg
  - 6j8h
  - 6jzh
  - 6k42
  - 6kux
  - 6kuy
  - 6ni3
  - 6nt3
  - 6oik
  - 6ps5
  - 6qfa
  - 6wjc
  - 7bz2
  - 7cmu
  - 7dhr
  - 7dtd
  - 7e2y
  - 7e2z
  - 7jvr

Group 1:
Message: No output log available
Projects:
  - 6ps7



In [37]:
!mwf dataset run -h

usage: mwf dataset run [-h] [-ns] [-nc] [-ig [INCLUDE_GROUPS ...]]
                       [-jt JOB_TEMPLATE] [--debug]
                       dataset_yaml

positional arguments:
  dataset_yaml
      Path to the dataset YAML file.

options:
  -h, --help
      show this help message and exit
  -ns, --no_symlinks
      Do not use symlinks internally
  -nc, --no_colors
      Do not use colors for logging
  -ig, --include-groups [INCLUDE_GROUPS ...]
      List of group IDs to be run.
  -eg, --exclude-groups [EXCLUDE_GROUPS ...]
      List of group IDs to be excluded.
  -n, --n_jobs N_JOBS
      Number of jobs to run.
  --slurm
      Submit the workflow to SLURM.
  -jt, --job-template JOB_TEMPLATE
      Path to the SLURM job template file. Required if --slurm is used.
  --debug
      Enable debug mode.


In [36]:
# In cmd: #mwf dataset run dataset.yaml --slurm -jt job_template.sh -eg 3 4 0
!mwf dataset run {dataset_yaml_path} --slurm -jt {job_template} -eg 1 --debug -n 2

cd /home/rchaves/ssh_dirs/mn5/ruben/model/6kuy
sbatch --output=logs/mwf_%j.out --error=logs/mwf_%j.err mwf_slurm_job.sh 
cd /home/rchaves/ssh_dirs/mn5/ruben/model/7cmu
sbatch --output=logs/mwf_%j.out --error=logs/mwf_%j.err mwf_slurm_job.sh 
