# Dataset Example

In [1]:
# Enable the autoreload extension so edits to imported modules are picked up
# live, without restarting the kernel (mode 2).
%load_ext autoreload
%autoreload 2

In [2]:
from mddb_workflow.core.dataset import Dataset

dataset_dir = '/home/rchaves/ssh_dirs/irbcluster/scratch/model-cns/'
# YAML use for the configuration of the dataset and the automatic inputs.yaml generation.
# project_directories outside the dataset directory are not allowed
dataset_yaml_path = dataset_dir + "dataset.yaml"
print(dataset_yaml_path)
!cat {dataset_yaml_path}

/home/rchaves/ssh_dirs/irbcluster/scratch/model-cns/dataset.yaml
global:
  project_directories: 
     - '[0]*/**/'  # (matches dirs starting with a digit and all their subfolders)

In [3]:
# Build the Dataset object from the YAML configuration file.
dt = Dataset(dataset_yaml_path)
# Show the first five project directories to verify they are correct.
dt.project_directories[:5]

['/home/rchaves/ssh_dirs/irbcluster/scratch/model-cns/0_to_reimage/6jzh/',
 '/home/rchaves/ssh_dirs/irbcluster/scratch/model-cns/0_to_reimage/7dtd/',
 '/home/rchaves/ssh_dirs/irbcluster/scratch/model-cns/0_to_reimage/6oik/',
 '/home/rchaves/ssh_dirs/irbcluster/scratch/model-cns/0_to_reimage/6j8h/',
 '/home/rchaves/ssh_dirs/irbcluster/scratch/model-cns/0_to_reimage/7cmu/']

In [5]:
inputs_template = dataset_dir + "inputs_template.yaml"
print(inputs_template)
!cat {inputs_template}

/home/rchaves/ssh_dirs/irbcluster/scratch/model-cns/inputs_template.yaml
name: "{{ title }} ({{ DIR }})"
description: "{{ title }} (1 ms)"
authors: Agustín García
groups: IRB Barcelona, Orozco lab
citation: null
thanks: null
contact: agustin.garcia@irbbarcelona.org
type: trajectory
program: GROMACS
version: 2025.2
license: This trajectory dataset is released under a Creative Commons Attribution 4.0 International Public License
linkcense: tps://creativecommons.org/licenses/by/4.0/
method: Classical MD
accession: null
links: 
- name: Structural data source
  url: https://memprotmd.bioch.ox.ac.uk/_ref/PDB/{{ DIR }}
pdb_ids:
  - {{ DIR }}
forced_references: null
framestep: 0.01
timestep: 2
ensemble: NPT
ff: 53A6 GROMOS
wat: TIP3P
boxtype: Cubic
mds:
  - mdir: replica_1
mdref: 0
interactions: null
pbc_selection: auto
collections: mcns
chainnames: null
membranes: null
customs: null
multimeric: null
trjType: large
bucket: 8d3eha
temp: 310
ligands: null



In [6]:
import requests

def obtener_titulo(pdb_id):
    """Fetch the title of a PDB entry from the RCSB Data API.

    Args:
        pdb_id: PDB identifier, interpolated as-is into the entry URL.

    Returns:
        The entry title with surrounding whitespace stripped, or an empty
        string when the response carries no ``struct.title`` value.

    Raises:
        ValueError: If the API answers with a status code other than 200.
        requests.exceptions.RequestException: On network failures, including
            a timeout after ``request_timeout`` seconds.
    """
    # Without a timeout, requests waits forever on a stalled connection,
    # which would block the whole inputs generation on a single entry.
    request_timeout = 10  # seconds
    url = f"https://data.rcsb.org/rest/v1/core/entry/{pdb_id}"
    r = requests.get(url, timeout=request_timeout)
    if r.status_code != 200:
        raise ValueError(f"No se pudo obtener título para PDB {pdb_id}")
    data = r.json()
    # `or` fallbacks also cover keys that are present but null in the JSON,
    # which plain dict.get defaults would let through as None.
    struct = data.get('struct') or {}
    title = struct.get('title') or ''
    return title.strip()
    
# Generate the inputs from the template, passing obtener_titulo as a callable
# (presumably used to fill the template's {{ title }} field; confirm in
# Dataset.generate_inputs_yaml).
dt.generate_inputs_yaml(inputs_template, obtener_titulo)

In [12]:
job_template=dataset_dir + "job_template.sh"
print(job_template)
!cat {job_template}

/home/rchaves/ssh_dirs/irbcluster/scratch/model-cns/job_template.sh
#!/bin/bash
#SBATCH --job-name={{ pdbIds[0] }}
#SBATCH --output={{ pdbIds[0] }}_%j.out
#SBATCH --error={{ pdbIds[0] }}_%j.err
#SBATCH --mem=64G
#SBATCH --ntasks=1
#SBATCH --cpus-per-task=8
#SBATCH --mail-type=END,FAIL
#SBATCH --time=24:00:00
#SBATCH --mail-user=ruben.chaves@irbbarcelona.org

module load anaconda3
conda activate mwf_env
mwf run -top topology.tpr -md replica_1 trajectory.xtc -i membs -ow -nc {% if group == 3 %}-m intrajrity{% endif %}



In [4]:
# Show the per-project status table: state, message, group, and the log and
# error file columns, indexed by the project's relative path.
dt.display_status_with_links()

Unnamed: 0_level_0,state,message,group,log_file,err_file
rel_path,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0_to_reimage/6gdg,not_run,No output log available,3,,
0_to_reimage/6j8h,not_run,No output log available,3,,
0_to_reimage/6jzh,not_run,No output log available,3,,
0_to_reimage/6k42,not_run,No output log available,3,,
0_to_reimage/6kux,not_run,No output log available,3,,
0_to_reimage/6kuy,not_run,No output log available,3,,
0_to_reimage/6ni3,not_run,No output log available,3,,
0_to_reimage/6nt3,not_run,No output log available,3,,
0_to_reimage/6oik,not_run,No output log available,3,,
0_to_reimage/6ps5,not_run,No output log available,3,,


In [5]:
# Summarize the groups: one row per group with its message and the number of
# projects that share it.
dt.show_groups()

Unnamed: 0_level_0,message,count
group,Unnamed: 1_level_1,Unnamed: 2_level_1
0,Done!,13
1,-> Counting number of frames,6
2,"InputError: Missing input topology file ""topol...",3
3,No output log available,17
4,Running BioBB LiPyphilic ZPositions,10
5,TestFailure: Failed to find stable bonds,1
6,TestFailure: RMSD check has failed: there may ...,1


In [6]:
# Status report from the command line: for each group, its message and the
# list of projects that belong to it.
!mwf dataset status {dataset_yaml_path}

Group 0:
Message: Done!
Projects:
  - 2_to_membs/6gt3
  - 2_to_membs/6kr8
  - 2_to_membs/6n4q
  - 2_to_membs/6nt4
  - 2_to_membs/6ps2
  - 2_to_membs/6vxo
  - 2_to_membs/6w6o
  - 2_to_membs/6wgt
  - 2_to_membs/6zdv
  - 2_to_membs/7dfp
  - 2_to_membs/7eor
  - 2_to_membs/7eot
  - 2_to_membs/7k48

Group 1:
Message: -> Counting number of frames
Projects:
  - 1_to_run/6i53
  - 1_to_run/6me2
  - 1_to_run/6me3
  - 1_to_run/6me5
  - 1_to_run/6ps7
  - 1_to_run/6ps8

Group 2:
Message: InputError: Missing input topology file "topology.tpr"
Projects:
  - 1_to_run/2lm2
  - 1_to_run/3sn6
  - 1_to_run/7e2z

Group 3:
Message: No output log available
Projects:
  - 0_to_reimage/6gdg
  - 0_to_reimage/6j8h
  - 0_to_reimage/6jzh
  - 0_to_reimage/6k42
  - 0_to_reimage/6kux
  - 0_to_reimage/6kuy
  - 0_to_reimage/6ni3
  - 0_to_reimage/6nt3
  - 0_to_reimage/6oik
  - 0_to_reimage/6ps5
  - 0_to_reimage/6qfa
  - 0_to_reimage/6wjc
  - 0_to_reimage/7bz2
  - 0_to_reimage/7cmu
  - 0_to_reimage/7dhr
  - 0_to_reimage/7d

In [None]:
# To launch the workflow with SLURM
dt.launch_workflow(
    include_groups=[3],
    slurm=True,
    job_template=job_template)
# In cmd:
!mwf dataset run {dataset_yaml_path} --slurm -jt {sbatch_template_path} -ig 3