## Analysis of the GFA graphs produced by the rust-mdbg assembler

In [3]:
# Global imports
import copy
import itertools
import os
import sys
from pathlib import Path
from typing import Optional, Union

import yaml

In [23]:
def load_from_yaml(path: Path) -> dict:
    """
    Parse a YAML file and return its content.

    Parameters
    ----------
    path
        Location of the YAML file to read.

    Returns
    -------
    dict
        Whatever ``yaml.safe_load`` produces for the file content.
    """
    with open(path, 'r') as yaml_file:
        return yaml.safe_load(yaml_file)

def save_to_yaml(config: dict, filename: str, dirname: Optional[Path] = None):
    """
    Dump a configuration dictionary to a YAML file in block style.

    Parameters
    ----------
    config
        Configuration to serialise.
    filename
        Name of the file created inside the target directory.
    dirname
        Target directory; it is created if missing. When omitted,
        ``../config/suite/`` resolved against the current working
        directory is used.
    """
    target_dir = Path("../config/suite/").resolve() if dirname is None else dirname
    os.makedirs(target_dir, exist_ok=True)

    output_path = target_dir / filename
    with open(output_path, "w") as yaml_file:
        yaml.dump(config, yaml_file, default_flow_style=False)
    

In [24]:
def modify_config_parameter(config: dict, param: dict[str,Union[int, float]]) -> tuple[str, dict]:
    """
    Build a copy of a configuration with overridden assembler parameters.

    Parameters
    ----------
    config
        Base configuration. The overrides are written under
        ``config['assembler']['params']`` of the copy; the base
        configuration itself is left untouched.
    param
        Mapping of parameter name to its new value.

    Returns
    -------
    tuple[str, dict]
        The file name ``config_<name>_<value>[_<name>_<value>...].yaml``
        (parameters in alphabetical order) and the modified copy of the
        configuration.
    """
    # A shallow copy would share the nested 'assembler'/'params' dicts with
    # the caller's config, so the assignments below would leak into it.
    new_config = copy.deepcopy(config)
    name_parts = []
    for param_name, param_value in sorted(param.items()):
        new_config['assembler']['params'][param_name] = param_value
        if isinstance(param_value, int):
            # Zero-pad integers to two digits so that file names sort well.
            label = str(param_value).zfill(2)
        elif isinstance(param_value, float):
            # Keep only the fractional digits, right-padded to three places
            # (0.01 -> '010'). The integer part is dropped.
            # NOTE(review): floats whose str() has no '.' (e.g. 1e-05)
            # raise IndexError here.
            label = str(param_value).split('.')[1].ljust(3, '0')
        else:
            label = param_value
        name_parts.append(f'{param_name}_{label}')

    joined_name = "_".join(name_parts)
    return f'config_{joined_name}.yaml', new_config

In [6]:
def add_dir_to_sys_path(src_dir: os.PathLike):
    """
    Prepend a directory to ``sys.path`` so modules inside it can be imported.

    Nothing happens when the path is not an existing directory or when it
    is already present in ``sys.path``.

    Parameters
    ----------
    src_dir
        Path to the directory, as a ``str`` or any ``os.PathLike`` object.
    """
    # sys.path holds plain strings: normalise first so that the membership
    # test also works for Path objects and no non-str entry gets inserted.
    src_dir = os.fspath(src_dir)
    if os.path.isdir(src_dir) and src_dir not in sys.path:
        sys.path.insert(0, src_dir)

### Set up the paths to required software

In [20]:
# Tools for GFA analysis

# Pipeline
    
# Parent of the current working directory; put on sys.path so that modules
# located there can be imported (presumably pipeline_dbg — verify).
PIPELINE_PATH = Path("../").resolve()
add_dir_to_sys_path(str(PIPELINE_PATH))

# Load global config (path is relative to the current working directory)
CONFIG_PATH = Path("../config/config.yaml").resolve()
global_config = load_from_yaml(CONFIG_PATH)

# config value grid
K_RANGE = range(25, 38, 3)  # k values: 25, 28, 31, 34, 37
L_RANGE = range(7, 14, 2)  # l values: 7, 9, 11, 13
D_RANGE = (0.001, 0.005, 0.01, 0.05, 0.1)  # density values


# Output directories


### Grid search configuration

We perform a grid search over the following assembler parameters:
- k <- range(k_start, k_end, k_step)
- l <- range(l_start, l_end, l_step)
- d (density) <- an explicit list of values

The analysis has to be performed on real reads in order to determine the optimal assembler parameters
for further work.

In [8]:
from pipeline_dbg import main

In [25]:
# Write one config file per (k, l, density) combination of the grid.
# itertools.product yields the combinations in the same order as the
# equivalent nested loops (the rightmost iterable varies fastest).
for k, l, d in itertools.product(K_RANGE, L_RANGE, D_RANGE):
    config_values = {'k': k, 'l': l, 'density': d}
    config_name, config_new = modify_config_parameter(global_config, config_values)
    save_to_yaml(config_new, config_name)