In [None]:
import os
from pathlib import Path

from src.services.analysis import analyze_sample
from src.services.common import logger, timing_decorator
from src.services.constants import APPLYABLE_FILETYPES


@timing_decorator
def run_full_analysis():
    """
    Run the full analysis for every sample found in the input directory.

    All directories are resolved relative to the current working directory:
    ``db`` (databases), ``input`` (one sub-directory per sample) and
    ``result`` (one output sub-directory per sample, created on demand).

    Every sample is handed to ``analyze_sample`` with ``gff_type="prodigal"``;
    the value it returns is stored per sample and a summary is logged at the end.
    """
    work_dir = Path(os.getcwd())
    db_dir = work_dir / "db"          # databases
    input_dir = work_dir / "input"    # source data, one folder per sample
    result_dir = work_dir / "result"  # where analysis results are written

    # os.cpu_count() returns None when the count cannot be determined; fall back
    # to one thread instead of forwarding the literal string "None".
    threads = str(os.cpu_count() or 1)

    # Only directories are samples. A stray file in the input folder would make
    # the per-sample os.listdir() below raise NotADirectoryError.
    samples = sorted(
        entry for entry in os.listdir(input_dir) if (input_dir / entry).is_dir()
    )
    num_samples = len(samples)

    # Number of files eligible for analysis, per sample (matched by the text
    # after the last dot in the file name).
    file_counter = {
        sample: sum(
            1
            for file in os.listdir(input_dir / sample)
            if file.split(".")[-1] in APPLYABLE_FILETYPES
        )
        for sample in samples
    }

    # Built outside the f-string: reusing the enclosing quote character inside
    # a replacement field is a SyntaxError before Python 3.12.
    banner = "=" * 30
    logger.info(
        f"\n{banner}> Start a new full analysis <{banner}\n"
        f"Number of threads: {threads}\n"
        f"Number of samples: {num_samples}\n"
        f"{samples}\n"
        f"Total number of files for the analysis: {sum(file_counter.values())}\n"
        f"Number of files for an each sample: {file_counter}"
    )

    result_flags = {}  # per-sample value returned by analyze_sample

    for sample in samples:
        sample_input_dir = input_dir / sample
        sample_result_dir = result_dir / sample

        os.makedirs(sample_result_dir, exist_ok=True)

        logger.info(f"Start analyzing a sample: {sample}")

        result_flags[sample] = analyze_sample(
            sample,
            sample_input_dir=sample_input_dir,
            sample_result_dir=sample_result_dir,
            db_dir=db_dir,
            gff_type="prodigal",
            threads=threads,
        )

    # NOTE(review): assumes analyze_sample returns a truthy value on success
    # (the original comment called these "success flags") - confirm.
    failed_samples = [sample for sample, flag in result_flags.items() if not flag]
    if failed_samples:
        logger.warning(
            f"Full analysis ended with failed samples {failed_samples}:\n{result_flags}"
        )
    else:
        logger.info(f"Full analysis ended successfully:\n{result_flags}")


# Executing this cell runs the analysis for every sample under ./input.
run_full_analysis()

In [40]:
import os
from pathlib import Path

import pandas as pd

from src.services.common import logger, timing_decorator


@timing_decorator
def read_all_substrate_prediction(*, test_include: bool = False) -> tuple[dict[str, int], pd.DataFrame]:
    """
    Collect every ``substrate_prediction.tsv`` under ``./result`` into one table.

    The directory is walked as ``result/<sample>/<part>/substrate_prediction.tsv``
    (relative to the current working directory). Each file is read as a
    tab-separated table with a header row and tagged with its sample and part.

    Args:
        test_include: when False (default), the entry named ``"test"`` is skipped.

    Returns:
        A tuple ``(counter, substrate_prediction_df)``:
        ``counter`` maps each visited sample to the number of files read for it;
        ``substrate_prediction_df`` holds all rows, indexed by ``(sample, part)``.
        If no file was read, the frame is empty but still carries that index.
    """
    result_dirpath = Path(os.getcwd()) / "result"

    logger.info(f"Reading all files with substract prediction results in directory: {result_dirpath}")

    counter = {}
    frames = []  # concatenated once at the end; concat inside the loop is quadratic

    for sample in sorted(os.listdir(result_dirpath)):
        if sample == "test" and not test_include:
            logger.warning("Skip test sample!")
            continue

        sample_dirpath = result_dirpath / sample
        if not sample_dirpath.is_dir():
            # A stray file next to the sample folders is not a sample; listing
            # it would raise NotADirectoryError.
            continue

        logger.info(f"Reading files for: {sample}")
        counter[sample] = 0

        for part in sorted(os.listdir(sample_dirpath)):
            sub_pred_filepath = sample_dirpath / part / "substrate_prediction.tsv"

            try:
                df = pd.read_csv(sub_pred_filepath, header=0, sep="\t")
            except (FileNotFoundError, NotADirectoryError):
                # NotADirectoryError: `part` is a regular file, so the expected
                # TSV cannot exist beneath it - treat it like a missing file.
                logger.warning(f"File not found: {sub_pred_filepath}")
                continue

            frames.append(df.assign(sample=sample, part=part))
            counter[sample] += 1

    if frames:
        substrate_prediction_df = pd.concat(frames).set_index(["sample", "part"])
    else:
        # Nothing was read: set_index on a column-less frame would raise
        # KeyError, so build the empty (sample, part) index explicitly.
        substrate_prediction_df = pd.DataFrame(
            index=pd.MultiIndex.from_arrays([[], []], names=["sample", "part"])
        )

    return counter, substrate_prediction_df

# Load the per-sample file counts and the combined prediction table (the "test" sample is skipped by default).
counter, substrate_prediction_df = read_all_substrate_prediction()

In [41]:
# Number of substrate_prediction.tsv files read for each sample.
counter

{'AH10-2-MAGs': 51,
 'AH4-3-MAGs': 51,
 'AK10-1-MAGs': 66,
 'CK10-7-MAGs': 90,
 'CK4-8-MAGs': 168,
 'HH10-5-MAGs': 117,
 'HH4-6-MAGs': 135,
 'HK10-4-MAGs': 117,
 'LH10-10-MAGs': 78,
 'LH4-1-MAGs': 78,
 'LK10-9-MAGs': 78}

In [42]:
# Combined prediction table, indexed by (sample, part).
substrate_prediction_df

Unnamed: 0_level_0,Unnamed: 1_level_0,#cgcid,PULID,dbCAN-PUL substrate,bitscore,signature pairs,dbCAN-sub substrate,dbCAN-sub substrate score
sample,part,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
