In [None]:
import os
import pandas as pd
import seaborn as sns

import sys # noqa
sys.path.append('../..') # noqa

from utils.data_paths import make_data_list 
from dataclasses import dataclass, field
from datetime import datetime

In [None]:
@dataclass
class OutputLogs:
    """Container for the log-file paths found for each pipeline.

    Every field starts out empty: single-path fields default to ``""`` and
    multi-path fields default to a fresh empty list per instance.

    Attributes:
        bio3: Path of an ``anadama.log`` found under a directory whose path
            contains ``"bio3"``.
        bio4: Path of an ``anadama.log`` found under a directory whose path
            contains ``"bio4"``.
        jams: Paths of files ending in ``JAMS.log``.
        wgsa2: Path of a ``logfile.txt``.
        woltka: Paths of ``classify_time.log`` / ``bowtie_time.log`` files.
    """

    # Single-log pipelines: one path each.
    bio3: str = ""
    bio4: str = ""
    # JAMS can produce several logs, so collect them all.
    jams: list[str] = field(default_factory=list)
    wgsa2: str = ""
    # Woltka timing is split across more than one log file.
    woltka: list[str] = field(default_factory=list)

In [None]:
# Root directory of the pipeline outputs, keyed by dataset name.
paths_dict = {"nist": "/Volumes/TBHD_share/valencia/pipelines/NIST/"}

def search_for_files(path: str):
    """Walk ``path`` and collect the pipeline log files found beneath it.

    Recognised files:

    * ``anadama.log`` -- stored as ``bio4`` when the containing directory
      path includes ``"bio4"``, otherwise as ``bio3`` when it includes
      ``"bio3"``; ignored if neither substring is present.
    * ``*JAMS.log`` -- appended to ``jams``, except logs located under a
      directory path containing ``"beta"`` or whose file name contains
      ``"Neg"`` (the beta and negative-control logs).
    * ``logfile.txt`` -- stored as ``wgsa2``.
    * ``classify_time.log`` / ``bowtie_time.log`` -- appended to ``woltka``.

    For the single-path fields, a later match overwrites an earlier one.

    Args:
        path: Directory to search recursively.

    Returns:
        An ``OutputLogs`` instance populated with the paths that were found.
    """
    found = OutputLogs()
    woltka_names = ("classify_time.log", "bowtie_time.log")

    for folder, _subdirs, filenames in os.walk(path):
        for name in filenames:
            full_path = os.path.join(folder, name)

            if name == "anadama.log":
                # "bio4" is checked first, so it wins if both substrings occur.
                if "bio4" in folder:
                    found.bio4 = full_path
                elif "bio3" in folder:
                    found.bio3 = full_path
            elif name.endswith("JAMS.log"):
                # Keep everything except the beta log and the negative control log.
                if "beta" not in folder and "Neg" not in name:
                    found.jams.append(full_path)
            elif name == "logfile.txt":
                found.wgsa2 = full_path
            elif name in woltka_names:
                found.woltka.append(full_path)

    return found

# NOTE(review): `data_paths` is not defined in the cells visible here --
# presumably it is built with `make_data_list`; confirm before running.
# `log_paths` is only bound when at least one entry's path contains 'nist'.
for p in data_paths:
    if 'nist' not in p.path:
        continue
    # Every matching entry searches the same NIST root and prints the result.
    log_paths = search_for_files(paths_dict['nist'])
    print(log_paths)

In [None]:
# strptime pattern for the timestamp in the first tab-separated field of an
# anadama.log line, e.g. "2023-01-01 10:00:00,000".
anadama_format = "%Y-%m-%d %H:%M:%S,%f"
# Not referenced by the code visible here.
# NOTE(review): presumably meant for formatting elapsed times -- confirm.
wanted_time_fmt = "%H:%M:%S"


def _anadama_timestamp(line: str):
    """Return the leading timestamp of an anadama.log line, or None if absent."""
    try:
        return datetime.strptime(line.strip().split('\t')[0], anadama_format)
    except ValueError:
        return None


def parse_bio_time(log_path: str) -> "tuple[str, int | None]":
    """Parse an anadama.log file for its run time and thread count.

    The run time is the difference between the first and the last line that
    start with a timestamp; blank lines and lines without a leading timestamp
    (for example a trailing traceback) are skipped when looking for them.

    The thread count is taken from the first line that contains "threads" and
    whose last whitespace-separated token is an integer.

    Args:
        log_path: Path to the anadama.log file.

    Returns:
        A ``(elapsed, threads)`` tuple. ``elapsed`` is ``str(timedelta)``
        (e.g. ``"1:30:15.500000"``); ``threads`` is an int, or ``None`` when
        no thread count could be found in the log.

    Raises:
        ValueError: If the file contains no line starting with a timestamp
            (this includes an empty file).
    """
    with open(log_path, 'r') as f:
        lines = f.readlines()

    # First timestamped line is the start time, last one is the end time.
    start = next(
        (ts for ts in map(_anadama_timestamp, lines) if ts is not None), None
    )
    if start is None:
        raise ValueError(f"No timestamped lines found in {log_path!r}")
    end = next(
        ts for ts in map(_anadama_timestamp, reversed(lines)) if ts is not None
    )

    # Stay at None if the log never reports a thread count, rather than
    # failing on an unbound local.
    threads = None
    for line in lines:
        if "threads" not in line:
            continue
        try:
            threads = int(line.split()[-1])
        except ValueError:
            # Mentions "threads" but does not end in a number; keep looking.
            continue
        break

    elapsed = end - start

    return str(elapsed), threads

# Report (elapsed, threads) for the bio3 and bio4 anadama logs found above.
# NOTE(review): `log_paths` must already be bound by an earlier cell, and its
# bio3/bio4 fields are "" when no matching anadama.log was found.
print(parse_bio_time(log_paths.bio3))
print(parse_bio_time(log_paths.bio4))