In [None]:
import os
import pandas as pd
import seaborn as sns

import sys  # noqa
sys.path.append('../..')  # noqa

from utils.data_paths import make_data_list
from dataclasses import dataclass, field, asdict
from datetime import datetime, timedelta
import re 

In [None]:
@dataclass
class OutputLogs:
    """Holds the paths to the output logs for each pipeline.

    Pipelines with a single log store one path string (empty string until
    set); pipelines with several logs store a list of path strings.
    """
    # Path to the bio3 log; empty string until assigned.
    bio3: str = ""
    # Path to the bio4 log; empty string until assigned.
    bio4: str = ""
    # Paths to the JAMS logs; each instance gets its own fresh list.
    jams: list[str] = field(default_factory=list)
    # Path to the WGSA2 log; empty string until assigned.
    wgsa2: str = ""
    # Paths to the Woltka logs; each instance gets its own fresh list.
    woltka: list[str] = field(default_factory=list)


@dataclass
class TimeData:
    """Holds the average or total time for each pipeline.

    One ``timedelta`` per pipeline; every field defaults to a zero-length
    duration.
    """
    # Duration recorded for the bio3 pipeline.
    bio3: timedelta = timedelta()
    # Duration recorded for the bio4 pipeline.
    bio4: timedelta = timedelta()
    # Duration recorded for the JAMS pipeline.
    jams: timedelta = timedelta()
    # Duration recorded for the WGSA2 pipeline.
    wgsa2: timedelta = timedelta()
    # Duration recorded for the Woltka pipeline.
    woltka: timedelta = timedelta()

@dataclass
class ThreadData:
    """Holds the number of CPUs used in each run.

    One integer per pipeline; every field defaults to 0.
    """
    # CPU count for the bio3 run.
    bio3: int = 0
    # CPU count for the bio4 run.
    bio4: int = 0
    # CPU count for the JAMS runs.
    jams: int = 0
    # CPU count for the WGSA2 run.
    wgsa2: int = 0
    # CPU count for the Woltka runs.
    woltka: int = 0


In [None]:
def search_for_files(path: str):
    """Walk ``path`` and collect the log files written by each pipeline.

    Files are matched by name:

    * ``anadama.log`` is stored as ``bio4`` when the containing directory
      path includes "bio4", otherwise as ``bio3`` when it includes "bio3".
    * Names ending in ``JAMS.log`` are appended to ``jams``, except logs
      under a directory path containing "beta" (the beta log) or whose
      file name contains "Neg" (the negative control).
    * ``logfile.txt`` is stored as ``wgsa2``.
    * ``classify_time.log`` and ``bowtie_time.log`` are appended to
      ``woltka``.

    Returns:
        The populated ``OutputLogs`` instance.
    """
    found = OutputLogs()

    for current_dir, _subdirs, filenames in os.walk(path):
        for name in filenames:
            full_path = os.path.join(current_dir, name)

            if name == "anadama.log":
                if "bio4" in current_dir:
                    found.bio4 = full_path
                elif "bio3" in current_dir:
                    found.bio3 = full_path
            elif name.endswith("JAMS.log"):
                # Keep only real sample logs: skip the beta log and the
                # negative control log.
                if "beta" not in current_dir and "Neg" not in name:
                    found.jams.append(full_path)
            elif name == "logfile.txt":
                found.wgsa2 = full_path
            elif name in ("classify_time.log", "bowtie_time.log"):
                found.woltka.append(full_path)

    return found


In [None]:
anadama_format = "%Y-%m-%d %H:%M:%S,%f"
wanted_time_fmt = "%H:%M:%S"


def parse_bio_time(log_path: str):
    """Return ``(elapsed, threads)`` read from an anadama.log file.

    The timestamp of a line is its first tab-separated field, parsed with
    ``anadama_format``. The first line of the file gives the start time and
    the last line gives the end time.

    Args:
        log_path: Path to the anadama.log file.

    Returns:
        A tuple ``(elapsed, threads)`` where ``elapsed`` is end minus start
        as a ``timedelta`` and ``threads`` is the integer at the end of the
        first line containing "threads" (0 when no such line exists).
    """
    with open(log_path, 'r') as handle:
        log_lines = handle.readlines()

    def _timestamp(raw_line):
        # The timestamp is everything before the first tab.
        return datetime.strptime(raw_line.strip().split('\t')[0], anadama_format)

    started = _timestamp(log_lines[0])
    finished = _timestamp(log_lines[-1])

    # The thread count is the last space-separated token of the first line
    # that mentions "threads"; fall back to 0 when nothing matches.
    threads = next(
        (int(entry.split(' ')[-1]) for entry in log_lines if "threads" in entry),
        0,
    )

    return finished - started, threads


In [None]:
jams_format_time = "%Y-%m-%d %H:%M:%S"

# Matches the JAMS log line that reports how many CPUs were used and captures
# the count. Raw string so that ``\d`` is a regex digit class rather than an
# invalid string escape.
_JAMS_CPU_PATTERN = re.compile(
    r"Saving project workspace image using fastSave package with (\d+) CPUs"
)


def average_time(times: list):
    """Return the mean of a list of ``timedelta`` values.

    Args:
        times: Durations to average.

    Returns:
        The arithmetic mean as a ``timedelta``.

    Raises:
        ZeroDivisionError: If ``times`` is empty.
    """
    return sum(times, timedelta()) / len(times)


def _jams_timestamp(line: str) -> datetime:
    """Parse the timestamp held in the 2nd and 3rd whitespace-separated tokens of a JAMS log line."""
    stamp = " ".join(line.strip().split()[1:3]).strip("[]")
    return datetime.strptime(stamp, jams_format_time)


def parse_jams_time(logs: list):
    """Parse all of the JAMS logs.

    For each log, the elapsed time is the timestamp on the last line minus
    the timestamp on the first line. The thread count is read from the first
    line matching "Saving project workspace image using fastSave package
    with <N> CPUs".

    Args:
        logs: Paths to the JAMS log files.

    Returns:
        A tuple ``(average_elapsed, threads)``. ``threads`` is the count
        found in the most recent log that reported one, or 0 if none did.

    Raises:
        ZeroDivisionError: If ``logs`` is empty (there is nothing to average).
    """
    times = []

    # They all ran with the same number of threads, so each log may simply
    # overwrite the value found in the previous one.
    threads = 0
    for log_path in logs:
        with open(log_path, 'r') as f:
            lines = f.readlines()

        # First line is the start time, last line is the end time.
        start_time = _jams_timestamp(lines[0])
        end_time = _jams_timestamp(lines[-1])

        for line in lines:
            match = _JAMS_CPU_PATTERN.search(line)
            if match:
                threads = int(match.group(1))
                break

        times.append(end_time - start_time)

    return average_time(times), threads

In [None]:
def parse_wgsa2(file_path: str):
    """Return ``(elapsed, threads)`` read from a WGSA output log.

    The timestamp of a line is built from its first two whitespace-separated
    tokens (surrounding square brackets removed) and parsed with
    ``anadama_format``. The first line gives the start and the last line
    gives the end.

    Args:
        file_path: Path to the WGSA log file.

    Returns:
        A tuple ``(elapsed, threads)`` where ``threads`` is the integer at
        the end of the first line containing 'Provided cores:' (0 when no
        such line exists).
    """
    with open(file_path, 'r') as handle:
        log_lines = handle.readlines()

    def _timestamp(raw_line):
        stamp = " ".join(raw_line.strip().split()[0:2]).strip("[]")
        return datetime.strptime(stamp, anadama_format)

    begin = _timestamp(log_lines[0])
    finish = _timestamp(log_lines[-1])

    # Only the first matching line is consulted for the core count.
    core_lines = [entry for entry in log_lines if 'Provided cores:' in entry]
    threads = int(core_lines[0].split()[-1]) if core_lines else 0

    return finish - begin, threads

In [None]:
def parse_woltka_time(logs: list):
    """Sum the mean ``swarm`` job time across the Woltka timing logs.

    Each log is a fixed-width table whose second line is a row of dashes
    marking the column widths. For every log, the rows whose ``JobName`` is
    "swarm" are selected and their ``Elapsed`` values averaged; the per-log
    means are then added together (bowtie time plus classify time).

    NOTE(review): the three colon-separated ``Elapsed`` fields are read as
    DD:HH:MM, as the original code did. If these logs are Slurm ``sacct``
    output, the field is ``[DD-]HH:MM:SS`` and this parse would inflate the
    result — confirm against a real log.

    Args:
        logs: Paths to the Woltka timing logs.

    Returns:
        The summed duration as a ``datetime.timedelta``; a zero-length
        duration when ``logs`` is empty.
    """
    per_log_means = []
    for log in logs:
        # Second line gives column widths: one run of dashes per column,
        # plus one character for the separating space.
        with open(log, 'r') as f:
            dash_line = f.readlines()[1].strip().split()
        widths = [len(dashes) + 1 for dashes in dash_line]

        table = pd.read_fwf(log, widths=widths, skiprows=[1], header=0)
        # Select the column directly instead of assigning into a filtered
        # copy, which would raise SettingWithCopyWarning.
        swarm_elapsed = table.loc[table["JobName"] == "swarm", "Elapsed"]

        # Split along the colon; fields are interpreted as DD:HH:MM.
        elapsed = pd.to_timedelta(
            swarm_elapsed.str.split(":").apply(
                lambda parts: timedelta(
                    days=int(parts[0]), hours=int(parts[1]), minutes=int(parts[2])
                )
            )
        )

        per_log_means.append(elapsed.mean())

    # Sum of the bowtie and classify times. Starting from a pandas Timedelta
    # keeps ``to_pytimedelta`` available even when there were no logs.
    total_time = sum(per_log_means, pd.Timedelta(0))
    return total_time.to_pytimedelta()


In [None]:
def parse_all_times(log_paths: OutputLogs, woltka_threads: int = 16) -> tuple[TimeData, ThreadData]:
    """Parse every pipeline's logs into per-CPU times and thread counts.

    Each pipeline's elapsed time is divided by the number of threads it ran
    with, so the stored values are times per CPU.

    Args:
        log_paths: Locations of the logs for every pipeline.
        woltka_threads: Thread count used for Woltka, which is supplied here
            rather than read from the Woltka logs. Defaults to 16, the value
            used for these runs.

    Returns:
        A tuple ``(times, threads_data)``: the per-CPU ``TimeData`` and the
        ``ThreadData`` holding the thread count used for each pipeline.

    Raises:
        ZeroDivisionError: If a pipeline's thread count is 0 (for example
            because its log contains no thread/CPU line).
    """
    times = TimeData()
    threads_data = ThreadData()

    def _record(pipeline: str, elapsed: timedelta, threads: int) -> None:
        # Store the per-CPU time and the thread count for one pipeline.
        if threads == 0:
            raise ZeroDivisionError(
                f"No thread count available for {pipeline}; "
                "cannot compute time per CPU."
            )
        setattr(times, pipeline, elapsed / threads)
        setattr(threads_data, pipeline, threads)

    _record("bio3", *parse_bio_time(log_paths.bio3))
    _record("bio4", *parse_bio_time(log_paths.bio4))
    _record("jams", *parse_jams_time(log_paths.jams))
    _record("wgsa2", *parse_wgsa2(log_paths.wgsa2))
    _record("woltka", parse_woltka_time(log_paths.woltka), woltka_threads)

    return times, threads_data


def analyze_times(log_paths: OutputLogs):
    """Build the per-pipeline time table and its relative-difference table.

    Prints the parsed times and thread data, displays the raw one-row time
    table, and computes for every pipeline the percentage difference from
    the fastest pipeline: ``(time - min) / min * 100``.

    Args:
        log_paths: Locations of the logs for every pipeline.

    Returns:
        A tuple ``(time_df, relative_times)``: the one-row table of times
        (index 0) and the one-row table of relative differences, indexed
        "Relative Time Per CPU (%)".
    """
    parsed_times, thread_info = parse_all_times(log_paths)
    times_by_pipeline = asdict(parsed_times)

    print(times_by_pipeline)
    print(thread_info)

    time_df = pd.DataFrame(times_by_pipeline, index=[0])
    display(time_df)

    # The row-wise minimum is the fastest pipeline; every column is expressed
    # as a percentage above it.
    fastest = time_df.min(axis=1)
    relative_times = time_df.apply(
        lambda column: (column - fastest) / fastest * 100)
    relative_times.index = ["Relative Time Per CPU (%)"]

    return time_df, relative_times


In [None]:
paths_dict = {
    "nist": "/Volumes/TBHD_share/valencia/pipelines/NIST/"
}


def _format_hms(duration) -> str:
    """Render a duration as HH:MM:SS, folding whole days into the hours field."""
    if pd.isna(duration):
        return str(duration)
    # Fractional seconds are truncated.
    total_seconds = int(duration.total_seconds())
    hours, remainder = divmod(total_seconds, 3600)
    minutes, seconds = divmod(remainder, 60)
    return f"{hours:02d}:{minutes:02d}:{seconds:02d}"


def main():
    """Build the timing table for the NIST runs and write it to ``time_table.tex``.

    Locates the logs under ``paths_dict["nist"]``, computes the per-pipeline
    times and relative differences, formats the times as HH:MM:SS, stacks
    both rows into one table, displays it, and exports it as LaTeX.
    """
    log_paths = search_for_files(paths_dict["nist"])

    # analyze_times already displays the raw time table.
    time_df, relative_time_df = analyze_times(log_paths)

    # Format each time as HH:MM:SS. Series.map is used per column because
    # DataFrame.applymap is deprecated; durations of a day or more keep their
    # full length as hours instead of losing the day count.
    formatted_df = time_df.apply(lambda column: column.map(_format_hms))
    formatted_df.index = ["Time (HH:MM:SS)"]

    # Concat the two dataframes.
    summary_df = pd.concat([formatted_df, relative_time_df], axis=0)

    display(summary_df)

    summary_df.to_latex("time_table.tex", index=True, escape=True)


main()


In [None]:
def test_nist_data():
    """Check the times parsed from the NIST logs against recorded values.

    The comparison is made on the ``TimeData`` returned by
    ``parse_all_times``. Comparing the DataFrame returned by
    ``analyze_times`` with a ``TimeData`` yields an element-wise result whose
    truth value is ambiguous, so the assertion could never pass.

    NOTE(review): the expected values below were recorded by the original
    author; confirm they correspond to the per-CPU times currently produced.
    """
    wanted = TimeData(
        bio3=timedelta(seconds=28415, microseconds=815000),
        bio4=timedelta(seconds=4478, microseconds=860000),
        jams=timedelta(seconds=5213, microseconds=800000),
        wgsa2=timedelta(seconds=4963, microseconds=465000),
        woltka=timedelta(seconds=25692),
    )
    log_path = search_for_files(paths_dict["nist"])
    times, _threads = parse_all_times(log_path)

    assert times == wanted, f"expected {wanted}, got {times}"


test_nist_data()
