In [22]:
import pandas as pd
from pathlib import Path
from IPython.display import display, HTML
from Bio import SeqIO

def fasta2df(fn):
    """Load the records of a FASTA file into a two-column DataFrame.

    Args:
        fn: File name, path or handle; passed unchanged to ``SeqIO.parse``
            with the ``"fasta"`` format.

    Returns:
        A DataFrame with the columns ``ID`` and ``Sequence`` (both built
        with ``str``), one row per parsed record, in parse order. A file
        without records yields an empty frame that still has both columns.
    """
    records = [(str(rec.id), str(rec.seq)) for rec in SeqIO.parse(fn, "fasta")]
    # Name the columns in the constructor instead of assigning them
    # afterwards: a frame built from zero records has zero columns, and
    # assigning two names to it raises a length-mismatch ValueError.
    return pd.DataFrame(records, columns=["ID", "Sequence"])

def _read_time_file(path):
    """Return ``(seconds, memory)`` parsed from the first line of *path*.

    The first line must hold exactly two whitespace-separated numbers. The
    second one is divided by 2**20; the table labels the result as GB, so
    the file presumably stores kilobytes -- TODO confirm with the producer.
    """
    with open(path) as fh:
        seconds, memory = map(float, fh.readline().split())
    return seconds, memory / 2**20


def get_table(refname, experiment, tools):
    """Build a per-tool summary of mapping accuracy, run time and memory.

    For every tool the files ``out/<experiment>/<tool>/<tool>.eval``,
    ``.index.time`` and ``.time`` are read and turned into one table row.
    A tool whose ``.eval`` file is missing is reported on stdout and skipped.

    Args:
        refname: Name of the reference. Kept for interface compatibility
            only: no table column is derived from the reference, so it is
            not loaded.
        experiment: Experiment name; selects ``reads/reads-<experiment>.fa``
            and the ``out/<experiment>/`` directory.
        tools: Iterable of tool names.

    Returns:
        A DataFrame indexed by tool name (index name cleared) with the
        columns ``unaligned``, ``aligned``, ``misaligned``, ``%``,
        ``idx (sec)``, ``map (sec)``, ``idx (GB)`` and ``total (GB)``.
        ``%`` is the misaligned share of the aligned reads, rounded to two
        decimals, or NaN when no read was aligned.

    Raises:
        FileNotFoundError: If a tool has an ``.eval`` file but one of its
            two timing files is missing.
    """
    columns = ['tool', 'unaligned', 'aligned', 'misaligned', '%',
               'idx (sec)', 'map (sec)', 'idx (GB)', 'total (GB)']
    eval_columns = ['Q', 'quality', 'aligned', 'wrong', 'wrong_frac', 'aligned_cum']

    # Only the number of reads is needed, to derive the unaligned count.
    n_reads = len(fasta2df(Path('reads') / f'reads-{experiment}.fa'))

    rows = []
    for tool in tools:
        d = Path("out") / experiment / tool / tool
        eval_file = d.with_suffix('.eval')
        if not eval_file.exists():
            print(f"File {eval_file} does not exist.")
            continue

        evaldf = pd.read_csv(eval_file, sep='\t', names=eval_columns)
        aligned = evaldf['aligned'].sum()
        wrong = evaldf['wrong'].sum()
        unaligned = n_reads - aligned
        # Guard the division: with nothing aligned the share is undefined.
        if aligned:
            wrong_perc = round(100.0 * float(wrong) / float(aligned), 2)
        else:
            wrong_perc = float('nan')

        index_time, index_mem = _read_time_file(d.with_suffix('.index.time'))
        total_time, total_mem = _read_time_file(d.with_suffix('.time'))
        # Echo the total memory figure of each processed tool.
        print(total_mem)
        # The '.time' file covers the whole run, so subtract the indexing part.
        map_time = total_time - index_time

        rows.append([tool, unaligned, aligned, wrong, wrong_perc,
                     index_time, map_time, index_mem, total_mem])

    # Build the frame once; growing it row by row with .loc is slower and
    # leaves the column dtypes to enlargement-time inference.
    alldf = pd.DataFrame(rows, columns=columns).set_index('tool')
    alldf.index.name = None
    return alldf

def _trim_number(text):
    """Drop trailing zeros, and a then-dangling '.', from a decimal string."""
    if '.' not in text:
        return text
    return text.rstrip('0').rstrip('.')


# Render the HTML tables in a monospace font.
css = """ <style> table { font-family: "Courier New", Courier, monospace; } </style> """

tools = ['sweepmap', 'mapquik', 'blend', 'minimap', 'winnowmap']

# Each entry is (reference name, experiment name). Alternative sets:
#experiments = [('chm13-1B', 'chm13-1B-a0.99-d1-l10000'), ('t2tChrY', 't2tChrY-a0.99-d1-l10000')]
#experiments = [('chm13', 'chm13-a0.99-d1-l10000'), ('t2tChrY', 't2tChrY-a0.99-d1-l10000')]
experiments = [('t2tChrY', 't2tChrY-a0.99-d1-l10000')]

for refname, experiment in experiments:
    df = get_table(refname, experiment, tools).round(2)
    # The columns' name is shown in the table's top-left corner cell.
    df.columns.name = experiment
    display(HTML(css))
    display(df)
    # For the LaTeX export, print every cell as text without trailing zeros.
    df = df.astype(str).map(_trim_number)
    print(df.to_latex(escape=True))

0.42505645751953125
1.4752349853515625
0.15755462646484375
0.2940177917480469
File out/t2tChrY-a0.99-d1-l10000/winnowmap/winnowmap.eval does not exist.


t2tChrY-a0.99-d1-l10000,unaligned,aligned,misaligned,%,idx (sec),map (sec),idx (GB),total (GB)
sweepmap,0,3131,194,6.2,0.76,1.11,0.43,0.43
mapquik,402,2729,760,27.85,0.36,0.46,1.48,1.48
blend,0,3131,318,10.16,1.07,18.67,0.16,0.16
minimap,0,3131,125,3.99,1.55,57.21,0.29,0.29


\begin{tabular}{lllllllll}
\toprule
t2tChrY-a0.99-d1-l10000 & unaligned & aligned & misaligned & \% & idx (sec) & map (sec) & idx (GB) & total (GB) \\
\midrule
sweepmap & 0 & 3131 & 194 & 6.2 & 0.76 & 1.11 & 0.43 & 0.43 \\
mapquik & 402 & 2729 & 760 & 27.85 & 0.36 & 0.46 & 1.48 & 1.48 \\
blend & 0 & 3131 & 318 & 10.16 & 1.07 & 18.67 & 0.16 & 0.16 \\
minimap & 0 & 3131 & 125 & 3.99 & 1.55 & 57.21 & 0.29 & 0.29 \\
\bottomrule
\end{tabular}

