In [8]:
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from pathlib import Path
from IPython.display import display, HTML
from Bio import SeqIO
import os
#from astropy import units as u
import sys
from readpaf import parse_paf
from collections import Counter, defaultdict
from tqdm import tqdm

import sv

#from itables import init_notebook_mode
#init_notebook_mode(all_interactive=True)
pd.set_option('display.max_columns', None)

def to_latex(df, data, refname):
    latex = ""
    df.index = df.index.map(lambda x: f'\\{x}')
    df.columns = df.columns.str.replace(' ', '\\\\')
    df.columns = df.columns.str.replace('%', '\%')
    df.columns = df.columns.map(lambda x: '\makecell{' + x + '}')
    #df = df.astype(str).map(lambda x: x.rstrip('0').rstrip('.') if '.' in x else x)
    latex += df.to_latex(escape=False, label=f'tab:{refname}', caption=data, float_format = lambda x: '{:0.2f}'.format(x) if pd.notna(x) else '-')
    #latex += df.to_latex(float_format = lambda x: '{:0.2f}'.format(x) if pd.notna(x) else '-')
    latex += '\n'
    return latex

In [116]:
#def perc(a, b):
#    if b == 0:
#        return np.nan
#    return 100.0 * a / b

def fasta2df(fn):
    seqs = SeqIO.parse(fn, "fasta")
    df = pd.DataFrame((str(s.id), str(s.seq)) for s in seqs)
    df.columns = ["ID", "Sequence"]
    return df

#def is_overlapping(a, sv_row):
#    return a.GT_from <= sv_row['END'] and sv_row['POS'] <= a.GT_to 
    
min_overlap = 0.1

def get_overlap(a):
     if a.GT_ref != a.target_name:
         return False
     if a.GT_strand != a.strand:
         return False
     union_from = min(a.GT_from, a.target_start)
     union_to = max(a.GT_to, a.target_end)

     intersect_from = max(a.GT_from, a.target_start)
     intersect_to = min(a.GT_to, a.target_end)
     overlap = max(0.0, (intersect_to - intersect_from) / (union_to - union_from))
     return overlap

def read_paf(pref, reads, experiment, tool):
    paf_file = pref.with_suffix('.paf')
    no_GT = False
    if not paf_file.exists():
        raise Exception(f"File does not exist or is empty: {paf_file}")
    with open(paf_file) as handle:
        df = parse_paf(handle, dataframe=True)
        df['experiment'] = experiment
        df['tool'] = tool
        try:
            df[ ['read_name', 'GT_ref', 'GT_from', 'GT_to', 'GT_strand'] ] = df['query_name'].str.split('!', expand=True)
            df['GT_from'] = df['GT_from'].astype(int)
            df['GT_to'] = df['GT_to'].astype(int)
            df['overlap'] = df.apply(get_overlap, axis=1)
            df['is_correct'] = df['overlap'] >= min_overlap
            #df['is_correct_labels'] = df.apply(lambda x: is_correct_labels(x, orig_l, mutated_l), axis=1)
            #df['is_correct'] = df.apply(lambda x: is_correct_labels(x, orig_l, mutated_l), axis=1)
            #df['start_diff'] = df.target_start - df.GT_from  # TODO: different coordinate systems!
            #df['end_diff'] = df.target_end - df.GT_to  # TODO: different coordinate systems!
            #df['read_sv'] = 'none' # df.apply(lambda x: read_falls_on_what_sv(x, vcf_df), axis=1)
        except Exception as e:
            #display(e)
            df['read_name'] = df['query_name']
            df['GT_ref'] = np.NaN
            df['GT_from'] = np.NaN
            df['GT_to'] = np.NaN
            df['GT_strand'] = np.NaN
            df['overlap'] = np.NaN
            df['is_correct'] = True
            #df['is_correct_labels'] = True
            #df['start_diff'] = 0
            #df['end_diff'] = 0
            #df['read_sv'] = 'none'
            no_GT = True
    df = df.sort_values(['read_name', 'residue_matches'], ascending=[True, False], ignore_index=True)
    #display(df)

    paf = defaultdict(int)
    paf['Mapped Q60'] = 0
    paf['Q<60 or missed'] = 0
    paf['Wrong Q60'] = 0
    mapped_reads = 0

    def process_group(group_first_index, group_last_index):
        nonlocal mapped_reads
        group = df.loc[group_first_index:group_last_index]
        mapped_reads += 1
        if (group.mapping_quality == 60).all():
            paf['Mapped Q60'] += 1
            if not group.is_correct.all():
                paf['Wrong Q60'] += 1
        else:
            paf['Q<60 or missed'] += 1

    group_first_i, group_read_name = 0, df.loc[0, 'read_name']
    for i, a in df.iterrows():
        if a.read_name != group_read_name:
            process_group(group_first_i, i-1)
            group_first_i, group_read_name = i, a.read_name
    process_group(group_first_i, len(df)-1)
    missed = reads - mapped_reads
    paf['Q<60 or missed'] += missed

    if no_GT:
        paf['Wrong Q60'] = 'n/a'

    return pd.Series(paf, dtype='object')
    
index_time_col = 'Index [sec]'
map_time_col = 'Map [sec]'
memory_col = 'Memory [GB]'

def read_times(pref):
    times = {}
    with open(str(pref) + '.index.time') as f_index_time:
        index_time, index_mem = map(float, f_index_time.readline().split())
        times[index_time_col] = index_time #* u.second
        #times['index_mem'] = index_mem / 2**20
        with open(str(pref) + '.time') as f_time:
            total_time, total_mem = map(float, f_time.readline().split())
            #times['time total'] = total_time #* u.second
            times[map_time_col] = total_time - times[index_time_col]
            times[memory_col] = (total_mem / 2**20) #* u.GB
    return pd.Series(times, dtype='object').map('{:.1f}'.format)

def get_comparison_table(main_dir: Path, refname, experiment: Path, tools):
    empty_cell = -1
    alldf = pd.DataFrame()
    alldf.name = experiment
    ref = fasta2df(Path('refs') / (refname + '.fa'))
    reads = fasta2df(Path('reads') / (str(experiment) + '.fa'))

    rows = []

    for tool in tqdm(tools, desc=f'Tools for {experiment}', leave=False):
        d = Path(main_dir) / experiment / tool / tool
        row = pd.Series({
            'tool': tool,
        })
        try:
            paf = read_paf(d, len(reads), experiment, tool)
            row = pd.concat([row, paf])
            try:
                row = pd.concat([row, read_times(d)])   # .time, .index.time
            except Exception as e:
                print(f"An error occurred while reading times {d}: {e}")
                row[index_time_col] = empty_cell
                row[map_time_col] = empty_cell
                row[memory_col] = empty_cell
            rows.append(row)
        except Exception as e:
            print(f"An error occurred while reading PAF {d}: {e}")

    alldf = pd.DataFrame(rows)
    alldf = alldf.set_index('tool')
    alldf.index.name = None
    return alldf

Build a table to compare the mappers by accuracy, runtime (indexing and mapping) and memory.

In [121]:
main_dir = Path('out20241022')
#tools = ['sweepmap', 'sweepmap-slow', 'mapquik', 'blend', 'minimap', 'winnowmap', 'rmqmap'] 
tools = ['minimap', 'mapquik', 'blend', 'winnowmap', 'jaccmap']
experiments = [
    ('t2tChrY', 't2tChrY-readst2tChrY-a0.99-d10-l10000'),
    ('chm13',   'chm13-readschm13-a0.99-d0.1-l10000'),
    ('t2tChrY', 't2tChrY-readst2tChrY-a0.99-d1-l24000'),
    ('chm13',   'HG002_24kb'),
]

pd.set_option('display.width', 100)
css = """ <style> table { font-family: "Courier New", Courier, monospace; } </style> """
display(HTML(css))
dfs = []
keys = []
for refname, data in experiments:
    df = get_comparison_table(main_dir=main_dir, refname=refname, experiment=data, tools=tools).round(2)
    dfs.append(df)
    keys.append(data)
    df_styled = df.style.set_caption(data)
    display(df_styled)
DF = pd.concat(dfs, keys=keys)
display(DF)
DF.to_latex('evals-table.tex', escape=True, multirow=False)

                                                                                              

Unnamed: 0,Mapped Q60,Q<60 or missed,Wrong Q60,Index [sec],Map [sec],Memory [GB]
minimap,20572,41893,0,1.6,1153.4,0.7
mapquik,25907,36558,0,0.5,4.7,1.5
blend,29461,33004,132,1.1,371.8,0.6
winnowmap,34803,27662,38,2.5,54016.1,16.4
jaccmap,43528,18937,12,0.7,117.1,0.4


                                                                                           

Unnamed: 0,Mapped Q60,Q<60 or missed,Wrong Q60,Index [sec],Map [sec],Memory [GB]
minimap,28233,2951,0,79.2,60.9,10.2
mapquik,8,31176,8,30.3,3.1,4.2
blend,29288,1896,19,51.4,13.0,5.4
winnowmap,29501,1683,4,120.1,1372.5,3.3
jaccmap,30278,906,1,41.5,15.5,12.3


Tools for t2tChrY-readst2tChrY-a0.99-d1-l24000:  60%|██████    | 3/5 [00:02<00:01,  1.04it/s]

An error occurred while reading PAF out20241022/t2tChrY-readst2tChrY-a0.99-d1-l24000/winnowmap/winnowmap: 0


                                                                                             

Unnamed: 0,Mapped Q60,Q<60 or missed,Wrong Q60,Index [sec],Map [sec],Memory [GB]
minimap,935,1801,0,1.7,159.4,0.3
mapquik,1472,1264,0,0.5,0.5,1.5
blend,1521,1215,6,1.1,52.9,0.2
jaccmap,2198,538,1,0.6,5.8,0.4


Tools for HG002_24kb:  60%|██████    | 3/5 [00:15<00:10,  5.22s/it]

An error occurred while reading PAF out20241022/HG002_24kb/winnowmap/winnowmap: File does not exist or is empty: out20241022/HG002_24kb/winnowmap/winnowmap.paf


                                                                   

Unnamed: 0,Mapped Q60,Q<60 or missed,Wrong Q60,Index [sec],Map [sec],Memory [GB]
minimap,20365,1802,,78.7,141.4,10.2
mapquik,510,21657,,30.4,6.6,4.2
blend,20752,1415,,51.2,32.2,5.4
jaccmap,21522,645,,43.2,33.2,12.3


Unnamed: 0,Unnamed: 1,Mapped Q60,Q<60 or missed,Wrong Q60,Index [sec],Map [sec],Memory [GB]
t2tChrY-readst2tChrY-a0.99-d10-l10000,minimap,20572,41893,0.0,1.6,1153.4,0.7
t2tChrY-readst2tChrY-a0.99-d10-l10000,mapquik,25907,36558,0.0,0.5,4.7,1.5
t2tChrY-readst2tChrY-a0.99-d10-l10000,blend,29461,33004,132.0,1.1,371.8,0.6
t2tChrY-readst2tChrY-a0.99-d10-l10000,winnowmap,34803,27662,38.0,2.5,54016.1,16.4
t2tChrY-readst2tChrY-a0.99-d10-l10000,jaccmap,43528,18937,12.0,0.7,117.1,0.4
chm13-readschm13-a0.99-d0.1-l10000,minimap,28233,2951,0.0,79.2,60.9,10.2
chm13-readschm13-a0.99-d0.1-l10000,mapquik,8,31176,8.0,30.3,3.1,4.2
chm13-readschm13-a0.99-d0.1-l10000,blend,29288,1896,19.0,51.4,13.0,5.4
chm13-readschm13-a0.99-d0.1-l10000,winnowmap,29501,1683,4.0,120.1,1372.5,3.3
chm13-readschm13-a0.99-d0.1-l10000,jaccmap,30278,906,1.0,41.5,15.5,12.3


In [128]:
def highlight_max(s):
    is_max = s == s.max()
    return ['font-weight: bold' if v else '' for v in is_max]

df_high = df_styled.apply(highlight_max).set_caption(data)
display(df_high.to_latex(escape=False))

display(df_high)

TypeError: Styler.to_latex() got an unexpected keyword argument 'escape'

In [109]:
DF = pd.concat(dfs, keys=keys)
display(DF)
DF.to_latex('evals-table.tex', escape=True)

Unnamed: 0,Unnamed: 1,Mapped Q60,Q<60 or missed,Wrong Q60,Index [sec],Map [sec],Memory [GB]
t2tChrY-readst2tChrY-a0.99-d10-l10000,jaccmap,43528,0,12.0,0.7,117.1,0.4
t2tChrY-readst2tChrY-a0.99-d10-l10000,minimap,20572,0,0.0,1.6,1153.4,0.7
t2tChrY-readst2tChrY-a0.99-d10-l10000,mapquik,25907,7727,0.0,0.5,4.7,1.5
t2tChrY-readst2tChrY-a0.99-d10-l10000,blend,29461,0,132.0,1.1,371.8,0.6
t2tChrY-readst2tChrY-a0.99-d10-l10000,winnowmap,34803,0,38.0,2.5,54016.1,16.4
chm13-readschm13-a0.99-d0.1-l10000,jaccmap,30278,0,1.0,41.5,15.5,12.3
chm13-readschm13-a0.99-d0.1-l10000,minimap,28233,0,0.0,79.2,60.9,10.2
chm13-readschm13-a0.99-d0.1-l10000,mapquik,8,17378,8.0,30.3,3.1,4.2
chm13-readschm13-a0.99-d0.1-l10000,blend,29288,2,19.0,51.4,13.0,5.4
chm13-readschm13-a0.99-d0.1-l10000,winnowmap,29501,1,4.0,120.1,1372.5,3.3


In [124]:
DF.to_latex('evals-table.tex', escape=True, multirow=False)