In [8]:
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from pathlib import Path
from IPython.display import display, HTML
from Bio import SeqIO
import os
#from astropy import units as u
import sys
from readpaf import parse_paf
from collections import Counter, defaultdict
from tqdm import tqdm

import sv

#from itables import init_notebook_mode
#init_notebook_mode(all_interactive=True)
pd.set_option('display.max_columns', None)

def to_latex(df, data, refname):
    latex = ""
    df.index = df.index.map(lambda x: f'\\{x}')
    df.columns = df.columns.str.replace(' ', '\\\\')
    df.columns = df.columns.str.replace('%', '\%')
    df.columns = df.columns.map(lambda x: '\makecell{' + x + '}')
    #df = df.astype(str).map(lambda x: x.rstrip('0').rstrip('.') if '.' in x else x)
    latex += df.to_latex(escape=False, label=f'tab:{refname}', caption=data, float_format = lambda x: '{:0.2f}'.format(x) if pd.notna(x) else '-')
    #latex += df.to_latex(float_format = lambda x: '{:0.2f}'.format(x) if pd.notna(x) else '-')
    latex += '\n'
    return latex

In [60]:
#def perc(a, b):
#    if b == 0:
#        return np.nan
#    return 100.0 * a / b

def fasta2df(fn):
    """Load a FASTA file into a two-column DataFrame.

    Args:
        fn: Path of the FASTA file, passed to ``SeqIO.parse``.

    Returns:
        DataFrame with one row per record and the string columns ``ID`` and
        ``Sequence``. A file without records yields an empty frame that
        still has both columns.
    """
    records = SeqIO.parse(fn, "fasta")
    # Passing the column names to the constructor (instead of assigning them
    # afterwards) keeps an empty FASTA from raising a length-mismatch error.
    return pd.DataFrame(
        ((str(record.id), str(record.seq)) for record in records),
        columns=["ID", "Sequence"],
    )

#def is_overlapping(a, sv_row):
#    return a.GT_from <= sv_row['END'] and sv_row['POS'] <= a.GT_to 
    
# Minimal overlap ratio for a mapping to be counted as correct.
min_overlap = 0.1


def get_overlap(a):
    """Return the overlap ratio between a mapping and its ground truth.

    The ratio is the length of the intersection of the ground-truth interval
    ``[GT_from, GT_to]`` and the mapped interval
    ``[target_start, target_end]`` divided by the length of the span from
    the smaller start to the larger end.

    Args:
        a: Object with the attributes ``GT_ref``, ``GT_strand``, ``GT_from``,
            ``GT_to``, ``target_name``, ``strand``, ``target_start`` and
            ``target_end`` (e.g. a DataFrame row).

    Returns:
        A float in ``[0, 1]``. It is ``0.0`` when the reference or the
        strand differs, when the intervals are disjoint, or when the
        combined span is empty.
    """
    if a.GT_ref != a.target_name or a.GT_strand != a.strand:
        # Always a float (not ``False``) so the resulting column stays numeric.
        return 0.0

    union_len = max(a.GT_to, a.target_end) - min(a.GT_from, a.target_start)
    if union_len <= 0:
        # Degenerate (zero-length) intervals: avoid dividing by zero.
        return 0.0

    intersect_len = min(a.GT_to, a.target_end) - max(a.GT_from, a.target_start)
    # A negative intersection means the intervals are disjoint.
    return max(0.0, intersect_len / union_len)

def _annotate_ground_truth(df):
    """Add ground-truth and correctness columns to a PAF frame in place.

    The ground truth is expected to be encoded in ``query_name`` as
    ``read_name!GT_ref!GT_from!GT_to!GT_strand``.

    Returns:
        True when the ground truth could be parsed. False otherwise, in
        which case every alignment is marked correct and the ground-truth
        columns are filled with NaN.
    """
    gt_columns = ['read_name', 'GT_ref', 'GT_from', 'GT_to', 'GT_strand']
    try:
        df[gt_columns] = df['query_name'].str.split('!', expand=True)
        df['GT_from'] = df['GT_from'].astype(int)
        df['GT_to'] = df['GT_to'].astype(int)
        df['overlap'] = df.apply(get_overlap, axis=1)
        df['is_correct'] = df['overlap'] >= min_overlap
        return True
    except Exception:
        # Deliberate best-effort: any parsing problem is treated as "this
        # read set carries no ground truth" rather than as a fatal error.
        df['read_name'] = df['query_name']
        # np.nan (not np.NaN, which was removed in NumPy 2.0).
        for column in ('GT_ref', 'GT_from', 'GT_to', 'GT_strand', 'overlap'):
            df[column] = np.nan
        df['is_correct'] = True
        return False


def read_paf(pref, reads, experiment, tool):
    """Summarise the mapping accuracy stored in ``<pref>.paf``.

    Alignments are grouped per read. A read counts as ``q=60`` only if all
    of its alignments have mapping quality 60, otherwise as ``q<60``; it
    counts as wrong if at least one of its alignments is not correct, i.e.
    overlaps the ground truth by less than ``min_overlap``.

    Args:
        pref: Path whose suffix is replaced by ``.paf`` to locate the file.
        reads: Total number of reads that were given to the mapper.
        experiment: Unused; kept so existing callers keep working.
        tool: Unused; kept so existing callers keep working.

    Returns:
        Series of strings with the entries ``reads not mapped``
        (``unmapped/total``), ``wrong q=60`` and ``wrong q<60``
        (``wrong/mapped`` within the quality class). The number of wrong
        reads is shown as ``?`` when no ground truth is available.

    Raises:
        FileNotFoundError: If the PAF file does not exist.
    """
    paf_file = pref.with_suffix('.paf')
    if not paf_file.exists():
        raise FileNotFoundError(f"PAF file does not exist: {paf_file}")
    with open(paf_file) as handle:
        df = parse_paf(handle, dataframe=True)

    has_ground_truth = _annotate_ground_truth(df)

    # One row per read: are *all* of its alignments q=60 / correct?
    # groupby does not need sorted input and copes with an empty PAF.
    # NOTE(review): assumes parse_paf yields a frame with the usual PAF
    # columns even for a file without alignments - confirm.
    df['is_q60'] = df['mapping_quality'] == 60
    per_read = df.groupby('read_name')[['is_q60', 'is_correct']].all().astype(bool)

    is_q60 = per_read['is_q60']
    is_wrong = ~per_read['is_correct']
    mapped = len(per_read)
    mapped_q60 = int(is_q60.sum())
    mapped_low = mapped - mapped_q60
    wrong_q60 = int((is_wrong & is_q60).sum()) if has_ground_truth else '?'
    wrong_low = int((is_wrong & ~is_q60).sum()) if has_ground_truth else '?'

    summary = {
        'reads not mapped': '{}/{:>5}'.format(reads - mapped, reads),
        'wrong q=60': '{}/{:>5}'.format(wrong_q60, mapped_q60),
        'wrong q<60': '{}/{:>5}'.format(wrong_low, mapped_low),
    }
    return pd.Series(summary, dtype='object')
    
# Column labels of the resource-usage part of the comparison table.
index_time_col = 'index [s]'
map_time_col = 'map [s]'
memory_col = 'memory [GB]'


def read_times(pref):
    """Read indexing time, mapping time and memory usage of one tool run.

    Two files are consulted, each holding two whitespace-separated numbers
    (time and memory) on its first line: ``<pref>.index.time`` for the
    indexing step and ``<pref>.time`` for the whole run.

    Args:
        pref: Common prefix (string or path) of the two files.

    Returns:
        Series of strings formatted with one decimal: the indexing time,
        the mapping time (total minus indexing) and the memory value of the
        whole run divided by 2**20.
    """
    def first_line_pair(path):
        # Parse the first line of *path* into exactly two floats.
        with open(path) as fh:
            first, second = map(float, fh.readline().split())
        return first, second

    # The memory figure of the indexing step is parsed but not reported.
    index_seconds, _ = first_line_pair(f'{pref}.index.time')
    total_seconds, total_memory = first_line_pair(f'{pref}.time')

    measurements = {
        index_time_col: index_seconds,
        map_time_col: total_seconds - index_seconds,
        memory_col: total_memory / 2**20,
    }
    return pd.Series(measurements, dtype='object').map('{:.1f}'.format)

def get_comparison_table(main_dir: Path, refname, experiment: Path, tools):
    """Build the accuracy/time/memory table of all tools for one experiment.

    Args:
        main_dir: Directory holding the results; the files of a tool are
            looked up under ``main_dir/experiment/tool/tool.*``.
        refname: Name of the reference. Unused: the reference sequence is
            not needed for the table, so it is no longer loaded. The
            parameter is kept so existing callers keep working.
        experiment: Name of the experiment; the reads are taken from
            ``reads/<experiment>.fa``.
        tools: Iterable of tool names, one table row each.

    Returns:
        DataFrame indexed by tool name with the entries produced by
        ``read_paf`` and ``read_times``. Time and memory cells are ``-1``
        when the timing files of a tool cannot be read. An empty frame is
        returned when ``tools`` is empty.
    """
    empty_cell = -1
    # Only the number of reads is needed to report the unmapped ones.
    n_reads = len(fasta2df(Path('reads') / (str(experiment) + '.fa')))

    rows = []
    for tool in tqdm(tools, desc=f'Tools for {experiment}', leave=False):
        d = Path(main_dir) / experiment / tool / tool
        parts = [
            pd.Series({'tool': tool}),
            read_paf(d, n_reads, experiment, tool),  # .paf
        ]
        try:
            parts.append(read_times(d))  # .time, .index.time
        except (OSError, ValueError) as e:
            # Missing or malformed timing files must not drop the accuracy
            # columns of this tool; mark the cells as unavailable instead.
            print(f"An error occurred while reading times {d}: {e}")
            parts.append(pd.Series({
                index_time_col: empty_cell,
                map_time_col: empty_cell,
                memory_col: empty_cell,
            }))
        rows.append(pd.concat(parts))

    if not rows:
        return pd.DataFrame()

    alldf = pd.DataFrame(rows).set_index('tool')
    alldf.index.name = None
    return alldf

Build a table to compare the mappers by accuracy, runtime (indexing and mapping) and memory.

In [63]:
# Driver cell: build and display one comparison table per experiment.

# Root directory of the mapper outputs passed to get_comparison_table.
main_dir = Path('out20241022')
# Previously compared tool set, kept for reference:
#tools = ['sweepmap', 'sweepmap-slow', 'mapquik', 'blend', 'minimap', 'winnowmap', 'rmqmap']
tools = ['minimap', 'mapquik', 'blend', 'jaccmap', 'winnowmap']
# (reference name, experiment name) pairs; the experiment name is also used
# as the caption of the displayed table.
experiments = [
    ('t2tChrY', 't2tChrY-readst2tChrY-a0.99-d10-l10000'),
    ('chm13',   'chm13-readschm13-a0.99-d0.1-l10000'),
    ('t2tChrY', 't2tChrY-readst2tChrY-a0.99-d1-l24000'),
    ('chm13',   'HG002_24kb'),
]

pd.set_option('display.width', 100)
# Render the displayed tables in a monospace font.
css = """ <style> table { font-family: "Courier New", Courier, monospace; } </style> """
display(HTML(css))
# Tables and their experiment names are collected in parallel lists; in this
# cell they are only consumed by the commented-out concatenation at the end.
dfs = []
keys = []
for refname, data in experiments:
    # NOTE(review): the cells appear to be pre-formatted strings (or the -1
    # placeholder), so .round(2) likely has no visible effect - confirm.
    df = get_comparison_table(main_dir=main_dir, refname=refname, experiment=data, tools=tools).round(2)
    dfs.append(df)
    keys.append(data)
    df_styled = df.style.set_caption(data)
    display(df_styled)
    #print(vcf_df.to_latex(escape=True))
# Optional: one combined table keyed by experiment name.
#DF = pd.concat(dfs, keys=keys)
#display(DF)

                                                                                      

KeyboardInterrupt: 

In [24]:
#def union_length(df):
#    df_sorted = df.sort_values(by='target_start').reset_index(drop=True)
#    merged_intervals = []
#    current_start, current_end = df_sorted.iloc[0]['target_start'], df_sorted.iloc[0]['target_end']
#
#    for index, row in df_sorted.iterrows():
#        if index == 0:
#            continue
#        if row['target_start'] <= current_end:
#            current_end = max(current_end, row['target_end'])
#        else:
#            merged_intervals.append((current_start, current_end))
#            current_start, current_end = row['target_start'], row['target_end']
#
#    merged_intervals.append((current_start, current_end))
#    union_length = sum(end - start for start, end in merged_intervals)
#    return union_length

In [12]:

#def is_correct_labels(a, GT_l, target_l, debug=False):
#    if a.GT_ref != a.target_name:
#        return False
#    if a.GT_strand != a.strand:  # won't work for inversions
#        return False
##    union_from = min(a.GT_from, a.target_start)
##    union_to = max(a.GT_to, a.target_end)
#
#    GT_labels = Counter(target_l[a.GT_ref][a.GT_from:a.GT_to])
#    target_labels = Counter(GT_l[a.target_name][a.target_start:a.target_end])
#    #GT_labels = Counter(GT_l[a.GT_ref][a.GT_from:a.GT_to])
#    #target_labels = Counter(target_l[a.target_name][a.target_start:a.target_end])
#    intersection = GT_labels & target_labels
#    union = GT_labels | target_labels
#    overlaps = sum(intersection.values()) >= min_overlap * sum(union.values())
#
#    if debug:
#        display(a)
#        display('           GT from {} to {}'.format(a.GT_from, a.GT_to))
#        display('       target from {} to {}'.format(a.target_start, a.target_end))
#        display('    GT_labels from {} to {}'.format(min(GT_labels), max(GT_labels)))
#        display('target_labels from {} to {}'.format(min(target_labels), max(target_labels)))
#        display('{} >?= {} = {} * union {} => {}'.format(sum(intersection.values()), min_overlap * sum(union.values()), min_overlap, sum(union.values()), overlaps))
#    return overlaps

#def is_correct_labels_df(df: pd.DataFrame, GT_l, target_l, debug=False):
#    if (df.GT_ref != df.target_name).any():
#        return False
#    if (df.GT_strand != df.strand).any():  # won't work for inversions
#        return False
#
#    GT_ref, GT_from, GT_to = df.GT_ref.iloc[0], df.GT_from.iloc[0], df.GT_to.iloc[0]
#    assert (df.GT_ref == GT_ref).all() and (df.GT_from == GT_from).all() and (df.GT_to == GT_to).all()
#
#    GT_labels = Counter(target_l[GT_ref][GT_from:GT_to])
#    L = [ Counter(GT_l[a.target_name][a.target_start:a.target_end]) for a in df.itertuples() ]
#    target_labels = sum(L, Counter())
#    #GT_labels = Counter(GT_l[a.GT_ref][a.GT_from:a.GT_to])
#    #target_labels = Counter(target_l[a.target_name][a.target_start:a.target_end])
#    intersection = GT_labels & target_labels
#    union = GT_labels | target_labels
#    overlaps = sum(intersection.values()) >= min_overlap * sum(union.values())
#    return overlaps

#def read_falls_on_what_sv(a, vcf_df):
#    query_start = a['GT_from']
#    query_end = a['GT_to']
#    start_idx = vcf_df['POS'].searchsorted(query_start, side='right')
#    end_idx = vcf_df['END'].searchsorted(query_end, side='left')
#    overlap_df = vcf_df.iloc[start_idx:end_idx]
#    overlap_ch_df = overlap_df[overlap_df['CHROM'] == a['GT_ref']]
#    overlap_ch_sv = overlap_ch_df[(overlap_ch_df['POS'] <= query_end) & (overlap_ch_df['END'] >= query_start)]
#    #if len(overlap_ch_sv) > 0:
#    #    display(a, overlap_ch_sv)
#    if len(overlap_ch_sv) == 0:
#        return 'none'
#    elif len(overlap_ch_sv) == 1:
#        return overlap_ch_sv.iloc[0]['SVTYPE']
#    else:
#        return 'multi'

    # SVs
    #orig_fa_dict = sv.read_fasta_file(Path('refs') / (refname+'.fa'))
    #orig_l = sv.gen_unique_labels(orig_fa_dict)
    #try:
    #    vcf_df = sv.read_vcf(Path('refs') / (readsim_refname+'.vcf'))
    #    assert((vcf_df.POS <= vcf_df.END).all())
    #    vcf_df = vcf_df.sort_values(by='POS').reset_index(drop=True)
    #    mutated_fa, mutated_l = sv.mutate(orig_fa_dict, orig_l, vcf_df)
    #except FileNotFoundError:
    #    vcf_df = sv.read_vcf(Path('refs') / 'empty.vcf')
    #    mutated_fa, mutated_l = orig_fa_dict, orig_l


    #print(vcf_df)
    #for sv_type in list(vcf_df['SVTYPE'].unique()) + ['none', 'multi']:
    #    paf[bad(sv_type)] = '{} / {} ({:.2f}%)'.format(paf[bad(sv_type)], paf[sv_type], perc(paf[bad(sv_type)], paf[sv_type]))
    #    paf.pop(sv_type)

    # debug
    #for row in df[df['read_sv'] != 'none'].itertuples():
    #    is_correct_labels(row, orig_l, mutated_l, debug=True)

        #first_in_group = df.loc[group_first_index]
        #group_is_correct = group['is_correct'].any()
        #group_is_unique = len(group) == 1
        #suffix = 'q=60' if (group.mapping_quality == 60).all() else 'q<60'
#        unique_or_multi = unique if group_is_unique else multi
#        paf[unique_or_multi] += 1
#        if not group_is_correct:
#            paf[bad(suffix)] += 1
#            paf[unique_or_multi + ' ' +bad(suffix)] += 1
#            #if not group_is_unique:
#            #    paf[multi+' '+bad+' group'] += 1
#            #    paf[bad_mappings_perc] += 1
#            #paf[bad(first_in_group['read_sv'])] += 1
#        #paf[first_in_group['read_sv']] += 1
#    paf['unmapped'] = reads - paf[unique] - paf[multi]

#    paf['aligned_with 0/1/2+_segments'] = '{} / {} / {} ({:.0f}%/{:.0f}%/{:.0f}%)'.format(
#        paf['unmapped'], paf[unique], paf[multi],
#        perc(paf['unmapped'], len(df)), perc(paf[unique], len(df)), perc(paf[multi], len(df)))
    #for key in [unique, multi, 'unmapped', bad]:
    #    paf[key] = '{} ({:.2f}%)'.format(paf[key], perc(paf[key], reads))

    #if paf[multi] > 0:
    #    paf[bad_mappings_perc] = '{:.2f}'.format(paf[bad_mappings_perc] / paf[multi])
    #else:
    #    paf[bad_mappings_perc] = myNA

    #paf['mean start diff'] = '{:.1f}'.format(df[df.is_correct].start_diff.mean())
    #paf['mean end diff'] = '{:.1f}'.format(df[df.is_correct].end_diff.mean())
 
    #if no_GT:
    #    for suff in ['', ' '+unique, ' '+multi]:
    #        for suffsuff in ['', ' Q60']:
    #            paf[bad + suff + suffsuff] = myNA 
    #        #paf[multi+' '+bad+' group'] = myNA
    #        paf['addit '+bad+' aligns'] = myNA
    #    paf['mean start diff'] = myNA
    #    paf['mean end diff'] = myNA

    #paf['alignments depth'] = '{:.2f}'.format(sum(df['query_length']) / df.iloc[0]['target_length'])
    #union_len = union_length(df)
    #paf['covered reference'] = '{} ({:.2f}%)'.format(union_len, perc(union_len, df.iloc[0]['target_length']))

    #paf.pop('unmapped')
    #paf.pop(unique)
    #paf.pop(multi)


    # initialize from left to right
    #myNA = 'N/A'
    ##bad = 'bad'
    #unique = 'uniq'
    #multi = 'mult'
    #bad_mappings_perc = bad+' mappings p.multiread'
    #paf['unmapped'] = 0
    #paf[unique] = 0
    #paf[multi] = 0
    #for suffix in ['', ' Q60']:
    #    paf[bad + suffix] = 0
    #    #if suffix == '':
    #        #paf[multi+' '+bad+' group'] = 0
    #        #paf[bad_mappings_perc] = 0
    #    for unique_or_multi in [unique, multi]:
    #        paf[bad + ' ' + unique_or_multi + suffix] = 0

    #def bad(sv_type):
    #    return 'bad ' + sv_type
