In [63]:
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from pathlib import Path
from IPython.display import display, HTML
from Bio import SeqIO
import os
#from astropy import units as u
import sys
from readpaf import parse_paf
from collections import Counter
from tqdm import tqdm

import sv

#from itables import init_notebook_mode
#init_notebook_mode(all_interactive=True)

def to_latex(df, data, refname):
    latex = ""
    df.index = df.index.map(lambda x: f'\\{x}')
    df.columns = df.columns.str.replace(' ', '\\\\')
    df.columns = df.columns.str.replace('%', '\%')
    df.columns = df.columns.map(lambda x: '\makecell{' + x + '}')
    #df = df.astype(str).map(lambda x: x.rstrip('0').rstrip('.') if '.' in x else x)
    latex += df.to_latex(escape=False, label=f'tab:{refname}', caption=data, float_format = lambda x: '{:0.2f}'.format(x) if pd.notna(x) else '-')
    #latex += df.to_latex(float_format = lambda x: '{:0.2f}'.format(x) if pd.notna(x) else '-')
    latex += '\n'
    return latex

In [64]:
def union_length(df):
    df_sorted = df.sort_values(by='target_start').reset_index(drop=True)
    merged_intervals = []
    current_start, current_end = df_sorted.iloc[0]['target_start'], df_sorted.iloc[0]['target_end']

    for index, row in df_sorted.iterrows():
        if index == 0:
            continue
        if row['target_start'] <= current_end:
            current_end = max(current_end, row['target_end'])
        else:
            merged_intervals.append((current_start, current_end))
            current_start, current_end = row['target_start'], row['target_end']

    merged_intervals.append((current_start, current_end))
    union_length = sum(end - start for start, end in merged_intervals)
    return union_length

In [77]:
def perc(a, b):
    if b == 0:
        return np.nan
    return 100.0 * a / b

def fasta2df(fn):
    seqs = SeqIO.parse(fn, "fasta")
    df = pd.DataFrame((str(s.id), str(s.seq)) for s in seqs)
    df.columns = ["ID", "Sequence"]
    return df

def is_overlapping(a, sv_row):
    return a.GT_from <= sv_row['END'] and sv_row['POS'] <= a.GT_to 
    
min_overlap = 0.8

# def is_correct(a):
#     if a.GT_ref != a.target_name:
#         return False
#     if a.GT_strand != a.strand:
#         return False
#     union_from = min(a.GT_from, a.target_start)
#     union_to = max(a.GT_to, a.target_end)

#     intersect_from = max(a.GT_from, a.target_start)
#     intersect_to = min(a.GT_to, a.target_end)
#     overlaps = intersect_to - intersect_from >= min_overlap * (union_to - union_from)
#     return overlaps

def is_correct_labels(a, GT_l, target_l):
    if a.GT_ref != a.target_name:
        return False
    if a.GT_strand != a.strand:  # won't work for inversions
        return False
    union_from = min(a.GT_from, a.target_start)
    union_to = max(a.GT_to, a.target_end)

    GT_labels = Counter(GT_l[a.GT_ref][a.GT_from:a.GT_to])
    target_labels = Counter(target_l[a.target_name][a.target_start:a.target_end])
    intersection = GT_labels & target_labels
    overlaps = sum(intersection.values()) >= min_overlap * (union_to - union_from)
    return overlaps

def read_falls_on_what_sv(a, vcf_df):
    #print('read_falls_on_what_sv', a)
    assert((vcf_df.POS <= vcf_df.END).all())
    sv_cnt = 0
    sv_type = 'none'
    for index, sv_row in vcf_df.iterrows():
        if sv_row['CHROM'] != a.GT_ref:
            continue
        if is_overlapping(a, sv_row):
            sv_cnt += 1
            sv_type = sv_row['SVTYPE']
    if sv_cnt > 1:
        return 'multi'
    return sv_type 
    
def read_paf(pref, reads, orig_l: dict, mutated_l: dict, vcf_df: pd.DataFrame):
    display(vcf_df)
    paf_file = pref.with_suffix('.paf')
    no_GT = False
    if not paf_file.exists():
        raise Exception(f"File does not exist or is empty: {paf_file}")
    with open(paf_file) as handle:
        df = parse_paf(handle, dataframe=True)
        try:
            df[ ['read_name', 'GT_ref', 'GT_from', 'GT_to', 'GT_strand'] ] = df['query_name'].str.split('!', expand=True)
            df['GT_from'] = df['GT_from'].astype(int)
            df['GT_to'] = df['GT_to'].astype(int)
            #df['is_correct_labels'] = df.apply(lambda x: is_correct_labels(x, orig_l, mutated_l), axis=1)
            #df['is_correct'] = df.apply(is_correct, axis=1)
            df['is_correct'] = df.apply(lambda x: is_correct_labels(x, orig_l, mutated_l), axis=1)
            df['start_diff'] = df.target_start - df.GT_from
            df['end_diff'] = df.target_end - df.GT_to
            df['read_sv'] = df.apply(lambda x: read_falls_on_what_sv(x, vcf_df), axis=1)
        except ValueError as e:
            display(e)
            df['read_name'] = df['query_name']
            #df['is_correct_labels'] = True
            df['is_correct'] = True
            df['start_diff'] = 0
            df['end_diff'] = 0
            df['read_sv'] = 'none'
            no_GT = True
        display(df)
    df = df.sort_values(['read_name', 'residue_matches'], ascending=[True, False], ignore_index=True)

    # initialize from left to right
    myNA = 'N/A'
    bad = 'bad'
    unique = 'uniq'
    multi = 'mult'
    bad_mappings_perc = bad+' mappings p.multiread'
    paf = {}
    paf[unique] = 0
    paf[multi] = 0
    paf['unmapped'] = 0
    for suffix in ['', ' Q60']:
        paf[bad + suffix] = 0
        if suffix == '':
            paf[multi+' '+bad+' group'] = 0
            paf[bad_mappings_perc] = 0
        paf[bad + ' ' + unique + suffix] = 0
        paf[bad + ' ' + multi + suffix] = 0
    prev_read_name = None

    at_least_one_correct = False
    for index, a in df.iterrows():
        assert(df.loc[index, 'read_name'] == a.read_name)
        next_read_name = df.loc[index + 1, 'read_name'] if index + 1 < len(df) else None
        is_unique = a.read_name != prev_read_name and a.read_name != next_read_name
        if is_unique:
            paf[unique] += 1
        if a.read_name != prev_read_name: 
            at_least_one_correct = False
        if a.is_correct:
            at_least_one_correct = True
        elif not is_unique:
            paf[bad_mappings_perc] += 1
        if a.read_name != next_read_name:
            if not at_least_one_correct and not is_unique:
                paf[multi+' '+bad+' group'] += 1
        if a.read_name == prev_read_name:
            continue
        if not is_unique:
            paf[multi] += 1
        if not a.is_correct:
            #print('wrong:', a)
            suffix = ' Q60' if a.mapping_quality == 60 else ''
            paf[bad + suffix] += 1
            if next_read_name != a.read_name:
                paf[bad + ' ' + unique + suffix] += 1
            else:
                paf[bad + ' ' + multi + suffix] += 1
        prev_read_name = a.read_name

    if paf[multi] > 0:
        paf[bad_mappings_perc] = '{:.2f}'.format(paf[bad_mappings_perc] / paf[multi])
    else:
        paf[bad_mappings_perc] = myNA
    paf['unmapped'] = reads - paf[unique] - paf[multi]

    paf['mean start diff'] = '{:.1f}'.format(df[df.is_correct].start_diff.mean())
    paf['mean end diff'] = '{:.1f}'.format(df[df.is_correct].end_diff.mean())

    for key in [unique, multi, 'unmapped', bad]:
        paf[key] = '{} ({:.2f}%)'.format(paf[key], perc(paf[key], reads))
    
    if no_GT:
        for suff in ['', ' '+unique, ' '+multi]:
            for suffsuff in ['', ' Q60']:
                paf[bad + suff + suffsuff] = myNA 
            paf[multi+' '+bad+' group'] = myNA
            paf['addit '+bad+' aligns'] = myNA
        paf['mean start diff'] = myNA
        paf['mean end diff'] = myNA

    paf['alignments depth'] = '{:.2f}'.format(sum(df['query_length']) / df.iloc[0]['target_length'])
    union_len = union_length(df)
    paf['covered reference'] = '{} ({:.2f}%)'.format(union_len, perc(union_len, df.iloc[0]['target_length']))

    # for each sv type in vcf_df or 'multi', count how many alignment are not correct and calculate their percentage of all alignments on this SV
    for sv_type in list(vcf_df['SVTYPE'].unique()) + ['multi']:
        if sv_type == 'none':
            continue
        sv_df = df[df['read_sv'] == sv_type]
        bad_sv = sv_df[~sv_df['is_correct']]
        paf[f'bad {sv_type}'] = '{} / {} ({:.2f}%)'.format(len(bad_sv), len(sv_df), perc(len(bad_sv), len(sv_df)))

    return pd.Series(paf, dtype='object')
    
#def read_eval(pref, reads):
#    eval_file = pref.with_suffix('.eval')
#    if not eval_file.exists() or os.stat(eval_file).st_size == 0:
#        raise Exception(f"File does not exist or is empty: {eval_file}")
#    evaldf = pd.read_csv(eval_file, sep='\t', names=['Q', 'quality', 'aligned', 'misaligned', 'misaligned/aligned', 'aligned cum'])
#    evals = {}
#    evals['eval aligned'] = evaldf.iloc[-1]['aligned cum']
#    evals['eval misaligned'] = evaldf.iloc[-1]['misaligned']
#    #display(f'aligned: {evals["eval aligned"]}, misaligned: {evals["eval misaligned"]}')
#    q60row = evaldf[evaldf['quality'] == 60]
#    assert(len(q60row) == 1)
#    #evals['eval unaligned'] = reads - evals['eval aligned']
#    evals['eval correct'] = evals['eval aligned'] - evals['eval misaligned']
#    evals['eval Q60 aligned'] = q60row['aligned'].values[0]
#    evals['eval Q60 misaligned'] = q60row['misaligned'].values[0]
#    #evals['eval noncorrect%'] = (100.0 * (evals['eval misaligned'] + evals['eval unaligned']) / evals['eval aligned']) #.round(2)
#    return pd.Series(evals, dtype='object')

def read_times(pref):
    times = {}
    with open(str(pref) + '.index.time') as f_index_time:
        index_time, index_mem = map(float, f_index_time.readline().split())
        times['index time'] = index_time #* u.second
        #times['index_mem'] = index_mem / 2**20
        with open(str(pref) + '.time') as f_time:
            total_time, total_mem = map(float, f_time.readline().split())
            #times['time total'] = total_time #* u.second
            times['map time'] = total_time - times['index time']
            times['memory'] = (total_mem / 2**20) #* u.GB
    return pd.Series(times, dtype='object').map('{:.1f}'.format)

def get_comparison_table(refname, readsim_refname, experiment, tools):
    empty_cell = -1
    alldf = pd.DataFrame()
    alldf.name = experiment
    ref = fasta2df(Path('refs') / (refname+'.fa'))
    reads = fasta2df(Path('reads') / Path(experiment+'.fa'))

    # SVs
    orig_fa_dict = sv.read_fasta_file(Path('refs') / (refname+'.fa'))
    orig_l = sv.gen_unique_labels(orig_fa_dict)
    try:
        vcf_df = sv.read_vcf(Path('refs') / (readsim_refname+'.vcf'))
        mutated_fa, mutated_l = sv.mutate(orig_fa_dict, orig_l, vcf_df)
    except FileNotFoundError:
        vcf_df = None
        mutated_fa, mutated_l = orig_fa_dict, orig_l

    rows = []  # list of Series to be converted to DataFrame

    for tool in tqdm(tools, desc=f'Tools for {experiment}'):
        d = Path("out") / experiment / tool / tool
        row = pd.Series({
            'tool': tool,
            'reads': len(reads),
        })
        row = pd.concat([row, read_paf(d, len(reads), orig_l, mutated_l, vcf_df)])     # .paf
        try:
            row = pd.concat([row, read_times(d)])   # .time, .index.time
        except Exception as e:
            print(f"An error occurred while reading times {d}: {e}")
            row['index time'] = empty_cell
            row['map time'] = empty_cell
            row['memory'] = empty_cell
        rows.append(row)
    alldf = pd.DataFrame(rows)
    alldf = alldf.set_index('tool')
    alldf.index.name = None
    return alldf

Build a table to compare the mappers by accuracy, runtime (indexing and mapping) and memory.

In [78]:
#tools = ['sweepmap', 'sweepmap-slow', 'mapquik', 'blend', 'minimap', 'winnowmap'] 
tools = ['sweepmap', 'minimap'] 
experiments = [
#    ('t2tChrY', 't2tChrY', 't2tChrY-readst2tChrY-a0.99-d10-l10000'),
#    ('chm13',   'chm13',   'chm13-readschm13-a0.99-d0.1-l10000'),
#    ('t2tChrY', 't2tChrY', 't2tChrY-readst2tChrY-a0.99-d1-l24000'),
#    ('chm13',   'chm13',   'HG002_24kb'),
#
	('t2tChrY', 't2tChrY-SVs', 't2tChrY-readst2tChrY-SVs-a0.99-d1-l10000'),
]

pd.set_option('display.width', 100)
css = """ <style> table { font-family: "Courier New", Courier, monospace; } </style> """
display(HTML(css))
dfs = []
keys = []
for refname, readsim_refname, data in experiments:
    df = get_comparison_table(refname=refname, readsim_refname=readsim_refname, experiment=data, tools=tools).round(2)
    dfs.append(df)
    keys.append(data)
    df_styled = df.style.set_caption(data)
    display(df_styled)
    #print(to_latex(df))
DF = pd.concat(dfs, keys=keys)
#display(DF)

Tools for t2tChrY-readst2tChrY-SVs-a0.99-d1-l10000:   0%|          | 0/2 [00:00<?, ?it/s]

Unnamed: 0,CHROM,POS,ID,REF,ALT,QUAL,FILTER,PRECISE,SVTYPE,SVMETHOD,CHR2,END,SVLEN,dup_num
0,NC_060948.1,43055173,[DUP0SURVIVOR],N,DUP,,PASS,True,DUP,SURVIVOR_sim,NC_060948.1,43055840,667,6
1,NC_060948.1,14766290,[DUP1SURVIVOR],N,DUP,,PASS,True,DUP,SURVIVOR_sim,NC_060948.1,14767204,914,6
2,NC_060948.1,32178577,[DUP2SURVIVOR],N,DUP,,PASS,True,DUP,SURVIVOR_sim,NC_060948.1,32178899,322,3
3,NC_060948.1,30743832,[DUP3SURVIVOR],N,DUP,,PASS,True,DUP,SURVIVOR_sim,NC_060948.1,30744029,197,2
4,NC_060948.1,6438389,[DUP4SURVIVOR],N,DUP,,PASS,True,DUP,SURVIVOR_sim,NC_060948.1,6439015,626,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
85,NC_060948.1,9184141,[INV85SURVIVOR],N,INV,,PASS,True,INV,SURVIVOR_sim,NC_060948.1,9184745,604,
86,NC_060948.1,39310397,[INV86SURVIVOR],N,INV,,PASS,True,INV,SURVIVOR_sim,NC_060948.1,39311089,692,
87,NC_060948.1,48277312,[INV87SURVIVOR],N,INV,,PASS,True,INV,SURVIVOR_sim,NC_060948.1,48277929,617,
88,NC_060948.1,24584239,[INV88SURVIVOR],N,INV,,PASS,True,INV,SURVIVOR_sim,NC_060948.1,24584912,673,


Unnamed: 0,query_name,query_length,query_start,query_end,strand,target_name,target_length,target_start,target_end,residue_matches,...,t,read_name,GT_ref,GT_from,GT_to,GT_strand,is_correct,start_diff,end_diff,read_sv
0,S1_1!NC_060948.1!20412840!20424436!+,11634,28,11475,+,NC_060948.1,62460029,20396785,20408383,11634,...,0.000128,S1_1,NC_060948.1,20412840,20424436,+,False,-16055,-16053,none
1,S1_2!NC_060948.1!3214411!3224511!-,10128,60,10120,-,NC_060948.1,62460029,3214381,3224481,10128,...,0.000126,S1_2,NC_060948.1,3214411,3224511,-,True,-30,-30,none
2,S1_3!NC_060948.1!21532531!21540896!+,8385,29,8381,+,NC_060948.1,62460029,21508452,21516817,8385,...,0.000120,S1_3,NC_060948.1,21532531,21540896,+,False,-24079,-24079,none
3,S1_4!NC_060948.1!10719394!10729404!-,10039,37,10036,-,NC_060948.1,62460029,10714736,10724780,10039,...,0.001605,S1_4,NC_060948.1,10719394,10729404,-,False,-4658,-4624,none
4,S1_5!NC_060948.1!47682562!47690571!+,8038,487,5434,+,NC_060948.1,62460029,47619983,47629455,8038,...,0.000170,S1_5,NC_060948.1,47682562,47690571,+,False,-62579,-61116,none
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6264,S1_6265!NC_060948.1!49059940!49069141!+,9222,292,9203,+,NC_060948.1,62460029,48992872,49002073,9222,...,0.000198,S1_6265,NC_060948.1,49059940,49069141,+,False,-67068,-67068,none
6265,S1_6266!NC_060948.1!44038494!44046481!-,8005,258,7992,-,NC_060948.1,62460029,38789157,38797393,8005,...,0.000433,S1_6266,NC_060948.1,44038494,44046481,-,False,-5249337,-5249088,none
6266,S1_6267!NC_060948.1!15401752!15412274!+,10560,27,10542,+,NC_060948.1,62460029,15391604,15402126,10560,...,0.000130,S1_6267,NC_060948.1,15401752,15412274,+,False,-10148,-10148,none
6267,S1_6268!NC_060948.1!3960720!3980344!-,19663,1968,14492,-,NC_060948.1,62460029,3963940,3976116,19663,...,0.000171,S1_6268,NC_060948.1,3960720,3980344,-,False,3220,-4228,none


Tools for t2tChrY-readst2tChrY-SVs-a0.99-d1-l10000:  50%|█████     | 1/2 [00:53<00:53, 53.02s/it]

Unnamed: 0,CHROM,POS,ID,REF,ALT,QUAL,FILTER,PRECISE,SVTYPE,SVMETHOD,CHR2,END,SVLEN,dup_num
0,NC_060948.1,43055173,[DUP0SURVIVOR],N,DUP,,PASS,True,DUP,SURVIVOR_sim,NC_060948.1,43055840,667,6
1,NC_060948.1,14766290,[DUP1SURVIVOR],N,DUP,,PASS,True,DUP,SURVIVOR_sim,NC_060948.1,14767204,914,6
2,NC_060948.1,32178577,[DUP2SURVIVOR],N,DUP,,PASS,True,DUP,SURVIVOR_sim,NC_060948.1,32178899,322,3
3,NC_060948.1,30743832,[DUP3SURVIVOR],N,DUP,,PASS,True,DUP,SURVIVOR_sim,NC_060948.1,30744029,197,2
4,NC_060948.1,6438389,[DUP4SURVIVOR],N,DUP,,PASS,True,DUP,SURVIVOR_sim,NC_060948.1,6439015,626,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
85,NC_060948.1,9184141,[INV85SURVIVOR],N,INV,,PASS,True,INV,SURVIVOR_sim,NC_060948.1,9184745,604,
86,NC_060948.1,39310397,[INV86SURVIVOR],N,INV,,PASS,True,INV,SURVIVOR_sim,NC_060948.1,39311089,692,
87,NC_060948.1,48277312,[INV87SURVIVOR],N,INV,,PASS,True,INV,SURVIVOR_sim,NC_060948.1,48277929,617,
88,NC_060948.1,24584239,[INV88SURVIVOR],N,INV,,PASS,True,INV,SURVIVOR_sim,NC_060948.1,24584912,673,


Unnamed: 0,query_name,query_length,query_start,query_end,strand,target_name,target_length,target_start,target_end,residue_matches,...,rl,read_name,GT_ref,GT_from,GT_to,GT_strand,is_correct,start_diff,end_diff,read_sv
0,S1_1!NC_060948.1!20412840!20424436!+,11634,10,11618,+,NC_060948.1,62460029,20396795,20408367,9974,...,0,S1_1,NC_060948.1,20412840,20424436,+,False,-16045,-16069,none
1,S1_2!NC_060948.1!3214411!3224511!-,10128,35,10114,-,NC_060948.1,62460029,3214425,3224476,8922,...,81,S1_2,NC_060948.1,3214411,3224511,-,True,14,-35,none
2,S1_3!NC_060948.1!21532531!21540896!+,8385,9,8371,+,NC_060948.1,62460029,21508461,21516803,7502,...,67,S1_3,NC_060948.1,21532531,21540896,+,False,-24070,-24093,none
3,S1_4!NC_060948.1!10719394!10729404!-,10039,15,10038,-,NC_060948.1,62460029,10714783,10724777,8157,...,757,S1_4,NC_060948.1,10719394,10729404,-,False,-4611,-4627,none
4,S1_5!NC_060948.1!47682562!47690571!+,8038,238,7606,+,NC_060948.1,62460029,47621670,47629011,813,...,6653,S1_5,NC_060948.1,47682562,47690571,+,False,-60892,-61560,none
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6391,S1_6265!NC_060948.1!49059940!49069141!+,9222,279,9204,+,NC_060948.1,62460029,48993151,49002055,1192,...,7230,S1_6265,NC_060948.1,49059940,49069141,+,False,-66789,-67086,none
6392,S1_6266!NC_060948.1!44038494!44046481!-,8005,62,7988,-,NC_060948.1,62460029,43983752,43991661,1631,...,5966,S1_6266,NC_060948.1,44038494,44046481,-,False,-54742,-54820,none
6393,S1_6267!NC_060948.1!15401752!15412274!+,10560,17,10552,+,NC_060948.1,62460029,15391621,15402118,9092,...,140,S1_6267,NC_060948.1,15401752,15412274,+,False,-10131,-10156,none
6394,S1_6268!NC_060948.1!3960720!3980344!-,19663,62,19645,-,NC_060948.1,62460029,3960738,3980284,17076,...,44,S1_6268,NC_060948.1,3960720,3980344,-,True,18,-60,none


Tools for t2tChrY-readst2tChrY-SVs-a0.99-d1-l10000: 100%|██████████| 2/2 [01:50<00:00, 55.11s/it]


Unnamed: 0,reads,uniq,mult,unmapped,bad,mult bad group,bad mappings p.multiread,bad uniq,bad mult,bad Q60,bad uniq Q60,bad mult Q60,mean start diff,mean end diff,alignments depth,covered reference,bad DUPmulti,bad INSmulti,bad DELmulti,bad INVmulti,index time,map time,memory
sweepmap,6269,6269 (100.00%),0 (0.00%),0 (0.00%),2832 (45.17%),0,,2832,0,2940,2940,0,30.1,14.3,1.0,37997420 (60.83%),0 / 0 (nan%),0 / 0 (nan%),0 / 0 (nan%),0 / 0 (nan%),0.9,1.5,0.6
minimap,6269,6179 (98.56%),90 (1.44%),0 (0.00%),4171 (66.53%),88,2.39,4091,80,1415,1407,8,-17.3,-42.9,1.02,38922337 (62.32%),0 / 0 (nan%),0 / 0 (nan%),0 / 0 (nan%),0 / 0 (nan%),1.8,132.1,0.3


Visualize how SweepMap parameters influence its accuracy and mapping runtime.

In [67]:
Ks = [14, 16, 18, 20, 22, 24, 26]
Rs = ['0.01', '0.05', '0.1', '0.15', '0.2']

def read_matrix(Ys, Xs, ylabel, xlabel, experiment):
    TOOL = 'sweepmap'
    DATA = ('t2tChrY', 't2tChrY-a0.99-d1-l10000') # ('chm13', 'chm13-a0.99-d1-l10000')

    VARS = {'thinning': ('S', 'M'), 'sketching': ('K', 'R')}
    vary, varx = VARS[experiment]

    map_time_df = pd.DataFrame(index=Ys, columns=Xs)
    map_time_df.index.name = ylabel
    map_time_df.columns.name = xlabel
    issue_perc_df = map_time_df.copy() # pd.DataFrame(index=MAX_SEEDS, columns=MAX_MATCHES)

    for y in Ys:
        for x in Xs:
            refname, data = DATA
            reads = fasta2df(Path('reads') / Path('reads-'+data+'.fa'))
            pref = Path("out") / data / experiment / f'{TOOL}-{vary}{y}-{varx}{x}'
            eval_file = str(pref) + '.eval'
            unaligned,  aligned,  wrong,     issue_perc = get_eval(eval_file, len(reads))
            index_time, map_time, index_mem, total_mem  = get_times(pref)
            map_time_df.loc[y, x] = map_time
            issue_perc_df.loc[y, x] = issue_perc / 100.0

    return map_time_df, issue_perc_df

def plot_SM_heatmap(df, name, title, cm):
    fig, ax = plt.subplots(figsize=(6, 4))
    sns.heatmap(df.astype(float), ax=ax, annot=True, fmt=".2f", cmap=cm) #, cbar=False)
    ax.set_title(title)
    ax.set_ylabel(df.index.name)
    ax.set_xlabel(df.columns.name)
    ax.set_yticklabels(ax.get_yticklabels(), rotation=0)
    ax.invert_yaxis()
    plt.tight_layout()
    plt.show()
    fig.savefig(f'imgs/{name}.png', dpi=300)
    plt.close(fig)

KR_map_time_df, KR_issue_perc_df = read_matrix(Ks, Rs, 'kmer size (K)', 'FracMinHash ratio (R)', 'sketching')
plot_SM_heatmap(KR_map_time_df, name='heatmapKR_maptime', title='Map time', cm='mako_r')
plot_SM_heatmap(KR_issue_perc_df, name='heatmapKR_percissues', title='Percent of issues (wrong + unaligned)', cm='rocket_r')

MAX_SEEDS = [10, 30, 100, 300, 1000, 3000, 10000]
MAX_MATCHES = [100, 300, 1000, 3000, 10000, 30000, 100000, 300000]
SM_map_time_df, SM_issue_perc_df = read_matrix(MAX_SEEDS, MAX_MATCHES, 'max seeds (S)', 'max matches (M)', 'thinning')
plot_SM_heatmap(SM_map_time_df, name='heatmapSM_maptime', title='Map time', cm='mako_r')
plot_SM_heatmap(SM_issue_perc_df, name='heatmapSM_percissues', title='Percent of issues (wrong + unaligned)', cm='rocket_r')

FileNotFoundError: [Errno 2] No such file or directory: 'reads/reads-t2tChrY-a0.99-d1-l10000.fa'

In [None]:
vcf_df = read_vcf('refs/t2tChrY-SVs.vcf')
fa_orig = read_fasta_file('refs/t2tChrY.fa')
fa_SURVIVOR = read_fasta_file('refs/t2tChrY-SVs.fasta')
l_orig = gen_unique_labels(fa_orig)
fa_SV, labels_SV = mutate(fa_orig, l_orig, vcf_df)
#are_equal(fa_orig, fa_SURVIVOR)
are_equal(fa_SV, fa_SURVIVOR)

/home/pesho/.local/lib/python3.10/site-packages/vcfpy/header.py:413: FieldInfoNotFound: INFO dup_num not found using String/'.' instead


Unnamed: 0,CHROM,POS,ID,REF,ALT,QUAL,FILTER,PRECISE,SVTYPE,SVMETHOD,CHR2,END,SVLEN,dup_num
4,NC_060948.1,3430510,[INS4SURVIVOR],A,GGAGCTTAGGAGGTAAAAGGAACTTGTGAAGGTGAACTATCCCTAC...,,PASS,True,INS,SURVIVOR_sim,NC_060948.1,3430745,235,
2,NC_060948.1,14398344,[DUP2SURVIVOR],N,DUP,,PASS,True,DUP,SURVIVOR_sim,NC_060948.1,14398656,312,5.0
5,NC_060948.1,15092668,[DEL5SURVIVOR],TTAGACTTCCTAAATATATAAAGCAAATATTAATGGACATAAAGGG...,T,,PASS,True,DEL,SURVIVOR_sim,NC_060948.1,15092864,196,
8,NC_060948.1,16086589,[INV8SURVIVOR],N,INV,,PASS,True,INV,SURVIVOR_sim,NC_060948.1,16087210,621,
1,NC_060948.1,27847166,[DUP1SURVIVOR],N,DUP,,PASS,True,DUP,SURVIVOR_sim,NC_060948.1,27847906,740,6.0
0,NC_060948.1,35790329,[DUP0SURVIVOR],N,DUP,,PASS,True,DUP,SURVIVOR_sim,NC_060948.1,35790540,211,2.0
7,NC_060948.1,42134928,[INV7SURVIVOR],N,INV,,PASS,True,INV,SURVIVOR_sim,NC_060948.1,42135682,754,
3,NC_060948.1,47095939,[INS3SURVIVOR],G,AGAACCCCCCTTTAATATGAGCGAAATGCCTCTACCCTGGACCACG...,,PASS,True,INS,SURVIVOR_sim,NC_060948.1,47096432,493,
6,NC_060948.1,56548921,[INV6SURVIVOR],N,INV,,PASS,True,INV,SURVIVOR_sim,NC_060948.1,56549555,634,


CHROM                                             NC_060948.1
POS                                                   3430510
ID                                             [INS4SURVIVOR]
REF                                                         A
ALT         GGAGCTTAGGAGGTAAAAGGAACTTGTGAAGGTGAACTATCCCTAC...
QUAL                                                     None
FILTER                                                   PASS
PRECISE                                                  True
SVTYPE                                                    INS
SVMETHOD                                         SURVIVOR_sim
CHR2                                              NC_060948.1
END                                                   3430745
SVLEN                                                     235
dup_num                                                   NaN
Name: 4, dtype: object

2. add 0:3430510 of len 3430510
4. ins of len 235


CHROM          NC_060948.1
POS               14398344
ID          [DUP2SURVIVOR]
REF                      N
ALT                    DUP
QUAL                  None
FILTER                PASS
PRECISE               True
SVTYPE                 DUP
SVMETHOD      SURVIVOR_sim
CHR2           NC_060948.1
END               14398656
SVLEN                  312
dup_num                  5
Name: 2, dtype: object

2. add 3430510:14398344 of len 10967834
3. dup at 14398344 312*5 of len 1560


CHROM                                             NC_060948.1
POS                                                  15092668
ID                                             [DEL5SURVIVOR]
REF         TTAGACTTCCTAAATATATAAAGCAAATATTAATGGACATAAAGGG...
ALT                                                         T
QUAL                                                     None
FILTER                                                   PASS
PRECISE                                                  True
SVTYPE                                                    DEL
SVMETHOD                                         SURVIVOR_sim
CHR2                                              NC_060948.1
END                                                  15092864
SVLEN                                                     196
dup_num                                                   NaN
Name: 5, dtype: object

2. add 14398344:15092668 of len 694324
5. delete 15092668:15092864 of len 196


CHROM          NC_060948.1
POS               16086589
ID          [INV8SURVIVOR]
REF                      N
ALT                    INV
QUAL                  None
FILTER                PASS
PRECISE               True
SVTYPE                 INV
SVMETHOD      SURVIVOR_sim
CHR2           NC_060948.1
END               16087210
SVLEN                  621
dup_num                NaN
Name: 8, dtype: object

2. add 15092864:16086589 of len 993725
6. inv 16086589:16087210 of len 621


CHROM          NC_060948.1
POS               27847166
ID          [DUP1SURVIVOR]
REF                      N
ALT                    DUP
QUAL                  None
FILTER                PASS
PRECISE               True
SVTYPE                 DUP
SVMETHOD      SURVIVOR_sim
CHR2           NC_060948.1
END               27847906
SVLEN                  740
dup_num                  6
Name: 1, dtype: object

2. add 16087210:27847166 of len 11759956
3. dup at 27847166 740*6 of len 4440


CHROM          NC_060948.1
POS               35790329
ID          [DUP0SURVIVOR]
REF                      N
ALT                    DUP
QUAL                  None
FILTER                PASS
PRECISE               True
SVTYPE                 DUP
SVMETHOD      SURVIVOR_sim
CHR2           NC_060948.1
END               35790540
SVLEN                  211
dup_num                  2
Name: 0, dtype: object

2. add 27847166:35790329 of len 7943163
3. dup at 35790329 211*2 of len 422


CHROM          NC_060948.1
POS               42134928
ID          [INV7SURVIVOR]
REF                      N
ALT                    INV
QUAL                  None
FILTER                PASS
PRECISE               True
SVTYPE                 INV
SVMETHOD      SURVIVOR_sim
CHR2           NC_060948.1
END               42135682
SVLEN                  754
dup_num                NaN
Name: 7, dtype: object

2. add 35790329:42134928 of len 6344599
6. inv 42134928:42135682 of len 754


CHROM                                             NC_060948.1
POS                                                  47095939
ID                                             [INS3SURVIVOR]
REF                                                         G
ALT         AGAACCCCCCTTTAATATGAGCGAAATGCCTCTACCCTGGACCACG...
QUAL                                                     None
FILTER                                                   PASS
PRECISE                                                  True
SVTYPE                                                    INS
SVMETHOD                                         SURVIVOR_sim
CHR2                                              NC_060948.1
END                                                  47096432
SVLEN                                                     493
dup_num                                                   NaN
Name: 3, dtype: object

2. add 42135682:47095939 of len 4960257
4. ins of len 493


CHROM          NC_060948.1
POS               56548921
ID          [INV6SURVIVOR]
REF                      N
ALT                    INV
QUAL                  None
FILTER                PASS
PRECISE               True
SVTYPE                 INV
SVMETHOD      SURVIVOR_sim
CHR2           NC_060948.1
END               56549555
SVLEN                  634
dup_num                NaN
Name: 6, dtype: object

2. add 47095939:56548921 of len 9452982
6. inv 56548921:56549555 of len 634


CHROM    !!fake_chrom!!
dtype: object

1. add 56549555: of len 5910474
62466983 62466983


True