In [2]:
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from pathlib import Path
from IPython.display import display, HTML
from Bio import SeqIO
import os
#from astropy import units as u
import sys
from readpaf import parse_paf
from collections import Counter
from tqdm import tqdm

import sv

#from itables import init_notebook_mode
#init_notebook_mode(all_interactive=True)

def to_latex(df, data, refname):
    latex = ""
    df.index = df.index.map(lambda x: f'\\{x}')
    df.columns = df.columns.str.replace(' ', '\\\\')
    df.columns = df.columns.str.replace('%', '\%')
    df.columns = df.columns.map(lambda x: '\makecell{' + x + '}')
    #df = df.astype(str).map(lambda x: x.rstrip('0').rstrip('.') if '.' in x else x)
    latex += df.to_latex(escape=False, label=f'tab:{refname}', caption=data, float_format = lambda x: '{:0.2f}'.format(x) if pd.notna(x) else '-')
    #latex += df.to_latex(float_format = lambda x: '{:0.2f}'.format(x) if pd.notna(x) else '-')
    latex += '\n'
    return latex



In [3]:
def union_length(df):
    df_sorted = df.sort_values(by='target_start').reset_index(drop=True)
    merged_intervals = []
    current_start, current_end = df_sorted.iloc[0]['target_start'], df_sorted.iloc[0]['target_end']

    for index, row in df_sorted.iterrows():
        if index == 0:
            continue
        if row['target_start'] <= current_end:
            current_end = max(current_end, row['target_end'])
        else:
            merged_intervals.append((current_start, current_end))
            current_start, current_end = row['target_start'], row['target_end']

    merged_intervals.append((current_start, current_end))
    union_length = sum(end - start for start, end in merged_intervals)
    return union_length

In [15]:
def perc(a, b):
    if b == 0:
        return np.nan
    return 100.0 * a / b

def fasta2df(fn):
    seqs = SeqIO.parse(fn, "fasta")
    df = pd.DataFrame((str(s.id), str(s.seq)) for s in seqs)
    df.columns = ["ID", "Sequence"]
    return df

def is_overlapping(a, sv_row):
    return a.GT_from <= sv_row['END'] and sv_row['POS'] <= a.GT_to 
    
min_overlap = 0.9

# def is_correct(a):
#     if a.GT_ref != a.target_name:
#         return False
#     if a.GT_strand != a.strand:
#         return False
#     union_from = min(a.GT_from, a.target_start)
#     union_to = max(a.GT_to, a.target_end)

#     intersect_from = max(a.GT_from, a.target_start)
#     intersect_to = min(a.GT_to, a.target_end)
#     overlaps = intersect_to - intersect_from >= min_overlap * (union_to - union_from)
#     return overlaps

def is_correct_labels(a, GT_l, target_l, debug=False):
    if a.GT_ref != a.target_name:
        return False
    if a.GT_strand != a.strand:  # won't work for inversions
        return False
#    union_from = min(a.GT_from, a.target_start)
#    union_to = max(a.GT_to, a.target_end)

    GT_labels = Counter(target_l[a.GT_ref][a.GT_from:a.GT_to])
    target_labels = Counter(GT_l[a.target_name][a.target_start:a.target_end])
    #GT_labels = Counter(GT_l[a.GT_ref][a.GT_from:a.GT_to])
    #target_labels = Counter(target_l[a.target_name][a.target_start:a.target_end])
    intersection = GT_labels & target_labels
    union = GT_labels | target_labels
    overlaps = sum(intersection.values()) >= min_overlap * sum(union.values())

    if debug:
        display(a)
        display('           GT from {} to {}'.format(a.GT_from, a.GT_to))
        display('       target from {} to {}'.format(a.target_start, a.target_end))
        display('    GT_labels from {} to {}'.format(min(GT_labels), max(GT_labels)))
        display('target_labels from {} to {}'.format(min(target_labels), max(target_labels)))
        display('{} >?= {} = {} * union {} => {}'.format(sum(intersection.values()), min_overlap * sum(union.values()), min_overlap, sum(union.values()), overlaps))
    return overlaps

def is_correct_labels_df(df: pd.DataFrame, GT_l, target_l, debug=False):
    if (df.GT_ref != df.target_name).any():
        return False
    if (df.GT_strand != df.strand).any():  # won't work for inversions
        return False

    GT_labels = sum([ Counter(target_l[a.GT_ref][a.GT_from:a.GT_to]) for a in df.itertuples() ])
    target_labels = sum([ Counter(GT_l[a.target_name][a.target_start:a.target_end] for a in df.itertuples()) ])
    #GT_labels = Counter(GT_l[a.GT_ref][a.GT_from:a.GT_to])
    #target_labels = Counter(target_l[a.target_name][a.target_start:a.target_end])
    intersection = GT_labels & target_labels
    union = GT_labels | target_labels
    overlaps = sum(intersection.values()) >= min_overlap * sum(union.values())

def read_falls_on_what_sv(a, vcf_df):
    query_start = a['GT_from']
    query_end = a['GT_to']
    start_idx = vcf_df['POS'].searchsorted(query_start, side='right')
    end_idx = vcf_df['END'].searchsorted(query_end, side='left')
    overlap_df = vcf_df.iloc[start_idx:end_idx]
    overlap_ch_df = overlap_df[overlap_df['CHROM'] == a['GT_ref']]
    overlap_ch_sv = overlap_ch_df[(overlap_ch_df['POS'] <= query_end) & (overlap_ch_df['END'] >= query_start)]
    #if len(overlap_ch_sv) > 0:
    #    display(a, overlap_ch_sv)
    if len(overlap_ch_sv) == 0:
        return 'none'
    elif len(overlap_ch_sv) == 1:
        return overlap_ch_sv.iloc[0]['SVTYPE']
    else:
        return 'multi'

def read_paf(pref, reads, experiment, tool, orig_l: dict, mutated_l: dict, vcf_df: pd.DataFrame):
    #display(vcf_df)
    paf_file = pref.with_suffix('.paf')
    no_GT = False
    if not paf_file.exists():
        raise Exception(f"File does not exist or is empty: {paf_file}")
    with open(paf_file) as handle:
        df = parse_paf(handle, dataframe=True)
        df['experiment'] = experiment
        df['tool'] = tool
        try:
            df[ ['read_name', 'GT_ref', 'GT_from', 'GT_to', 'GT_strand'] ] = df['query_name'].str.split('!', expand=True)
            df['GT_from'] = df['GT_from'].astype(int)
            df['GT_to'] = df['GT_to'].astype(int)
            #df['is_correct_labels'] = df.apply(lambda x: is_correct_labels(x, orig_l, mutated_l), axis=1)
            #df['is_correct'] = df.apply(is_correct, axis=1)
            df['is_correct'] = df.apply(lambda x: is_correct_labels(x, orig_l, mutated_l), axis=1)
            df['start_diff'] = df.target_start - df.GT_from  # TODO: different coordinate systems!
            df['end_diff'] = df.target_end - df.GT_to  # TODO: different coordinate systems!
            df['read_sv'] = df.apply(lambda x: read_falls_on_what_sv(x, vcf_df), axis=1)
        except ValueError as e:
            display(e)
            df['read_name'] = df['query_name']
            #df['is_correct_labels'] = True
            df['is_correct'] = True
            df['start_diff'] = 0
            df['end_diff'] = 0
            df['read_sv'] = 'none'
            no_GT = True
        #display(df)
    df = df.sort_values(['read_name', 'residue_matches'], ascending=[True, False], ignore_index=True)

    # initialize from left to right
    myNA = 'N/A'
    bad = 'bad'
    unique = 'uniq'
    multi = 'mult'
    bad_mappings_perc = bad+' mappings p.multiread'
    paf = {}
    paf['unmapped'] = 0
    paf[unique] = 0
    paf[multi] = 0
    for suffix in ['', ' Q60']:
        paf[bad + suffix] = 0
        if suffix == '':
            paf[multi+' '+bad+' group'] = 0
            paf[bad_mappings_perc] = 0
        paf[bad + ' ' + unique + suffix] = 0
        paf[bad + ' ' + multi + suffix] = 0

    def process_group(group_first_index, group_last_index):
        group = df.loc[group_first_index:group_last_index]
        group_is_correct = group['is_correct'].any()
        group_is_unique = len(group) == 1
        suffix = ' Q60' if (group.mapping_quality == 60).all() else ''

        unique_or_multi = unique if group_is_unique else multi
        paf[unique_or_multi] += 1
        if not group_is_correct:
            paf[bad + suffix] += 1
            paf[bad + ' ' + unique_or_multi + suffix] += 1
            if not group_is_unique:
                paf[multi+' '+bad+' group'] += 1
                paf[bad_mappings_perc] += 1

    group_first_i, group_read_name = 0, df.loc[0, 'read_name']
    for i, a in df.iterrows():
        if a.read_name != group_read_name:
            process_group(group_first_i, i-1)
            group_first_i, group_read_name = i, a.read_name
    process_group(group_first_i, len(df)-1)

    paf['unmapped'] = reads - paf[unique] - paf[multi]

    paf['aligned_with 0/1/2+_segments'] = '{} / {} / {} ({:.1f}% / {:.1f}% / {:.1f}%)'.format(
        paf['unmapped'], paf[unique], paf[multi],
        perc(paf['unmapped'], reads), perc(paf[unique], reads), perc(paf[multi], reads))
    #for key in [unique, multi, 'unmapped', bad]:
    #    paf[key] = '{} ({:.2f}%)'.format(paf[key], perc(paf[key], reads))

    if paf[multi] > 0:
        paf[bad_mappings_perc] = '{:.2f}'.format(paf[bad_mappings_perc] / paf[multi])
    else:
        paf[bad_mappings_perc] = myNA

    paf['mean start diff'] = '{:.1f}'.format(df[df.is_correct].start_diff.mean())
    paf['mean end diff'] = '{:.1f}'.format(df[df.is_correct].end_diff.mean())
 
    if no_GT:
        for suff in ['', ' '+unique, ' '+multi]:
            for suffsuff in ['', ' Q60']:
                paf[bad + suff + suffsuff] = myNA 
            paf[multi+' '+bad+' group'] = myNA
            paf['addit '+bad+' aligns'] = myNA
        paf['mean start diff'] = myNA
        paf['mean end diff'] = myNA

    paf['alignments depth'] = '{:.2f}'.format(sum(df['query_length']) / df.iloc[0]['target_length'])
    union_len = union_length(df)
    paf['covered reference'] = '{} ({:.2f}%)'.format(union_len, perc(union_len, df.iloc[0]['target_length']))

    # for each sv type in vcf_df or 'multi', count how many alignment are not correct and calculate their percentage of all alignments on this SV
    for sv_type in list(vcf_df['SVTYPE'].unique()) + ['multi']:
        if sv_type == 'none':
            continue
        sv_df = df[df['read_sv'] == sv_type]
        bad_sv = sv_df[~sv_df['is_correct']]
        paf[f'bad {sv_type}'] = '{} / {} ({:.2f}%)'.format(len(bad_sv), len(sv_df), perc(len(bad_sv), len(sv_df)))

    # debug
    #for row in df[df['read_sv'] != 'none'].itertuples():
    #    is_correct_labels(row, orig_l, mutated_l, debug=True)

    paf.pop('unmapped')
    paf.pop(unique)
    paf.pop(multi)

    return pd.Series(paf, dtype='object')
    
def read_times(pref):
    times = {}
    with open(str(pref) + '.index.time') as f_index_time:
        index_time, index_mem = map(float, f_index_time.readline().split())
        times['index time'] = index_time #* u.second
        #times['index_mem'] = index_mem / 2**20
        with open(str(pref) + '.time') as f_time:
            total_time, total_mem = map(float, f_time.readline().split())
            #times['time total'] = total_time #* u.second
            times['map time'] = total_time - times['index time']
            times['memory'] = (total_mem / 2**20) #* u.GB
    return pd.Series(times, dtype='object').map('{:.1f}'.format)

def get_comparison_table(refname, readsim_refname, experiment, tools):
    empty_cell = -1
    alldf = pd.DataFrame()
    alldf.name = experiment
    ref = fasta2df(Path('refs') / (refname+'.fa'))
    reads = fasta2df(Path('reads') / Path(experiment+'.fa'))

    # SVs
    orig_fa_dict = sv.read_fasta_file(Path('refs') / (refname+'.fa'))
    orig_l = sv.gen_unique_labels(orig_fa_dict)
    try:
        vcf_df = sv.read_vcf(Path('refs') / (readsim_refname+'.vcf'))
        assert((vcf_df.POS <= vcf_df.END).all())
        vcf_df = vcf_df.sort_values(by='POS').reset_index(drop=True)
        mutated_fa, mutated_l = sv.mutate(orig_fa_dict, orig_l, vcf_df)
    except FileNotFoundError:
        vcf_df = None
        mutated_fa, mutated_l = orig_fa_dict, orig_l

    rows = []

    for tool in tqdm(tools, desc=f'Tools for {experiment}'):
        d = Path("out") / experiment / tool / tool
        row = pd.Series({
            'tool': tool,
            'reads': len(reads),
        })
        row = pd.concat([row, read_paf(d, len(reads), experiment, tool, orig_l, mutated_l, vcf_df)])     # .paf
        try:
            row = pd.concat([row, read_times(d)])   # .time, .index.time
        except Exception as e:
            print(f"An error occurred while reading times {d}: {e}")
            row['index time'] = empty_cell
            row['map time'] = empty_cell
            row['memory'] = empty_cell
        rows.append(row)
    alldf = pd.DataFrame(rows)
    alldf = alldf.set_index('tool')
    alldf.index.name = None
    return alldf

Build a table to compare the mappers by accuracy, runtime (indexing and mapping) and memory.

In [16]:
tools = ['sweepmap', 'sweepmap-slow', 'mapquik', 'blend', 'minimap', 'winnowmap'] 
#tools = ['winnowmap', 'minimap'] 
experiments = [
#    ('t2tChrY', 't2tChrY', 't2tChrY-readst2tChrY-a0.99-d10-l10000'),
#    ('chm13',   'chm13',   'chm13-readschm13-a0.99-d0.1-l10000'),
#    ('t2tChrY', 't2tChrY', 't2tChrY-readst2tChrY-a0.99-d1-l24000'),
#    ('chm13',   'chm13',   'HG002_24kb'),
	('t2tChrY', 't2tChrY-SVs', 't2tChrY-readst2tChrY-SVs-a0.99-d0.1-l10000'),
]

pd.set_option('display.width', 100)
css = """ <style> table { font-family: "Courier New", Courier, monospace; } </style> """
display(HTML(css))
dfs = []
keys = []
for refname, readsim_refname, data in experiments:
    vcf_df = get_comparison_table(refname=refname, readsim_refname=readsim_refname, experiment=data, tools=tools).round(2)
    dfs.append(vcf_df)
    keys.append(data)
    df_styled = vcf_df.style.set_caption(data)
    display(df_styled)
    #print(to_latex(df))
#DF = pd.concat(dfs, keys=keys)
#display(DF)

Tools for t2tChrY-readst2tChrY-SVs-a0.99-d0.1-l10000: 100%|██████████| 6/6 [00:22<00:00,  3.77s/it]


Unnamed: 0,reads,bad,mult bad group,bad mappings p.multiread,bad uniq,bad mult,bad Q60,bad uniq Q60,bad mult Q60,aligned_with 0/1/2+_segments,mean start diff,mean end diff,alignments depth,covered reference,bad INV,bad DEL,bad DUP,bad INS,bad multi,index time,map time,memory
sweepmap,638,118,0,,118,0,144,144,0,0 / 638 / 0 (0.0% / 100.0% / 0.0%),-1082619.9,-1082562.3,0.1,5762584 (9.23%),24 / 58 (41.38%),14 / 25 (56.00%),24 / 59 (40.68%),14 / 37 (37.84%),14 / 43 (32.56%),0.8,0.2,0.6
sweepmap-slow,638,118,0,,118,0,24,24,0,0 / 638 / 0 (0.0% / 100.0% / 0.0%),-1045488.8,-1045465.6,0.1,5981227 (9.58%),10 / 58 (17.24%),9 / 25 (36.00%),14 / 59 (23.73%),9 / 37 (24.32%),7 / 43 (16.28%),1.0,60.7,0.7
mapquik,638,206,0,,206,0,31,31,0,76 / 562 / 0 (11.9% / 88.1% / 0.0%),-844564.3,-844567.0,0.09,5210632 (8.34%),17 / 52 (32.69%),10 / 21 (47.62%),20 / 54 (37.04%),15 / 29 (51.72%),20 / 41 (48.78%),0.6,-0.0,1.5
blend,638,195,74,0.78,124,71,64,61,3,0 / 543 / 95 (0.0% / 85.1% / 14.9%),-999652.6,-1000080.2,0.12,5610945 (8.98%),30 / 68 (44.12%),21 / 34 (61.76%),21 / 61 (34.43%),19 / 43 (44.19%),19 / 47 (40.43%),1.1,3.9,0.2
minimap,638,108,59,0.73,53,55,20,16,4,0 / 557 / 81 (0.0% / 87.3% / 12.7%),-1044700.3,-1044910.7,0.14,5763502 (9.23%),27 / 75 (36.00%),29 / 48 (60.42%),18 / 69 (26.09%),11 / 40 (27.50%),29 / 64 (45.31%),1.6,12.1,0.3
winnowmap,638,85,54,0.82,52,33,46,25,21,0 / 572 / 66 (0.0% / 89.7% / 10.3%),-1064044.2,-1064158.5,0.12,5875957 (9.41%),16 / 64 (25.00%),16 / 33 (48.48%),15 / 64 (23.44%),15 / 43 (34.88%),15 / 49 (30.61%),2.7,689.2,0.5


Visualize how SweepMap parameters influence its accuracy and mapping runtime.

In [12]:
Ks = [14, 16, 18, 20, 22, 24, 26]
Rs = ['0.01', '0.05', '0.1', '0.15', '0.2']

def read_matrix(Ys, Xs, ylabel, xlabel, experiment):
    TOOL = 'sweepmap'
    DATA = ('t2tChrY', 't2tChrY-a0.99-d1-l10000') # ('chm13', 'chm13-a0.99-d1-l10000')

    VARS = {'thinning': ('S', 'M'), 'sketching': ('K', 'R')}
    vary, varx = VARS[experiment]

    map_time_df = pd.DataFrame(index=Ys, columns=Xs)
    map_time_df.index.name = ylabel
    map_time_df.columns.name = xlabel
    issue_perc_df = map_time_df.copy() # pd.DataFrame(index=MAX_SEEDS, columns=MAX_MATCHES)

    for y in Ys:
        for x in Xs:
            refname, data = DATA
            reads = fasta2df(Path('reads') / Path('reads-'+data+'.fa'))
            pref = Path("out") / data / experiment / f'{TOOL}-{vary}{y}-{varx}{x}'
            eval_file = str(pref) + '.eval'
            unaligned,  aligned,  wrong,     issue_perc = get_eval(eval_file, len(reads))
            index_time, map_time, index_mem, total_mem  = get_times(pref)
            map_time_df.loc[y, x] = map_time
            issue_perc_df.loc[y, x] = issue_perc / 100.0

    return map_time_df, issue_perc_df

def plot_SM_heatmap(df, name, title, cm):
    fig, ax = plt.subplots(figsize=(6, 4))
    sns.heatmap(df.astype(float), ax=ax, annot=True, fmt=".2f", cmap=cm) #, cbar=False)
    ax.set_title(title)
    ax.set_ylabel(df.index.name)
    ax.set_xlabel(df.columns.name)
    ax.set_yticklabels(ax.get_yticklabels(), rotation=0)
    ax.invert_yaxis()
    plt.tight_layout()
    plt.show()
    fig.savefig(f'imgs/{name}.png', dpi=300)
    plt.close(fig)

KR_map_time_df, KR_issue_perc_df = read_matrix(Ks, Rs, 'kmer size (K)', 'FracMinHash ratio (R)', 'sketching')
plot_SM_heatmap(KR_map_time_df, name='heatmapKR_maptime', title='Map time', cm='mako_r')
plot_SM_heatmap(KR_issue_perc_df, name='heatmapKR_percissues', title='Percent of issues (wrong + unaligned)', cm='rocket_r')

MAX_SEEDS = [10, 30, 100, 300, 1000, 3000, 10000]
MAX_MATCHES = [100, 300, 1000, 3000, 10000, 30000, 100000, 300000]
SM_map_time_df, SM_issue_perc_df = read_matrix(MAX_SEEDS, MAX_MATCHES, 'max seeds (S)', 'max matches (M)', 'thinning')
plot_SM_heatmap(SM_map_time_df, name='heatmapSM_maptime', title='Map time', cm='mako_r')
plot_SM_heatmap(SM_issue_perc_df, name='heatmapSM_percissues', title='Percent of issues (wrong + unaligned)', cm='rocket_r')

FileNotFoundError: [Errno 2] No such file or directory: 'reads/reads-t2tChrY-a0.99-d1-l10000.fa'

In [None]:
vcf_df = read_vcf('refs/t2tChrY-SVs.vcf')
fa_orig = read_fasta_file('refs/t2tChrY.fa')
fa_SURVIVOR = read_fasta_file('refs/t2tChrY-SVs.fasta')
l_orig = gen_unique_labels(fa_orig)
fa_SV, labels_SV = mutate(fa_orig, l_orig, vcf_df)
#are_equal(fa_orig, fa_SURVIVOR)
are_equal(fa_SV, fa_SURVIVOR)

/home/pesho/.local/lib/python3.10/site-packages/vcfpy/header.py:413: FieldInfoNotFound: INFO dup_num not found using String/'.' instead


Unnamed: 0,CHROM,POS,ID,REF,ALT,QUAL,FILTER,PRECISE,SVTYPE,SVMETHOD,CHR2,END,SVLEN,dup_num
4,NC_060948.1,3430510,[INS4SURVIVOR],A,GGAGCTTAGGAGGTAAAAGGAACTTGTGAAGGTGAACTATCCCTAC...,,PASS,True,INS,SURVIVOR_sim,NC_060948.1,3430745,235,
2,NC_060948.1,14398344,[DUP2SURVIVOR],N,DUP,,PASS,True,DUP,SURVIVOR_sim,NC_060948.1,14398656,312,5.0
5,NC_060948.1,15092668,[DEL5SURVIVOR],TTAGACTTCCTAAATATATAAAGCAAATATTAATGGACATAAAGGG...,T,,PASS,True,DEL,SURVIVOR_sim,NC_060948.1,15092864,196,
8,NC_060948.1,16086589,[INV8SURVIVOR],N,INV,,PASS,True,INV,SURVIVOR_sim,NC_060948.1,16087210,621,
1,NC_060948.1,27847166,[DUP1SURVIVOR],N,DUP,,PASS,True,DUP,SURVIVOR_sim,NC_060948.1,27847906,740,6.0
0,NC_060948.1,35790329,[DUP0SURVIVOR],N,DUP,,PASS,True,DUP,SURVIVOR_sim,NC_060948.1,35790540,211,2.0
7,NC_060948.1,42134928,[INV7SURVIVOR],N,INV,,PASS,True,INV,SURVIVOR_sim,NC_060948.1,42135682,754,
3,NC_060948.1,47095939,[INS3SURVIVOR],G,AGAACCCCCCTTTAATATGAGCGAAATGCCTCTACCCTGGACCACG...,,PASS,True,INS,SURVIVOR_sim,NC_060948.1,47096432,493,
6,NC_060948.1,56548921,[INV6SURVIVOR],N,INV,,PASS,True,INV,SURVIVOR_sim,NC_060948.1,56549555,634,


CHROM                                             NC_060948.1
POS                                                   3430510
ID                                             [INS4SURVIVOR]
REF                                                         A
ALT         GGAGCTTAGGAGGTAAAAGGAACTTGTGAAGGTGAACTATCCCTAC...
QUAL                                                     None
FILTER                                                   PASS
PRECISE                                                  True
SVTYPE                                                    INS
SVMETHOD                                         SURVIVOR_sim
CHR2                                              NC_060948.1
END                                                   3430745
SVLEN                                                     235
dup_num                                                   NaN
Name: 4, dtype: object

2. add 0:3430510 of len 3430510
4. ins of len 235


CHROM          NC_060948.1
POS               14398344
ID          [DUP2SURVIVOR]
REF                      N
ALT                    DUP
QUAL                  None
FILTER                PASS
PRECISE               True
SVTYPE                 DUP
SVMETHOD      SURVIVOR_sim
CHR2           NC_060948.1
END               14398656
SVLEN                  312
dup_num                  5
Name: 2, dtype: object

2. add 3430510:14398344 of len 10967834
3. dup at 14398344 312*5 of len 1560


CHROM                                             NC_060948.1
POS                                                  15092668
ID                                             [DEL5SURVIVOR]
REF         TTAGACTTCCTAAATATATAAAGCAAATATTAATGGACATAAAGGG...
ALT                                                         T
QUAL                                                     None
FILTER                                                   PASS
PRECISE                                                  True
SVTYPE                                                    DEL
SVMETHOD                                         SURVIVOR_sim
CHR2                                              NC_060948.1
END                                                  15092864
SVLEN                                                     196
dup_num                                                   NaN
Name: 5, dtype: object

2. add 14398344:15092668 of len 694324
5. delete 15092668:15092864 of len 196


CHROM          NC_060948.1
POS               16086589
ID          [INV8SURVIVOR]
REF                      N
ALT                    INV
QUAL                  None
FILTER                PASS
PRECISE               True
SVTYPE                 INV
SVMETHOD      SURVIVOR_sim
CHR2           NC_060948.1
END               16087210
SVLEN                  621
dup_num                NaN
Name: 8, dtype: object

2. add 15092864:16086589 of len 993725
6. inv 16086589:16087210 of len 621


CHROM          NC_060948.1
POS               27847166
ID          [DUP1SURVIVOR]
REF                      N
ALT                    DUP
QUAL                  None
FILTER                PASS
PRECISE               True
SVTYPE                 DUP
SVMETHOD      SURVIVOR_sim
CHR2           NC_060948.1
END               27847906
SVLEN                  740
dup_num                  6
Name: 1, dtype: object

2. add 16087210:27847166 of len 11759956
3. dup at 27847166 740*6 of len 4440


CHROM          NC_060948.1
POS               35790329
ID          [DUP0SURVIVOR]
REF                      N
ALT                    DUP
QUAL                  None
FILTER                PASS
PRECISE               True
SVTYPE                 DUP
SVMETHOD      SURVIVOR_sim
CHR2           NC_060948.1
END               35790540
SVLEN                  211
dup_num                  2
Name: 0, dtype: object

2. add 27847166:35790329 of len 7943163
3. dup at 35790329 211*2 of len 422


CHROM          NC_060948.1
POS               42134928
ID          [INV7SURVIVOR]
REF                      N
ALT                    INV
QUAL                  None
FILTER                PASS
PRECISE               True
SVTYPE                 INV
SVMETHOD      SURVIVOR_sim
CHR2           NC_060948.1
END               42135682
SVLEN                  754
dup_num                NaN
Name: 7, dtype: object

2. add 35790329:42134928 of len 6344599
6. inv 42134928:42135682 of len 754


CHROM                                             NC_060948.1
POS                                                  47095939
ID                                             [INS3SURVIVOR]
REF                                                         G
ALT         AGAACCCCCCTTTAATATGAGCGAAATGCCTCTACCCTGGACCACG...
QUAL                                                     None
FILTER                                                   PASS
PRECISE                                                  True
SVTYPE                                                    INS
SVMETHOD                                         SURVIVOR_sim
CHR2                                              NC_060948.1
END                                                  47096432
SVLEN                                                     493
dup_num                                                   NaN
Name: 3, dtype: object

2. add 42135682:47095939 of len 4960257
4. ins of len 493


CHROM          NC_060948.1
POS               56548921
ID          [INV6SURVIVOR]
REF                      N
ALT                    INV
QUAL                  None
FILTER                PASS
PRECISE               True
SVTYPE                 INV
SVMETHOD      SURVIVOR_sim
CHR2           NC_060948.1
END               56549555
SVLEN                  634
dup_num                NaN
Name: 6, dtype: object

2. add 47095939:56548921 of len 9452982
6. inv 56548921:56549555 of len 634


CHROM    !!fake_chrom!!
dtype: object

1. add 56549555: of len 5910474
62466983 62466983


True