In [7]:
import pandas as pd

import evals_header

# NOTE(review): later cells call parse_fasta_metadata() and plot_all_columns()
# unqualified, which a plain `import evals_header` does not bring into scope.
# Presumably they live in evals_header -- confirm, then import them explicitly
# so the notebook survives Restart Kernel -> Run All.

# Allow up to 200 rows to be rendered when a DataFrame is displayed.
pd.set_option('display.max_rows', 200)

In [1]:
# ---------------------------------------------------------------------------
# Input locations
# ---------------------------------------------------------------------------
reads_fn = 'simulations/reads/t2thumanChrY_sr0.0001090909090909091_dr0.0009818181818181818_i0.0009090909090909091_sd7361077429744071834_lmn100_lmx1000000_lavg9000_ls7000_dp10_rm20.fasta'
minimap_fn = 'out/minimap.out'
sweep_fn = 'sweep.out'

# Per-read metadata extracted from the simulated reads file.
simulated_df = parse_fasta_metadata(reads_fn)

# ---------------------------------------------------------------------------
# sweep results: headerless, tab-separated, one row per reported mapping
# ---------------------------------------------------------------------------
sweep_df = pd.read_csv(sweep_fn, sep='\t', header=None)
sweep_df.columns = [
    'query_name', 'k', 'P_sz','p_sz', 'matches',
    'ref_start', 'ref_end', 'minx', 'J',
]
sweep_df = sweep_df.assign(
    # keep three decimals of the reported J value
    J=lambda frame: frame['J'].round(3),
    # length of the reference interval covered by the mapping
    alignment_bps=lambda frame: frame['ref_end'] - frame['ref_start'],
)
# Left join: every simulated read is kept, with or without a sweep row.
sweep_df = pd.merge(left=simulated_df, right=sweep_df, on='query_name', how='left')

# Disabled experiment: pad the mapped interval by 7% on each side.
#sweep_df['ref_start'] -= 0.07*sweep_df['alignment_len']
#sweep_df['ref_end'] += 0.07*sweep_df['alignment_len']
#sweep_fn = pd.merge(simulated_df, sweep_df, on='query_name', how='left')

#display(sweep_df)

# ---------------------------------------------------------------------------
# minimap results: headerless, tab-separated
# ---------------------------------------------------------------------------
minimap_df = pd.read_csv(minimap_fn, sep='\t', header=None)
minimap_df.columns = [
    'query_name', 'query_len', 'query_start', 'query_end', 'strand',
    'ref_name', 'ref_len', 'ref_start', 'ref_end',
    'match_bases', 'total_bases', 'map_quality', 'cigar',
]

# Disabled: eskemap comparison.
#eskemap_fn = 'out/eske100.out'
#eskemap_df = pd.read_csv(eskemap_fn, sep='\t', header=None)
#eskemap_df.columns = ['query_name', 'ref_start', 'ref_end', 'jaccard_scoreX1000']

# the intersection divided by the union of the reference intervals 
def get_jaccard_nucl_overlap(row1, row2):
    """Jaccard similarity of the reference intervals of two mappings.

    Both rows must carry 'query_name', 'ref_start' and 'ref_end' and must
    refer to the same query (checked with an assertion).

    Returns the length of the interval intersection divided by the length of
    the span covering both intervals, or 0 when the intervals do not overlap
    or that span has zero length.
    """
    assert row1['query_name'] == row2['query_name']

    # Guard clauses: one interval starts strictly after the other one ends.
    if row1['ref_start'] > row2['ref_end']:
        return 0
    if row2['ref_start'] > row1['ref_end']:
        return 0

    start1, end1 = row1['ref_start'], row1['ref_end']
    start2, end2 = row2['ref_start'], row2['ref_end']

    shared_len = min(end1, end2) - max(start1, start2)
    covered_len = max(end1, end2) - min(start1, start2)

    # Two identical zero-length intervals would otherwise divide by zero.
    return shared_len / covered_len if covered_len != 0 else 0

# calculate the maximal jaccard of a row with all other rows with the same query_name in the dataframe
def get_max_jaccard(row, groundtruth_df):
    """Best Jaccard overlap between `row` and its same-query groundtruth rows.

    Only rows of `groundtruth_df` whose 'query_name' equals that of `row` are
    considered. Returns -1 when there is no such row; otherwise the largest
    value of get_jaccard_nucl_overlap(row, candidate) over the candidates.
    """
    same_query = groundtruth_df['query_name'] == row['query_name']
    candidates = groundtruth_df[same_query]

    # -1 is the sentinel for "query absent from the groundtruth".
    if len(candidates) == 0:
        return -1

    scores = candidates.apply(
        lambda truth_row: get_jaccard_nucl_overlap(row, truth_row),
        axis=1,
    )
    return scores.max()

# add a column to tested_df holding, for each row, the maximal Jaccard overlap with any groundtruth row that has the same query_name (-1 if there is none)
def add_overlap_column(tested_df, groundtruth_df, intersection_column_name):
    """Annotate `tested_df` in place with its best groundtruth overlap.

    For every row of `tested_df`, get_max_jaccard(row, groundtruth_df) is
    evaluated and the result is stored in the column named
    `intersection_column_name`. The frame is mutated; nothing is returned.
    """
    def best_overlap(tested_row):
        return get_max_jaccard(tested_row, groundtruth_df)

    overlaps = tested_df.apply(best_overlap, axis=1)
    tested_df[intersection_column_name] = overlaps

# get the accuracy
def get_accuracy(tested_df, groundtruth_df, min_overlap=0.1):
    """Fraction of `tested_df` rows that agree with the groundtruth.

    A row counts as correct when its best Jaccard overlap with a groundtruth
    row of the same query is strictly greater than `min_overlap`.

    Parameters
    ----------
    tested_df : DataFrame
        Mappings under test. Mutated in place: an 'overlap' column is added
        (or overwritten) by add_overlap_column.
    groundtruth_df : DataFrame
        Reference mappings to compare against.
    min_overlap : float, default 0.1
        Overlap threshold a row has to exceed to be counted as correct.

    Returns
    -------
    float
        correct rows / all rows, or 0.0 when `tested_df` has no rows
        (instead of dividing by zero).
    """
    total = tested_df.shape[0]
    if total == 0:
        # Nothing to score; avoid ZeroDivisionError on an empty frame.
        return 0.0

    add_overlap_column(tested_df, groundtruth_df, 'overlap')
    hits = int((tested_df['overlap'] > min_overlap).sum())
    return hits / total

In [None]:
# Score every sweep mapping against minimap: a mapping is "correct" when its
# best Jaccard overlap with a minimap mapping of the same read exceeds the threshold.
OVERLAP_THRESHOLD = 0.1

add_overlap_column(sweep_df, minimap_df, 'overlap')
is_correct = sweep_df['overlap'] > OVERLAP_THRESHOLD

correct = int(is_correct.sum())
# Named `total` rather than `all` so the builtin all() is not shadowed in the kernel.
total = sweep_df.shape[0]
accuracy = correct / total if total else 0.0
print('  sweep accuracy: {:.2%} ({} / {})'.format(accuracy, correct, total))
sweep_df.to_csv('all.csv', sep='\t', index=False)

# Take the exact complement of the "correct" mask so that every row that was
# not counted as correct (including overlap == threshold, NaN, and the -1
# "no groundtruth" sentinel) ends up in the misalignment report.
sweep_misaligned_df = sweep_df[~is_correct]
sweep_misaligned_df.to_csv('misalignments.csv', sep='\t', index=False)

  sweep accuracy: 98.06% (6806 / 6941)


In [None]:
# Inspect the sweep mappings that failed the overlap check in the previous cell.
# NOTE(review): plot_all_columns is not defined in this notebook -- presumably
# it comes from evals_header; confirm what it draws.
plot_all_columns(sweep_misaligned_df)
# Render the misaligned rows as a table (row limit set via display.max_rows).
display(sweep_misaligned_df)

Unnamed: 0,query_name,from_ref_sim,to_ref_sim,read_len,k,P_sz,p_sz,matches,ref_start,ref_end,minx,J,alignment_bps,overlap
100,s_100,43510727,43511172,445,15,443,7,69,10945413,10945414,2,0.286,1,0.0
247,s_247,32110413,32110778,365,15,366,0,0,0,0,9,0.0,0,0.0
283,s_283,42399887,42401255,1368,15,1372,7,34,10911765,10911765,1,0.143,0,0.0
293,s_293,37240201,37241199,998,15,1001,11,163,11754400,11754400,1,0.091,0,0.0
336,s_336,39813781,39815191,1410,15,1411,10,35,28027303,28027303,1,0.1,0,0.0
358,s_358,38344415,38347024,2609,15,2608,37,1013,35795870,35798030,23,0.622,2160,0.0
444,s_444,47116197,47116314,117,15,118,0,0,0,0,9,0.0,0,0.0
586,s_586,47925570,47925735,165,15,166,4,128,30992072,30992087,4,1.0,15,0.0
602,s_602,29653773,29655728,1955,15,1954,12,191,30914461,30914471,3,0.25,10,0.0
653,s_653,57886856,57887204,348,15,349,0,0,0,0,9,0.0,0,0.0
