In [1]:
from evals_header import *
# Notebook-wide settings.
pd.options.display.max_rows = 200  # allow displayed frames to show up to 200 rows
T = 0.9  # threshold compared against get_max_jaccard() in add_TP_column

def read_sweep(sweep_fn):
    """Load one headerless, tab-separated sweep output file into a DataFrame.

    The ten columns are named in file order, ``J`` is rounded to three
    decimals, and an ``alignment_bps`` column (``ref_end - ref_start``) is
    appended as the last column.

    Parameters
    ----------
    sweep_fn : anything ``pd.read_csv`` accepts as its first argument.

    Returns
    -------
    pandas.DataFrame
    """
    column_names = [
        'query_name', 'k', 'P_sz', 'p_sz', 'matches',
        'ref_start', 'ref_end', 'minx', 'J', 'runtime',
    ]
    table = pd.read_csv(sweep_fn, sep='\t', header=None)
    # Plain assignment keeps the length-mismatch error for files that do not
    # have exactly ten columns.
    table.columns = column_names
    return table.assign(
        J=table['J'].round(3),
        alignment_bps=table['ref_end'] - table['ref_start'],
    )

def add_TP_column(tested_df, groundtruth_df, TP_column_name):
    """Add a boolean-test column to ``tested_df`` in place.

    For every row, the column ``TP_column_name`` receives the result of
    ``get_max_jaccard(row, groundtruth_df) >= T``, where ``T`` is the
    notebook-level threshold. Nothing is returned.
    """
    def reaches_threshold(row):
        return get_max_jaccard(row, groundtruth_df) >= T

    tested_df[TP_column_name] = tested_df.apply(reaches_threshold, axis=1)

def get_sensitivity(tested_df, truth_df):
    """Cross-annotate two alignment tables and return the two 'TP' ratios.

    A ``'TP'`` column is written into BOTH frames in place by
    ``add_overlap_column`` (each frame is annotated against the other), and
    the first rows of both frames are displayed.

    Parameters
    ----------
    tested_df : DataFrame of alignments under evaluation (mutated in place).
    truth_df : DataFrame of reference alignments (mutated in place).

    Returns
    -------
    tuple
        ``(sensitivity, specificity)`` where ``sensitivity`` is
        ``tested_df['TP'].sum() / len(tested_df)`` and ``specificity`` is
        ``truth_df['TP'].sum() / len(truth_df)``. Previously both values were
        computed and then dropped, so the function returned ``None``.
    """
    # NOTE(review): 'TP' is filled by add_overlap_column here, while the
    # thresholded helper add_TP_column defined above is never called --
    # presumably that helper was intended; confirm before relying on the
    # ratios as true/false-positive rates.
    add_overlap_column(tested_df, truth_df, 'TP')
    add_overlap_column(truth_df, tested_df, 'TP')

    sensitivity = tested_df['TP'].sum() / tested_df.shape[0]  # TP / P
    specificity = truth_df['TP'].sum() / truth_df.shape[0]  # 1 - FP / N

    display(tested_df.head())
    display(truth_df.head())

    return sensitivity, specificity

In [2]:
#def plot_time_hist(df):
#    plt.figure(figsize=(10, 6))
#    #plt.hist(df["runtime"], cumulative=True, bins=20, log=True)
#    plt.hist(df["runtime"], bins=20, cumulative=True, color='green', alpha=0.7, weights=df["runtime"])
#    plt.xlabel("Runtime")
#    plt.ylabel("Cumulative Runtime")
#    plt.title("Histogram of Runtime with Cumulative Runtime on Y-axis (Logarithmic X-axis)")
#    plt.xscale("log")
#    plt.show()

def plot_time_hist(df):
    """Scatter-plot every runtime against the running total of runtimes.

    Rows are ordered by ascending ``runtime``; each point has the row's own
    runtime on the (logarithmic) x-axis and the cumulative sum up to and
    including that row on the y-axis. The first and last five cumulative
    values are displayed before the figure is shown.
    """
    runtimes = df.sort_values(by='runtime')['runtime']
    running_total = runtimes.cumsum()

    # Quick look at both ends of the cumulative series.
    display(running_total.head())
    display(running_total.tail())

    # One marker per row (default marker colour).
    plt.scatter(runtimes.to_numpy(), running_total.to_numpy())

    plt.xlabel('Runtime (seconds)')
    plt.ylabel('Cumulative Sum of Runtimes')
    plt.title('Cumulative Sum of Runtimes')

    # Log x-axis, useful when the range of runtimes is large.
    plt.xscale('log')

    plt.show()

In [3]:
# Input files (commented-out lines are alternative inputs kept for reference)
#reads_fn = 'simulations/reads/t2thumanChrY_sr0.0001090909090909091_dr0.0009818181818181818_i0.0009090909090909091_sd7361077429744071834_lmn100_lmx1000000_lavg9000_ls7000_dp10_rm20.fasta'
reads_fn = 'newevals/reads-ChrY-positive.fa'
#eskemap_fn = 'out/eske100.out'
minimap_fn = 'out/minimap.out'
#sweep_fn = 'sweep-pairs-noblacklist.out'
#sweep_fn = 'out/sweep-b-a-fine.out'
#sweep_fn = 'sweep-normalized-intervals.out'

# Minimap alignments: headerless tab-separated table, columns named in file order
minimap_df = pd.read_csv(minimap_fn, header=None, sep='\t')
minimap_df.columns = [
    'query_name', 'query_len', 'query_start', 'query_end', 'strand',
    'ref_name', 'ref_len', 'ref_start', 'ref_end',
    'match_bases', 'total_bases', 'map_quality', 'cigar',
]

# Simulated reads: metadata parsed from the reads FASTA file
simulated_df = parse_fasta_metadata(reads_fn)

# Load every sweep run, join it onto the simulated reads, score it against
# minimap, and collect one accuracy summary per run.
sweep_dfs = {}
params_frames = []  # per-run summaries, concatenated once after the loop
# All files in out/ whose name starts with "sweep"; sorted so the processing
# and display order is the same on every run.
for sweep_fn in sorted(glob.glob('out/sweep*.out')):
    print(sweep_fn)
    # Left merge keeps every simulated read, including reads with no sweep row.
    merged_df = pd.merge(simulated_df, read_sweep(sweep_fn), on='query_name', how='left')
    add_overlap_column(merged_df, minimap_df, 'overlap')
    sweep_dfs[sweep_fn] = merged_df
    # Named n_correct / n_total so the builtin all() is not shadowed.
    n_correct, n_total, accuracy = get_accuracy(merged_df)
    # sensitivity = get_sensitivity(sweep_dfs[sweep_fn], minimap_df)

    # Parameters of the run live next to it, with extension '.params'.
    # (An unconditional `continue` used to skip everything below, which left
    # the summary permanently empty.)
    params_fn = sweep_fn[:-len('.out')] + '.params'
    try:
        params_df = pd.read_csv(params_fn, sep='\t')
    except FileNotFoundError:
        # No params file for this run: still report its accuracy, keyed by
        # the sweep file name.
        params_df = pd.DataFrame({'sweep_fn': [sweep_fn]})
    params_df['correct'], params_df['all'], params_df['accuracy'] = n_correct, n_total, accuracy
    params_frames.append(params_df)

# Sweep algo
params_dfs = pd.concat(params_frames, axis=0) if params_frames else pd.DataFrame()
display(params_dfs)

# Eskemap
#eskemap_df = pd.read_csv(eskemap_fn, sep='\t', header=None)
#eskemap_df.columns = ['query_name', 'ref_start', 'ref_end', 'jaccard_scoreX1000']

In [4]:
# Output: for every sweep run, show the ten slowest and the ten fastest rows
for fn, df in sweep_dfs.items():
    # descending runtime, so the slowest reads come first
    df = df.sort_values('runtime', ascending=False)
    display(df.iloc[:10])
    display(df.iloc[-10:])

In [6]:
# Per sweep run: read count, mean overlap, total runtime and the runtime plot
for fn, df in sweep_dfs.items():
    print('  {:<25} unique reads={:<5} mean overlap={:.2%}'.format(
        fn,
        len(pd.unique(df['query_name'])),
        df['overlap'].mean(),
    ))
    print('  total runtime: ', df['runtime'].sum())
    plot_time_hist(df)
#print('  Sweep avg overlap: {:.8}'.format(sweep_df['overlap'].mean()))
#sweep_df.to_csv('out/all.csv', sep='\t', index=False)
#plot_all_columns(sweep_df)

In [62]:
# Inspect the reads of one sweep run whose overlap with minimap is below 0.1.
sweep_key = 'out/sweep-Y-x.out'
if sweep_key not in sweep_dfs:
    # Same exception type as a plain lookup, but it names the runs that were
    # actually loaded so the key can be corrected.
    raise KeyError('{!r} is not a loaded sweep run; available: {}'.format(sweep_key, sorted(sweep_dfs)))
sweep_df = sweep_dfs[sweep_key]

# Keep 0 <= overlap < 0.1; the lower bound removes reads not aligned by minimap.
# .copy() makes the selection an independent frame, so adding a column below
# does not write into a slice of sweep_df (avoids SettingWithCopyWarning).
sweep_misaligned_df = sweep_df[(sweep_df['overlap'] >= 0) & (sweep_df['overlap'] < 0.1)].copy()
sweep_misaligned_df['start_diff'] = sweep_misaligned_df['ref_start'] - sweep_misaligned_df['from_ref_sim']

display(sweep_misaligned_df[sweep_misaligned_df['start_diff'] > 0].head(10))

print('Number of misaligned reads:', sweep_misaligned_df.shape[0])
sweep_misaligned_df.to_csv('out/misalignments.csv', sep='\t', index=False)
#plot_all_columns(sweep_misaligned_df)
display(sweep_misaligned_df.head(10))

KeyError: 'out/sweep-Y-x.out'