In [1]:
import pandas as pd

from evaluation_helper import get_time_and_rss, get_max_gpu_usage

In [2]:
# Directory layout used throughout the notebook (all paths are relative to
# the notebook's working directory).
res_dir = '../../data/eval'                # evaluation data and exported CSV tables
log_dir = '../../logs/minimap'             # time / GPU log files of the Minimap runs
minimap_res_dir = f'{res_dir}/minimap'     # Minimap reference lists and alignments

In [3]:
# Collect time and memory measurements for every (dataset, max. length) run.
#
# Rows are gathered in a plain list and converted into a DataFrame once at the
# end. Concatenating onto an initially empty frame in every iteration copies
# all previous rows each time and relies on pandas' deprecated handling of
# empty frames in `pd.concat`.
measure_records = []
for ds in ['real', 'sim']:
    for mx in [4, 6, 8]:
        logfile_time = f'{log_dir}/step6_minimap_{ds}_{mx}.txt'
        logfile_nvidia = f'{log_dir}/step6_minimap_{ds}_{mx}_gpu.txt'
        user_time, system_time, elapsed_time, max_rss = get_time_and_rss(logfile_time)
        # NOTE(review): 'guppy_basecaller' is presumably the process name the
        # helper filters the GPU log by -- confirm in evaluation_helper.
        max_gpu_usage = get_max_gpu_usage(logfile_nvidia, 'guppy_basecaller')
        measure_records.append({
            'ID': f'max{mx}_{ds}',
            'Approach': 'Guppy + Minimap',
            'Dataset': ds,
            'Maximum Sequence Length': mx * 1000,
            'User Time': user_time,
            'System Time': system_time,
            'Elapsed Time': elapsed_time,
            'Max RSS (GB)': max_rss,
            'Max GPU Memory Usage (GiB)': max_gpu_usage,
        })

# One row per run; column order follows the key order of the records above.
measures = pd.DataFrame(measure_records)
measures

In [4]:
def _normalize_elapsed(elapsed):
    """Return `elapsed` without fractional seconds and with a leading hour field.

    The part after the first '.' is dropped. '00:' is prepended only when the
    remaining text has fewer than three colon-separated fields, so values that
    already carry an hour field are left alone. This also makes the cell safe
    to re-run: an already normalised value is returned unchanged instead of
    receiving a second '00:' prefix.
    """
    whole_seconds = str(elapsed).split('.')[0]
    if whole_seconds.count(':') >= 2:
        return whole_seconds
    return '00:' + whole_seconds


# NOTE(review): elapsed times are presumably 'm:ss.xx' strings as returned by
# get_time_and_rss -- confirm against evaluation_helper.
measures['Elapsed Time'] = measures['Elapsed Time'].apply(_normalize_elapsed)
measures.to_csv(f'{res_dir}/MINIMAP_times_and_measures.csv', index=False)

In [5]:
def assign_class_to_ref(ref_name, pos_refs, neg_refs):
    """Return the class label of the reference collection containing `ref_name`.

    Parameters
    ----------
    ref_name : reference name to look up.
    pos_refs : container of reference names labelled 'pos'.
    neg_refs : container of reference names labelled 'neg'.

    Returns
    -------
    'pos' if `ref_name` is in `pos_refs` (checked first), otherwise 'neg' if
    it is in `neg_refs`.

    Raises
    ------
    ValueError
        If `ref_name` is in neither collection.
    """
    # Order matters: a name present in both collections is reported as 'pos'.
    for label, known_refs in (('pos', pos_refs), ('neg', neg_refs)):
        if ref_name in known_refs:
            return label
    raise ValueError(f'Reference name "{ref_name}" not known!')

In [6]:
# Evaluate the Minimap alignments of every (dataset, max. length) run: each
# aligned read gets the class of the reference it was aligned to (prediction)
# and is compared with the class of the file it came from (ground truth).
metric_columns = ['ID', 'Maximum Sequence Length', 'Dataset', 'TP', 'TN', 'FP', 'FN', 'UP', 'UC']

# Read the reference name lists inside `with` blocks so the file handles are
# closed deterministically instead of being left to the garbage collector.
with open(f'{minimap_res_dir}/neg_references.txt') as neg_file:
    neg_refs = neg_file.read().splitlines()
with open(f'{minimap_res_dir}/pos_references.txt') as pos_file:
    pos_refs = pos_file.read().splitlines()

# Sets give constant-time membership tests for the per-row lookups below; the
# lists above are kept unchanged under their original names.
pos_ref_set, neg_ref_set = set(pos_refs), set(neg_refs)

# Value of the 'GT Label' column that corresponds to each read class.
gt_label_of_class = {'pos': 'plasmid', 'neg': 'chr'}

# Rows are collected in a list and turned into a DataFrame once; growing a
# frame with `pd.concat` onto an empty frame is slow, deprecated for empty
# entries, and can turn the count columns into object dtype.
metric_records = []
for ds in ['real', 'sim']:
    for mx in [4, 6, 8]:
        ids = pd.read_csv(f'{res_dir}/max{mx}_gt_test_{ds}_labels.csv')
        alignments = []
        unclassified = {}

        for cls in ['pos', 'neg']:
            alignment = pd.read_csv(f'{minimap_res_dir}/{ds}_max{mx}/{cls}_reads_and_refs.csv')
            alignment['Read Class'] = cls  # ground truth label
            alignment['Reference Class'] = alignment['Reference'].apply(
                lambda ref: assign_class_to_ref(ref, pos_ref_set, neg_ref_set))  # predicted label
            alignments.append(alignment)

            # Reads of this class without a row in the alignment table count
            # as unclassified.
            # NOTE(review): assumes one alignment row per read -- confirm.
            all_reads = len(ids[ids['GT Label'] == gt_label_of_class[cls]])
            matched_reads = len(alignment)
            unclassified[cls] = all_reads - matched_reads

        merged = pd.concat(alignments, ignore_index=True)
        predicted_pos = merged['Reference Class'] == 'pos'
        predicted_neg = merged['Reference Class'] == 'neg'
        truly_pos = merged['Read Class'] == 'pos'
        truly_neg = merged['Read Class'] == 'neg'

        metric_records.append({
            'ID': f'max{mx}_{ds}',
            'Maximum Sequence Length': mx * 1000,
            'Dataset': ds,
            'TP': int((predicted_pos & truly_pos).sum()),
            'TN': int((predicted_neg & truly_neg).sum()),
            'FP': int((predicted_pos & truly_neg).sum()),
            'FN': int((predicted_neg & truly_pos).sum()),
            'UP': unclassified['pos'],  # unclassified plasmid reads
            'UC': unclassified['neg'],  # unclassified chromosome reads
        })

metrics = pd.DataFrame(metric_records, columns=metric_columns)

# Unclassified reads are counted as misses of their true class in all rates.
metrics['TPR (Sensitivity)'] = metrics['TP'] / (metrics['TP'] + metrics['FN'] + metrics['UP'])
metrics['TNR (Specificity)'] = metrics['TN'] / (metrics['TN'] + metrics['FP'] + metrics['UC'])
metrics['FPR'] = 1 - metrics['TNR (Specificity)']
metrics['FNR'] = 1 - metrics['TPR (Sensitivity)']
metrics['Balanced Accuracy'] = (metrics['TPR (Sensitivity)'] + metrics['TNR (Specificity)']) / 2
metrics['Accuracy'] = (metrics['TP'] + metrics['TN']) / (metrics['TP'] + metrics['TN'] + metrics['FP'] + metrics['FN'] + metrics['UP'] + metrics['UC'])
metrics

In [7]:
# Write the metrics table to CSV; the row index is not exported.
metrics_csv_path = f'{res_dir}/MINIMAP_metrics.csv'
metrics.to_csv(metrics_csv_path, index=False)