In [150]:
import pandas as pd

from evaluation_helper import get_time_and_rss, get_max_gpu_usage

In [151]:
df = pd.DataFrame()
for ds in ['real', 'sim']:
    for mx in [4, 6, 8]:
        # add guppy results
        guppy_logfile_time = f'../../logs/guppy/step6c_basecall_{ds}_{mx}.txt'
        guppy_logfile_nvidia = f'../../logs/guppy/step6c_basecall_{ds}_{mx}_gpu.txt'
        g_user_time, g_system_time, g_elapsed_time, g_max_rss = get_time_and_rss(guppy_logfile_time)
        max_gpu_usage = get_max_gpu_usage(guppy_logfile_nvidia, 'guppy_basecaller')
        df = pd.concat([df, pd.DataFrame([{'Approach': 'Guppy',
                                           'Dataset': ds,
                                           'Maximum Sequence Length': mx * 1000,
                                           'User Time': g_user_time,
                                           'System Time': g_system_time,
                                           'Elapsed Time': g_elapsed_time,
                                           'Max RSS (GB)': g_max_rss,
                                           'Max GPU Memory Usage (GiB)': max_gpu_usage}])],
                       ignore_index=True)

        # add minimap2 results
        minimap_logfile_time = f'../../logs/minimap/step6e_map_{ds}_{mx}.txt'
        m_user_time, m_system_time, m_elapsed_time, m_max_rss = get_time_and_rss(minimap_logfile_time)
        df = pd.concat([df, pd.DataFrame([{'Approach': 'Minimap',
                                           'Dataset': ds,
                                           'Maximum Sequence Length': mx * 1000,
                                           'User Time': m_user_time,
                                           'System Time': m_system_time,
                                           'Elapsed Time': m_elapsed_time,
                                           'Max RSS (GB)': m_max_rss,
                                           'Max GPU Memory Usage (GiB)': 0.0}])],
                       ignore_index=True)
df

Unnamed: 0,Approach,Dataset,Maximum Sequence Length,User Time,System Time,Elapsed Time,Max RSS (GB),Max GPU Memory Usage (GiB)
0,Guppy,real,4000,00:21:22,00:02:49,18:41.54,1.438256,0.758789
1,Minimap,real,4000,00:06:40,00:00:11,3:56.30,2.609836,0.0
2,Guppy,real,6000,00:19:23,00:02:40,17:02.46,1.38412,0.901367
3,Minimap,real,6000,00:07:59,00:00:18,4:36.10,2.80754,0.0
4,Guppy,real,8000,00:18:41,00:02:32,16:02.37,1.358204,1.043945
5,Minimap,real,8000,00:08:47,00:00:17,5:05.99,2.928372,0.0
6,Guppy,sim,4000,00:17:53,00:02:43,15:54.86,1.206684,0.901367
7,Minimap,sim,4000,00:16:09,00:00:49,8:46.95,25.17162,0.0
8,Guppy,sim,6000,00:19:49,00:02:47,16:25.91,1.208464,1.043945
9,Minimap,sim,6000,00:22:15,00:01:19,11:18.31,25.69482,0.0


In [152]:
df['User Time'] = pd.to_timedelta(df['User Time'])
df['System Time'] = pd.to_timedelta(df['System Time'])
df['Elapsed Time'] = '00:' + df['Elapsed Time']
df['Elapsed Time'] = pd.to_timedelta(df['Elapsed Time'])
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12 entries, 0 to 11
Data columns (total 8 columns):
 #   Column                      Non-Null Count  Dtype          
---  ------                      --------------  -----          
 0   Approach                    12 non-null     object         
 1   Dataset                     12 non-null     object         
 2   Maximum Sequence Length     12 non-null     int64          
 3   User Time                   12 non-null     timedelta64[ns]
 4   System Time                 12 non-null     timedelta64[ns]
 5   Elapsed Time                12 non-null     timedelta64[ns]
 6   Max RSS (GB)                12 non-null     float64        
 7   Max GPU Memory Usage (GiB)  12 non-null     float64        
dtypes: float64(2), int64(1), object(2), timedelta64[ns](3)
memory usage: 896.0+ bytes


In [153]:
summed_measures = df.groupby(['Dataset', 'Maximum Sequence Length'])['User Time', 'System Time', 'Elapsed Time', 'Max RSS (GB)', 'Max GPU Memory Usage (GiB)'].apply(lambda x : x.sum())
summed_measures = summed_measures.reset_index()
summed_measures['Approach'] = 'Guppy + Minimap'
df = pd.concat([df, summed_measures], ignore_index=True)
df

  summed_measures = df.groupby(['Dataset', 'Maximum Sequence Length'])['User Time', 'System Time', 'Elapsed Time', 'Max RSS (GB)', 'Max GPU Memory Usage (GiB)'].apply(lambda x : x.sum())


Unnamed: 0,Approach,Dataset,Maximum Sequence Length,User Time,System Time,Elapsed Time,Max RSS (GB),Max GPU Memory Usage (GiB)
0,Guppy,real,4000,0 days 00:21:22,0 days 00:02:49,0 days 00:18:41.540000,1.438256,0.758789
1,Minimap,real,4000,0 days 00:06:40,0 days 00:00:11,0 days 00:03:56.300000,2.609836,0.0
2,Guppy,real,6000,0 days 00:19:23,0 days 00:02:40,0 days 00:17:02.460000,1.38412,0.901367
3,Minimap,real,6000,0 days 00:07:59,0 days 00:00:18,0 days 00:04:36.100000,2.80754,0.0
4,Guppy,real,8000,0 days 00:18:41,0 days 00:02:32,0 days 00:16:02.370000,1.358204,1.043945
5,Minimap,real,8000,0 days 00:08:47,0 days 00:00:17,0 days 00:05:05.990000,2.928372,0.0
6,Guppy,sim,4000,0 days 00:17:53,0 days 00:02:43,0 days 00:15:54.860000,1.206684,0.901367
7,Minimap,sim,4000,0 days 00:16:09,0 days 00:00:49,0 days 00:08:46.950000,25.17162,0.0
8,Guppy,sim,6000,0 days 00:19:49,0 days 00:02:47,0 days 00:16:25.910000,1.208464,1.043945
9,Minimap,sim,6000,0 days 00:22:15,0 days 00:01:19,0 days 00:11:18.310000,25.69482,0.0


In [154]:
# TODO: eval BAM (interpret unclassified as errors, search for unmapped read IDs & duplicated mappings between BAMs) -> balancedAccuracy, TNR, TPR

In [155]:
# TODO: plot metrics