In [4]:
import json 
import numpy as np

def get_number_regions(experiment): 
  """Return the number of lines in ``<experiment>/regions.bed.gz``.

  The file is decompressed with the standard-library ``gzip`` module rather
  than by shelling out to ``less ... | wc -l``: ``less`` only decompresses
  ``.gz`` input when a ``lesspipe`` preprocessor is configured, and the shell
  form needs IPython and a POSIX shell to run at all.

  Args:
    experiment: path of the experiment directory.

  Returns:
    int: number of lines in the decompressed file. A final line without a
    trailing newline is counted too (``wc -l`` would not count it).

  Raises:
    FileNotFoundError: if ``regions.bed.gz`` does not exist.
  """
  import gzip  # local import so this block does not depend on the import cell

  # Binary mode: only line boundaries matter, so no decoding is needed.
  with gzip.open(f'{experiment}/regions.bed.gz', 'rb') as regions_file:
    return sum(1 for _ in regions_file)

def get_confusion_counts(experiment):
  """Parse ``<experiment>/summary.txt`` as JSON and return the result.

  Args:
    experiment: path of the experiment directory.

  Returns:
    The decoded JSON document (callers in this notebook index it with the
    keys 'TP-base', 'FN' and 'FP').

  Raises:
    FileNotFoundError: if the summary file does not exist.
  """
  summary_path = f'{experiment}/summary.txt'
  with open(summary_path) as summary_file:
    return json.loads(summary_file.read())

def get_eventCount_callCount(experiment):
  """Return ``(event_count, call_count)`` for one experiment.

  event_count is TP-base + FN and call_count is TP-base + FP, both taken
  from the experiment's summary.

  Returns:
    A pair of counts, or ``(None, None)`` when the summary file is missing.
  """
  try:
    summary = get_confusion_counts(experiment)
    events = summary['TP-base'] + summary['FN']
    calls = summary['TP-base'] + summary['FP']
  except FileNotFoundError:
    return None, None
  return events, calls

def get_TPR_FDR(experiment):
  """Return ``(TPR, FDR)`` for one experiment.

  TPR = TP-base / (TP-base + FN) and FDR = FP / (TP-base + FP).

  The summary is read once, and the two rates are computed independently:
  a zero denominator only makes the affected rate ``None`` instead of
  discarding the other, still well-defined, rate as well.

  Args:
    experiment: path of the experiment directory.

  Returns:
    A pair ``(TPR, FDR)``. Each entry is a float, or ``None`` when its
    denominator is zero. ``(None, None)`` when the summary file is missing.
  """
  try:
    counts = get_confusion_counts(experiment)
  except FileNotFoundError:
    return None, None

  true_positives = counts['TP-base']
  false_positives = counts['FP']
  event_count = true_positives + counts['FN']
  call_count = true_positives + false_positives

  # Guard each denominator separately so one empty category does not hide the other rate.
  TPR = true_positives/float(event_count) if event_count else None
  FDR = false_positives/float(call_count) if call_count else None
  return TPR, FDR

def get_TP_eventCount(experiment):
  """Return ``(TP-base, event_count)`` for one experiment.

  Returns:
    The true-positive count from the summary together with the event count
    reported by ``get_eventCount_callCount``, or ``(None, None)`` when the
    summary file is missing.
  """
  try:
    summary = get_confusion_counts(experiment)
    events = get_eventCount_callCount(experiment)[0]
    return summary['TP-base'], events
  except FileNotFoundError:
    return None, None

def get_FP_callCount(experiment):
  """Return ``(FP, call_count)`` for one experiment.

  Returns:
    The false-positive count from the summary together with the call count
    reported by ``get_eventCount_callCount``, or ``(None, None)`` when the
    summary file is missing.
  """
  try:
    summary = get_confusion_counts(experiment)
    calls = get_eventCount_callCount(experiment)[1]
    return summary['FP'], calls
  except FileNotFoundError:
    return None, None

def get_TPRs_FDRs(experiments): 
  """Return per-experiment TPRs and FDRs as two parallel tuples.

  Args:
    experiments: iterable of experiment directory paths.

  Returns:
    ``(TPRs, FDRs)``: two tuples with one entry per experiment, in input
    order. Entries are whatever ``get_TPR_FDR`` returns for that experiment.
    For an empty input both tuples are empty (the plain ``zip(*...)``
    unpacking would raise ``ValueError`` in that case).
  """
  rate_pairs = [get_TPR_FDR(experiment) for experiment in experiments]
  if not rate_pairs:
    return (), ()
  TPRs, FDRs = zip(*rate_pairs)
  return TPRs, FDRs

def get_numbers_regions(experiments):
  """Return the region count of every experiment, in input order."""
  region_counts = []
  for experiment in experiments:
    region_counts.append(get_number_regions(experiment))
  return region_counts

def get_eventCounts_callCounts(experiments): 
  """Return per-experiment event counts and call counts as parallel tuples.

  Args:
    experiments: iterable of experiment directory paths.

  Returns:
    ``(event_counts, call_counts)``: two tuples with one entry per
    experiment, in input order. Entries are whatever
    ``get_eventCount_callCount`` returns for that experiment. For an empty
    input both tuples are empty (the plain ``zip(*...)`` unpacking would
    raise ``ValueError`` in that case).
  """
  count_pairs = [get_eventCount_callCount(experiment) for experiment in experiments]
  if not count_pairs:
    return (), ()
  event_counts, call_counts = zip(*count_pairs)
  return event_counts, call_counts

def get_overall_TPR_FDR(experiments): 
  """Return the TPR and FDR pooled over all experiments.

  TPR = sum(TP-base) / sum(event_count) and FDR = sum(FP) / sum(call_count),
  where the sums run over the experiments.

  Experiments whose counts are unavailable (the per-experiment helpers
  return ``None`` for them) are left out of the sums instead of making
  ``np.sum`` fail with a ``TypeError``.

  Args:
    experiments: iterable of experiment directory paths.

  Returns:
    ``(TPR, FDR)``. Each entry is ``None`` when its pooled denominator is
    zero, which includes the case of no usable experiments.
  """
  def pooled_rate(pairs):
    # Keep only experiments for which both numerator and denominator are known.
    usable = [(numerator, denominator) for numerator, denominator in pairs
              if numerator is not None and denominator is not None]
    total_denominator = np.sum([denominator for _, denominator in usable])
    if total_denominator == 0:
      return None
    return np.sum([numerator for numerator, _ in usable])/total_denominator

  TPR = pooled_rate([get_TP_eventCount(experiment) for experiment in experiments])
  FDR = pooled_rate([get_FP_callCount(experiment) for experiment in experiments])

  return TPR, FDR
  
def get_experiments_and_size_ranges(consortium, population, sample, svtype, caller):
  """Find the truvari experiment directories for one caller and parse their size ranges.

  Experiment directories match
  ``<consortium>/data/<population>,<sample>/truvari-<svtype>-<start>,<end>-pacbio-<caller>``.

  The lookup uses ``glob`` instead of ``! ls -d``, so it needs neither IPython
  nor a shell. The size range is parsed from the directory's base name, so
  hyphens elsewhere in the path (e.g. in ``consortium``) cannot shift the
  field that is picked.

  Args:
    consortium, population, sample, svtype, caller: components of the
      experiment path pattern shown above.

  Returns:
    ``(experiments, region_size_ranges_str, region_size_ranges)``: three
    parallel lists holding the matching paths (sorted lexicographically),
    the ``'<start>,<end>'`` strings, and the corresponding ``(int, int)``
    tuples. All three are empty when nothing matches.
  """
  import glob  # local imports so this block does not depend on the import cell
  import os

  pattern = f'{consortium}/data/{population},{sample}/truvari-{svtype}-*,*-pacbio-{caller}'
  experiments = sorted(glob.glob(pattern))
  # Third '-'-separated field of 'truvari-<svtype>-<start>,<end>-pacbio-<caller>'.
  region_size_ranges_str = [os.path.basename(experiment).split('-')[2] for experiment in experiments]
  region_size_ranges = [region_size_range_str.split(',') for region_size_range_str in region_size_ranges_str]
  region_size_ranges = [(int(start), int(end)) for start, end in region_size_ranges]
  return experiments, region_size_ranges_str, region_size_ranges

def zip_and_sort(experiments_and_size_ranges):
  """Sort parallel sequences by the third one and return the first two.

  Args:
    experiments_and_size_ranges: three parallel sequences, as returned by
      ``get_experiments_and_size_ranges``; the third one is the sort key.

  Returns:
    A pair of tuples: the first and second sequences, reordered so that the
    third sequence is ascending. For empty input the result is ``((), ())``,
    so callers can always unpack two values.
  """
  rows = sorted(zip(*experiments_and_size_ranges), key=lambda row: row[2])
  if not rows:
    return (), ()
  return tuple(zip(*rows))[:2]

# Example: the manta DEL experiments for PUR/HG00733 and their size-range strings,
# ordered by ascending region-size range.
zip_and_sort(get_experiments_and_size_ranges('chaisson_2019', 'PUR', 'HG00733', 'DEL', 'manta'))

(('chaisson_2019/data/PUR,HG00733/truvari-DEL-600,625-pacbio-manta',
  'chaisson_2019/data/PUR,HG00733/truvari-DEL-625,650-pacbio-manta',
  'chaisson_2019/data/PUR,HG00733/truvari-DEL-650,700-pacbio-manta',
  'chaisson_2019/data/PUR,HG00733/truvari-DEL-700,800-pacbio-manta',
  'chaisson_2019/data/PUR,HG00733/truvari-DEL-800,1000-pacbio-manta',
  'chaisson_2019/data/PUR,HG00733/truvari-DEL-1000,2000-pacbio-manta',
  'chaisson_2019/data/PUR,HG00733/truvari-DEL-2000,100000-pacbio-manta'),
 ('600,625',
  '625,650',
  '650,700',
  '700,800',
  '800,1000',
  '1000,2000',
  '2000,100000'))

In [5]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots

def get_font(size=20, family='Arial'):
  """Return a font specification dict: the given family and size, in black."""
  return {'family': family, 'color': 'black', 'size': size}

def get_marker(color=None, linewidth=1, linecolor='black'):
  """Return a marker specification dict with a fill color and an outline.

  Args:
    color: fill color; passed through unchanged (``None`` by default).
    linewidth: width of the outline.
    linecolor: color of the outline.
  """
  outline = {'width': linewidth, 'color': linecolor}
  return {'line': outline, 'color': color}

def update_axes(fig, axis, text=None, showline=True, ticks='outside', linewidth=1.5, range_=None, row=1, col=1):
  """Apply the notebook's axis styling to one axis of one subplot.

  Args:
    fig: figure exposing ``update_xaxes`` and ``update_yaxes``.
    axis: 'x' or 'y'; any other value raises ``KeyError``.
    text: axis title text.
    showline: whether the axis line is drawn.
    ticks: tick placement, forwarded as ``ticks``.
    linewidth: used for both the axis line and the tick marks.
    range_: axis range, forwarded as ``range``.
    row, col: subplot the update is restricted to.
  """
  updaters = {'x': fig.update_xaxes, 'y': fig.update_yaxes}
  apply_update = updaters[axis]

  # Black axis line and ticks, no grid; everything else comes from the arguments.
  settings = {
    'title': {'text': text},
    'showgrid': False,
    'showline': showline,
    'linewidth': linewidth,
    'linecolor': 'black',
    'ticks': ticks,
    'tickwidth': linewidth,
    'tickcolor': 'black',
    'ticklen': 10,
    'range': range_,
    'row': row,
    'col': col,
  }
  apply_update(**settings)

def get_slop(consortium, population, sample):
  """Return the ``makeRegions.slop`` setting of a sample as an int.

  The value is read from ``<consortium>/data/<population>,<sample>/config.json``.
  """
  config_path = f'{consortium}/data/{population},{sample}/config.json'
  with open(config_path) as config_file:
    config = json.load(config_file)
  return int(config['makeRegions']['slop'])

def region_to_VNTR(consortium, population, sample, region_size_ranges_str): 
  """Convert region size ranges into VNTR size-range labels.

  Twice the sample's slop is subtracted from both bounds of every range
  (presumably because each region is a VNTR padded by ``slop`` on both
  sides; confirm against the region-making step).

  The slop is looked up once per call instead of twice per range, which
  avoids re-reading ``config.json`` for every bound.

  Args:
    consortium, population, sample: identify the sample whose slop is used.
    region_size_ranges_str: iterable of ``'<start>,<end>'`` strings.

  Returns:
    list of ``'<start>-<end>'`` strings, one per input range. An empty input
    gives an empty list without touching the config file.
  """
  region_size_ranges = [region_size_range_str.split(',') for region_size_range_str in region_size_ranges_str]
  if not region_size_ranges:
    return []

  # Loop-invariant: the slop depends only on the sample, not on the range.
  padding = 2*get_slop(consortium, population, sample)
  return [f'{int(start)-padding}-{int(end)-padding}' 
          for start, end in region_size_ranges]

def get_VNTR_size_ranges__TPRs__FDRs(consortium, population, sample, svtype, caller):
  """Collect the per-size-range TPRs and FDRs of one caller.

  Also prints the caller's overall (pooled) TPR and FDR.

  Returns:
    ``(VNTR_size_ranges, TPRs, FDRs)``: parallel sequences ordered by
    ascending region-size range.
  """
  discovered = get_experiments_and_size_ranges(consortium, population, sample, svtype, caller)
  experiments, region_size_ranges_str = zip_and_sort(discovered)

  per_range_TPRs, per_range_FDRs = get_TPRs_FDRs(experiments)
  overall_TPR, overall_FDR = get_overall_TPR_FDR(experiments)

  print(f'{population}:{sample}:{svtype}:{caller}:')
  print(f'overall TPR = {overall_TPR}')
  print(f'overall FDR = {overall_FDR}')

  size_labels = region_to_VNTR(consortium, population, sample, region_size_ranges_str)
  return size_labels, per_range_TPRs, per_range_FDRs

def get_VNTR_size_ranges__numbers_regions__event_counts__call_counts(consortium, population, sample, svtype, caller):
  """Collect the per-size-range region, event and call counts of one caller.

  Returns:
    ``(VNTR_size_ranges, numbers_regions, event_counts, call_counts)``:
    parallel sequences ordered by ascending region-size range.
  """
  discovered = get_experiments_and_size_ranges(consortium, population, sample, svtype, caller)
  experiments, region_size_ranges_str = zip_and_sort(discovered)

  region_counts = get_numbers_regions(experiments)
  events, calls = get_eventCounts_callCounts(experiments)
  size_labels = region_to_VNTR(consortium, population, sample, region_size_ranges_str)

  return size_labels, region_counts, events, calls

def create_bar_chart_AB(consortium, population, sample, svtype, caller, chart_type, color):
  """Build the bar trace of one caller for the TPR or the FDR panel.

  Args:
    consortium, population, sample, svtype: select the experiments.
    caller: 'manta' or 'trfermikit.unitigSupport.thinned'; any other value
      raises ``KeyError``.
    chart_type: 'TPR' or 'FDR'; selects the plotted values. Only the 'TPR'
      trace is shown in the legend.
    color: fill color of the bars.

  Returns:
    A ``go.Bar`` with the VNTR size ranges on x and the chosen rate on y.
  """
  VNTR_size_ranges, TPRs, FDRs = get_VNTR_size_ranges__TPRs__FDRs(consortium, population, sample, svtype, caller)

  rates_by_chart_type = {'TPR': TPRs, 'FDR': FDRs}
  legend_names = {
    'manta': 'manta',
    'trfermikit.unitigSupport.thinned': 'trfermikit'
  }

  bar_heights = rates_by_chart_type[chart_type]
  legend_name = legend_names[caller]
  marker = get_marker(color=color)

  # Legend entries come from the TPR panel only, so each caller is listed once.
  if chart_type == 'TPR':
    show_in_legend = True
  elif chart_type == 'FDR':
    show_in_legend = False
  else:
    raise ValueError

  return go.Bar(
    x=VNTR_size_ranges,
    y=bar_heights,
    name=legend_name,
    marker=marker,
    showlegend=show_in_legend
  )

In [6]:
def plot_performance_AB(consortium, population, sample, svtype, chart_type, fig, row, col):
  """Draw the TPR or FDR panel comparing trfermikit (red) with manta (green).

  Args:
    consortium, population, sample, svtype: select the experiments.
    chart_type: 'TPR' or 'FDR'.
    fig: figure the traces are added to.
    row, col: subplot position inside ``fig``.
  """
  callers_and_colors = (
    ('trfermikit.unitigSupport.thinned', 'red'),
    ('manta', 'green'),
  )
  for caller, color in callers_and_colors:
    trace = create_bar_chart_AB(consortium, population, sample, svtype, caller, chart_type, color=color)
    fig.add_trace(trace, row=row, col=col)

  update_axes(fig, 'x', 'VNTR length (bps)', showline=False, ticks=None, row=row, col=col)

  y_titles = {
    'TPR': 'Recall, TP/(TP + FN)',
    'FDR': 'False Discovery Rate, FP/(TP + FP)'
  }
  # Both rates are fractions, so the y axis is pinned to [0, 1].
  update_axes(fig, 'y', y_titles[chart_type], range_=[0, 1], row=row, col=col)

def plot_performance_C(consortium, population, sample, svtype, fig, row, col):
  """Draw the count panel: number of regions and number of events per size range.

  The counts are taken from the 'manta' experiments and drawn on a
  logarithmic y axis.

  Args:
    consortium, population, sample, svtype: select the experiments.
    fig: figure the traces are added to.
    row, col: subplot position inside ``fig``.
  """
  counts = get_VNTR_size_ranges__numbers_regions__event_counts__call_counts(
    consortium, population, sample, svtype, caller='manta')
  VNTR_size_ranges, numbers_regions, event_counts, _ = counts

  # One bar series per quantity; the events series is labelled with the SV type.
  for bar_heights, legend_name in ((numbers_regions, 'VNTRs'), (event_counts, svtype)):
    trace = go.Bar(
      x=VNTR_size_ranges,
      y=bar_heights,
      name=legend_name,
      marker=get_marker(),
      showlegend=True
    )
    fig.add_trace(trace, row=row, col=col)

  update_axes(fig, 'x', 'VNTR length (bps)', showline=False, ticks=None, row=row, col=col)
  update_axes(fig, 'y' , 'count', range_=[0, None], row=row, col=col)
  fig.update_yaxes(type='log', row=row, col=col)

def plot_performance(consortium, population, sample, svtype):
  """Show a three-panel figure: TPR, FDR and counts per VNTR size range.

  Args:
    consortium, population, sample, svtype: select the experiments to plot.
  """
  panel_specs = [[{"type": "bar"} for _ in range(3)]]
  fig = make_subplots(
    rows=1,
    cols=3,
    column_widths=[1, 1, 1],
    row_heights=[1],
    specs=panel_specs,
    horizontal_spacing=0.1
  )

  # Columns 1 and 2 hold the two rate panels, column 3 the count panel.
  for column, chart_type in enumerate(('TPR', 'FDR'), start=1):
    plot_performance_AB(consortium, population, sample, svtype, chart_type=chart_type, fig=fig, row=1, col=column)
  plot_performance_C(consortium, population, sample, svtype, fig=fig, row=1, col=3)

  layout = {
    'xaxis': {'type': 'category'},
    'plot_bgcolor': 'white',
    'font': get_font(),
    'width': 1500,
    'height': 600,
    'margin': {'l': 50, 'r': 50, 'b': 50, 't': 50, 'pad': 0},
  }
  fig.update_layout(layout)

  fig.update_xaxes(tickangle=45)

  fig.show()

# Render the three-panel performance figure (TPR, FDR, counts) for DEL calls in CEU/NA12878 from HGSVC2.
plot_performance('HGSVC2', 'CEU', 'NA12878', 'DEL')

CEU:NA12878:DEL:trfermikit.unitigSupport.thinned:
overall TPR = 0.3719399300555441
overall FDR = 0.399734395750332
CEU:NA12878:DEL:manta:
overall TPR = 0.2606459576218885
overall FDR = 0.40207645115620577
CEU:NA12878:DEL:trfermikit.unitigSupport.thinned:
overall TPR = 0.3719399300555441
overall FDR = 0.399734395750332
CEU:NA12878:DEL:manta:
overall TPR = 0.2606459576218885
overall FDR = 0.40207645115620577
