# About
This notebook was used to produce Table S2.

# Prereqs

In [24]:
import collections
import configparser
import csv
import gzip
import logging
import os
import pathlib
import sys
import time
import xml.etree.ElementTree
import zlib

In [2]:
from html_table import Table, Rows, Row, Cell

In [3]:
# Root directory of the analysis; the metadata files below and the `runs` directory live under it.
MAIN_DIR = pathlib.Path('/nfs/brubeck.bx.psu.edu/scratch5/nick/overlaps/auto/ecoli')
# Gzipped CSV of per-run metadata, parsed by read_runinfo().
CSV_PATH = MAIN_DIR/'sra.runinfo3.csv.gz'
# Gzipped XML of experiment summaries, parsed by read_xml().
XML_PATH = MAIN_DIR/'sra.docsum2.xml.gz'

## Reading metadata

In [4]:
def read_runinfo(csv_path):
  """Read a gzipped run-info CSV into a dict of `RunInfo` namedtuples keyed by the `Run` column.

  The first row is taken as the header and defines the namedtuple fields. Empty rows are
  skipped, and so are rows that repeat the header (more than 5 cells equal to their own column
  name). Columns listed in `int_fields` are converted to `int`; `InsertDev` falls back to
  `float`. Any other unparseable integer cell is reported on stderr and stored as `None`.
  A truncated gzip stream is reported on stderr and whatever was read so far is returned.

  Args:
    csv_path: Path to the gzipped CSV file.

  Returns:
    dict mapping each row's `Run` value to its `RunInfo` record.

  Raises:
    AssertionError: If a data row has a different number of cells than the header.
    ValueError: If an `InsertDev` cell is neither an int nor a float.
  """
  start = time.perf_counter()
  int_fields = {'spots', 'spots_with_mates', 'avgLength', 'size_MB', 'InsertSize', 'InsertDev', 'ProjectID', 'TaxID'}
  runs = {}
  header = []
  RunInfo = None
  empty_lines = 0
  header_lines = 0
  try:
    # newline='' lets the csv module handle line endings itself, as its documentation recommends.
    with gzip.open(csv_path, 'rt', newline='') as csv_file:
      for row in csv.reader(csv_file):
        if not header:
          header_lines += 1
          header = list(row)
          RunInfo = collections.namedtuple('RunInfo', header)
          continue
        if len(row) == 0:
          empty_lines += 1
          continue
        assert len(row) == len(header), (len(row), row)
        header_matches = 0
        value_dict = {}
        for name, raw_value in zip(header, row):
          # Every branch assigns `value`, so a cell can never inherit the previous column's value.
          if raw_value == name:
            header_matches += 1
            value = raw_value
          elif name in int_fields:
            try:
              value = int(raw_value)
            except ValueError:
              if name == 'InsertDev':
                value = float(raw_value)
              else:
                print(f'{name}: {raw_value!r}', file=sys.stderr)
                value = None
          else:
            value = raw_value
          value_dict[name] = value
        if header_matches > 5:
          # This row is a repeat of the header, not data.
          header_lines += 1
          continue
        runs[value_dict['Run']] = RunInfo(**value_dict)
  except EOFError:
    print('Incomplete gzip.', file=sys.stderr)
  elapsed = time.perf_counter() - start
  print(f'{len(runs)} runs in {elapsed:0.1f} seconds (headers: {header_lines}, empty lines: {empty_lines})')
  return runs

In [5]:
def read_xml(xml_path):
  """Parse a gzipped XML file and index its top-level elements by run accession.

  Each child of the document root is expected to contain a `Runs` element; every child of
  `Runs` contributes its `acc` attribute as a key that maps to the enclosing top-level element.

  Returns:
    dict mapping run accession to the top-level element that lists it.
  """
  began = time.perf_counter()
  with gzip.open(xml_path) as xml_file:
    # Note: This takes about 1-2GB of memory.
    root = xml.etree.ElementTree.parse(xml_file).getroot()
  summaries = {
    run.attrib['acc']: exp
    for exp in root
    for run in exp.find('./Runs')
  }
  elapsed = time.perf_counter() - began
  print(f'{len(summaries)} experiments in {round(elapsed)} seconds')
  return summaries

In [6]:
def get_xml_experiment(accession, summaries):
  """Look up `accession` in `summaries`, warning on stderr and returning None if it is absent."""
  try:
    experiment = summaries[accession]
  except KeyError:
    print(f'Warning: Could not find XML summary for {accession}', file=sys.stderr)
    experiment = None
  return experiment

## Read in the metadata

In [7]:
# Per-run metadata keyed by run accession; used as the default `runs` argument further down.
RUNS = read_runinfo(CSV_PATH)

186022 runs in 11.5 seconds (headers: 1, empty lines: 0)


In [8]:
# XML experiment elements keyed by run accession; used as the default `summaries` lookup further down.
SUMMARIES = read_xml(XML_PATH)

186022 experiments in 36 seconds


## Study objects

In [9]:
# Substring -> shorter replacement, applied to the center/lab/contact strings by
# Study.format_fields() to keep them compact.
ABBREVIATIONS = {
  'European':'Euro.', 'Nucleotide':'Nuc.', 'Institute':'Inst.', 'Technology':'Tech.', 'The Pennsylvania':'Penn',
  'University':'Univ.', 'National':'Nat.', 'Department':'Dept.', 'Technological':'Tech.', 'Laboratory':'Lab.',
  'Biotechnology':'Biotech.',
}
class Study(collections.namedtuple('Study', ('title', 'study', 'center', 'lab', 'contact'))):
  """Immutable record of the study and submitter metadata of one experiment.

  Fields are `title`, `study`, `center`, `lab`, and `contact`. Any of them may be None when
  the corresponding piece of XML is absent.
  """
  __slots__ = ()
  @property
  def submitter(self):
    """The `(center, lab, contact)` triple."""
    return (self.center, self.lab, self.contact)
  @classmethod
  def from_accession(cls, accession, summaries=SUMMARIES):
    """Build a Study for a run accession.

    Raises:
      KeyError: If `accession` is not in `summaries` (a warning is written to stderr first).
    """
    try:
      experiment = summaries[accession]
    except KeyError:
      sys.stderr.write(f'Warning: Could not find XML summary for {accession}\n')
      raise
    return cls.from_experiment(experiment)
  @classmethod
  def from_experiment(cls, experiment):
    """Build a Study from an experiment XML element.

    Returns a Study with every field None when `experiment` is None or has no children.
    Sub-elements that are missing leave their fields as None instead of raising.
    """
    data = {'title':None, 'study':None, 'center':None, 'lab':None, 'contact':None}
    # Check None and child count explicitly: truth-testing an Element is deprecated and only
    # reflects whether it has children.
    if experiment is None or len(experiment) == 0:
      return cls(**data)
    title_elem = experiment.find('./ExpXml/Summary/Title')
    if title_elem is not None:
      data['title'] = title_elem.text
    study_elem = experiment.find('./ExpXml/Study')
    if study_elem is not None:
      data['study'] = study_elem.attrib.get('name')
    subm_elem = experiment.find('./ExpXml/Submitter')
    if subm_elem is not None:
      for field in 'center', 'lab', 'contact':
        data[field] = subm_elem.attrib.get(field+'_name')
    return cls(**data)
  def format_fields(self, max_len=None, null='?'):
    """Return a copy with the fields formatted for human reading.

    Args:
      max_len: If not None, every field is passed through `truncate(value, max_len)`.
      null: Replacement for fields that are None.
    """
    strs = {}
    for field in self._fields:
      raw_value = getattr(self, field)
      strs[field] = null if raw_value is None else raw_value
    # This is a common prefix that's lengthy and not too informative.
    strs['title'] = rm_prefix(strs['title'], 'Illumina MiSeq paired end sequencing; ')
    for field in 'center', 'lab', 'contact':
      value = strs[field]
      # If it's all uppercase, make it titlecased to be easier to read.
      if len(value) > 16 and value == value.upper():
        value = value.title()
      for long, short in ABBREVIATIONS.items():
        value = value.replace(long, short)
      strs[field] = value
    if max_len is not None:
      # NOTE(review): `truncate` is not defined in the visible part of this file; confirm it
      # exists before passing `max_len`.
      for field, value in strs.items():
        strs[field] = truncate(value, max_len)
    return type(self)(**strs)
  def get_color(self, fields=None):
    """Return a CSS hex color derived from the values of `fields` (default: all fields)."""
    if fields is None:
      fields = self._fields
    study_str = ''.join(f'{field!r}: {getattr(self, field)!r}\n' for field in fields)
    return get_color_from_str(study_str)

In [10]:
def get_color_from_str(string):
  """Map a string to a `#rrggbb` color using the last six hex digits of its CRC-32."""
  checksum = zlib.crc32(bytes(string, 'utf8'))
  hex_digits = format(checksum, '06x')
  return f'#{hex_digits[-6:]}'

## Reading `progress.ini`

In [11]:
# Keys of progress.ini whose values are converted to int when the file is read.
PROGRESS_TYPES = dict.fromkeys(
  ('step', 'when', 'timestamp', 'start_step', 'start_time', 'end_step', 'end_time', 'commit_time'),
  int,
)
def read_progress(progress_path):
  """Read a progress.ini file and return it in the new (`runN`-sectioned) structure."""
  return convert_progress(read_config(progress_path, PROGRESS_TYPES))

In [12]:
def read_config(config_path, types):
  """Read an INI file into a plain dict of dicts, converting selected values.

  Args:
    config_path: Path of the INI file. `ConfigParser.read()` skips files it cannot open, so a
      missing file yields an empty dict.
    types: Mapping from key name to a callable applied to that key's raw string value, or a
      falsy value to keep every value as a string.

  Returns:
    dict mapping section name to a dict of its keys and (possibly converted) values. Sections
    without any keys are omitted.

  Raises:
    configparser.Error: If the file is not valid INI. The problem is logged first; this
      relies on `logging` being imported at module level.
  """
  data = {}
  config = configparser.ConfigParser(interpolation=None)
  try:
    config.read(config_path)
    for section in config.sections():
      for key, raw_value in config.items(section):
        if types and key in types:
          value = types[key](raw_value)
        else:
          value = raw_value
        data.setdefault(section, {})[key] = value
  except configparser.Error:
    logging.critical('Error: Invalid config file format in %r.', config_path)
    raise
  return data

In [13]:
def convert_progress(progress):
  """Convert old progress structure to the new one, if necessary.

  A structure with any section whose name starts with 'run' is already in the new format and
  is returned untouched. Otherwise all sections are flattened into a single 'run0' section,
  with each key renamed through a fixed mapping or, failing that, to '<section>_<key>'.
  """
  if any(name.startswith('run') for name in progress):
    # It's the new format.
    return progress
  renames = {
    ('start', 'step'): 'start_step',
    ('start', 'when'): 'start_time',
    ('end', 'step'): 'end_step',
    ('end', 'when'): 'end_time',
    ('version', 'timestamp'): 'commit_time',
    ('version', 'commit'): 'commit',
  }
  flattened = {
    renames.get((section_name, key), f'{section_name}_{key}'): value
    for section_name, section in progress.items()
    for key, value in section.items()
  }
  return {'run0':flattened} if flattened else {}

In [14]:
def get_last_step(progress):
  """Return the 'end_step' of the highest-numbered run section, or None if there is none."""
  latest = get_last_section(progress)
  return None if latest is None else latest.get('end_step')

In [15]:
def get_last_section(progress):
  """Return the section named 'run<N>' with the largest N, or None if no such section exists."""
  numbered = [
    (int(name[3:]), section)
    for name, section in progress.items()
    if name.startswith('run')
  ]
  if not numbered:
    return None
  # max() keeps the first entry among equal run numbers.
  return max(numbered, key=lambda pair: pair[0])[1]

## Misc

In [16]:
def rm_prefix(string, prefix):
  """Return `string` without a leading `prefix`; strings that don't start with it are unchanged."""
  return string[len(prefix):] if string.startswith(prefix) else string

## Reading data

In [17]:
def get_samples(runs_dir=MAIN_DIR/'runs'):
  """Return the sorted names of the subdirectories of `runs_dir`."""
  return sorted(entry.name for entry in runs_dir.iterdir() if entry.is_dir())

In [22]:
def get_passing_samples(model=None, min_overlap=0, bin_num=5, runs=RUNS):
  """Yield the run record of every sample for which get_rate_from_sample() returns a rate.

  If `model` is given, only runs whose `Model` (minus a leading 'Illumina ') equals it are
  considered.
  """
  for sample in get_samples(MAIN_DIR/'runs'):
    run = runs[sample]
    short_model = rm_prefix(run.Model, 'Illumina ')
    model_ok = model is None or short_model == model
    if model_ok and get_rate_from_sample(sample, bin_num, min_overlap=min_overlap) is not None:
      yield run

In [26]:
def read_summary(summary_path):
  """Parse a whitespace-delimited summary file into a dict.

  Lines 1 and 2 each hold min/avg/med/max values and are stored as dicts under 'rlen' and
  'overlap' respectively. Line 3 holds eight scalar statistics stored under their own names.
  A '.' cell becomes None. Later lines are ignored.

  Args:
    summary_path: A path object providing `.open()`.
  """
  float_names = ('avg', 'error_rate', 'paired_read_frac', 'overlap_rate')
  range_stats = ('min', 'avg', 'med', 'max')
  range_vars = ('rlen', 'overlap')
  scalar_names = (
    'errors', 'overlap_bp', 'pairs', 'reads', 'pair_bases', 'error_rate', 'paired_read_frac', 'overlap_rate'
  )

  def caster(name):
    # Everything is an int unless it is explicitly listed as a float.
    return float if name in float_names else int

  result = {}
  with summary_path.open() as summary_file:
    for line_index, line in enumerate(summary_file):
      tokens = line.split()
      if line_index < 2:
        assert len(tokens) == len(range_stats), line
        parsed = {}
        for position, stat_name in enumerate(range_stats):
          token = tokens[position]
          if token == '.':
            parsed[stat_name] = None
            continue
          try:
            parsed[stat_name] = caster(stat_name)(token)
          except ValueError:
            # Nominally-integer stats may still be written with a fractional part.
            parsed[stat_name] = float(token)
        result[range_vars[line_index]] = parsed
      elif line_index == 2:
        for position, name in enumerate(scalar_names):
          token = tokens[position]
          result[name] = None if token == '.' else caster(name)(token)
  return result

In [28]:
def read_analysis(analysis_path):
  """Parse a whitespace-delimited analysis file into `{stat: {'total': ..., 'bins': [...]}}`.

  Each line is `<sample> <stat> <total> <bin values...>`, where the stat is 'overlaps',
  'errors' (ints) or 'rates' (floats). A '.' value becomes None. Every line must name the
  same sample.

  Args:
    analysis_path: A path object providing `.open()`.
  """
  casters = {'overlaps':int, 'errors':int, 'rates':float}

  def parse(stat, token):
    return None if token == '.' else casters[stat](token)

  analysis = {}
  previous_sample = None
  with analysis_path.open() as analysis_file:
    for line in analysis_file:
      fields = line.split()
      sample, stat_name = fields[0], fields[1]
      total = parse(stat_name, fields[2])
      bins = [parse(stat_name, token) for token in fields[3:]]
      analysis[stat_name] = {'total':total, 'bins':bins}
      if previous_sample is not None:
        assert sample == previous_sample, (sample, previous_sample)
      previous_sample = sample
  return analysis

In [19]:
def get_rate_from_sample(
  sample, bin_num, min_overlap=100000, min_avg_overlap=0, min_errors=0, main_dir=MAIN_DIR, runs=RUNS
):
  """Return the 'rates' value of one bin for a sample, or None if the sample doesn't qualify.

  A sample qualifies only if all of the following hold:
    - its progress.ini reports a last 'end_step' of at least 4,
    - its run record has a positive `size_MB`,
    - errors.summary.tsv and analysis.tsv both exist and are non-empty,
    - the average overlap from the summary is at least `min_avg_overlap`,
    - the 'overlaps' and 'errors' values in bin `bin_num` reach `min_overlap` and `min_errors`.
  A value recorded as missing (None) counts as failing its threshold.

  Args:
    sample: Name of the sample directory under `main_dir/'runs'`, also the key into `runs`.
    bin_num: Index of the bin to check and return.
    main_dir: Directory containing the `runs` directory.
    runs: Mapping from sample name to run record.

  Raises:
    AssertionError: If the analysis does not contain exactly 10 'rates' bins.
  """
  # Build the path from the `main_dir` argument so callers can point at another directory.
  sample_dir = main_dir/'runs'/sample
  end = get_last_step(read_progress(sample_dir/'progress.ini'))
  if end is None or end < 4:
    return None
  run = runs[sample]
  if run.size_MB <= 0:
    return None
  summary_path = sample_dir/'errors.summary.tsv'
  if not (summary_path.is_file() and os.path.getsize(summary_path) > 0):
    return None
  summary = read_summary(summary_path)
  analysis_path = sample_dir/'analysis.tsv'
  if not (analysis_path.is_file() and os.path.getsize(analysis_path) > 0):
    return None
  analysis = read_analysis(analysis_path)
  assert len(analysis['rates']['bins']) == 10, analysis
  # '.' placeholders are parsed as None, which cannot be compared with a number.
  overlap = summary['overlap']['avg']
  if overlap is None or overlap < min_avg_overlap:
    return None
  bin_overlaps = analysis['overlaps']['bins'][bin_num]
  if bin_overlaps is None or bin_overlaps < min_overlap:
    return None
  bin_errors = analysis['errors']['bins'][bin_num]
  if bin_errors is None or bin_errors < min_errors:
    return None
  return analysis['rates']['bins'][bin_num]

## Display groups

In [20]:
# Study fields that define a submitter group; all other Study fields are blanked out when grouping.
group_fields = ('center', 'lab', 'contact')
def show_model_groups(model, min_overlap=0, bin_num=5):
  """Render a table counting the passing samples of one sequencer model per submitter group.

  Samples are grouped by the `group_fields` of their Study. The table has a 'Total' row first,
  then one row per group sorted by descending count. Each group row shows one sample from the
  group (the last one seen), the count, and the formatted group fields.
  """
  last_sample = {}
  tallies = collections.Counter()
  passing = get_passing_samples(model, min_overlap=min_overlap, bin_num=bin_num)
  for run in passing:
    accession = run.Run
    experiment = get_xml_experiment(accession, SUMMARIES)
    if experiment is None:
      # No metadata: the number of samples counted so far serves as an integer group key.
      key = sum(tallies.values())
    else:
      study = Study.from_experiment(experiment)
      key = Study(**{
        name: (getattr(study, name) if name in group_fields else None)
        for name in Study._fields
      })
    tallies[key] += 1
    last_sample[key] = accession
  body = []
  for key, count in tallies.items():
    if isinstance(key, int):
      color = 'inherit'
      details = [None for _ in group_fields]
    else:
      color = key.get_color()
      pretty = key.format_fields()
      details = [getattr(pretty, name) for name in group_fields]
    first_cell = {'value':last_sample[key], 'css':{'background-color':color}}
    body.append([first_cell, count] + details)
  body.sort(reverse=True, key=lambda entry: entry[1])
  rows = [['Total', sum(tallies.values())]] + body
  titles = [label.title() for label in ['sample', 'count'] + list(group_fields)]
  table = Table(rows, header=titles)
  table.add_border('rows', 1, 'body')
  return table.render()

In [30]:
# Submitter groups of 'MiSeq' samples whose bin-5 `overlaps` value is at least 2,500,000.
show_model_groups('MiSeq', min_overlap=2500000, bin_num=5)

Sample,Count,Center,Lab,Contact
Total,212,,,
ERR2868175,2,,Euro. Nuc. Archive,Euro. Nuc. Archive
ERR2686034,2,"Inst. of Tech., Univ. of Tartu",Euro. Nuc. Archive,Euro. Nuc. Archive
DRR021342,1,RIKEN_QBC,Lab. for multiscale biosystem dynamics,Lab. for multiscale biosystem dynamics
DRR036001,1,WASEDA,Haruko Takeyama lab,Haruko Takeyama lab
DRR058068,1,NIG,"Microbial Genetics Lab., Genetics Strains Re","Microbial Genetics Lab., Genetics Strains Re"
DRR065947,1,NIID,"Lab. of Bacterial Genomics, Pathogen Genomic","Lab. of Bacterial Genomics, Pathogen Genomic"
DRR066639,1,TSUKUBA,"Environmental Molecular Microbiology, Faculty of l","Environmental Molecular Microbiology, Faculty of l"
DRR075635,1,NCGM,"Dept. of Infectious Diseases, Nat. Center","Dept. of Infectious Diseases, Nat. Center"
DRR075951,1,RIKEN_QBC,Lab. for Multiscale Biosystem Dynamics,Lab. for Multiscale Biosystem Dynamics


In [31]:
# Submitter groups of 'MiniSeq' samples whose bin-5 `overlaps` value is at least 2,500,000.
show_model_groups('MiniSeq', min_overlap=2500000, bin_num=5)

Sample,Count,Center,Lab,Contact
Total,40,,,
ERR2777515,27,RIVM - Nat. Inst. for Public Health and th,Euro. Nuc. Archive,Euro. Nuc. Archive
SRR10097238,6,USDA,ARS,Aixia Xu
SRR9886731,3,Pulsenet,,pulsenet service
SRR7403873,1,Robert Koch Institut,Enteropathogenic Bacteria and Legionella,Christina Lang
SRR8573904,1,universite paris Diderot site bichat,Microbiology,Andre birgy
SRR8695865,1,Belarusian State Univ.,The Faculty of Biology,Alexander Lagonenko
SRR9691129,1,Skolkovo Inst. of Science and Tech.,Life Sciences,Aleksandra Vasileva


In [32]:
# Submitter groups of 'NextSeq 500' samples whose bin-5 `overlaps` value is at least 2,500,000.
show_model_groups('NextSeq 500', min_overlap=2500000, bin_num=5)

Sample,Count,Center,Lab,Contact
Total,160,,,
DRR051046,4,CALGARY,"Johann D.D. Pitout, Microbiology, Calgary Laborato","Johann D.D. Pitout, Microbiology, Calgary Laborato"
DRR065639,4,KYOTO_GM,"Clinical Lab. Medicine, Kyoto Univ. Gra","Clinical Lab. Medicine, Kyoto Univ. Gra"
DRR129837,4,NIID,"Lab. of Bacterial Genomics, Pathogen Genomic","Lab. of Bacterial Genomics, Pathogen Genomic"
ERR3063452,4,Instituto de Salud Carlos III,Euro. Nuc. Archive,Euro. Nuc. Archive
ERR1837604,4,Saolta,Euro. Nuc. Archive,Euro. Nuc. Archive
SRR10065352,4,Walailak Univ.,Akkhrararatchakumari Veterinary College,Thotsapol Thomrongsuwannakij
SRR10094626,4,Washington Univ. in St. Louis School of Medic,Computational and Systems Biology,Alaric W D'Souza
SRR2970288,4,FDA,DMB,SOLOMON GEBRU
SRR3989534,4,CFSAN,,


In [33]:
# Submitter groups of 'NextSeq 550' samples whose bin-5 `overlaps` value is at least 2,500,000.
show_model_groups('NextSeq 550', min_overlap=2500000, bin_num=5)

Sample,Count,Center,Lab,Contact
Total,171,,,
ERR1841152,80,Warwick Univ.,Euro. Nuc. Archive,Euro. Nuc. Archive
SRR7866058,35,Kenyon College,Biology,Jeremy Philippe Moore
SRR5121873,33,Kenyon College,Biology,Preston Basting
SRR4164003,12,Univ. of Georgia,Environmental Health Science,Adelumola Oladeinde
SRR8835636,7,Yonsei Univ.,Chemistry,Jeewon Lee
SRR6801478,2,Arbor Biotechnologies,Research,David Cheng
SRR8186051,1,Univ. of Sao Paulo,Microbiology,Miriam R Fernandes
SRR8441387,1,Cleveland Clinic,Translational Hematology and Oncology Research,Jacob G Scott


In [34]:
# Submitter groups of 'HiSeq 2500' samples whose bin-5 `overlaps` value is at least 2,500,000.
show_model_groups('HiSeq 2500', min_overlap=2500000, bin_num=5)

Sample,Count,Center,Lab,Contact
Total,141,,,
SRR1217287,4,Penn State Univ.,Mwangi Lab,Juan Antonio Raygoza Garay
DRR061476,2,Uni_North_Carolina,"Medicine / Division of Infectious Diseases, Unive","Medicine / Division of Infectious Diseases, Unive"
ERR1351685,2,Weizmann Institue Of Science,Euro. Nuc. Archive,Euro. Nuc. Archive
ERR1370918,2,VETAGROSUP,Euro. Nuc. Archive,Euro. Nuc. Archive
ERR1949075,2,Embl Euro. Bioinformatics Inst.,Euro. Nuc. Archive,Euro. Nuc. Archive
ERR1957976,2,Univ. of York,Euro. Nuc. Archive,Euro. Nuc. Archive
ERR2039643,2,Univ. Of Manchester,Euro. Nuc. Archive,Euro. Nuc. Archive
ERR2135235,2,The Roslin Inst.,Euro. Nuc. Archive,Euro. Nuc. Archive
ERR2226598,2,Univ. Of North Carolina At Chapel Hill,Euro. Nuc. Archive,Euro. Nuc. Archive


In [29]:
# Submitter groups of 'NovaSeq 6000' samples whose bin-5 `overlaps` value is at least 2,500,000.
show_model_groups('NovaSeq 6000', min_overlap=2500000, bin_num=5)

Sample,Count,Center,Lab,Contact
Total,239,,,
SRR11810195,40,NEIKER - Basque Inst. for Agricultural Researc,Animal Health Dept.,Medelin Ocejo
ERR4221187,34,IMBIM(Dept. of Medical Biochemistry and Micro,Euro. Nuc. Archive,Euro. Nuc. Archive
SRR11075478,30,Univ. of Illinois at Urbana-Champaign,Civil and Environmental Engineering,Yue Xing
SRR12020688,30,Univ. of Tech. Sydney,ithree institute,Veronica Jarocki
SRR7664908,23,GEO,,"Gene Expression Omnibus (GEO), NCBI, NLM, NIH, htt"
SRR8610355,18,Univ. of Bern,Inst. for Infectious Diseases,Mathieu Clement
SRR9648314,13,JGI,,JGI SRA
SRR11018568,12,Colorado State Univ.,Dept. of Biology,Daniel B Sloan
SRR11286234,10,Boston Univ.,Biomedical Engineering,Carly Ching


In [35]:
# Submitter groups of 'HiSeq X Ten' samples whose bin-5 `overlaps` value is at least 2,500,000.
show_model_groups('HiSeq X Ten', min_overlap=2500000, bin_num=5)

Sample,Count,Center,Lab,Contact
Total,163,,,
SRR7879965,40,BI,,bi service
SRR7716577,39,"West China Hospital, Sichuan Univ.",Center of Infectious Diseases,Zhiyong Zong
SRR6069470,29,Peking Univ. People's Hospital,Dept. of Clinical Lab.,Ruobing Wang
SRR7280090,12,Beijing university of agriculture,Food science and engineering,xiaoxia Li
SRR7537408,12,GEO,,"Gene Expression Omnibus (GEO), NCBI, NLM, NIH, htt"
SRR9855382,10,South China Agricultural Univ.,College of Veterinary Medicine,Xiufeng Zhang
SRR7641178,7,College of Animal Science South China Agricultural,"No.483 Wushan, Tianhe District, Guangzhou City",yiwen yang
SRR7474260,6,Jiangnang Univ.,Nat. Engineering Lab. for Cereal Ferment,Kangjia Zhu
SRR8335003,5,College of Biotech. and Pharmaceutical Engine,State Key Lab. of Materials-Oriented Chemica,Yong Chen
