adding in extra verbose support for processing genome alignemtns, abi…

…lity to specify which species to use when extracting columns from genome alignments and producing conservation profiles, and extra support for adjusting treatment of missing sequences.
pjuren · Jul 9, 2015 · ae62dab · ae62dab
1 parent 45d2bfd
commit ae62dab
Show file tree

Hide file tree

Showing 4 changed files with 69 additions and 21 deletions.
diff --git a/src/pyokit/datastruct/genomeAlignment.py b/src/pyokit/datastruct/genomeAlignment.py
@@ -25,6 +25,7 @@
 
 # standard imports
 import unittest
+import sys
 
 # pyokit imports
 from pyokit.datastruct.multipleAlignment import MultipleSequenceAlignment
@@ -87,21 +88,31 @@ def __init__(self, msg):
 #                              HELPER FUNCTIONS                               #
 ###############################################################################
 
-def _build_trees_by_chrom(blocks):
+def _build_trees_by_chrom(blocks, verbose=False):
   """
   Construct set of interval trees from an iterable of genome alignment blocks.
 
   :return: a dictionary indexed by chromosome name where each entry is an
            interval tree for that chromosome.
   """
+  if verbose:
+    sys.stderr.write("separating blocks by chromosome... ")
   by_chrom = {}
   for b in blocks:
     if b.chrom not in by_chrom:
       by_chrom[b.chrom] = []
     by_chrom[b.chrom].append(b)
+  if verbose:
+    sys.stderr.write("done\n")
+
+  if verbose:
+    sys.stderr.write("building interval trees by chromosome... ")
   res = {}
   for c in by_chrom:
     res[c] = IntervalTree(by_chrom[c], openEnded=True)
+  if verbose:
+    sys.stderr.write("done\n")
+
   return res
 
 
@@ -162,13 +173,17 @@ def end(self):
     """:return: the end of the block on the reference genome sequence."""
     return self[self.reference_sequence_name].end
 
-  def get_column_absolute(self, position):
+  def get_column_absolute(self, position,
+                          miss_seqs=MissingSequenceHandler.TREAT_AS_ALL_GAPS,
+                          species=None):
     """
     return a column from the block as dictionary indexed by seq. name.
 
-    :param position: the index to extract from the block; must be absolute
-                     coordinates (i.e. between self.start and self.end, not
-                     inclusive of the end).
+    :param position:  the index to extract from the block; must be absolute
+                      coordinates (i.e. between self.start and self.end, not
+                      inclusive of the end).
+    :param miss_seqs: how to treat sequence with no actual sequence data for
+                      the column.
     :return: dictionary where keys are sequence names and values are
              nucleotides (raw strings).
     """
@@ -182,7 +197,16 @@ def get_column_absolute(self, position):
     assert(len(rel_coord) == 1)
     rel_start, rel_end = rel_coord[0]
     assert(rel_end == rel_start + 1)
-    return self.get_column(rel_start, MissingSequenceHandler.TREAT_AS_ALL_GAPS)
+    raw_col = self.get_column(rel_start, miss_seqs)
+
+    if species is None:
+      return raw_col
+    res = {}
+    for k in raw_col:
+      name_parts = k.split(".")
+      if name_parts[0] in species:
+        res[k] = raw_col[k]
+    return res
 
 
 ###############################################################################
@@ -285,11 +309,11 @@ class GenomeAlignment(object):
   :param blocks: genome alignment blocks to build the alignment from.
   """
 
-  def __init__(self, blocks):
+  def __init__(self, blocks, verbose=False):
     """Constructor; see class docstring for description of parameters."""
     # a genome alignment stores blocks internally using one an interval tree
     # for each chromosome
-    self.block_trees = _build_trees_by_chrom(blocks)
+    self.block_trees = _build_trees_by_chrom(blocks, verbose)
     self.num_blocks = len(blocks)
 
   def get_blocks(self, chrom, start, end):
@@ -303,7 +327,9 @@ def get_blocks(self, chrom, start, end):
       return []
     return self.block_trees[chrom].intersectingInterval(start, end)
 
-  def get_column(self, chrom, position):
+  def get_column(self, chrom, position,
+                 missing_seqs=MissingSequenceHandler.TREAT_AS_ALL_GAPS,
+                 species=None):
     """Get the alignment column at the specified chromosome and position."""
     blocks = self.get_blocks(chrom, position, position + 1)
     if len(blocks) == 0:
@@ -317,7 +343,7 @@ def get_column(self, chrom, position):
                                 " at position " + str(position) + "not " +
                                 "possible; ambiguous alignment of that locus.")
 
-    return blocks[0].get_column_absolute(position)
+    return blocks[0].get_column_absolute(position, missing_seqs, species)
 
 
 ###############################################################################
@@ -346,7 +372,7 @@ def end(self):
 class JustInTimeGenomeAlignment(GenomeAlignment):
 
   """
-  A genome alignment stored on-dsik over many alignment and index files.
+  A genome alignment stored on-disk over many alignment and index files.
 
   Only one file will be loaded at a time, and loading will be done
   just-in-time to satisfy lookups

diff --git a/src/pyokit/datastruct/multipleAlignment.py b/src/pyokit/datastruct/multipleAlignment.py
@@ -155,7 +155,7 @@ def get_column(self, position, missing_seqs=MissingSequenceHandler.SKIP):
                          has index 1, and the final column has
                          index == size(self).
     :param missing_seqs: how to treat sequence with no actual sequence data for
-                          the column.
+                         the column.
     :return: dictionary where keys are sequence names and values are
              nucleotides (raw strings).
     """

diff --git a/src/pyokit/io/genomeAlignment.py b/src/pyokit/io/genomeAlignment.py
@@ -236,7 +236,7 @@ def build_genome_alignment_from_file(ga_path, ref_spec, idx_path=None,
   else:
     for b in genome_alignment_iterator(ga_path, ref_spec, verbose=verbose):
       blocks.append(b)
-  return GenomeAlignment(blocks)
+  return GenomeAlignment(blocks, verbose)
 
 
 ###############################################################################

diff --git a/src/pyokit/scripts/conservationProfile.py b/src/pyokit/scripts/conservationProfile.py
@@ -53,6 +53,7 @@
 from pyokit.datastruct.genomeAlignment import JustInTimeGenomeAlignmentBlock
 from pyokit.datastruct import sequence
 from pyokit.datastruct.genomeAlignment import GenomeAlignment
+from pyokit.datastruct.multipleAlignment import MissingSequenceHandler
 # pyokit imports -- statistics
 from pyokit.statistics.online import RollingMean
 
@@ -173,13 +174,17 @@ def pid(col, ignore_gaps=False):
   return max(hist.values()) / float(total)
 
 
-def conservtion_profile_pid(region, genome_alignment):
+def conservtion_profile_pid(region, genome_alignment,
+                            mi_seqs=MissingSequenceHandler.TREAT_AS_ALL_GAPS,
+                            species=None):
   """
   build a conservation profile for the given region using the genome alignment.
 
   The scores in the profile will be the percent of bases identical to the
   reference sequence.
 
+  :param miss_seqs: how to treat sequence with no actual sequence data for
+                    the column.
   :return: a list of the same length as the region where each entry is the
            PID at the corresponding locus.
   """
@@ -189,7 +194,7 @@ def conservtion_profile_pid(region, genome_alignment):
   step = 1 if region.isPositiveStrand() else -1
   for i in range(s, e, step):
     try:
-      col = genome_alignment.get_column(region.chrom, i)
+      col = genome_alignment.get_column(region.chrom, i, mi_seqs, species)
       res.append(pid(col))
     except NoSuchAlignmentColumnError:
       res.append(None)
@@ -211,7 +216,9 @@ def merge_profile(mean_profile, new_profile):
 #                             MAIN SCRIPT LOGIC                               #
 ###############################################################################
 
-def processBED(fh, genome_alig, window_size, window_centre, verbose=False):
+def processBED(fh, genome_alig, window_size, window_centre,
+               mi_seqs=MissingSequenceHandler.TREAT_AS_ALL_GAPS, species=None,
+               verbose=False):
   """
   Process BED file, produce profile of conservation using whole genome alig.
 
@@ -222,6 +229,8 @@ def processBED(fh, genome_alig, window_size, window_centre, verbose=False):
   :param window_center: which part of each interval to place at the center
                         of the profile. Acceptable values are in the module
                         constant WINDOW_CENTRE_OPTIONS.
+  :param miss_seqs:     how to treat sequence with no actual sequence data for
+                        the column.
   :param verbose:       if True, output progress messages to stderr.
 
   :return:
@@ -234,7 +243,7 @@ def processBED(fh, genome_alig, window_size, window_centre, verbose=False):
                        sortedby=ITERATOR_SORTED_START):
     # figure out which interval to look at...
     transform_locus(e, window_centre, window_size)
-    new_profile = conservtion_profile_pid(e, genome_alig)
+    new_profile = conservtion_profile_pid(e, genome_alig, mi_seqs, species)
     merge_profile(mean_profile, new_profile)
   return [m.mean for m in mean_profile]
 
@@ -290,6 +299,15 @@ def getUI(prog_name, args):
                                   "might be slow if they're large, and " +
                                   "might require a lot of memory)",
                       default=False, required=False))
+  ui.addOption(Option(short="m", long="missing", argName="strategy",
+                      description="how to treat missing sequences in " +
+                                  "blocks. Options are " +
+                                  ", ".join([str(x.name) for x in
+                                             MissingSequenceHandler]),
+                                  required=False, type=str))
+  ui.addOption(Option(short="s", long="species", argName="species",
+                      description="consider only these species. Default is " +
+                                  "all.", required=False, type=str))
   ui.addOption(Option(short="v", long="verbose",
                       description="output additional messages to stderr " +
                                   "about run (default: " +
@@ -359,6 +377,11 @@ def main(args, prog_name):
   index_extensions = (ui.getValue("index-extensions").strip().split(",") if
                       ui.optionIsSet("index-extensions") else None)
   fail_no_index = ui.optionIsSet("fail-no-index")
+  mi_seqs = (MissingSequenceHandler[ui.getValue("missing")]
+             if ui.optionIsSet("missing")
+             else MissingSequenceHandler.TREAT_AS_ALL_GAPS)
+  species = ([x.strip() for x in ui.getValue("species").split(",")] if
+             ui.optionIsSet("species") else None)
 
   # build the genome alignment
   alig = (load_just_in_time_genome_alignment(ga_path, spec, extensions,
@@ -367,12 +390,11 @@ def main(args, prog_name):
           if os.path.isdir(ga_path)
           else build_genome_alignment_from_file(ga_path, spec, index_fn,
                                                 verbose))
-  if verbose:
-    sys.stderr.write("Done\n")
 
   # get the profile and write it to the output stream
-  profile = processBED(open(region_fn), alig, window_size, CENTRE, verbose)
-  out_fh.write("\n\n" + ", ".join(str(x) for x in profile))
+  profile = processBED(open(region_fn), alig, window_size, CENTRE,
+                       mi_seqs, species, verbose)
+  out_fh.write("\n\n" + ", ".join(str(x) for x in profile) + "\n")
 
 
 ###############################################################################