Skip to content

Commit

Permalink
adding in extra verbose support for processing genome alignemtns, abi…
Browse files Browse the repository at this point in the history
…lity to specify which species to use when extracting columns from genome alignments and producing conservation profiles, and extra support for adjusting treatment of missing sequences.
  • Loading branch information
pjuren committed Jul 9, 2015
1 parent 45d2bfd commit ae62dab
Show file tree
Hide file tree
Showing 4 changed files with 69 additions and 21 deletions.
48 changes: 37 additions & 11 deletions src/pyokit/datastruct/genomeAlignment.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@

# standard imports
import unittest
import sys

# pyokit imports
from pyokit.datastruct.multipleAlignment import MultipleSequenceAlignment
Expand Down Expand Up @@ -87,21 +88,31 @@ def __init__(self, msg):
# HELPER FUNCTIONS #
###############################################################################

def _build_trees_by_chrom(blocks):
def _build_trees_by_chrom(blocks, verbose=False):
"""
Construct set of interval trees from an iterable of genome alignment blocks.
:return: a dictionary indexed by chromosome name where each entry is an
interval tree for that chromosome.
"""
if verbose:
sys.stderr.write("separating blocks by chromosome... ")
by_chrom = {}
for b in blocks:
if b.chrom not in by_chrom:
by_chrom[b.chrom] = []
by_chrom[b.chrom].append(b)
if verbose:
sys.stderr.write("done\n")

if verbose:
sys.stderr.write("building interval trees by chromosome... ")
res = {}
for c in by_chrom:
res[c] = IntervalTree(by_chrom[c], openEnded=True)
if verbose:
sys.stderr.write("done\n")

return res


Expand Down Expand Up @@ -162,13 +173,17 @@ def end(self):
""":return: the end of the block on the reference genome sequence."""
return self[self.reference_sequence_name].end

def get_column_absolute(self, position):
def get_column_absolute(self, position,
miss_seqs=MissingSequenceHandler.TREAT_AS_ALL_GAPS,
species=None):
"""
return a column from the block as dictionary indexed by seq. name.
:param position: the index to extract from the block; must be absolute
coordinates (i.e. between self.start and self.end, not
inclusive of the end).
:param position: the index to extract from the block; must be absolute
coordinates (i.e. between self.start and self.end, not
inclusive of the end).
:param miss_seqs: how to treat sequence with no actual sequence data for
the column.
:return: dictionary where keys are sequence names and values are
nucleotides (raw strings).
"""
Expand All @@ -182,7 +197,16 @@ def get_column_absolute(self, position):
assert(len(rel_coord) == 1)
rel_start, rel_end = rel_coord[0]
assert(rel_end == rel_start + 1)
return self.get_column(rel_start, MissingSequenceHandler.TREAT_AS_ALL_GAPS)
raw_col = self.get_column(rel_start, miss_seqs)

if species is None:
return raw_col
res = {}
for k in raw_col:
name_parts = k.split(".")
if name_parts[0] in species:
res[k] = raw_col[k]
return res


###############################################################################
Expand Down Expand Up @@ -285,11 +309,11 @@ class GenomeAlignment(object):
:param blocks: genome alignment blocks to build the alignment from.
"""

def __init__(self, blocks):
def __init__(self, blocks, verbose=False):
"""Constructor; see class docstring for description of parameters."""
# a genome alignment stores blocks internally using one an interval tree
# for each chromosome
self.block_trees = _build_trees_by_chrom(blocks)
self.block_trees = _build_trees_by_chrom(blocks, verbose)
self.num_blocks = len(blocks)

def get_blocks(self, chrom, start, end):
Expand All @@ -303,7 +327,9 @@ def get_blocks(self, chrom, start, end):
return []
return self.block_trees[chrom].intersectingInterval(start, end)

def get_column(self, chrom, position):
def get_column(self, chrom, position,
missing_seqs=MissingSequenceHandler.TREAT_AS_ALL_GAPS,
species=None):
"""Get the alignment column at the specified chromosome and position."""
blocks = self.get_blocks(chrom, position, position + 1)
if len(blocks) == 0:
Expand All @@ -317,7 +343,7 @@ def get_column(self, chrom, position):
" at position " + str(position) + "not " +
"possible; ambiguous alignment of that locus.")

return blocks[0].get_column_absolute(position)
return blocks[0].get_column_absolute(position, missing_seqs, species)


###############################################################################
Expand Down Expand Up @@ -346,7 +372,7 @@ def end(self):
class JustInTimeGenomeAlignment(GenomeAlignment):

"""
A genome alignment stored on-dsik over many alignment and index files.
A genome alignment stored on-disk over many alignment and index files.
Only one file will be loaded at a time, and loading will be done
just-in-time to satisfy lookups
Expand Down
2 changes: 1 addition & 1 deletion src/pyokit/datastruct/multipleAlignment.py
Original file line number Diff line number Diff line change
Expand Up @@ -155,7 +155,7 @@ def get_column(self, position, missing_seqs=MissingSequenceHandler.SKIP):
has index 1, and the final column has
index == size(self).
:param missing_seqs: how to treat sequence with no actual sequence data for
the column.
the column.
:return: dictionary where keys are sequence names and values are
nucleotides (raw strings).
"""
Expand Down
2 changes: 1 addition & 1 deletion src/pyokit/io/genomeAlignment.py
Original file line number Diff line number Diff line change
Expand Up @@ -236,7 +236,7 @@ def build_genome_alignment_from_file(ga_path, ref_spec, idx_path=None,
else:
for b in genome_alignment_iterator(ga_path, ref_spec, verbose=verbose):
blocks.append(b)
return GenomeAlignment(blocks)
return GenomeAlignment(blocks, verbose)


###############################################################################
Expand Down
38 changes: 30 additions & 8 deletions src/pyokit/scripts/conservationProfile.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,7 @@
from pyokit.datastruct.genomeAlignment import JustInTimeGenomeAlignmentBlock
from pyokit.datastruct import sequence
from pyokit.datastruct.genomeAlignment import GenomeAlignment
from pyokit.datastruct.multipleAlignment import MissingSequenceHandler
# pyokit imports -- statistics
from pyokit.statistics.online import RollingMean

Expand Down Expand Up @@ -173,13 +174,17 @@ def pid(col, ignore_gaps=False):
return max(hist.values()) / float(total)


def conservtion_profile_pid(region, genome_alignment):
def conservtion_profile_pid(region, genome_alignment,
mi_seqs=MissingSequenceHandler.TREAT_AS_ALL_GAPS,
species=None):
"""
build a conservation profile for the given region using the genome alignment.
The scores in the profile will be the percent of bases identical to the
reference sequence.
:param miss_seqs: how to treat sequence with no actual sequence data for
the column.
:return: a list of the same length as the region where each entry is the
PID at the corresponding locus.
"""
Expand All @@ -189,7 +194,7 @@ def conservtion_profile_pid(region, genome_alignment):
step = 1 if region.isPositiveStrand() else -1
for i in range(s, e, step):
try:
col = genome_alignment.get_column(region.chrom, i)
col = genome_alignment.get_column(region.chrom, i, mi_seqs, species)
res.append(pid(col))
except NoSuchAlignmentColumnError:
res.append(None)
Expand All @@ -211,7 +216,9 @@ def merge_profile(mean_profile, new_profile):
# MAIN SCRIPT LOGIC #
###############################################################################

def processBED(fh, genome_alig, window_size, window_centre, verbose=False):
def processBED(fh, genome_alig, window_size, window_centre,
mi_seqs=MissingSequenceHandler.TREAT_AS_ALL_GAPS, species=None,
verbose=False):
"""
Process BED file, produce profile of conservation using whole genome alig.
Expand All @@ -222,6 +229,8 @@ def processBED(fh, genome_alig, window_size, window_centre, verbose=False):
:param window_center: which part of each interval to place at the center
of the profile. Acceptable values are in the module
constant WINDOW_CENTRE_OPTIONS.
:param miss_seqs: how to treat sequence with no actual sequence data for
the column.
:param verbose: if True, output progress messages to stderr.
:return:
Expand All @@ -234,7 +243,7 @@ def processBED(fh, genome_alig, window_size, window_centre, verbose=False):
sortedby=ITERATOR_SORTED_START):
# figure out which interval to look at...
transform_locus(e, window_centre, window_size)
new_profile = conservtion_profile_pid(e, genome_alig)
new_profile = conservtion_profile_pid(e, genome_alig, mi_seqs, species)
merge_profile(mean_profile, new_profile)
return [m.mean for m in mean_profile]

Expand Down Expand Up @@ -290,6 +299,15 @@ def getUI(prog_name, args):
"might be slow if they're large, and " +
"might require a lot of memory)",
default=False, required=False))
ui.addOption(Option(short="m", long="missing", argName="strategy",
description="how to treat missing sequences in " +
"blocks. Options are " +
", ".join([str(x.name) for x in
MissingSequenceHandler]),
required=False, type=str))
ui.addOption(Option(short="s", long="species", argName="species",
description="consider only these species. Default is " +
"all.", required=False, type=str))
ui.addOption(Option(short="v", long="verbose",
description="output additional messages to stderr " +
"about run (default: " +
Expand Down Expand Up @@ -359,6 +377,11 @@ def main(args, prog_name):
index_extensions = (ui.getValue("index-extensions").strip().split(",") if
ui.optionIsSet("index-extensions") else None)
fail_no_index = ui.optionIsSet("fail-no-index")
mi_seqs = (MissingSequenceHandler[ui.getValue("missing")]
if ui.optionIsSet("missing")
else MissingSequenceHandler.TREAT_AS_ALL_GAPS)
species = ([x.strip() for x in ui.getValue("species").split(",")] if
ui.optionIsSet("species") else None)

# build the genome alignment
alig = (load_just_in_time_genome_alignment(ga_path, spec, extensions,
Expand All @@ -367,12 +390,11 @@ def main(args, prog_name):
if os.path.isdir(ga_path)
else build_genome_alignment_from_file(ga_path, spec, index_fn,
verbose))
if verbose:
sys.stderr.write("Done\n")

# get the profile and write it to the output stream
profile = processBED(open(region_fn), alig, window_size, CENTRE, verbose)
out_fh.write("\n\n" + ", ".join(str(x) for x in profile))
profile = processBED(open(region_fn), alig, window_size, CENTRE,
mi_seqs, species, verbose)
out_fh.write("\n\n" + ", ".join(str(x) for x in profile) + "\n")


###############################################################################
Expand Down

0 comments on commit ae62dab

Please sign in to comment.