Skip to content

Commit

Permalink
Find key col signatures
Browse files Browse the repository at this point in the history
  • Loading branch information
mooreryan committed Jul 25, 2018
1 parent 3ebd31c commit a59224e
Show file tree
Hide file tree
Showing 7 changed files with 311 additions and 26 deletions.
4 changes: 3 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -29,4 +29,6 @@ todo.txt
.rspec_status
*.lock

.idea
.idea

exe/key_cols2
206 changes: 206 additions & 0 deletions exe/key_cols
Original file line number Diff line number Diff line change
@@ -0,0 +1,206 @@
#!/usr/bin/env ruby

Signal.trap("PIPE", "EXIT")

require "tree_clusters"
require "trollop"
require "parse_fasta"
require "fileutils"

TreeClusters.extend TreeClusters

GREETING = "The '#{__FILE__}' program"
UNDERLINE = "=" * GREETING.length

opts = Trollop.options do
version TreeClusters::VERSION

# banner <<-EOS


# Checking IDs
# ------------

# IDs for the sequences must match between the three input files.

# The tree file is allowed to have quoted taxa names, but the
# mapping file and alignment file are not.

# If your alignment file has spaces in the name, the ID part of the
# header (i.e., the part up until the space) must match with the
# sequence IDs in the tree and the mapping file.

# Example: This would be okay.

# tree file:
# ('genome_A', 'genome_B');

# aln file:
# >genome_A apple pie
# AAAAA
# >genome_B brown sugar
# AATTA

# Options:
# EOS

banner <<-EOS
#{GREETING}
#{UNDERLINE}
Hi. My name is #{__FILE__}. If you give me a Newick tree file and
an alignment file (fasta format), I will tell you key columns for
all clades/clusters that have them.
Overview
--------
A clade has key columns if you can use the residue/nucleotide at
those columns to tell sequences in the clade from sequences outside
of the clade.
Here's an example....
After you run me (#{__FILE__} is my name), you'll get an output file
with the extension, '*.tree_clusters.key_cols.txt'. It may look
something like this:
cluster_A 4 1-A 2-A 3-A 5-G
cluster_B 4 1-C 2-C 3-C 5-A
This file has the clade name, the number of key columns for that
clade, and then the rest of the columns tell you the position
(1-based) and the nucleotide or residue in that column in all
sequences of that clade.
In this case we have only two clades. The key columns for both are
1, 2, 3, and 5. So you can use columns 1, 2, 3, and 5 to classify a
sequence as belonging to one of these clades. If it has A, A, A,
and G in those positions, it'll be in cluster_A, and if it has C, C,
C, and A in those positions, it'll be in cluster_B. If it has any
other combination in those 4 columns of the alignment, it won't be
in either clade.
This is just a silly example and most of the time you'll get
different key columns for different clades. Note that every clade
may not have key columns listed depending on your data and the
options you select.
Notes & Gotchas
--------------
- I ignore columns with gap chars (currently just '-') regardless of
column entropy.
Option info
-----------
--entropy-cutoff: A cutoff of 0 means that you allow no variation at
any column.
--clade-size-cutoff: Use this option to ignore tiny clades.
Options:
EOS

opt(:tree,
"Newick tree file",
type: :string)
opt(:aln,
"Alignment file",
type: :string)

opt(:entropy_cutoff,
"Cutoff to consider a column low entropy",
default: 0.0)
opt(:clade_size_cutoff,
"Consider only clades with at least this many leaves",
default: 1)

opt(:outdir,
"Output directory",
default: ".")
opt(:base,
"Basename for output",
default: "snazzy_clades")
end

abort_if opts[:tree].nil?,
"--tree is a required arg. Try running: #{__FILE__} --help"
abort_if opts[:aln].nil?,
"--aln is a required arg. Try running: #{__FILE__} --help"

abort_unless_file_exists opts[:tree]
abort_unless_file_exists opts[:aln]

# TreeClusters.check_ids opts[:tree], opts[:mapping], opts[:aln]

abort_unless opts[:entropy_cutoff] >= 0,
"--entropy-cutoff must be >= 0"
abort_unless opts[:clade_size_cutoff] >= 1,
"--clade-size-cutoff must be >= 1"

FileUtils.mkdir_p opts[:outdir]

tree = NewickTree.fromFile opts[:tree]
leaf2attrs = TreeClusters.read_alignment opts[:aln]

members_fname =
File.join opts[:outdir],
"#{opts[:base]}.tree_clusters.clade_members.txt"
key_cols_fname =
File.join opts[:outdir],
"#{opts[:base]}.tree_clusters.key_cols.txt"
annotated_tree_fname =
File.join opts[:outdir],
"#{opts[:base]}.tree_clusters.annotated_tree.txt"

clade_members_f =
File.open(members_fname, "w")
key_cols_f =
File.open(key_cols_fname, "w")
annotated_tree_f =
File.open(annotated_tree_fname, "w")

key_col_sets = {}

begin
TreeClusters.all_clades(tree).sort_by {|cl| cl.all_leaves.count}.reverse.each_with_index do |clade, idx|
clade_id = "clade_#{idx + 1}___#{clade.name.tr("'", "_")}"

clade_members_f.puts [clade_id,
clade.all_leaves.count,
clade.all_leaves].join "\t"

key_cols_all_leaves =
TreeClusters.low_ent_cols_with_bases clade.all_leaves,
leaf2attrs,
opts[:entropy_cutoff]

unless key_col_sets.has_key? key_cols_all_leaves
key_col_sets[key_cols_all_leaves] = Set.new [clade_id]
end
key_col_sets[key_cols_all_leaves] << clade_id

# This will change the node in the original NewickTree
clade.node.name = "'#{clade_id}'"

end

# We only want key column sets that are unique to a single clade.
key_col_sets.select { |_, clades| clades.count == 1 }.each do |kc_set, clades|
clade_id = clades.first
key_cols_f.puts [
clade_id,
kc_set.count,
kc_set.map { |pos, bases| "#{pos}-#{bases.join}" }
].join "\t"
end

annotated_tree_f.puts tree.to_s.sub(/;+$/, ";")
ensure
clade_members_f.close
key_cols_f.close
annotated_tree_f.close
end
19 changes: 19 additions & 0 deletions lib/tree_clusters.rb
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,25 @@ def low_ent_cols leaves, leaf2attrs, entropy_cutoff
Set.new low_ent_cols
end

# Like low_ent_cols method but also returns the bases at the positions.
def low_ent_cols_with_bases leaves, leaf2attrs, entropy_cutoff
low_ent_cols = []
alns = leaf2attrs.attrs leaves, :aln
aln_cols = alns.transpose

aln_cols.each_with_index do |aln_col, aln_col_idx|
has_gaps = aln_col.any? { |aa| aa == "-" }
low_entropy =
Shannon::entropy(aln_col.join.upcase) <= entropy_cutoff

if !has_gaps && low_entropy
low_ent_cols << [(aln_col_idx + 1), aln_col.map(&:upcase).uniq.sort]
end
end

Set.new low_ent_cols
end

# @note If there are quoted names in the tree file, they are
# unquoted first.
def check_ids tree, mapping, aln
Expand Down
50 changes: 26 additions & 24 deletions lib/tree_clusters/clade.rb
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
module TreeClusters
# Represents a clade in a NewickTree
class Clade
attr_accessor :name,
attr_accessor :node,
:name,
:all_leaves,
:left_leaves,
:right_leaves,
Expand All @@ -18,10 +19,11 @@ class Clade
#
# @param node [NewickNode] a NewickNode from a NewickTree
# @param tree [NewickTree] a NewickTree
def initialize node, tree, metadata=nil
def initialize node, tree, metadata = nil
tree_taxa = tree.unquoted_taxa

@name = unquote node.name
@node = node
@name = unquote node.name
@all_leaves = descendant_leaves node

if (children = node.children).count == 2
Expand All @@ -37,7 +39,7 @@ def initialize node, tree, metadata=nil
# "Node #{node.name} has more than one sibling."

@each_sibling_leaf_set = siblings.
map { |node| descendant_leaves node }
map {|node| descendant_leaves node}

@all_sibling_leaves = @each_sibling_leaf_set.flatten.uniq

Expand All @@ -47,14 +49,14 @@ def initialize node, tree, metadata=nil
@parent_leaves = descendant_leaves parent

@other_leaves =
Object::Set.new(tree_taxa) - Object::Set.new(all_leaves)
Object::Set.new(tree_taxa) - Object::Set.new(all_leaves)

@non_parent_leaves =
Object::Set.new(tree_taxa) - Object::Set.new(parent_leaves)
Object::Set.new(tree_taxa) - Object::Set.new(parent_leaves)

if metadata
@metadata = metadata
@all_tags ||= get_all_tags
@metadata = metadata
@all_tags ||= get_all_tags
@single_tag_info ||= get_single_tag_info
else
@single_tag_info = nil
Expand All @@ -67,16 +69,16 @@ def initialize node, tree, metadata=nil
# well.
def == clade
(
self.name == clade.name &&
self.all_leaves == clade.all_leaves &&
self.left_leaves == clade.left_leaves &&
self.right_leaves == clade.right_leaves &&
self.all_sibling_leaves == clade.all_sibling_leaves &&
self.each_sibling_leaf_set == clade.each_sibling_leaf_set &&
self.parent_leaves == clade.parent_leaves &&
self.other_leaves == clade.other_leaves &&
self.single_tag_info == clade.single_tag_info &&
self.all_tags == clade.all_tags
self.name == clade.name &&
self.all_leaves == clade.all_leaves &&
self.left_leaves == clade.left_leaves &&
self.right_leaves == clade.right_leaves &&
self.all_sibling_leaves == clade.all_sibling_leaves &&
self.each_sibling_leaf_set == clade.each_sibling_leaf_set &&
self.parent_leaves == clade.parent_leaves &&
self.other_leaves == clade.other_leaves &&
self.single_tag_info == clade.single_tag_info &&
self.all_tags == clade.all_tags
)
end

Expand All @@ -99,7 +101,7 @@ def get_all_tags
tag_info = self.all_leaves.map do |leaf|
assert name2tag.has_key?(leaf),
"leaf #{leaf} is not present in name2tag ht for " +
"md_cat #{md_cat}"
"md_cat #{md_cat}"

name2tag[leaf]
end
Expand All @@ -113,11 +115,11 @@ def descendant_leaves node
[unquote(node.name)]
else
node.
descendants.
flatten.
uniq.
select { |node| node.leaf? }.
map { |node| unquote(node.name) }
descendants.
flatten.
uniq.
select {|node| node.leaf?}.
map {|node| unquote(node.name)}
end
end

Expand Down
2 changes: 1 addition & 1 deletion lib/tree_clusters/version.rb
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
module TreeClusters
VERSION = "0.7.0"
VERSION = "0.8.0"
end

0 comments on commit a59224e

Please sign in to comment.