Find key col signatures

mooreryan · Jul 25, 2018 · a59224e · a59224e
1 parent 3ebd31c
commit a59224e
Show file tree

Hide file tree

Showing 7 changed files with 311 additions and 26 deletions.
diff --git a/.gitignore b/.gitignore
@@ -29,4 +29,6 @@ todo.txt
 .rspec_status
 *.lock
 
-.idea
+.idea
+
+exe/key_cols2
diff --git a/exe/key_cols b/exe/key_cols
@@ -0,0 +1,206 @@
+#!/usr/bin/env ruby
+
+Signal.trap("PIPE", "EXIT")
+
+require "tree_clusters"
+require "trollop"
+require "parse_fasta"
+require "fileutils"
+
+TreeClusters.extend TreeClusters
+
+GREETING = "The '#{__FILE__}' program"
+UNDERLINE = "=" * GREETING.length
+
+opts = Trollop.options do
+  version TreeClusters::VERSION
+
+  # banner <<-EOS
+
+
+  # Checking IDs
+  # ------------
+
+  # IDs for the sequences must match between the three input files.
+
+  # The tree file is allowed to have quoted taxa names, but the
+  # mapping file and alignment file are not.
+
+  # If your alignment file has spaces in the name, the ID part of the
+  # header (i.e., the part up until the space) must match with the
+  # sequence IDs in the tree and the mapping file.
+
+  # Example: This would be okay.
+
+  # tree file:
+  #   ('genome_A', 'genome_B');
+
+  # aln file:
+  #   >genome_A apple pie
+  #   AAAAA
+  #   >genome_B brown sugar
+  #   AATTA
+
+  # Options:
+  # EOS
+
+  banner <<-EOS
+
+#{GREETING}
+#{UNDERLINE}
+
+  Hi.  My name is #{__FILE__}.  If you give me a Newick tree file and
+  an alignment file (fasta format), I will tell you key columns for
+  all clades/clusters that have them.
+
+Overview
+--------
+
+  A clade has key columns if you can use the residue/nucleotide at
+  those columns to tell sequences in the clade from sequences outside
+  of the clade.
+
+  Here's an example....
+
+  After you run me (#{__FILE__} is my name), you'll get an output file
+  with the extension, '*.tree_clusters.key_cols.txt'.  It may look
+  something like this:
+
+cluster_A	4	1-A	2-A	3-A	5-G
+cluster_B	4	1-C	2-C	3-C	5-A
+
+  This file has the clade name, the number of key columns for that
+  clade, and then the rest of the columns tell you the position
+  (1-based) and the nucleotide or residue in that column in all
+  sequences of that clade.
+
+  In this case we have only two clades.  The key columns for both are
+  1, 2, 3, and 5.  So you can use columns 1, 2, 3, and 5 to classify a
+  sequence as belonging to one of these clades.  If it has A, A, A,
+  and G in those positions, it'll be in cluster_A, and if it has C, C,
+  C, and A in those positions, it'll be in cluster_B.  If it has any
+  other combination in those 4 columns of the alignment, it won't be
+  in either clade.
+
+  This is just a silly example and most of the time you'll get
+  different key columns for different clades.  Note that every clade
+  may not have key columns listed depending on your data and the
+  options you select.
+
+Notes & Gotchas
+--------------
+
+  - I ignore columns with gap chars (currently just '-') regardless of
+    column entropy.
+
+Option info
+-----------
+
+  --entropy-cutoff: A cutoff of 0 means that you allow no variation at
+    any column.
+
+  --clade-size-cutoff: Use this option to ignore tiny clades.
+
+  Options:
+  EOS
+
+  opt(:tree,
+      "Newick tree file",
+      type: :string)
+  opt(:aln,
+      "Alignment file",
+      type: :string)
+
+  opt(:entropy_cutoff,
+      "Cutoff to consider a column low entropy",
+      default: 0.0)
+  opt(:clade_size_cutoff,
+      "Consider only clades with at least this many leaves",
+      default: 1)
+
+  opt(:outdir,
+      "Output directory",
+      default: ".")
+  opt(:base,
+      "Basename for output",
+      default: "snazzy_clades")
+end
+
+abort_if opts[:tree].nil?,
+         "--tree is a required arg.  Try running: #{__FILE__} --help"
+abort_if opts[:aln].nil?,
+         "--aln is a required arg.  Try running: #{__FILE__} --help"
+
+abort_unless_file_exists opts[:tree]
+abort_unless_file_exists opts[:aln]
+
+# TreeClusters.check_ids opts[:tree], opts[:mapping], opts[:aln]
+
+abort_unless opts[:entropy_cutoff] >= 0,
+             "--entropy-cutoff must be >= 0"
+abort_unless opts[:clade_size_cutoff] >= 1,
+             "--clade-size-cutoff must be >= 1"
+
+FileUtils.mkdir_p opts[:outdir]
+
+tree       = NewickTree.fromFile opts[:tree]
+leaf2attrs = TreeClusters.read_alignment opts[:aln]
+
+members_fname                     =
+  File.join opts[:outdir],
+            "#{opts[:base]}.tree_clusters.clade_members.txt"
+key_cols_fname                    =
+  File.join opts[:outdir],
+            "#{opts[:base]}.tree_clusters.key_cols.txt"
+annotated_tree_fname =
+  File.join opts[:outdir],
+            "#{opts[:base]}.tree_clusters.annotated_tree.txt"
+
+clade_members_f               =
+  File.open(members_fname, "w")
+key_cols_f                    =
+  File.open(key_cols_fname, "w")
+annotated_tree_f                    =
+  File.open(annotated_tree_fname, "w")
+
+key_col_sets = {}
+
+begin
+  TreeClusters.all_clades(tree).sort_by {|cl| cl.all_leaves.count}.reverse.each_with_index do |clade, idx|
+    clade_id = "clade_#{idx + 1}___#{clade.name.tr("'", "_")}"
+
+    clade_members_f.puts [clade_id,
+                          clade.all_leaves.count,
+                          clade.all_leaves].join "\t"
+
+    key_cols_all_leaves         =
+      TreeClusters.low_ent_cols_with_bases clade.all_leaves,
+                                           leaf2attrs,
+                                           opts[:entropy_cutoff]
+
+    unless key_col_sets.has_key? key_cols_all_leaves
+      key_col_sets[key_cols_all_leaves] = Set.new [clade_id]
+    end
+    key_col_sets[key_cols_all_leaves] << clade_id
+
+    # This will change the node in the original NewickTree
+    clade.node.name = "'#{clade_id}'"
+
+  end
+
+  # We only want key column sets that are unique to a single clade.
+  key_col_sets.select { |_, clades| clades.count == 1 }.each do |kc_set, clades|
+    clade_id = clades.first
+    key_cols_f.puts [
+      clade_id,
+      kc_set.count,
+      kc_set.map { |pos, bases| "#{pos}-#{bases.join}" }
+    ].join "\t"
+  end
+
+  annotated_tree_f.puts tree.to_s.sub(/;+$/, ";")
+ensure
+  clade_members_f.close
+  key_cols_f.close
+  annotated_tree_f.close
+end
diff --git a/lib/tree_clusters.rb b/lib/tree_clusters.rb
@@ -89,6 +89,25 @@ def low_ent_cols leaves, leaf2attrs, entropy_cutoff
     Set.new low_ent_cols
   end
 
+  # Like low_ent_cols method but also returns the bases at the positions.
+  def low_ent_cols_with_bases leaves, leaf2attrs, entropy_cutoff
+    low_ent_cols = []
+    alns = leaf2attrs.attrs leaves, :aln
+    aln_cols = alns.transpose
+
+    aln_cols.each_with_index do |aln_col, aln_col_idx|
+      has_gaps = aln_col.any? { |aa| aa == "-" }
+      low_entropy =
+          Shannon::entropy(aln_col.join.upcase) <= entropy_cutoff
+
+      if !has_gaps && low_entropy
+        low_ent_cols << [(aln_col_idx + 1), aln_col.map(&:upcase).uniq.sort]
+      end
+    end
+
+    Set.new low_ent_cols
+  end
+
   # @note If there are quoted names in the tree file, they are
   #   unquoted first.
   def check_ids tree, mapping, aln

diff --git a/lib/tree_clusters/clade.rb b/lib/tree_clusters/clade.rb
@@ -1,7 +1,8 @@
 module TreeClusters
   # Represents a clade in a NewickTree
   class Clade
-    attr_accessor :name,
+    attr_accessor :node,
+                  :name,
                   :all_leaves,
                   :left_leaves,
                   :right_leaves,
@@ -18,10 +19,11 @@ class Clade
     #
     # @param node [NewickNode] a NewickNode from a NewickTree
     # @param tree [NewickTree] a NewickTree
-    def initialize node, tree, metadata=nil
+    def initialize node, tree, metadata = nil
       tree_taxa = tree.unquoted_taxa
 
-      @name = unquote node.name
+      @node = node
+      @name       = unquote node.name
       @all_leaves = descendant_leaves node
 
       if (children = node.children).count == 2
@@ -37,7 +39,7 @@ def initialize node, tree, metadata=nil
       #        "Node #{node.name} has more than one sibling."
 
       @each_sibling_leaf_set = siblings.
-                               map { |node| descendant_leaves node }
+          map {|node| descendant_leaves node}
 
       @all_sibling_leaves = @each_sibling_leaf_set.flatten.uniq
 
@@ -47,14 +49,14 @@ def initialize node, tree, metadata=nil
       @parent_leaves = descendant_leaves parent
 
       @other_leaves =
-        Object::Set.new(tree_taxa) - Object::Set.new(all_leaves)
+          Object::Set.new(tree_taxa) - Object::Set.new(all_leaves)
 
       @non_parent_leaves =
-        Object::Set.new(tree_taxa) - Object::Set.new(parent_leaves)
+          Object::Set.new(tree_taxa) - Object::Set.new(parent_leaves)
 
       if metadata
-        @metadata = metadata
-        @all_tags ||= get_all_tags
+        @metadata        = metadata
+        @all_tags        ||= get_all_tags
         @single_tag_info ||= get_single_tag_info
       else
         @single_tag_info = nil
@@ -67,16 +69,16 @@ def initialize node, tree, metadata=nil
     # well.
     def == clade
       (
-        self.name == clade.name &&
-        self.all_leaves == clade.all_leaves &&
-        self.left_leaves == clade.left_leaves &&
-        self.right_leaves == clade.right_leaves &&
-        self.all_sibling_leaves == clade.all_sibling_leaves &&
-        self.each_sibling_leaf_set == clade.each_sibling_leaf_set &&
-        self.parent_leaves == clade.parent_leaves &&
-        self.other_leaves == clade.other_leaves &&
-        self.single_tag_info == clade.single_tag_info &&
-        self.all_tags == clade.all_tags
+      self.name == clade.name &&
+          self.all_leaves == clade.all_leaves &&
+          self.left_leaves == clade.left_leaves &&
+          self.right_leaves == clade.right_leaves &&
+          self.all_sibling_leaves == clade.all_sibling_leaves &&
+          self.each_sibling_leaf_set == clade.each_sibling_leaf_set &&
+          self.parent_leaves == clade.parent_leaves &&
+          self.other_leaves == clade.other_leaves &&
+          self.single_tag_info == clade.single_tag_info &&
+          self.all_tags == clade.all_tags
       )
     end
 
@@ -99,7 +101,7 @@ def get_all_tags
         tag_info = self.all_leaves.map do |leaf|
           assert name2tag.has_key?(leaf),
                  "leaf #{leaf} is not present in name2tag ht for " +
-                 "md_cat #{md_cat}"
+                     "md_cat #{md_cat}"
 
           name2tag[leaf]
         end
@@ -113,11 +115,11 @@ def descendant_leaves node
         [unquote(node.name)]
       else
         node.
-          descendants.
-          flatten.
-          uniq.
-          select { |node| node.leaf? }.
-          map { |node| unquote(node.name) }
+            descendants.
+            flatten.
+            uniq.
+            select {|node| node.leaf?}.
+            map {|node| unquote(node.name)}
       end
     end
 

diff --git a/lib/tree_clusters/version.rb b/lib/tree_clusters/version.rb
@@ -1,3 +1,3 @@
 module TreeClusters
-  VERSION = "0.7.0"
+  VERSION = "0.8.0"
 end