From a59224eca72c9f8ec031fefd7242bf5be43cbd86 Mon Sep 17 00:00:00 2001 From: Ryan Moore Date: Tue, 24 Jul 2018 21:57:03 -0400 Subject: [PATCH] Find key col signatures --- .gitignore | 4 +- exe/key_cols | 206 +++++++++++++++++++++++++++++++++++ lib/tree_clusters.rb | 19 ++++ lib/tree_clusters/clade.rb | 50 +++++---- lib/tree_clusters/version.rb | 2 +- spec/tree_clusters_spec.rb | 42 +++++++ test_files/small2.aln | 14 +++ 7 files changed, 311 insertions(+), 26 deletions(-) create mode 100755 exe/key_cols create mode 100644 test_files/small2.aln diff --git a/.gitignore b/.gitignore index 2575a79..292cf0b 100644 --- a/.gitignore +++ b/.gitignore @@ -29,4 +29,6 @@ todo.txt .rspec_status *.lock -.idea \ No newline at end of file +.idea + +exe/key_cols2 diff --git a/exe/key_cols b/exe/key_cols new file mode 100755 index 0000000..ff18268 --- /dev/null +++ b/exe/key_cols @@ -0,0 +1,206 @@ +#!/usr/bin/env ruby + +Signal.trap("PIPE", "EXIT") + +require "tree_clusters" +require "trollop" +require "parse_fasta" +require "fileutils" + +TreeClusters.extend TreeClusters + +GREETING = "The '#{__FILE__}' program" +UNDERLINE = "=" * GREETING.length + +opts = Trollop.options do + version TreeClusters::VERSION + + # banner <<-EOS + + + # Checking IDs + # ------------ + + # IDs for the sequences must match between the three input files. + + # The tree file is allowed to have quoted taxa names, but the + # mapping file and alignment file are not. + + # If your alignment file has spaces in the name, the ID part of the + # header (i.e., the part up until the space) must match with the + # sequence IDs in the tree and the mapping file. + + # Example: This would be okay. + + # tree file: + # ('genome_A', 'genome_B'); + + # aln file: + # >genome_A apple pie + # AAAAA + # >genome_B brown sugar + # AATTA + + # Options: + # EOS + + banner <<-EOS + +#{GREETING} +#{UNDERLINE} + + Hi. My name is #{__FILE__}. If you give me a Newick tree file and + an alignment file (fasta format), I will tell you key columns for + all clades/clusters that have them. + +Overview +-------- + + A clade has key columns if you can use the residue/nucleotide at + those columns to tell sequences in the clade from sequences outside + of the clade. + + Here's an example.... + + After you run me (#{__FILE__} is my name), you'll get an output file + with the extension, '*.tree_clusters.key_cols.txt'. It may look + something like this: + +cluster_A 4 1-A 2-A 3-A 5-G +cluster_B 4 1-C 2-C 3-C 5-A + + This file has the clade name, the number of key columns for that + clade, and then the rest of the columns tell you the position + (1-based) and the nucleotide or residue in that column in all + sequences of that clade. + + In this case we have only two clades. The key columns for both are + 1, 2, 3, and 5. So you can use columns 1, 2, 3, and 5 to classify a + sequence as belonging to one of these clades. If it has A, A, A, + and G in those positions, it'll be in cluster_A, and if it has C, C, + C, and A in those positions, it'll be in cluster_B. If it has any + other combination in those 4 columns of the alignment, it won't be + in either clade. + + This is just a silly example and most of the time you'll get + different key columns for different clades. Note that every clade + may not have key columns listed depending on your data and the + options you select. + +Notes & Gotchas +-------------- + + - I ignore columns with gap chars (currently just '-') regardless of + column entropy. + +Option info +----------- + + --entropy-cutoff: A cutoff of 0 means that you allow no variation at + any column. + + --clade-size-cutoff: Use this option to ignore tiny clades. + + Options: + EOS + + opt(:tree, + "Newick tree file", + type: :string) + opt(:aln, + "Alignment file", + type: :string) + + opt(:entropy_cutoff, + "Cutoff to consider a column low entropy", + default: 0.0) + opt(:clade_size_cutoff, + "Consider only clades with at least this many leaves", + default: 1) + + opt(:outdir, + "Output directory", + default: ".") + opt(:base, + "Basename for output", + default: "snazzy_clades") +end + +abort_if opts[:tree].nil?, + "--tree is a required arg. Try running: #{__FILE__} --help" +abort_if opts[:aln].nil?, + "--aln is a required arg. Try running: #{__FILE__} --help" + +abort_unless_file_exists opts[:tree] +abort_unless_file_exists opts[:aln] + +# TreeClusters.check_ids opts[:tree], opts[:mapping], opts[:aln] + +abort_unless opts[:entropy_cutoff] >= 0, + "--entropy-cutoff must be >= 0" +abort_unless opts[:clade_size_cutoff] >= 1, + "--clade-size-cutoff must be >= 1" + +FileUtils.mkdir_p opts[:outdir] + +tree = NewickTree.fromFile opts[:tree] +leaf2attrs = TreeClusters.read_alignment opts[:aln] + +members_fname = + File.join opts[:outdir], + "#{opts[:base]}.tree_clusters.clade_members.txt" +key_cols_fname = + File.join opts[:outdir], + "#{opts[:base]}.tree_clusters.key_cols.txt" +annotated_tree_fname = + File.join opts[:outdir], + "#{opts[:base]}.tree_clusters.annotated_tree.txt" + +clade_members_f = + File.open(members_fname, "w") +key_cols_f = + File.open(key_cols_fname, "w") +annotated_tree_f = + File.open(annotated_tree_fname, "w") + +key_col_sets = {} + +begin + TreeClusters.all_clades(tree).sort_by {|cl| cl.all_leaves.count}.reverse.each_with_index do |clade, idx| + clade_id = "clade_#{idx + 1}___#{clade.name.tr("'", "_")}" + + clade_members_f.puts [clade_id, + clade.all_leaves.count, + clade.all_leaves].join "\t" + + key_cols_all_leaves = + TreeClusters.low_ent_cols_with_bases clade.all_leaves, + leaf2attrs, + opts[:entropy_cutoff] + + unless key_col_sets.has_key? key_cols_all_leaves + key_col_sets[key_cols_all_leaves] = Set.new [clade_id] + end + key_col_sets[key_cols_all_leaves] << clade_id + + # This will change the node in the original NewickTree + clade.node.name = "'#{clade_id}'" + + end + + # We only want key column sets that are unique to a single clade. + key_col_sets.select { |_, clades| clades.count == 1 }.each do |kc_set, clades| + clade_id = clades.first + key_cols_f.puts [ + clade_id, + kc_set.count, + kc_set.map { |pos, bases| "#{pos}-#{bases.join}" } + ].join "\t" + end + + annotated_tree_f.puts tree.to_s.sub(/;+$/, ";") +ensure + clade_members_f.close + key_cols_f.close + annotated_tree_f.close +end diff --git a/lib/tree_clusters.rb b/lib/tree_clusters.rb index dfc1c28..378169f 100644 --- a/lib/tree_clusters.rb +++ b/lib/tree_clusters.rb @@ -89,6 +89,25 @@ def low_ent_cols leaves, leaf2attrs, entropy_cutoff Set.new low_ent_cols end + # Like low_ent_cols method but also returns the bases at the positions. + def low_ent_cols_with_bases leaves, leaf2attrs, entropy_cutoff + low_ent_cols = [] + alns = leaf2attrs.attrs leaves, :aln + aln_cols = alns.transpose + + aln_cols.each_with_index do |aln_col, aln_col_idx| + has_gaps = aln_col.any? { |aa| aa == "-" } + low_entropy = + Shannon::entropy(aln_col.join.upcase) <= entropy_cutoff + + if !has_gaps && low_entropy + low_ent_cols << [(aln_col_idx + 1), aln_col.map(&:upcase).uniq.sort] + end + end + + Set.new low_ent_cols + end + # @note If there are quoted names in the tree file, they are # unquoted first. def check_ids tree, mapping, aln diff --git a/lib/tree_clusters/clade.rb b/lib/tree_clusters/clade.rb index 7964a2e..bddfc3a 100644 --- a/lib/tree_clusters/clade.rb +++ b/lib/tree_clusters/clade.rb @@ -1,7 +1,8 @@ module TreeClusters # Represents a clade in a NewickTree class Clade - attr_accessor :name, + attr_accessor :node, + :name, :all_leaves, :left_leaves, :right_leaves, @@ -18,10 +19,11 @@ class Clade # # @param node [NewickNode] a NewickNode from a NewickTree # @param tree [NewickTree] a NewickTree - def initialize node, tree, metadata=nil + def initialize node, tree, metadata = nil tree_taxa = tree.unquoted_taxa - @name = unquote node.name + @node = node + @name = unquote node.name @all_leaves = descendant_leaves node if (children = node.children).count == 2 @@ -37,7 +39,7 @@ def initialize node, tree, metadata=nil # "Node #{node.name} has more than one sibling." @each_sibling_leaf_set = siblings. - map { |node| descendant_leaves node } + map {|node| descendant_leaves node} @all_sibling_leaves = @each_sibling_leaf_set.flatten.uniq @@ -47,14 +49,14 @@ def initialize node, tree, metadata=nil @parent_leaves = descendant_leaves parent @other_leaves = - Object::Set.new(tree_taxa) - Object::Set.new(all_leaves) + Object::Set.new(tree_taxa) - Object::Set.new(all_leaves) @non_parent_leaves = - Object::Set.new(tree_taxa) - Object::Set.new(parent_leaves) + Object::Set.new(tree_taxa) - Object::Set.new(parent_leaves) if metadata - @metadata = metadata - @all_tags ||= get_all_tags + @metadata = metadata + @all_tags ||= get_all_tags @single_tag_info ||= get_single_tag_info else @single_tag_info = nil @@ -67,16 +69,16 @@ def initialize node, tree, metadata=nil # well. def == clade ( - self.name == clade.name && - self.all_leaves == clade.all_leaves && - self.left_leaves == clade.left_leaves && - self.right_leaves == clade.right_leaves && - self.all_sibling_leaves == clade.all_sibling_leaves && - self.each_sibling_leaf_set == clade.each_sibling_leaf_set && - self.parent_leaves == clade.parent_leaves && - self.other_leaves == clade.other_leaves && - self.single_tag_info == clade.single_tag_info && - self.all_tags == clade.all_tags + self.name == clade.name && + self.all_leaves == clade.all_leaves && + self.left_leaves == clade.left_leaves && + self.right_leaves == clade.right_leaves && + self.all_sibling_leaves == clade.all_sibling_leaves && + self.each_sibling_leaf_set == clade.each_sibling_leaf_set && + self.parent_leaves == clade.parent_leaves && + self.other_leaves == clade.other_leaves && + self.single_tag_info == clade.single_tag_info && + self.all_tags == clade.all_tags ) end @@ -99,7 +101,7 @@ def get_all_tags tag_info = self.all_leaves.map do |leaf| assert name2tag.has_key?(leaf), "leaf #{leaf} is not present in name2tag ht for " + - "md_cat #{md_cat}" + "md_cat #{md_cat}" name2tag[leaf] end @@ -113,11 +115,11 @@ def descendant_leaves node [unquote(node.name)] else node. - descendants. - flatten. - uniq. - select { |node| node.leaf? }. - map { |node| unquote(node.name) } + descendants. + flatten. + uniq. + select {|node| node.leaf?}. + map {|node| unquote(node.name)} end end diff --git a/lib/tree_clusters/version.rb b/lib/tree_clusters/version.rb index dfd5fe2..6f6c2d8 100644 --- a/lib/tree_clusters/version.rb +++ b/lib/tree_clusters/version.rb @@ -1,3 +1,3 @@ module TreeClusters - VERSION = "0.7.0" + VERSION = "0.8.0" end diff --git a/spec/tree_clusters_spec.rb b/spec/tree_clusters_spec.rb index f9238ca..5ec3875 100644 --- a/spec/tree_clusters_spec.rb +++ b/spec/tree_clusters_spec.rb @@ -92,6 +92,21 @@ def read_all_clusters fname ] end + let(:expected_nodes) do + [ + tree.findNode("cluster19", exact: true), + tree.findNode("cluster22", exact: true), + tree.findNode("cluster11", exact: true), + tree.findNode("cluster14", exact: true), + tree.findNode("cluster7", exact: true), + tree.findNode("cluster1", exact: true), + tree.findNode("cluster4", exact: true), + tree.findNode("cluster6", exact: true), + tree.findNode("cluster10", exact: true), + tree.findNode("cluster16", exact: true), + ] + end + let(:expected_names) do ["cluster19", "cluster22", @@ -319,6 +334,28 @@ def read_all_clusters fname end end + describe "#low_ent_cols_with_bases" do + it "returns low entropy columns given leaves and attrs" do + entropy_cutoff = 0 + leaves = %w[b-1 b-2] + + expect(klass.low_ent_cols_with_bases leaves, leaf2attrs, entropy_cutoff). + to eq Set.new([[1, ["C"]], [2, ["C"]], [3, ["C"]]]) + end + + it "ignores columns if there is a gap regardless of entropy" do + leaf2attrs = TreeClusters::Attrs[ + "a" => { aln: %w[A - T] }, + "b" => { aln: %w[a a c] } + ] + leaves = %w[a b] + entropy_cutoff = 1 + + expect(klass.low_ent_cols_with_bases leaves, leaf2attrs, entropy_cutoff). + to eq Set.new([[1, ["A"]], [3, ["C", "T"]]]) + end + end + describe "#all_clades" do context "with block given" do it "yields all the clades" do @@ -612,6 +649,11 @@ def read_all_clusters fname end end + it "generates clade with the proper node" do + expect(expected_clades.map(&:node)). + to eq expected_nodes + end + it "generates clade with proper name" do expect(expected_clades.map(&:name)). to eq expected_names diff --git a/test_files/small2.aln b/test_files/small2.aln new file mode 100644 index 0000000..77fb1dc --- /dev/null +++ b/test_files/small2.aln @@ -0,0 +1,14 @@ +>a-1 apple +AAAAg +>a-2 pie +AAATg +>b-1 is +CCCCa +>b-2 really +cccTa +>bb-1 good +CCTGa +>bbb-1 and +CCGgg +>bbb-2 tasty +CGGGg