Skip to content

Commit

Permalink
Add general clade prog
Browse files Browse the repository at this point in the history
  • Loading branch information
mooreryan committed Nov 14, 2017
1 parent 57ed43b commit 18c3063
Show file tree
Hide file tree
Showing 4 changed files with 327 additions and 1 deletion.
229 changes: 229 additions & 0 deletions exe/clade_attrs
Original file line number Diff line number Diff line change
@@ -0,0 +1,229 @@
#!/usr/bin/env ruby

Signal.trap("PIPE", "EXIT")

require "tree_clusters"
require "trollop"
require "parse_fasta"
require "shannon"
require "fileutils"

TreeClusters.extend TreeClusters

def puts_info outf, clade_id, attr_cat, attr_set
outf.puts [clade_id, attr_cat, attr_set.to_a].join "\t"
end

opts = Trollop.options do
version TreeClusters::VERSION

banner <<-EOS
Checking IDs
------------
IDs for the sequences must match between the three input files.
The tree file is allowed to have quoted taxa names, but the mapping
file and alignment file are not.
If your alignment file has spaces in the name, the ID part of the
header (i.e., the part up until the space) must match with the
sequence IDs in the tree and the mapping file.
Example: This would be okay.
tree file:
('genome_A', 'genome_B');
aln file:
>genome_A apple pie
AAAAA
>genome_B brown sugar
AATTA
mapping file:
name coolness
genome_A cool
genome_B notcool
Subtracting parent nodes
------------------------
If a clade's parent would be the root of the tree, no columns will
be subtracted when removing the parent columns as it would be the
entire alignment.
Options:
EOS

opt(:tree,
"Newick tree file",
type: :string)
opt(:mapping,
"Mapping file",
type: :string)
opt(:attrs,
"Attributes file",
type: :string)

opt(:clade_size_cutoff,
"Consider only clades with at least this many leaves",
default: 1)

opt(:outdir,
"Output directory",
default: ".")
opt(:base,
"Basename for output",
default: "clade_attrs")
end

abort_if opts[:tree].nil?,
"--tree is a required arg"
abort_if opts[:mapping].nil?,
"--mapping is a required arg"
abort_if opts[:attrs].nil?,
"--attrs is a required arg"

abort_unless_file_exists opts[:tree]
abort_unless_file_exists opts[:mapping]
abort_unless_file_exists opts[:attrs]

# TODO check IDs when attrs is not a fasta file
# TreeClusters.check_ids opts[:tree], opts[:mapping], opts[:attrs]

abort_unless opts[:clade_size_cutoff] >= 1,
"--clade-size-cutoff must be >= 1"

FileUtils.mkdir_p opts[:outdir]

tree = NewickTree.fromFile opts[:tree]
metadata = TreeClusters.read_mapping_file opts[:mapping]
snazzy_info = TreeClusters.snazzy_info tree, metadata
attr_names, leaf2attrs = TreeClusters.read_attrs_file opts[:attrs]

ext_base = "clade_attrs"

clades_fname =
File.join opts[:outdir],
"#{opts[:base]}.#{ext_base}.txt"
members_fname =
File.join opts[:outdir],
"#{opts[:base]}.#{ext_base}_clade_members.txt"
attrs_fname =
File.join opts[:outdir],
"#{opts[:base]}.#{ext_base}_attrs_union.txt"
attrs_intersection_fname =
File.join opts[:outdir],
"#{opts[:base]}.#{ext_base}_attrs_intersection.txt"
attrs_minus_parent_attrs_fname =
File.join opts[:outdir],
"#{opts[:base]}.#{ext_base}_attrs_minus_parent_attrs.txt"
attrs_minus_sibling_attrs_fname =
File.join opts[:outdir],
"#{opts[:base]}.#{ext_base}_attrs_minus_sibling_attrs.txt"
attrs_minus_other_attrs_fname =
File.join opts[:outdir],
"#{opts[:base]}.#{ext_base}_attrs_minus_other_attrs.txt"


info_f =
File.open(clades_fname, "w")
clade_members_f =
File.open(members_fname, "w")
attrs_f =
File.open(attrs_fname, "w")
attrs_intersection_f =
File.open(attrs_intersection_fname, "w")
attrs_minus_parent_attrs_f =
File.open(attrs_minus_parent_attrs_fname, "w")
attrs_minus_sibling_attrs_f =
File.open(attrs_minus_sibling_attrs_fname, "w")
attrs_minus_other_attrs_f =
File.open(attrs_minus_other_attrs_fname, "w")


begin
# info is { metadata_category => metadata_tag , ... }
snazzy_info.each_with_index do |(clade, info), idx|
assert clade.all_leaves.all? { |leaf| leaf2attrs.has_key? leaf },
"Not all leaves are present in the leaf2attrs hash table"

clade_id = "clade_#{idx+1}___#{clade.name}"

is_snazzy = info.nil? ? false : true
snazzy = is_snazzy ? "snazzy" : "not_snazzy"

if is_snazzy
info_f.puts [clade_id,
info.count,
info.map { |pair| pair.join("|")}].join "\t"
else
info_f.puts [clade_id,
0,
"not_snazzy"].join "\t"
end

clade_members_f.puts [clade_id,
clade.all_leaves.count,
clade.all_leaves].join "\t"

attr_names.each do |attr_category|
attrs_all_leaves =
leaf2attrs.attrs clade.all_leaves, attr_category

attrs_all_sibling_leaves =
leaf2attrs.attrs clade.all_sibling_leaves,
attr_category
attrs_parent_leaves =
leaf2attrs.attrs clade.parent_leaves,
attr_category
attrs_other_leaves =
leaf2attrs.attrs clade.other_leaves,
attr_category

attrs_all_minus_parent =
attrs_all_leaves.union - attrs_parent_leaves.union
attrs_all_minus_sibling =
attrs_all_leaves.union - attrs_all_sibling_leaves.union
attrs_all_minus_other =
attrs_all_leaves.union - attrs_other_leaves.union


puts_info attrs_f,
clade_id,
attr_category,
attrs_all_leaves.union

puts_info attrs_intersection_f,
clade_id,
attr_category,
attrs_all_leaves.intersection

puts_info attrs_minus_parent_attrs_f,
clade_id,
attr_category,
attrs_all_minus_parent

puts_info attrs_minus_sibling_attrs_f,
clade_id,
attr_category,
attrs_all_minus_sibling

puts_info attrs_minus_other_attrs_f,
clade_id,
attr_category,
attrs_all_minus_other
end
end
ensure
info_f.close
clade_members_f.close
attrs_f.close
attrs_minus_parent_attrs_f.close
attrs_minus_sibling_attrs_f.close
attrs_minus_other_attrs_f.close
end
67 changes: 67 additions & 0 deletions lib/tree_clusters.rb
Original file line number Diff line number Diff line change
Expand Up @@ -201,6 +201,73 @@ def snazzy_clades tree, metadata
snazzy_clades
end

def snazzy_info tree, metadata
snazzy_info = {}

clades = self.
all_clades(tree, metadata).
sort_by { |clade| clade.all_leaves.count }.
reverse

# Non snazzy clades have a value of nil, so set all to nil and the
# snazzy ones will be overwritten.
clades.each do |clade|
snazzy_info[clade] = nil
end

metadata.each do |md_cat, leaf2mdtag|
already_checked = Set.new
single_tag_clades = {}

clades.each do |clade|
assert clade.all_leaves.count > 1,
"A clade cannot also be a leaf"

unless clade.all_leaves.all? do |leaf|
already_checked.include? leaf
end
md_tags = clade.all_leaves.map do |leaf|
assert leaf2mdtag.has_key?(leaf),
"leaf #{leaf} is missing from leaf2mdtag ht"

leaf2mdtag[leaf]
end

# this clade is mono-phyletic w.r.t. this metadata category.
if md_tags.uniq.count == 1
clade.all_leaves.each do |leaf|
already_checked << leaf
end

assert !single_tag_clades.has_key?(clade),
"clade #{clade.name} is repeated in single_tag_clades for #{md_cat}"

single_tag_clades[clade] = md_tags.first
end
end
end

single_tag_clades.each do |clade, md_tag|
non_clade_leaves = tree.unquoted_taxa - clade.all_leaves

non_clade_leaves_with_this_md_tag = non_clade_leaves.map do |leaf|
[leaf, leaf2mdtag[leaf]]
end.select { |ary| ary.last == md_tag }

is_snazzy_clade = non_clade_leaves_with_this_md_tag.count.zero?
if is_snazzy_clade
if !snazzy_info[clade].nil?
snazzy_info[clade][md_cat] = md_tag
else
snazzy_info[clade] = { md_cat => md_tag }
end
end
end
end

snazzy_info
end

def read_mapping_file fname
md_cat_names = nil
metadata = TreeClusters::Attrs.new
Expand Down
2 changes: 1 addition & 1 deletion lib/tree_clusters/version.rb
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
module TreeClusters
VERSION = "0.7.0-alpha"
VERSION = "0.7.0"
end
30 changes: 30 additions & 0 deletions spec/tree_clusters_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -458,6 +458,24 @@ def read_all_clusters fname
"oddness" => "quite odd" },
}
end
# Like snazzy clades, except that it has an entry for every clade
let(:small_tree_snazzy_info) do
{ "cluster_B3" =>
{ "oddness" => "rather odd" },

"cluster_B2" =>
{ "snazzyness" => "notsnazzy" },

"cluster_B1" => nil,

"cluster_B" =>
{ "coolness" => "notcool" },

"cluster_A" =>
{ "coolness" => "cool",
"oddness" => "quite odd" },
}
end
let(:small_tree_all_tags) do
[ # B3
{ "coolness" => Set.new(["notcool"]),
Expand Down Expand Up @@ -559,6 +577,18 @@ def read_all_clusters fname
end
end

describe "#snazzy_info" do
it "returns snazzy info for all clades" do
info = klass.snazzy_info small_tree, metadata

info_hash = info.
map { |clade, snazzy_info| [clade.name, snazzy_info] }.
to_h

expect(info_hash).to eq small_tree_snazzy_info
end
end

# describe "#deepest_single_tag_clades" do
# it "returns the single tag clades, but only those not " +
# "contained within another clade single tag clade w.r.t. " +
Expand Down

0 comments on commit 18c3063

Please sign in to comment.