Permalink
Browse files

Pruning of cherries from Newick string

  • Loading branch information...
1 parent c8b22fc commit 39c724d350b02d9ee50a3ef24ba84c4898dd1f7e @fangly fangly committed May 28, 2012
Showing with 173 additions and 0 deletions.
  1. +79 −0 lib/Bio/Phylo/Parsers/Newick.pm
  2. +94 −0 t/42-filter-newick.t
@@ -1,4 +1,5 @@
package Bio::Phylo::Parsers::Newick;
+use warnings;
use strict;
use base 'Bio::Phylo::Parsers::Abstract';
no warnings 'recursion';
@@ -17,6 +18,69 @@ don't call it directly.
sub _return_is_scalar { 1 }
+
+sub _prefilter {
+ my ($string, $ids) = @_;
+ my %id_hash = map { $_ => undef } @$ids;
+
+ # Setup some regular expressions:
+ # 1/ ID is anything but : or , or ( or )
+ my $id_re = qr/[^)(:,;]+?/;
+ # 2/ Distance is a number (Regexp::Common $RE{num}{real} would be more stringent)
+ my $dist_re = qr/[0-9.]+/;
+ # 3/ A pair of ID and distance (both optional)
+ my $pair_re = qr/ \s* ($id_re)? \s* (?: \:\s* ($dist_re) )? \s* /x;
+ # 4/ Cherry
+ my $cherry_re = qr/ ( \( $pair_re , $pair_re \) $pair_re ) /msx;
+ #my $binary_leaves_re = qr/ ( (,?) \s* \( $pair_re , $pair_re \) $pair_re ) /msx;
+ #my $left_leaves_re = qr/ ( (,?) \s* \( $pair_re , $pair_re (,.+?) \) $pair_re ) /msx;
+ #my $right_leaves_re = qr/ ( (,?) \s* \( (.+?,) $pair_re , $pair_re \) $pair_re ) /msx;
+
+ # Substitute in Newick string
+ my $prev_string = $string;
+ while (1) {
+ $string =~ s/ $cherry_re / _prune_cherry($1, $2, $3, $4, $5, $6, $7, \%id_hash) /gex;
+ #$string =~ s/ $left_leaves_re /
+ # _prune_more_branch($1, $2, undef, $3, $4, $5, $6, $7, $8, $9, \%id_hash) /gex;
+ #$string =~ s/ $right_leaves_re /
+ # _prune_more_branch($1, $2, $3, $4, $5, $6, $7, undef, $8, $9, \%id_hash) /gex;
+ #$string =~ s/ $binary_leaves_re /
+ # _prune_binary_branch($1, $2, $3, $4, $5, $6, $7, $8, \%id_hash) /gex;
+ #$string =~ s/^ \( ( [^,]* ) \) ; $/$1;/msx;
+ #$string =~ s/^ \( \s* ( \( .* \) $pair_re ) \s* \) ; $/$1;/msx;
+
+ last if ( $string eq $prev_string );
+ $prev_string = $string;
+ }
+
+ return $string;
+}
+
+sub _prune_cherry {
+ my ($match, $id1, $dist1, $id2, $dist2, $idp, $distp, $id_hash) = @_;
+ my $repl;
+ $id1 ||= '';
+ $id2 ||= '';
+ my $id1_exists = exists $id_hash->{$id1};
+ my $id2_exists = exists $id_hash->{$id2};
+ if ( $id1_exists && $id2_exists ) {
+ # Keep both leaves
+ $repl = $match;
+ } elsif ( not($id1_exists) && not($id2_exists) ) {
+ # Delete both leaves
+ $repl = '';
+ } else {
+ # Keep only one leaf
+ my ($id, $dist) = $id1_exists ? ($id1, $dist1) : ($id2, $dist2);
+ if ( defined($dist) || defined($distp) ) {
+ $dist = ':'.(($dist||0) + ($distp||0));
+ }
+ $dist = '' if not defined $dist;
+ $repl = $id.$dist;
+ }
+ return $repl;
+}
+
sub _parse {
my $self = shift;
my $fh = $self->_handle;
@@ -27,9 +91,24 @@ sub _parse {
$string .= $_;
}
+ my $prefilter_ids = $self->_args->{'-prefilter'};
+
# remove comments, split on tree descriptions
for my $newick ( $self->_split($string) ) {
+ ####
+ print "newick:\n$newick\n\n";
+ print "Prefiltering...\n\n";
+ ####
+
+ # prefilter tree
+ $newick = _prefilter($string, $prefilter_ids) if $prefilter_ids;
+
+ ####
+ print "newick:\n$newick\n\n";
+ print "Parsing...\n\n";
+ ####
+
# parse trees
my $tree = $self->_parse_string($newick);
View
@@ -0,0 +1,94 @@
+use strict;
+
+use warnings;
+use Test::More tests => 44;
+use Bio::Phylo;
+use Bio::Phylo::IO qw(parse);
+use Bio::Phylo::Parsers::Newick;
+
+
+
+
+# Newick tree taken from http://en.wikipedia.org/wiki/Newick_format
+my @strings = (
+ '(,(,));', # no nodes are named
+ '(A,(C,D));', # leaf nodes are named
+ '((C,D),A);',
+ '(A,(C,D)E)F;', # all nodes are named
+ '(:0.1,(:0.3,:0.4):0.5);', # all but root node have a distance to parent
+ '(:0.1,(:0.3,:0.4):0.5):0.0;', # all have a distance to parent
+ '(A:0.1,(C:0.3,D:0.4):0.5);', # distances and leaf names (popular)
+ '((C:0.3,D:0.4):0.5,A:0.1);',
+ "(A :0.1, (C:0.3,\nD:0.4): 0.5);", # extra one with spaces and newline
+ '(A:0.1,(C:0.3,D:0.4)E:0.5)F;', # distances and all names
+);
+
+my @id_sets = (
+ ['C'],
+ ['A', 'C'],
+ ['C', 'D'],
+ ['A', 'C', 'D'],
+);
+
+# Test parsing with filtering hooked up to it
+my $string = '(H:1,(G:1,(F:1,(E:1,(D:1,(C:1,(A:1,B):1):1):1):1):1):1):0;';
+ok( my $phylo = Bio::Phylo->new, 'Init' );
+ok( !Bio::Phylo->VERBOSE( -level => 0 ), 'Set terse' );
+ok( my $tree = Bio::Phylo::IO->parse(
+ -string => $string,
+ -format => 'newick',
+ -prefilter => $id_sets[0]
+)->first, 'Parse' );
+is $tree->calc_number_of_terminals, 1;
+
+# Test many tree combinations and IDs
+is Bio::Phylo::Parsers::Newick::_prefilter( $strings[0], $id_sets[0] ), ';', 'Prefilter';
+is Bio::Phylo::Parsers::Newick::_prefilter( $strings[0], $id_sets[1] ), ';';
+is Bio::Phylo::Parsers::Newick::_prefilter( $strings[0], $id_sets[2] ), ';';
+is Bio::Phylo::Parsers::Newick::_prefilter( $strings[0], $id_sets[3] ), ';';
+
+is Bio::Phylo::Parsers::Newick::_prefilter( $strings[1], $id_sets[0] ), 'C;';
+is Bio::Phylo::Parsers::Newick::_prefilter( $strings[1], $id_sets[1] ), '(A,C);';
+is Bio::Phylo::Parsers::Newick::_prefilter( $strings[1], $id_sets[2] ), '(A,(C,D));';
+is Bio::Phylo::Parsers::Newick::_prefilter( $strings[1], $id_sets[3] ), '(A,(C,D));';
+
+is Bio::Phylo::Parsers::Newick::_prefilter( $strings[2], $id_sets[0] ), 'C;';
+is Bio::Phylo::Parsers::Newick::_prefilter( $strings[2], $id_sets[1] ), '(C,A);';
+is Bio::Phylo::Parsers::Newick::_prefilter( $strings[2], $id_sets[2] ), '((C,D),A);';
+is Bio::Phylo::Parsers::Newick::_prefilter( $strings[2], $id_sets[3] ), '((C,D),A);';
+
+is Bio::Phylo::Parsers::Newick::_prefilter( $strings[3], $id_sets[0] ), 'C;';
+is Bio::Phylo::Parsers::Newick::_prefilter( $strings[3], $id_sets[1] ), '(A,C)F;';
+is Bio::Phylo::Parsers::Newick::_prefilter( $strings[3], $id_sets[2] ), '(A,(C,D)E)F;';
+is Bio::Phylo::Parsers::Newick::_prefilter( $strings[3], $id_sets[3] ), '(A,(C,D)E)F;';
+
+is Bio::Phylo::Parsers::Newick::_prefilter( $strings[4], $id_sets[0] ), ';';
+is Bio::Phylo::Parsers::Newick::_prefilter( $strings[4], $id_sets[1] ), ';';
+is Bio::Phylo::Parsers::Newick::_prefilter( $strings[4], $id_sets[2] ), ';';
+is Bio::Phylo::Parsers::Newick::_prefilter( $strings[4], $id_sets[3] ), ';';
+
+is Bio::Phylo::Parsers::Newick::_prefilter( $strings[5], $id_sets[0] ), ';';
+is Bio::Phylo::Parsers::Newick::_prefilter( $strings[5], $id_sets[1] ), ';';
+is Bio::Phylo::Parsers::Newick::_prefilter( $strings[5], $id_sets[2] ), ';';
+is Bio::Phylo::Parsers::Newick::_prefilter( $strings[5], $id_sets[3] ), ';';
+
+is Bio::Phylo::Parsers::Newick::_prefilter( $strings[6], $id_sets[0] ), 'C:0.8;';
+is Bio::Phylo::Parsers::Newick::_prefilter( $strings[6], $id_sets[1] ), '(A:0.1,C:0.8);';
+is Bio::Phylo::Parsers::Newick::_prefilter( $strings[6], $id_sets[2] ), '(A:0.1,(C:0.3,D:0.4):0.5);';
+is Bio::Phylo::Parsers::Newick::_prefilter( $strings[6], $id_sets[3] ), '(A:0.1,(C:0.3,D:0.4):0.5);';
+
+is Bio::Phylo::Parsers::Newick::_prefilter( $strings[7], $id_sets[0] ), 'C:0.8;';
+is Bio::Phylo::Parsers::Newick::_prefilter( $strings[7], $id_sets[1] ), '(C:0.8,A:0.1);';
+is Bio::Phylo::Parsers::Newick::_prefilter( $strings[7], $id_sets[2] ), '((C:0.3,D:0.4):0.5,A:0.1);';
+is Bio::Phylo::Parsers::Newick::_prefilter( $strings[7], $id_sets[3] ), '((C:0.3,D:0.4):0.5,A:0.1);';
+
+is Bio::Phylo::Parsers::Newick::_prefilter( $strings[8], $id_sets[0] ), 'C:0.8;';
+is Bio::Phylo::Parsers::Newick::_prefilter( $strings[8], $id_sets[1] ), '(A :0.1, C:0.8);';
+is Bio::Phylo::Parsers::Newick::_prefilter( $strings[8], $id_sets[2] ), "(A :0.1, (C:0.3,\nD:0.4): 0.5);";
+is Bio::Phylo::Parsers::Newick::_prefilter( $strings[8], $id_sets[3] ), "(A :0.1, (C:0.3,\nD:0.4): 0.5);";
+
+is Bio::Phylo::Parsers::Newick::_prefilter( $strings[9], $id_sets[0] ), 'C:0.8;';
+is Bio::Phylo::Parsers::Newick::_prefilter( $strings[9], $id_sets[1] ), '(A:0.1,C:0.8)F;';
+is Bio::Phylo::Parsers::Newick::_prefilter( $strings[9], $id_sets[2] ), '(A:0.1,(C:0.3,D:0.4)E:0.5)F;';
+is Bio::Phylo::Parsers::Newick::_prefilter( $strings[9], $id_sets[3] ), '(A:0.1,(C:0.3,D:0.4)E:0.5)F;';
+

0 comments on commit 39c724d

Please sign in to comment.