Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with
or
.
Download ZIP
Browse files

Add functions and tests for the functions:

* $mech->scrape_text_by_id()
* $mech->scrape_text_by_attr()
* $mech->scraped_id_is()
  • Loading branch information...
commit 41040e80fab265ca040723419c2abfac38b6ed1d 1 parent 47a7d5a
@petdance authored
Showing with 203 additions and 1 deletion.
  1. +9 −0 Changes
  2. +1 −0  MANIFEST
  3. +142 −1 Mechanize.pm
  4. +51 −0 t/scrape-text-by-id.t
View
9 Changes
@@ -11,6 +11,15 @@ NEXT
Added a $mech->autolint() method so you can turn autolinting on and off
in mid-program. Thanks, Mike O'Regan.
+New functions $mech->scrape_text_by_id() and $mech->scrape_text_by_attr()
+let you extract text from your pages. Convenience function
+$mech->scraped_id_is() makes it easy to scrape and compare in one
+function.
+
+ <h1 id="pagetitle">My Awesome Page!</h1>
+
+ # Verify that HTML is there with:
+ $mech->scraped_id_is( 'pagetitle', 'My Awesome Page!' );
[FIXES]
$mech->has_tag() now handles nested tags more sensibly. This fixes
View
1  MANIFEST
@@ -36,6 +36,7 @@ t/page_links_ok.t
t/pod-coverage.t
t/pod.t
t/put_ok.t
+t/scrape-text-by-id.t
t/stuff_inputs.html
t/stuff_inputs.t
t/submit_form_ok.t
View
143 Mechanize.pm
@@ -1494,13 +1494,154 @@ sub _grep_hashes {
}
+=head2 $mech->scrape_text_by_attr( $attr, $attr [, $html ] )
+
+=head2 $mech->scrape_text_by_attr( $attr, $attr_regex [, $html ] )
+
+Returns an array of strings, each string the text surrounded by an
+element with attribute I<$attr> of value I<$value>. You can also pass in
+a regular expression. If nothing is found the return is an empty list.
+In scalar context the return is the first string found.
+
+If passed, I<$html> is scraped instead of the current page's content.
+
+=cut
+
+sub scrape_text_by_attr {
+ my $self = shift;
+ my $attr = shift;
+ my $value = shift;
+
+ require HTML::TokeParser;
+
+ my $html;
+ if ( @_ ) {
+ $html = shift;
+ assert_nonblank( $html, '$html passed in is a populated scalar' );
+ }
+ elsif ( $self->ct() eq 'text/html' ) {
+ $html = $self->content();
+ }
+
+ my @results;
+
+ if ( defined $html ) {
+ my $parser = HTML::TokeParser->new(\$html);
+
+ while ( my $token = $parser->get_tag() ) {
+ if ( ref $token->[1] eq 'HASH' ) {
+ if ( exists $token->[1]->{$attr} ) {
+ my $matched = (ref $value eq 'Regexp') ? $token->[1]->{$attr} =~ $value : $token->[1]->{$attr} eq $value;
+ if ( $matched ) {
+ my $tag = $token->[ 0 ];
+ push @results, $parser->get_trimmed_text( "/$tag" );
+ if ( !wantarray ) {
+ last;
+ }
+ }
+ }
+ }
+ }
+ }
+
+ return $results[0] if !wantarray;
+ return @results;
+}
+
+
+=head2 scrape_text_by_id( $id [, $html ] )
+
+Finds all elements with the given id attribute and pulls out the text that that element encloses.
+
+In list context, returns a list of all strings found. In scalar context, returns the first one found.
+
+If C<$html> is not provided then the current content is used.
+
+=cut
+
+sub scrape_text_by_id {
+ my $self = shift;
+ my $id = shift;
+ my $html;
+
+ require HTML::TokeParser;
+
+ if ( @_ ) {
+ $html = shift;
+ assert_nonref( $html, '$html passed in is a populated scalar' );
+ }
+ else {
+ if ( $self->ct() eq 'text/html' && defined $self->{content} ) {
+ $html = $self->{ content };
+ }
+ }
+
+ my @results;
+
+ if ( defined $html ) {
+ my $found = index( $html, "id=\"$id\"" );
+ if ( $found >= 0 ) {
+ # quick and dirty hack to try and cut down on the amount of DOM parsing
+ if ( $found >=150 ) {
+ $html = substr( $html, $found-150 );
+ }
+
+ my $parser = HTML::TokeParser->new( \$html );
+
+ while ( my $token = $parser->get_tag() ) {
+ if ( ref $token->[1] eq 'HASH' ) {
+ my $actual_id = $token->[1]->{id};
+ $actual_id = '' unless defined $actual_id;
+ if ( $actual_id eq $id ) {
+ my $tag = $token->[ 0 ];
+ push @results, $parser->get_trimmed_text( "/$tag" );
+ if ( !wantarray ) {
+ last;
+ }
+ }
+ }
+ }
+ }
+ }
+
+ return $results[0] if !wantarray;
+ return @results;
+}
+
+
+
+=head2 $mech->scraped_id_is( $id, $expected [, $msg] )
+
+Scrapes the current page for given ID and tests that it matches the expected value.
+
+=cut
+
+sub scraped_id_is {
+ my $self = shift;
+ my $id = shift;
+ my $expected = shift;
+ my $msg = shift;
+
+ if ( not defined $msg ) {
+ my $what = defined( $expected ) ? $expected : '(undef)';
+
+ $msg = qq{scraped id "$id" is "$what"};
+ }
+
+ local $Test::Builder::Level = $Test::Builder::Level + 1;
+
+ my $got = $self->scrape_text_by_id($id);
+ is( $got, $expected, $msg );
+
+ return;
+}
=head1 TODO
Add HTML::Tidy capabilities.
-Add a broken image check.
+Other ideas for features are at https://github.com/petdance/test-www-mechanize
=head1 AUTHOR
View
51 t/scrape-text-by-id.t
@@ -0,0 +1,51 @@
+#!/usr/bin/perl -T
+
+use strict;
+use warnings;
+
+use Test::More tests => 1;
+
+use Test::Builder;
+
+use URI::file ();
+
+use Test::WWW::Mechanize ();
+
+subtest scrape_text_by_id => sub {
+ plan tests => 12;
+
+ my $mech = Test::WWW::Mechanize->new( autolint => 0 );
+ isa_ok( $mech, 'Test::WWW::Mechanize' );
+
+ my $uri = URI::file->new_abs( 't/goodlinks.html' )->as_string;
+ $mech->get_ok( $uri, 'Get a dummy page just to have one' );
+
+ # nothing to find
+ $mech->update_html( '<html><head><title></title></head><body></body></html>' );
+ is_deeply( [$mech->scrape_text_by_id('asdf')], [], 'not found: empty list returned in list context' );
+ is( $mech->scrape_text_by_id('asdf'), undef, 'not found: undef returned in scalar context' );
+
+ # find one
+ $mech->update_html( '<html><head><title></title></head><body><p id="asdf">contents</p></body></html>' );
+ is_deeply( [$mech->scrape_text_by_id('asdf')], ['contents'], 'find one: list context' );
+ is( $mech->scrape_text_by_id('asdf'), 'contents', 'find one: scalar context' );
+
+ # find multiple
+ $mech->update_html( '<html><head><title></title></head><body><p id="asdf">contents</p><p id="asdf">further</p></body></html>' );
+ is_deeply( [$mech->scrape_text_by_id('asdf')], ['contents', 'further'], 'find multiple: empty list returned in list context' );
+ is( $mech->scrape_text_by_id('asdf'), 'contents', 'find multiple: first string returned in scalar context' );
+
+ # present but empty
+ $mech->update_html( '<html><head><title></title></head><body><p id="asdf"></p></body></html>' );
+ is_deeply( [$mech->scrape_text_by_id('asdf')], [''], 'present but empty: list context' );
+ is( $mech->scrape_text_by_id('asdf'), '', 'present but empty: scalar context' );
+
+ # present but emptier
+ $mech->update_html( '<html><head><title></title></head><body><p id="asdf" /></body></html>' );
+ is_deeply( [$mech->scrape_text_by_id('asdf')], [''], 'present but emptier: list context' );
+ is( $mech->scrape_text_by_id('asdf'), '', 'present but emptier: scalar context' );
+};
+
+done_testing();
+
+exit 0;
Please sign in to comment.
Something went wrong with that request. Please try again.