Skip to content

Commit

Permalink
Fix content decoding and add a unit test for this issue
Browse files Browse the repository at this point in the history
  • Loading branch information
Andrew Whatson committed Feb 4, 2010
1 parent fad055a commit ea24deb
Show file tree
Hide file tree
Showing 2 changed files with 33 additions and 9 deletions.
10 changes: 1 addition & 9 deletions lib/Web/Scraper.pm
Expand Up @@ -63,16 +63,8 @@ sub scrape {
my $res = $ua->get($stuff);
return $self->scrape($res, $stuff->as_string);
} elsif (blessed($stuff) && $stuff->isa('HTTP::Response')) {
require Encode;
if ($stuff->is_success) {
my @encoding = (
$stuff->content_charset,
# could be multiple because HTTP response and META might be different
($stuff->header('Content-Type') =~ /charset=([\w\-]+)/g),
"latin-1",
);
my $encoding = first { defined $_ && Encode::find_encoding($_) } @encoding;
$html = Encode::decode($encoding, $stuff->content);
$html = $stuff->decoded_content;
} else {
croak "GET " . $stuff->request->uri . " failed: ", $stuff->status_line;
}
Expand Down
32 changes: 32 additions & 0 deletions t/19_decode_content.t
@@ -0,0 +1,32 @@
use strict;
use warnings;
use URI;
use LWP::UserAgent;
use Web::Scraper;
use Test::More;

# Live-network regression test: scraping an HTTP::Response object must yield
# correct results when the response body needs content decoding.
# Runs only when LIVE_TEST or TEST_ALL is set in the environment.
plan skip_all => "LIVE_TEST not enabled"
    unless $ENV{LIVE_TEST} || $ENV{TEST_ALL};

plan tests => 2;

# Advertise gzip support on every request so that servers may answer with a
# compressed body, which the scraper then has to decode before parsing.
my $ua = LWP::UserAgent->new;
$ua->default_header('Accept-Encoding' => 'gzip');

# Text extraction: the <title> text must come out readable.
{
    my $res    = $ua->get("http://www.yahoo.co.jp/");
    my $result = scraper {
        process 'title', title => 'TEXT';
    }->scrape($res);
    is $result->{title}, 'Yahoo! JAPAN', 'Decoded page title';
}

# Attribute extraction: the image's src must come back as an absolute URI
# even though only the response object is passed to scrape().
{
    my $res    = $ua->get("http://b.hatena.ne.jp/");
    my $result = scraper {
        process 'img.csschanger', image => '@src';
    }->scrape($res);
    is $result->{image}, 'http://b.hatena.ne.jp/images/logo1.gif', 'Absolute URI';
}



0 comments on commit ea24deb

Please sign in to comment.