Skip to content
This repository has been archived by the owner on Dec 14, 2023. It is now read-only.

Commit

Permalink
Add support for solving redirect loops (paywalls, etc.) by checking w…
Browse files Browse the repository at this point in the history
  • Loading branch information
pypt committed Sep 18, 2014
1 parent 08bf5c6 commit 2e1600f
Show file tree
Hide file tree
Showing 2 changed files with 96 additions and 17 deletions.
63 changes: 47 additions & 16 deletions lib/MediaWords/Util/URL.pm
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ use URI;
use URI::QueryParam;
use Regexp::Common qw /URI/;
use MediaWords::Util::Web;
use URI::Escape;
use List::MoreUtils qw/uniq/;

# Returns true if URL is in the "http" ("https") scheme
Expand Down Expand Up @@ -336,25 +337,55 @@ sub url_and_data_after_redirects($;$$)

unless ( $response->is_success )
{
warn "Request to " . $uri->as_string . " was unsuccessful: " . $response->status_line;
$uri = URI->new( $orig_url )->canonical;
my @redirects = $response->redirects();
if ( scalar @redirects + 1 >= $max_http_redirect )
{
my @urls_redirected_to;

my $error_message = "";
$error_message .= "Number of HTTP redirects ($max_http_redirect) exhausted; redirects:\n";
foreach my $redirect ( @redirects )
{
push( @urls_redirected_to, $redirect->request()->uri()->canonical->as_string );
$error_message .= "* From: " . $redirect->request()->uri()->canonical->as_string . "; ";
$error_message .= "to: " . $redirect->header( 'Location' ) . "\n";
}

say STDERR $error_message;

# Return the original URL (unless we find a URL being a substring of another URL, see below)
$uri = URI->new( $orig_url )->canonical;

# If one of the URLs that we've been redirected to contains another URLencoded URL, assume
# that we're hitting a paywall and the URLencoded URL is the right one
@urls_redirected_to = uniq @urls_redirected_to;
foreach my $url_redirected_to ( @urls_redirected_to )
{
my $encoded_url_redirected_to = uri_escape( $url_redirected_to );

if ( my ( $matched_url ) = grep /$encoded_url_redirected_to/, @urls_redirected_to )
{

say STDERR
"Encoded URL $encoded_url_redirected_to is a substring of another URL $matched_url, so I'll assume that $url_redirected_to is the correct one.";
$uri = URI->new( $url_redirected_to )->canonical;
last;

}
}

}
else
{
say STDERR "Request to " . $uri->as_string . " was unsuccessful: " . $response->status_line;

# Return the original URL and give up
$uri = URI->new( $orig_url )->canonical;
}

last;
}

my @redirects = $response->redirects();

# if ( scalar @redirects )
# {
# say STDERR "Redirects:";
# foreach my $redirect ( @redirects )
# {
# say STDERR "* From:";
# say STDERR " " . $redirect->request()->uri()->canonical;
# say STDERR " to:";
# say STDERR " " . $redirect->header( 'Location' );
# }
# }

my $new_uri = $response->request()->uri()->canonical;
unless ( $uri->eq( $new_uri ) )
{
Expand Down
50 changes: 49 additions & 1 deletion lib/MediaWords/Util/t/URL.t
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,12 @@ use warnings;

use utf8;
use Test::NoWarnings;
use Test::More tests => 42;
use Test::More tests => 44;

use Readonly;
use HTTP::HashServer;
use HTTP::Status qw(:constants);
use URI::Escape;
use Data::Dumper;

Readonly my $TEST_HTTP_SERVER_PORT => 9998;
Expand Down Expand Up @@ -387,6 +388,51 @@ sub test_url_and_data_after_redirects_html()
is( $data_after_redirects, $pages->{ '/fifth' }, 'Data after HTML redirects' );
}

# Test url_and_data_after_redirects() against an endless HTTP redirect loop of
# the kind produced by paywalls:
#
#     /first -> /second -> /third?url=<URL-encoded /second> -> /second -> ...
#
# The test expects the function to give up on the loop and return the URL of
# "/second", i.e. the URL that shows up URL-encoded inside another redirect
# target.
#
# Takes no arguments; runs one Test::More assertion.
sub test_url_and_data_after_redirects_loop()
{
    Readonly my $TEST_HTTP_SERVER_URL => 'http://localhost:' . $TEST_HTTP_SERVER_PORT;
    my $starting_url = $TEST_HTTP_SERVER_URL . '/first';

    # Path whose query string carries the URL-encoded absolute URL of
    # "/second"; with the test server on port 9998 this is
    # "/third?url=http%3A%2F%2Flocalhost%3A9998%2Fsecond"
    my $third = '/third?url=' . uri_escape( $TEST_HTTP_SERVER_URL . '/second' );

    # HTTP redirects
    my $pages = {

        # e.g. http://rss.nytimes.com/c/34625/f/640350/s/3a08a24a/sc/1/l/0L0Snytimes0N0C20A140C0A50C0A40Cus0Cpolitics0Cobama0Ewhite0Ehouse0Ecorrespondents0Edinner0Bhtml0Dpartner0Frss0Gemc0Frss/story01.htm
        '/first' => { redirect => '/second', http_status_code => HTTP_SEE_OTHER },

        # e.g. http://www.nytimes.com/2014/05/04/us/politics/obama-white-house-correspondents-dinner.html?partner=rss&emc=rss
        '/second' => { redirect => $third, http_status_code => HTTP_SEE_OTHER },

        # e.g. http://www.nytimes.com/glogin?URI=http%3A%2F%2Fwww.nytimes.com%2F2014%2F05%2F04%2Fus%2Fpolitics%2Fobama-white-house-correspondents-dinner.html%3Fpartner%3Drss%26emc%3Drss
        '/third' => { redirect => '/second', http_status_code => HTTP_SEE_OTHER }
    };

    my $hs = HTTP::HashServer->new( $TEST_HTTP_SERVER_PORT, $pages );
    $hs->start();

    # Only the URL (first returned element) is asserted on. The call is
    # wrapped in eval so that the test server is always stopped, even if the
    # function under test dies; the exception is re-thrown afterwards.
    my ( $url_after_redirects ) = eval {
        MediaWords::Util::URL::url_and_data_after_redirects( $starting_url );
    };
    my $error = $@;    # save immediately; later calls may overwrite $@

    $hs->stop();

    die $error if $error;

    is( $url_after_redirects, $TEST_HTTP_SERVER_URL . '/second', 'URL after HTTP redirect loop' );
}

# Test url_and_data_after_redirects() with a real NYTimes RSS feed link and
# check that it resolves to the article URL rather than to a login redirect.
#
# NOTE(review): no local test server is started here, so this test presumably
# needs network access to the live NYTimes hosts -- confirm before relying on
# it in offline environments.
#
# Takes no arguments; runs one Test::More assertion.
sub test_url_and_data_after_redirects_nytimes()
{
    # Link as published in the RSS feed
    my $feed_url =
'http://rss.nytimes.com/c/34625/f/640350/s/3a08a24a/sc/1/l/0L0Snytimes0N0C20A140C0A50C0A40Cus0Cpolitics0Cobama0Ewhite0Ehouse0Ecorrespondents0Edinner0Bhtml0Dpartner0Frss0Gemc0Frss/story01.htm';

    # Article URL the redirects are expected to settle on
    my $article_url =
'http://www.nytimes.com/2014/05/04/us/politics/obama-white-house-correspondents-dinner.html?partner=rss&emc=rss';

    # The function returns ( URL, data ); only the URL is compared.
    my @url_and_data = MediaWords::Util::URL::url_and_data_after_redirects( $feed_url );

    is( $url_and_data[ 0 ], $article_url, 'URL after NYTimes redirects' );
}

sub main()
{
my $builder = Test::More->builder;
Expand All @@ -401,6 +447,8 @@ sub main()
test_link_canonical_url_from_html();
test_url_and_data_after_redirects_http();
test_url_and_data_after_redirects_html();
test_url_and_data_after_redirects_loop();
test_url_and_data_after_redirects_nytimes();
}

main();

0 comments on commit 2e1600f

Please sign in to comment.