diff --git a/bin/generate_sitemap.pl b/bin/generate_sitemap.pl
index 0905f1df89..2decbb6af4 100755
--- a/bin/generate_sitemap.pl
+++ b/bin/generate_sitemap.pl
@@ -5,37 +5,41 @@
 use strict;
 use warnings;
-use FindBin qw ($Bin);
-use lib "$Bin/../lib";
-
+use File::Basename;
+use File::Spec;
+use Cwd;
+use Config::ZOMG;
+my $root_dir;
+
+BEGIN {
+    my $bin_dir = File::Basename::dirname(__FILE__);
+    $root_dir
+        = Cwd::abs_path( File::Spec->catdir( $bin_dir, File::Spec->updir ) );
+}
+use lib "$root_dir/lib";
 use MetaCPAN::Sitemap;
 
-my $out_dir = "$Bin/../root/static/sitemaps/";
-mkdir $out_dir;
-
-my @parts = (
-
-    # For authors, we're looking for the pauseid, and want to build a URL
-    # with 'author' in the path.
-
-    {
-        object_type    => 'author',
-        field_name     => 'pauseid',
-        xml_file       => "$out_dir/authors.xml.gz",
-        cpan_directory => 'author',
-    },
-
-    # For releases, we're looking for a download URL; since we're not
-    # building a URL, the cpan_directory is missing, but we also want to
-    # filter on only the 'latest' entries.
-
-    {
-        object_type    => 'release',
-        field_name     => 'distribution',
-        xml_file       => "$out_dir/releases.xml.gz",
-        cpan_directory => 'release',
-        filter         => { status => 'latest' },
-    }
+my $config = Config::ZOMG->open(
+    name => 'MetaCPAN::Web',
+    path => $root_dir,
 );
 
-MetaCPAN::Sitemap->new($_)->process for @parts;
+my $out_dir = "$root_dir/root/static/sitemaps/";
+mkdir $out_dir;
+
+my $web_host = $config->{web_host};
+$web_host =~ s{/\z}{};
+my $sitemaps = $config->{sitemap};
+
+for my $file ( sort keys %$sitemaps ) {
+    my %sm_config = %{ $sitemaps->{$file} };
+    my $full_file = $out_dir . $file;
+    $sm_config{url_prefix} ||= do {
+        my $metacpan_url = $sm_config{metacpan_url};
+        s{/\z}{}, s{\A/}{} for $metacpan_url;
+        "$web_host/$metacpan_url/";
+    };
+    $sm_config{api_secure} = $config->{api_secure};
+    my $sitemap = MetaCPAN::Sitemap->new(%sm_config);
+    $sitemap->write($full_file);
+}
diff --git a/cpanfile b/cpanfile
index 341157c6a6..687ba237fe 100644
--- a/cpanfile
+++ b/cpanfile
@@ -59,7 +59,6 @@ requires 'Moo', '2.000002';
 requires 'Moose', '2.1605';
 requires 'MooseX::Fastly::Role', '0.03';
 requires 'MooseX::Role::Parameterized', '1.02';
-requires 'MooseX::StrictConstructor';
 requires 'MooseX::Types::Common::Numeric';
 requires 'MooseX::Types::Common::String';
 requires 'MooseX::Types::Moose';
@@ -68,7 +67,6 @@ requires 'Net::Async::HTTP';
 requires 'Net::Fastly', '1.05';
 requires 'Params::ValidationCompiler';
 requires 'Path::Tiny', '0.076';
-requires 'PerlIO::gzip';
 requires 'Plack', '1.0039';
 requires 'Plack::Middleware::ReverseProxy';
 requires 'Plack::Middleware::Runtime';
@@ -95,7 +93,6 @@ requires 'Try::Tiny', '0.24';
 requires 'URI', '1.71';
 requires 'URI::Escape';
 requires 'XML::Feed';
-requires 'XML::Simple';
 requires 'YAML', '1.15';    # fix dep chain issue
 
 test_requires 'App::Prove';
diff --git a/lib/MetaCPAN/Middleware/Static.pm b/lib/MetaCPAN/Middleware/Static.pm
index 4ca88ed0ae..cfc48cc507 100644
--- a/lib/MetaCPAN/Middleware/Static.pm
+++ b/lib/MetaCPAN/Middleware/Static.pm
@@ -82,6 +82,11 @@ sub wrap {
         };
     }
 
+    mount '/sitemap-authors.xml.gz' => Plack::App::File->new(
+        file => 'root/static/sitemaps/sitemap-authors.xml.gz' )->to_app;
+    mount '/sitemap-releases.xml.gz' => Plack::App::File->new(
+        file => 'root/static/sitemaps/sitemap-releases.xml.gz' )->to_app;
+
     mount '/favicon.ico' =>
         Plack::App::File->new( file => 'root/static/icons/favicon.ico' )
         ->to_app;
diff --git a/lib/MetaCPAN/Sitemap.pm b/lib/MetaCPAN/Sitemap.pm
index 7fe589f798..fd9c4d6e91 100644
--- a/lib/MetaCPAN/Sitemap.pm
+++ b/lib/MetaCPAN/Sitemap.pm
@@ -1,41 +1,42 @@
 package MetaCPAN::Sitemap;
-
-=head1 DESCRIPTION
-
-Generate an XML file containing URLs use by the robots.txt Sitemap. We use this
-module to generate one each for authors, modules and releases.
-
-=cut
-
 use strict;
 use warnings;
-use MetaCPAN::Moose;
-
-use autodie;
-
-use Carp;
-use Search::Elasticsearch;
-use File::Spec;
-use MetaCPAN::Web::Types qw( HashRef Int Str );
-use MooseX::StrictConstructor;
-use PerlIO::gzip;
-use XML::Simple qw(:strict);
-
-has [ 'cpan_directory', 'object_type', 'field_name', 'xml_file', ] => (
-    is       => 'ro',
-    isa      => Str,
-    required => 1,
-);
-
-has 'filter' => (
-    is  => 'ro',
-    isa => HashRef,
+use IO::Socket::SSL qw(SSL_VERIFY_PEER);
+use IO::Async::Loop;
+use IO::Async::SSL;
+use Net::Async::HTTP;
+use Cpanel::JSON::XS;
+use IO::Compress::Gzip;
+use HTML::Entities qw(encode_entities_numeric);
+
+use Moo;
+
+has api_secure  => ( is => 'ro', required => 1 );
+has url_prefix  => ( is => 'ro', required => 1 );
+has object_type => ( is => 'ro', required => 1 );
+has field_name  => ( is => 'ro', required => 1 );
+has filter      => ( is => 'ro' );
+has size        => ( is => 'ro', default => 1000 );
+has loop => ( is => 'lazy', default => sub { IO::Async::Loop->new } );
+has ua => (
+    is      => 'lazy',
+    default => sub {
+        my $self = shift;
+        my $http = Net::Async::HTTP->new(
+            user_agent =>
+                'MetaCPAN-Web/1.0 (https://github.com/metacpan/metacpan-web)',
+            max_connections_per_host => 5,
+            SSL_verify_mode          => SSL_VERIFY_PEER,
+            timeout                  => 10,
+        );
+        $self->loop->add($http);
+        $http;
+    }
 );
 
-has 'size' => (
-    is  => 'ro',
-    isa => Int,
-);
+sub DEMOLISH {
+    $_[0]->ua->remove_from_parent;
+}
 
 # Mandatory arguments to this function are
 # [] search object_type (author and release)
@@ -48,82 +49,72 @@ has 'size' => (
 # [] filter - contains filter for a field that also needs to be included in
 #    the list of form fields.
 
-sub process {
-    my $self = shift;
-
-    # Check that a) the directory where the output file wants to be does
-    # actually exist and b) the directory itself is writeable.
-
-    # Get started. Create the ES object and the scrolled search object.
-    # XXX Remove this hardcoded URL
-    my $es = Search::Elasticsearch->new(
-        cxn_pool         => 'Static::NoPing',
-        nodes            => ['https://fastapi.metacpan.org'],
-        send_get_body_as => 'POST',
-    );
-
-    my $field_name = $self->field_name;
-
-    # Start off with standard search parameters ..
-
-    my %search_parameters = (
-        index  => 'v1',
-        size   => 5000,
-        type   => $self->object_type,
-        fields => [$field_name],
-    );
-
-    # ..and augment them if necesary.
-
-    if ( $self->filter ) {
+my $json = Cpanel::JSON::XS->new->utf8->canonical;
 
-        # Copy the filter over wholesale into the search parameters, and add
-        # the filter fields to the field list.
-
-        $search_parameters{'body'}{'query'}{'match'} = $self->filter;
-        push @{ $search_parameters{'fields'} }, keys %{ $self->filter };
-    }
-
-    my $scrolled_search = $es->scroll_helper(%search_parameters);
-
-    # Open the output file, get ready to pump out the XML.
-
-    open my $fh, '>:gzip', $self->xml_file;
-
-    my @urls;
-    my $metacpan_url = q{};
-    if ( $self->cpan_directory ) {
-        $metacpan_url
-            = 'https://metacpan.org/' . $self->cpan_directory . q{/};
+sub _request {
+    my ( $self, $content, $cb ) = @_;
+    my $url          = $self->api_secure . '/';
+    my $content_type = 'text/plain';
+    if ( ref $content ) {
+        $url .= $self->object_type . '/';
+        $content_type = 'application/json';
+        $content      = $json->encode($content);
     }
+    $url .= '_search/scroll?scroll=1m&size=' . $self->size;
+    $self->ua->POST( $url, $content, content_type => $content_type, )->then(
+        sub {
+            my $response = shift;
+            my $content  = $json->decode( $response->content );
+            return Future->done
+                if !@{ $content->{hits}{hits} };
+            $cb->( $content->{hits}{hits} );
+            return $self->_request( $content->{_scroll_id}, $cb );
+        }
+    );
+}
 
-    while ( $scrolled_search->refill_buffer ) {
-        push @urls,
-            map { $metacpan_url . $_->{'fields'}->{$field_name} }
-            $scrolled_search->drain_buffer;
-    }
+sub write {
+    my ( $self, $file ) = @_;
 
-    $_ = $_ . q{ } for @urls;
+    my $fh = IO::Compress::Gzip->new( $file . '.new' );
+    $fh->print(<<'END_XML_HEADER');
+<?xml version="1.0" encoding="UTF-8"?>
+<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
+END_XML_HEADER
 
-    $self->{size} = @urls;
-    XMLout(
+    $self->_request(
         {
-            'xmlns'     => 'http://www.sitemaps.org/schemas/sitemap/0.9',
-            'xmlns:xsi' => 'http://www.w3.org/2001/XMLSchema-instance',
-            'xsi:schemaLocation' =>
-                'http://www.sitemaps.org/schemas/sitemap/0.9 http://www.sitemaps.org/schemas/sitemap/0.9/siteindex.xsd',
-            'url' => [ sort @urls ],
+            fields => [ $self->field_name ],
+            query  => { match_all => {} },
+            ( $self->filter ? ( filter => $self->filter ) : () ),
+            sort => [ $self->field_name ],
         },
-        'KeyAttr'    => [],
-        'RootName'   => 'urlset',
-        'XMLDecl'    => q/<?xml version="1.0" encoding="UTF-8"?>/,
-        'OutputFile' => $fh,
-    );
-
-    close $fh;
+        sub {
+            my $hits = shift;
+            for my $hit (@$hits) {
+                my $link_field = $hit->{fields}{ $self->field_name };
+                $link_field = $link_field->[0] if ref $link_field;
+                my $url = $self->url_prefix . $link_field;
+                $fh->print( "  <url><loc>"
+                        . encode_entities_numeric($url)
+                        . "</loc></url>\n" );
+            }
+        }
+    )->get;
+    $fh->print("</urlset>\n");
+    $fh->close;
+    rename "$file.new", "$file";
     return;
 }
 
-__PACKAGE__->meta->make_immutable;
-
 1;
+__END__
+
+=head1 DESCRIPTION
+
+Generate an XML file containing URLs used by the robots.txt Sitemap. We use
+this module to generate one each for authors and releases.
+
+=cut
diff --git a/metacpan_web.conf b/metacpan_web.conf
index b5425a5ce5..a3dd0edd54 100644
--- a/metacpan_web.conf
+++ b/metacpan_web.conf
@@ -36,3 +36,19 @@ mark_unauthorized_releases = 0
 <recaptcha>
     public_key 6LeH2MsSAAAAANwz3AA73Gw5OjCVjT6I51Ev-ior
 </recaptcha>
+
+<sitemap>
+    <sitemap-authors.xml.gz>
+        object_type  = author
+        field_name   = pauseid
+        metacpan_url = author
+    </sitemap-authors.xml.gz>
+    <sitemap-releases.xml.gz>
+        object_type  = release
+        field_name   = distribution
+        metacpan_url = release
+        <filter>
+            status = latest
+        </filter>
+    </sitemap-releases.xml.gz>
+</sitemap>
diff --git a/root/robots.txt b/root/robots.txt
index 28938f5fa3..61b426cc94 100644
--- a/root/robots.txt
+++ b/root/robots.txt
@@ -9,5 +9,5 @@ Disallow: /raw/
 # Do not allow changing the default per page as is not useful
 Disallow: /*?*size=*
 
-Sitemap: https://metacpan.org/static/sitemaps/authors.xml.gz
-Sitemap: https://metacpan.org/static/sitemaps/releases.xml.gz
+Sitemap: https://metacpan.org/sitemap-authors.xml.gz
+Sitemap: https://metacpan.org/sitemap-releases.xml.gz
diff --git a/t/metacpan/sitemap.t b/t/metacpan/sitemap.t
deleted file mode 100644
index 00f6b04ee5..0000000000
--- a/t/metacpan/sitemap.t
+++ /dev/null
@@ -1,82 +0,0 @@
-use strict;
-use warnings;
-
-use File::Temp qw/ tempdir /;
-use MetaCPAN::Sitemap;
-use Test::More;
-use Try::Tiny;
-use XML::Simple;
-
-# Test each of the three things that the production script is going to do,
-# but limit the searches to a single chunk of 250 results to speed things
-# along.
-
-my @tests = (
-    {
-        inputs => {
-            object_type    => 'author',
-            field_name     => 'pauseid',
-            xml_file       => '',
-            cpan_directory => 'author',
-        },
-        pattern => qr{https:.+/author/[a-z0-9A-Z-]+},
-    },
-    {
-        inputs => {
-            object_type    => 'release',
-            field_name     => 'distribution',
-            xml_file       => '',
-            cpan_directory => 'release',
-            filter         => { status => 'latest' },
-        },
-        pattern => qr{https?:.+/release/[a-z0-9A-Z-]+},
-    }
-);
-
-my $temp_dir = tempdir( CLEANUP => 1 );
-
-foreach my $test (@tests) {
-
-    # Generate the XML file into a file in a temporary directory, then
-    # check that the file exists, is valid XML, and has the right number
-    # of URLs.
-
-    my $args = $test->{inputs};
-    $args->{size} = 250;
-    $args->{xml_file} = File::Spec->catfile( $temp_dir,
-        "$test->{inputs}{object_type}.xml.gz" );
-    my $sitemap = MetaCPAN::Sitemap->new($args);
-    $sitemap->process();
-
-    ok( -e $args->{xml_file},
-        "XML output file for $args->{object_type} exists" );
-
-    open( my $xml_fh, '<:gzip', $args->{xml_file} );
-
-    my $xml = XMLin($xml_fh);
-    ok( defined $xml, "XML for $args->{object_type} checks out" );
-
-    ok( @{ $xml->{url} }, 'We have some URLs to look at' );
-    is(
-        $sitemap->{size},
-        scalar @{ $xml->{url} },
-        "Number of URLs is correct"
-    );
-
-    # Check that each of the urls has the right pattern.
-
-    note 'Checking urls';
-    my $url_tests;
-    foreach my $url ( @{ $xml->{url} } ) {
-
-        # Test that the url matches
-        # but only print a TAP line for the first test or if there's a failure.
-        # ~30,000 tests is a lot of output to sift through.
-        if ( !$url_tests++ || $url !~ $test->{pattern} ) {
-            like( $url, $test->{pattern}, 'URL matches' );
-        }
-    }
-    ok( $url_tests, "Tested $url_tests urls" );
-}
-
-done_testing();