diff --git a/bin/generate_sitemap.pl b/bin/generate_sitemap.pl
index 0905f1df89..2decbb6af4 100755
--- a/bin/generate_sitemap.pl
+++ b/bin/generate_sitemap.pl
@@ -5,37 +5,41 @@
use strict;
use warnings;
-use FindBin qw ($Bin);
-use lib "$Bin/../lib";
-
+use File::Basename;
+use File::Spec;
+use Cwd;
+use Config::ZOMG;
+my $root_dir;
+
+BEGIN {
+ my $bin_dir = File::Basename::dirname(__FILE__);
+ $root_dir
+ = Cwd::abs_path( File::Spec->catdir( $bin_dir, File::Spec->updir ) );
+}
+use lib "$root_dir/lib";
use MetaCPAN::Sitemap;
-my $out_dir = "$Bin/../root/static/sitemaps/";
-mkdir $out_dir;
-
-my @parts = (
-
- # For authors, we're looking for the pauseid, and want to build a URL
- # with 'author' in the path.
-
- {
- object_type => 'author',
- field_name => 'pauseid',
- xml_file => "$out_dir/authors.xml.gz",
- cpan_directory => 'author',
- },
-
- # For releases, we're looking for a download URL; since we're not
- # building a URL, the cpan_directory is missing, but we also want to
- # filter on only the 'latest' entries.
-
- {
- object_type => 'release',
- field_name => 'distribution',
- xml_file => "$out_dir/releases.xml.gz",
- cpan_directory => 'release',
- filter => { status => 'latest' },
- }
+my $config = Config::ZOMG->open(
+ name => 'MetaCPAN::Web',
+ path => $root_dir,
);
-MetaCPAN::Sitemap->new($_)->process for @parts;
+my $out_dir = "$root_dir/root/static/sitemaps/";
+mkdir $out_dir;
+
+my $web_host = $config->{web_host};
+$web_host =~ s{/\z}{};
+my $sitemaps = $config->{sitemap};
+
+for my $file ( sort keys %$sitemaps ) {
+ my %sm_config = %{ $sitemaps->{$file} };
+ my $full_file = $out_dir . $file;
+ $sm_config{url_prefix} ||= do {
+ my $metacpan_url = $sm_config{metacpan_url};
+ s{/\z}{}, s{\A/}{} for $metacpan_url;
+ "$web_host/$metacpan_url/";
+ };
+ $sm_config{api_secure} = $config->{api_secure};
+ my $sitemap = MetaCPAN::Sitemap->new(%sm_config);
+ $sitemap->write($full_file);
+}
diff --git a/cpanfile b/cpanfile
index 341157c6a6..687ba237fe 100644
--- a/cpanfile
+++ b/cpanfile
@@ -59,7 +59,6 @@ requires 'Moo', '2.000002';
requires 'Moose', '2.1605';
requires 'MooseX::Fastly::Role', '0.03';
requires 'MooseX::Role::Parameterized', '1.02';
-requires 'MooseX::StrictConstructor';
requires 'MooseX::Types::Common::Numeric';
requires 'MooseX::Types::Common::String';
requires 'MooseX::Types::Moose';
@@ -68,7 +67,6 @@ requires 'Net::Async::HTTP';
requires 'Net::Fastly', '1.05';
requires 'Params::ValidationCompiler';
requires 'Path::Tiny', '0.076';
-requires 'PerlIO::gzip';
requires 'Plack', '1.0039';
requires 'Plack::Middleware::ReverseProxy';
requires 'Plack::Middleware::Runtime';
@@ -95,7 +93,6 @@ requires 'Try::Tiny', '0.24';
requires 'URI', '1.71';
requires 'URI::Escape';
requires 'XML::Feed';
-requires 'XML::Simple';
requires 'YAML', '1.15'; # fix dep chain issue
test_requires 'App::Prove';
diff --git a/lib/MetaCPAN/Middleware/Static.pm b/lib/MetaCPAN/Middleware/Static.pm
index 4ca88ed0ae..cfc48cc507 100644
--- a/lib/MetaCPAN/Middleware/Static.pm
+++ b/lib/MetaCPAN/Middleware/Static.pm
@@ -82,6 +82,11 @@ sub wrap {
};
}
+ mount '/sitemap-authors.xml.gz' => Plack::App::File->new(
+ file => 'root/static/sitemaps/sitemap-authors.xml.gz' )->to_app;
+ mount '/sitemap-releases.xml.gz' => Plack::App::File->new(
+ file => 'root/static/sitemaps/sitemap-releases.xml.gz' )->to_app;
+
mount '/favicon.ico' =>
Plack::App::File->new( file => 'root/static/icons/favicon.ico' )
->to_app;
diff --git a/lib/MetaCPAN/Sitemap.pm b/lib/MetaCPAN/Sitemap.pm
index 7fe589f798..fd9c4d6e91 100644
--- a/lib/MetaCPAN/Sitemap.pm
+++ b/lib/MetaCPAN/Sitemap.pm
@@ -1,41 +1,42 @@
package MetaCPAN::Sitemap;
-
-=head1 DESCRIPTION
-
-Generate an XML file containing URLs use by the robots.txt Sitemap. We use this
-module to generate one each for authors, modules and releases.
-
-=cut
-
use strict;
use warnings;
-use MetaCPAN::Moose;
-
-use autodie;
-
-use Carp;
-use Search::Elasticsearch;
-use File::Spec;
-use MetaCPAN::Web::Types qw( HashRef Int Str );
-use MooseX::StrictConstructor;
-use PerlIO::gzip;
-use XML::Simple qw(:strict);
-
-has [ 'cpan_directory', 'object_type', 'field_name', 'xml_file', ] => (
- is => 'ro',
- isa => Str,
- required => 1,
-);
-
-has 'filter' => (
- is => 'ro',
- isa => HashRef,
+use IO::Socket::SSL qw(SSL_VERIFY_PEER);
+use IO::Async::Loop;
+use IO::Async::SSL;
+use Net::Async::HTTP;
+use Cpanel::JSON::XS;
+use IO::Compress::Gzip;
+use HTML::Entities qw(encode_entities_numeric);
+
+use Moo;
+
+has api_secure => ( is => 'ro', required => 1 );
+has url_prefix => ( is => 'ro', required => 1 );
+has object_type => ( is => 'ro', required => 1 );
+has field_name => ( is => 'ro', required => 1 );
+has filter => ( is => 'ro' );
+has size => ( is => 'ro', default => 1000 );
+has loop => ( is => 'lazy', default => sub { IO::Async::Loop->new } );
+has ua => (
+ is => 'lazy',
+ default => sub {
+ my $self = shift;
+ my $http = Net::Async::HTTP->new(
+ user_agent =>
+ 'MetaCPAN-Web/1.0 (https://github.com/metacpan/metacpan-web)',
+ max_connections_per_host => 5,
+ SSL_verify_mode => SSL_VERIFY_PEER,
+ timeout => 10,
+ );
+ $self->loop->add($http);
+ $http;
+ }
);
-has 'size' => (
- is => 'ro',
- isa => Int,
-);
+sub DEMOLISH {
+ $_[0]->ua->remove_from_parent;
+}
# Mandatory arguments to this function are
# [] search object_type (author and release)
@@ -48,82 +49,72 @@ has 'size' => (
# [] filter - contains filter for a field that also needs to be included in
# the list of form fields.
-sub process {
- my $self = shift;
-
- # Check that a) the directory where the output file wants to be does
- # actually exist and b) the directory itself is writeable.
-
- # Get started. Create the ES object and the scrolled search object.
- # XXX Remove this hardcoded URL
- my $es = Search::Elasticsearch->new(
- cxn_pool => 'Static::NoPing',
- nodes => ['https://fastapi.metacpan.org'],
- send_get_body_as => 'POST',
- );
-
- my $field_name = $self->field_name;
-
- # Start off with standard search parameters ..
-
- my %search_parameters = (
- index => 'v1',
- size => 5000,
- type => $self->object_type,
- fields => [$field_name],
- );
-
- # ..and augment them if necesary.
-
- if ( $self->filter ) {
+my $json = Cpanel::JSON::XS->new->utf8->canonical;
- # Copy the filter over wholesale into the search parameters, and add
- # the filter fields to the field list.
-
- $search_parameters{'body'}{'query'}{'match'} = $self->filter;
- push @{ $search_parameters{'fields'} }, keys %{ $self->filter };
- }
-
- my $scrolled_search = $es->scroll_helper(%search_parameters);
-
- # Open the output file, get ready to pump out the XML.
-
- open my $fh, '>:gzip', $self->xml_file;
-
- my @urls;
- my $metacpan_url = q{};
- if ( $self->cpan_directory ) {
- $metacpan_url
- = 'https://metacpan.org/' . $self->cpan_directory . q{/};
+sub _request {
+ my ( $self, $content, $cb ) = @_;
+ my $url = $self->api_secure . '/';
+ my $content_type = 'text/plain';
+ if ( ref $content ) {
+ $url .= $self->object_type . '/';
+ $content_type = 'application/json';
+ $content = $json->encode($content);
}
+ $url .= '_search/scroll?scroll=1m&size=' . $self->size;
+ $self->ua->POST( $url, $content, content_type => $content_type, )->then(
+ sub {
+ my $response = shift;
+ my $content = $json->decode( $response->content );
+ return Future->done
+ if !@{ $content->{hits}{hits} };
+ $cb->( $content->{hits}{hits} );
+ return $self->_request( $content->{_scroll_id}, $cb );
+ }
+ );
+}
- while ( $scrolled_search->refill_buffer ) {
- push @urls,
- map { $metacpan_url . $_->{'fields'}->{$field_name} }
- $scrolled_search->drain_buffer;
- }
+sub write {
+ my ( $self, $file ) = @_;
- $_ = $_ . q{ } for @urls;
+ my $fh = IO::Compress::Gzip->new( $file . '.new' );
+    $fh->print(<<'END_XML_HEADER');
+<?xml version="1.0" encoding="UTF-8"?>
+<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
+END_XML_HEADER
- $self->{size} = @urls;
- XMLout(
+ $self->_request(
{
- 'xmlns' => 'http://www.sitemaps.org/schemas/sitemap/0.9',
- 'xmlns:xsi' => 'http://www.w3.org/2001/XMLSchema-instance',
- 'xsi:schemaLocation' =>
- 'http://www.sitemaps.org/schemas/sitemap/0.9 http://www.sitemaps.org/schemas/sitemap/0.9/siteindex.xsd',
- 'url' => [ sort @urls ],
+ fields => [ $self->field_name ],
+ query => { match_all => {} },
+ ( $self->filter ? ( filter => $self->filter ) : () ),
+ sort => [ $self->field_name ],
},
- 'KeyAttr' => [],
- 'RootName' => 'urlset',
- 'XMLDecl' => q//,
- 'OutputFile' => $fh,
- );
-
- close $fh;
+ sub {
+ my $hits = shift;
+ for my $hit (@$hits) {
+ my $link_field = $hit->{fields}{ $self->field_name };
+ $link_field = $link_field->[0] if ref $link_field;
+ my $url = $self->url_prefix . $link_field;
+                $fh->print( "  <url><loc>"
+                        . encode_entities_numeric($url)
+                        . "</loc></url>\n" );
+ }
+ }
+ )->get;
+    $fh->print("</urlset>\n");
+ $fh->close;
+ rename "$file.new", "$file";
return;
}
-__PACKAGE__->meta->make_immutable;
-
1;
+__END__
+
+=head1 DESCRIPTION
+
+Generate an XML file containing URLs use by the robots.txt Sitemap. We use this
+module to generate one each for authors, modules and releases.
+
+=cut
diff --git a/metacpan_web.conf b/metacpan_web.conf
index b5425a5ce5..a3dd0edd54 100644
--- a/metacpan_web.conf
+++ b/metacpan_web.conf
@@ -36,3 +36,19 @@ mark_unauthorized_releases = 0
public_key 6LeH2MsSAAAAANwz3AA73Gw5OjCVjT6I51Ev-ior
+
+<sitemap>
+    <sitemap-authors.xml.gz>
+        object_type  = author
+        field_name   = pauseid
+        metacpan_url = author
+    </sitemap-authors.xml.gz>
+
+    <sitemap-releases.xml.gz>
+        object_type  = release
+        field_name   = distribution
+        metacpan_url = release
+        <filter>
+            status = latest
+        </filter>
+    </sitemap-releases.xml.gz>
+</sitemap>
diff --git a/root/robots.txt b/root/robots.txt
index 28938f5fa3..61b426cc94 100644
--- a/root/robots.txt
+++ b/root/robots.txt
@@ -9,5 +9,5 @@ Disallow: /raw/
# Do not allow changing the default per page as is not useful
Disallow: /*?*size=*
-Sitemap: https://metacpan.org/static/sitemaps/authors.xml.gz
-Sitemap: https://metacpan.org/static/sitemaps/releases.xml.gz
+Sitemap: https://metacpan.org/sitemap-authors.xml.gz
+Sitemap: https://metacpan.org/sitemap-releases.xml.gz
diff --git a/t/metacpan/sitemap.t b/t/metacpan/sitemap.t
deleted file mode 100644
index 00f6b04ee5..0000000000
--- a/t/metacpan/sitemap.t
+++ /dev/null
@@ -1,82 +0,0 @@
-use strict;
-use warnings;
-
-use File::Temp qw/ tempdir /;
-use MetaCPAN::Sitemap;
-use Test::More;
-use Try::Tiny;
-use XML::Simple;
-
-# Test each of the three things that the production script is going to do,
-# but limit the searches to a single chunk of 250 results to speed things
-# along.
-
-my @tests = (
- {
- inputs => {
- object_type => 'author',
- field_name => 'pauseid',
- xml_file => '',
- cpan_directory => 'author',
- },
- pattern => qr{https:.+/author/[a-z0-9A-Z-]+},
- },
- {
- inputs => {
- object_type => 'release',
- field_name => 'distribution',
- xml_file => '',
- cpan_directory => 'release',
- filter => { status => 'latest' },
- },
- pattern => qr{https?:.+/release/[a-z0-9A-Z-]+},
- }
-);
-
-my $temp_dir = tempdir( CLEANUP => 1 );
-
-foreach my $test (@tests) {
-
- # Generate the XML file into a file in a temporary directory, then
- # check that the file exists, is valid XML, and has the right number
- # of URLs.
-
- my $args = $test->{inputs};
- $args->{size} = 250;
- $args->{xml_file} = File::Spec->catfile( $temp_dir,
- "$test->{inputs}{object_type}.xml.gz" );
- my $sitemap = MetaCPAN::Sitemap->new($args);
- $sitemap->process();
-
- ok( -e $args->{xml_file},
- "XML output file for $args->{object_type} exists" );
-
- open( my $xml_fh, '<:gzip', $args->{xml_file} );
-
- my $xml = XMLin($xml_fh);
- ok( defined $xml, "XML for $args->{object_type} checks out" );
-
- ok( @{ $xml->{url} }, 'We have some URLs to look at' );
- is(
- $sitemap->{size},
- scalar @{ $xml->{url} },
- "Number of URLs is correct"
- );
-
- # Check that each of the urls has the right pattern.
-
- note 'Checking urls';
- my $url_tests;
- foreach my $url ( @{ $xml->{url} } ) {
-
- # Test that the url matches
- # but only print a TAP line for the first test or if there's a failure.
- # ~30,000 tests is a lot of output to sift through.
- if ( !$url_tests++ || $url !~ $test->{pattern} ) {
- like( $url, $test->{pattern}, 'URL matches' );
- }
- }
- ok( $url_tests, "Tested $url_tests urls" );
-}
-
-done_testing();