rewrite of sitemap generation

haarg committed Jun 12, 2017
1 parent 02ae623 commit cfb0778
Showing 7 changed files with 149 additions and 218 deletions.
64 changes: 34 additions & 30 deletions bin/generate_sitemap.pl
@@ -5,37 +5,41 @@
 use strict;
 use warnings;
 
-use FindBin qw ($Bin);
-use lib "$Bin/../lib";
+use File::Basename;
+use File::Spec;
+use Cwd;
+use Config::ZOMG;
+
+my $root_dir;
+
+BEGIN {
+    my $bin_dir = File::Basename::dirname(__FILE__);
+    $root_dir
+        = Cwd::abs_path( File::Spec->catdir( $bin_dir, File::Spec->updir ) );
+}
+use lib "$root_dir/lib";
 
 use MetaCPAN::Sitemap;
 
-my $out_dir = "$Bin/../root/static/sitemaps/";
-mkdir $out_dir;
-
-my @parts = (
-
-    # For authors, we're looking for the pauseid, and want to build a URL
-    # with 'author' in the path.
-
-    {
-        object_type    => 'author',
-        field_name     => 'pauseid',
-        xml_file       => "$out_dir/authors.xml.gz",
-        cpan_directory => 'author',
-    },
-
-    # For releases, we're looking for a download URL; since we're not
-    # building a URL, the cpan_directory is missing, but we also want to
-    # filter on only the 'latest' entries.
-
-    {
-        object_type    => 'release',
-        field_name     => 'distribution',
-        xml_file       => "$out_dir/releases.xml.gz",
-        cpan_directory => 'release',
-        filter         => { status => 'latest' },
-    }
-);
-
-MetaCPAN::Sitemap->new($_)->process for @parts;
+my $config = Config::ZOMG->open(
+    name => 'MetaCPAN::Web',
+    path => $root_dir,
+);
+
+my $out_dir = "$root_dir/root/static/sitemaps/";
+mkdir $out_dir;
+
+my $web_host = $config->{web_host};
+$web_host =~ s{/\z}{};
+my $sitemaps = $config->{sitemap};
+
+for my $file ( sort keys %$sitemaps ) {
+    my %sm_config = %{ $sitemaps->{$file} };
+    my $full_file = $out_dir . $file;
+    $sm_config{url_prefix} ||= do {
+        my $metacpan_url = $sm_config{metacpan_url};
+        s{/\z}{}, s{\A/}{} for $metacpan_url;
+        "$web_host/$metacpan_url/";
+    };
+    $sm_config{api_secure} = $config->{api_secure};
+    my $sitemap = MetaCPAN::Sitemap->new(%sm_config);
+    $sitemap->write($full_file);
+}
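Note on the driver rewrite above: the script no longer hardcodes the sitemap list; it loads <sitemap> blocks from the app config and derives each url_prefix from web_host plus metacpan_url, trimming stray slashes on both sides before joining. A standalone sketch of that normalization (the config values here are assumptions for illustration):

#!/usr/bin/env perl
# Sketch only: the url_prefix derivation from bin/generate_sitemap.pl,
# with assumed config values.
use strict;
use warnings;

my $web_host = 'https://metacpan.org/';    # assumed web_host config value
$web_host =~ s{/\z}{};                     # drop any trailing slash

my %sm_config = ( metacpan_url => '/author/' );    # assumed config value

$sm_config{url_prefix} ||= do {
    my $metacpan_url = $sm_config{metacpan_url};
    s{/\z}{}, s{\A/}{} for $metacpan_url;    # trim slashes at both ends
    "$web_host/$metacpan_url/";
};

print $sm_config{url_prefix}, "\n";    # prints: https://metacpan.org/author/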
3 changes: 0 additions & 3 deletions cpanfile
@@ -59,7 +59,6 @@ requires 'Moo', '2.000002';
 requires 'Moose', '2.1605';
 requires 'MooseX::Fastly::Role', '0.03';
 requires 'MooseX::Role::Parameterized', '1.02';
-requires 'MooseX::StrictConstructor';
 requires 'MooseX::Types::Common::Numeric';
 requires 'MooseX::Types::Common::String';
 requires 'MooseX::Types::Moose';
@@ -68,7 +67,6 @@ requires 'Net::Async::HTTP';
 requires 'Net::Fastly', '1.05';
 requires 'Params::ValidationCompiler';
 requires 'Path::Tiny', '0.076';
-requires 'PerlIO::gzip';
 requires 'Plack', '1.0039';
 requires 'Plack::Middleware::ReverseProxy';
 requires 'Plack::Middleware::Runtime';
@@ -95,7 +93,6 @@ requires 'Try::Tiny', '0.24';
 requires 'URI', '1.71';
 requires 'URI::Escape';
 requires 'XML::Feed';
-requires 'XML::Simple';
 requires 'YAML', '1.15'; # fix dep chain issue
 
 test_requires 'App::Prove';
5 changes: 5 additions & 0 deletions lib/MetaCPAN/Middleware/Static.pm
@@ -82,6 +82,11 @@ sub wrap {
         };
     }
 
+    mount '/sitemap-authors.xml.gz' => Plack::App::File->new(
+        file => 'root/static/sitemaps/sitemap-authors.xml.gz' )->to_app;
+    mount '/sitemap-releases.xml.gz' => Plack::App::File->new(
+        file => 'root/static/sitemaps/sitemap-releases.xml.gz' )->to_app;
+
     mount '/favicon.ico' =>
         Plack::App::File->new( file => 'root/static/icons/favicon.ico' )
         ->to_app;
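The two new mounts serve the pre-generated, already-gzipped files straight from disk. A minimal runnable sketch of the same Plack::App::File mount pattern (file path assumed), for trying it with plackup:

# sitemap.psgi - minimal sketch of the mount pattern above (path assumed)
use strict;
use warnings;
use Plack::Builder;
use Plack::App::File;

builder {
    # Serve one pre-built, already-gzipped sitemap straight from disk.
    mount '/sitemap-authors.xml.gz' => Plack::App::File->new(
        file => 'root/static/sitemaps/sitemap-authors.xml.gz' )->to_app;

    # Everything else 404s in this sketch.
    mount '/' => sub {
        [ 404, [ 'Content-Type' => 'text/plain' ], ['not found'] ];
    };
};

Requesting /sitemap-authors.xml.gz then returns the raw gzip bytes, which is what sitemap consumers expect.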
193 changes: 92 additions & 101 deletions lib/MetaCPAN/Sitemap.pm
@@ -1,41 +1,42 @@
 package MetaCPAN::Sitemap;
 
-=head1 DESCRIPTION
-
-Generate an XML file containing URLs use by the robots.txt Sitemap. We use this
-module to generate one each for authors, modules and releases.
-
-=cut
-
 use strict;
 use warnings;
-use MetaCPAN::Moose;
 
-use autodie;
-
-use Carp;
-use Search::Elasticsearch;
-use File::Spec;
-use MetaCPAN::Web::Types qw( HashRef Int Str );
-use MooseX::StrictConstructor;
-use PerlIO::gzip;
-use XML::Simple qw(:strict);
-
-has [ 'cpan_directory', 'object_type', 'field_name', 'xml_file', ] => (
-    is       => 'ro',
-    isa      => Str,
-    required => 1,
-);
-
-has 'filter' => (
-    is  => 'ro',
-    isa => HashRef,
-);
+use IO::Socket::SSL qw(SSL_VERIFY_PEER);
+use IO::Async::Loop;
+use IO::Async::SSL;
+use Net::Async::HTTP;
+use Cpanel::JSON::XS;
+use IO::Compress::Gzip;
+use HTML::Entities qw(encode_entities_numeric);
+
+use Moo;
+
+has api_secure  => ( is => 'ro', required => 1 );
+has url_prefix  => ( is => 'ro', required => 1 );
+has object_type => ( is => 'ro', required => 1 );
+has field_name  => ( is => 'ro', required => 1 );
+has filter      => ( is => 'ro' );
+has size        => ( is => 'ro', default => 1000 );
+has loop => ( is => 'lazy', default => sub { IO::Async::Loop->new } );
+has ua   => (
+    is      => 'lazy',
+    default => sub {
+        my $self = shift;
+        my $http = Net::Async::HTTP->new(
+            user_agent =>
+                'MetaCPAN-Web/1.0 (https://github.com/metacpan/metacpan-web)',
+            max_connections_per_host => 5,
+            SSL_verify_mode          => SSL_VERIFY_PEER,
+            timeout                  => 10,
+        );
+        $self->loop->add($http);
+        $http;
+    }
+);
 
-has 'size' => (
-    is  => 'ro',
-    isa => Int,
-);
+sub DEMOLISH {
+    $_[0]->ua->remove_from_parent;
+}
 
 # Mandatory arguments to this function are
 # [] search object_type (author and release)
@@ -48,82 +49,72 @@ has 'size' => (
 # [] filter - contains filter for a field that also needs to be included in
 #    the list of form fields.
 
-sub process {
-    my $self = shift;
-
-    # Check that a) the directory where the output file wants to be does
-    # actually exist and b) the directory itself is writeable.
-
-    # Get started. Create the ES object and the scrolled search object.
-    # XXX Remove this hardcoded URL
-    my $es = Search::Elasticsearch->new(
-        cxn_pool         => 'Static::NoPing',
-        nodes            => ['https://fastapi.metacpan.org'],
-        send_get_body_as => 'POST',
-    );
-
-    my $field_name = $self->field_name;
-
-    # Start off with standard search parameters ..
-
-    my %search_parameters = (
-        index  => 'v1',
-        size   => 5000,
-        type   => $self->object_type,
-        fields => [$field_name],
-    );
-
-    # ..and augment them if necesary.
-
-    if ( $self->filter ) {
-
-        # Copy the filter over wholesale into the search parameters, and add
-        # the filter fields to the field list.
-
-        $search_parameters{'body'}{'query'}{'match'} = $self->filter;
-        push @{ $search_parameters{'fields'} }, keys %{ $self->filter };
-    }
-
-    my $scrolled_search = $es->scroll_helper(%search_parameters);
-
-    # Open the output file, get ready to pump out the XML.
-
-    open my $fh, '>:gzip', $self->xml_file;
-
-    my @urls;
-    my $metacpan_url = q{};
-    if ( $self->cpan_directory ) {
-        $metacpan_url
-            = 'https://metacpan.org/' . $self->cpan_directory . q{/};
-    }
+my $json = Cpanel::JSON::XS->new->utf8->canonical;
 
-    while ( $scrolled_search->refill_buffer ) {
-        push @urls,
-            map { $metacpan_url . $_->{'fields'}->{$field_name} }
-            $scrolled_search->drain_buffer;
-    }
+sub _request {
+    my ( $self, $content, $cb ) = @_;
+    my $url          = $self->api_secure . '/';
+    my $content_type = 'text/plain';
+    if ( ref $content ) {
+        $url .= $self->object_type . '/';
+        $content_type = 'application/json';
+        $content      = $json->encode($content);
+    }
+    $url .= '_search/scroll?scroll=1m&size=' . $self->size;
+    $self->ua->POST( $url, $content, content_type => $content_type, )->then(
+        sub {
+            my $response = shift;
+            my $content  = $json->decode( $response->content );
+            return Future->done
+                if !@{ $content->{hits}{hits} };
+            $cb->( $content->{hits}{hits} );
+            return $self->_request( $content->{_scroll_id}, $cb );
+        }
+    );
+}
 
-    $_ = $_ . q{ } for @urls;
+sub write {
+    my ( $self, $file ) = @_;
 
-    $self->{size} = @urls;
+    my $fh = IO::Compress::Gzip->new( $file . '.new' );
+    $fh->print(<<'END_XML_HEADER');
+<?xml version='1.0' encoding='UTF-8'?>
+<urlset xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+    xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"
+    xsi:schemaLocation="http://www.sitemaps.org/schemas/sitemap/0.9 http://www.sitemaps.org/schemas/sitemap/0.9/sitemap.xsd">
+END_XML_HEADER
 
-    XMLout(
+    $self->_request(
         {
-            'xmlns'     => 'http://www.sitemaps.org/schemas/sitemap/0.9',
-            'xmlns:xsi' => 'http://www.w3.org/2001/XMLSchema-instance',
-            'xsi:schemaLocation' =>
-                'http://www.sitemaps.org/schemas/sitemap/0.9 http://www.sitemaps.org/schemas/sitemap/0.9/siteindex.xsd',
-            'url' => [ sort @urls ],
+            fields => [ $self->field_name ],
+            query  => { match_all => {} },
+            ( $self->filter ? ( filter => $self->filter ) : () ),
+            sort => [ $self->field_name ],
         },
-        'KeyAttr'    => [],
-        'RootName'   => 'urlset',
-        'XMLDecl'    => q/<?xml version='1.0' encoding='UTF-8'?>/,
-        'OutputFile' => $fh,
-    );
-
-    close $fh;
+        sub {
+            my $hits = shift;
+            for my $hit (@$hits) {
+                my $link_field = $hit->{fields}{ $self->field_name };
+                $link_field = $link_field->[0] if ref $link_field;
+                my $url = $self->url_prefix . $link_field;
+                $fh->print( " <url><loc>"
+                        . encode_entities_numeric($url)
+                        . "</loc></url>\n" );
+            }
+        }
+    )->get;
+    $fh->print("</urlset>\n");
+    $fh->close;
+    rename "$file.new", "$file";
+    return;
 }
 
-__PACKAGE__->meta->make_immutable;
-
 1;
+__END__
+
+=head1 DESCRIPTION
+
+Generate an XML file containing URLs use by the robots.txt Sitemap. We use this
+module to generate one each for authors, modules and releases.
+
+=cut
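The heart of the rewrite is _request, which pages through Elasticsearch's scroll API by chaining futures: the first POST sends a JSON query to <api_secure>/<object_type>/_search/scroll?scroll=1m&size=N, and each follow-up POSTs the returned _scroll_id as text/plain until a page comes back with no hits. A synchronous sketch of the same round trips using HTTP::Tiny (the endpoint, type, and field are assumptions for illustration):

#!/usr/bin/env perl
# Synchronous sketch of the scroll loop in _request; endpoint, type, and
# field are assumptions for illustration.
use strict;
use warnings;
use HTTP::Tiny;
use JSON::PP;

my $json = JSON::PP->new->utf8->canonical;
my $ua   = HTTP::Tiny->new;
my $api  = 'https://fastapi.metacpan.org';    # assumed api_secure value

# First request: a JSON body opens the scroll on the object type.
my $res = $ua->post(
    "$api/author/_search/scroll?scroll=1m&size=1000",
    {
        headers => { 'Content-Type' => 'application/json' },
        content => $json->encode( {
            fields => ['pauseid'],
            query  => { match_all => {} },
            sort   => ['pauseid'],
        } ),
    }
);

while ( $res->{success} ) {
    my $page = $json->decode( $res->{content} );
    my $hits = $page->{hits}{hits};
    last unless @$hits;    # an empty page ends the scroll

    print scalar @$hits, " hits in this page\n";

    # Follow-up requests post only the scroll id, as plain text.
    $res = $ua->post(
        "$api/_search/scroll?scroll=1m&size=1000",
        {
            headers => { 'Content-Type' => 'text/plain' },
            content => $page->{_scroll_id},
        }
    );
}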
16 changes: 16 additions & 0 deletions metacpan_web.conf
@@ -36,3 +36,19 @@ mark_unauthorized_releases = 0
     public_key 6LeH2MsSAAAAANwz3AA73Gw5OjCVjT6I51Ev-ior
 </controller>
 
+<sitemap sitemap-authors.xml.gz>
+    object_type  = author
+    field_name   = pauseid
+    metacpan_url = author
+</sitemap>
+
+<sitemap sitemap-releases.xml.gz>
+    object_type  = release
+    field_name   = distribution
+    metacpan_url = release
+    <filter>
+        <term>
+            status = latest
+        </term>
+    </filter>
+</sitemap>
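For reference, Config::ZOMG should parse these Config::General-style blocks into a hash keyed by output file name, which is what the driver's $config->{sitemap} loop iterates. My assumption of the parsed shape:

use strict;
use warnings;

# Assumed shape of $config->{sitemap} after parsing the blocks above;
# bin/generate_sitemap.pl passes each inner hash to MetaCPAN::Sitemap->new.
my $sitemaps = {
    'sitemap-authors.xml.gz' => {
        object_type  => 'author',
        field_name   => 'pauseid',
        metacpan_url => 'author',
    },
    'sitemap-releases.xml.gz' => {
        object_type  => 'release',
        field_name   => 'distribution',
        metacpan_url => 'release',
        filter       => { term => { status => 'latest' } },
    },
};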
4 changes: 2 additions & 2 deletions root/robots.txt
@@ -9,5 +9,5 @@ Disallow: /raw/
 # Do not allow changing the default per page as is not useful
 Disallow: /*?*size=*
 
-Sitemap: https://metacpan.org/static/sitemaps/authors.xml.gz
-Sitemap: https://metacpan.org/static/sitemaps/releases.xml.gz
+Sitemap: https://metacpan.org/sitemap-authors.xml.gz
+Sitemap: https://metacpan.org/sitemap-releases.xml.gz
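Crawlers honoring these directives fetch the gzipped files served by the new Static.pm mounts. A sketch for inspecting one locally (path assumed), with the expected output shape, per write() above, noted in comments:

#!/usr/bin/env perl
# Sketch: gunzip and print a generated sitemap (path assumed).
use strict;
use warnings;
use IO::Uncompress::Gunzip qw(gunzip $GunzipError);

gunzip 'root/static/sitemaps/sitemap-authors.xml.gz' => \my $xml
    or die "gunzip failed: $GunzipError";
print $xml;

# Expected shape, given the heredoc header and per-hit lines in write()
# (the PAUSE id is an arbitrary example):
# <?xml version='1.0' encoding='UTF-8'?>
# <urlset xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
#     xmlns="http://www.sitemaps.org/schemas/sitemap/0.9" ...>
#  <url><loc>https://metacpan.org/author/HAARG</loc></url>
#  ...
# </urlset>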
