rewrite of sitemap generation

haarg committed Jun 12, 2017
1 parent 02ae623 commit cfb0778
Showing 7 changed files with 149 additions and 218 deletions.
64 changes: 34 additions & 30 deletions bin/generate_sitemap.pl
@@ -5,37 +5,41 @@
 use strict;
 use warnings;
 
-use FindBin qw ($Bin);
-use lib "$Bin/../lib";
+use File::Basename;
+use File::Spec;
+use Cwd;
+use Config::ZOMG;
+
+my $root_dir;
+
+BEGIN {
+    my $bin_dir = File::Basename::dirname(__FILE__);
+    $root_dir
+        = Cwd::abs_path( File::Spec->catdir( $bin_dir, File::Spec->updir ) );
+}
+use lib "$root_dir/lib";
 
 use MetaCPAN::Sitemap;
 
-my $out_dir = "$Bin/../root/static/sitemaps/";
-mkdir $out_dir;
-
-my @parts = (
-
-    # For authors, we're looking for the pauseid, and want to build a URL
-    # with 'author' in the path.
-
-    {
-        object_type    => 'author',
-        field_name     => 'pauseid',
-        xml_file       => "$out_dir/authors.xml.gz",
-        cpan_directory => 'author',
-    },
-
-    # For releases, we're looking for a download URL; since we're not
-    # building a URL, the cpan_directory is missing, but we also want to
-    # filter on only the 'latest' entries.
-
-    {
-        object_type    => 'release',
-        field_name     => 'distribution',
-        xml_file       => "$out_dir/releases.xml.gz",
-        cpan_directory => 'release',
-        filter         => { status => 'latest' },
-    }
-);
-
-MetaCPAN::Sitemap->new($_)->process for @parts;
+my $config = Config::ZOMG->open(
+    name => 'MetaCPAN::Web',
+    path => $root_dir,
+);
+
+my $out_dir = "$root_dir/root/static/sitemaps/";
+mkdir $out_dir;
+
+my $web_host = $config->{web_host};
+$web_host =~ s{/\z}{};
+my $sitemaps = $config->{sitemap};
+
+for my $file ( sort keys %$sitemaps ) {
+    my %sm_config = %{ $sitemaps->{$file} };
+    my $full_file = $out_dir . $file;
+    $sm_config{url_prefix} ||= do {
+        my $metacpan_url = $sm_config{metacpan_url};
+        s{/\z}{}, s{\A/}{} for $metacpan_url;
+        "$web_host/$metacpan_url/";
+    };
+    $sm_config{api_secure} = $config->{api_secure};
+    my $sitemap = MetaCPAN::Sitemap->new(%sm_config);
+    $sitemap->write($full_file);
+}
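Note on the driver rewrite above: the script no longer hardcodes the sitemap list; it loads <sitemap> blocks from the app config and derives each url_prefix from web_host plus metacpan_url, trimming stray slashes on both sides before joining. A standalone sketch of that normalization (the config values here are assumptions for illustration):

#!/usr/bin/env perl
# Sketch only: the url_prefix derivation from bin/generate_sitemap.pl,
# with assumed config values.
use strict;
use warnings;

my $web_host = 'https://metacpan.org/';    # assumed web_host config value
$web_host =~ s{/\z}{};                     # drop any trailing slash

my %sm_config = ( metacpan_url => '/author/' );    # assumed config value

$sm_config{url_prefix} ||= do {
    my $metacpan_url = $sm_config{metacpan_url};
    s{/\z}{}, s{\A/}{} for $metacpan_url;    # trim slashes at both ends
    "$web_host/$metacpan_url/";
};

print $sm_config{url_prefix}, "\n";    # prints: https://metacpan.org/author/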
3 changes: 0 additions & 3 deletions cpanfile
@@ -59,7 +59,6 @@ requires 'Moo', '2.000002';
 requires 'Moose', '2.1605';
 requires 'MooseX::Fastly::Role', '0.03';
 requires 'MooseX::Role::Parameterized', '1.02';
-requires 'MooseX::StrictConstructor';
 requires 'MooseX::Types::Common::Numeric';
 requires 'MooseX::Types::Common::String';
 requires 'MooseX::Types::Moose';
@@ -68,7 +67,6 @@ requires 'Net::Async::HTTP';
 requires 'Net::Fastly', '1.05';
 requires 'Params::ValidationCompiler';
 requires 'Path::Tiny', '0.076';
-requires 'PerlIO::gzip';
 requires 'Plack', '1.0039';
 requires 'Plack::Middleware::ReverseProxy';
 requires 'Plack::Middleware::Runtime';
@@ -95,7 +93,6 @@ requires 'Try::Tiny', '0.24';
 requires 'URI', '1.71';
 requires 'URI::Escape';
 requires 'XML::Feed';
-requires 'XML::Simple';
 requires 'YAML', '1.15'; # fix dep chain issue
 
 test_requires 'App::Prove';
5 changes: 5 additions & 0 deletions lib/MetaCPAN/Middleware/Static.pm
@@ -82,6 +82,11 @@ sub wrap {
         };
     }
 
+    mount '/sitemap-authors.xml.gz' => Plack::App::File->new(
+        file => 'root/static/sitemaps/sitemap-authors.xml.gz' )->to_app;
+    mount '/sitemap-releases.xml.gz' => Plack::App::File->new(
+        file => 'root/static/sitemaps/sitemap-releases.xml.gz' )->to_app;
+
     mount '/favicon.ico' =>
         Plack::App::File->new( file => 'root/static/icons/favicon.ico' )
         ->to_app;
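The two new mounts serve the pre-generated, already-gzipped files straight from disk. A minimal runnable sketch of the same Plack::App::File mount pattern (file path assumed), for trying it with plackup:

# sitemap.psgi - minimal sketch of the mount pattern above (path assumed)
use strict;
use warnings;
use Plack::Builder;
use Plack::App::File;

builder {
    # Serve one pre-built, already-gzipped sitemap straight from disk.
    mount '/sitemap-authors.xml.gz' => Plack::App::File->new(
        file => 'root/static/sitemaps/sitemap-authors.xml.gz' )->to_app;

    # Everything else 404s in this sketch.
    mount '/' => sub {
        [ 404, [ 'Content-Type' => 'text/plain' ], ['not found'] ];
    };
};

Requesting /sitemap-authors.xml.gz then returns the raw gzip bytes, which is what sitemap consumers expect.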
193 changes: 92 additions & 101 deletions lib/MetaCPAN/Sitemap.pm
@@ -1,41 +1,42 @@
 package MetaCPAN::Sitemap;
 
-=head1 DESCRIPTION
-
-Generate an XML file containing URLs use by the robots.txt Sitemap. We use this
-module to generate one each for authors, modules and releases.
-
-=cut
-
 use strict;
 use warnings;
-use MetaCPAN::Moose;
 
-use autodie;
-
-use Carp;
-use Search::Elasticsearch;
-use File::Spec;
-use MetaCPAN::Web::Types qw( HashRef Int Str );
-use MooseX::StrictConstructor;
-use PerlIO::gzip;
-use XML::Simple qw(:strict);
-
-has [ 'cpan_directory', 'object_type', 'field_name', 'xml_file', ] => (
-    is       => 'ro',
-    isa      => Str,
-    required => 1,
-);
-
-has 'filter' => (
-    is  => 'ro',
-    isa => HashRef,
-);
+use IO::Socket::SSL qw(SSL_VERIFY_PEER);
+use IO::Async::Loop;
+use IO::Async::SSL;
+use Net::Async::HTTP;
+use Cpanel::JSON::XS;
+use IO::Compress::Gzip;
+use HTML::Entities qw(encode_entities_numeric);
+
+use Moo;
+
+has api_secure  => ( is => 'ro', required => 1 );
+has url_prefix  => ( is => 'ro', required => 1 );
+has object_type => ( is => 'ro', required => 1 );
+has field_name  => ( is => 'ro', required => 1 );
+has filter      => ( is => 'ro' );
+has size        => ( is => 'ro', default => 1000 );
+has loop => ( is => 'lazy', default => sub { IO::Async::Loop->new } );
+has ua   => (
+    is      => 'lazy',
+    default => sub {
+        my $self = shift;
+        my $http = Net::Async::HTTP->new(
+            user_agent =>
+                'MetaCPAN-Web/1.0 (https://github.com/metacpan/metacpan-web)',
+            max_connections_per_host => 5,
+            SSL_verify_mode          => SSL_VERIFY_PEER,
+            timeout                  => 10,
+        );
+        $self->loop->add($http);
+        $http;
+    }
+);
 
-has 'size' => (
-    is  => 'ro',
-    isa => Int,
-);
+sub DEMOLISH {
+    $_[0]->ua->remove_from_parent;
+}
 
 # Mandatory arguments to this function are
 # [] search object_type (author and release)
@@ -48,82 +49,72 @@ has 'size' => (
 # [] filter - contains filter for a field that also needs to be included in
 #    the list of form fields.
 
-sub process {
-    my $self = shift;
-
-    # Check that a) the directory where the output file wants to be does
-    # actually exist and b) the directory itself is writeable.
-
-    # Get started. Create the ES object and the scrolled search object.
-    # XXX Remove this hardcoded URL
-    my $es = Search::Elasticsearch->new(
-        cxn_pool         => 'Static::NoPing',
-        nodes            => ['https://fastapi.metacpan.org'],
-        send_get_body_as => 'POST',
-    );
-
-    my $field_name = $self->field_name;
-
-    # Start off with standard search parameters ..
-
-    my %search_parameters = (
-        index  => 'v1',
-        size   => 5000,
-        type   => $self->object_type,
-        fields => [$field_name],
-    );
-
-    # ..and augment them if necesary.
-
-    if ( $self->filter ) {
-
-        # Copy the filter over wholesale into the search parameters, and add
-        # the filter fields to the field list.
-
-        $search_parameters{'body'}{'query'}{'match'} = $self->filter;
-        push @{ $search_parameters{'fields'} }, keys %{ $self->filter };
-    }
-
-    my $scrolled_search = $es->scroll_helper(%search_parameters);
-
-    # Open the output file, get ready to pump out the XML.
-
-    open my $fh, '>:gzip', $self->xml_file;
-
-    my @urls;
-    my $metacpan_url = q{};
-    if ( $self->cpan_directory ) {
-        $metacpan_url
-            = 'https://metacpan.org/' . $self->cpan_directory . q{/};
-    }
+my $json = Cpanel::JSON::XS->new->utf8->canonical;
 
-    while ( $scrolled_search->refill_buffer ) {
-        push @urls,
-            map { $metacpan_url . $_->{'fields'}->{$field_name} }
-            $scrolled_search->drain_buffer;
-    }
+sub _request {
+    my ( $self, $content, $cb ) = @_;
+    my $url          = $self->api_secure . '/';
+    my $content_type = 'text/plain';
+    if ( ref $content ) {
+        $url .= $self->object_type . '/';
+        $content_type = 'application/json';
+        $content      = $json->encode($content);
+    }
+    $url .= '_search/scroll?scroll=1m&size=' . $self->size;
+    $self->ua->POST( $url, $content, content_type => $content_type, )->then(
+        sub {
+            my $response = shift;
+            my $content  = $json->decode( $response->content );
+            return Future->done
+                if !@{ $content->{hits}{hits} };
+            $cb->( $content->{hits}{hits} );
+            return $self->_request( $content->{_scroll_id}, $cb );
+        }
+    );
+}
 
-    $_ = $_ . q{ } for @urls;
+sub write {
+    my ( $self, $file ) = @_;
 
-    $self->{size} = @urls;
+    my $fh = IO::Compress::Gzip->new( $file . '.new' );
+    $fh->print(<<'END_XML_HEADER');
+<?xml version='1.0' encoding='UTF-8'?>
+<urlset xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+    xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"
+    xsi:schemaLocation="http://www.sitemaps.org/schemas/sitemap/0.9 http://www.sitemaps.org/schemas/sitemap/0.9/sitemap.xsd">
+END_XML_HEADER
 
-    XMLout(
+    $self->_request(
         {
-            'xmlns'     => 'http://www.sitemaps.org/schemas/sitemap/0.9',
-            'xmlns:xsi' => 'http://www.w3.org/2001/XMLSchema-instance',
-            'xsi:schemaLocation' =>
-                'http://www.sitemaps.org/schemas/sitemap/0.9 http://www.sitemaps.org/schemas/sitemap/0.9/siteindex.xsd',
-            'url' => [ sort @urls ],
+            fields => [ $self->field_name ],
+            query  => { match_all => {} },
+            ( $self->filter ? ( filter => $self->filter ) : () ),
+            sort => [ $self->field_name ],
         },
-        'KeyAttr'    => [],
-        'RootName'   => 'urlset',
-        'XMLDecl'    => q/<?xml version='1.0' encoding='UTF-8'?>/,
-        'OutputFile' => $fh,
-    );
-
-    close $fh;
+        sub {
+            my $hits = shift;
+            for my $hit (@$hits) {
+                my $link_field = $hit->{fields}{ $self->field_name };
+                $link_field = $link_field->[0] if ref $link_field;
+                my $url = $self->url_prefix . $link_field;
+                $fh->print( " <url><loc>"
+                        . encode_entities_numeric($url)
+                        . "</loc></url>\n" );
+            }
+        }
+    )->get;
+    $fh->print("</urlset>\n");
+    $fh->close;
+    rename "$file.new", "$file";
+    return;
 }
 
-__PACKAGE__->meta->make_immutable;
-
 1;
+__END__
+
+=head1 DESCRIPTION
+
+Generate an XML file containing URLs use by the robots.txt Sitemap. We use this
+module to generate one each for authors, modules and releases.
+
+=cut
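The heart of the rewrite is _request, which pages through Elasticsearch's scroll API by chaining futures: the first POST sends a JSON query to <api_secure>/<object_type>/_search/scroll?scroll=1m&size=N, and each follow-up POSTs the returned _scroll_id as text/plain until a page comes back with no hits. A synchronous sketch of the same round trips using HTTP::Tiny (the endpoint, type, and field are assumptions for illustration):

#!/usr/bin/env perl
# Synchronous sketch of the scroll loop in _request; endpoint, type, and
# field are assumptions for illustration.
use strict;
use warnings;
use HTTP::Tiny;
use JSON::PP;

my $json = JSON::PP->new->utf8->canonical;
my $ua   = HTTP::Tiny->new;
my $api  = 'https://fastapi.metacpan.org';    # assumed api_secure value

# First request: a JSON body opens the scroll on the object type.
my $res = $ua->post(
    "$api/author/_search/scroll?scroll=1m&size=1000",
    {
        headers => { 'Content-Type' => 'application/json' },
        content => $json->encode( {
            fields => ['pauseid'],
            query  => { match_all => {} },
            sort   => ['pauseid'],
        } ),
    }
);

while ( $res->{success} ) {
    my $page = $json->decode( $res->{content} );
    my $hits = $page->{hits}{hits};
    last unless @$hits;    # an empty page ends the scroll

    print scalar @$hits, " hits in this page\n";

    # Follow-up requests post only the scroll id, as plain text.
    $res = $ua->post(
        "$api/_search/scroll?scroll=1m&size=1000",
        {
            headers => { 'Content-Type' => 'text/plain' },
            content => $page->{_scroll_id},
        }
    );
}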
16 changes: 16 additions & 0 deletions metacpan_web.conf
@@ -36,3 +36,19 @@ mark_unauthorized_releases = 0
     public_key 6LeH2MsSAAAAANwz3AA73Gw5OjCVjT6I51Ev-ior
 </controller>
 
+<sitemap sitemap-authors.xml.gz>
+    object_type  = author
+    field_name   = pauseid
+    metacpan_url = author
+</sitemap>
+
+<sitemap sitemap-releases.xml.gz>
+    object_type  = release
+    field_name   = distribution
+    metacpan_url = release
+    <filter>
+        <term>
+            status = latest
+        </term>
+    </filter>
+</sitemap>
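For reference, Config::ZOMG should parse these Config::General-style blocks into a hash keyed by output file name, which is what the driver's $config->{sitemap} loop iterates. My assumption of the parsed shape:

use strict;
use warnings;

# Assumed shape of $config->{sitemap} after parsing the blocks above;
# bin/generate_sitemap.pl passes each inner hash to MetaCPAN::Sitemap->new.
my $sitemaps = {
    'sitemap-authors.xml.gz' => {
        object_type  => 'author',
        field_name   => 'pauseid',
        metacpan_url => 'author',
    },
    'sitemap-releases.xml.gz' => {
        object_type  => 'release',
        field_name   => 'distribution',
        metacpan_url => 'release',
        filter       => { term => { status => 'latest' } },
    },
};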
4 changes: 2 additions & 2 deletions root/robots.txt
@@ -9,5 +9,5 @@ Disallow: /raw/
 # Do not allow changing the default per page as is not useful
 Disallow: /*?*size=*
 
-Sitemap: https://metacpan.org/static/sitemaps/authors.xml.gz
-Sitemap: https://metacpan.org/static/sitemaps/releases.xml.gz
+Sitemap: https://metacpan.org/sitemap-authors.xml.gz
+Sitemap: https://metacpan.org/sitemap-releases.xml.gz
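Crawlers honoring these directives fetch the gzipped files served by the new Static.pm mounts. A sketch for inspecting one locally (path assumed), with the expected output shape, per write() above, noted in comments:

#!/usr/bin/env perl
# Sketch: gunzip and print a generated sitemap (path assumed).
use strict;
use warnings;
use IO::Uncompress::Gunzip qw(gunzip $GunzipError);

gunzip 'root/static/sitemaps/sitemap-authors.xml.gz' => \my $xml
    or die "gunzip failed: $GunzipError";
print $xml;

# Expected shape, given the heredoc header and per-hit lines in write()
# (the PAUSE id is an arbitrary example):
# <?xml version='1.0' encoding='UTF-8'?>
# <urlset xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
#     xmlns="http://www.sitemaps.org/schemas/sitemap/0.9" ...>
#  <url><loc>https://metacpan.org/author/HAARG</loc></url>
#  ...
# </urlset>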
