Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with
or
.
Download ZIP
branch: master
Fetching contributors…

Cannot retrieve contributors at this time

executable file 102 lines (91 sloc) 2.777 kb
#!/usr/bin/env perl
# To see info messages, run with VERBOSE=1. There are also DEBUG=1 and TRACE=1
# if you want to see more detail.
use 5.010;
use strict;
use warnings;
use File::Which;
use Getopt::Long;
use Log::Any::App '$log';
use Log::Any::For::Builtins 'system';
use LWP::UserAgent;
use Mojo::DOM;
# Optional period bounds taken from the command line, as "YYYY-MM" strings.
# They stay undefined when the corresponding option is not given.
my $start_period;
my $end_period;

# Parse the command-line options.
#
# Getopt::Long turns a die() raised inside an option handler into a warning
# and makes GetOptions() return false, so the return value has to be checked;
# otherwise an invalid period (or an unknown option) would only be warned
# about and the script would carry on without the requested filter.
GetOptions(
    'start-period=s' => sub {
        my (undef, $value) = @_;
        # \A and \z (rather than ^ and $) so a trailing newline is rejected.
        $value =~ /\A\d{4}-\d{2}\z/
            or die "Invalid start period $value, please use YYYY-MM\n";
        $start_period = $value;
    },
    'end-period=s' => sub {
        my (undef, $value) = @_;
        $value =~ /\A\d{4}-\d{2}\z/
            or die "Invalid end period $value, please use YYYY-MM\n";
        $end_period = $value;
    },
    'verbose' => sub {
        # Warnings are disabled for this assignment only; presumably this
        # silences the "used only once" warning for the package variable.
        no warnings;
        $main::LOG_LEVEL = 'info';
    },
    'help' => sub {
        # Print the usage text and stop successfully.
        print <<_;
Usage:
$0 [options] <username>
Options:
--start-period=YYYY-MM
Only download from this period.
--end-period=YYYY-MM
Only download until this period.
--verbose
Show progress.
_
        exit 0;
    },
) or die "Invalid command-line options, run '$0 --help' for usage\n";
# Fetch a URL over HTTP and hand back the response body.
#
# Takes the URL as its only argument. A single LWP::UserAgent is created on
# the first call and reused by later calls. Dies with the status code and
# message when the response is not a success; otherwise returns whatever
# the response's content() method yields.
sub get {
    my ($url) = @_;

    state $agent = LWP::UserAgent->new;

    my $response = $agent->get($url);
    unless ($response->is_success) {
        die "Can't get $url: "
            . $response->code . " - " . $response->message . "\n";
    }

    return $response->content;
}
# Preconditions: wget must be on the PATH (it does the actual downloading)
# and exactly one positional argument, the username, must remain.
die "wget is needed to download the articles (+images +convert links)\n"
    unless which("wget");
die "Usage: $0 [options] <username>\n" unless @ARGV == 1;

my $user     = $ARGV[0];
my $site_url = "http://blogs.perl.org";

# Kept at file scope because code further down the file assigns to it too.
my $dom;

# Collect the monthly archive URLs that fall inside the requested period.
$log->info('Getting the monthly archives page ...');
my @month_urls;
{
    my $url = "$site_url/users/$user/archives.html";
    $dom = Mojo::DOM->new(get($url));

    for my $e ($dom->find('.archive-content li a')->each) {
        my $href = $e->{href};

        # The period is taken from a YYYY/MM fragment in the link. A link
        # without one (or without an href at all) cannot be compared against
        # the period bounds, and using $1/$2 after a failed match would pick
        # up stale captures, so such links are skipped.
        unless (defined $href && $href =~ m!(\d{4})/(\d{2})!) {
            $log->debug("Skipping link without a YYYY/MM period: "
                            . ($href // '(no href)'));
            next;
        }
        my $period = "$1-$2";

        # Zero-padded YYYY-MM strings sort chronologically, so plain string
        # comparison is enough here.
        if ($start_period && $period lt $start_period) {
            $log->debug("Skipping $href because it's ".
                            "earlier than start period ($start_period)");
            next;
        }
        if ($end_period && $period gt $end_period) {
            $log->debug("Skipping $href because it's ".
                            "later than end period ($end_period)");
            next;
        }

        push @month_urls, $href;
    }
}
$log->infof('There are %d month(s) with blog entries', scalar @month_urls);
# Download each selected month: fetch the monthly page, gather the post
# links on it, then let wget mirror the monthly page and the posts.
# Months are visited in the reverse of the order in which they were
# collected (presumably oldest first -- TODO confirm the archive page order).
for my $murl (reverse @month_urls) {
    # i hope a monthly page does not have <next> links?
    $log->infof('Getting the monthly page %s ...', $murl);
    my $month_dom = Mojo::DOM->new(get($murl));

    # Post links on this page; anchors without an href are dropped so that
    # no undefined value is handed to wget.
    my @post_urls =
        grep { defined }
        map  { $_->{href} } $month_dom->find('.entry-title a')->each;

    $log->infof("Downloading posts: %s", [$murl, @post_urls]);

    # wget stays quiet unless debug logging is enabled.
    my @quiet = $log->is_debug ? () : ("-q");

    # The URLs come from a remote page, so "--" ends option parsing and
    # keeps an href that starts with "-" from being read as a wget option.
    system "wget", "-k", "-p", "-c", @quiet, "--", $murl, @post_urls;

    # Report a failed wget run, but keep going with the remaining months.
    if ($? != 0) {
        $log->warnf('wget did not finish cleanly for %s (wait status %d)',
                    $murl, $?);
    }
}
Jump to Line
Something went wrong with that request. Please try again.