Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Added cleanup mode for script/package #647

Merged
merged 1 commit into from May 14, 2017
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
50 changes: 45 additions & 5 deletions lib/MetaCPAN/Script/Package.pm
Expand Up @@ -2,10 +2,11 @@ package MetaCPAN::Script::Package;

use Moose;

use CPAN::DistnameInfo ();
use IO::Uncompress::Gunzip ();
use Log::Contextual qw( :log );
use MetaCPAN::Document::Package ();
use IO::Uncompress::Gunzip ();
use CPAN::DistnameInfo ();
use MetaCPAN::Types qw( Bool );

with 'MooseX::Getopt', 'MetaCPAN::Role::Script';

Expand All @@ -15,6 +16,12 @@ Loads 02packages.details info into db.

=cut

# Read-only boolean flag, off (0) by default. When true, index_packages
# calls run_cleanup after the bulk upserts have been flushed, so package
# documents that were not seen in the current run get deleted.
# NOTE(review): presumably exposed as a command-line switch through the
# MooseX::Getopt role this class consumes -- confirm the option spelling.
has clean_up => (
is => 'ro',
isa => Bool,
default => 0,
);

sub run {
my $self = shift;
$self->index_packages;
Expand Down Expand Up @@ -44,11 +51,14 @@ sub index_packages {
}
log_debug {$meta};

my $bulk_helper = $self->es->bulk_helper(
my $bulk = $self->es->bulk_helper(
index => $self->index->name,
type => 'package',
);

my %seen;
log_debug {"adding data"};

# read the rest of the file line-by-line (too big to slurp)
while ( my $line = <$fh> ) {
next unless $line;
Expand All @@ -66,19 +76,49 @@ sub index_packages {
dist_version => $distinfo->version,
};

$bulk_helper->update(
$bulk->update(
{
id => $name,
doc => $doc,
doc_as_upsert => 1,
}
);

$seen{$name} = 1;
}
$bulk->flush;

$self->run_cleanup( $bulk, \%seen ) if $self->clean_up;

$bulk_helper->flush;
log_info {'finished indexing 02packages.details'};
}

# Remove 'package' documents that were not touched by the current run.
#
# Scrolls over every document of type 'package' in the index, collects the
# _id of each one that is not a key of %$seen, then hands the collected ids
# to the bulk helper for deletion and flushes it.
#
# Arguments:
#   $bulk - bulk helper object; must provide delete_ids() and flush()
#   $seen - hashref whose keys are the package ids indexed in this run
#
# Returns whatever $bulk->flush returns.
sub run_cleanup {
    my ( $self, $bulk, $seen ) = @_;

    log_debug {"checking package data to remove"};

    my $scroll = $self->es->scroll_helper(
        index  => $self->index->name,
        type   => 'package',
        scroll => '30m',
        body   => { query => { match_all => {} } },
    );

    # Countdown used only for progress logging.
    my $remaining = $scroll->total;
    my @stale_ids;

    while ( my $hit = $scroll->next ) {
        my $id = $hit->{_id};

        if ( !exists $seen->{$id} ) {

            # Logged at collection time; the actual delete is issued
            # once, after the whole scroll has been consumed.
            push @stale_ids, $id;
            log_debug {"removed $id"};
        }

        # Emit a progress line each time the countdown hits a multiple
        # of 10000 (including zero).
        $remaining--;
        log_debug { $remaining . " left to check" }
        if $remaining % 10000 == 0;
    }

    $bulk->delete_ids(@stale_ids);
    $bulk->flush;
}

__PACKAGE__->meta->make_immutable;
1;

Expand Down