From 0ec347e77d87ea3c176bcdb6854d6c5f5d4e1e4a Mon Sep 17 00:00:00 2001
From: Mickey Nasriachi
Date: Sun, 14 May 2017 15:37:21 +0100
Subject: [PATCH] Added cleanup mode for script/package

This will allow cleaning up removed records once in a while
(it requires a longer run time so not suited for every cron run,
and records don't get deleted often anyways).

Usage: add --clean_up flag to the package script execution.
---
 lib/MetaCPAN/Script/Package.pm | 71 +++++++++++++++++++++++++++++++---
 1 file changed, 66 insertions(+), 5 deletions(-)

diff --git a/lib/MetaCPAN/Script/Package.pm b/lib/MetaCPAN/Script/Package.pm
index 57fc13be3..e22967784 100644
--- a/lib/MetaCPAN/Script/Package.pm
+++ b/lib/MetaCPAN/Script/Package.pm
@@ -2,10 +2,11 @@ package MetaCPAN::Script::Package;
 
 use Moose;
 
+use CPAN::DistnameInfo ();
+use IO::Uncompress::Gunzip ();
 use Log::Contextual qw( :log );
 use MetaCPAN::Document::Package ();
-use IO::Uncompress::Gunzip ();
-use CPAN::DistnameInfo ();
+use MetaCPAN::Types qw( Bool );
 
 with 'MooseX::Getopt', 'MetaCPAN::Role::Script';
 
@@ -15,6 +16,15 @@ Loads 02packages.details info into db.
 
 =cut
 
+# When set (--clean_up on the command line), index_packages also deletes
+# indexed package documents that are absent from 02packages.details.
+has clean_up => (
+    is            => 'ro',
+    isa           => Bool,
+    default       => 0,
+    documentation => 'delete indexed packages missing from 02packages',
+);
+
 sub run {
     my $self = shift;
     $self->index_packages;
@@ -44,11 +54,14 @@ sub index_packages {
     }
     log_debug {$meta};
 
-    my $bulk_helper = $self->es->bulk_helper(
+    my $bulk = $self->es->bulk_helper(
         index => $self->index->name,
         type  => 'package',
     );
 
+    my %seen;
+    log_debug {"adding data"};
+
     # read the rest of the file line-by-line (too big to slurp)
     while ( my $line = <$fh> ) {
         next unless $line;
@@ -66,19 +79,67 @@ sub index_packages {
             dist_version => $distinfo->version,
         };
 
-        $bulk_helper->update(
+        $bulk->update(
             {
                 id            => $name,
                 doc           => $doc,
                 doc_as_upsert => 1,
             }
         );
+
+        $seen{$name} = 1;
     }
+    $bulk->flush;
+
+    $self->run_cleanup( $bulk, \%seen ) if $self->clean_up;
 
-    $bulk_helper->flush;
     log_info {'finished indexing 02packages.details'};
 }
 
+# Delete every document of type 'package' whose id was not upserted during
+# the indexing pass that has just finished.
+#
+#   $bulk - bulk helper used to queue and flush the delete requests
+#   $seen - hashref keyed by the ids upserted during this run
+sub run_cleanup {
+    my ( $self, $bulk, $seen ) = @_;
+
+    # With nothing seen, every indexed document would be flagged as stale.
+    # That most likely means the input file was empty or unreadable, so
+    # refuse to delete anything rather than wipe the whole type.
+    unless ( keys %{$seen} ) {
+        log_warn {"no packages seen, skipping clean up"};
+        return;
+    }
+
+    log_debug {"checking package data to remove"};
+
+    my $scroll = $self->es->scroll_helper(
+        index  => $self->index->name,
+        type   => 'package',
+        scroll => '30m',
+        body   => { query => { match_all => {} } },
+    );
+
+    my @remove;
+    my $count = $scroll->total;
+    while ( my $p = $scroll->next ) {
+        my $id = $p->{_id};
+        unless ( exists $seen->{$id} ) {
+            push @remove, $id;
+            log_debug {"marked $id for removal"};
+        }
+
+        # progress report every 10000 documents
+        $count--;
+        log_debug { $count . " left to check" } if $count % 10000 == 0;
+    }
+
+    # nothing is deleted until the queued ids are flushed here
+    $bulk->delete_ids(@remove);
+    $bulk->flush;
+    log_debug { "sent delete requests for " . scalar(@remove) . " packages" };
+}
+
 __PACKAGE__->meta->make_immutable;
 1;
 