From 1d58f5f5d8ff76185a3466b48283c6c11628785e Mon Sep 17 00:00:00 2001 From: Moritz Onken Date: Sun, 26 Feb 2012 15:52:21 +0100 Subject: [PATCH] moved authorized logic into the indexer --- lib/MetaCPAN/Document/File.pm | 46 ++++- lib/MetaCPAN/Document/Module.pm | 2 +- lib/MetaCPAN/Document/Release.pm | 2 +- lib/MetaCPAN/Script/Authorized.pm | 176 ------------------ lib/MetaCPAN/Script/Release.pm | 56 +++++- t/release/multiple-modules.t | 21 +++ .../configs/multiple-modules-0.1.json | 4 + 7 files changed, 122 insertions(+), 185 deletions(-) delete mode 100644 lib/MetaCPAN/Script/Authorized.pm diff --git a/lib/MetaCPAN/Document/File.pm b/lib/MetaCPAN/Document/File.pm index 9051ae62d..3b65e351d 100644 --- a/lib/MetaCPAN/Document/File.pm +++ b/lib/MetaCPAN/Document/File.pm @@ -195,7 +195,7 @@ has abstract => has description => ( is => 'ro', required => 1, lazy_build => 1, index => 'analyzed' ); has status => ( is => 'ro', required => 1, default => 'cpan' ); -has authorized => ( required => 1, is => 'ro', isa => 'Bool', default => 1 ); +has authorized => ( required => 1, is => 'rw', isa => 'Bool', default => 1 ); has maturity => ( is => 'ro', required => 1, default => 'released' ); has directory => ( is => 'ro', required => 1, isa => 'Bool', default => 0 ); has level => ( is => 'ro', required => 1, isa => 'Int', lazy_build => 1 ); @@ -430,6 +430,48 @@ sub _build_pod { return \$text; } +=head2 set_authorized + +Expects a C<$perms> parameter which is a HashRef. The key is the module name +and the value an ArrayRef of author names who are allowed to release +that module. + +The method returns a list of unauthorized, but indexed modules. + +Unauthorized modules are modules that were uploaded in the name of a +different author than stated in the C<06perms.txt.gz> file. One problem +with this file is, that it doesn't record historical data. It may very +well be that an author was authorized to upload a module at the time. +But then his co-maintainer rights might have been revoked, making consecutive +uploads of that release unauthorized. However, since this script runs +with the latest version of C<06perms.txt.gz>, the former upload will +be flagged as unauthorized as well. Same holds the other way round, +a previously unauthorized release would be flagged authorized if the +co-maintainership was added later on. + +If a release contains unauthorized modules, the whole release is marked +as unauthorized as well. + +=cut + +sub set_authorized { + my ( $self, $perms ) = @_; + # only authorized perl distributions make it into the CPAN + return () if ( $self->distribution eq 'perl' ); + foreach my $module ( @{ $self->module } ) { + $module->authorized(0) + if ( $perms->{ $module->name } && !grep { $_ eq $self->author } + @{ $perms->{ $module->name } } ); + } + $self->authorized(0) + if ( $self->authorized + && $self->documentation + && $perms->{ $self->documentation } + && !grep { $_ eq $self->author } + @{ $perms->{ $self->documentation } } ); + return grep { !$_->authorized && $_->indexed } @{ $self->module }; +} + __PACKAGE__->meta->make_immutable; package MetaCPAN::Document::File::Set; @@ -510,7 +552,7 @@ sub find_pod { ($file) = grep { grep { $_->indexed && $_->authorized && $_->name eq $module } @{ $_->module || [] } - } @files unless($file); + } @files unless ($file); return $file ? $file : shift @files; } diff --git a/lib/MetaCPAN/Document/Module.pm b/lib/MetaCPAN/Document/Module.pm index 31d2f48e6..6f39fa238 100644 --- a/lib/MetaCPAN/Document/Module.pm +++ b/lib/MetaCPAN/Document/Module.pm @@ -67,7 +67,7 @@ has version => ( is => 'ro' ); has version_numified => ( is => 'ro', isa => 'Num', lazy_build => 1, required => 1 ); has indexed => ( is => 'rw', required => 1, isa => 'Bool', default => 0 ); -has authorized => ( is => 'ro', required => 1, isa => 'Bool', default => 1 ); +has authorized => ( is => 'rw', required => 1, isa => 'Bool', default => 1 ); # REINDEX: make 'ro' once a full reindex has been done has associated_pod => ( required => 0, is => 'rw' ); diff --git a/lib/MetaCPAN/Document/Release.pm b/lib/MetaCPAN/Document/Release.pm index 5342bd770..db28da7b3 100644 --- a/lib/MetaCPAN/Document/Release.pm +++ b/lib/MetaCPAN/Document/Release.pm @@ -120,7 +120,7 @@ has status => ( is => 'rw', required => 1, default => 'cpan' ); has maturity => ( is => 'ro', required => 1, default => 'released' ); has stat => ( is => 'ro', isa => Stat, dynamic => 1 ); has tests => ( is => 'ro', isa => Tests, dynamic => 1 ); -has authorized => ( is => 'ro', required => 1, isa => 'Bool', default => 1 ); +has authorized => ( is => 'rw', required => 1, isa => 'Bool', default => 1 ); has first => ( is => 'ro', required => 1, diff --git a/lib/MetaCPAN/Script/Authorized.pm b/lib/MetaCPAN/Script/Authorized.pm deleted file mode 100644 index 11d5fdc60..000000000 --- a/lib/MetaCPAN/Script/Authorized.pm +++ /dev/null @@ -1,176 +0,0 @@ -package MetaCPAN::Script::Authorized; - -use Moose; -with 'MooseX::Getopt'; -use Log::Contextual qw( :log :dlog ); -with 'MetaCPAN::Role::Common'; -use List::MoreUtils qw(uniq); - -has dry_run => ( is => 'ro', isa => 'Bool', default => 0 ); - -sub run { - my $self = shift; - log_info {"Dry run: updates will not be written to ES"} - if ( $self->dry_run ); - my @authorized; - my $authors = $self->parse_perms; - log_info {"looking for modules"}; - my $scroll = $self->scroll; - log_info { $scroll->total . " modules found" }; - my @releases; - my $i = 0; - - while ( my $file = $scroll->next ) { - $i++; - my $update = 0; - my $data = $file->{_source}; - next if ( $data->{distribution} eq 'perl' ); - my @modules - = grep { $_->{indexed} && $_->{authorized} } @{ $data->{module} }; - foreach my $module (@modules) { - if (!$authors->{ $module->{name} } - || !( - $authors->{ $module->{name} } - && grep { $_ eq $data->{author} } - @{ $authors->{ $module->{name} } } - ) - ) - { - log_debug { - "unauthorized module $module->{name} in $data->{release} by $data->{author}"; - }; - $module->{authorized} = \0; - $update = 1; - } - } - if ( $data->{authorized} - && $data->{documentation} - && $authors->{ $data->{documentation} } - && !grep { $_ eq $data->{author} } - @{ $authors->{ $data->{documentation} } } ) - { - log_debug { - "unauthorized documentation $data->{documentation} in $data->{release} by $data->{author}"; - }; - $data->{authorized} = \0; - $update = 1; - } - push( @authorized, $data ) if ($update); - if ( @authorized > 100 ) { - $self->bulk_update(@authorized); - @authorized = (); - } - log_info { "$i files processed, ", $scroll->total - $i, " to go" } - unless ( $i % 1000 ); - } - $self->bulk_update(@authorized); - $self->index->refresh; -} - -sub bulk_update { - my ( $self, @authorized ) = @_; - return unless (@authorized); - if ( $self->dry_run ) { - log_debug {"dry run, not updating"}; - return; - } - my @bulk; - foreach my $file (@authorized) { - my ($module) - = grep { $_->{name} eq $file->{documentation} } - @{ $file->{module} } - if ( $file->{documentation} ); - $file->{authorized} = $module->{authorized} - if ( $module && $module->{indexed} ); - push( - @bulk, - { index => { - index => $self->index->name, - type => 'file', - id => $file->{id}, - data => $file - } - } - ); - } - $self->es->bulk( \@bulk ) unless ( $self->dry_run ); -} - -sub scroll { - my $self = shift; - $self->index->refresh; - return $self->model->es->scrolled_search( - { index => $self->index->name, - type => 'file', - query => { - filtered => { - query => { match_all => {} }, - filter => { - and => [ - { or => [ - { exists => - { field => 'file.module.name' } - }, - - { exists => { field => 'documentation' } - } - ] - }, - - # { term => { documentation => 'Template' } }, - ] - } - } - }, - scroll => '5m', - size => 1000, - search_type => 'scan', - } - ); -} - -sub parse_perms { - my $self = shift; - my $file = $self->cpan->file(qw(modules 06perms.txt)); - log_info { "parsing ", $file }; - my $fh = $file->openr; - my %authors; - while ( my $line = <$fh> ) { - my ( $module, $author, $type ) = split( /,/, $line ); - next unless ($type); - $authors{$module} ||= []; - push( @{ $authors{$module} }, $author ); - } - return \%authors; - -} - -1; - -__END__ - -=head1 NAME - -MetaCPAN::Script::Authorized - Set the C property on files - -=head1 SYNOPSIS - - $ bin/metacpan authorized - - $ bin/metacpan release /path/to/tarball.tar.gz --authorized - -=head1 DESCRIPTION - -Unauthorized modules are modules that were uploaded in the name of a -different author than stated in the C<06perms.txt.gz> file. One problem -with this file is, that it doesn't record historical data. It may very -well be that an author was authorized to upload a module at the time. -But then his co-maintainer rights might have been revoked, making consecutive -uploads of that release unauthorized. However, since this script runs -with the latest version of C<06perms.txt.gz>, the former upload will -be flagged as unauthorized as well. Same holds the other way round, -a previously unauthorized release would be flagged authorized if the -co-maintainership was added later on. - -If a release contains unauthorized modules, the whole release is marked -as unauthorized as well. diff --git a/lib/MetaCPAN/Script/Release.pm b/lib/MetaCPAN/Script/Release.pm index a444a90ff..30a0cb658 100644 --- a/lib/MetaCPAN/Script/Release.pm +++ b/lib/MetaCPAN/Script/Release.pm @@ -20,7 +20,7 @@ use File::stat ('stat'); use CPAN::DistnameInfo (); use File::Spec::Functions ( 'tmpdir', 'catdir' ); use File::Find (); -use File::stat (); +use File::stat (); use MetaCPAN::Script::Latest; use DateTime::Format::Epoch::Unix; use File::Find::Rule; @@ -65,6 +65,13 @@ has detect_backpan => ( ); has backpan_index => ( is => 'ro', lazy_build => 1 ); +has perms => ( + is => 'ro', + isa => 'HashRef', + lazy_build => 1, + traits => ['NoGetopt'] +); + sub run { my $self = shift; my ( undef, @args ) = @{ $self->extra_argv }; @@ -76,7 +83,12 @@ sub run { qr/\.(tgz|tbz|tar[\._-]gz|tar\.bz2|tar\.Z|zip|7z)$/); $find = $find->mtime( ">" . ( time - $self->age * 3600 ) ) if ( $self->age ); - push( @files, map { $_->{file}} sort { $a->{mtime} <=> $b->{mtime} } map { +{ file => $_, mtime => File::stat::stat($_)->mtime } } $find->in($_) ); + push( + @files, + map { $_->{file} } sort { $a->{mtime} <=> $b->{mtime} } map { + +{ file => $_, mtime => File::stat::stat($_)->mtime } + } $find->in($_) + ); } elsif ( -f $_ ) { push( @files, $_ ); @@ -166,7 +178,7 @@ sub import_tarball { # load Archive::Any in the child due to bugs in MMagic and MIME::Types require Archive::Any; my $at = Archive::Any->new($tarball); - my $tmpdir = dir( File::Temp::tempdir( CLEANUP => 0 ) ); + my $tmpdir = dir( File::Temp::tempdir( CLEANUP => 0 ) ); # TODO: add release to the index with status => 'broken' and move along log_error {"$tarball is being naughty"} @@ -278,7 +290,8 @@ sub import_tarball { foreach my $file (@files) { my $obj = $file_set->new_document($file); $bulk->put($obj); - $file->{$_} = $obj->$_ for (qw(abstract id pod sloc pod_lines documentation)); + $file->{$_} = $obj->$_ + for (qw(abstract id pod sloc pod_lines documentation)); $file->{module} = []; } $bulk->commit; @@ -337,7 +350,9 @@ sub import_tarball { } log_debug { "Indexing ", scalar @modules, " modules" }; $i = 1; + my $perms = $self->perms; my $mod_set = $cpan->type('module'); + my @release_unauthorized; foreach my $file (@modules) { $file = MetaCPAN::Document::File->new( %$file, index => $cpan ); foreach my $mod ( @{ $file->module } ) { @@ -363,11 +378,22 @@ sub import_tarball { !!grep { $file->documentation eq $_->name } @{ $file->module } ) if ( $file->documentation ); log_trace {"reindexing file $file->{path}"}; + push(@release_unauthorized, $file->set_authorized($perms)) if(keys %$perms); $file->clear_module if ( $file->is_pod_file ); $bulk->put($file); - } $bulk->commit; + if (@release_unauthorized) { + log_info { + "release " + . $release->name + . " contains unauthorized modules: " + . join( ",", map { $_->name } @release_unauthorized ); + }; + $release->authorized(0); + $release->put; + } + $tmpdir->rmtree; @@ -390,6 +416,7 @@ sub load_meta_file { my $file; for (qw{*/META.json */META.yml */META.yaml META.json META.yml META.yaml}) { + # scalar context globbing (without exhausting results) produces # confusing results (which caused existsing */META.json files to # get skipped). using list context seems more reliable. @@ -472,6 +499,25 @@ sub detect_status { } } +sub _build_perms { + my $self = shift; + my $file = $self->cpan->file(qw(modules 06perms.txt)); + unless(-e $file) { + log_warn {"$file could not be found. All modules are assumed authorized."}; + return {}; + } + log_info { "parsing ", $file }; + my $fh = $file->openr; + my %authors; + while ( my $line = <$fh> ) { + my ( $module, $author, $type ) = split( /,/, $line ); + next unless ($type); + $authors{$module} ||= []; + push( @{ $authors{$module} }, $author ); + } + return \%authors; +} + 1; __END__ diff --git a/t/release/multiple-modules.t b/t/release/multiple-modules.t index 08eacdacc..773872f25 100644 --- a/t/release/multiple-modules.t +++ b/t/release/multiple-modules.t @@ -65,4 +65,25 @@ ok(!$release->first, 'Release is not first'); } } +$release = $idx->type('release')->get( + { author => 'LOCAL', + name => 'Multiple-Modules-0.1' + } +); + +ok(my $file = $idx->type('file')->filter( + { and => [ + { term => { release => 'Multiple-Modules-0.1' } }, + { term => { documentation => 'Moose' } } + ] + } +)->first, 'get Moose.pm'); + +ok( my ($moose) = ( grep { $_->name eq 'Moose' } @{ $file->module } ), + 'grep Moose module' ); + +ok( !$moose->authorized, 'Moose is not authorized' ); + +ok( !$release->authorized, 'release is not authorized' ); + done_testing; diff --git a/t/var/fakecpan/configs/multiple-modules-0.1.json b/t/var/fakecpan/configs/multiple-modules-0.1.json index e426af8a1..228c0118b 100644 --- a/t/var/fakecpan/configs/multiple-modules-0.1.json +++ b/t/var/fakecpan/configs/multiple-modules-0.1.json @@ -12,6 +12,10 @@ "file": "lib/Multiple/Modules/Deprecated.pm", "content": "package Multiple::Modules::Deprecated;\n\n=head1 NAME\n\nMultiple::Modules::Deprecated - Will be removed in a future release\n" }, + { + "file": "lib/Moose.pm", + "content": "package Moose;\n\n=head1 NAME\n\nMoose - Unauthorized\n" + }, { "file": "t/foo.t", "content": "use Test::More;"