Skip to content

Commit

Permalink
Bump version to 1.3.0
Browse files Browse the repository at this point in the history
Major improvements in the full-text indexer.

* Implement config option 'index_words_extractors' to replace indexer
plugins.

* Add internal html to plain text converter with
'index_words_html_parts' option.

* Improve insert performance for bit vectors by using PostgreSQL's COPY
command (up to 2x faster)

* manitou-mgr --reindex-words: when possible, partition the work to
avoid updating the bit vectors (only inserts)

* Avoid the unnecessary "DELETE FROM jobs_queue" for each mail when
fully reindexing.

* Avoid some word duplicates when unaccent mode is on.

* Extract components of compound words.
  • Loading branch information
manitou-mail committed Sep 4, 2012
1 parent 4b61bbe commit 7aa679b
Show file tree
Hide file tree
Showing 9 changed files with 411 additions and 103 deletions.
3 changes: 2 additions & 1 deletion Makefile.PL
Original file line number Diff line number Diff line change
Expand Up @@ -9,12 +9,13 @@ my %opts = (
AUTHOR => 'Daniel Verite (daniel@manitou-mail.org)',
ABSTRACT => 'Perl mail-database exchanger for the Manitou-Mail software',
NAME => 'manitou-mdx',
VERSION => '1.2.1',
VERSION => '1.3.0',
EXE_FILES => [ qw(script/manitou-mdx script/manitou-mgr script/manitou-spool) ],
PREREQ_PM => { 'Getopt::Long' => 2.00,
'IO' => 0,
'DBI' => 1.32,
'DBD::Pg' => 1.30,
'HTML::TreeBuilder' => 3.23,
'MIME::Entity'=> 5.0,
'MIME::Words'=> 5.0,
'MIME::Parser'=> 5.0,
Expand Down
119 changes: 117 additions & 2 deletions lib/Manitou/Attachments.pm
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (C) 2004-2010 Daniel Verite
# Copyright (C) 2004-2012 Daniel Verite

# This file is part of Manitou-Mail (see http://www.manitou-mail.org)

Expand Down Expand Up @@ -26,7 +26,11 @@ use Carp;
use POSIX qw(tmpnam);
use Encode;
use Manitou::Encoding qw(encode_dbtxt header_decode);
use Manitou::Log qw(error_log warning_log);
use Manitou::Config qw(getconf);
use Digest::SHA1;
use IPC::Open3;
use IO::Handle;

require Exporter;
@ISA = qw(Exporter);
Expand Down Expand Up @@ -159,7 +163,6 @@ sub insert_attachment {

my $charset=header_decode($mime_obj->head->mime_attr("content-type.charset"));
$stha->bind_param(++$pos, encode_dbtxt(substr($charset,0,30)));
LogError($stha->errstr) if $stha->err;

my $content_id=$mime_obj->get("Content-ID");
# content-ID syntax must be <addr-spec> (RFC2111)
Expand Down Expand Up @@ -311,3 +314,115 @@ sub create_html_part {
$sth->finish;
return $part;
}

# Build the table of external text extractors configured for an identity.
#
# Input: identity (passed through to getconf() as its second argument).
# Returns: a hash, as a list, mapping content type => extractor command,
# built from the 'index_words_extractors' configuration entries.
# Each entry has the form "content_type : program [args...]". Whitespace
# around the content type and around the command is not significant, and
# the entry is split on the FIRST colon so that the command itself may
# contain colons. Entries without a content type or without a command are
# reported through warning_log() and skipped.
sub text_extractors {
  my $word_extractors = getconf('index_words_extractors', $_[0]);
  my %extractors;
  if (defined $word_extractors && @{$word_extractors}>0) {
    foreach my $entry (@{$word_extractors}) { # content_type : program
      # TODO: keep only the extractors that match content types for which
      # we have actual attachments for this message.
      # Get the attachments list in this function and pass them
      # to attach_parts() and launch_text_extractors() rather than
      # letting these functions query the database.

      # The key must come out trimmed: it is later compared verbatim
      # against attachments.content_type by launch_text_extractors().
      if ($entry =~ /^\s*([^:\s][^:]*?)\s*:\s*(\S.*?)\s*$/) {
        $extractors{$1}=$2;
      }
      else {
        warning_log("Entry ignored in index_words_extractors: $entry");
      }
    }
  }
  return %extractors;
}

# Extract indexable text from the attachments of a message.
#
# $dbh: database handle. Must be inside a transaction because of the
#       operations on large objects.
# $mail_id: message whose attachments are examined (only those with a
#       NULL mime_content_id are selected).
# $commands: hashref {"content_type"=>"command to extract words"}
# $ref_text: reference to a string; extracted text is appended to it.
#
# For each attachment whose content type has an entry in $commands, the
# contents are piped to the command's stdin and its stdout is appended
# to $$ref_text. text/html attachments with no configured command go
# through Manitou::Words::html_to_text().
#
# Returns: 0 on failure (after error_log()), 1 otherwise. An extractor
# run is a failure when it raises an error, exits with a non-zero code
# or writes anything to stderr. A failure to open the large object of an
# HTML attachment raises an exception instead.
#
# NOTE(review): the extractor's stderr is only read after it has exited;
# an extractor writing more than a pipe buffer to stderr could still
# stall. Not addressed here.
sub launch_text_extractors {
  my ($dbh, $mail_id, $commands, $ref_text)=@_;

  my $sth = $dbh->prepare("SELECT a.attachment_id,a.content_type,a.content_size,ac.content FROM attachments a JOIN attachment_contents ac ON a.attachment_id=ac.attachment_id WHERE a.mail_id=? AND a.mime_content_id IS NULL");

  $sth->execute($mail_id);

  while (my $row = $sth->fetchrow_hashref) {
    my $ct=$row->{content_type};
    if (exists $commands->{$ct}) {
      my $cmd=$commands->{$ct};
      my $in=IO::Handle->new();
      my $out=IO::Handle->new();
      my $err=IO::Handle->new();
      my $pid;          # extractor process, once started
      my $reaped=0;     # true once waitpid() has collected it
      my $status=0;     # wait status of the extractor
      my $lobj_fd;      # defined only while the large object is open

      # Writing to an extractor that exited early must not kill us.
      # local() puts the caller's handler back when this block is left.
      local $SIG{'PIPE'} = 'IGNORE';

      # Pipe the contents to the extractor and get results into $$ref_text
      eval {
        $pid = open3($in, $out, $err, $cmd) or die $!;
        binmode $out, ':utf8';
        $out->blocking(0);
        my $rin='';
        vec($rin, fileno($out), 1)=1;

        my $content_size = $row->{content_size};
        $lobj_fd = $dbh->func($row->{content}, $dbh->{pg_INV_READ}, 'lo_open');
        die $dbh->errstr if (!defined $lobj_fd);
        my $buf;
        while ($content_size>0) {
          my $nbytes = $dbh->func($lobj_fd, $buf, $content_size>524288 ? 524288:$content_size, 'lo_read');
          die $dbh->errstr if (!defined $nbytes);
          # Large object shorter than the recorded size: stop instead of
          # looping forever on zero-length reads.
          last if ($nbytes==0);
          $content_size -= $nbytes;
          # Send to script
          print $in $buf;
          # Read the output of the extractor during execution to avoid
          # too much buffering. The fd set goes in the READ slot and is
          # copied each time because select() overwrites its arguments.
          my $rout;
          while (select($rout=$rin, undef, undef, 0.2) > 0) {
            my $line=<$out>;
            last if (!defined $line);   # EOF, or nothing available yet
            $$ref_text .= $line;
          }
        }
        $dbh->func($lobj_fd, 'lo_close');
        $lobj_fd=undef;
        close($in);
        $out->blocking(1);
        while (defined(my $line=<$out>)) {
          $$ref_text .= $line;
        }
        waitpid($pid, 0);
        $status=$?;
        $reaped=1;
      };
      my $eval_err=$@;   # saved before any cleanup can overwrite it

      my $errline;
      if ($eval_err) {
        # Release what the aborted run left open.
        eval { $dbh->func($lobj_fd, 'lo_close') } if (defined $lobj_fd);
        close($in) if ($in->opened);
      }
      else {
        $errline=<$err>;
      }
      close($err) if ($err->opened);
      close($out) if ($out->opened);
      if (defined $pid && !$reaped) {
        # Do not leave a zombie behind after a failed run.
        kill('TERM', $pid);
        waitpid($pid, 0);
        $status=$?;
      }

      my $base_msg="Attachments text extractor execution error (\`$cmd\`, exit code=".($status>>8)."), message #$mail_id, attachment #$row->{attachment_id}";
      my $errmsg;
      if ($eval_err) {
        $errmsg="$base_msg: $eval_err";
      }
      else {
        $errline="" if (!defined $errline);
        if ($errline ne "" || ($status>>8)!=0) {
          $errmsg= "$base_msg: $errline";
        }
      }
      if ($errmsg) {
        error_log($errmsg);
        $sth->finish;
        return 0;
      }
    }
    elsif ($ct eq "text/html") {
      # Built-in default extractor for HTML attachments
      my $lobj_fd = $dbh->func ($row->{content}, $dbh->{pg_INV_READ}, 'lo_open');
      die $dbh->errstr if (!defined $lobj_fd);
      my $buf;
      if ($dbh->func($lobj_fd, $buf, $row->{content_size}, 'lo_read')) {
        $$ref_text .= Manitou::Words::html_to_text($buf);
      }
      $dbh->func ($lobj_fd, 'lo_close');
    }
  }
  1;
}

1;
5 changes: 4 additions & 1 deletion lib/Manitou/Config.pm
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (C) 2004-2011 Daniel Verite
# Copyright (C) 2004-2012 Daniel Verite

# This file is part of Manitou-Mail (see http://www.manitou-mail.org)

Expand Down Expand Up @@ -41,6 +41,7 @@ my %default_conf =
'outgoing_check_interval' => 5,
'index_words' => "yes",
'index_words_accent_mode' => "dual", # strip, keep
'index_words_html_parts' => "yes",
'local_delivery_agent' => "sendmail -f \$FROM\$ -t",
'log_filter_hits' => 'yes',
'preferred_charset' => "iso-8859-1 iso-8859-15 utf-8",
Expand Down Expand Up @@ -70,6 +71,8 @@ my %conf_opts =
'flush_word_index_max_queued' => 'integer',
'index_words' => 'bool',
'index_words_accent_mode' => 'string',
'index_words_extractors' => 'strings',
'index_words_html_parts' => 'bool',
'init_sql' => 'strings',
'local_delivery_agent' => 'program',
'log_filter_hits' => 'bool',
Expand Down
15 changes: 14 additions & 1 deletion lib/Manitou/Database.pm
Original file line number Diff line number Diff line change
Expand Up @@ -23,9 +23,14 @@ use vars qw(@ISA @EXPORT_OK);

require Exporter;
@ISA = qw(Exporter);
@EXPORT_OK = qw(db_connect);
@EXPORT_OK = qw(db_connect bytea_output);

use Manitou::Config qw(getconf);
# Value of the server's 'bytea_output' setting, as stored by db_connect().
# Undefined until db_connect() has assigned it.
my $cache_bytea_output;

# Accessor for the cached 'bytea_output' setting.
sub bytea_output {
  return $cache_bytea_output;
}

sub db_connect {
my $cnx_string=getconf("db_connect_string");
Expand All @@ -51,5 +56,13 @@ sub db_connect {
foreach (@init) {
$dbh->do($_);
}

my $s=$dbh->prepare("SELECT setting FROM pg_catalog.pg_settings WHERE name='bytea_output'");
$s->execute;
($cache_bytea_output) = $s->fetchrow_array;
$cache_bytea_output="escape" if (!defined $cache_bytea_output);

return $dbh;
}

1;
4 changes: 3 additions & 1 deletion lib/Manitou/Log.pm
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (C) 2004-2011 Daniel Verite
# Copyright (C) 2004-2012 Daniel Verite

# This file is part of Manitou-Mail (see http://www.manitou-mail.org)

Expand Down Expand Up @@ -47,3 +47,5 @@ sub debug_log {
# Send a message to syslog with the "warning" priority.
# The text is passed as an argument to a constant "%s" format rather
# than as the format itself, so that '%' sequences occurring in the
# logged text (configuration entries, command lines...) are emitted
# verbatim instead of being interpreted by syslog.
sub warning_log {
  my ($msg)=@_;
  syslog("warning", "%s", $msg);
}

1;
12 changes: 8 additions & 4 deletions lib/Manitou/Schema.pm
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (C) 2004-2011 Daniel Verite
# Copyright (C) 2004-2012 Daniel Verite

# This file is part of Manitou-Mail (see http://www.manitou-mail.org)

Expand Down Expand Up @@ -29,11 +29,11 @@ require Exporter;
create_table_statements create_trigger_statements);

# Returns the version string this module reports as current.
# It matches the target of the "1.2.0" -> "1.3.0" branch of
# upgrade_schema_statements(). The leftover 'return "1.2.0"' made this
# value unreachable.
sub current_version {
  return "1.3.0";
}

# Returns the list of version strings this module accepts, oldest first.
# The leftover first return, whose list stopped at "1.2.0", made the
# list that includes "1.3.0" unreachable.
sub supported_versions {
  return ("0.9.12", "1.0.0", "1.0.1", "1.0.2", "1.1.0", "1.2.0", "1.3.0");
}

my $create_script=<<EOF;
Expand Down Expand Up @@ -855,7 +855,11 @@ sub upgrade_schema_statements {
push @stmt, sql_comment("mail_addresses.addr_type");
push @stmt, $functions{"replace_header_field"};
}

elsif ($from eq "1.2.0" && $to eq "1.3.0") {
push @stmt, "ALTER TABLE jobs_queue ADD status SMALLINT";
push @stmt, $tables{"import_mbox"};
push @stmt, $tables{"import_message"};
}
return @stmt;
}

Expand Down
Loading

0 comments on commit 7aa679b

Please sign in to comment.