Skip to content

Commit

Permalink
Add reindex-words command to rebuild the inverted word index
Browse files Browse the repository at this point in the history
  • Loading branch information
manitou-mail committed Apr 23, 2012
1 parent d884e1a commit 99acfe8
Showing 1 changed file with 48 additions and 57 deletions.
105 changes: 48 additions & 57 deletions script/manitou-mgr
Expand Up @@ -63,10 +63,10 @@ my $opt_dbpassword;
my ($opt_dbhost, $opt_dbport);
my $opt_dry_run;

my $opt_reindex_step;
my $opt_reindex_begin;
my $opt_reindex_end;
#my $opt_search_terms;
my $commit_step=100;
my $vacuum_step=1000;
my ($min_mail_id, $max_mail_id);

sub usage {
my $p="[--conf=/path/to/config_file] [--quiet]";
Expand All @@ -79,6 +79,7 @@ sub usage {
$0 --hash-attachments $p
$0 --merge-attachments $p
$0 --print-size $p
$0 --reindex-words [--reindex-words-step=step] [--reindex-words-begin=first_mail_id] [--reindex-words-end=last_mail_id] $p
~;
};

Expand All @@ -103,6 +104,10 @@ my $rc = GetOptions("conf:s" => \$conf_file,
"print-size" => \$opt_action{'print-size'},
"merge-attachments" => \$opt_action{'merge-attachments'},
"hash-attachments" => \$opt_action{'hash-attachments'},
"reindex-words" => \$opt_action{'reindex-words'},
"reindex-words-step=s" => \$opt_reindex_step,
"reindex-words-begin=s" => \$opt_reindex_begin,
"reindex-words-end=s" => \$opt_reindex_end
);

if (!$rc) {
Expand Down Expand Up @@ -153,6 +158,14 @@ elsif ($opt_action{"upgrade-schema"}) {
elsif ($opt_action{"create-database"}) {
create_database();
}
elsif ($opt_action{"reindex-words"}) {
# Dispatch for --reindex-words: collect the optional step/begin/end bounds
# given on the command line and pass them to reindex_words() as a hash ref.
#
# The three options are declared as strings ("=s") in GetOptions, and
# reindex_words() interpolates them into SQL text, so they are validated
# here. Options that were not given (undef) are simply not forwarded;
# negative integers are ignored, as before; anything that is not an integer
# is rejected before a database connection is opened.
my %optr;
foreach my $o (['step', $opt_reindex_step],
['begin', $opt_reindex_begin],
['end', $opt_reindex_end]) {
my ($name, $value)=@$o;
# Not given on the command line: leave the key absent rather than
# comparing undef numerically (which warns and stores an undef value).
next if (!defined $value);
if ($value !~ /\A-?\d+\z/) {
die "--reindex-words-$name expects an integer value, got '$value'\n";
}
$optr{$name}=$value if ($value>=0);
}
Connect();
reindex_words(\%optr);
}
#elsif ($opt_action{"iwi-query"}) {
# iwi_query($opt_search_terms);
#}
Expand Down Expand Up @@ -238,19 +251,15 @@ sub create_database {
print "Database $dbname created.\n" unless ($opt_quiet);
$dbh1->disconnect;

# create language by superuser
# Reconnect to the new database as a superuser
$dbh1 = DBI->connect("dbi:Pg:$scnx_string dbname=$dbname") or die DBI->errstr;
my @lang = $dbh1->selectrow_array("SELECT 1 FROM pg_language WHERE lanname='plpgsql'");
if (!@lang) {
$dbh1->do("CREATE LANGUAGE plpgsql") or die $dbh->errstr;
$dbh1->do("CREATE LANGUAGE plpgsql") or die $dbh1->errstr;
}
$dbh1->disconnect;
$dbh1->do("SET SESSION AUTHORIZATION $dbuser") or die $dbh1->errstr;

my $cnx_string = "user=$dbuser dbname=$dbname";
$cnx_string .= " password=$opt_dbpassword" if (defined $opt_dbpassword);
$cnx_string .= " host=$opt_dbhost" if (defined $opt_dbhost);
$cnx_string .= " port=$opt_dbport" if (defined $opt_dbport);
$dbh=DBI->connect("dbi:Pg:$cnx_string") or die DBI->errstr;
$dbh=$dbh1;
create_schema();
print "Database $dbname successfully created.\n" unless ($opt_quiet);
}
Expand Down Expand Up @@ -349,78 +358,60 @@ sub upgrade_schema {
return 1;
}

# UNUSED at the moment
sub reindex_words {
# $dbh2 is used for fetching mail_id from a cursor, inside a transaction
# that ends only at the end of session (no commit, hence the need for
# this second database connection)
my $dbh2 = DBI->connect($cnx_string) or die "Can't connect: $DBI::errstr";
$dbh2->{AutoCommit}=0;
my $opt=shift;

load_stopwords($dbh);

my $where;
if ($min_mail_id) {
if ($max_mail_id) {
$where="WHERE mail_id BETWEEN $min_mail_id AND $max_mail_id";
} else {
$where="WHERE mail_id>=$min_mail_id";
}
} elsif ($max_mail_id) {
$where="WHERE mail_id<=$max_mail_id";
}

my $sc=$dbh->prepare("SELECT count(*) FROM mail $where");
$sc->execute;
my ($total)=$sc->fetchrow_array;
my $limit = (defined $opt->{step}) ? "LIMIT $opt->{step}" :
"LIMIT " . Manitou::Words::load_partsize($dbh);
my $min_id = $opt->{begin} ? int($opt->{begin}):1;
my $sthb = $dbh->prepare("SELECT bodytext FROM body where mail_id=?");
my $sthh = $dbh->prepare("SELECT lines FROM header where mail_id=?");
my $end=0;

while (!$end) {
$dbh->begin_work;
my @cond;
push @cond, "mail_id>=".$min_id if ($min_id>0);
push @cond, "mail_id<=".int($opt->{end}) if ($opt->{end});
$where = @cond>0 ? "WHERE ".join(" AND ", @cond) : "";
my $s=$dbh->prepare("SELECT mail_id FROM mail $where ORDER BY mail_id $limit");

my $commits=0;
$dbh2->do("DECLARE c CURSOR FOR SELECT mail_id FROM mail $where ORDER BY mail_id");
my $s=$dbh2->prepare("FETCH $commit_step FROM c");
$s->execute;
my $count=0;
while ($s->rows>0) {
my $sthb=$dbh->prepare("SELECT bodytext FROM body where mail_id=?");
my $sthh=$dbh->prepare("SELECT lines FROM header where mail_id=?");
$s->execute;
my $count=0;
if ($s->rows==0) {
$end=1;
last;
}

my $mail_id;
$dbh->begin_work;
while (($mail_id)=$s->fetchrow_array) {
if ($min_id<$mail_id) {
$min_id=$mail_id;
}
$count++;
$sthb->execute($mail_id);
my ($body)=$sthb->fetchrow_array;
$sthh->execute($mail_id);
my ($header)=$sthh->fetchrow_array;
$body = decode_dbtxt($body);
$header = decode_dbtxt($header);
$body .= Manitou::Words::header_contents_to_ftidx($header);
$header = Manitou::Words::header_contents_to_ftidx($header);
index_words($dbh, $mail_id, \$body, \$header);
}

print "max mail_id=$min_id\n";
print "Flushing word vectors..." unless ($opt_quiet);
flush_word_vectors($dbh);
clear_word_vectors();
$dbh->commit;
print "done ($count/$total)\n" unless ($opt_quiet);
$commits++;

if ($commits % $vacuum_step==0) {
print "Vacuuming..." unless ($opt_quiet);
$dbh->do("VACUUM ANALYZE inverted_word_index");
print "done\n" unless ($opt_quiet);
}
$s->execute;
print "$count messages reindexed (up to mail_id=$min_id)\n" unless ($opt_quiet);
$min_id++;
}

$dbh->begin_work;
print "Flushing word vectors..." unless ($opt_quiet);
flush_word_vectors($dbh);
$dbh->commit;
print "done\n" unless ($opt_quiet);

$dbh->do("VACUUM ANALYZE inverted_word_index");
$dbh2->do("CLOSE c");
$dbh2->commit;
}

sub hash_attachments {
Expand Down

0 comments on commit 99acfe8

Please sign in to comment.