Permalink
Browse files

Add reindex-words command to rebuild the inverted word index

  • Loading branch information...
1 parent d884e1a commit 99acfe8872dd784a5d1ed7bdfecc5009c95e9b02 @manitou-mail committed Apr 23, 2012
Showing with 48 additions and 57 deletions.
  1. +48 −57 script/manitou-mgr
View
@@ -63,10 +63,10 @@ my $opt_dbpassword;
my ($opt_dbhost, $opt_dbport);
my $opt_dry_run;
+my $opt_reindex_step;
+my $opt_reindex_begin;
+my $opt_reindex_end;
#my $opt_search_terms;
-my $commit_step=100;
-my $vacuum_step=1000;
-my ($min_mail_id, $max_mail_id);
sub usage {
my $p="[--conf=/path/to/config_file] [--quiet]";
@@ -79,6 +79,7 @@ sub usage {
$0 --hash-attachments $p
$0 --merge-attachments $p
$0 --print-size $p
+ $0 --reindex-words [--reindex-words-step=step] [--reindex-words-begin=first_mail_id] [--reindex-words-end=last_mail_id] $p
~;
};
@@ -103,6 +104,10 @@ my $rc = GetOptions("conf:s" => \$conf_file,
"print-size" => \$opt_action{'print-size'},
"merge-attachments" => \$opt_action{'merge-attachments'},
"hash-attachments" => \$opt_action{'hash-attachments'},
+ "reindex-words" => \$opt_action{'reindex-words'},
+ "reindex-words-step=s" => \$opt_reindex_step,
+ "reindex-words-begin=s" => \$opt_reindex_begin,
+ "reindex-words-end=s" => \$opt_reindex_end
);
if (!$rc) {
@@ -153,6 +158,14 @@ elsif ($opt_action{"upgrade-schema"}) {
elsif ($opt_action{"create-database"}) {
create_database();
}
elsif ($opt_action{"reindex-words"}) {
  # Collect the optional --reindex-words-* settings. Only options that were
  # actually given on the command line are forwarded to reindex_words();
  # comparing an undefined option numerically would both warn and
  # wrongly succeed (undef >= 0 is true).
  my %given = (
    'step'  => $opt_reindex_step,
    'begin' => $opt_reindex_begin,
    'end'   => $opt_reindex_end
  );
  my %optr;
  foreach my $name (sort keys %given) {
    my $value = $given{$name};
    next if (!defined $value);
    # The options are declared as strings, so reject anything that is not
    # an integer before it can reach the SQL built by reindex_words().
    die "--reindex-words-$name expects an integer, got '$value'\n"
      if ($value !~ /\A-?\d+\z/);
    # Negative values keep their previous meaning: the option is ignored.
    $optr{$name} = $value if ($value >= 0);
  }
  Connect();
  reindex_words(\%optr);
}
#elsif ($opt_action{"iwi-query"}) {
# iwi_query($opt_search_terms);
#}
@@ -238,19 +251,15 @@ sub create_database {
print "Database $dbname created.\n" unless ($opt_quiet);
$dbh1->disconnect;
- # create language by superuser
+ # Reconnect to the new database as a superuser
$dbh1 = DBI->connect("dbi:Pg:$scnx_string dbname=$dbname") or die DBI->errstr;
my @lang = $dbh1->selectrow_array("SELECT 1 FROM pg_language WHERE lanname='plpgsql'");
if (!@lang) {
- $dbh1->do("CREATE LANGUAGE plpgsql") or die $dbh->errstr;
+ $dbh1->do("CREATE LANGUAGE plpgsql") or die $dbh1->errstr;
}
- $dbh1->disconnect;
+ $dbh1->do("SET SESSION AUTHORIZATION $dbuser") or die $dbh1->errstr;
- my $cnx_string = "user=$dbuser dbname=$dbname";
- $cnx_string .= " password=$opt_dbpassword" if (defined $opt_dbpassword);
- $cnx_string .= " host=$opt_dbhost" if (defined $opt_dbhost);
- $cnx_string .= " port=$opt_dbport" if (defined $opt_dbport);
- $dbh=DBI->connect("dbi:Pg:$cnx_string") or die DBI->errstr;
+ $dbh=$dbh1;
create_schema();
print "Database $dbname successfully created.\n" unless ($opt_quiet);
}
@@ -349,78 +358,60 @@ sub upgrade_schema {
return 1;
}
-# UNUSED at the moment
# Rebuild the inverted word index for the messages of the mail table.
# Messages are processed in batches ordered by mail_id; each batch is
# indexed and flushed inside its own transaction.
#
# Takes an optional hash reference with these keys:
#   step  - number of messages per batch; when missing or not a positive
#           integer, Manitou::Words::load_partsize() supplies the value
#   begin - first mail_id to reindex (defaults to 1)
#   end   - last mail_id to reindex (no upper bound when unset or 0)
#
# Progress messages are printed unless $opt_quiet is set.
sub reindex_words {
  my $opt = shift || {};
  load_stopwords($dbh);

  # The step originates from the command line and is interpolated into the
  # SQL text, so coerce it to an integer like 'begin' and 'end'. A step of
  # zero would produce "LIMIT 0" and silently reindex nothing, hence the
  # fallback to the default batch size.
  my $step = defined $opt->{step} ? int($opt->{step}) : 0;
  $step = Manitou::Words::load_partsize($dbh) if ($step <= 0);
  my $limit = "LIMIT $step";

  my $min_id = $opt->{begin} ? int($opt->{begin}) : 1;
  my $max_id = $opt->{end} ? int($opt->{end}) : undef;

  my $sthb = $dbh->prepare("SELECT bodytext FROM body where mail_id=?");
  my $sthh = $dbh->prepare("SELECT lines FROM header where mail_id=?");

  while (1) {
    my @cond;
    push @cond, "mail_id>=$min_id" if ($min_id > 0);
    push @cond, "mail_id<=$max_id" if (defined $max_id);
    my $where = @cond ? "WHERE " . join(" AND ", @cond) : "";

    # Fetch the whole batch of ids up front: testing the returned list is a
    # dependable way to detect the end of the range, whereas the row count
    # of a SELECT handle is not guaranteed by DBI before fetching.
    my $ids = $dbh->selectcol_arrayref(
      "SELECT mail_id FROM mail $where ORDER BY mail_id $limit");
    die $dbh->errstr if (!defined $ids);
    last if (!@$ids);

    # Open the transaction only once there is work to do, so that leaving
    # the loop never abandons an open transaction.
    $dbh->begin_work;
    my $count = 0;
    foreach my $mail_id (@$ids) {
      $min_id = $mail_id if ($min_id < $mail_id);
      $count++;
      $sthb->execute($mail_id);
      my ($body) = $sthb->fetchrow_array;
      $sthh->execute($mail_id);
      my ($header) = $sthh->fetchrow_array;
      $body = decode_dbtxt($body);
      $header = decode_dbtxt($header);
      $header = Manitou::Words::header_contents_to_ftidx($header);
      index_words($dbh, $mail_id, \$body, \$header);
    }
    print "max mail_id=$min_id\n" unless ($opt_quiet);
    print "Flushing word vectors..." unless ($opt_quiet);
    flush_word_vectors($dbh);
    clear_word_vectors();
    $dbh->commit;
    print "$count messages reindexed (up to mail_id=$min_id)\n" unless ($opt_quiet);

    # The next batch starts right after the highest mail_id seen so far.
    $min_id++;
  }
  print "done\n" unless ($opt_quiet);
}
sub hash_attachments {

0 comments on commit 99acfe8

Please sign in to comment.