diff --git a/search/README.txt b/search/README.txt index 4841d3e4863d5..379a27277c869 100644 --- a/search/README.txt +++ b/search/README.txt @@ -1,3 +1,25 @@ +2006/08/21 +---------- +Fixed index document count, and created new config variable to store +the size. (Search now has 3 global vars in $CFG, date, size and complete, +see indexer.php for var names). Index size is cached to provide an always +current value for the index - this is to take into account the fact that +deleted documents are in fact not removed from the index, but instead just +marked as deleted and not returned in search results. The actual document +still features in the index, and skews sizes. When the index optimiser is +completed in ZFS, then these deleted documents will be pruned, thus +correctly modifying the index size. + +Additional commenting added. + +Query page logic very slightly modified to clean up GET string a bit (removed +'p' variable). + +Add/delete functions added to other document types. + +A few TODO fields added to source, indicating changes still to come (or at +least to be considered). + 2006/08/16 ---------- Add/delete/update cron functions finished - can be called seperately diff --git a/search/add.php b/search/add.php index b862fdda02850..b430316aa5e25 100644 --- a/search/add.php +++ b/search/add.php @@ -22,11 +22,18 @@ $dbcontrol = new IndexDBControl(); $addition_count = 0; + $indexdate = $CFG->search_indexer_run_date; + mtrace('
Starting index update (additions)...');
-  mtrace('Index size before: '.$index->count()."\n");
+  mtrace('Index size before: '.$CFG->search_index_size."\n");
   
+  //get all modules
   if ($mods = get_records_select('modules')) {
+  //append virtual modules onto array
+  $mods = array_merge($mods, search_get_additional_modules());
+   
   foreach ($mods as $mod) {
+    //build include file and function names
     $class_file = $CFG->dirroot.'/search/documents/'.$mod->name.'_document.php';    
     $db_names_function = $mod->name.'_db_names';
     $get_document_function = $mod->name.'_single_document';
@@ -35,22 +42,29 @@
     if (file_exists($class_file)) {
       require_once($class_file);
     
+      //if both required functions exist
       if (function_exists($db_names_function) and function_exists($get_document_function)) {
         mtrace("Checking $mod->name module for additions.");
         $values = $db_names_function();
+        $where = (!empty($values[4])) ? ' and '.preg_replace('/^\s*(where|and)\s+/i', '', $values[4]) : ''; //optional extra condition from the db_names function: strip any leading WHERE/AND and chain it with "and", because the query below already has its own where clause
         
-        $sql = "select id, ".$values[0]." as docid from ".$values[1]."
-                where id not in
-                (select docid from ".SEARCH_DATABASE_TABLE." where doctype like '$mod->name')";
+        //select records in MODULE table, but not in SEARCH_DATABASE_TABLE
+        $sql =  "select id, ".$values[0]." as docid from ".$values[1].
+                " where id not in".
+                " (select docid from ".SEARCH_DATABASE_TABLE." where doctype like '$mod->name')".
+                " and ".$values[2]." > $indexdate".
+                " $where";                 
 
         $records = get_records_sql($sql);     
         
+        //foreach record, build a module specific search document using the get_document function
         if (is_array($records)) {       
           foreach($records as $record) {
             $additions[] = $get_document_function($record->id);
           } //foreach
         } //if    
           
+        //foreach document, add it to the index and database table
         foreach ($additions as $add) {
           ++$addition_count;
           
@@ -74,9 +88,11 @@
   //commit changes
   $index->commit();
   
-  //update index date
+  //update index date and size
   set_config("search_indexer_run_date", time());
+  set_config("search_index_size", (int)$CFG->search_index_size + (int)$addition_count);
 
+  //print some additional info
   mtrace("Added $addition_count documents.");
   mtrace('Index size after: '.$index->count().'
'); diff --git a/search/cron.php b/search/cron.php index 68e867745c350..245ffa1a48be9 100644 --- a/search/cron.php +++ b/search/cron.php @@ -1,5 +1,17 @@ dirroot/search/lib.php"); diff --git a/search/delete.php b/search/delete.php index a7aae71eb4855..7f8ef834058bb 100644 --- a/search/delete.php +++ b/search/delete.php @@ -23,10 +23,13 @@ $deletion_count = 0; mtrace('
Starting clean-up of removed records...');
-  mtrace('Index size before: '.$index->count()."\n");
+  mtrace('Index size before: '.$CFG->search_index_size."\n");
   
   if ($mods = get_records_select('modules')) {
+  $mods = array_merge($mods, search_get_additional_modules());
+  
   foreach ($mods as $mod) {
+    //build function names
     $class_file = $CFG->dirroot.'/search/documents/'.$mod->name.'_document.php';
     $delete_function = $mod->name.'_delete';
     $db_names_function = $mod->name.'_db_names';
@@ -39,13 +42,14 @@
         mtrace("Checking $mod->name module for deletions.");
         $values = $db_names_function();
         
-        $sql = "select id, docid from ".SEARCH_DATABASE_TABLE."
-                where doctype like '$mod->name'
-                and docid not in
-                (select ".$values[0]." from ".$values[1].")";
+        $sql = "select id, docid from ".SEARCH_DATABASE_TABLE.
+                " where doctype like '$mod->name'".
+                " and docid not in".
+                " (select ".$values[0]." from ".$values[1].")";
 
         $records = get_records_sql($sql);     
         
+        //build an array of all the deleted records
         if (is_array($records)) {       
           foreach($records as $record) {
             $deletions[] = $delete_function($record->docid);
@@ -53,6 +57,7 @@
         } //if    
           
         foreach ($deletions as $delete) {        
+          //find the specific document in the index, using its docid and doctype as keys
           $doc = $index->find("+docid:$delete +doctype:$mod->name");            
           
           //get the record, should only be one
@@ -60,6 +65,7 @@
             ++$deletion_count;
             mtrace("  Delete: $thisdoc->title (database id = $thisdoc->dbid, index id = $thisdoc->id, moodle instance id = $thisdoc->docid)");
             
+            //remove it from index and database table
             $dbcontrol->delDocument($thisdoc);
             $index->delete($thisdoc->id);              
           } //foreach
@@ -74,8 +80,9 @@
   //commit changes
   $index->commit();
   
-  //update index date
+  //update index date and index size
   set_config("search_indexer_run_date", time());
+  set_config("search_index_size", (int)$CFG->search_index_size - (int)$deletion_count);
 
   mtrace("Finished $deletion_count removals.");
   mtrace('Index size after: '.$index->count().'
'); diff --git a/search/documents/document.php b/search/documents/document.php index fdb741afa13ef..98e732a2dfe88 100644 --- a/search/documents/document.php +++ b/search/documents/document.php @@ -12,6 +12,7 @@ public function __construct(&$doc, &$data, $document_type, $course_id, $group_id $this->addField(Zend_Search_Lucene_Field::UnIndexed('url', $doc->url)); $this->addField(Zend_Search_Lucene_Field::UnIndexed('date', $doc->date)); + //additional data added on a per-module basis $this->addField(Zend_Search_Lucene_Field::Binary('data', serialize($data))); $this->addField(Zend_Search_Lucene_Field::Keyword('doctype', $document_type)); diff --git a/search/documents/forum_document.php b/search/documents/forum_document.php index 92786f4b49890..84904f5a29179 100644 --- a/search/documents/forum_document.php +++ b/search/documents/forum_document.php @@ -68,6 +68,30 @@ function forum_get_content_for_index(&$forum) { return $documents; } //forum_get_content_for_index + //returns a single forum search document based on a forum_entry id + function forum_single_document($id) { + $posts = get_recordset('forum_posts', 'id', $id); + $post = $posts->fields; + + $discussions = get_recordset('forum_discussions', 'id', $post['discussion']); + $discussion = $discussions->fields; + + $forums = get_recordset('forum', 'id', $discussion['forum']); + $forum = $forums->fields; + + return new ForumSearchDocument($post, $forum['id'], $forum['course'], $post['groupid']); + } //forum_single_document + + function forum_delete($info) { + return $info; + } //forum_delete + + //returns the var names needed to build a sql query for addition/deletions + function forum_db_names() { + //[primary id], [table name], [time created field name], [time modified field name] + return array('id', 'forum_posts', 'created', 'modified'); + } //forum_db_names + //reworked faster version from /mod/forum/lib.php function forum_get_discussions_fast($forum) { global $CFG, $USER; diff --git 
a/search/documents/glossary_document.php b/search/documents/glossary_document.php index 6ce87eb11af75..2147e880a7ac3 100644 --- a/search/documents/glossary_document.php +++ b/search/documents/glossary_document.php @@ -7,7 +7,6 @@ * */ require_once("$CFG->dirroot/search/documents/document.php"); - //require_once("$CFG->dirroot/mod/glossary/lib.php"); class GlossarySearchDocument extends SearchDocument { public function __construct(&$entry, $glossary_id, $course_id, $group_id) { @@ -63,6 +62,7 @@ function glossary_get_content_for_index(&$glossary) { return $documents; } //glossary_get_content_for_index + //returns a single glossary search document based on a glossary_entry id function glossary_single_document($id) { $entries = get_recordset('glossary_entries', 'id', $id); $entry = $entries->fields; @@ -73,12 +73,16 @@ function glossary_single_document($id) { return new GlossarySearchDocument($entry, $entry['glossaryid'], $glossary['course'], -1); } //glossary_single_document + //dummy delete function that converts docid from the search table to itself.. + //this was here for a reason, but I can't remember it at the moment. 
function glossary_delete($info) { return $info; } //glossary_delete + //returns the var names needed to build a sql query for addition/deletions function glossary_db_names() { - return array('id', 'glossary_entries', 'timemodified'); + //[primary id], [table name], [time created field name], [time modified field name] + return array('id', 'glossary_entries', 'timecreated', 'timemodified'); } //glossary_db_names ?> \ No newline at end of file diff --git a/search/documents/resource_document.php b/search/documents/resource_document.php index 204efc8e72878..db15d6a2b8101 100644 --- a/search/documents/resource_document.php +++ b/search/documents/resource_document.php @@ -58,4 +58,29 @@ function resource_get_content_for_index(&$notneeded) { return $documents; } //resource_get_content_for_index + //returns a single resource search document based on a resource_entry id + function resource_single_document($id) { + $resources = get_recordset_sql('SELECT * + FROM `resource` + WHERE alltext NOT LIKE "" + AND alltext NOT LIKE " " + AND alltext NOT LIKE " " + AND TYPE != "file", + AND id = '.$id); + + $resource = $resources->fields; + + return new ResourceSearchDocument($resource); + } //resource_single_document + + function resource_delete($info) { + return $info; + } //resource_delete + + //returns the var names needed to build a sql query for addition/deletions + function resource_db_names() { + //[primary id], [table name], [time created field name], [time modified field name], [additional where conditions for sql] + return array('id', 'resource', 'timemodified', 'timemodified', "WHERE alltext NOT LIKE '' AND alltext NOT LIKE ' ' AND alltext NOT LIKE ' ' AND TYPE != 'file'"); + } //resource_db_names + ?> \ No newline at end of file diff --git a/search/documents/wiki_document.php b/search/documents/wiki_document.php index 6bdf0ad4707c7..ad207b63fc052 100644 --- a/search/documents/wiki_document.php +++ b/search/documents/wiki_document.php @@ -134,4 +134,25 @@ function 
wiki_get_content_for_index(&$wiki) { return $documents; } //wiki_get_content_for_index + //returns a single wiki search document based on a wiki_entry id + function wiki_single_document($id) { + $pages = get_recordset('wiki_pages', 'id', $id); + $page = $pages->fields; + + $entries = get_recordset('wiki_entries', 'id', $page['wiki']); + $entry = $entries->fields; + + return new WikiSearchDocument($page, $entry['wikiid'], $entry['course'], $entry['groupid']); + } //wiki_single_document + + function wiki_delete($info) { + return $info; + } //wiki_delete + + //returns the var names needed to build a sql query for addition/deletions + function wiki_db_names() { + //[primary id], [table name], [time created field name], [time modified field name] + return array('id', 'wiki_pages', 'created', 'lastmodified'); + } //wiki_db_names + ?> \ No newline at end of file diff --git a/search/indexer.php b/search/indexer.php index 874775e2f902b..24f329ab48fcd 100644 --- a/search/indexer.php +++ b/search/indexer.php @@ -92,15 +92,17 @@ // * mod_get_content_for_index //are the sole basis for including a module in the index at the moment. - if ($mods = get_records_select('modules' /*'index this module?' where statement*/)) { - $mods = array_merge($mods, search_get_additional_modules()); + if ($mods = get_records_select('modules' /*'index this module?' 
where statement*/)) { + //add virtual modules onto the back of the array + $mods = array_merge($mods, search_get_additional_modules()); - foreach ($mods as $mod) { + foreach ($mods as $mod) { $class_file = $CFG->dirroot.'/search/documents/'.$mod->name.'_document.php'; if (file_exists($class_file)) { include_once($class_file); + //build function names $iter_function = $mod->name.'_iterator'; $index_function = $mod->name.'_get_content_for_index'; @@ -163,5 +165,8 @@ //mark the time we last updated set_config("search_indexer_run_date", time()); - + + //and the index size + set_config("search_index_size", (int)$index->count()); + ?> \ No newline at end of file diff --git a/search/indexlib.php b/search/indexlib.php index 305768c00723e..f2335921b0e5f 100644 --- a/search/indexlib.php +++ b/search/indexlib.php @@ -24,13 +24,15 @@ public function __construct($path=SEARCH_INDEX_PATH) { $this->path = $path; + //test to see if there is a valid index on disk, at the specified path try { $test_index = new Zend_Search_Lucene($this->path, false); $validindex = true; } catch(Exception $e) { $validindex = false; } //catch - + + //retrieve file system info about the index if it is valid if ($validindex) { $this->size = display_size(get_directory_size($this->path)); $index_dir = get_directory_list($this->path, '', false, false); @@ -42,11 +44,16 @@ public function __construct($path=SEARCH_INDEX_PATH) { $this->indexcount = 0; } //else - $db_exists = false; + $db_exists = false; //for now + //get all the current tables in moodle $admin_tables = $db->MetaTables(); + //TODO: use new IndexDBControl class for database checks? 
+ + //check if our search table exists if (in_array($CFG->prefix.SEARCH_DATABASE_TABLE, $admin_tables)) { + //retrieve database information if it does $db_exists = true; //total documents @@ -65,12 +72,14 @@ public function __construct($path=SEARCH_INDEX_PATH) { $this->types = array(); } //else + //check if the busy flag is set if ($CFG->search_indexer_busy == '1') { $this->complete = false; } else { $this->complete = true; } //if + //get the last run date for the indexer if ($this->valid() && $CFG->search_indexer_run_date) { $this->time = $CFG->search_indexer_run_date; } else { @@ -78,6 +87,7 @@ public function __construct($path=SEARCH_INDEX_PATH) { } //else } //__construct + //returns false on error, and the error message via referenced variable $err public function valid(&$err=null) { $err = array(); $ret = true; @@ -100,6 +110,7 @@ public function valid(&$err=null) { return $ret; } //valid + //is the index dir valid public function is_valid_dir() { if ($this->filecount > 0) { return true; @@ -108,6 +119,7 @@ public function is_valid_dir() { } //else } //is_valid_dir + //is the db table valid public function is_valid_db() { if ($this->dbcount > 0) { return true; @@ -116,6 +128,7 @@ public function is_valid_db() { } //else } //is_valid_db + //shorthand get method for the class variables public function __get($var) { if (in_array($var, array_keys(get_class_vars(get_class($this))))) { return $this->$var; @@ -126,9 +139,11 @@ public function __get($var) { /* DB Index control class * + * Used to control the search index database table * */ class IndexDBControl { + //does the table exist? public function checkTableExists() { global $CFG, $db; @@ -142,6 +157,7 @@ public function checkTableExists() { } //else } //checkTableExists + //is our database setup valid? 
public function checkDB() { global $CFG, $db; @@ -159,6 +175,7 @@ public function checkDB() { return $ret; } //checkDB + //add a document record to the table public function addDocument($document=null) { global $db; @@ -182,6 +199,7 @@ public function addDocument($document=null) { return $id; } //addDocument + //remove a document record from the index public function delDocument($document) { global $db; diff --git a/search/query.php b/search/query.php index 82ea337a50193..2bd6f5b3c07be 100644 --- a/search/query.php +++ b/search/query.php @@ -27,18 +27,22 @@ //check for php5, but don't die yet (see line 52) if ($check = search_check_php5()) { - require_once("$CFG->dirroot/search/querylib.php"); + require_once("$CFG->dirroot/search/querylib.php"); - $advanced = (optional_param('a', '0', PARAM_INT) == '1') ? true : false; - $pages = (optional_param('p', '0', PARAM_INT) == '1') ? true : false; + $page_number = optional_param('page', -1, PARAM_INT); + $pages = ($page_number == -1) ? false : true; + $advanced = (optional_param('a', '0', PARAM_INT) == '1') ? true : false; $query_string = optional_param('query_string', '', PARAM_CLEAN); if ($pages && isset($_SESSION['search_advanced_query'])) { + //if both are set, then we are busy browsing through the result pages of an advanced query $adv = unserialize($_SESSION['search_advanced_query']); } else if ($advanced) { + //otherwise we are dealing with a new advanced query unset($_SESSION['search_advanced_query']); session_unregister('search_advanced_query'); + //retrieve advanced query variables $adv->mustappear = trim(optional_param('mustappear', '', PARAM_CLEAN), $chars); $adv->notappear = trim(optional_param('notappear', '', PARAM_CLEAN), $chars); $adv->canappear = trim(optional_param('canappear', '', PARAM_CLEAN), $chars); @@ -48,47 +52,59 @@ } //else if ($advanced) { + //parse the advanced variables into a query string + //TODO: move out to external query class (QueryParse?) 
+ + //chars to strip from strings (whitespace) $chars = ' \t\n\r\0\x0B,;'; $query_string = ''; + //get all available module types $module_types = array_merge(array('All'), array_values(search_get_document_types())); $adv->module = in_array($adv->module, $module_types) ? $adv->module : 'All'; + //convert '1 2' into '+1 +2' for required words field if (strlen(trim($adv->mustappear)) > 0) { $query_string = ' +'.implode(' +', preg_split("/[\s,;]+/", $adv->mustappear)); } //if + //convert '1 2' into '-1 -2' for not wanted words field if (strlen(trim($adv->notappear)) > 0) { $query_string .= ' -'.implode(' -', preg_split("/[\s,;]+/", $adv->notappear)); } //if + //this field is left untouched, apart from whitespace being stripped if (strlen(trim($adv->canappear)) > 0) { $query_string .= ' '.implode(' ', preg_split("/[\s,;]+/", $adv->canappear)); } //if + //add module restriction if ($adv->module != 'All') { $query_string .= ' +doctype:'.$adv->module; } //if + //create title search string if (strlen(trim($adv->title)) > 0) { $query_string .= ' +title:'.implode(' +title:', preg_split("/[\s,;]+/", $adv->title)); } //if + //create author search string if (strlen(trim($adv->author)) > 0) { $query_string .= ' +author:'.implode(' +author:', preg_split("/[\s,;]+/", $adv->author)); } //if + //save our options if the query is valid if (!empty($query_string)) { $_SESSION['search_advanced_query'] = serialize($adv); } //if } //if - $page_number = optional_param('page', 1, PARAM_INT); - + //normalise page number if ($page_number < 1) { $page_number = 1; - } //if + } //if + //run the query against the index $sq = new SearchQuery($query_string, $page_number, 10, true); } //if @@ -204,7 +220,8 @@ print 'Searching: '; if ($sq->is_valid_index()) { - print $sq->index_count(); + //use cached variable to show up-to-date index size (takes deletions into account) + print $CFG->search_index_size; } else { print "0"; } //else @@ -235,7 +252,10 @@ $hits = $sq->results(); if ($advanced) { - 
$page_links = preg_replace("/query_string=[^&]+/", 'a=1&p=1', $page_links); + //if in advanced mode, search options are saved in the session, so + //we can remove the query string var from the page links, and replace + //it with a=1 (Advanced = on) instead + $page_links = preg_replace("/query_string=[^&]+/", 'a=1', $page_links); } //if print "
    "; diff --git a/search/querylib.php b/search/querylib.php index c31a0dfd9e935..87007ff361dad 100644 --- a/search/querylib.php +++ b/search/querylib.php @@ -274,9 +274,10 @@ public function count() { return count($this->results); } //count - public function index_count() { - return $this->index->count(); - } //index_count + //this shouldn't be in this class + //public function index_count() { + // return $this->index->count(); + //} //index_count public function is_valid() { return ($this->validquery and $this->validindex); diff --git a/search/stats.php b/search/stats.php index 33ee471ff06d9..f54d72aceec50 100644 --- a/search/stats.php +++ b/search/stats.php @@ -87,10 +87,14 @@ $table->data[] = array('Database', 'search_documents'); - //add an extra field if we're admin + //add extra fields if we're admin if (isadmin()) { //don't want to confuse users if the two totals don't match (hint: they should) $table->data[] = array('Documents in index', $indexinfo->indexcount); + + //*cough* they should match if deletions were actually removed from the index, + //as it turns out, they're only marked as deleted and not returned in search results + $table->data[] = array('Deletions in index', (int)$indexinfo->indexcount - (int)$indexinfo->dbcount); } //if $table->data[] = array('Documents in database', $indexinfo->dbcount); diff --git a/search/update.php b/search/update.php index 6b11e4d0db63e..d784acd4f1318 100644 --- a/search/update.php +++ b/search/update.php @@ -27,6 +27,8 @@ mtrace("
    Starting index update (updates)...\n");  
       
       if ($mods = get_records_select('modules')) {
    +  $mods = array_merge($mods, search_get_additional_modules());
    +  
       foreach ($mods as $mod) {
         $class_file = $CFG->dirroot.'/search/documents/'.$mod->name.'_document.php';
         $get_document_function = $mod->name.'_single_document';
    @@ -41,8 +43,10 @@
             mtrace("Checking $mod->name module for updates.");
             $values = $db_names_function();
             
    -        $sql = "select id, ".$values[0]." as docid from ".$values[1]."
    -                where ".$values[2]." > $indexdate";
    +        //only pick up records already indexed under this module's doctype (ids from other doctypes must not match); TODO: check 'in' syntax with other RDBMS' (add and update.php as well)
    +        $sql = "select id, ".$values[0]." as docid from ".$values[1].
    +               " where ".$values[3]." > $indexdate".
    +               " and id in (select docid from ".SEARCH_DATABASE_TABLE." where doctype like '$mod->name')";
                     
             $records = get_records_sql($sql);