Merge #4631

4631: Split the field id map from the weight of each fields r=Kerollmops a=irevoire # Pull Request ## Related issue Fixes #4484 ## What does this PR do? - Make the (internal) searchable fields database always contain the searchable fields (instead of None when the user-defined searchable fields were not defined) - Introduce a new « fieldids_weights_map » that does the mapping between a fieldId and its Weight - Ensure that when two searchable fields are swapped, the field ID map doesn't change anymore (and thus, doesn't re-index) - Uses the weight instead of the order of the searchable fields in the attribute ranking rule at search time - When no searchable attributes are defined, make all their weights equal to zero - When a field is declared as searchable and contains nested fields, all its subfields share the same weight ## Impact on relevancy ### When no searchable attributes are declared When no searchable attributes are declared, all the fields have the same importance instead of randomly giving more importance to the field we've encountered « the most early » in the life of the index. This means that before this PR, send the following json: ```json [ { "id": 0, "name": "kefir", "color": "white" }, { "id": 1, "name": "white", "last name": "spirit" } ] ``` Would make the field `name` more important than the field `color` or `last name`. This means that searching for `white` would make the document `1` automatically higher ranked than the document `0`. After this PR, all the fields have the same weight, and none are considered more important than others. ### When a nested field is made searchable The second behavior change that happened with this PR is in the case you're sending this document, for example: ```json { "id": 0, "name": "tamo", "doggo": { "name": "kefir", "surname": "le kef" }, "catto": "gromez" } ``` Previously, defining the searchable attributes as: `["tamo", "doggo", "catto"]` was actually defining the « real » searchable attributes in the engine as: `["tamo", "doggo", "catto", "doggo.name", "doggo.surname"]`, which means that `doggo.name` and `doggo.surname` were _NOT_ where the user expected them and had completely different weights than `doggo`. In this PR all the weights have been unified, and the « real » searchable fields look like this: ```json [ "tamo", "doggo", "doggo.name", "doggo.surname", "catto"] ^^^^ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ ^^^^^ Weight 0 Weight 1 Weight 2 Co-authored-by: Tamo <tamo@meilisearch.com>
meilisearch · May 16, 2024 · 7c19c07 · 7c19c07
2 parents 76bb6d5 + 673b6e1
commit 7c19c07
Show file tree

Hide file tree

Showing 27 changed files with 765 additions and 185 deletions.
diff --git a/index-scheduler/src/utils.rs b/index-scheduler/src/utils.rs
@@ -272,9 +272,9 @@ pub fn swap_index_uid_in_task(task: &mut Task, swap: (&str, &str)) {
     }
     for index_uid in index_uids {
         if index_uid == swap.0 {
-            *index_uid = swap.1.to_owned();
+            swap.1.clone_into(index_uid);
         } else if index_uid == swap.1 {
-            *index_uid = swap.0.to_owned();
+            swap.0.clone_into(index_uid);
         }
     }
 }

diff --git a/meilisearch/src/search.rs b/meilisearch/src/search.rs
@@ -730,7 +730,7 @@ pub fn perform_search(
         let mut ids = BTreeSet::new();
         for attr in attrs {
             if attr == "*" {
-                ids = displayed_ids.clone();
+                ids.clone_from(&displayed_ids);
                 break;
             }
 

diff --git a/meilisearch/src/search_queue.rs b/meilisearch/src/search_queue.rs
@@ -85,8 +85,13 @@ impl SearchQueue {
                 },
 
                 search_request = receive_new_searches.recv() => {
-                    // this unwrap is safe because we're sure the `SearchQueue` still lives somewhere in actix-web
-                    let search_request = search_request.unwrap();
+                    let search_request = match search_request {
+                        Some(search_request) => search_request,
+                        // This should never happen while actix-web is running, but it's not a reason to crash
+                        // and it can generate a lot of noise in the tests.
+                        None => continue,
+                    };
+
                     if searches_running < usize::from(parallelism) && queue.is_empty() {
                         searches_running += 1;
                         // if the search requests die it's not a hard error on our side

diff --git a/meilisearch/tests/search/hybrid.rs b/meilisearch/tests/search/hybrid.rs
@@ -85,8 +85,8 @@ async fn simple_search() {
         )
         .await;
     snapshot!(code, @"200 OK");
-    snapshot!(response["hits"], @r###"[{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":[1.0,2.0]},"_rankingScore":0.996969696969697},{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":[2.0,3.0]},"_rankingScore":0.996969696969697},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":[1.0,3.0]},"_rankingScore":0.9472135901451112}]"###);
-    snapshot!(response["semanticHitCount"], @"1");
+    snapshot!(response["hits"], @r###"[{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":[2.0,3.0]},"_rankingScore":0.990290343761444},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":[1.0,2.0]},"_rankingScore":0.9848484848484848},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":[1.0,3.0]},"_rankingScore":0.9472135901451112}]"###);
+    snapshot!(response["semanticHitCount"], @"2");
 
     let (response, code) = index
         .search_post(
@@ -331,7 +331,7 @@ async fn query_combination() {
     .await;
 
     snapshot!(code, @"200 OK");
-    snapshot!(response["hits"], @r###"[{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":[1.0,2.0]},"_rankingScore":0.996969696969697},{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":[2.0,3.0]},"_rankingScore":0.996969696969697},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":[1.0,3.0]},"_rankingScore":0.8848484848484849}]"###);
+    snapshot!(response["hits"], @r###"[{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":[1.0,2.0]},"_rankingScore":0.9848484848484848},{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":[2.0,3.0]},"_rankingScore":0.9848484848484848},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":[1.0,3.0]},"_rankingScore":0.9242424242424242}]"###);
     snapshot!(response["semanticHitCount"], @"null");
 
     // query + vector, no hybrid keyword =>
@@ -374,6 +374,6 @@ async fn query_combination() {
         .await;
 
     snapshot!(code, @"200 OK");
-    snapshot!(response["hits"], @r###"[{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":[1.0,2.0]},"_rankingScore":0.9848484848484848}]"###);
+    snapshot!(response["hits"], @r###"[{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":[1.0,2.0]},"_rankingScore":0.9242424242424242}]"###);
     snapshot!(response["semanticHitCount"], @"0");
 }
diff --git a/meilisearch/tests/search/mod.rs b/meilisearch/tests/search/mod.rs
@@ -921,7 +921,7 @@ async fn test_score_details() {
                         "order": 3,
                         "attributeRankingOrderScore": 1.0,
                         "queryWordDistanceScore": 0.8095238095238095,
-                        "score": 0.9727891156462584
+                        "score": 0.8095238095238095
                       },
                       "exactness": {
                         "order": 4,

diff --git a/meilisearch/tests/search/restrict_searchable.rs b/meilisearch/tests/search/restrict_searchable.rs
@@ -285,10 +285,10 @@ async fn attributes_ranking_rule_order() {
                 @r###"
             [
               {
-                "id": "2"
+                "id": "1"
               },
               {
-                "id": "1"
+                "id": "2"
               }
             ]
             "###

diff --git a/meilisearch/tests/snapshot/mod.rs b/meilisearch/tests/snapshot/mod.rs
@@ -1,6 +1,5 @@
 use std::time::Duration;
 
-use actix_rt::time::sleep;
 use meili_snap::{json_string, snapshot};
 use meilisearch::option::ScheduleSnapshot;
 use meilisearch::Opt;
@@ -53,11 +52,29 @@ async fn perform_snapshot() {
 
     index.load_test_set().await;
 
-    server.index("test1").create(Some("prim")).await;
+    let (task, code) = server.index("test1").create(Some("prim")).await;
+    meili_snap::snapshot!(code, @"202 Accepted");
 
-    index.wait_task(2).await;
+    index.wait_task(task.uid()).await;
+
+    // wait for the _next task_ to process, aka the snapshot that should be enqueued at some point
 
-    sleep(Duration::from_secs(2)).await;
+    println!("waited for the next task to finish");
+    let now = std::time::Instant::now();
+    let next_task = task.uid() + 1;
+    loop {
+        let (value, code) = index.get_task(next_task).await;
+        dbg!(&value);
+        if code != 404 && value["status"].as_str() == Some("succeeded") {
+            break;
+        }
+
+        if now.elapsed() > Duration::from_secs(30) {
+            panic!("The snapshot didn't schedule in 30s even though it was supposed to be scheduled every 2s: {}",
+                serde_json::to_string_pretty(&value).unwrap()
+            );
+        }
+    }
 
     let temp = tempfile::tempdir().unwrap();
 

diff --git a/milli/examples/search.rs b/milli/examples/search.rs
@@ -48,7 +48,7 @@ fn main() -> Result<(), Box<dyn Error>> {
 
             let start = Instant::now();
 
-            let mut ctx = SearchContext::new(&index, &txn);
+            let mut ctx = SearchContext::new(&index, &txn)?;
             let universe = filtered_universe(&ctx, &None)?;
 
             let docs = execute_search(

diff --git a/milli/src/error.rs b/milli/src/error.rs
@@ -32,6 +32,8 @@ pub enum InternalError {
     DatabaseClosing,
     #[error("Missing {} in the {db_name} database.", key.unwrap_or("key"))]
     DatabaseMissingEntry { db_name: &'static str, key: Option<&'static str> },
+    #[error("Missing {key} in the fieldids weights mapping.")]
+    FieldidsWeightsMapMissingEntry { key: FieldId },
     #[error(transparent)]
     FieldIdMapMissingEntry(#[from] FieldIdMapMissingEntry),
     #[error("Missing {key} in the field id mapping.")]

diff --git a/milli/src/fieldids_weights_map.rs b/milli/src/fieldids_weights_map.rs
@@ -0,0 +1,48 @@
+//! The fieldids weights map is in charge of storing linking the searchable fields with their weights.
+
+use std::collections::HashMap;
+
+use serde::{Deserialize, Serialize};
+
+use crate::{FieldId, FieldsIdsMap, Weight};
+
+#[derive(Debug, Default, Serialize, Deserialize)]
+pub struct FieldidsWeightsMap {
+    map: HashMap<FieldId, Weight>,
+}
+
+impl FieldidsWeightsMap {
+    /// Insert a field id -> weigth into the map.
+    /// If the map did not have this key present, `None` is returned.
+    /// If the map did have this key present, the value is updated, and the old value is returned.
+    pub fn insert(&mut self, fid: FieldId, weight: Weight) -> Option<Weight> {
+        self.map.insert(fid, weight)
+    }
+
+    /// Create the map from the fields ids maps.
+    /// Should only be called in the case there are NO searchable attributes.
+    /// All the fields will be inserted in the order of the fields ids map with a weight of 0.
+    pub fn from_field_id_map_without_searchable(fid_map: &FieldsIdsMap) -> Self {
+        FieldidsWeightsMap { map: fid_map.ids().map(|fid| (fid, 0)).collect() }
+    }
+
+    /// Removes a field id from the map, returning the associated weight previously in the map.
+    pub fn remove(&mut self, fid: FieldId) -> Option<Weight> {
+        self.map.remove(&fid)
+    }
+
+    /// Returns weight corresponding to the key.
+    pub fn weight(&self, fid: FieldId) -> Option<Weight> {
+        self.map.get(&fid).copied()
+    }
+
+    /// Returns highest weight contained in the map if any.
+    pub fn max_weight(&self) -> Option<Weight> {
+        self.map.values().copied().max()
+    }
+
+    /// Return an iterator visiting all field ids in arbitrary order.
+    pub fn ids(&self) -> impl Iterator<Item = FieldId> + '_ {
+        self.map.keys().copied()
+    }
+}