Skip to content

Commit

Permalink
.Net: Improvements for Azure Cosmos DB for MongoDB connector (#6169)
Browse files Browse the repository at this point in the history
### Motivation and Context

<!-- Thank you for your contribution to the semantic-kernel repo!
Please help reviewers and future users by providing the following
information:
  1. Why is this change required?
  2. What problem does it solve?
  3. What scenario does it contribute to?
  4. If it fixes an open issue, please link to the issue here.
-->

### Description

<!-- Describe your changes, the overall approach, and the underlying design.
These notes will help reviewers understand how your code works. Thanks! -->

### Contribution Checklist

<!-- Before submitting this PR, please make sure: -->

- [x] The code builds clean without any errors or warnings
- [x] The PR follows the [SK Contribution
Guidelines](https://github.com/microsoft/semantic-kernel/blob/main/CONTRIBUTING.md)
and the [pre-submission formatting
script](https://github.com/microsoft/semantic-kernel/blob/main/CONTRIBUTING.md#development-scripts)
raises no violations
- [x] All unit tests pass, and I have added new tests where possible
- [x] I didn't break anyone 😄
  • Loading branch information
dmytrostruk committed May 8, 2024
1 parent 4c7fcb1 commit 5249aed
Show file tree
Hide file tree
Showing 4 changed files with 30 additions and 43 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -87,12 +87,7 @@ public async Task AzureCosmosDBMongoDBCacheAsync()
var kernel = GetKernelWithCache(_ => new AzureCosmosDBMongoDBMemoryStore(
TestConfiguration.AzureCosmosDbMongoDb.ConnectionString,
TestConfiguration.AzureCosmosDbMongoDb.DatabaseName,
new()
{
Kind = AzureCosmosDBVectorSearchType.VectorIVF,
Similarity = AzureCosmosDBSimilarityType.Cosine,
Dimensions = 1536
}));
new(dimensions: 1536)));

var result1 = await ExecuteAsync(kernel, "First run", "What's the tallest building in New York?");
var result2 = await ExecuteAsync(kernel, "Second run", "What is the highest building in New York City?");
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,82 +5,73 @@
namespace Microsoft.SemanticKernel.Connectors.AzureCosmosDBMongoDB;

/// <summary>
/// Get more details about Azure Cosmos Mongo vCore and these configs https://learn.microsoft.com/azure/cosmos-db/mongodb/vcore/vector-search
/// Azure Cosmos Mongo vCore configuration.
/// More information here: https://learn.microsoft.com/azure/cosmos-db/mongodb/vcore/vector-search.
/// </summary>
public class AzureCosmosDBMongoDBConfig
/// <remarks>
/// Initialize the <see cref="AzureCosmosDBMongoDBConfig"/> with default values.
/// </remarks>
public class AzureCosmosDBMongoDBConfig(int dimensions)
{
private const string DefaultIndexName = "default_index";

/// <summary>
/// Application name for the client for tracking and logging
/// </summary>
public string ApplicationName { get; set; }
public string ApplicationName { get; set; } = HttpHeaderConstant.Values.UserAgent;

/// <summary>
/// Index name for the Mongo vCore DB
/// Index name for the Mongo vCore DB. Default is "default_index".
/// </summary>
public string IndexName { get; set; }
public string IndexName { get; set; } = DefaultIndexName;

/// <summary>
/// Kind: Type of vector index to create.
/// Type of vector index to create.
/// Possible options are:
/// - vector-ivf
/// - vector-ivf (default)
/// - vector-hnsw: available as a preview feature only;
/// to enable it, visit https://learn.microsoft.com/azure/azure-resource-manager/management/preview-features
/// </summary>
public AzureCosmosDBVectorSearchType Kind { get; set; }
public AzureCosmosDBVectorSearchType Kind { get; set; } = AzureCosmosDBVectorSearchType.VectorIVF;

/// <summary>
/// NumLists: This integer is the number of clusters that the inverted file (IVF) index uses to group the vector data.
/// This integer is the number of clusters that the inverted file (IVF) index uses to group the vector data. Default is 1.
/// We recommend that numLists is set to documentCount/1000 for up to 1 million documents and to sqrt(documentCount)
/// for more than 1 million documents. Using a numLists value of 1 is akin to performing brute-force search, which has
/// limited performance.
/// </summary>
public int NumLists { get; set; }
public int NumLists { get; set; } = 1;

/// <summary>
/// Number of dimensions for vector similarity. The maximum number of supported dimensions is 2000.
/// </summary>
public int Dimensions { get; set; }
public int Dimensions { get; set; } = dimensions;

/// <summary>
/// Similarity: Similarity metric to use with the IVF index.
/// Similarity metric to use with the IVF index.
/// Possible options are:
/// - COS (cosine distance),
/// - COS (cosine distance, default),
/// - L2 (Euclidean distance), and
/// - IP (inner product).
/// </summary>
public AzureCosmosDBSimilarityType Similarity { get; set; }
public AzureCosmosDBSimilarityType Similarity { get; set; } = AzureCosmosDBSimilarityType.Cosine;

/// <summary>
/// NumberOfConnections: The max number of connections per layer (16 by default, minimum value is 2, maximum value is
/// The max number of connections per layer (16 by default, minimum value is 2, maximum value is
/// 100). Higher m is suitable for datasets with high dimensionality and/or high accuracy requirements.
/// </summary>
public int NumberOfConnections { get; set; }
public int NumberOfConnections { get; set; } = 16;

/// <summary>
/// EfConstruction: the size of the dynamic candidate list for constructing the graph (64 by default, minimum value is 4,
/// The size of the dynamic candidate list for constructing the graph (64 by default, minimum value is 4,
/// maximum value is 1000). Higher ef_construction will result in better index quality and higher accuracy, but it will
/// also increase the time required to build the index. EfConstruction has to be at least 2 * m (the NumberOfConnections value).
/// </summary>
public int EfConstruction { get; set; }
public int EfConstruction { get; set; } = 64;

/// <summary>
/// EfSearch: The size of the dynamic candidate list for search (40 by default). A higher value provides better recall at
/// The size of the dynamic candidate list for search (40 by default). A higher value provides better recall at
/// the cost of speed.
/// </summary>
public int EfSearch { get; set; }

/// <summary>
/// Initialize the AzureCosmosDBMongoDBConfig with default values
/// </summary>
public AzureCosmosDBMongoDBConfig()
{
this.ApplicationName = HttpHeaderConstant.Values.UserAgent;
this.IndexName = "default_index";
this.Kind = AzureCosmosDBVectorSearchType.VectorHNSW;
this.NumLists = 1;
this.Similarity = AzureCosmosDBSimilarityType.Cosine;
this.NumberOfConnections = 16;
this.EfConstruction = 64;
this.EfSearch = 40;
}
public int EfSearch { get; set; } = 40;
}
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,6 @@ public async Task ItCanCreateGetCheckAndDeleteCollectionAsync()
var collectionName = this._fixture.CollectionName;
var memoryStore = this._fixture.MemoryStore;

await memoryStore.CreateCollectionAsync(collectionName);
var collectionNames = memoryStore.GetCollectionsAsync();

Assert.True(await collectionNames.ContainsAsync(collectionName));
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -35,12 +35,14 @@ public AzureCosmosDBMongoDBMemoryStoreTestsFixture()
this.MemoryStore = new AzureCosmosDBMongoDBMemoryStore(
connectionString,
this.DatabaseName,
new AzureCosmosDBMongoDBConfig()
new AzureCosmosDBMongoDBConfig(dimensions: 3)
);
}

public async Task InitializeAsync()
{
await this.MemoryStore.CreateCollectionAsync(this.CollectionName);

await this
.MemoryStore.UpsertBatchAsync(this.CollectionName, DataHelper.VectorSearchTestRecords)
.ToListAsync();
Expand Down

0 comments on commit 5249aed

Please sign in to comment.