# Setup

In [1]:
import itertools

from bs4 import BeautifulSoup

from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.document_loaders import SitemapLoader
from langchain.docstore.document import Document
from langchain.text_splitter import TokenTextSplitter
from langchain.vectorstores import MomentoVectorIndex

from momento import PreviewVectorIndexClient
from momento.auth import CredentialProvider
from momento.config import VectorIndexConfigurations
from momento.responses.vector_index import Search

# Fixes bug with asyncio and jupyter notebooks.
# Required for SitemapLoader to work correctly.
import nest_asyncio

nest_asyncio.apply()


In [2]:
%load_ext dotenv
%dotenv


# Get website content

## Part 1: Find relevant URLs to scrape

We are going to read web page content. Let's start out with a helper to strip out irrelevant details:

In [3]:
def parse_content_fn(content: BeautifulSoup) -> str:
    """Return the visible text of a parsed page with boilerplate elements removed.

    Every element matching one of the selectors below is removed from
    ``content`` in place (the passed-in soup is mutated), then the remaining
    text is extracted and stripped of leading/trailing whitespace.
    """
    # (positional args, keyword args) for each ``find_all`` lookup, in order.
    boilerplate_selectors = [
        (("title",), {}),
        (("nav",), {}),
        (("div",), {"role": "region"}),
        (("div",), {"class_": "page-wrapper"}),
        (("div",), {"class_": "blog-post_newsletter"}),
        (("div",), {"class_": "blog-post-social-wrapper"}),
        (("section",), {"class_": "section-more-blog-posts"}),
        (("button",), {}),
        (("aside",), {}),
        ((), {"id": "faqs"}),
        (("header",), {}),
        (("footer",), {}),
    ]

    # Collect every match first, and only then start removing, so that all
    # lookups run against the unmodified tree.
    doomed = []
    for args, kwargs in boilerplate_selectors:
        doomed.extend(content.find_all(*args, **kwargs))

    for tag in doomed:
        tag.decompose()

    remaining_text = str(content.get_text())
    return remaining_text.strip()


We have a lot of rich information on the technical documentation site. Since it has a sitemap, we can scrape its pages using LangChain's SitemapLoader:

In [4]:
# Loader for the Momento docs site: pages come from its sitemap and are
# passed through parse_content_fn.
tech_docs_loader = SitemapLoader(
    parsing_function=parse_content_fn,
    web_path="https://docs.momentohq.com/sitemap.xml",
)


We also have a trove of documents from our blog. The main website has a sitemap too, so we reuse LangChain's SitemapLoader and filter the sitemap down to the blog URLs:

In [18]:
# Loader for the Momento blog: reads the main site's sitemap and keeps only
# URLs matching the blog pattern.
# NOTE(review): unlike tech_docs_loader, no parsing_function is passed here, so
# parse_content_fn is not applied to blog pages (the recorded
# blogs[1].page_content output still contains site navigation text).
# Confirm whether that is intentional.
blog_docs_loader = SitemapLoader(
    filter_urls=[r"https://www.gomomento.com/blog.*"],
    web_path="https://www.gomomento.com/sitemap.xml",
)

## Part 2: Scrape relevant URLs

Note: the next calls take 1 to 2 minutes to run.

In [6]:
# Load every page the docs sitemap loader yields (123 pages in the recorded run).
tech_docs = tech_docs_loader.load()


Fetching pages: 100%|##########| 123/123 [00:13<00:00,  9.41it/s]


In [7]:
# Spot-check the first scraped docs page.
tech_docs[0]


Document(page_content='Momento CacheAccelerate your app, reduce costs, and free your developers.Momento TopicsEnable real-time communication between different parts of a distributed system.Momento Vector IndexA serverless vector index for your AI-enabled applications.#1213Momento LeaderboardsA serverless leaderboard service', metadata={'source': 'https://docs.momentohq.com/', 'loc': 'https://docs.momentohq.com/', 'changefreq': 'weekly', 'priority': '0.5'})

In [26]:
blogs = blog_docs_loader.load()


# Gently preprocess the metadata
def trim_metadata(doc: Document) -> Document:
    """Return a copy of ``doc`` with whitespace stripped from its string metadata values.

    Non-string metadata values are passed through unchanged instead of
    raising ``AttributeError`` on ``.strip()``. The page content is not modified.

    Args:
        doc: The document whose metadata should be cleaned.

    Returns:
        A new ``Document`` with the same ``page_content`` and cleaned metadata.
    """
    metadata = {
        key: value.strip() if isinstance(value, str) else value
        for key, value in doc.metadata.items()
    }
    return Document(page_content=doc.page_content, metadata=metadata)


blogs = [trim_metadata(doc) for doc in blogs]

Fetching pages: 100%|##########| 134/134 [00:20<00:00,  6.61it/s]


In [28]:
# Spot-check the extracted text of one scraped blog post.
blogs[1].page_content

'3 crucial caching choices: Where, when, and how  — Momento\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nGet started with Momento Vector Index today!Get started with Momento Vector Index today!Solutions\n\nBy Use CaseBy CategoryBy Team TypeServicesMomento CacheMomento TopicsMomento Vector IndexServicesMomento CacheMomento TopicsMomento Vector IndexBuildUse CasesChatFront-End DevelopmentServerless AppsIndustriesAI/MLGamingMedia & EntertainmentIntegrationsDynamoDBMongoDBRedisBuildUse CasesChatFront-end DevelopmentServerless AppsIndustriesAI/MLGamingMedia & EntertainmentIntegrationsDynamoDBMongoDBRedisPricingResourcesCase StudiesFAQComplianceResourcesCase StudiesFAQMoCon RecapComplianceBlogDocsCompanyAbout UsJoin our NewsletterCompanyAbout UsJoin our NewsletterDocsLog InContact UsSolutions\n\nBy Use CaseBy CategoryBy Team TypeConsoleContact UsEN\n\nJPENJPCloseOctober 13, 2022 - 7 Min Read3 crucial caching choices: Where, when, and how The right questions determine the righ

In [29]:
# Combine both sources into a single list of documents for splitting.
docs = tech_docs + blogs


In [30]:
# Sanity check: the combined count should equal the sum of the two sources.
len(tech_docs), len(blogs), len(docs)


(123, 134, 257)

# Split data into smaller pieces in prep for Q&A

In [31]:
# Split every document into smaller chunks. The splitter is configured with
# chunk_size=128 and chunk_overlap=32, and with the same model name that is
# used for the embeddings later in the notebook.
text_splitter = TokenTextSplitter(
    model_name="text-embedding-ada-002",
    chunk_size=128,
    chunk_overlap=32,
)
split_docs = text_splitter.split_documents(docs)

# Show how many chunks were produced.
len(split_docs)


4358

# Load the data into MVI

Let's come up with meaningful IDs to apply to each chunk:

In [32]:
def gen_ids(docs: list[Document]) -> list[str]:
    """Build one ID per document chunk, of the form ``"<source>, chunk=<n>"``.

    ``n`` starts at 1 and counts how many chunks with the same
    ``metadata["source"]`` have been seen so far. The count is kept per source
    across the whole list, so IDs stay unique even when chunks from one source
    are not adjacent. When each source's chunks are contiguous, the result is
    the same as numbering each run of equal sources from 1.

    Args:
        docs: Documents whose ``metadata`` contains a ``"source"`` key.

    Returns:
        A list of ID strings, parallel to ``docs``.

    Raises:
        KeyError: If a document's metadata has no ``"source"`` entry.
    """
    chunks_seen: dict[str, int] = {}
    ids = []
    for doc in docs:
        source = doc.metadata["source"]
        chunk_number = chunks_seen.get(source, 0) + 1
        chunks_seen[source] = chunk_number
        ids.append(f"{source}, chunk={chunk_number}")
    return ids


Index the data in MVI using OpenAI text embeddings

In [33]:
# Create the Momento Vector Index client. The API key is read from the
# MOMENTO_API_KEY environment variable rather than being hardcoded.
# All names used here are imported in the notebook's first cell.
client = PreviewVectorIndexClient(
    configuration=VectorIndexConfigurations.Default.latest(),
    credential_provider=CredentialProvider.from_environment_variable("MOMENTO_API_KEY"),
)


In [39]:
# List the existing indexes before indexing (the recorded run shows none).
client.list_indexes()


ListIndexes.Success(indexes=[])

In [40]:
# Embedding model used both for indexing here and for the raw query later on.
embeddings = OpenAIEmbeddings(model="text-embedding-ada-002")  # type: ignore

# Use `split_docs` directly rather than rebinding `docs`, so that re-running
# earlier cells still sees the original, unsplit documents.
ids = gen_ids(split_docs)

# Index the chunks under the "momento" index name. In the recorded run no
# index is listed before this cell and a "momento" index is listed afterwards.
vector_store = MomentoVectorIndex.from_documents(
    split_docs,
    embedding=embeddings,
    client=client,
    index_name="momento",
    ids=ids,
)


# Query using the langchain integration

In [41]:
# Ask a question through the LangChain vector store; the result pairs each
# matching document with a score.
vector_store.similarity_search_with_score("How much does Momento cost?")


[(Document(page_content=" than cutting edge technology advancements, but at Momento, we believe it's a crucial part of the journey. Total Cost of Ownership (TCO) matters! Complex pricing models can make it difficult to project costs. We carefully crafted our pricing model with our primary design tenant being simplicity.\u200dHow is Momento priced? The Momento pricing model is simple and straightforward. A single pricing dimension of $0.50/GB inbound and outbound. Simply the amount of data that moves in and out of Momento Cache. And there's no separate storage charge to worry about.\u200dWe have a free tier to make it fast for", metadata={'loc': 'https://www.gomomento.com/blog/simple-the-way-cloud-pricing-should-be', 'source': 'https://www.gomomento.com/blog/simple-the-way-cloud-pricing-should-be'}),
  0.8557299971580505),
 (Document(page_content=" We believe in a simple, single-dimension pricing model that's easy to reason with as your app grows. We want Momento customers to be able to

# Query using the MVI client

In [42]:
# Take the Momento client from the vector store for direct queries.
# NOTE(review): `_client` is a private attribute, and `client` was already
# passed to from_documents when the store was built, so this presumably
# rebinds the same object — confirm whether this cell is still needed.
client = vector_store._client


In [43]:
# List indexes again; the recorded run now shows the "momento" index.
client.list_indexes()


ListIndexes.Success(indexes=[IndexInfo(name='momento', num_dimensions=1536, similarity_metric=<SimilarityMetric.COSINE_SIMILARITY: 'COSINE_SIMILARITY'>)])

In [44]:
# Embed the question with the same embeddings object that was passed to the vector store.
v = embeddings.embed_query("Can I use Momento with C#?")


In [45]:
# Query the index directly with the raw embedding, requesting the "text"
# metadata field for each hit.
response = client.search("momento", v, metadata_fields=["text"])
if isinstance(response, Search.Success):
    for hit in response.hits:
        print(hit.id)
        print(hit.score)
        # Flatten newlines and truncate so each hit prints as a short preview.
        print(hit.metadata["text"].replace("\n", " ")[:256])
        print()
else:
    # Report a non-success response instead of silently printing nothing.
    print(f"Search did not succeed: {response!r}")
    

https://docs.momentohq.com/cache/develop/sdks/dotnet, chunk=1
0.8727020621299744
Momento .NET SDKWelcome to the Momento .NET SDK documentation!The Momento .NET SDK is available via the nuget package Momento.Sdk.The source code can be found on GitHub: momentohq/client-sdk-dotnet.Requirements​dotnet runtime and command line tools; after 

https://docs.momentohq.com/topics/develop/sdks/dotnet, chunk=1
0.8709917664527893
Momento .NET SDKWelcome to the Momento .NET SDK documentation!The Momento .NET SDK is available via the nuget package Momento.Sdk.The source code can be found on GitHub: momentohq/client-sdk-dotnet.Requirements​dotnet runtime and command line tools; after 

https://docs.momentohq.com/cache/develop/sdks/dotnet/cheat-sheet, chunk=1
0.8559289574623108
Cheat sheet for .NET with Momento CacheIf you need to get going quickly with .NET and Momento Cache, this page contains the basic API calls you'll need. Check the .NET SDK examples for complete, working examples including build 

# Cleanup

In [1]:
# Uncomment if you want to clean up; otherwise leave the index for the chatbot.
# client.delete_index("momento")
