# Setup

In [1]:
import itertools

import requests
from bs4 import BeautifulSoup

from langchain.docstore.document import Document
from langchain.document_loaders import WebBaseLoader, SitemapLoader
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import TokenTextSplitter
from langchain.vectorstores import MomentoVectorIndex

from momento import PreviewVectorIndexClient
from momento.auth import CredentialProvider
from momento.config import VectorIndexConfigurations
from momento.responses.vector_index import Search

# Fixes bug with asyncio and jupyter notebooks.
# Required for SitemapLoader to work correctly.
import nest_asyncio

nest_asyncio.apply()


In [2]:
%load_ext dotenv
%dotenv


# Get website content

## Part 1: Find relevant URLs to scrape

We are going to read web page content. Let's start out with a helper to strip out irrelevant details:

In [3]:
def parse_content_fn(content: BeautifulSoup) -> str:
    """Return the text of ``content`` after removing boilerplate elements.

    Elements such as the title, navigation, header/footer, buttons, asides,
    newsletter/social widgets and the element with id ``faqs`` are looked up
    and then decomposed from ``content`` in place. The remaining text is
    returned with leading and trailing whitespace stripped.
    """
    # Each entry is a (positional args, keyword args) pair for ``find_all``.
    selectors = [
        (("title",), {}),
        (("nav",), {}),
        (("div",), {"role": "region"}),
        (("div",), {"class_": "page-wrapper"}),
        (("div",), {"class_": "blog-post_newsletter"}),
        (("div",), {"class_": "blog-post-social-wrapper"}),
        (("section",), {"class_": "section-more-blog-posts"}),
        (("button",), {}),
        (("aside",), {}),
        ((), {"id": "faqs"}),
        (("header",), {}),
        (("footer",), {}),
    ]

    # Gather every match up front, before anything is removed from the tree.
    doomed = []
    for args, kwargs in selectors:
        doomed.extend(content.find_all(*args, **kwargs))

    for node in doomed:
        node.decompose()

    remaining_text = content.get_text()
    return str(remaining_text).strip()


We have a lot of rich information in the technical documentation site. Since it has a sitemap, we can scrape its pages using LangChain's SitemapLoader:

In [4]:
# Configure a loader over the docs site's sitemap; parse_content_fn is supplied
# as the parsing function so boilerplate elements are stripped from each page.
tech_docs_loader = SitemapLoader(
    web_path="https://docs.momentohq.com/sitemap.xml",
    parsing_function=parse_content_fn
)


We also have a trove of documents from our blog posts. We'll scrape the links directly from the blog index page, then use LangChain's WebBaseLoader to scrape the content:

In [5]:
def read_blog_urls(base_url: str = "https://www.gomomento.com", timeout: float = 30.0) -> list[str]:
    """Collect the absolute URLs of the blog posts linked from the blog index page.

    Args:
        base_url: Site root; the index page is fetched from ``{base_url}/blog``.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        De-duplicated absolute URLs, in the order they first appear on the page.
        Only links whose ``href`` starts with ``/blog`` (excluding the index
        link ``/blog`` itself) are kept.

    Raises:
        requests.HTTPError: If the index page responds with an error status.
    """
    # Without a timeout the request could hang indefinitely.
    response = requests.get(f"{base_url}/blog", timeout=timeout)
    # Fail loudly rather than parsing an error page into an empty URL list.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")

    # href=True restricts the search to anchors that carry an href attribute.
    hrefs = [a["href"] for a in soup.find_all("a", href=True)]
    urls = [
        f"{base_url}{href}" for href in hrefs
        if href.startswith("/blog") and href != "/blog"
    ]

    # De-duplicate while keeping first-seen order (dicts preserve insertion order).
    return list(dict.fromkeys(urls))


In [6]:
blog_urls = read_blog_urls()
len(blog_urls), blog_urls[:5]


(100,
 ['https://www.gomomento.com/blog/episode-4-million-dollar-lines-of-code-engineering-your-cloud-cost-optimization-with-erik-peterson',
  'https://www.gomomento.com/blog/chatting-on-the-edge-integrating-momento-with-netlify-and-vercel',
  'https://www.gomomento.com/blog/unity-chat-demo-quickly-build-a-multiplayer-chat-with-serverless-pub-sub',
  'https://www.gomomento.com/blog/introducing-momento-leaderboards-the-serverless-leaderboard-service',
  'https://www.gomomento.com/blog/momento-cache-is-the-cloud-native-answer-to-elasticache-redis'])

In [7]:
blog_loader = WebBaseLoader(blog_urls)
# NOTE(review): the blogs[0].page_content sample shown later in this notebook
# still contains the page title and navigation text that parse_content_fn is
# meant to remove, so this attribute assignment may not be picked up by
# WebBaseLoader. Confirm against the loader's API.
blog_loader.parser = parse_content_fn


## Part 2: Scrape relevant URLs

Note: the next calls take 1 to 2 minutes to run.

In [8]:
tech_docs = tech_docs_loader.load()


Fetching pages: 100%|##########| 110/110 [00:10<00:00, 10.05it/s]


In [9]:
tech_docs[0]


Document(page_content='Momento CacheAccelerate your app, reduce costs, and free your developers.Momento TopicsEnable real-time communication between different parts of a distributed system.Momento Vector IndexA serverless vector index for your AI-enabled applications.#1213Momento LeaderboardsA serverless leaderboard service', metadata={'source': 'https://docs.momentohq.com/', 'loc': 'https://docs.momentohq.com/', 'changefreq': 'weekly', 'priority': '0.5'})

In [10]:
blogs = blog_loader.load()


In [11]:
blogs[0].page_content


'Episode #4 - Million dollar lines of code: Engineering your cloud cost optimization with Erik Peterson — Momento\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nNameEmail AddressThank you! Your submission has been received!Oops! Something went wrong while submitting the form.Get started with Momento Vector Index today!Get started with Momento Vector Index today!\n\n\nSolutions\n\nBy Use CaseBy CategoryBy Team TypeServicesMomento CacheMomento TopicsMomento Vector IndexServicesMomento CacheMomento TopicsMomento Vector IndexBuildUse CasesChatFront-End DevelopmentServerless AppsIndustriesAI/MLGamingIntegrationsDynamoDBMongoDBRedisBuildUse CasesChatFront-end DevelopmentServerless AppsIndustriesAI/MLGamingIntegrationsDynamoDBMongoDBRedisPricingResourcesCase StudiesFAQComplianceResourcesCase StudiesFAQMoCon RecapComplianceBlogDocsCompanyAbout UsJoin our NewsletterCompanyAbout UsJoin our NewsletterDocsLog InContact UsSolutions\n\nBy Use CaseBy CategoryBy Team TypeConsoleCon

In [12]:
docs = tech_docs + blogs


In [13]:
len(tech_docs), len(blogs), len(docs)


(110, 100, 210)

# Split data into smaller pieces in prep for Q&A

In [15]:
# Split every document into overlapping chunks (size 128, overlap 32).
# model_name is the same model that the embeddings are created with further
# down, presumably so chunk sizes are counted with that model's tokenizer.
text_splitter = TokenTextSplitter(chunk_size=128, chunk_overlap=32, model_name="text-embedding-ada-002")
split_docs = text_splitter.split_documents(docs)
len(split_docs)


3464

# Load the data into MVI

Let's come up with meaningful IDs to apply to each chunk:

In [16]:
def gen_ids(docs: list[Document]) -> list[str]:
    """Build a readable, unique ID for each chunk: ``"<source>, chunk=<n>"``.

    ``<source>`` is ``doc.metadata["source"]`` and ``<n>`` is a 1-based counter
    of how many chunks with that source have been seen so far. The counter is
    kept per source, so IDs stay unique even if chunks from the same source are
    not adjacent in ``docs``.

    Args:
        docs: Chunks to label; each must have a ``"source"`` metadata entry.

    Returns:
        One ID per document, in the same order as ``docs``.

    Raises:
        KeyError: If a document has no ``"source"`` in its metadata.
    """
    chunks_seen: dict[str, int] = {}
    ids = []
    for doc in docs:
        source = doc.metadata["source"]
        chunk_number = chunks_seen.get(source, 0) + 1
        chunks_seen[source] = chunk_number
        ids.append(f"{source}, chunk={chunk_number}")
    return ids


Index the data in MVI using OpenAI text embeddings

In [17]:
# Create the Momento Vector Index client. The API key is read from the
# MOMENTO_API_KEY environment variable (presumably populated by the %dotenv
# cell near the top of the notebook).
client = PreviewVectorIndexClient(
    configuration=VectorIndexConfigurations.Default.latest(),
    credential_provider=CredentialProvider.from_environment_variable("MOMENTO_API_KEY"),
)


In [25]:
client.list_indexes()


ListIndexes.Success(indexes=[])

In [21]:
embeddings = OpenAIEmbeddings(model="text-embedding-ada-002")  # type: ignore

# Index the split chunks directly. `docs` is deliberately not rebound to the
# split list, so re-running the splitting cell keeps operating on the original
# unsplit documents.
ids = gen_ids(split_docs)

vector_store = MomentoVectorIndex.from_documents(
    split_docs,
    embedding=embeddings,
    client=client,
    index_name="momento",
    ids=ids
)


# Query using the langchain integration

In [22]:
vector_store.similarity_search_with_score("How much does Momento cost?")


[(Document(page_content=" Total Cost of Ownership (TCO) matters! Complex pricing models can make it difficult to project costs. We carefully crafted our pricing model with our primary design tenant being simplicity.\u200dHow is Momento priced? The Momento pricing model is simple and straightforward. A single pricing dimension of $0.50/GB inbound and outbound. Simply the amount of data that moves in and out of Momento Cache. And there's no separate storage charge to worry about.\u200dWe have a free tier to make it fast for developers to try Momento. Customers get their first 5GB of inbound and outbound for free every month without", metadata={'title': 'Simple. The way cloud pricing should be. — Momento', 'source': 'https://www.gomomento.com/blog/simple-the-way-cloud-pricing-should-be', 'language': 'en', 'description': 'You don’t need to be lost in a pricing maze. Try Momento Cache for free today.'}),
  0.8547287583351135),
 (Document(page_content="Pricing and free tier for Momento Topic

# Query using the MVI client

In [23]:
# NOTE(review): this reads a private attribute of the vector store. `client`
# was already created earlier and passed to from_documents, so this rebinding
# looks redundant. Confirm, and consider removing it.
client = vector_store._client


In [27]:
client.list_indexes()


ListIndexes.Success(indexes=[IndexInfo(name='momento', num_dimensions=1536, similarity_metric=<SimilarityMetric.COSINE_SIMILARITY: 'COSINE_SIMILARITY'>)])

In [28]:
# Embed the question with the same OpenAIEmbeddings instance that was passed
# to from_documents when the index was populated.
v = embeddings.embed_query("Can I use Momento with C#?")


In [29]:
# Query the index directly, asking for the "text" metadata field of each hit.
response = client.search("momento", v, metadata_fields=["text"])
if not isinstance(response, Search.Success):
    # Surface failures instead of silently printing nothing.
    raise RuntimeError(f"Vector index search failed: {response!r}")

for hit in response.hits:
    print(hit.id)
    print(hit.score)
    # Flatten newlines and show only the first 256 characters of the chunk.
    print(hit.metadata["text"].replace("\n", " ")[:256])
    print()
    

https://docs.momentohq.com/cache/develop/sdks/dotnet, chunk=1
0.8712294101715088
Momento .NET SDKWelcome to the Momento .NET SDK documentation!The Momento .NET SDK is available via the nuget package Momento.Sdk.The source code can be found on GitHub: momentohq/client-sdk-dotnet.Requirements​dotnet runtime and command line tools; after 

https://docs.momentohq.com/topics/develop/sdks/dotnet, chunk=1
0.8710330724716187
Momento .NET SDKWelcome to the Momento .NET SDK documentation!The Momento .NET SDK is available via the nuget package Momento.Sdk.The source code can be found on GitHub: momentohq/client-sdk-dotnet.Requirements​dotnet runtime and command line tools; after 

https://www.gomomento.com/blog/major-release-version-1-0-of-momento-serverless-cache-net-client, chunk=1
0.8558578491210938
Major release: v1.0 of the Momento .NET client — Momento                                  NameEmail AddressThank you! Your submission has been received!Oops! Something went wrong while submitting t

# Cleanup

In [1]:
# Uncomment if you want to clean up; otherwise leave the index for the chatbot.
# client.delete_index("momento")
