# Set your API Key

In [None]:
# %env OPENAI_API_KEY=<Fetch at: https://platform.openai.com/account/api-keys>

# Import dependencies

In [61]:
# built-ins
from pathlib import Path
from functools import partial
import os
from typing import Tuple, List
from dataclasses import dataclass

# custom lib
from rag_tools.repo.ops import ensure_repo_exists_locally, DocumentationExtractor

# third party libs
from llama_index import VectorStoreIndex, SimpleDirectoryReader, ServiceContext, Document, download_loader
from llama_index.retrievers import VectorIndexRetriever
from llama_index.response_synthesizers import get_response_synthesizer
from llama_index.llms import OpenAI
from metaflow import Flow
import pandas as pd
from IPython.display import display, Markdown

# Define utility functions

In [11]:
@dataclass
class Context:
    """Answer text from a query together with the ids of the nodes it was built from.

    Attributes:
        response: the synthesized answer text.
        source_node_ids: ids of the retrieved source nodes backing the answer.
    """

    # Declared as real dataclass fields so that the generated __init__, __repr__
    # and __eq__ include them. With a hand-written __init__ and no fields, every
    # instance compared equal to every other and printed as `Context()`.
    response: str
    source_node_ids: List[str]

    def get_link_df(self, meta_df, link_col = 'doc_id'):
        """Return the rows of `meta_df` whose `link_col` value is in `source_node_ids`."""
        return meta_df[meta_df[link_col].isin(self.source_node_ids)]

def qa_iter(
    question: str,
    index: VectorStoreIndex,
    k: int = 2,
    response_mode: str = 'tree_summarize'
) -> Context:
    """Query `index` with `question` and package the answer with its source ids.

    Args:
        question: natural-language query to run against the index.
        index: the index to retrieve from.
        k: number of most-similar nodes to retrieve.
        response_mode: forwarded to `get_response_synthesizer`.

    Returns:
        A `Context` holding the response text and the keys of the response
        metadata as `source_node_ids`.
    """
    response_synthesizer = get_response_synthesizer(response_mode=response_mode)
    # `as_query_engine` constructs its own retriever from its keyword arguments,
    # so `k` must be handed over as `similarity_top_k`. A separately built
    # retriever passed as `retriever=` gets replaced and its top-k is lost.
    query_engine = index.as_query_engine(
        similarity_top_k=k, response_synthesizer=response_synthesizer
    )
    query_res = query_engine.query(question)
    return Context(
        response=query_res.response, source_node_ids=list(query_res.metadata.keys())
    )

def dm(x):
    """Render the Markdown string `x` as rich notebook output."""
    display(Markdown(x))

def dmqa(q, a):
    """Render question `q` and answer `a` as two bold-labelled Markdown paragraphs."""
    dm(f"""
**Question:** {q}

**Answer:** {a}
"""
)

def nb_output_format(question, response, similar_chunk_df):
    """Display a question, its generated response, and a linked preview of each source.

    For every row of `similar_chunk_df` the `header` is shown as a link to
    `page_url`, followed by the first 100 characters of `contents`.
    """
    intro_blocks = (
        f"#### {question}",
        "**Retrieved Response**",
        response,
        "#### Sources",
    )
    for block in intro_blocks:
        dm(block)
    for _, row in similar_chunk_df.iterrows():
        dm(f"##### [{row.header}]({row.page_url})")
        dm(f"{row.contents[:100]}...")

def get_documents_from_content_section_df(df):
    """Build one `Document` per entry of `df.contents`, identified by its row position.

    Args:
        df: dataframe with a `contents` column of text chunks.

    Returns:
        (documents, ids): the documents and their ids, in the order of `df.contents`.
    """
    # Document ids are strings elsewhere in this notebook, so convert explicitly
    # rather than relying on implicit coercion of an integer id.
    documents = [
        Document(text=text, id_=str(position))
        for position, text in enumerate(df.contents)
    ]
    ids = [doc.id_ for doc in documents]
    return documents, ids

def generative_search_engine_iter(question, index, meta_df, meta_df_id_col='doc_id'):
    """Answer `question` from `index` and display the response with its source chunks.

    Args:
        question: natural-language query.
        index: index passed to `qa_iter`.
        meta_df: dataframe describing the indexed chunks; rows are matched to the
            response's source ids through `meta_df_id_col`.
        meta_df_id_col: name of the id column in `meta_df`.
    """
    context = qa_iter(question, index)
    # Reuse the lookup defined on Context instead of repeating the filter here.
    similar_chunk_df = context.get_link_df(meta_df, link_col=meta_df_id_col)
    nb_output_format(question, context.response, similar_chunk_df)

def get_documents_from_md_file_paths(fps):
    """Load every Markdown file in `fps` with the downloaded `MarkdownReader` loader.

    Returns one flat list holding the documents of all files, in input order.
    """
    reader = download_loader("MarkdownReader")()
    collected = []
    for md_path in fps:
        collected.extend(reader.load_data(file=Path(md_path)))
    return collected

# 🛑 Set variables based on your machine's setup

In [4]:
# Locations inside this repository; these do not need to be changed.
DATA_DIR='../data'
LLAMA_INDEX_TUTORIAL_DATA = os.path.join(DATA_DIR, 'llama-index-tutorial')

# Machine-specific: the directory into which you cloned https://github.com/Netflix/metaflow-docs.
# A leading `~` is expanded to the current user's home directory.
YOUR_LOCAL_METAFLOW_DOCS_REPO_PATH = os.path.expanduser("~/Dev/metaflow-docs")

# Which GitHub repos?

In [5]:
REPO_PARAMS = [
    {
        "deployment_url": "docs.metaflow.org",
        "repository_path": "https://github.com/Netflix/metaflow-docs",
        "repository_ref": "master",
        "base_search_path": "docs",
        "exclude_paths": ["docs/v"],
        "exclude_files": ["README.md", "README"],
    }
]

# 1. Llama Index Warmup

In [63]:
# This cell will trigger llama_index to look for OPENAI_API_KEY in environment variables,
# then default to downloading a llama 2 model binary locally.

# this thing is looking for a .txt file in the data dir.
documents = SimpleDirectoryReader(LLAMA_INDEX_TUTORIAL_DATA).load_data()

# Indexing is the first, and most crucial, stage in a RAG workflow.
# It is the process of converting a set of documents into a vector representation.
# This vector representation is later used to retrieve relevant documents for a given query.
service_context = ServiceContext.from_defaults(
    llm=OpenAI(model="gpt-3.5-turbo", temperature=0.0)
)
index = VectorStoreIndex.from_documents(documents, service_context=service_context) 

In [64]:
# Query engine "takes in a natural language query, and returns a response, along with reference context retrieved and passed to the LLM."
query_engine = index.as_query_engine()
response = query_engine.query("What did the author do growing up?")
dm(response.response)

# Taking too many minutes to run with Llama 2 on my macbook :( 
# Conservatively estimate OpenAI API is ~$1 per dozen end-to-end runs of this notebook.

The author worked on writing and programming outside of school before college. They wrote short stories and tried writing programs on an IBM 1401 computer. They also built a microcomputer kit and started programming on it, writing simple games and a word processor.

## Try a [Llama Hub tool for parsing `.md` files](https://llamahub.ai/l/file-markdown)

This could be used in conjunction with, or in place of, the custom markdown parser used in `./markdown_chunker.py`.

In [40]:
from pathlib import Path
from llama_index import download_loader
import os

In [41]:
MarkdownReader = download_loader("MarkdownReader")
loader = MarkdownReader()

# start with a single document
test_path = os.path.abspath("%s/test-data/ob/blog/metaflow-fast-data.md" % DATA_DIR)
documents = loader.load_data(file=Path(test_path))

In [65]:
index = VectorStoreIndex.from_documents(documents, service_context=service_context)

## Ask some questions over the index

In [66]:
question = (
    "What is the fastest way to load data onto AWS Batch instances using Metaflow?"
)

query_engine = index.as_query_engine()
response = query_engine.query(question).response

dmqa(question, response)


**Question:** What is the fastest way to load data onto AWS Batch instances using Metaflow?

**Answer:** The fastest way to load data onto AWS Batch instances using Metaflow would be to utilize the data loading capabilities provided by Metaflow itself. Metaflow offers built-in functionality for handling data loading and processing, allowing you to efficiently transfer and process data on AWS Batch instances. By leveraging Metaflow's data loading features, you can optimize the loading process and ensure efficient utilization of AWS Batch resources.


In [68]:
question = "How does Metaflow use `tmpfs`?"
response = query_engine.query(question).response
dmqa(question, response)


**Question:** How does Metaflow use `tmpfs`?

**Answer:** Metaflow uses `tmpfs` to store temporary data during the execution of workflows. `tmpfs` is a temporary file system that resides in memory, which means that the data stored in `tmpfs` is not persisted across system reboots. This makes it ideal for storing temporary data that is only needed during the execution of a workflow and can be discarded afterwards. By using `tmpfs`, Metaflow can achieve faster read and write operations compared to using disk-based storage.


# 2. Controlling hallucinations by curating an index
Here are a few questions we will explore in this section:

**What is the problem with the above workflow?**

In the previous section's index, we created all the vectors from chunks of a [post](https://outerbounds.com/blog/metaflow-fast-data/) specifically about the `tmpfs` feature. [Metaflow docs](https://docs.metaflow.org/) don't contain that much content about `tmpfs` yet.

If we use the Metaflow docs as the source objects to populate the index, and a question is asked to the model about `tmpfs`, how can we know if it is hallucinating it, or referencing an actual piece of content that Metaflow maintainers endorse?
> Bing Chat AI:
Give a bunch of links to the content that LLM response was conditioned on.

We will build a simple system like this in the next section. First, let's see the power of understanding the domain of our index, and then move to using it as a way to reference source material in the generated response.

## Index Metaflow docs

In [69]:
file_paths = DocumentationExtractor().filter_files(
    YOUR_LOCAL_METAFLOW_DOCS_REPO_PATH,
    base_search_path = "docs",
    exclude_paths = ["docs/v"],
    exclude_files = ["README.md", "README"],
    considered_extensions = [".md"],
)

In [70]:
file_paths = [abs_path for abs_path, _ in file_paths]

In [71]:
documents = get_documents_from_md_file_paths(file_paths)

In [72]:
N = 1
print(
    "Showing sample of {m} out of {n} {t} objects".format(
        m=N, n=len(documents), t=type(documents[0])
    )
)
documents[:N]

Showing sample of 1 out of 500 <class 'llama_index.schema.Document'> objects


[Document(id_='b185a5f6-76a2-4095-9523-adb59d3518c9', embedding=None, metadata={}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={}, hash='168707b49cebf5424ed48615defbba683a0d7932f8451d263d491040c37aa87f', text='\n\nWelcome to Metaflow\n\nMetaflow makes it easy to build and manage real-life data science and machine learning projects.\n\n\n\n', start_char_idx=None, end_char_idx=None, text_template='{metadata_str}\n\n{content}', metadata_template='{key}: {value}', metadata_seperator='\n')]

In [73]:
%%time
index = VectorStoreIndex.from_documents(documents)
# TODO: Measure times as this thing scales with N documents and larger sized documents.

CPU times: user 582 ms, sys: 187 ms, total: 770 ms
Wall time: 14.1 s


## Q&A iterations over the Metaflow docs index

In [74]:
question = "What is Metaflow?"
context = qa_iter(question, index)
dmqa(question, context.response)


**Question:** What is Metaflow?

**Answer:** Metaflow is a Python library that simplifies the development, deployment, and operation of data-intensive applications, specifically those related to data science and machine learning. It was initially created at Netflix to enhance the efficiency of data scientists working on a range of projects, from traditional statistics to cutting-edge deep learning. Metaflow is an open-source tool released under the Apache License, Version 2.0.


In [75]:
question = "How do I specify conda dependencies in my flow?"
context = qa_iter(question, index)
dmqa(question, context.response)


**Question:** How do I specify conda dependencies in my flow?

**Answer:** You can specify conda dependencies in your flow using the `@conda_base` and `@conda` decorators. The `@conda_base` decorator is used at the flow level to specify explicit library dependencies, python version, and whether to exclude all steps from executing within a conda environment. The `@conda` decorator is used at the step level to update the explicit library dependencies, python version, and conda environment exclusion as specified by the `@conda_base` decorator. By using these decorators, you can define the conda environment for each step in your flow. Additionally, you can add an explicit dependency on a specific module by using the `@conda` decorator in the corresponding step.


### Do Metaflow docs know about `tmpfs` though?

In [76]:
question = "How does Metaflow use `tmpfs`?"
dmqa(question, qa_iter(question, index).response)


**Question:** How does Metaflow use `tmpfs`?

**Answer:** Metaflow does not use `tmpfs` based on the given information.


### Adding specific knowledge to the index

As of August 2023, the Metaflow documentation does not have much content about `tmpfs`, so this makes sense.

How can we add the [Outerbounds blog post](https://outerbounds.com/blog/metaflow-fast-data/) that announced the `tmpfs` and Metaflow integration to the index, to give the model the context it needs to answer this question?

Let's create an index that combines the one we saw earlier for the `tmpfs` blog post with the one we just created for Metaflow docs.

In [78]:
fast_data_file_path = os.path.abspath('%s/test-data/ob/blog/metaflow-fast-data.md' % DATA_DIR)

# combining the document set
fast_data_doc = get_documents_from_md_file_paths([fast_data_file_path])
index_fast_data_post = VectorStoreIndex.from_documents(fast_data_doc)

In [79]:
# same question as above. now we can answer it with the new index.
question = "How does Metaflow use `tmpfs`?"
dmqa(question, qa_iter(question, index_fast_data_post).response)


**Question:** How does Metaflow use `tmpfs`?

**Answer:** Metaflow recently implemented support for memory-based `tmpfs` filesystem on AWS Batch and Kubernetes. This feature allows users to create an ephemeral filesystem backed by memory on the fly, without making any changes to the infrastructure. By enabling this feature using the `@batch(use_tmpfs=True)` decorator for AWS Batch workloads or `@kubernetes(use_tmpfs=True)` decorator for Kubernetes, the `metaflow.S3` client is automatically aware of the `tmpfs` volume and will use it to speed up downloads. This helps improve the performance of data downloads from S3 in Metaflow workflows.


In [81]:
# indexes are updateable/composable! 
for doc_chunk in fast_data_doc:
    index.insert(doc_chunk)

In [82]:
# same questions as above. now we can answer it with the new index.
question = "How does Metaflow use `tmpfs`?"
dmqa(question, qa_iter(question, index).response)

# and this one too.
question = "How do I specify conda dependencies in my flow?"
dmqa(question, qa_iter(question, index).response)


**Question:** How does Metaflow use `tmpfs`?

**Answer:** Metaflow uses `tmpfs` by implementing support for memory-based `tmpfs` filesystem on Batch and Kubernetes. This allows users to create an ephemeral filesystem backed by memory on the fly, without having to make any changes on the infrastructure side. When the `tmpfs` feature is enabled, the `metaflow.S3` client automatically uses it to speed up downloads. To enable this feature, users can add `@batch(use_tmpfs=True)` for AWS Batch workloads or `@kubernetes(use_tmpfs=True)` for Kubernetes in their Metaflow code.



**Question:** How do I specify conda dependencies in my flow?

**Answer:** You can specify conda dependencies in your flow using the `@conda_base` and `@conda` decorators. The `@conda_base` decorator is used at the flow level to specify explicit library dependencies, python version, and whether to exclude all steps from executing within a conda environment. The `@conda` decorator is used at the step level to update the explicit library dependencies, python version, and conda environment exclusion as specified by the `@conda_base` decorator. By using these decorators, you can define the conda environment for each step in your flow. Additionally, you can add an explicit dependency on a specific module by using the `@conda` decorator in the desired step.


# 3. Constructing an index based on sections of endorsed content we can link to
[Parse the Documents into Nodes](https://gpt-index.readthedocs.io/en/latest/end_to_end_tutorials/usage_pattern.html#parse-the-documents-into-nodes)

## Fetch all file paths of .md files

In [83]:
from rag_tools.filetypes.markdown import Mixin as mm
# this cell is like a condensed version of `/flows/markdown_chunker.py`
_mm = mm()
_mm.repo_params = REPO_PARAMS

# this is an unprocessed df, so you may want to clean it as /flows/data_table_processor.py does.
df = _mm.load_df_from_repo_list()

Looking for remote repository at https://github.com/Netflix/metaflow-docs
Looking for remote repository at https://github.com/huggingface/accelerate


In [84]:
documents, ids = get_documents_from_content_section_df(df)
# The column must be named `doc_id`: that is the default lookup column of
# `Context.get_link_df` and `generative_search_engine_iter`.
df['doc_id'] = ids

In [85]:
index = VectorStoreIndex.from_documents(documents)

In [100]:
import random
dm(random.choice(documents).text)

The above instructions work even if you use [`@conda`
 decorators](/scaling/dependencies#managing-dependencies-with-conda-decorator) in your
 code; you need, however, to ensure that the `conda` binary is available in your `PATH`.
 The easiest way to do this is to set the `PATH` environment variable to properly include
 the path to the `conda` binary if it is in a non-standard location. In VSCode, you can
 simply add this value in the env section of launch.json and in PyCharm, the UI allows
 you to set environment variables.

# 4. Use Cases

## Load a dataframe of text chunks and metadata from your latest workflow runs

In [6]:
# Find the latest Metaflow run that saved a processed df.
# `getattr` with a default skips runs whose `data` is None (unfinished runs)
# instead of failing on them with an AttributeError.
run = next(
    (
        _run
        for _run in Flow('DataTableProcessor')
        if getattr(_run.data, 'save_processed_df', False)
    ),
    None,
)
if run is None:
    # Fail here with a clear message rather than on `run.id` below.
    raise RuntimeError(
        "No DataTableProcessor run with save_processed_df set was found; "
        "run that flow before executing this cell."
    )

print(run.id)

1692847583518101


In [7]:
# Load the processed dataframe saved by the selected run and build one Document per text chunk.
df = run.data.processed_df
documents, ids = get_documents_from_content_section_df(df)
# Keep each document id next to its row so sources can be looked up by `doc_id` later.
df['doc_id'] = ids
# NOTE(review): the documents are passed straight to the constructor here, not to
# `VectorStoreIndex.from_documents` as in other cells — presumably so that node ids
# stay equal to the `doc_id` values used for source lookup; confirm before changing.
index = VectorStoreIndex(documents)

## Q&A

In [12]:
question = "What is Metaflow?"
generative_search_engine_iter(question, index, df)

['1', '1209']


#### What is Metaflow?

**Retrieved Response**

Metaflow is a Python library that simplifies the development, deployment, and operation of data-intensive applications, particularly those related to data science and machine learning. It was initially created at Netflix to enhance the productivity of data scientists working on various projects. Metaflow is available as an open-source framework under the Apache License, Version 2.0. It allows data scientists to focus on important aspects like feature engineering and model development while abstracting away tasks such as job organization, orchestration, scheduling, and interaction with data warehouses. Additionally, Metaflow enables the building of production-ready machine learning workflows using a simple Python API and facilitates seamless transitions between local prototyping environments and cloud-based deployments.

#### Sources

##### [What is Metaflow](https://docs.metaflow.org/introduction/what-is-metaflow#what-is-metaflow)

Metaflow is a human-friendly Python library that makes it straightforward to develop, deploy, and op...

##### [Reproducible ML pipelines with Metaflow](https://outerbounds.com/blog/machine-learning-pipelines-from-prototype-to-production#reproducible-ml-pipelines-with-metaflow)

To help data scientists focus on the parts of the stack they really care about, such as feature engi...

In [108]:
question = "How does Metaflow work with Kubernetes?"
generative_search_engine_iter(question, index, df)

#### How does Metaflow work with Kubernetes?

**Retrieved Response**

Metaflow now has the capability to run on top of any Kubernetes cluster. Users can run all or parts of any Metaflow flow on Kubernetes from their workstation by using the command "run --with kubernetes". Additionally, users can deploy their flow to Argo Workflows, a Kubernetes-native workflow scheduler, with a single command "argo-workflows create" to execute the flow asynchronously. For more information on setting up and operating Kubernetes for Metaflow, users can refer to the engineering resources provided by Metaflow.

#### Sources

##### [Add capability to launch Metaflow tasks on Kubernetes and schedule Metaflow flows with Argo Workflows.](https://docs.metaflow.org/internals/release-notes#add-capability-to-launch-metaflow-tasks-on-kubernetes-and-schedule-metaflow-flows-with-argo-workflows)

This release enables brand new capabilities for [Metaflow on top of
 Kubernetes](https://outerbounds...

##### [Using Kubernetes](https://docs.metaflow.org/scaling/remote-tasks/kubernetes#using-kubernetes)

Here are some useful tips and tricks related to running Metaflow on Kubernetes. See our
 engineering...

In [109]:
question = "What is a DAG?"
generative_search_engine_iter(question, index, df)

#### What is a DAG?

**Retrieved Response**

A DAG, or directed acyclic graph, is a graph that consists of nodes connected by directed edges, where the edges have a specific direction and there are no cycles in the graph. In the context of Metaflow, a DAG is inferred based on the transitions between step functions. The nodes in the DAG are the steps, which represent operations, and the edges represent the transitions between steps. The DAG structure is important for defining the flow of execution and dependencies between steps in Metaflow.

#### Sources

##### [Graph](https://docs.metaflow.org/internals/technical-overview#graph)

Metaflow infers a directed (typically acyclic) graph based on the transitions between
 step function...

##### [The Structure of Metaflow Code](https://docs.metaflow.org/metaflow/basics#the-structure-of-metaflow-code)

Metaflow follows [the dataflow
 paradigm](https://en.wikipedia.org/wiki/Dataflow_programming) which ...

## RAG for a generic sales pitch 

### A custom prompt template in pure Python
Many tools exist to make prompts easy to manage. [Langchain](https://www.langchain.com/) is an emerging leader in this space.

In [126]:
prompt_template = """
Write an introduction email to a potential technical user who works as a {role} at {company}.

Do not directly address the user's role or company anywhere in the email.

Write the email for a technical audience who doesn't want to read marketing copy.

Highlight Metaflow features related to their interests including {interests}.

Include a summary motivating the benefits of these features by summarizing this context about Metaflow:
{context_about_interests}

Make a subtle reference that Outerbounds platform can help them with {enterprise_platform_interest_hook}.

Include a summary motivating the benefits of Outerbounds platform by summarizing this context about Outerbounds platform:
{context_about_enterprise_platform_interest_hook}

Make the CTA to schedule a meeting to discuss how Outerbounds platform can help them.

Make the email as short as possible. 

Do not reference your own profession or any experiences. Do not talk about yourself.

Do not explicitly reference the company that the receiver works for. Only implicitly use this knowledge to demonstrate knowledge of the problems their organization may face.

Avoid speaking from the first person.

Avoid directly saying that you know about anyone's past experience or background. 

Avoid saying anything with similar sentiment to these statements:
    <br/> &ensp;- Author Metaflow flows using notebooks
    <br/> &ensp;- Refer to an interest listed above as something Metaflow works with
"""

### Prompt engineering 101
Inject some relevant context into our prompt template.

In [127]:
things_we_learned_about_a_prospect = dict(
    role="data scientist",
    company="Big Industries Co.",

    # comma-separated lists
    interests="mlops, deep learning, kubernetes",
    enterprise_platform_interest_hook="CI/CD, security",
)

### RAGify! 
Here we use the Q&A iteration you saw in previous sections, but instead of printing the results we are using them to "augment" the prompt.

In [128]:
# RAG step - fetch some context
from tqdm import tqdm


def split_comma_list(text):
    """Split a comma-separated string into trimmed, non-empty items."""
    return [item.strip() for item in text.split(",") if item.strip()]


def fetch_contexts_for_topics(topics, index):
    """Run one `qa_iter` query per topic and return the resulting contexts in order."""
    return [
        qa_iter(
            f"Describe how Metaflow and {topic} can be used together in ML workflows. Focus on Metaflow being used as a complimentary tool.",
            index,
        )
        for topic in tqdm(topics)
    ]


# Both prospect fields go through the same query; only the topic list differs.
interest_list = split_comma_list(things_we_learned_about_a_prospect["interests"])
context_about_interests = fetch_contexts_for_topics(interest_list, index)

enterprise_platform_interest_hook = split_comma_list(
    things_we_learned_about_a_prospect["enterprise_platform_interest_hook"]
)
context_about_enterprise_platform_interest_hook = fetch_contexts_for_topics(
    enterprise_platform_interest_hook, index
)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:27<00:00,  9.04s/it]
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:20<00:00, 10.34s/it]


In [129]:
# unpack context injection data to encourage the generative LLM to embed the links in its Markdown response.

def prepare_with_links(context_list: List["Context"], meta_df=None):
    """Format query results as prompt text that pairs each response with Markdown links.

    Args:
        context_list: objects exposing `response` (str) and `get_link_df(meta_df)`.
        meta_df: dataframe with `header` and `page_url` columns used to build the
            links. Defaults to the notebook-level `df` for backward compatibility.

    Returns:
        The instruction header followed by one `<br/> &ensp;- "response": links`
        entry per context.
    """
    import re
    import string

    if meta_df is None:
        # Fall back to the notebook-level dataframe, as the original code did.
        meta_df = df

    # Built once instead of on every header.
    punctuation_table = str.maketrans("", "", string.punctuation)

    def strip_numbers_and_punctuation(header):
        """Remove punctuation characters, then digits, from `header`."""
        return re.sub(r"\d", "", header.translate(punctuation_table))

    parts = ["""
The results will be given to you in a format like
<br/> &ensp;- "Text": [Comma-separated list of [Link Label](Link URL)]
Use the [Link Label](Link URL) syntax in the summary, and use the links in context of the paragraph.

These are the results to summarize:
"""]
    for _context in context_list:
        similar_chunk_df = _context.get_link_df(meta_df)
        links = [
            f"[{strip_numbers_and_punctuation(header)}]({url})"
            for header, url in zip(
                similar_chunk_df.header.values, similar_chunk_df.page_url.values
            )
        ]
        parts.append(
            "<br/>" + " &ensp;-" + ' "' + _context.response + '": ' + ", ".join(links)
        )
    return "".join(parts)

In [130]:
prompt = prompt_template.format(
    **things_we_learned_about_a_prospect,
    context_about_interests=prepare_with_links(context_about_interests),
    context_about_enterprise_platform_interest_hook=prepare_with_links(
        context_about_enterprise_platform_interest_hook
    )
)

In [131]:
# append negative sentiment escapes to prompt.
# treat these like UX escape hatches, where you can always just stuff extra things in, until you reach the model's context width.
# NOTE: `input()` blocks until answered, so this cell needs an interactive kernel.
negative_sentiment_context_lines = []
while True:
    if len(negative_sentiment_context_lines) == 0:
        user_input = input(
            "Enter a negative sentiment to avoid by append to the prompt with an 'avoid' instruction, or press enter to continue, or type r then enter to restart: "
        )
    else:
        print("Current list of negative sentiment statements:")
        for line in negative_sentiment_context_lines:
            print(line)
        # The loop ends on the first empty input, so the prompt says exactly that.
        user_input = input(
            "Enter another negative sentiment to avoid, or press enter to finish, or type r then enter to restart: "
        )
    # process user input
    if user_input == "":
        break
    if user_input == "r":
        negative_sentiment_context_lines = []
    else:
        negative_sentiment_context_lines.append(user_input)

for line in negative_sentiment_context_lines:
    prompt += "<br/>" + " &ensp;-" + " " + line

Enter a negative sentiment to avoid by append to the prompt with an 'avoid' instruction, or press enter to continue, or type r then enter to restart:  


### View the final prompt the model will see

In [132]:
dm("##### The first five hundred chars") 
dm(prompt[:500] + '...')

##### The first five hundred chars


Write an introduction email to a potential technical user who works as a data scientist at Big Industries Co..

Do not directly address the user's role or company anywhere in the email.

Write the email for a technical audience who doesn't want to read marketing copy.

Highlight Metaflow features related to their interests including mlops, deep learning, kubernetes.

Include a summary motivating the benefits of these features by summarizing this context about Metaflow:

The results will be give...

### See your RAG app in action! 

In [133]:
query_engine = index.as_query_engine()
query_res = query_engine.query(prompt)
response = query_res.response
dm("##### Generated email")
dm(response)

##### Generated email

Subject: Enhancing ML Workflows with Metaflow and Outerbounds Platform

Dear [Recipient],

I hope this email finds you well. I wanted to reach out to introduce you to Metaflow, a powerful tool that can enhance your data science workflows and help you achieve scalability, reproducibility, and production-readiness.

Metaflow can be used as a complimentary tool in ML workflows, particularly when combined with MLOps practices. By integrating Metaflow into your pipeline, you can leverage its capabilities to build and deploy ML models more efficiently. It takes care of low-level infrastructure such as data, compute, orchestration, and versioning, allowing you to focus on the fun parts of building applications and models. [Metaflow takes care of the plumbing so you can focus on the fun parts](https://docs.metaflow.org/introduction/why-metaflow#10-metaflow-takes-care-of-the-plumbing-so-you-can-focus-on-the-fun-parts).

For deep learning projects, Metaflow provides a robust and user-friendly foundation. It covers the full stack of DS/ML infrastructure, allowing you to focus on iterating on ideas quickly and deploying them confidently. [Metaflow covers the full stack of DS/ML infrastructure](https://docs.metaflow.org/introduction/why-metaflow#9-metaflow-covers-the-full-stack-of-ds-ml-infrastructure).

If you're working with Kubernetes, Metaflow seamlessly integrates with it to leverage scalable infrastructure for running ML/DS applications. This makes it suitable for both small and large organizations. [Metaflow relies on systems that engineers know and trust](https://docs.metaflow.org/introduction/why-metaflow#11-metaflow-relies-on-systems-that-engineers-know-and-trust).

In addition to Metaflow, I wanted to mention Outerbounds Platform, which can further enhance your ML workflows. It offers CI/CD capabilities, ensuring that changes to ML models and data pipelines are thoroughly tested and deployed in a reliable and efficient manner. Outerbounds Platform also prioritizes security, respecting your company's security policies and providing a secure environment for executing data science projects.

I would love to schedule a meeting to discuss how Metaflow and Outerbounds Platform can specifically benefit your organization and address any challenges you may be facing. Please let me know a time that works for you, and I will be happy to set up a call.

Looking forward to hearing from you.

Best regards,
[Your Name]

## Chatbot

In [33]:
repos = [
    {
        "deployment_url": "docs.metaflow.org",
        # Use the machine-specific path from the setup section rather than a
        # second hardcoded copy, so a non-default clone location is picked up.
        "repository_path": YOUR_LOCAL_METAFLOW_DOCS_REPO_PATH,
        "repository_ref": "master",
        "base_search_path": "docs",
        "exclude_paths": ["docs/v"],
        "exclude_files": ["README.md", "README"],
    }
]

In [34]:
def local_repos_to_docs(repos):
    """Collect documents for the `.md` files of every repo description in `repos`.

    Each entry supplies `repository_path`, `base_search_path`, `exclude_paths`
    and `exclude_files`, which are forwarded to
    `DocumentationExtractor().filter_files`.
    """
    all_docs = []
    for repo in repos:
        matches = DocumentationExtractor().filter_files(
            repo["repository_path"],
            base_search_path=repo["base_search_path"],
            exclude_paths=repo["exclude_paths"],
            exclude_files=repo["exclude_files"],
            considered_extensions=[".md"]
        )
        # Each match is a pair; only its first element (the path) is needed.
        md_paths = [path for path, _unused in matches]
        all_docs.extend(get_documents_from_md_file_paths(md_paths))
    return all_docs

In [35]:
documents = local_repos_to_docs(repos)
documents[:2]

[Document(id_='818c7d59-7d59-4c9a-8e26-be8fa66929f6', embedding=None, metadata={}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={}, hash='168707b49cebf5424ed48615defbba683a0d7932f8451d263d491040c37aa87f', text='\n\nWelcome to Metaflow\n\nMetaflow makes it easy to build and manage real-life data science and machine learning projects.\n\n\n\n', start_char_idx=None, end_char_idx=None, text_template='{metadata_str}\n\n{content}', metadata_template='{key}: {value}', metadata_seperator='\n'),
 Document(id_='baebefe9-9cf4-4c6b-8831-3fec827af036', embedding=None, metadata={}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={}, hash='0c733db8a7e3a129b3c47977e53aa8d0ee5e0942e768f40d822a65e1da4f9aea', text='\n\nMotivation\n\n- Why Metaflow\n- What is Metaflow\n- Metaflow Resources\n\n', start_char_idx=None, end_char_idx=None, text_template='{metadata_str}\n\n{content}', metadata_template='{key}: {value}', metadata_seperator='\n')]

In [36]:
temp = 0.0
model = "gpt-3.5-turbo"
chat_mode = "react"

In [37]:
service_context = ServiceContext.from_defaults(llm=OpenAI(model=model, temperature=temp))
index = VectorStoreIndex.from_documents(documents, service_context=service_context)

In [38]:
chat_engine = index.as_chat_engine(service_context=service_context, chat_mode=chat_mode, verbose=True)

In [48]:
response = chat_engine.chat("How does Metaflow help AI developers?")

[38;5;200m[1;3mThought: I need to use a tool to help me answer the question.
Action: query_engine_tool
Action Input: {'input': 'How does Metaflow help AI developers?'}
[0m[36;1m[1;3mObservation: Metaflow helps AI developers by providing a robust and user-friendly foundation for data-intensive applications. It takes care of the low-level infrastructure such as data, compute, orchestration, and versioning, allowing developers to focus on building their own applications, models, and policies on top of it. This means that AI developers who have a basic understanding of Python can leverage Metaflow to streamline their development process and focus on the more enjoyable aspects of their work.
[0m

In [34]:
dm(response.response)

Metaflow helps AI developers by providing a robust and user-friendly foundation for data-intensive applications. It takes care of the low-level infrastructure such as data, compute, orchestration, and versioning, allowing developers to focus on building their own applications, models, and policies on top of it. This means that AI developers who have a basic understanding of Python can leverage Metaflow to streamline their development process and focus on the more enjoyable aspects of their work.

In [36]:
response = chat_engine.chat("What are the enjoyable aspects of their work?")
dm(response.response)

[38;5;200m[1;3mThought: I have already answered this question. I can provide a summary of my previous response to help answer the question again.
Action: query_engine_tool
Action Input: {'input': "The enjoyable aspects of AI developers' work"}
[0m[36;1m[1;3mObservation: The enjoyable aspects of AI developers' work include the ability to create value with machine learning, similar to traditional software engineering. The development and deployment of ML systems are expected to mature and improve over time, just as software development has done in the past 20 years. Efforts from various entities, such as governments, open source communities, and for-profit companies, are being made to articulate risks, develop best practices, and provide tooling to support ML development. This indicates a positive prognosis for the future of AI development.
[0m[38;5;200m[1;3mResponse: The enjoyable aspects of AI developers' work include the ability to create value with machine learning, the oppor

The enjoyable aspects of AI developers' work include the ability to create value with machine learning, the opportunity for continuous learning and staying updated with the latest advancements, the potential for impactful applications, collaboration with interdisciplinary teams, the freedom to experiment and innovate, and the automation of repetitive tasks. Additionally, efforts from various entities are being made to support and improve the development and deployment of ML systems, indicating a positive prognosis for the future of AI development.

In [69]:
# chat_engine.chat_repl()