In [1]:
import os
from getpass import getpass

import openai

# Prefer a key already present in the environment; only prompt when the
# variable is missing or empty.
openai_api_key = os.getenv("OPENAI_API_KEY")
if not openai_api_key:
    openai_api_key = getpass("🔑 Enter your OpenAI API key: ")

# Publish the key on the openai module and back into the environment.
openai.api_key = openai_api_key
os.environ["OPENAI_API_KEY"] = openai_api_key

In [2]:
import pandas as pd
import phoenix as px
from phoenix.trace import TraceDataset, using_project


# Display the complete contents of dataframe cells.
# pd.set_option("display.max_colwidth", None)

In [3]:
# Load a previously saved trace dataset from ./data and start the Phoenix app
# with it, all inside the "ragas-testset" project context.
with using_project("ragas-testset"):
    trace_dataset = TraceDataset.load(
        "6d71d4a4-55cf-4222-a084-b988481f296b",
        directory="./data",
    )
    session = px.launch_app(trace=trace_dataset)

🌍 To view the Phoenix app in your browser, visit http://localhost:6006/
📺 To view the Phoenix app in a notebook, run `px.active_session().view()`
📖 For more information on how to use Phoenix, check out https://docs.arize.com/phoenix


In [4]:
from llama_index.core import SimpleDirectoryReader

# Directory holding the source files to index.
dir_path = "./data/prompt-engineering-papers"

# Cap the number of files read at two, then load them.
reader = SimpleDirectoryReader(
    input_dir=dir_path,
    num_files_limit=2,
)
documents = reader.load_data()

In [5]:
# Count the entries returned by reader.load_data().
len(documents)

41

In [6]:
# Inspect the second loaded entry (index 1) to check its metadata and text.
documents[1]

Document(id_='36a6070d-47f1-4643-8ce1-6065337fefe3', embedding=None, metadata={'page_label': '2', 'file_name': '1605.08386v1.Heat_bath_random_walks_with_Markov_bases.pdf', 'file_path': '/home/peter-legion-wsl2/peter-projects/regen-ai/nbs/data/prompt-engineering-papers/1605.08386v1.Heat_bath_random_walks_with_Markov_bases.pdf', 'file_type': 'application/pdf', 'file_size': 289178, 'creation_date': '2024-04-13', 'last_modified_date': '2024-04-13'}, excluded_embed_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], excluded_llm_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], relationships={}, text='2 CAPRICE STANLEY AND TOBIAS WINDISCH\nreached by a random walk that uses moves from M, whereas for the continuous version, a\nrandom sampling from the unit sphere suﬃces. However, in man y situations where a Markov\nbasis is known, the heat-bath random walk is evid

Set the tracing handlers for `LlamaIndex` and `LangChain` (Ragas uses LangChain).

In [7]:

from llama_index.core import set_global_handler
from phoenix.trace.langchain import LangChainInstrumentor


# Register Phoenix as the global handler for llama-index.
set_global_handler("arize_phoenix")

# Instrument LangChain as well, since Ragas uses it.
langchain_instrumentor = LangChainInstrumentor()
langchain_instrumentor.instrument()

In [8]:
from llama_index.core import VectorStoreIndex
from llama_index.embeddings.openai import OpenAIEmbedding
from phoenix.trace import using_project


def build_query_engine(documents, similarity_top_k=2):
    """Index ``documents`` and return a query engine over the resulting index.

    Args:
        documents: Documents to embed and index, e.g. the list returned by
            ``SimpleDirectoryReader.load_data()``.
        similarity_top_k: Value forwarded to ``as_query_engine`` as
            ``similarity_top_k``. Defaults to 2, the value that used to be
            hard-coded here, so existing calls behave exactly as before.

    Returns:
        The query engine created by ``VectorStoreIndex.as_query_engine``.
    """
    # The index is built with an OpenAIEmbedding instance using its defaults.
    vector_index = VectorStoreIndex.from_documents(
        documents,
        embed_model=OpenAIEmbedding(),
    )
    return vector_index.as_query_engine(similarity_top_k=similarity_top_k)


# Build the index inside the "indexing" project context.
with using_project("indexing"):
    # Assigning a project name makes the instrumentation send the spans
    # produced while indexing (including the embeddings) to the "indexing"
    # project.
    query_engine = build_query_engine(documents)

In [15]:
# Fetch the trace dataset recorded under the "indexing" project and save it
# under ./data. Note that `my_traces` receives the return value of save(),
# not the dataset itself.
indexing_trace_dataset = px.Client().get_trace_dataset(project_name="indexing")
my_traces = indexing_trace_dataset.save(directory="./data")

💾 Trace dataset saved to under ID: 908ea008-e97f-449b-a098-a26b1b9e1ae4
📂 Trace dataset path: data/trace_dataset-908ea008-e97f-449b-a098-a26b1b9e1ae4.parquet


In [16]:
# Hex string of the saved dataset's ID (the value save() returned).
my_traces.hex

'908ea008e97f449ba098a26b1b9e1ae4'

In [25]:
# Display the spans recorded under the "indexing" project.
px.Client().query_spans(project_name="indexing")

Unnamed: 0_level_0,name,span_kind,parent_id,start_time,end_time,status_code,status_message,events,conversation,context.trace_id,context.span_id,attributes.embedding.model_name,attributes.embedding.embeddings
context.span_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
ddcd1d3a91f7a983,embedding,EMBEDDING,,2024-04-14T09:43:50.755733+00:00,2024-04-14T09:43:51.778108+00:00,OK,,[],,7b344eeee875a7cfa20f0701cfe9e505,ddcd1d3a91f7a983,text-embedding-ada-002,"[{'embedding.vector': [0.0015107227955013514, ..."
2e0130e2a56d26cc,chunking,CHAIN,4a7ea97d9d6d3b98,2024-04-14T09:43:50.330474+00:00,2024-04-14T09:43:50.330617+00:00,OK,,[],,c0214303c81fdd8c748189b673d6740d,2e0130e2a56d26cc,,
6f40282ba1ba74d1,chunking,CHAIN,4a7ea97d9d6d3b98,2024-04-14T09:43:50.325988+00:00,2024-04-14T09:43:50.328346+00:00,OK,,[],,c0214303c81fdd8c748189b673d6740d,6f40282ba1ba74d1,,
fabd0b3ccf3042b0,chunking,CHAIN,4a7ea97d9d6d3b98,2024-04-14T09:43:50.320995+00:00,2024-04-14T09:43:50.323314+00:00,OK,,[],,c0214303c81fdd8c748189b673d6740d,fabd0b3ccf3042b0,,
05ebc275a295d3a3,chunking,CHAIN,4a7ea97d9d6d3b98,2024-04-14T09:43:50.316329+00:00,2024-04-14T09:43:50.318586+00:00,OK,,[],,c0214303c81fdd8c748189b673d6740d,05ebc275a295d3a3,,
c38b51bd129e7192,chunking,CHAIN,4a7ea97d9d6d3b98,2024-04-14T09:43:50.312030+00:00,2024-04-14T09:43:50.314224+00:00,OK,,[],,c0214303c81fdd8c748189b673d6740d,c38b51bd129e7192,,
6b0feb0d2e541a18,chunking,CHAIN,4a7ea97d9d6d3b98,2024-04-14T09:43:50.306989+00:00,2024-04-14T09:43:50.309845+00:00,OK,,[],,c0214303c81fdd8c748189b673d6740d,6b0feb0d2e541a18,,
f4be4b1a1a01f349,chunking,CHAIN,4a7ea97d9d6d3b98,2024-04-14T09:43:50.302087+00:00,2024-04-14T09:43:50.304394+00:00,OK,,[],,c0214303c81fdd8c748189b673d6740d,f4be4b1a1a01f349,,
77ad8ef9bbc042ff,chunking,CHAIN,4a7ea97d9d6d3b98,2024-04-14T09:43:50.296767+00:00,2024-04-14T09:43:50.299264+00:00,OK,,[],,c0214303c81fdd8c748189b673d6740d,77ad8ef9bbc042ff,,
f9a8cbc76bef5b2e,chunking,CHAIN,4a7ea97d9d6d3b98,2024-04-14T09:43:50.291847+00:00,2024-04-14T09:43:50.294170+00:00,OK,,[],,c0214303c81fdd8c748189b673d6740d,f9a8cbc76bef5b2e,,


Extract the embeddings into a DataFrame for use in visualization.

In [None]:
from phoenix.trace.dsl.helpers import SpanQuery

# Create the client once and reuse it for the query below; the original
# assigned `client` but then built a second px.Client() for the call.
client = px.Client()

# Explode the "embedding.embeddings" attribute of the spans, exposing each
# entry's "embedding.text" and "embedding.vector" as the `text` and `vector`
# columns of the result.
corpus_query = SpanQuery().explode(
    "embedding.embeddings",
    text="embedding.text",
    vector="embedding.vector",
)
corpus_df = client.query_spans(corpus_query, project_name="indexing")
corpus_df.head()