Using this [Notebook](https://github.com/Arize-ai/phoenix/blob/main/tutorials/llm_ops_overview.ipynb) as a jumping-off point.

In [None]:
import nest_asyncio
# Applied once up front; the evaluation cell later in the notebook applies it again
# before issuing its OpenAI calls.
# NOTE(review): presumably required so async code can run inside the notebook's
# already-running event loop — confirm.
nest_asyncio.apply()

In [None]:

import os
from getpass import getpass

# Prefer an OpenAI API key that is already set in the environment; only prompt
# (without echoing the input) when none is found.
if not (openai_api_key := os.getenv("OPENAI_API_KEY")):
    openai_api_key = getpass("🔑 Enter your OpenAI API key: ")

# Export the key so code that reads OPENAI_API_KEY from the environment can find it.
os.environ["OPENAI_API_KEY"] = openai_api_key

In [None]:

import phoenix as px
from llama_index.core import set_global_handler

# Setup phoenix tracing: start the Phoenix app first, then register "arize_phoenix"
# as the global LlamaIndex handler.
# NOTE(review): presumably this is what makes the later query-engine calls show up
# as traces in Phoenix — confirm.
px.launch_app()
set_global_handler("arize_phoenix")

In [None]:
import os
from getpass import getpass

import phoenix as px
from gcsfs import GCSFileSystem
from llama_index.core import (
    Settings,
    StorageContext,
    load_index_from_storage,
)
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.llms.openai import OpenAI

# Choose the LLM and the embedding model used by LlamaIndex for the rest of the notebook.
Settings.embed_model = OpenAIEmbedding(model="text-embedding-3-small")
Settings.llm = OpenAI(model="gpt-3.5-turbo-0125")

# Location of the persisted index inside the public GCS project.
index_path = "arize-phoenix-assets/datasets/unstructured/llm/llama-index/arize-docs/index/"
file_system = GCSFileSystem(project="public-assets-275721")

# Open the persisted storage, load the index from it and expose a query engine.
storage_context = StorageContext.from_defaults(persist_dir=index_path, fs=file_system)
index = load_index_from_storage(storage_context)
query_engine = index.as_query_engine()

In [None]:
from tqdm import tqdm

# Sample questions to send through the query engine.
queries = [
    "How can I query for a monitor's status using GraphQL?",
    "How do I delete a model?",
    "How much does an enterprise license of Arize cost?",
    "How do I log a prediction using the python SDK?",
]

# Run every question through the engine and echo the question with its answer;
# tqdm reports progress over the list.
for query in tqdm(queries):
    response = query_engine.query(query)
    print(f"Query: {query}", f"Response: {response}", sep="\n")
     

In [None]:
# import os
# from getpass import getpass

# import phoenix as px
# from llama_index.core import (
#     Settings,
#     StorageContext,
#     load_index_from_storage,
#     VectorStoreIndex
# )
# from llama_index.embeddings.openai import OpenAIEmbedding
# from llama_index.llms.openai import OpenAI
# from llama_index.core.readers import SimpleDirectoryReader
# from llama_index.core.node_parser import SimpleNodeParser


# Settings.llm = OpenAI(model="gpt-3.5-turbo-0125")
# Settings.embed_model = OpenAIEmbedding(model="text-embedding-3-small")

In [None]:
# import tempfile
# from urllib.request import urlretrieve

# with tempfile.NamedTemporaryFile() as tf:
#     urlretrieve(
#         "https://raw.githubusercontent.com/Arize-ai/phoenix-assets/main/data/paul_graham/paul_graham_essay.txt",
#         tf.name,
#     )
#     documents = SimpleDirectoryReader(input_files=[tf.name]).load_data()

# node_parser = SimpleNodeParser.from_defaults(chunk_size=512)
# nodes = node_parser.get_nodes_from_documents(documents)
# index = VectorStoreIndex(nodes)
# query_engine = index.as_query_engine()

In [None]:

# from tqdm import tqdm

# queries = [
#     "What is Paul Graham's contribution to computer science?",
#     "What startups has Paul Graham founded?",
#     "What is the impact of Paul Graham's Y Combinator on the tech industry?",
#     "What are some notable essays written by Paul Graham?"
# ]

# for query in tqdm(queries):
#     response = query_engine.query(query)
#     print(f"Query: {query}")
#     print(f"Response: {response}")

Export spans to a DataFrame

In [None]:
# Fetch the spans known to the Phoenix client as a dataframe, then preview the span
# name, span kind, input value and retrieved-documents columns.
spans_df = px.Client().get_spans_dataframe()
spans_df[["name", "span_kind", "attributes.input.value", "attributes.retrieval.documents"]].head()

## Eval

Convert traces to datasets

In [None]:
from phoenix.session.evaluation import get_qa_with_reference, get_retrieved_documents

# Build the two evaluation inputs from the active Phoenix session: the retrieved
# documents and the question/answer rows with their reference text.
# NOTE(review): the description follows the helper names — confirm the exact schemas.
retrieved_documents_df = get_retrieved_documents(px.active_session())
queries_df = get_qa_with_reference(px.active_session())

In [None]:
import nest_asyncio
from phoenix.evals import (
    HALLUCINATION_PROMPT_RAILS_MAP,
    HALLUCINATION_PROMPT_TEMPLATE,
    QA_PROMPT_RAILS_MAP,
    QA_PROMPT_TEMPLATE,
    OpenAIModel,
    llm_classify,
)

nest_asyncio.apply()  # Speeds up OpenAI API calls

# Check if the application has any indications of hallucinations
hallucination_eval = llm_classify(
    dataframe=queries_df,
    model=OpenAIModel(model="gpt-3.5-turbo-0125", temperature=0.0),
    template=HALLUCINATION_PROMPT_TEMPLATE,
    rails=list(HALLUCINATION_PROMPT_RAILS_MAP.values()),
    provide_explanation=True,  # Makes the LLM explain its reasoning
)
# Score 1 for "factual" and 0 for any other label. Rows without a label are left out
# of the comparison, so their score stays missing after index alignment.
hallucination_eval["score"] = (
    hallucination_eval.label[~hallucination_eval.label.isna()] == "factual"
).astype(int)

# Check if the application is answering questions correctly
qa_correctness_eval = llm_classify(
    dataframe=queries_df,
    # Use the same `model=` keyword as the hallucination evaluator above.
    model=OpenAIModel(model="gpt-3.5-turbo-0125", temperature=0.0),
    template=QA_PROMPT_TEMPLATE,
    rails=list(QA_PROMPT_RAILS_MAP.values()),
    provide_explanation=True,  # Makes the LLM explain its reasoning
    concurrency=4,
)

# Score the QA labels themselves: 1 for "correct", 0 otherwise. Reading the labels from
# hallucination_eval here would never match "correct" and would zero out every score.
qa_correctness_eval["score"] = (
    qa_correctness_eval.label[~qa_correctness_eval.label.isna()] == "correct"
).astype(int)

In [None]:

# Preview the first rows of the hallucination evaluation results.
hallucination_eval.head()

In [None]:

# Preview the first rows of the QA correctness evaluation results.
qa_correctness_eval.head()

In [None]:
from phoenix.trace import SpanEvaluations

# Send both evaluation dataframes to Phoenix through the client, registered under the
# names "Hallucination" and "QA Correctness".
px.Client().log_evaluations(
    SpanEvaluations(eval_name="Hallucination", dataframe=hallucination_eval),
    SpanEvaluations(eval_name="QA Correctness", dataframe=qa_correctness_eval),
)

In [None]:

# Print the URL of the active Phoenix session.
print("The Phoenix UI:", px.active_session().url)

### Eval Relevance of RAG Chunks

In [None]:
# Import from `phoenix.evals`, the module the hallucination/QA evaluation cell already
# uses, so `OpenAIModel` and `llm_classify` are not rebound to the legacy
# `phoenix.experimental.evals` versions partway through the notebook.
from phoenix.evals import (
    RAG_RELEVANCY_PROMPT_RAILS_MAP,
    RAG_RELEVANCY_PROMPT_TEMPLATE,
    OpenAIModel,
    llm_classify,
)

# Classify every retrieved document against the relevancy template, asking the LLM to
# explain each label.
retrieved_documents_eval = llm_classify(
    dataframe=retrieved_documents_df,
    model=OpenAIModel(model="gpt-3.5-turbo-0125", temperature=0.0),
    template=RAG_RELEVANCY_PROMPT_TEMPLATE,
    rails=list(RAG_RELEVANCY_PROMPT_RAILS_MAP.values()),
    provide_explanation=True,
)

# Score 1 for "relevant" and 0 for any other label. Rows without a label are left out
# of the comparison, so their score stays missing after index alignment.
retrieved_documents_eval["score"] = (
    retrieved_documents_eval.label[~retrieved_documents_eval.label.isna()] == "relevant"
).astype(int)

In [None]:
# Preview the first rows of the retrieved-document relevance results.
retrieved_documents_eval.head()
     

In [None]:
from phoenix.trace import DocumentEvaluations

# Send the per-document relevance results to Phoenix through the client, registered
# under the name "Relevance".
px.Client().log_evaluations(
    DocumentEvaluations(eval_name="Relevance", dataframe=retrieved_documents_eval)
)

## UMAP Projection

In [None]:
import pandas as pd

# Pull in queries from the LLM (read from a hosted parquet file)
query_df = pd.read_parquet(
    "http://storage.googleapis.com/arize-phoenix-assets/datasets/unstructured/llm/llama-index/arize-docs/query_data_complete3.parquet",
)

# Wrap the query dataframe as a Phoenix inferences object.
query_ds = px.Inferences.from_open_inference(query_df)

# Preview the dataframe held by the inferences object.
query_ds.dataframe.head()

In [None]:

import numpy as np


def storage_context_to_dataframe(storage_context: StorageContext) -> pd.DataFrame:
    """Flatten the contents of a storage context into a pandas dataframe.

    Args:
        storage_context (StorageContext): Storage context whose docstore supplies
        the nodes and whose vector store supplies their embeddings.

    Returns:
        pd.DataFrame: One row per docstore entry, with the columns
        ``document_id`` (the node hash), ``text`` and ``text_vector``.
    """
    doc_store = storage_context.docstore
    embedding_store = storage_context.vector_store
    columns = {"document_id": [], "text": [], "text_vector": []}
    for key, entry in doc_store.docs.items():
        columns["document_id"].append(entry.hash)  # the node hash doubles as the document ID
        columns["text"].append(entry.text)
        # Embeddings are looked up by the docstore key and stored as numpy arrays.
        columns["text_vector"].append(np.array(embedding_store.get(key)))
    return pd.DataFrame(columns)


# Materialise the index contents, keep one row per distinct text, and preview the result.
database_df = storage_context_to_dataframe(storage_context).drop_duplicates(subset=["text"])
database_df.head()

In [None]:
# Get a random sample of about 500 documents that always includes every retrieved
# document, topped up with randomly chosen unretrieved ones.
# This will be handled by the application in a coming release.
num_sampled_point = 500
retrieved_document_ids = {
    doc_id
    for doc_ids in query_df[":feature.[str].retrieved_document_ids:prompt"].to_list()
    for doc_id in doc_ids
}
retrieved_document_mask = database_df["document_id"].isin(retrieved_document_ids)
unretrieved_document_mask = ~retrieved_document_mask
# Count only the retrieved documents that are actually present in database_df; IDs
# that are absent must not reduce the number of additional samples drawn.
num_retrieved_documents = int(retrieved_document_mask.sum())
# Clamp the top-up size: Series.sample raises when n is negative or larger than the
# number of rows available.
num_additional_samples = min(
    max(num_sampled_point - num_retrieved_documents, 0),
    int(unretrieved_document_mask.sum()),
)
sampled_unretrieved_document_ids = set(
    database_df[unretrieved_document_mask]["document_id"]
    .sample(n=num_additional_samples, random_state=0)  # fixed seed keeps the sample reproducible
    .to_list()
)
sampled_unretrieved_document_mask = database_df["document_id"].isin(
    sampled_unretrieved_document_ids
)
# Final selection: every retrieved document plus the sampled unretrieved ones.
sampled_document_mask = retrieved_document_mask | sampled_unretrieved_document_mask
sampled_database_df = database_df[sampled_document_mask]

# Describe the corpus dataframe to Phoenix: "document_id" identifies each row, and the
# prompt embedding uses "text_vector" as the vector column with "text" as its raw data.
database_schema = px.Schema(
    prediction_id_column_name="document_id",
    prompt_column_names=px.EmbeddingColumnNames(
        vector_column_name="text_vector",
        raw_data_column_name="text",
    ),
)
# Wrap the sampled corpus rows as inferences named "database".
database_ds = px.Inferences(
    dataframe=sampled_database_df,
    schema=database_schema,
    name="database",
)

# Launch Phoenix with the queries as the primary inferences and the sampled documents
# as the corpus.
# NOTE(review): confirm what run_in_thread=False changes about how the app is started.
session = px.launch_app(primary=query_ds, corpus=database_ds, run_in_thread=False)

In [None]:
# Take the last entry of the active session's exports and display it.
# NOTE(review): presumably exports is list-like, in which case exports[-1] fails when
# nothing has been exported from the Phoenix UI yet — confirm before running.
exports = px.active_session().exports
dataframe = exports[-1]
dataframe

In [None]:
# Launch Phoenix again with only the query inferences (no corpus), rebinding `session`.
session = px.launch_app(primary=query_ds, run_in_thread=False)

In [None]:
# NOTE(review): displays the whole query inferences dataframe; `.head()` would keep the
# cell output short.
query_ds.dataframe

In [None]:
# Preview the first rows of the raw query dataframe.
query_df.head()

In [None]:
# NOTE(review): displays the whole corpus dataframe; `.head()` would keep the cell
# output short.
database_df