# Phoenix Tracing

In [1]:
import sys
# Put the relative backend directory on the import path exactly once.
# Presumably this is where the `app` package imported in the next cell lives.
backend_path = '../backend'
if sys.path.count(backend_path) == 0:
    sys.path.append(backend_path)

In [2]:
import numpy as np
import matplotlib.pyplot as plt
from sqlalchemy import create_engine, select, values, update, and_, exists, text, distinct
from sqlalchemy.orm import sessionmaker, join
from dotenv import load_dotenv
from app.models.models import Notice, ResourceLink, NaicsCodes
from app.models.schema import NoticeBase, ResourceLinkBase, ResourceLinkSimple, NaicsCodeBase, NaicsCodeSimple
from dotenv import load_dotenv
import os
import pendulum
from openai import OpenAI
import pandas as pd
import phoenix as px

In [3]:

# Load variables from a local .env file (if one is found) into the process environment.
# `load_dotenv` is imported above but was never called, so a key stored only in .env
# never reached os.environ. Variables that are already set are left untouched.
load_dotenv()

# May be None if the variable is not set anywhere.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")

# Database
# NOTE(review): the connection string (including credentials) is hard-coded for a local
# instance — consider reading it from the environment before sharing this notebook.
DATABASE_URL = "postgresql+psycopg2://airflow:airflow@localhost:5432/airflow"
engine = create_engine(DATABASE_URL)
# Session factory bound to the engine; sessions neither autocommit nor autoflush.
SessionLocal = sessionmaker(autocommit=False, autoflush=False, bind=engine)

# Date five days before the current UTC time, formatted as YYYYMMDD.
selected_date = pendulum.now("utc").subtract(days=5).strftime("%Y%m%d")

In [4]:
# OpenAI client constructed with no explicit arguments; used below for embedding requests.
client = OpenAI()

In [5]:
# Free-text search term; it is embedded and compared against NAICS description embeddings below.
query = "software"

## NAICS codes

In [6]:
# Embed `query`, then pull the 100 NAICS rows whose description embeddings are
# nearest to it by L2 distance and validate each row into a NaicsCodeBase.
with SessionLocal() as db:
    res = client.embeddings.create(model="text-embedding-3-small", input=query)
    query_embed = res.data[0].embedding

    # Ordering expression: distance between each stored embedding and the query embedding.
    l2_to_query = NaicsCodes.description_embedding.l2_distance(query_embed)
    stmt = select(NaicsCodes).order_by(l2_to_query).limit(100)

    result = db.execute(stmt)
    codes = result.scalars().all()
    data = list(map(NaicsCodeBase.model_validate, codes))

In [7]:
# Parallel views of the validated records: their titles, and a NumPy array built
# from their description embeddings.
labels = [record.title for record in data]
embeddings = np.array([record.description_embedding for record in data])

In [8]:
# Convert each validated record to a plain dict via model_dump() (result is a list of dicts).
data_dict = [entry.model_dump() for entry in data]

In [9]:
# Build a DataFrame from the dumped records and show its (rows, columns) shape.
df = pd.DataFrame(data_dict)
df.shape

(100, 5)

In [10]:
# Preview the first rows of the NAICS frame.
df.head()

Unnamed: 0,id,naicsCode,title,description,description_embedding
0,769,541511,Custom Computer Programming Services,This U.S. industry comprises establishments pr...,"[-0.020327847450971603, -0.0009499804000370204..."
1,677,513210,Software Publishers,This industry comprises establishments primari...,"[-0.011772933416068554, 0.003895207541063428, ..."
2,770,541512,Computer Systems Design Services,This U.S. industry comprises establishments pr...,"[-0.0222358126193285, 0.011486489325761795, 0...."
3,688,518210,"Computing Infrastructure Providers, Data Proce...",This industry comprises establishments primari...,"[-0.01620187796652317, 0.012468253262341022, 0..."
4,385,334111,Electronic Computer Manufacturing,This U.S. industry comprises establishments pr...,"[-0.018319282680749893, 0.018116984516382217, ..."


In [11]:
# Relative path of the parquet file used to persist and reload the NAICS embedding frame.
datapath = "./data/naics_embeds.parquet"

In [12]:
# Persist the NAICS frame as parquet.
# to_parquet does not create missing directories, so make sure the parent folder
# exists first; otherwise this cell fails on a fresh checkout without ./data/.
parquet_dir = os.path.dirname(datapath)
if parquet_dir:
    os.makedirs(parquet_dir, exist_ok=True)
df.to_parquet(datapath, engine="pyarrow")

In [13]:
# Reload the frame from the parquet file, rebinding `df` to the round-tripped copy.
df = pd.read_parquet(datapath, engine="pyarrow")

In [14]:
# Phoenix schema for the NAICS frame: rows are identified by the "id" column, and the
# prompt embedding pairs the "description_embedding" vectors with the raw "description" text.
naics_embedding_columns = px.EmbeddingColumnNames(
    vector_column_name="description_embedding",
    raw_data_column_name="description",
)
database_schema = px.Schema(
    id_column_name="id",
    prompt_column_names=naics_embedding_columns,
)

In [15]:
# Wrap the NAICS frame and its schema in a Phoenix Inferences object named "database".
database_ds = px.Inferences(dataframe=df, schema=database_schema, name="database")

In [16]:
# Display the frame held by the Inferences object.
# NOTE: in the recorded output it carries a `timestamp` index and column that `df` did not have.
database_ds.dataframe

Unnamed: 0_level_0,id,naicsCode,title,description,description_embedding,timestamp
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2024-04-17 18:16:08.847808+00:00,769,541511,Custom Computer Programming Services,This U.S. industry comprises establishments pr...,"[-0.020327847450971603, -0.0009499804000370204...",2024-04-17 18:16:08.847808+00:00
2024-04-17 18:16:08.847808+00:00,677,513210,Software Publishers,This industry comprises establishments primari...,"[-0.011772933416068554, 0.003895207541063428, ...",2024-04-17 18:16:08.847808+00:00
2024-04-17 18:16:08.847808+00:00,770,541512,Computer Systems Design Services,This U.S. industry comprises establishments pr...,"[-0.0222358126193285, 0.011486489325761795, 0....",2024-04-17 18:16:08.847808+00:00
2024-04-17 18:16:08.847808+00:00,688,518210,"Computing Infrastructure Providers, Data Proce...",This industry comprises establishments primari...,"[-0.01620187796652317, 0.012468253262341022, 0...",2024-04-17 18:16:08.847808+00:00
2024-04-17 18:16:08.847808+00:00,385,334111,Electronic Computer Manufacturing,This U.S. industry comprises establishments pr...,"[-0.018319282680749893, 0.018116984516382217, ...",2024-04-17 18:16:08.847808+00:00
...,...,...,...,...,...,...
2024-04-17 18:16:08.847808+00:00,195,315210,Cut and Sew Apparel Contractors,This industry comprises establishments commonl...,"[0.002346826484426856, 0.04536880552768707, 0....",2024-04-17 18:16:08.847808+00:00
2024-04-17 18:16:08.847808+00:00,165,311824,"Dry Pasta, Dough, and Flour Mixes Manufacturin...",This U.S. industry comprises establishments pr...,"[-0.08442489802837372, 0.0037800853606313467, ...",2024-04-17 18:16:08.847808+00:00
2024-04-17 18:16:08.847808+00:00,498,423690,Other Electronic Parts and Equipment Merchant ...,This industry comprises establishments primari...,"[-0.004994011949747801, 0.020390775054693222, ...",2024-04-17 18:16:08.847808+00:00
2024-04-17 18:16:08.847808+00:00,851,611430,Professional and Management Development Training,This industry comprises establishments primari...,"[-0.04235759377479553, 0.021035049110651016, 0...",2024-04-17 18:16:08.847808+00:00


In [17]:
# Launch the Phoenix app with the NAICS inferences as the primary dataset.
session = px.launch_app(primary=database_ds,run_in_thread=False)

Existing running Phoenix instance detected! Shutting it down and starting a new instance...


🌍 To view the Phoenix app in your browser, visit http://localhost:6006/
📺 To view the Phoenix app in a notebook, run `px.active_session().view()`
📖 For more information on how to use Phoenix, check out https://docs.arize.com/phoenix


## Query vs Summary Embeddings

In [18]:
# New free-text query for the summary search; this rebinds the earlier `query` value.
query = "I want to pave a parking lot"

In [19]:

# Two-step retrieval of the resource-link summaries closest to `query`:
#   1. rank ResourceLink rows by cosine distance between their summary embedding and the
#      query embedding, keeping the 1000 nearest;
#   2. fetch those links joined to their parent Notice and flatten each row into a dict.
# The second query has no ORDER BY, so `mapped_data` is not guaranteed to be in distance order.
with SessionLocal() as db:
    res = client.embeddings.create(input=query, model="text-embedding-3-small")
    query_embed = res.data[0].embedding

    # Step 1: ids of the nearest resource links.
    stmt = (
        select(ResourceLink)
        .order_by(ResourceLink.summary_embedding.cosine_distance(query_embed))
        .limit(1000)
    )
    results = db.execute(stmt)
    data = results.scalars().all()
    nearest_links = [ResourceLinkSimple.model_validate(item) for item in data]
    link_ids = [link.id for link in nearest_links]

    # Step 2: notice + link columns for exactly those ids.
    stmt = (
        select(
            Notice.id,
            ResourceLink.summary,
            Notice.title,
            ResourceLink.text,
            Notice.uiLink,
            Notice.postedDate,
            ResourceLink.summary_embedding,
        )
        .join(ResourceLink, Notice.id == ResourceLink.notice_id)
        .where(ResourceLink.id.in_(link_ids))
    )
    result = db.execute(stmt)
    data = result.all()

    # Unpack each row by name (same order as the select above) instead of indexing by
    # position, and tolerate a missing postedDate rather than raising on `.isoformat()`.
    mapped_data = [
        {
            "notice_id": notice_id,
            "summary": summary,
            "title": title,
            "text": link_text,
            "uiLink": ui_link,
            "postedDate": posted_date.isoformat() if posted_date is not None else None,
            "summary_embedding": summary_embedding,
        }
        for (
            notice_id,
            summary,
            title,
            link_text,
            ui_link,
            posted_date,
            summary_embedding,
        ) in data
    ]

In [20]:
# Single-row frame holding the query text and its embedding (from the retrieval cell above), with id 0.
query_df = pd.DataFrame({"id": [0], "query": [query], "embedding": [query_embed]})

In [21]:
# Display the one-row query frame.
query_df

Unnamed: 0,id,query,embedding
0,0,I want to pave a parking lot,"[-0.015264973044395447, -0.06939153373241425, ..."


In [22]:
# Phoenix schema for the query frame: rows are identified by "id", and the prompt
# embedding pairs the "embedding" vectors with the raw "query" text.
query_embedding_columns = px.EmbeddingColumnNames(
    vector_column_name="embedding",
    raw_data_column_name="query",
)
query_schema = px.Schema(
    id_column_name="id",
    prompt_column_names=query_embedding_columns,
)

In [23]:
# Wrap the query frame and its schema in a Phoenix Inferences object named "query".
query_ds = px.Inferences(dataframe=query_df, schema=query_schema, name="query")

In [24]:
# Build the summaries frame from the flattened notice/resource-link records.
# Note: this rebinds `df`, which previously held the NAICS frame.
df = pd.DataFrame(mapped_data)

In [25]:
# Preview the first rows of the summaries frame.
df.head()

Unnamed: 0,notice_id,summary,title,text,uiLink,postedDate,summary_embedding
0,9f460049177441fdbbe298fd0f7f00bb,This document appears to be an amendment to a ...,Monitoring the Historic Area Remediation Site ...,1. CONTRACT ID CODE\n\nAMENDMENT OF SOLICITATI...,https://sam.gov/opp/9f460049177441fdbbe298fd0f...,2024-03-15T00:00:00,"[0.039435264, 0.0059268097, 0.061893474, -0.00..."
1,95ccd80ef82d4c12a4e56bdf54b7d889,This document appears to be an amendment to a ...,Y1DA--635 A/E design Specialty Hospital Outsid...,5. PROJECT NUMBER (if applicable)\n\nCODE\n\n7...,https://sam.gov/opp/95ccd80ef82d4c12a4e56bdf54...,2024-03-15T00:00:00,"[-0.009441078, 0.029832205, 0.030980624, 0.003..."
2,bce84de2312e4221b1862c15711eea74,This document appears to be a standard governm...,56--METAL ROOF ARBOR SHADE,DATE BOND EXECUTED (Must be same or later than...,https://sam.gov/opp/bce84de2312e4221b1862c1571...,2024-03-15T00:00:00,"[-0.025685133, -0.0089465715, 0.06374645, 0.01..."
3,ec961a4666e04f2f9cbed2e0a6c6cb63,Certificate of Compliance\nThis document is a ...,SALMON CLOTH,Form Approved\n\nCONTRACT DATA REQUIREMENTS LI...,https://sam.gov/opp/ec961a4666e04f2f9cbed2e0a6...,2024-03-15T00:00:00,"[0.026451945, 0.038039513, 0.05694054, 0.02316..."
4,bce84de2312e4221b1862c15711eea74,"This document appears to be a Bid Bond form, w...",56--METAL ROOF ARBOR SHADE,DATE BOND EXECUTED (Must not be later than bid...,https://sam.gov/opp/bce84de2312e4221b1862c1571...,2024-03-15T00:00:00,"[-0.014625432, 0.014506623, 0.03388443, 0.0391..."


In [26]:
# Phoenix schema for the summaries frame: rows are identified by "notice_id", and the
# prompt embedding pairs the "summary_embedding" vectors with the raw "summary" text.
# This rebinds `database_schema`, which previously described the NAICS frame.
# NOTE(review): the recorded df.head() output shows the same notice_id on more than one
# row — confirm Phoenix accepts a non-unique id column.
summary_embedding_columns = px.EmbeddingColumnNames(
    vector_column_name="summary_embedding",
    raw_data_column_name="summary",
)
database_schema = px.Schema(
    id_column_name="notice_id",
    prompt_column_names=summary_embedding_columns,
)

In [27]:
# Wrap the summaries frame and its schema in a Phoenix Inferences object named "summaries".
database_ds = px.Inferences(dataframe=df, schema=database_schema, name="summaries")

In [28]:
# Relaunch Phoenix with the query as the primary dataset and the summaries as the corpus.
session = px.launch_app(primary=query_ds, corpus=database_ds, run_in_thread=False)

Existing running Phoenix instance detected! Shutting it down and starting a new instance...


🌍 To view the Phoenix app in your browser, visit http://localhost:6006/
📺 To view the Phoenix app in a notebook, run `px.active_session().view()`
📖 For more information on how to use Phoenix, check out https://docs.arize.com/phoenix
