In [6]:
import sys
# Put the sibling backend directory on the import path (presumably so the
# `app` package imported further down can be resolved).
backend_path = '../backend'
# Skip the append when the entry is already present, so re-running this cell
# does not pile up duplicates in sys.path.
if sys.path.count(backend_path) == 0:
    sys.path.append(backend_path)

In [7]:
import numpy as np
import matplotlib.pyplot as plt
from sqlalchemy import create_engine, select, values, update, and_, exists, text, distinct
from sqlalchemy.orm import sessionmaker, join
from dotenv import load_dotenv
from app.models.models import Notice, ResourceLink, NaicsCodes
from app.models.schema import NoticeBase, ResourceLinkBase, ResourceLinkSimple, NaicsCodeBase, NaicsCodeSimple
from dotenv import load_dotenv
import os
import pendulum
from openai import OpenAI
import pandas as pd
import phoenix as px

In [8]:

# Load variables from a local .env file (if one is found) into the process
# environment before anything reads them. `load_dotenv` is imported above but
# was never invoked, so keys kept only in .env were invisible to this notebook.
# Variables already set in the environment are left untouched by default.
load_dotenv()

# API key as seen in the environment at this point (None when it is not set).
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")

# Database
# NOTE(review): connection string with inline credentials for a local
# instance; consider sourcing it from the environment before sharing.
DATABASE_URL = "postgresql+psycopg2://airflow:airflow@localhost:5432/airflow"
engine = create_engine(DATABASE_URL)
# Session factory bound to the engine; autocommit and autoflush are disabled.
SessionLocal = sessionmaker(autocommit=False, autoflush=False, bind=engine)

# UTC date five days before "now", formatted as YYYYMMDD.
selected_date = pendulum.now("utc").subtract(days=5).strftime("%Y%m%d")

In [9]:
# OpenAI client built with default arguments; no API key is passed explicitly
# here (presumably the SDK picks it up from the environment — TODO confirm).
client = OpenAI()

In [10]:
# Free-text search phrase; it is embedded and used to rank NAICS codes below.
query = "cabinet making"

In [11]:
# Embed the query text first. This is a remote call to the OpenAI API and needs
# no database access, so it is done before a session is opened rather than
# inside the session scope.
res = client.embeddings.create(input=query, model="text-embedding-3-small")
query_embed = res.data[0].embedding

# Similarity search: order NAICS codes by the L2 distance between their
# `description_embedding` column and the query embedding, keeping the 20 closest.
stmt = (
    select(NaicsCodes)
    .order_by(NaicsCodes.description_embedding.l2_distance(query_embed))
    .limit(20)
)

# The session now wraps only the database work.
with SessionLocal() as db:
    result = db.execute(stmt)
    codes = result.scalars().all()
    # Convert ORM rows to schema objects while the session is still open, so
    # validation can read attributes from rows that are still attached.
    data = [NaicsCodeBase.model_validate(code) for code in codes]

In [12]:
# Collect the embedding of every matched record into one NumPy array
# (one entry per record, in result order).
embeddings = np.array([record.description_embedding for record in data])
# Titles of the same records, in the same order as `embeddings`.
labels = [record.title for record in data]

In [13]:
# Dump each validated record via `model_dump()`; despite the name, `data_dict`
# is a list holding one dumped entry per record.
data_dict = [entry.model_dump() for entry in data]

In [14]:
# NOTE(review): builds an empty DataFrame that is never assigned, only
# displayed — looks like a leftover scratch cell; consider deleting it.
pd.DataFrame()

In [15]:
# Tabular view of the search results, built from the dumped records.
df = pd.DataFrame(data_dict)

In [16]:
# Preview the first rows to sanity-check the columns and the ranking.
df.head()

Unnamed: 0,id,naicsCode,title,description,description_embedding
0,450,337110,Wood Kitchen Cabinet and Countertop Manufacturing,This industry comprises establishments primari...,"[-0.04873565584421158, 0.02250848338007927, 0...."
1,128,238350,Finish Carpentry Contractors,This industry comprises establishments primari...,"[-0.04489817097783089, 0.06084062159061432, 0...."
2,452,337122,Nonupholstered Wood Household Furniture Manufa...,This U.S. industry comprises establishments pr...,"[-0.04289516061544418, 0.04171433672308922, 0...."
3,456,337212,Custom Architectural Woodwork and Millwork Man...,This U.S. industry comprises establishments pr...,"[-0.030280133709311485, 0.032772015780210495, ..."
4,455,337211,Wood Office Furniture Manufacturing,This U.S. industry comprises establishments pr...,"[-0.0499969981610775, 0.05100461468100548, 0.0..."


In [18]:
# Pair the embedding vector column with the raw text column it belongs to.
description_columns = px.EmbeddingColumnNames(
    vector_column_name="description_embedding",
    raw_data_column_name="description",
)

# Phoenix schema for the results frame: `id` is the record identifier and the
# description embedding is registered under the prompt column names.
database_schema = px.Schema(
    id_column_name="id",
    prompt_column_names=description_columns,
)

In [19]:
# Bundle the results frame with its schema as a Phoenix inference set named "database".
database_ds = px.Inferences(dataframe=df, schema=database_schema, name="database")

In [20]:
# Launch the Phoenix app with the NAICS results as the primary inference set.
# `run_in_thread=False` is passed through as-is (presumably makes the server
# run outside the kernel thread — TODO confirm against the Phoenix docs).
session = px.launch_app(
    primary=database_ds,
    run_in_thread=False,
)

Existing running Phoenix instance detected! Shutting it down and starting a new instance...


🌍 To view the Phoenix app in your browser, visit http://localhost:6006/
📺 To view the Phoenix app in a notebook, run `px.active_session().view()`
📖 For more information on how to use Phoenix, check out https://docs.arize.com/phoenix
