In [1]:
import sys
# Put the sibling backend directory on the import path (once) so that the
# project imports in the next cell can be resolved.
# NOTE(review): the path is relative, so this presumably expects the kernel's
# working directory to be the notebook's own folder — confirm.
backend_path = '../backend'
if sys.path.count(backend_path) == 0:
    sys.path.append(backend_path)

In [2]:
import numpy as np
import matplotlib.pyplot as plt
from sqlalchemy import create_engine, select, values, update, and_, exists, text, distinct
from sqlalchemy.orm import sessionmaker, join
from dotenv import load_dotenv
from app.models.models import Notice, ResourceLink, NaicsCodes
from app.models.schema import NoticeBase, ResourceLinkBase, ResourceLinkSimple, NaicsCodeBase, NaicsCodeSimple
from dotenv import load_dotenv
import os
import pendulum
from openai import OpenAI
import pandas as pd
import phoenix as px

In [3]:

# Load key/value pairs from a local .env file (if one is found) into the
# process environment. `load_dotenv` was imported but never invoked, so values
# kept in .env were never visible to the os.environ lookups below.
# By default, variables that are already set in the environment are not overridden.
load_dotenv()

OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")

# Database
# The connection URL can be supplied through the DATABASE_URL environment
# variable; the previous hard-coded local URL is kept as the fallback so the
# notebook behaves exactly as before when the variable is not set.
DATABASE_URL = os.environ.get(
    "DATABASE_URL",
    "postgresql+psycopg2://airflow:airflow@localhost:5432/airflow",
)
engine = create_engine(DATABASE_URL)
# Session factory bound to the engine; sessions neither autocommit nor autoflush.
SessionLocal = sessionmaker(autocommit=False, autoflush=False, bind=engine)
# UTC date five days before "now", formatted as YYYYMMDD.
selected_date = pendulum.now("utc").subtract(days=5).strftime("%Y%m%d")

In [4]:
# OpenAI API client; it is used further down to request the embedding for `query`.
# NOTE(review): no api_key is passed explicitly, so this presumably relies on
# OPENAI_API_KEY being present in the environment — confirm.
client = OpenAI()

In [17]:
# Free-text search phrase; it is embedded below and used to order the NAICS
# codes by distance to that embedding.
query: str = "software"

In [18]:
with SessionLocal() as db:
    # Embed the search phrase with the OpenAI embeddings endpoint.
    res = client.embeddings.create(model="text-embedding-3-small", input=query)
    query_embed = res.data[0].embedding

    # Order every NAICS row by the L2 distance between its stored description
    # embedding and the query embedding. No LIMIT is applied, so all rows are
    # returned, closest first.
    distance_to_query = NaicsCodes.description_embedding.l2_distance(query_embed)
    stmt = select(NaicsCodes).order_by(distance_to_query)

    result = db.execute(stmt)
    codes = result.scalars().all()

    # Convert the ORM rows into NaicsCodeBase schema objects while the session
    # is still open.
    data = list(map(NaicsCodeBase.model_validate, codes))

In [19]:
# Human-readable label (the title) for every record, in the same order as `data`.
labels = [record.title for record in data]
# Stack the per-record description embeddings into a single NumPy array,
# one entry per record and aligned with `labels`.
embeddings = np.array([record.description_embedding for record in data])

In [20]:
# Despite the name, this is a list: one `model_dump()` result per entry in
# `data`, kept in the same order. It is the input for the DataFrame built next.
data_dict = [entry.model_dump() for entry in data]

In [23]:
# Tabulate the dumped records: one row per record, one column per dumped field.
df = pd.DataFrame(data=data_dict)
# Displayed as the cell result: (number of rows, number of columns).
df.shape

(1012, 5)

In [24]:
# Preview the first rows of the frame built from the NAICS records.
df.head()

Unnamed: 0,id,naicsCode,title,description,description_embedding
0,769,541511,Custom Computer Programming Services,This U.S. industry comprises establishments pr...,"[-0.020327847450971603, -0.0009499804000370204..."
1,677,513210,Software Publishers,This industry comprises establishments primari...,"[-0.011772933416068554, 0.003895207541063428, ..."
2,770,541512,Computer Systems Design Services,This U.S. industry comprises establishments pr...,"[-0.0222358126193285, 0.011486489325761795, 0...."
3,688,518210,"Computing Infrastructure Providers, Data Proce...",This industry comprises establishments primari...,"[-0.01620187796652317, 0.012468253262341022, 0..."
4,385,334111,Electronic Computer Manufacturing,This U.S. industry comprises establishments pr...,"[-0.018319282680749893, 0.018116984516382217, ..."


In [26]:
# Location of the Parquet file used to persist `df` (NAICS rows together with
# their description embeddings). The path is relative to the working directory.
datapath: str = "./data/naics_embeds.parquet"

In [25]:
# Persist the NAICS frame (including the embedding column) as Parquet.
# The target folder is created first: writing fails when the parent directory
# does not exist, e.g. on a fresh checkout.
parent_dir = os.path.dirname(datapath)
if parent_dir:  # an empty dirname means "current directory"; nothing to create
    os.makedirs(parent_dir, exist_ok=True)
# `datapath` is already a string, so it is passed directly (no f-string wrapper).
df.to_parquet(datapath, engine="pyarrow")

In [None]:
# Read back the file written to `datapath` as a round-trip check. The previous
# path literal (".'") was a typo and did not point at any file.
pd.read_parquet(datapath, engine="pyarrow")

In [13]:
# Tell Phoenix which column holds the embedding vector and which holds the
# raw text that the vector was computed from.
description_embedding_columns = px.EmbeddingColumnNames(
    vector_column_name="description_embedding",
    raw_data_column_name="description",
)

# Schema describing `df` for Phoenix: `id` identifies each row and the
# description embedding is registered under the prompt columns.
database_schema = px.Schema(
    id_column_name="id",
    prompt_column_names=description_embedding_columns,
)

In [14]:
# Wrap the NAICS frame and its schema in a Phoenix Inferences object named "database".
database_ds = px.Inferences(
    dataframe=df,
    schema=database_schema,
    name="database",
)

In [16]:
# Inspect the dataframe held by the Inferences object.
# NOTE(review): judging by the rendered output it carries a `timestamp`
# index/column that is not present in `df` — presumably added by Phoenix; confirm.
database_ds.dataframe

Unnamed: 0_level_0,id,naicsCode,title,description,description_embedding,timestamp
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2024-04-17 08:31:22.365486+00:00,769,541511,Custom Computer Programming Services,This U.S. industry comprises establishments pr...,"[-0.020327847450971603, -0.0009499804000370204...",2024-04-17 08:31:22.365486+00:00
2024-04-17 08:31:22.365486+00:00,677,513210,Software Publishers,This industry comprises establishments primari...,"[-0.011772933416068554, 0.003895207541063428, ...",2024-04-17 08:31:22.365486+00:00
2024-04-17 08:31:22.365486+00:00,770,541512,Computer Systems Design Services,This U.S. industry comprises establishments pr...,"[-0.0222358126193285, 0.011486489325761795, 0....",2024-04-17 08:31:22.365486+00:00
2024-04-17 08:31:22.365486+00:00,688,518210,"Computing Infrastructure Providers, Data Proce...",This industry comprises establishments primari...,"[-0.01620187796652317, 0.012468253262341022, 0...",2024-04-17 08:31:22.365486+00:00
2024-04-17 08:31:22.365486+00:00,385,334111,Electronic Computer Manufacturing,This U.S. industry comprises establishments pr...,"[-0.018319282680749893, 0.018116984516382217, ...",2024-04-17 08:31:22.365486+00:00
...,...,...,...,...,...,...
2024-04-17 08:31:22.365486+00:00,940,722515,Snack and Nonalcoholic Beverage Bars,This U.S. industry comprises establishments pr...,"[-0.025715263560414314, 0.0006730166496708989,...",2024-04-17 08:31:22.365486+00:00
2024-04-17 08:31:22.365486+00:00,123,238310,Drywall and Insulation Contractors,This industry comprises establishments primari...,"[-0.030799265950918198, 0.018252259120345116, ...",2024-04-17 08:31:22.365486+00:00
2024-04-17 08:31:22.365486+00:00,95,221122,Electric Power Distribution,This U.S. industry comprises electric power es...,"[-0.046039506793022156, -0.012125770561397076,...",2024-04-17 08:31:22.365486+00:00
2024-04-17 08:31:22.365486+00:00,1008,926140,Regulation of Agricultural Marketing and Commo...,This industry comprises government establishme...,"[-0.03687594458460808, 0.0037729875184595585, ...",2024-04-17 08:31:22.365486+00:00


In [None]:
# Scratch cell intentionally left without code: it previously contained only a
# dangling attribute access on `px`, which is a SyntaxError and stops
# "Restart Kernel -> Run All" before the Phoenix app is launched.

In [14]:
# Start the Phoenix UI with the NAICS inferences as the primary dataset.
# NOTE(review): run_in_thread=False presumably makes Phoenix run the server
# outside the notebook's thread — confirm against the Phoenix docs.
session = px.launch_app(
    primary=database_ds,
    run_in_thread=False,
)

🌍 To view the Phoenix app in your browser, visit http://localhost:6006/
📺 To view the Phoenix app in a notebook, run `px.active_session().view()`
📖 For more information on how to use Phoenix, check out https://docs.arize.com/phoenix
