In [1]:
import sys
# Register the sibling backend directory on the import path (once), presumably
# so the `app.*` modules imported further down can be found.
backend_path = '../backend'
if sys.path.count(backend_path) == 0:
    sys.path.append(backend_path)

In [2]:
import os
import pandas as pd
import numpy as np
from openai import OpenAI
from dotenv import load_dotenv
import tiktoken
from sqlalchemy import create_engine, select
from sqlalchemy.orm import sessionmaker
from app.models.models import NaicsCodes
from app.models.schema import NaicsCodeBase


In [3]:
# Load variables from a local .env file (if one exists) into the process
# environment before any setting is read.
load_dotenv()
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
# The connection string can be overridden through the DATABASE_URL environment
# variable so real credentials never need to be committed with the notebook.
# The fallback is the local development database this notebook originally
# hard-coded. NOTE(review): if your .env already defines DATABASE_URL for
# another context (e.g. a container hostname), that value now takes precedence.
DATABASE_URL = os.environ.get(
    "DATABASE_URL",
    "postgresql+psycopg2://airflow:airflow@localhost:5432/airflow",
)

In [4]:
# A single engine is shared by the whole notebook; individual sessions are
# opened on demand from the factory below.
engine = create_engine(DATABASE_URL)
SessionLocal = sessionmaker(
    bind=engine,
    autoflush=False,
    autocommit=False,
)

In [5]:
# Pass the key read in the configuration cell explicitly instead of relying on
# the client's implicit environment lookup. When the value is None the client
# still falls back to the OPENAI_API_KEY environment variable.
client = OpenAI(api_key=OPENAI_API_KEY)

In [6]:
# Read the locally stored parquet file; judging by its name it holds the NAICS
# code embeddings (TODO confirm the schema).
df = pd.read_parquet(
    path="./embeddings/naics_code_embeddings.parquet",
)

In [32]:
# Free-text search phrase; it is embedded below and compared against the stored
# description embeddings.
query = "Dogs"

In [33]:
# Ask the embeddings endpoint for the vector representation of the search phrase.
res = client.embeddings.create(
    model="text-embedding-3-small",
    input=query,
)
# A single input string was sent, so the vector of interest is the first item.
query_embed = res.data[0].embedding

In [34]:
with SessionLocal() as session:
    # Rank rows by L2 distance between their description embedding and the
    # query embedding, keeping the five closest.
    stmt = (
        select(NaicsCodes)
        .order_by(NaicsCodes.description_embedding.l2_distance(query_embed))
        .limit(5)
    )
    nearest = session.scalars(stmt)
    # Convert the ORM rows to schema objects while the session is still open.
    results = [NaicsCodeBase.model_validate(row) for row in nearest]

In [35]:
# Description text of each hit, in the order the query returned them.
descriptions = [hit.description for hit in results]
descriptions

['This U.S. industry comprises establishments primarily engaged in manufacturing dog and cat food from ingredients, such as grains, oilseed mill products, and meat products.\n',
 'This industry comprises establishments primarily engaged in retailing pets, pet foods, and pet supplies.\n',
 'This industry comprises establishments primarily engaged in providing pet care services (except veterinary), such as boarding, grooming, sitting, walking, and training pets.\n',
 "This industry comprises establishments of licensed veterinary practitioners primarily engaged in the practice of veterinary medicine, dentistry, or surgery for animals; and establishments primarily engaged in providing testing services for licensed veterinary practitioners.\nIllustrative Examples:\nAnimal hospitals\nVeterinary clinics\nVeterinarians' offices\nVeterinary testing laboratories\n",
 'This U.S. industry comprises establishments primarily engaged in manufacturing animal food (except dog and cat) from ingredients,

In [31]:
# NAICS code of each hit, in the same order as the query results.
# NOTE(review): the output saved under this cell carries an older execution
# count than the cells above it and appears to predate the current query;
# re-run the notebook top to bottom before relying on it.
codes = [hit.naicsCode for hit in results]
codes

[813110, 813311, 813990, 928120, 541720]