In [11]:
import sys
# Register the sibling backend directory on the import path so the
# `app.*` imports further down can be resolved. The count check keeps
# re-running this cell from appending the same entry twice.
backend_path = '../backend'
if sys.path.count(backend_path) == 0:
    sys.path.append(backend_path)

In [12]:
import os
import pandas as pd
import numpy as np
from openai import OpenAI
from dotenv import load_dotenv
import tiktoken
from sqlalchemy import create_engine, select
from sqlalchemy.orm import sessionmaker
from app.models.models import NaicsCodes
from app.models.schema import NaicsCodeBase


In [13]:
# Load variables from a .env file (if one is found) into the process environment.
load_dotenv()

# May be None when the variable is not set; nothing in this cell validates it.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")

# Database connection string. It can be overridden by setting
# NOTEBOOK_DATABASE_URL in the environment or in .env, so credentials for a
# non-default database never need to be written into the notebook. When the
# variable is unset the previous hard-coded local value is used, which keeps
# existing runs unchanged.
DATABASE_URL = os.environ.get(
    "NOTEBOOK_DATABASE_URL",
    "postgresql+psycopg2://airflow:airflow@localhost:5432/airflow",
)

In [14]:
# One engine for the whole notebook, built from the configured connection URL.
engine = create_engine(DATABASE_URL)

# Session factory bound to that engine; each `SessionLocal()` call returns a
# new session with autoflush and autocommit switched off.
SessionLocal = sessionmaker(
    bind=engine,
    autoflush=False,
    autocommit=False,
)

In [15]:
# OpenAI API client used for the embedding request below. The key read in the
# configuration cell is passed explicitly; when it is None the client falls
# back to looking up OPENAI_API_KEY in the environment itself.
client = OpenAI(api_key=OPENAI_API_KEY)

In [16]:
# Load the parquet file of NAICS code embeddings into a DataFrame.
# NOTE(review): `df` is not referenced by any of the later cells visible
# here - confirm whether this load is still needed.
df = pd.read_parquet(path="./embeddings/naics_code_embeddings.parquet")

In [17]:
# Free-text search phrase that is embedded and matched against the stored
# NAICS description embeddings in the cells below.
query: str = "Growing wheat"

In [18]:
# Request an embedding for the query text from the OpenAI embeddings endpoint.
res = client.embeddings.create(
    model="text-embedding-3-small",
    input=query,
)

# Only the first item of the response is used; it carries the query's vector.
query_embed = res.data[0].embedding

In [19]:
with SessionLocal() as session:
    # Order rows by L2 distance between the stored description embedding and
    # the query embedding, and keep the first five.
    # NOTE(review): `l2_distance` is presumably the pgvector column
    # comparator - confirm against the NaicsCodes model.
    distance = NaicsCodes.description_embedding.l2_distance(query_embed)
    stmt = select(NaicsCodes).order_by(distance).limit(5)
    nearest = session.scalars(stmt)

    # Convert each ORM row to its schema object while the session is still
    # open, so the results stay usable after the block exits.
    results = [NaicsCodeBase.model_validate(row) for row in nearest]

In [20]:
# Collect the description text of every match, in ranked order, and display
# the list as this cell's output.
descriptions = [hit.description for hit in results]
descriptions

['This industry comprises establishments primarily engaged in growing wheat and/or producing wheat seeds.\n',
 'This U.S. industry comprises establishments primarily engaged in growing grains and/or producing grain(s) seeds (except wheat, corn, rice, and oilseed(s) and grain(s) combinations).\nIllustrative Examples:\nBarley farming\nSorghum farming\nOat farming\nWild rice farming\nRye farming\n',
 'This industry comprises establishments primarily engaged in the merchant wholesale distribution of grains, such as corn, wheat, oats, barley, and unpolished rice; dry beans; and soybeans and other inedible beans.  Included in this industry are establishments primarily engaged in operating country or terminal grain elevators primarily for the purpose of wholesaling.\n',
 'This industry comprises establishments primarily engaged in operating bulk farm product warehousing and storage facilities (except refrigerated).  Grain elevators primarily engaged in storage are included in this industry.\n

In [21]:
# Collect the NAICS code of every match, in the same ranked order as the
# descriptions, and display the list as this cell's output.
codes = [hit.naicsCode for hit in results]
codes

[111140, 111199, 424510, 493130, 311221]