In [1]:
import os
import shutil

import numpy as np
import pandas as pd
import tiktoken
from dotenv import load_dotenv
from openai import OpenAI

In [2]:
# Pull variables from a local .env file (if present) into the process environment.
load_dotenv()
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
# The connection string can be overridden through NAICS_DATABASE_URL so that
# non-default credentials never have to be written into the notebook; the
# fallback is the localhost connection this notebook has always used.
DATABASE_URL = os.environ.get(
    "NAICS_DATABASE_URL",
    "postgresql+psycopg2://airflow:airflow@localhost:5432/airflow",
)

True

In [5]:
# Read the whole `naics_codes` table into memory; this frame is the source for
# every later cell.
base_df = pd.read_sql_table(table_name="naics_codes", con=DATABASE_URL)

In [6]:
# Take a small sample to smoke-test the embedding call cheaply. The fixed
# random_state makes the sample (and the output below) reproducible on re-run.
df = base_df.sample(n=10, random_state=42)

In [7]:
# Hand the key loaded in the config cell to the client explicitly; when it is
# None the client falls back to the OPENAI_API_KEY environment variable itself.
client = OpenAI(api_key=OPENAI_API_KEY)

In [8]:
# Embed the `description` column of the current frame in a single request and
# attach the vectors as numpy arrays. Any failure is reported, not raised.
try:
    descriptions = df['description'].tolist()
    response = client.embeddings.create(
        model="text-embedding-3-small",
        input=descriptions,
    )
    embeddings = list(np.array(item.embedding) for item in response.data)
    df['description_embedding'] = embeddings
except Exception as err:
    print(f"An error occurred: {err}")

In [9]:
# Preview the first five rows, including the new embedding column.
df.head(n=5)

Unnamed: 0,id,naicsCode,title,description,description_embedding
342,343,332919,Other Metal Valve and Pipe Fitting Manufacturing,This U.S. industry comprises establishments pr...,"[-0.027981054037809372, 0.055173907428979874, ..."
26,27,111940,Hay Farming,This industry comprises establishments primari...,"[-0.04596225172281265, -0.0077963354997336864,..."
222,223,323111,Commercial Printing (except Screen and Books),This U.S. industry comprises establishments pr...,"[-0.03339000418782234, 0.02418380416929722, 0...."
486,487,423410,Photographic Equipment and Supplies Merchant W...,This industry comprises establishments primari...,"[-0.007617996074259281, 0.031311675906181335, ..."
141,141,311314,Cane Sugar Manufacturing,This U.S. industry comprises establishments pr...,"[-0.03370805084705353, 0.013124123215675354, 0..."


In [10]:
# Switch from the sample to the full table. Work on a copy so that adding the
# embedding column later does not silently mutate `base_df`.
df = base_df.copy()

In [11]:
# Display the full frame; the notebook renders a truncated view (first and last rows).
df

Unnamed: 0,id,naicsCode,title,description
0,1,111110,Soybean Farming,This industry comprises establishments primari...
1,2,111120,Oilseed (except Soybean) Farming,This industry comprises establishments primari...
2,3,111130,Dry Pea and Bean Farming,This industry comprises establishments primari...
3,4,111140,Wheat Farming,This industry comprises establishments primari...
4,5,111150,Corn Farming,This industry comprises establishments primari...
...,...,...,...,...
1007,1008,926140,Regulation of Agricultural Marketing and Commo...,This industry comprises government establishme...
1008,1009,926150,"Regulation, Licensing, and Inspection of Misce...",This industry comprises government establishme...
1009,1010,927110,Space Research and Technology,This industry comprises government establishme...
1010,1011,928110,National Security,This industry comprises government establishme...


In [12]:
# Concatenate every description into one space-separated string so the whole
# corpus can be token-counted in one pass.
token_test = ' '.join(df['description'].tolist())

In [13]:
def num_tokens_in_corpus(input:str, encoding_name: str) -> int:
    """Return the number of tokens `input` encodes to.

    Args:
        input: The text to tokenize.
        encoding_name: Either a model name known to tiktoken (for example
            "gpt-3.5-turbo") or, if no model of that name is known, the name
            of a tiktoken encoding (for example "cl100k_base").

    Returns:
        The length of the token sequence produced by the resolved encoding.

    Raises:
        ValueError: Propagated from `tiktoken.get_encoding` when the name is
            neither a known model nor a known encoding.
    """
    try:
        encoding = tiktoken.encoding_for_model(encoding_name)
    except KeyError:
        # tiktoken raises KeyError for names it cannot map to a model; treat
        # the argument as a literal encoding name, as the parameter suggests.
        encoding = tiktoken.get_encoding(encoding_name)
    return len(encoding.encode(input))

# Report the token count of the joined corpus using the gpt-3.5-turbo tokenizer.
print(num_tokens_in_corpus(input=token_test, encoding_name="gpt-3.5-turbo"))

70115


In [14]:
# Embed every description in one request, attach the vectors to the frame and
# persist the result as parquet.
descriptions = df['description'].tolist()
try:
    response = client.embeddings.create(input=descriptions, model="text-embedding-3-small")
except Exception as e:
    # Report and stop: continuing would write (and later publish) a parquet
    # file that has no embedding column.
    print(f"An error occurred: {e}")
    raise

# Each returned item carries the index of the input it belongs to; order by it
# so that row i always receives the embedding of description i.
ordered = sorted(response.data, key=lambda res: res.index)
embeddings = [res.embedding for res in ordered]
if len(embeddings) != len(df):
    raise ValueError(
        f"Expected {len(df)} embeddings but received {len(embeddings)}"
    )
df['description_embedding'] = embeddings

# The output directory may not exist on a fresh checkout.
os.makedirs("./embeddings", exist_ok=True)
df.to_parquet("./embeddings/naics_code_embeddings.parquet")

In [27]:
# Read the saved file back to verify what was actually persisted.
df_res = pd.read_parquet(path="./embeddings/naics_code_embeddings.parquet")

In [28]:
# Preview the first five rows of the reloaded frame.
df_res.head(n=5)

Unnamed: 0,id,naicsCode,title,description,description_embedding
0,1,111110,Soybean Farming,This industry comprises establishments primari...,"[-0.0391668938100338, -0.006738914176821709, 0..."
1,2,111120,Oilseed (except Soybean) Farming,This industry comprises establishments primari...,"[-0.03455028310418129, 0.018914388492703438, 0..."
2,3,111130,Dry Pea and Bean Farming,This industry comprises establishments primari...,"[-0.056121308356523514, -0.022741097956895828,..."
3,4,111140,Wheat Farming,This industry comprises establishments primari...,"[-0.0472857840359211, -0.006182927638292313, 0..."
4,5,111150,Corn Farming,This industry comprises establishments primari...,"[-0.04400010034441948, 0.027638481929898262, 0..."


In [30]:
# Inspect the Python type of one stored embedding after the parquet round trip.
type(df_res['description_embedding'].iloc[0])

numpy.ndarray

In [31]:

# Dimensionality of one stored embedding vector.
len(df_res['description_embedding'].iloc[0])

1536

In [33]:
# Copy the embeddings file to the backend's data path. Using shutil, not a
# shell escape, makes a failed copy raise and keeps the step portable.
# The trailing semicolon suppresses the returned path in the cell output.
shutil.copy(
    "./embeddings/naics_code_embeddings.parquet",
    "../backend/app/data/naics/cleaned_combined_naics2022.parquet",
);

In [34]:
# Display the reloaded frame; the notebook renders a truncated view (first and last rows).
df_res

Unnamed: 0,id,naicsCode,title,description,description_embedding
0,1,111110,Soybean Farming,This industry comprises establishments primari...,"[-0.0391668938100338, -0.006738914176821709, 0..."
1,2,111120,Oilseed (except Soybean) Farming,This industry comprises establishments primari...,"[-0.03455028310418129, 0.018914388492703438, 0..."
2,3,111130,Dry Pea and Bean Farming,This industry comprises establishments primari...,"[-0.056121308356523514, -0.022741097956895828,..."
3,4,111140,Wheat Farming,This industry comprises establishments primari...,"[-0.0472857840359211, -0.006182927638292313, 0..."
4,5,111150,Corn Farming,This industry comprises establishments primari...,"[-0.04400010034441948, 0.027638481929898262, 0..."
...,...,...,...,...,...
1007,1008,926140,Regulation of Agricultural Marketing and Commo...,This industry comprises government establishme...,"[-0.03687594458460808, 0.0037729875184595585, ..."
1008,1009,926150,"Regulation, Licensing, and Inspection of Misce...",This industry comprises government establishme...,"[-0.033936310559511185, 0.01909656450152397, 0..."
1009,1010,927110,Space Research and Technology,This industry comprises government establishme...,"[-0.025842957198619843, -0.01179493311792612, ..."
1010,1011,928110,National Security,This industry comprises government establishme...,"[-0.013598986901342869, 0.029110155999660492, ..."
