In [None]:
import os
from io import BytesIO
from urllib.parse import quote

import pandas as pd
import psycopg2
import requests
import tiktoken
from dotenv import load_dotenv
from sqlalchemy import create_engine, Integer, text

In [None]:
# Pull settings from a local .env file into the process environment, then
# read each credential. Every lookup yields None when the variable is unset.
load_dotenv()

# SAM.gov public API key.
SAM_PUBLIC_API_KEY = os.getenv("SAM_PUBLIC_API_KEY")

# S3 credentials and region.
S3_AWS_ACCESS_KEY_ID = os.getenv("S3_AWS_ACCESS_KEY_ID")
S3_AWS_SECRET_ACCESS_KEY = os.getenv("S3_AWS_SECRET_ACCESS_KEY")
S3_REGION_NAME = os.getenv("S3_REGION_NAME")

# Password for the local Postgres instance.
POSTGRES_PASSWORD = os.getenv("POSTGRES_PASSWORD")

# OpenAI API key.
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

In [None]:

# Percent-encode the password before embedding it in the URLs: characters such
# as '@', ':', '/' or '%' would otherwise be parsed as URL structure and break
# the connection string. An unset password becomes an empty string rather than
# the literal text "None".
_pg_password = quote(POSTGRES_PASSWORD or "", safe="")

# Plain PostgreSQL URL (not consumed by any cell visible in this notebook).
connection_string = f"postgresql://postgres:{_pg_password}@localhost:5432/postgres"
# SQLAlchemy URL selecting the psycopg2 driver; consumed by create_engine().
sqla_connection_string = f"postgresql+psycopg2://postgres:{_pg_password}@localhost:5432/postgres"

In [None]:
# Build the SQLAlchemy engine used by the to_sql / ALTER TABLE cells below.
engine = create_engine(sqla_connection_string)

In [None]:
# Display the engine as a sanity check that it was constructed.
engine

In [None]:
# Source workbooks published on census.gov for the 2022 NAICS revision:
# the index file and the descriptions file.
index_url = "https://www.census.gov/naics/2022NAICS/2022_NAICS_Index_File.xlsx"
description_url = "https://www.census.gov/naics/2022NAICS/2022_NAICS_Descriptions.xlsx"

In [None]:
# Download the NAICS index workbook.
# - The earlier cell bound `requests.get` itself (never called); it is folded
#   into this single request.
# - timeout keeps the cell from hanging forever on a stalled connection.
# - raise_for_status() surfaces HTTP errors here instead of as an opaque
#   parse failure when the body is handed to pandas.
res_index = requests.get(index_url, timeout=60)
res_index.raise_for_status()

In [None]:
# Wrap the downloaded bytes in a file-like buffer and parse the workbook.
index_workbook = BytesIO(res_index.content)
df_index = pd.read_excel(index_workbook)

In [None]:
# Preview the first rows of the parsed index sheet.
df_index.head()

In [None]:
# Inspect the column dtypes pandas inferred from the sheet.
df_index.dtypes

In [None]:
# Coerce the code column to numbers; anything unparseable becomes NaN.
index_codes_numeric = pd.to_numeric(df_index['NAICS22'], errors='coerce')
# True for rows whose NAICS22 value is missing or not numeric.
mask = index_codes_numeric.isna()
# Show the offending rows.
df_index.loc[mask]

In [None]:
# Keep only rows whose NAICS22 value parses as a number.
# The filter is recomputed here rather than read from the shared `mask`
# variable: later cells rebind `mask` for a different frame, so re-running
# this cell out of order would otherwise drop the wrong rows. Recomputing also
# makes the cell idempotent.
# .copy() gives an independent frame so the column assignment in the next
# cell does not trigger SettingWithCopyWarning.
index_codes_numeric = pd.to_numeric(df_index['NAICS22'], errors='coerce')
df_index = df_index.loc[index_codes_numeric.notna()].copy()

In [None]:
# Cast the code column to integer now that non-numeric rows are gone.
df_index = df_index.astype({'NAICS22': int})

In [None]:
# Preview the frame after the integer cast.
df_index.head()

In [None]:
# Discard rows whose code is below 111110 (presumably the lowest six-digit
# NAICS code — TODO confirm), keeping the original row labels.
index_code_too_small = df_index['NAICS22'].lt(111110)
df_index = df_index.drop(index=df_index.index[index_code_too_small.to_numpy()])

In [None]:
# Display the cleaned index frame.
df_index

In [None]:
# Download the NAICS descriptions workbook.
# - timeout keeps the cell from hanging forever on a stalled connection.
# - raise_for_status() surfaces HTTP errors here instead of as an opaque
#   parse failure when the body is handed to pandas.
res_desc = requests.get(description_url, timeout=60)
res_desc.raise_for_status()
# Show the response object (status code) as the cell output.
res_desc

In [None]:
# Wrap the downloaded bytes in a file-like buffer and parse the workbook.
desc_workbook = BytesIO(res_desc.content)
df_desc = pd.read_excel(desc_workbook)

In [None]:
# Preview the first rows of the parsed descriptions sheet.
df_desc.head()

In [None]:
# Align the key column name with df_index so the two frames can be merged.
df_desc = df_desc.rename(columns={'Code': 'NAICS22'})

In [None]:
# Preview the frame after renaming Code -> NAICS22.
df_desc.head()

In [None]:
# Coerce the code column to numbers; anything unparseable becomes NaN.
desc_codes_numeric = pd.to_numeric(df_desc['NAICS22'], errors='coerce')
# True for rows whose NAICS22 value is missing or not numeric.
mask = desc_codes_numeric.isna()
# Show the offending rows.
df_desc.loc[mask]

In [None]:
# Keep only rows whose NAICS22 value parses as a number.
# The filter is recomputed here rather than read from the shared `mask`
# variable: other cells rebind `mask` (for df_index and for the title check),
# so re-running this cell out of order would otherwise drop the wrong rows.
# Recomputing also makes the cell idempotent.
# .copy() gives an independent frame so the column assignment in the next
# cell does not trigger SettingWithCopyWarning.
desc_codes_numeric = pd.to_numeric(df_desc['NAICS22'], errors='coerce')
df_desc = df_desc.loc[desc_codes_numeric.notna()].copy()

In [None]:
# Cast the code column to integer now that non-numeric rows are gone.
df_desc = df_desc.astype({'NAICS22': int})


In [None]:
# Discard rows whose code is below 111110 (presumably the lowest six-digit
# NAICS code — TODO confirm), keeping the original row labels.
desc_code_too_small = df_desc['NAICS22'].lt(111110)
df_desc = df_desc.drop(index=df_desc.index[desc_code_too_small.to_numpy()])

In [None]:
# Preview the cleaned descriptions frame.
df_desc.head()

In [None]:
# Flag titles that end with the character 'T'. Note this rebinds the shared
# `mask` variable; the result is only displayed, not used to filter.
mask = df_desc["Title"].str.endswith('T')
mask

In [None]:
# Combine index entries with their descriptions on the shared code column.
# An outer join keeps codes that appear in only one of the two frames.
df = pd.merge(df_index, df_desc, on='NAICS22', how='outer')

In [None]:
# Preview the merged frame.
df.head()

In [None]:
# Character length of the first row's description text.
first_description = df['Description'].iat[0]
len(first_description)

In [None]:
# Persist the merged table. Create the target directory first so a fresh
# checkout (without ./data) does not fail with "No such file or directory".
os.makedirs('./data', exist_ok=True)
df.to_csv('./data/combined_naics2022.csv')

In [None]:
# Concatenate every description into one string for token counting.
# The outer merge can leave NaN in Description for codes present only in the
# index file; str.join raises TypeError on those floats, so drop missing
# values and coerce the rest to str first.
# NOTE(review): descriptions repeat once per index entry in the merged frame,
# so this text (and the token count derived from it) includes duplicates —
# confirm that is intended.
description = ' '.join(df['Description'].dropna().astype(str))

In [None]:
# Total character count of the concatenated descriptions.
len(description)

In [None]:
# Look up the tokenizer associated with gpt-3.5-turbo.
enc = tiktoken.encoding_for_model("gpt-3.5-turbo")
# Tokenize the concatenated descriptions and count the resulting tokens.
description_tokens = enc.encode(description)
token_count = len(description_tokens)

In [None]:
# Display the number of tokens in the concatenated descriptions.
token_count

In [None]:
# List the merged frame's column names before renaming them for the database.
df.columns

In [None]:
# Switch to lowercase snake_case column names for the database table.
sql_column_names = {
    'NAICS22': 'naicscode',
    'INDEX ITEM DESCRIPTION': 'index_item_description',
    'Title': 'title',
    'Description': 'description',
}
df = df.rename(columns=sql_column_names)

In [None]:
# Load the frame into Postgres, replacing any existing naics_index table.
# The DataFrame's row index is written as a column named "index":
# - "naicscode" is already a data column, so using it as index_label makes
#   pandas raise a "duplicate name in index/columns" ValueError;
# - the following cells expect exactly this name (ADD PRIMARY KEY (index),
#   DROP INDEX ix_naics_index_index).
df.to_sql(
    'naics_index',
    engine,
    if_exists='replace',
    index=True,
    index_label='index',
    dtype={'naicscode': Integer},
)

In [None]:
# Make the table's "index" column its primary key.
query = text(
    """
        ALTER TABLE public.naics_index
        ADD PRIMARY KEY (index);
        """
)
# engine.begin() opens a transaction that commits when the block exits
# without error.
with engine.begin() as connection:
    connection.execute(query)

In [None]:
# Remove the ix_naics_index_index index from the database.
query = text(
    """
        DROP INDEX ix_naics_index_index;
        """
)
# engine.begin() opens a transaction that commits when the block exits
# without error.
with engine.begin() as connection:
    connection.execute(query)