In [1]:
import os
from io import BytesIO
from urllib.parse import quote

import pandas as pd
import psycopg2
import requests
import sqlalchemy
import tiktoken
from dotenv import load_dotenv
from sqlalchemy import create_engine


In [2]:
# Pull variables from a local .env file into the process environment, then
# read each credential. A variable that is not set comes back as None.
load_dotenv()

# SAM.gov public API
SAM_PUBLIC_API_KEY = os.getenv("SAM_PUBLIC_API_KEY")

# AWS S3
S3_AWS_ACCESS_KEY_ID = os.getenv("S3_AWS_ACCESS_KEY_ID")
S3_AWS_SECRET_ACCESS_KEY = os.getenv("S3_AWS_SECRET_ACCESS_KEY")
S3_REGION_NAME = os.getenv("S3_REGION_NAME")

# Postgres
POSTGRES_PASSWORD = os.getenv("POSTGRES_PASSWORD")

# OpenAI
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

In [3]:

# Build the URLs for the local Postgres instance (user "postgres", port 5432,
# database "postgres").
# The password is percent-encoded before interpolation: characters such as
# "@", ":", "/" or "%" would otherwise be parsed as URL delimiters and corrupt
# the host/credentials split. A missing password is treated as empty instead of
# being formatted as the literal text "None".
_encoded_password = quote(POSTGRES_PASSWORD or "", safe="")

# Plain postgresql:// URL.
connection_string = f"postgresql://postgres:{_encoded_password}@localhost:5432/postgres"
# Same target with the explicit psycopg2 driver prefix, for SQLAlchemy.
sqla_connection_string = f"postgresql+psycopg2://postgres:{_encoded_password}@localhost:5432/postgres"

In [4]:
# SQLAlchemy engine for the Postgres database, built from the psycopg2 URL.
engine = create_engine(sqla_connection_string)

In [5]:
# Show the engine repr to confirm the target URL.
engine

Engine(postgresql+psycopg2://postgres:***@localhost:5432/postgres)

In [6]:
# 2022 NAICS workbooks hosted on census.gov:
# the index file (code -> index item description) and the descriptions file
# (code -> title and long description).
index_url = "https://www.census.gov/naics/2022NAICS/2022_NAICS_Index_File.xlsx"
description_url = "https://www.census.gov/naics/2022NAICS/2022_NAICS_Descriptions.xlsx"

In [7]:
# Download the NAICS index workbook.
# The timeout keeps a stalled connection from hanging the kernel, and
# raise_for_status() stops here on an HTTP error instead of passing an error
# page on to the Excel parser.
res_index = requests.get(index_url, timeout=60)
res_index.raise_for_status()

In [9]:
# Parse the downloaded index workbook straight from memory.
with BytesIO(res_index.content) as index_workbook:
    df_index = pd.read_excel(index_workbook)

In [10]:
# Preview the first rows of the raw index table.
df_index.head()

Unnamed: 0,NAICS22,INDEX ITEM DESCRIPTION
0,111110,"Soybean farming, field and seed production"
1,111120,"Canola farming, field and seed production"
2,111120,"Flaxseed farming, field and seed production"
3,111120,"Mustard seed farming, field and seed production"
4,111120,"Oilseed farming (except soybean), field and se..."


In [11]:
# Check column dtypes as loaded from the workbook.
df_index.dtypes

NAICS22                   object
INDEX ITEM DESCRIPTION    object
dtype: object

In [12]:
# Find rows whose NAICS22 value cannot be parsed as a number:
# errors='coerce' turns such values into NaN, which isna() then flags.
index_codes_numeric = pd.to_numeric(df_index['NAICS22'], errors='coerce')
mask = index_codes_numeric.isna()
df_index.loc[mask]

Unnamed: 0,NAICS22,INDEX ITEM DESCRIPTION
20373,******,"Cards, publishing -- see specific product"
20374,******,"Clinics, medical -- see type"
20375,******,Consultants -- see specific activity
20376,******,Contractors -- see specific activity
20377,******,"Day camps, instructional -- see type of instru..."
20378,******,Dealers -- see type
20379,******,Farming -- see type
20380,******,Hospitals -- see type
20381,******,Instruction -- see type of training
20382,******,Laboratories -- see specific type


In [13]:
# Remove rows whose NAICS22 value is not numeric.
# The mask is recomputed here rather than read from the shared `mask` variable,
# which later cells rebind for other frames; this cell can therefore be re-run
# at any point and still drop the correct rows.
non_numeric_index_codes = pd.to_numeric(df_index['NAICS22'], errors='coerce').isna()
df_index = df_index.drop(df_index[non_numeric_index_codes].index)

In [14]:
# Cast the NAICS22 column to integers.
df_index = df_index.astype({'NAICS22': int})

In [15]:
# Preview the index table after the integer cast.
df_index.head()

Unnamed: 0,NAICS22,INDEX ITEM DESCRIPTION
0,111110,"Soybean farming, field and seed production"
1,111120,"Canola farming, field and seed production"
2,111120,"Flaxseed farming, field and seed production"
3,111120,"Mustard seed farming, field and seed production"
4,111120,"Oilseed farming (except soybean), field and se..."


In [16]:
# Keep only rows whose code is at least 111110; anything below is discarded.
keep_index_rows = df_index['NAICS22'] >= 111110
df_index = df_index[keep_index_rows].copy()

In [17]:
# Display the filtered index table (pandas truncates the rendering).
df_index

Unnamed: 0,NAICS22,INDEX ITEM DESCRIPTION
0,111110,"Soybean farming, field and seed production"
1,111120,"Canola farming, field and seed production"
2,111120,"Flaxseed farming, field and seed production"
3,111120,"Mustard seed farming, field and seed production"
4,111120,"Oilseed farming (except soybean), field and se..."
...,...,...
20368,928120,Passport issuing services
20369,928120,Peace Corps
20370,928120,State Department
20371,928120,United Nations


In [18]:
# Download the NAICS descriptions workbook.
# The timeout keeps a stalled connection from hanging the kernel, and
# raise_for_status() stops here on an HTTP error instead of passing an error
# page on to the Excel parser.
res_desc = requests.get(description_url, timeout=60)
res_desc.raise_for_status()
res_desc

<Response [200]>

In [19]:
# Parse the downloaded descriptions workbook straight from memory.
with BytesIO(res_desc.content) as desc_workbook:
    df_desc = pd.read_excel(desc_workbook)

In [20]:
# Preview the first rows of the raw descriptions table.
df_desc.head()

Unnamed: 0,Code,Title,Description
0,11,"Agriculture, Forestry, Fishing and HuntingT","The Sector as a Whole\n\nThe Agriculture, Fore..."
1,111,Crop ProductionT,Industries in the Crop Production subsector gr...
2,1111,Oilseed and Grain FarmingT,This industry group comprises establishments p...
3,11111,Soybean FarmingT,See industry description for 111110.
4,111110,Soybean Farming,This industry comprises establishments primari...


In [21]:
# Give the code column the same name as in df_index so the two frames share a
# join key.
df_desc = df_desc.rename(columns={'Code': 'NAICS22'})

In [22]:
# Preview the descriptions table after the column rename.
df_desc.head()

Unnamed: 0,NAICS22,Title,Description
0,11,"Agriculture, Forestry, Fishing and HuntingT","The Sector as a Whole\n\nThe Agriculture, Fore..."
1,111,Crop ProductionT,Industries in the Crop Production subsector gr...
2,1111,Oilseed and Grain FarmingT,This industry group comprises establishments p...
3,11111,Soybean FarmingT,See industry description for 111110.
4,111110,Soybean Farming,This industry comprises establishments primari...


In [23]:
# Find description rows whose NAICS22 value cannot be parsed as a number:
# errors='coerce' turns such values into NaN, which isna() then flags.
desc_codes_numeric = pd.to_numeric(df_desc['NAICS22'], errors='coerce')
mask = desc_codes_numeric.isna()
df_desc.loc[mask]

Unnamed: 0,NAICS22,Title,Description
270,31-33,ManufacturingT,The Sector as a Whole\n\nThe Manufacturing sec...
1061,44-45,Retail TradeT,The Sector as a Whole\n\nThe Retail Trade sect...
1200,48-49,Transportation and WarehousingT,The Sector as a Whole\n\nThe Transportation an...


In [24]:
# Remove rows whose NAICS22 value is not numeric.
# The mask is recomputed here rather than read from the shared `mask` variable,
# which other cells rebind for different purposes; this cell can therefore be
# re-run at any point and still drop the correct rows.
non_numeric_desc_codes = pd.to_numeric(df_desc['NAICS22'], errors='coerce').isna()
df_desc = df_desc.drop(df_desc[non_numeric_desc_codes].index)

In [25]:
# Cast the NAICS22 column to integers.
df_desc = df_desc.astype({'NAICS22': int})


In [26]:
# Keep only rows whose code is at least 111110; the shorter, higher-level codes
# fall below that threshold and are discarded.
keep_desc_rows = df_desc['NAICS22'] >= 111110
df_desc = df_desc[keep_desc_rows].copy()

In [27]:
# Preview the descriptions table after filtering.
df_desc.head()

Unnamed: 0,NAICS22,Title,Description
4,111110,Soybean Farming,This industry comprises establishments primari...
6,111120,Oilseed (except Soybean) Farming,This industry comprises establishments primari...
8,111130,Dry Pea and Bean Farming,This industry comprises establishments primari...
10,111140,Wheat Farming,This industry comprises establishments primari...
12,111150,Corn Farming,This industry comprises establishments primari...


In [28]:
# Flag titles that end with a literal 'T'.
# NOTE(review): this mask is only displayed; no visible cell uses it to filter
# df_desc. It also rebinds the shared `mask` name — confirm this is inspection only.
mask = df_desc["Title"].str.endswith('T')
mask

4       False
6       False
8       False
10      False
12      False
        ...  
2112    False
2114    False
2118    False
2122    False
2124    False
Name: Title, Length: 1012, dtype: bool

In [29]:
# Join index entries to their title and description on the NAICS code.
# An outer join keeps codes that appear in only one of the two tables.
df = pd.merge(df_index, df_desc, on='NAICS22', how='outer')

In [30]:
# Preview the merged table.
df.head()

Unnamed: 0,NAICS22,INDEX ITEM DESCRIPTION,Title,Description
0,111110,"Soybean farming, field and seed production",Soybean Farming,This industry comprises establishments primari...
1,111120,"Canola farming, field and seed production",Oilseed (except Soybean) Farming,This industry comprises establishments primari...
2,111120,"Flaxseed farming, field and seed production",Oilseed (except Soybean) Farming,This industry comprises establishments primari...
3,111120,"Mustard seed farming, field and seed production",Oilseed (except Soybean) Farming,This industry comprises establishments primari...
4,111120,"Oilseed farming (except soybean), field and se...",Oilseed (except Soybean) Farming,This industry comprises establishments primari...


In [31]:
# Character length of the first row's description.
len(df['Description'].iloc[0])

127

In [32]:
# Persist the merged table as CSV.
# The output folder is created first so a fresh checkout without ./data does
# not fail with a missing-directory error.
os.makedirs('./data', exist_ok=True)
df.to_csv('./data/combined_naics2022.csv')

In [33]:
# Concatenate all descriptions into one space-separated string.
# The outer merge can leave Description as NaN for codes that appear only in
# the index table; str.join raises TypeError on non-string items, so missing
# values are dropped first.
# NOTE(review): df has one row per index item, so a code's description is
# repeated once per index entry — confirm this duplication is intended for the
# size/token estimate below.
description = ' '.join(df['Description'].dropna().astype(str))

In [34]:
# Total character count of the combined description text.
len(description)

10767246

In [35]:
# Look up the tiktoken encoding registered for gpt-3.5-turbo and count how many
# tokens the combined description text encodes to.
enc = tiktoken.encoding_for_model("gpt-3.5-turbo")
token_count = len(enc.encode(description))

In [36]:
# Show the token total.
token_count

1947779

In [37]:
# Record the library versions this notebook ran against.
print(f"pandas version: {pd.__version__}")
print(f"SQLAlchemy version: {sqlalchemy.__version__}")

pandas version: 2.1.4
SQLAlchemy version: 1.4.51


In [41]:
# Write the merged table to Postgres as "naics_index", replacing any existing
# table of that name; the DataFrame index is stored in a column called "index".
df.to_sql(
    name='naics_index',
    con=engine,
    if_exists='replace',
    index=True,
    index_label='index',
)

375