In [4]:
!pip install openpyxl

Collecting openpyxl
  Downloading openpyxl-3.1.5-py2.py3-none-any.whl.metadata (2.5 kB)
Collecting et-xmlfile (from openpyxl)
  Downloading et_xmlfile-2.0.0-py3-none-any.whl.metadata (2.7 kB)
Downloading openpyxl-3.1.5-py2.py3-none-any.whl (250 kB)
Downloading et_xmlfile-2.0.0-py3-none-any.whl (18 kB)
Installing collected packages: et-xmlfile, openpyxl
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2/2[0m [openpyxl]
[1A[2KSuccessfully installed et-xmlfile-2.0.0 openpyxl-3.1.5


In [5]:
import pandas as pd

# Load the file, skipping the first two header rows
file_path = "/Users/michaelwalker/Downloads/2022_NAICS_Structure.xlsx"
df = pd.read_excel(file_path, skiprows=2, usecols=[1,2], names=["Code", "Title"])

# Drop rows where code is missing
df = df.dropna(subset=["Code"])

# Ensure Code is string for length checks
df["Code"] = df["Code"].astype(str)

# Create DataFrames by code length
df_sector = df[df["Code"].str.len() == 2]      # 2-digit
df_subsector = df[df["Code"].str.len() == 3]   # 3-digit
df_group = df[df["Code"].str.len() == 4]       # 4-digit
df_industry = df[df["Code"].str.len() == 5]    # 5-digit
df_detail = df[df["Code"].str.len() == 6]      # 6-digit

# Example: show first few rows of each
print("Sectors:")
print(df_sector.head())

print("\nSubsectors:")
print(df_subsector.head())


Sectors:
    Code                                           Title
0     11     Agriculture, Forestry, Fishing and HuntingT
132   21  Mining, Quarrying, and Oil and Gas ExtractionT
174   22                                      UtilitiesT
200   23                                   ConstructionT
905   42                                Wholesale TradeT

Subsectors:
    Code                                              Title
1    111                                  Crop ProductionT 
55   112                Animal Production and AquacultureT 
97   113                             Forestry and LoggingT 
107  114                    Fishing, Hunting and TrappingT 
116  115  Support Activities for Agriculture and ForestryT 


In [6]:
df_sector.to_csv("NAICS_sector.csv")

In [7]:
df_sector

Unnamed: 0,Code,Title
0,11,"Agriculture, Forestry, Fishing and HuntingT"
132,21,"Mining, Quarrying, and Oil and Gas ExtractionT"
174,22,UtilitiesT
200,23,ConstructionT
905,42,Wholesale TradeT
1348,51,InformationT
1420,52,Finance and InsuranceT
1500,53,Real Estate and Rental and LeasingT
1554,54,"Professional, Scientific, and Technical ServicesT"
1650,55,Management of Companies and EnterprisesT


In [11]:
import pandas as pd
import re

file_path = "/Users/michaelwalker/Downloads/2022_NAICS_Structure.xlsx"

# --- Load & select only the NAICS Code + Title columns ---
# In your file: col 1 = sequence (ignore), col 2 = code, col 3 = title.
# We skip the first two rows of preface lines.
raw = pd.read_excel(file_path, skiprows=2, usecols=[1, 2], names=["Code", "Title"], engine="openpyxl")

# Drop rows with missing codes and coerce to string
df = raw.dropna(subset=["Code"]).copy()
df["Code"] = df["Code"].astype(str).str.strip()

# Some Excel reads can turn codes into floats like "11.0" -> "11"
df["Code"] = df["Code"].str.replace(r"\.0$", "", regex=True)

# Keep only pure digit codes (drop section headers like 'Change Indicator', etc.)
df = df[df["Code"].str.fullmatch(r"\d+")].copy()

# Clean titles:
# 1) strip whitespace
# 2) remove a single trailing 'T' token (the trilateral flag), preserving real words with 't'
df["Title"] = (
    df["Title"].astype(str)
               .str.strip()
               .str.replace(r"\s*T$", "", regex=True)  # remove trailing ' T' or 'T'
)

# --- Split by code length ---
df_sector    = df[df["Code"].str.len() == 2].reset_index(drop=True)  # 2-digit
df_subsector = df[df["Code"].str.len() == 3].reset_index(drop=True)  # 3-digit
df_group     = df[df["Code"].str.len() == 4].reset_index(drop=True)  # 4-digit
df_industry  = df[df["Code"].str.len() == 5].reset_index(drop=True)  # 5-digit
df_detail    = df[df["Code"].str.len() == 6].reset_index(drop=True)  # 6-digit

# Optional: quick sanity checks
for name, d in {
    "sector(2)": df_sector,
    "subsector(3)": df_subsector,
    "group(4)": df_group,
    "industry(5)": df_industry,
    "detail(6)": df_detail,
}.items():
    print(name, d.shape)
    print(d.head(3), "\n")

# Optional: save to CSVs
df_sector.to_csv("/Users/michaelwalker/RDM_Datalab/naics_2022_sector_2digit.csv", index=False)
df_subsector.to_csv("/Users/michaelwalker/RDM_Datalab/naics_2022_subsector_3digit.csv", index=False)
df_group.to_csv("/Users/michaelwalker/RDM_Datalab/naics_2022_group_4digit.csv", index=False)
df_industry.to_csv("/Users/michaelwalker/RDM_Datalab/naics_2022_industry_5digit.csv", index=False)
df_detail.to_csv("/Users/michaelwalker/RDM_Datalab/naics_2022_detail_6digit.csv", index=False)


sector(2) (17, 2)
  Code                                          Title
0   11     Agriculture, Forestry, Fishing and Hunting
1   21  Mining, Quarrying, and Oil and Gas Extraction
2   22                                      Utilities 

subsector(3) (96, 2)
  Code                              Title
0  111                    Crop Production
1  112  Animal Production and Aquaculture
2  113               Forestry and Logging 

group(4) (308, 2)
   Code                        Title
0  1111    Oilseed and Grain Farming
1  1112  Vegetable and Melon Farming
2  1113   Fruit and Tree Nut Farming 

industry(5) (689, 2)
    Code                             Title
0  11111                   Soybean Farming
1  11112  Oilseed (except Soybean) Farming
2  11113          Dry Pea and Bean Farming 

detail(6) (1012, 2)
     Code                             Title
0  111110                   Soybean Farming
1  111120  Oilseed (except Soybean) Farming
2  111130          Dry Pea and Bean Farming 

