In [27]:
import pandas as pd

# Read the two taxonomy workbooks: NAICS subsectors first, then business categories
naics_df, business_df = [
    pd.read_excel(workbook_path)
    for workbook_path in (
        "data/Naics3_taxonomy.xlsx",
        "data/business_category_taxonomy.xlsx",
    )
]

# Preview the top rows of both frames (shown together as a tuple) to check their columns
naics_df.head(), business_df.head()

(   naics_code                                      naics_label  \
 0         111                                  Crop Production   
 1         112                Animal Production and Aquaculture   
 2         113                             Forestry and Logging   
 3         114                    Fishing, Hunting and Trapping   
 4         115  Support Activities for Agriculture and Forestry   
 
                                          description  
 0  Industries in the Crop Production subsector gr...  
 1  Industries in the Animal Production and Aquacu...  
 2  Industries in the Forestry and Logging subsect...  
 3  Industries in the Fishing, Hunting and Trappin...  
 4  Industries in the Support Activities for Agric...  ,
                                label  \
 0            ATVs Dealers & Services   
 1                   Abortion Clinics   
 2  Accounting & Bookkeeping Services   
 3                 Acupuncture clinic   
 4               Adhesives & Sealants   
 
           

In [28]:
# Display the full, untruncated description of the first business category.
# The column is selected by name rather than by position so the lookup keeps
# returning the description even if columns are added or reordered.
business_df["description"].iloc[0]

'Businesses categorized under ATVs Dealers & Services specialize in the sale, maintenance, and rental of all-terrain vehicles (ATVs). These establishments offer a range of services related to ATVs, including sales of new and used vehicles, repair and maintenance services, as well as rental options for recreational use. Customers can expect expert guidance on ATV selection, repair services to ensure optimal performance, and convenient rental options for outdoor adventures. The industry is dedicated to meeting the diverse needs of ATV enthusiasts, providing a one-stop destination for all ATV-related requirements.'

In [29]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def _clean_description(descriptions: pd.Series) -> pd.Series:
    """Lowercase each description and drop every character that is neither a word character nor whitespace."""
    # Raw string: in a normal literal "\w" and "\s" are invalid escape sequences,
    # which emit a DeprecationWarning (SyntaxWarning on Python 3.12+).
    return descriptions.str.lower().str.replace(r"[^\w\s]", "", regex=True)


# Preprocess the descriptions; the cleaned text is stored as a new column on each frame
naics_df["cleaned_description"] = _clean_description(naics_df["description"])
business_df["cleaned_description"] = _clean_description(business_df["description"])

# Create TF-IDF vectors. The vectorizer is fitted on both taxonomies together so
# that NAICS and business descriptions share one vocabulary / feature space.
tfidf_vectorizer = TfidfVectorizer()
all_descriptions = pd.concat(
    [naics_df["cleaned_description"], business_df["cleaned_description"]]
)
tfidf_matrix = tfidf_vectorizer.fit_transform(all_descriptions)

# Split the stacked matrix back apart: rows follow the concat order above
# (NAICS rows first, business rows after), then compare every NAICS row
# against every business row.
naics_tfidf = tfidf_matrix[: len(naics_df)]
business_tfidf = tfidf_matrix[len(naics_df) :]
cosine_sim = cosine_similarity(naics_tfidf, business_tfidf)

# For each NAICS row, pick the business category with the highest similarity
match_indices = cosine_sim.argmax(axis=1)
matches = business_df.iloc[match_indices]

# Combine the results: one row per NAICS code with its best match and score
matched_df = naics_df[["naics_code", "naics_label"]].copy()
matched_df["matched_label"] = matches["label"].values
matched_df["similarity_score"] = cosine_sim.max(axis=1)

matched_df.head()

Unnamed: 0,naics_code,naics_label,matched_label,similarity_score
0,111,Crop Production,Farms & Agriculture Production,0.289727
1,112,Animal Production and Aquaculture,Animal Shelters,0.4357
2,113,Forestry and Logging,Timber & Wood Products,0.335532
3,114,"Fishing, Hunting and Trapping",Hunting & Fishing - Services & Supplies,0.390857
4,115,Support Activities for Agriculture and Forestry,Tree Services,0.233665


In [30]:
# Keep only the matches whose similarity score is strictly above 0.5
matched_df.loc[matched_df["similarity_score"].gt(0.5)]


Unnamed: 0,naics_code,naics_label,matched_label,similarity_score
5,211,Oil and Gas Extraction,Oil & Gas - Extraction & Distribution,0.67153
17,316,Leather and Allied Product Manufacturing,Leather Processing,0.628202
20,323,Printing and Related Support Activities,Commercial Printing,0.591659
31,337,Furniture and Related Product Manufacturing,Furniture Stores,0.56598
42,457,Gasoline Stations and Fuel Dealers,Gas Stations,0.517219
43,458,"Clothing, Clothing Accessories, Shoe, and Jewe...",Clothing Stores,0.592723
49,485,Transit and Ground Passenger Transportation,"Buses, Shuttles & Local Transit",0.615342
55,493,Warehousing and Storage,Warehousing & Storage,0.565059
60,518,"Computing Infrastructure Providers, Data Proce...",Hosting & Cloud Infrastructure Providers,0.545963
65,524,Insurance Carriers and Related Activities,"Insurance - Agents, Carriers & Brokers",0.615876


In [31]:
import spacy
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# Load the spaCy model
nlp = spacy.load("en_core_web_md")

# Vectorize the cleaned descriptions once per frame. nlp.pipe streams the texts
# through the pipeline in batches instead of issuing one nlp(text) call per row.
naics_vectors = np.array(
    [doc.vector for doc in nlp.pipe(naics_df["cleaned_description"])]
)
business_vectors = np.array(
    [doc.vector for doc in nlp.pipe(business_df["cleaned_description"])]
)

# Calculate cosine similarity between all pairs: rows are business
# descriptions, columns are NAICS descriptions.
similarity_matrix = cosine_similarity(business_vectors, naics_vectors)

# Find the highest similarity and corresponding naics_label for each business description
match_indices = similarity_matrix.argmax(axis=1)
matched_naics_labels = naics_df["naics_label"].iloc[match_indices].values
similarity_scores = similarity_matrix.max(axis=1)

# Create a DataFrame to display the results.
# NOTE: this rebinds `matched_df`, replacing the TF-IDF result (NAICS -> business)
# with the reverse mapping (business -> NAICS); cells that filter `matched_df`
# see whichever of the two was computed last.
matched_df = business_df[["label"]].copy()
matched_df["matched_naics_label"] = matched_naics_labels
matched_df["similarity_score"] = similarity_scores

matched_df.head()

Unnamed: 0,label,matched_naics_label,similarity_score
0,ATVs Dealers & Services,Rental and Leasing Services,0.969993
1,Abortion Clinics,Ambulatory Health Care Services,0.964626
2,Accounting & Bookkeeping Services,Administrative and Support Services,0.966553
3,Acupuncture clinic,Ambulatory Health Care Services,0.961714
4,Adhesives & Sealants,Machinery Manufacturing,0.968328


In [34]:
# Look up the match that was assigned to one specific business category
matched_df.loc[matched_df["label"].eq("Digital & Marketing Agencies")]

Unnamed: 0,label,matched_naics_label,similarity_score
173,Digital & Marketing Agencies,"Performing Arts, Spectator Sports, and Related...",0.96161
