#### pip install spacy openpyxl
#### python -m spacy download en_core_web_lg
#### pip install nltk


In [9]:
import pandas as pd
import spacy
import nltk
from nltk.corpus import wordnet as wn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Download necessary NLTK resources and spaCy model
nltk.download("wordnet")
spacy.load("en_core_web_md")

nlp = spacy.load("en_core_web_md")

# Load data
categories_df = pd.read_excel("data/business_category_taxonomy.xlsx", engine="openpyxl")
domains_df = pd.read_excel("data/Naics3_taxonomy.xlsx", engine="openpyxl")

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\andre\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [10]:
# Preprocess text: tokenize, lemmatize, remove stopwords and punctuation
def preprocess_text(text):
    return " ".join(
        [
            token.lemma_
            for token in nlp(text)
            if not token.is_stop and not token.is_punct
        ]
    )


# Add synonyms using NLTK's WordNet
def add_synonyms(text):
    expanded_text = []
    for word in text.split():
        synonyms = {
            lemma.name().replace("_", " ")
            for synset in wn.synsets(word)
            for lemma in synset.lemmas()
        }
        expanded_text.append(" ".join(synonyms))
    return " ".join(expanded_text)


def preprocess_and_expand_text(text):
    lemmatized_text = preprocess_text(text)
    expanded_text = add_synonyms(lemmatized_text)
    return expanded_text

In [11]:
# Process descriptions
categories_df["processed_description"] = categories_df["description"].apply(
    preprocess_and_expand_text
)
domains_df["processed_description"] = domains_df["description"].apply(
    preprocess_and_expand_text
)

# Apply TF-IDF Vectorization
vectorizer = TfidfVectorizer()
all_descriptions = pd.concat(
    [categories_df["processed_description"], domains_df["processed_description"]]
)
vectorizer.fit(all_descriptions)

categories_df["tfidf_vector"] = list(
    vectorizer.transform(categories_df["processed_description"]).toarray()
)
domains_df["tfidf_vector"] = list(
    vectorizer.transform(domains_df["processed_description"]).toarray()
)

In [13]:
# Calculate Cosine Similarity and Find Best Match
def find_best_match(cat_tfidf_vec, domain_tfidf_vectors):
    similarities = cosine_similarity([cat_tfidf_vec], domain_tfidf_vectors)[0]
    return similarities.argmax()


domain_tfidf_matrix = np.array(domains_df["tfidf_vector"].tolist())

categories_df["best_match_label_index"] = categories_df["tfidf_vector"].apply(
    lambda x: find_best_match(x, domain_tfidf_matrix)
)
categories_df["best_match_label"] = categories_df["best_match_label_index"].apply(
    lambda x: domains_df.iloc[x]["naics_label"]
)


In [16]:
# Print results
print(categories_df[["label", "best_match_label"]])

                                    label  \
0                 ATVs Dealers & Services   
1                        Abortion Clinics   
2       Accounting & Bookkeeping Services   
3                      Acupuncture clinic   
4                    Adhesives & Sealants   
..                                    ...   
598  Work Clothing & Protection Equipment   
599                 Writers & Copywriters   
600                            YMCA Camps   
601                          Yoga Studios   
602                                   Zoo   

                                      best_match_label  
0                        Personal and Laundry Services  
1                        Personal and Laundry Services  
2                  Administrative and Support Services  
3                      Ambulatory Health Care Services  
4                              Machinery Manufacturing  
..                                                 ...  
598  Clothing, Clothing Accessories, Shoe, and Jewe...  
599

In [17]:
categories_df

Unnamed: 0,label,description,processed_description,tfidf_vector,best_match_label_index,best_match_label
0,ATVs Dealers & Services,Businesses categorized under ATVs Dealers & Se...,business enterprise commercial enterprise busi...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",85,Personal and Laundry Services
1,Abortion Clinics,Abortion Clinics provide medical services rela...,abortion miscarriage clinic put up bring home ...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",85,Personal and Laundry Services
2,Accounting & Bookkeeping Services,Accounting & Bookkeeping Services encompass a ...,accounting account answer for method of accoun...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",72,Administrative and Support Services
3,Acupuncture clinic,An Acupuncture clinic is a healthcare facility...,stylostixis acupuncture clinic healthcare heal...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",75,Ambulatory Health Care Services
4,Adhesives & Sealants,Adhesives & Sealants encompass a diverse range...,adhesive adhesive agent adhesive material seal...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",27,Machinery Manufacturing
...,...,...,...,...,...,...
598,Work Clothing & Protection Equipment,Work Clothing & Protection Equipment pertains ...,function ferment sour exercise make employment...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",43,"Clothing, Clothing Accessories, Shoe, and Jewe..."
599,Writers & Copywriters,Writers & Copywriters are skilled professional...,writer author copywriter skilled professional ...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",57,Publishing Industries
600,YMCA Camps,YMCA Camps offer structured programs and activ...,coterie ingroup refugee camp summer camp enca...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",82,Accommodation
601,Yoga Studios,Yoga Studios offer a tranquil space for indivi...,yoga studio apartment studio put up fling whir...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",74,Educational Services


In [18]:
domains_df

Unnamed: 0,naics_code,naics_label,description,processed_description,tfidf_vector
0,111,Crop Production,Industries in the Crop Production subsector gr...,industry diligence industriousness manufacture...,"[0.0, 0.0, 0.0, 0.0, 0.02250591374748779, 0.0,..."
1,112,Animal Production and Aquaculture,Industries in the Animal Production and Aquacu...,industry diligence industriousness manufacture...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2,113,Forestry and Logging,Industries in the Forestry and Logging subsect...,industry diligence industriousness manufacture...,"[0.056821440919833516, 0.0, 0.0, 0.0, 0.0, 0.0..."
3,114,"Fishing, Hunting and Trapping","Industries in the Fishing, Hunting and Trappin...",industry diligence industriousness manufacture...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
4,115,Support Activities for Agriculture and Forestry,Industries in the Support Activities for Agric...,industry diligence industriousness manufacture...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
...,...,...,...,...,...
91,924,Administration of Environmental Quality Programs,The Administration of Environmental Quality Pr...,organisation judicature presidential term gove...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
92,925,"Administration of Housing Programs, Urban Plan...","The Administration of Housing Programs, Urban ...",organisation judicature presidential term gove...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
93,926,Administration of Economic Programs,The Administration of Economic Programs subsec...,organisation judicature presidential term gove...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
94,927,Space Research and Technology,The Space Research and Technology subsector co...,blank place quad blank space infinite distance...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
