#### pip install spacy openpyxl
#### python -m spacy download en_core_web_lg


In [1]:
import pandas as pd

# Read both taxonomy workbooks (Excel files, parsed with openpyxl) into
# DataFrames: business categories first, NAICS domains second.
categories_df, domains_df = (
    pd.read_excel(workbook_path, engine="openpyxl")
    for workbook_path in (
        "data/business_category_taxonomy.xlsx",
        "data/Naics3_taxonomy.xlsx",
    )
)

In [2]:
import spacy

# Load the large English spaCy pipeline (en_core_web_lg)
nlp = spacy.load("en_core_web_lg")


# Normalize descriptions: lemmatize them and drop punctuation and stop words
def lemmatize_text(text):
    """Return the lemmas of *text* joined by single spaces.

    Punctuation and stop-word tokens are dropped. Anything that is not a
    non-empty string (``None``, or the float NaN that pandas uses for blank
    cells) yields ``""`` instead of being handed to the spaCy pipeline.
    """
    # A blank spreadsheet cell arrives as NaN; treat it as an empty description
    # so one missing value does not abort the whole ``apply``.
    if not isinstance(text, str) or not text:
        return ""

    doc = nlp(text)
    return " ".join(
        token.lemma_ for token in doc if not (token.is_punct or token.is_stop)
    )


# Give each taxonomy table two derived columns: the lemmatized description
# text, and the result of running that text through the spaCy pipeline (the
# objects later compared with ``.similarity``).
for taxonomy_df in (categories_df, domains_df):
    lemmatized = taxonomy_df["description"].apply(lemmatize_text)
    taxonomy_df["lemmatized_description"] = lemmatized
    taxonomy_df["description_vector"] = lemmatized.apply(nlp)

In [3]:
def find_best_match(cat_desc_vec, label_vectors):
    """Return the position in *label_vectors* most similar to *cat_desc_vec*.

    Args:
        cat_desc_vec: Object exposing ``similarity(other)``.
        label_vectors: Iterable of candidates passed to that method.

    Returns:
        Zero-based index of the highest-scoring candidate; the first one wins
        ties.

    Raises:
        ValueError: If *label_vectors* is empty.
    """
    similarities = [
        cat_desc_vec.similarity(label_desc) for label_desc in label_vectors
    ]
    if not similarities:
        raise ValueError(
            "label_vectors must contain at least one entry to match against"
        )
    # max() keeps the first maximal element, so ties resolve to the lowest index
    return max(range(len(similarities)), key=similarities.__getitem__)


# Pull the domain entries out of the DataFrame once; every category is
# compared against this same list.
label_vectors = domains_df["description_vector"].tolist()

# For every category, record the row position of its most similar domain
categories_df["best_match_label_index"] = categories_df["description_vector"].apply(
    find_best_match, args=(label_vectors,)
)

# Translate each row position into that domain's NAICS label
naics_labels = domains_df["naics_label"]
categories_df["best_match_label"] = categories_df["best_match_label_index"].apply(
    lambda position: naics_labels.iloc[position]
)

In [4]:
# Display the categories table, now including the matched NAICS label columns
categories_df

Unnamed: 0,label,description,lemmatized_description,description_vector,best_match_label_index,best_match_label
0,ATVs Dealers & Services,Businesses categorized under ATVs Dealers & Se...,business categorize atv Dealers Services speci...,"(business, categorize, atv, Dealers, Services,...",36,Motor Vehicle and Parts Dealers
1,Abortion Clinics,Abortion Clinics provide medical services rela...,Abortion Clinics provide medical service relat...,"(Abortion, Clinics, provide, medical, service,...",76,Hospitals
2,Accounting & Bookkeeping Services,Accounting & Bookkeeping Services encompass a ...,Accounting Bookkeeping Services encompass rang...,"(Accounting, Bookkeeping, Services, encompass,...",72,Administrative and Support Services
3,Acupuncture clinic,An Acupuncture clinic is a healthcare facility...,acupuncture clinic healthcare facility special...,"(acupuncture, clinic, healthcare, facility, sp...",76,Hospitals
4,Adhesives & Sealants,Adhesives & Sealants encompass a diverse range...,adhesive Sealants encompass diverse range prod...,"(adhesive, Sealants, encompass, diverse, range...",32,Miscellaneous Manufacturing
...,...,...,...,...,...,...
598,Work Clothing & Protection Equipment,Work Clothing & Protection Equipment pertains ...,work Clothing Protection Equipment pertain spe...,"(work, Clothing, Protection, Equipment, pertai...",32,Miscellaneous Manufacturing
599,Writers & Copywriters,Writers & Copywriters are skilled professional...,Writers Copywriters skilled professional speci...,"(Writers, Copywriters, skilled, professional, ...",57,Publishing Industries
600,YMCA Camps,YMCA Camps offer structured programs and activ...,YMCA Camps offer structured program activity d...,"(YMCA, Camps, offer, structured, program, acti...",74,Educational Services
601,Yoga Studios,Yoga Studios offer a tranquil space for indivi...,yoga studio offer tranquil space individual en...,"(yoga, studio, offer, tranquil, space, individ...",74,Educational Services
