In [2]:
import pandas as pd

naics_df = pd.read_csv("data/naics_summary.csv")
business_df = pd.read_excel("data/business_category_taxonomy.xlsx")

In [3]:
import spacy
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import re
import string

# Load spaCy model
nlp = spacy.load("en_core_web_md")


def remove_mentions_and_hashtags(text):
    text = re.sub(r"@\w+", "", text)
    text = re.sub(r"#\w+", "", text)
    return text


def remove_numbers(text):
    text = re.sub(r"\d+", "", text)
    return text


def remove_punctuation(text):

    translator = str.maketrans("", "", string.punctuation)

    # Remove punctuation using the translation table
    text_without_punct = text.translate(translator)

    return text_without_punct


def remove_stopwords(text):
    filtered_sentence = []
    doc = nlp(text)
    for token in doc:
        if token.is_stop == False:
            filtered_sentence.append(token.text)
    return " ".join(filtered_sentence)


def lemmatize(text):
    doc = nlp(text)
    text = " ".join([token.lemma_ for token in doc])
    return text


# Define the text cleaning and preprocessing functions
def clean_text(text, to_lemmatize: bool = True):
    text = re.sub(r"http\S+", "", text)  # Remove URLs
    text = remove_mentions_and_hashtags(text)
    text = text.lower()
    text = remove_punctuation(text)
    text = remove_numbers(text)
    text = re.sub(r"\W", " ", text)
    text = re.sub(r"\s+", " ", text, flags=re.I)
    if to_lemmatize:
        text = lemmatize(text)
    return text


# Function to extract key terms using TF-IDF
def extract_top_keywords(texts, n=10):
    vectorizer = TfidfVectorizer(ngram_range=(1, 2), stop_words="english")
    X = vectorizer.fit_transform(texts)
    top_n_keywords = []
    for row in X:
        row_array = row.toarray().flatten()
        top_indices = np.argsort(row_array)[-n:]  # Indices of top n scores
        top_features = [vectorizer.get_feature_names_out()[i] for i in top_indices]
        top_n_keywords.append(top_features)
    return top_n_keywords

# Clean descriptions
business_df["clean_description"] = business_df["description"].apply(clean_text)
naics_df["clean_description"] = naics_df["description"].apply(clean_text)

# Extract top 10 keywords from descriptions
business_df["keywords"] = business_df["clean_description"].apply(
    lambda x: extract_top_keywords([x], 10)[0]
)
naics_df["keywords"] = naics_df["clean_description"].apply(
    lambda x: extract_top_keywords([x], 10)[0]
)

# Convert keywords list to single string per description
business_df["keywords_str"] = business_df["keywords"].apply(" ".join)
naics_df["keywords_str"] = naics_df["keywords"].apply(" ".join)

# Vectorize and calculate cosine similarity
vectorizer = TfidfVectorizer()
all_keywords = list(naics_df["keywords_str"]) + list(business_df["keywords_str"])
vectorizer.fit(all_keywords)
naics_vectors = vectorizer.transform(naics_df["keywords_str"])
business_vectors = vectorizer.transform(business_df["keywords_str"])
similarity_matrix = cosine_similarity(business_vectors, naics_vectors)

# Find the best NAICS match for each business label based on highest similarity
match_indices = np.argmax(similarity_matrix, axis=1)
business_df["matched_naics_label"] = naics_df["naics_label"].values[match_indices]
business_df["similarity_score"] = np.max(similarity_matrix, axis=1)

In [4]:
final_data = pd.read_csv("data/clean_description.csv")
final_data


Unnamed: 0,commercial_name,business_tags,short_description,description,main_business_category,best_match_label_index,best_match_label,clean_description,description_vector
0,White Horse,Tile Manufacturing | European Aesthetics Ceram...,White Horse is highly regarded as a tile trail...,White Horse Ceramic Singapore is a leading man...,Tile Store,31,Furniture and Related Product Manufacturing,white horse ceramic singapore lead manufacture...,white horse ceramic singapore lead manufacture...
1,Wealth Solution Partners,Super and SMSF Services | Financial Planning a...,"WSP, Wealth Solution Partners, Financial Plann...",Wealth Solution Partners Pty Ltd is an indepen...,Investment Consultants & Financial Advisors,64,"Securities, Commodity Contracts, and Other Fin...",wealth solution partner pty ltd independent fi...,wealth solution partner pty ltd independent fi...
2,PMG,Fire and Water Cleanup Services | Mold Remedia...,PMG General Solutions Inc. is an environmental...,PMG General Solutions Inc. is an environmental...,Damage Restoration & Mold Remediation,84,Repair and Maintenance,pmg general solutions inc environmental remedi...,pmg general solutions inc environmental remedi...
3,TMP Capital PLLC,Licensed in AL & FL | 203K Loans | 15-year Fix...,TMP Capital PLLC Consulting Company Franklin M...,"TMP Capital PLLC Consulting Company, also know...",Mortgage Brokers,64,"Securities, Commodity Contracts, and Other Fin...",tmp capital pllc consult company know franklin...,tmp capital pllc consult company know franklin...
4,Genertek Power,Industrial and Commercial Energy Storage | Ass...,Genertek Power Ltd a UK electricity systems & ...,Genertek Power Limited is a privately-owned UK...,Renewable energy companies,8,Utilities,genertek power limit privatelyowned uk investm...,genertek power limit privatelyowned uk investm...
...,...,...,...,...,...,...,...,...,...
99995,Karma Hardware,Wholesale | Brass Hook Marble Tower Bolt | Coa...,"Karma Hardware - Manufacturer of Door Hinges, ...",Karma Hardware is a manufacturer based in Koth...,Hardware Stores,37,Building Material and Garden Equipment and Sup...,karma hardware manufacturer base kothariya raj...,karma hardware manufacturer base kothariya raj...
99996,Rainbow Paper,Bulky Newsprint | Parchment Paper & Parchment ...,Rainbow Paper offers a huge selection of art &...,Rainbow Paper is an Australian-owned brand tha...,Paper & Cardboard Products,20,Printing and Related Support Activities,rainbow paper australianowned brand specialize...,rainbow paper australianowned brand specialize...
99997,Blue Support Services,24/7 Facility Management | Building Maintenanc...,Blue Support Services provides a flexible and ...,Blue Support Services is a company that offers...,Security Guards & Patrol Services,64,"Securities, Commodity Contracts, and Other Fin...",blue support service company offer flexible in...,blue support service company offer flexible in...
99998,Digital Citizen,Bespoke Resourcing Solutions | Cross-platform ...,Digital Citizen is a NYC-based digital product...,Digital Citizen is a New York City-based digit...,Digital & Marketing Agencies,72,Administrative and Support Services,digital citizen new york citybase digital prod...,digital citizen new york citybase digital prod...


In [5]:
dataset = pd.read_csv("data/tournament_hints_data.csv")
dataset

Unnamed: 0,commercial_name,business_tags,short_description,description,main_business_category
0,White Horse,Tile Manufacturing | European Aesthetics Ceram...,White Horse is highly regarded as a tile trail...,White Horse Ceramic Singapore is a leading man...,Tile Store
1,Wealth Solution Partners,Super and SMSF Services | Financial Planning a...,"WSP, Wealth Solution Partners, Financial Plann...",Wealth Solution Partners Pty Ltd is an indepen...,Investment Consultants & Financial Advisors
2,PMG,Fire and Water Cleanup Services | Mold Remedia...,PMG General Solutions Inc. is an environmental...,PMG General Solutions Inc. is an environmental...,Damage Restoration & Mold Remediation
3,TMP Capital PLLC,Licensed in AL & FL | 203K Loans | 15-year Fix...,TMP Capital PLLC Consulting Company Franklin M...,"TMP Capital PLLC Consulting Company, also know...",Mortgage Brokers
4,Genertek Power,Industrial and Commercial Energy Storage | Ass...,Genertek Power Ltd a UK electricity systems & ...,Genertek Power Limited is a privately-owned UK...,Renewable energy companies
...,...,...,...,...,...
626241,Global Golf Tech Solutions,Manufacturing | Golf Academy | G Launch Monito...,The best personal golf launch monitor screen p...,Global Golf Tech Solutions is a company that s...,Golf Courses & Country Clubs
626242,Renko,Latest Processing Technologies | E Gaskets for...,EPDM Rubber products from Renko can be found o...,RENKO is a company that has been producing hig...,Fabricated Rubber Products
626243,Norstal,Residential Buildings | Custom Project Service...,Norstal produces a broad range of steel struct...,Norstal is a steel structure producer that spe...,Metal Fabrication Services
626244,Acoustic,Wood-based Acoustic Products Manufacturer | De...,"We are designed and manufactured in UAE, Acous...",Acoustic.ae is a member of a UAE-based group o...,Building Material Manufacturers


In [6]:
test_df = pd.read_csv("data/data.csv")

In [7]:
# Vectorize and calculate cosine similarity
vectorizer = TfidfVectorizer()
all_keywords = list(dataset["commercial_name"].to_list())
vectorizer.fit(all_keywords)
vectors = vectorizer.transform(dataset["commercial_name"])
similarity_matrix = cosine_similarity(vectors)

# Find the best NAICS match for each business label based on highest similarity

MemoryError: Unable to allocate 2.85 TiB for an array with shape (626246, 626246) and data type float64

In [None]:
naics_df["keywords"] = naics_df["clean_description"].apply(
    lambda x: extract_top_keywords([x], 10)[0]
)
dataset["keywords_commercial_name"] = dataset["commercial_name"].apply(
    lambda x: extract_top_keywords([x], 10)[0]
)


# Convert keywords list to single string per description
business_df["keywords_str"] = business_df["keywords"].apply(" ".join)
naics_df["keywords_str"] = naics_df["keywords"].apply(" ".join)

# Vectorize and calculate cosine similarity
vectorizer = TfidfVectorizer()
all_keywords = list(naics_df["keywords_str"]) + list(business_df["keywords_str"])
vectorizer.fit(all_keywords)
naics_vectors = vectorizer.transform(naics_df["keywords_str"])
business_vectors = vectorizer.transform(business_df["keywords_str"])
similarity_matrix = cosine_similarity(business_vectors, naics_vectors)

# Find the best NAICS match for each business label based on highest similarity
match_indices = np.argmax(similarity_matrix, axis=1)
business_df["matched_naics_label"] = naics_df["naics_label"].values[match_indices]
business_df["similarity_score"] = np.max(similarity_matrix, axis=1)

In [None]:
business_df # n=10

Unnamed: 0,label,description,clean_description,keywords,keywords_str,matched_naics_label,similarity_score
0,ATVs Dealers & Services,Businesses categorized under ATVs Dealers & Se...,business categorize under atvs dealer service ...,"[rental option, repair, maintenance, use, opti...",rental option repair maintenance use option ve...,Repair and Maintenance,0.269430
1,Abortion Clinics,Abortion Clinics provide medical services rela...,abortion clinic provide medical service relate...,"[health service, healthcare, healthcare assist...",health service healthcare healthcare assistanc...,Social Assistance,0.234246
2,Accounting & Bookkeeping Services,Accounting & Bookkeeping Services encompass a ...,accounting bookkeeping service encompass a ran...,"[field, ensure compliance, ensure, encompass r...",field ensure compliance ensure encompass range...,"Professional, Scientific, and Technical Services",0.059068
3,Acupuncture clinic,An Acupuncture clinic is a healthcare facility...,an acupuncture clinic be a healthcare facility...,"[health quality, health condition, facility sp...",health quality health condition facility speci...,Health and Personal Care Retailers,0.201104
4,Adhesives & Sealants,Adhesives & Sealants encompass a diverse range...,adhesives sealant encompass a diverse range of...,"[encompass diverse, encompass, durable bonding...",encompass diverse encompass durable bonding du...,"Merchant Wholesalers, Durable Goods",0.272192
...,...,...,...,...,...,...,...
598,Work Clothing & Protection Equipment,Work Clothing & Protection Equipment pertains ...,work clothing protection equipment pertain to ...,"[clothing protection, ensure, protection equip...",clothing protection ensure protection equipmen...,"Clothing, Clothing Accessories, Shoe, and Jewe...",0.237567
599,Writers & Copywriters,Writers & Copywriters are skilled professional...,writer copywriter be skilled professional who ...,"[engaging, individual, write content, writer, ...",engaging individual write content writer copyw...,Broadcasting and Content Providers,0.134737
600,YMCA Camps,YMCA Camps offer structured programs and activ...,ymca camp offer structured program and activit...,"[education physical, education, involvement, t...",education physical education involvement tempo...,Social Assistance,0.126352
601,Yoga Studios,Yoga Studios offer a tranquil space for indivi...,yoga studio offer a tranquil space for individ...,"[instructional, individual skill, individual e...",instructional individual skill individual enga...,Monetary Authorities-Central Bank,0.055057


In [None]:
business_df # n=20

Unnamed: 0,label,description,clean_description,keywords,keywords_str,matched_naics_label,similarity_score
0,ATVs Dealers & Services,Businesses categorized under ATVs Dealers & Se...,business categorize under atvs dealer service ...,"[establishment, establishment offer, ensure, d...",establishment establishment offer ensure desti...,Repair and Maintenance,0.201426
1,Abortion Clinics,Abortion Clinics provide medical services rela...,abortion clinic provide medical service relate...,"[clinical setting, confidential, medical servi...",clinical setting confidential medical service ...,Ambulatory Health Care Services,0.157904
2,Accounting & Bookkeeping Services,Accounting & Bookkeeping Services encompass a ...,accounting bookkeeping service encompass a ran...,"[insight help, insight, help business, help, f...",insight help insight help business help financ...,"Funds, Trusts, and Other Financial Vehicles",0.110031
3,Acupuncture clinic,An Acupuncture clinic is a healthcare facility...,an acupuncture clinic be a healthcare facility...,"[management, life, licensed acupuncturist, lic...",management life licensed acupuncturist license...,Ambulatory Health Care Services,0.118972
4,Adhesives & Sealants,Adhesives & Sealants encompass a diverse range...,adhesives sealant encompass a diverse range of...,"[industrial need, industrial, include wide, in...",industrial need industrial include wide includ...,"Merchant Wholesalers, Durable Goods",0.114499
...,...,...,...,...,...,...,...
598,Work Clothing & Protection Equipment,Work Clothing & Protection Equipment pertains ...,work clothing protection equipment pertain to ...,"[garment gear, garment, functionality individu...",garment gear garment functionality individual ...,"Clothing, Clothing Accessories, Shoe, and Jewe...",0.244370
599,Writers & Copywriters,Writers & Copywriters are skilled professional...,writer copywriter be skilled professional who ...,"[engage inform, engage, effectively communicat...",engage inform engage effectively communicate e...,Publishing Industries,0.160105
600,YMCA Camps,YMCA Camps offer structured programs and activ...,ymca camp offer structured program and activit...,"[focus foster, focus, experience supportive, e...",focus foster focus experience supportive exper...,Accommodation,0.108615
601,Yoga Studios,Yoga Studios offer a tranquil space for indivi...,yoga studio offer a tranquil space for individ...,"[nurture environment, exercise supportive, exp...",nurture environment exercise supportive expect...,Nursing and Residential Care Facilities,0.097725


In [None]:
business_df # n=5

Unnamed: 0,label,description,clean_description,keywords,keywords_str,matched_naics_label,similarity_score
0,ATVs Dealers & Services,Businesses categorized under ATVs Dealers & Se...,business categorize under atvs dealer service ...,"[vehicle, sale, rental, atv, service]",vehicle sale rental atv service,Motor Vehicle and Parts Dealers,0.418655
1,Abortion Clinics,Abortion Clinics provide medical services rela...,abortion clinic provide medical service relate...,"[counsel medical, clinic, medical, abortion, s...",counsel medical clinic medical abortion service,Postal Service,0.060666
2,Accounting & Bookkeeping Services,Accounting & Bookkeeping Services encompass a ...,accounting bookkeeping service encompass a ran...,"[effectively, field responsible, service, prof...",effectively field responsible service professi...,Postal Service,0.064200
3,Acupuncture clinic,An Acupuncture clinic is a healthcare facility...,an acupuncture clinic be a healthcare facility...,"[patient, overall, acupuncture clinic, clinic,...",patient overall acupuncture clinic clinic acup...,Crop Production,0.000000
4,Adhesives & Sealants,Adhesives & Sealants encompass a diverse range...,adhesives sealant encompass a diverse range of...,"[product, use, sealant, provide, material]",product use sealant provide material,Furniture and Related Product Manufacturing,0.370511
...,...,...,...,...,...,...,...
598,Work Clothing & Protection Equipment,Work Clothing & Protection Equipment pertains ...,work clothing protection equipment pertain to ...,"[item, safety, equipment, clothing, work]",item safety equipment clothing work,"Justice, Public Order, and Safety Activities",0.225609
599,Writers & Copywriters,Writers & Copywriters are skilled professional...,writer copywriter be skilled professional who ...,"[audience, material, content, writer copywrite...",audience material content writer copywriter write,Broadcasting and Content Providers,0.170063
600,YMCA Camps,YMCA Camps offer structured programs and activ...,ymca camp offer structured program and activit...,"[service, development, camp, social, offer]",service development camp social offer,Social Assistance,0.303499
601,Yoga Studios,Yoga Studios offer a tranquil space for indivi...,yoga studio offer a tranquil space for individ...,"[instructional session, yoga studio, physical,...",instructional session yoga studio physical ind...,Crop Production,0.000000


In [None]:
business_df # n=30

Unnamed: 0,label,description,clean_description,keywords,keywords_str,matched_naics_label,similarity_score
0,ATVs Dealers & Services,Businesses categorized under ATVs Dealers & Se...,business categorize under atvs dealer service ...,"[industry, include sale, include, guidance atv...",industry include sale include guidance atv gui...,Repair and Maintenance,0.175069
1,Abortion Clinics,Abortion Clinics provide medical services rela...,abortion clinic provide medical service relate...,"[visit clinic, medical procedure, abortion cli...",visit clinic medical procedure abortion clinic...,Nursing and Residential Care Facilities,0.154082
2,Accounting & Bookkeeping Services,Accounting & Bookkeeping Services encompass a ...,accounting bookkeeping service encompass a ran...,"[aim, activity aim, activity, accurate organiz...",aim activity aim activity accurate organize ac...,"Computing Infrastructure Providers, Data Proce...",0.127419
3,Acupuncture clinic,An Acupuncture clinic is a healthcare facility...,an acupuncture clinic be a healthcare facility...,"[acupuncturist utilize, acupuncturist, acupunc...",acupuncturist utilize acupuncturist acupunctur...,Nursing and Residential Care Facilities,0.090690
4,Adhesives & Sealants,Adhesives & Sealants encompass a diverse range...,adhesives sealant encompass a diverse range of...,"[adhesion protection, manufacturing, different...",adhesion protection manufacturing different su...,Computer and Electronic Product Manufacturing,0.128232
...,...,...,...,...,...,...,...
598,Work Clothing & Protection Equipment,Work Clothing & Protection Equipment pertains ...,work clothing protection equipment pertain to ...,"[helmet, hazard promote, hazard, goggle essent...",helmet hazard promote hazard goggle essential ...,"Clothing, Clothing Accessories, Shoe, and Jewe...",0.152324
599,Writers & Copywriters,Writers & Copywriters are skilled professional...,writer copywriter be skilled professional who ...,"[idea, help business, help, greeting card, gre...",idea help business help greeting card greeting...,Publishing Industries,0.080139
600,YMCA Camps,YMCA Camps offer structured programs and activ...,ymca camp offer structured program and activit...,"[activity design, cater, ymca, development emp...",activity design cater ymca development emphasi...,"Administration of Housing Programs, Urban Plan...",0.082813
601,Yoga Studios,Yoga Studios offer a tranquil space for indivi...,yoga studio offer a tranquil space for individ...,"[engage physical, exercise, engage, class, cat...",engage physical exercise engage class cater in...,Educational Services,0.052862


In [None]:
business_df # n=50

Unnamed: 0,label,description,clean_description,keywords,keywords_str,matched_naics_label,similarity_score
0,ATVs Dealers & Services,Businesses categorized under ATVs Dealers & Se...,business categorize under atvs dealer service ...,"[convenient, categorize atvs, categorize, busi...",convenient categorize atvs categorize business...,Motor Vehicle and Parts Dealers,0.137153
1,Abortion Clinics,Abortion Clinics provide medical services rela...,abortion clinic provide medical service relate...,"[professional healthcare, provide, provide med...",professional healthcare provide provide medica...,Hospitals,0.140610
2,Accounting & Bookkeeping Services,Accounting & Bookkeeping Services encompass a ...,accounting bookkeeping service encompass a ran...,"[responsible, resource effectively, resource, ...",responsible resource effectively resource rele...,"Securities, Commodity Contracts, and Other Fin...",0.095743
3,Acupuncture clinic,An Acupuncture clinic is a healthcare facility...,an acupuncture clinic be a healthcare facility...,"[specific point, wellbeing, wellbeing patient,...",specific point wellbeing wellbeing patient ove...,Ambulatory Health Care Services,0.099587
4,Adhesives & Sealants,Adhesives & Sealants encompass a diverse range...,adhesives sealant encompass a diverse range of...,"[crucial role, crucial, coating application, c...",crucial role crucial coating application coati...,Administration of Environmental Quality Programs,0.090626
...,...,...,...,...,...,...,...
598,Work Clothing & Protection Equipment,Work Clothing & Protection Equipment pertains ...,work clothing protection equipment pertain to ...,"[category, enhance, encompass range, encompass...",category enhance encompass range encompass eff...,"Clothing, Clothing Accessories, Shoe, and Jewe...",0.148520
599,Writers & Copywriters,Writers & Copywriters are skilled professional...,writer copywriter be skilled professional who ...,"[card, business organization, business, brand ...",card business organization business brand voic...,Publishing Industries,0.067410
600,YMCA Camps,YMCA Camps offer structured programs and activ...,ymca camp offer structured program and activit...,"[recreational, need, temporary, involvement of...",recreational need temporary involvement offer ...,Accommodation,0.200502
601,Yoga Studios,Yoga Studios offer a tranquil space for indivi...,yoga studio offer a tranquil space for individ...,"[spiritual practice, studio, studio offer, sup...",spiritual practice studio studio offer support...,Educational Services,0.089445


In [None]:
business_df # n=3

Unnamed: 0,label,description,clean_description,keywords,keywords_str,matched_naics_label,similarity_score
0,ATVs Dealers & Services,Businesses categorized under ATVs Dealers & Se...,business categorize under atvs dealer service ...,"[rental, atv, service]",rental atv service,Administrative and Support Services,0.133505
1,Abortion Clinics,Abortion Clinics provide medical services rela...,abortion clinic provide medical service relate...,"[medical, abortion, service]",medical abortion service,Administrative and Support Services,0.142036
2,Accounting & Bookkeeping Services,Accounting & Bookkeeping Services encompass a ...,accounting bookkeeping service encompass a ran...,"[service, professional, financial]",service professional financial,Administrative and Support Services,0.146114
3,Acupuncture clinic,An Acupuncture clinic is a healthcare facility...,an acupuncture clinic be a healthcare facility...,"[acupuncture clinic, clinic, acupuncture]",acupuncture clinic clinic acupuncture,Crop Production,0.000000
4,Adhesives & Sealants,Adhesives & Sealants encompass a diverse range...,adhesives sealant encompass a diverse range of...,"[sealant, provide, material]",sealant provide material,Food Services and Drinking Places,0.309452
...,...,...,...,...,...,...,...
598,Work Clothing & Protection Equipment,Work Clothing & Protection Equipment pertains ...,work clothing protection equipment pertain to ...,"[equipment, clothing, work]",equipment clothing work,Specialty Trade Contractors,0.402276
599,Writers & Copywriters,Writers & Copywriters are skilled professional...,writer copywriter be skilled professional who ...,"[content, writer copywriter, write]",content writer copywriter write,Broadcasting and Content Providers,0.268699
600,YMCA Camps,YMCA Camps offer structured programs and activ...,ymca camp offer structured program and activit...,"[camp, social, offer]",camp social offer,Social Assistance,0.204873
601,Yoga Studios,Yoga Studios offer a tranquil space for indivi...,yoga studio offer a tranquil space for individ...,"[physical, individual, yoga]",physical individual yoga,Crop Production,0.000000


In [None]:
from sentence_transformers import SentenceTransformer
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# Load the SBERT model
model = SentenceTransformer("all-MiniLM-L6-v2")


# Function to compute embeddings
def compute_embeddings(texts):
    return model.encode(texts, show_progress_bar=True)


# Compute embeddings for each description
business_embeddings = compute_embeddings(business_df["clean_description"].tolist())
naics_embeddings = compute_embeddings(naics_df["clean_description"].tolist())

# Calculate cosine similarity between embeddings
similarity_matrix = cosine_similarity(business_embeddings, naics_embeddings)

# Find the best match for each business description
match_indices = np.argmax(similarity_matrix, axis=1)
business_df["matched_naics_label"] = naics_df["naics_label"].values[match_indices]
business_df["similarity_score"] = np.max(similarity_matrix, axis=1)

# Display results
print(business_df[["label", "matched_naics_label", "similarity_score"]])
# n=15

ModuleNotFoundError: No module named 'torch.utils'

In [None]:
import torch

print(torch.__version__)
print(torch.rand(2, 2))

AttributeError: module 'torch' has no attribute '__version__'

In [None]:
import spac

def remove_mentions_and_hashtags(text):
    text = re.sub(r"@\w+", "", text)
    text = re.sub(r"#\w+", "", text)
    return text


def remove_numbers(text):
    text = re.sub(r"\d+", "", text)
    return text


def remove_punctuation(text):

    translator = str.maketrans("", "", string.punctuation)

    # Remove punctuation using the translation table
    text_without_punct = text.translate(translator)

    return text_without_punct


def remove_stopwords(text):
    filtered_sentence = []
    doc = nlp(text)
    for token in doc:
        if token.is_stop == False:
            filtered_sentence.append(token.text)
    return " ".join(filtered_sentence)
y
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import re
import string

nlp = spacy.load("en_core_web_md")


def lemmatize(text):
    doc = nlp(text)
    text = " ".join([token.lemma_ for token in doc])
    return text


def clean_text(text, to_lemmatize: bool = True):
    # Standardize text
    # text = standardize_accented_chars(text)

    # Remove URLs
    text = re.sub(r"http\S+", "", text)

    # Remove mentions and hashtags
    text = remove_mentions_and_hashtags(text)

    # Lowercase
    text = text.lower()

    # Remove punctuation
    text = remove_punctuation(text)

    # Remove numbers
    text = remove_numbers(text)

    # Remove all the special characters
    text = re.sub(r"\W", " ", text)

    # Remove stopwords
    text = remove_stopwords(text)

    # Substituting multiple spaces with single space
    text = re.sub(r"\s+", " ", text, flags=re.I)

    if to_lemmatize:
        text = lemmatize(text)

    return text


business_df["clean_description"] = business_df["description"].apply(
    lambda x: clean_text(x)
)
naics_df["clean_description"] = naics_df["description"].apply(
    lambda x: clean_text(x)
)


def extract_entities(text):
    doc = nlp(text)
    return [ent.text for ent in doc.ents]


naics_df["keywords"] = naics_df["clean_description"].apply(extract_entities)
business_df["keywords"] = business_df["clean_description"].apply(extract_entities)

# Convert keywords list to single string per description
naics_df["keywords_str"] = naics_df["keywords"].apply(" ".join)
business_df["keywords_str"] = business_df["keywords"].apply(" ".join)

# Use TF-IDF to vectorize keywords strings
vectorizer = TfidfVectorizer()
all_keywords = list(naics_df["keywords_str"]) + list(business_df["keywords_str"])
vectorizer.fit(all_keywords)
naics_vectors = vectorizer.transform(naics_df["keywords_str"])
business_vectors = vectorizer.transform(business_df["keywords_str"])

# Calculate cosine similarity
similarity_matrix = cosine_similarity(business_vectors, naics_vectors)

# Find the best NAICS match for each business label based on highest similarity
match_indices = np.argmax(similarity_matrix, axis=1)
business_df["matched_naics_label"] = naics_df["naics_label"].values[
    match_indices
]  # Use values to directly access by position
business_df["similarity_score"] = np.max(similarity_matrix, axis=1)

In [None]:
business_df

Unnamed: 0,label,description,clean_description,keywords,keywords_str,matched_naics_label,similarity_score
0,ATVs Dealers & Services,Businesses categorized under ATVs Dealers & Se...,business categorize atv dealer service special...,[],,Crop Production,0.000000
1,Abortion Clinics,Abortion Clinics provide medical services rela...,abortion clinic provide medical service relate...,[abortion clinic provide medical service],abortion clinic provide medical service,Personal and Laundry Services,0.272744
2,Accounting & Bookkeeping Services,Accounting & Bookkeeping Services encompass a ...,accounting bookkeeping service encompass range...,[],,Crop Production,0.000000
3,Acupuncture clinic,An Acupuncture clinic is a healthcare facility...,acupuncture clinic healthcare facility special...,[chinese],chinese,Crop Production,0.000000
4,Adhesives & Sealants,Adhesives & Sealants encompass a diverse range...,adhesives sealant encompass diverse range prod...,[],,Crop Production,0.000000
...,...,...,...,...,...,...,...
598,Work Clothing & Protection Equipment,Work Clothing & Protection Equipment pertains ...,work clothing protection equipment pertain spe...,[],,Crop Production,0.000000
599,Writers & Copywriters,Writers & Copywriters are skilled professional...,writer copywriter skilled professional special...,[],,Crop Production,0.000000
600,YMCA Camps,YMCA Camps offer structured programs and activ...,ymca camp offer structured program activity de...,"[ymca, wellbee development]",ymca wellbee development,Crop Production,0.000000
601,Yoga Studios,Yoga Studios offer a tranquil space for indivi...,yoga studio offer tranquil space individual en...,[],,Crop Production,0.000000


In [None]:
# display clean_description of first element
print(business_df["clean_description"].iloc[0])

business categorize atv dealer service specialize sale maintenance rental allterrain vehicle atvs establishment offer range service relate atv include sale new vehicle repair maintenance service rental option recreational use customer expect expert guidance atv selection repair service ensure optimal performance convenient rental option outdoor adventure industry dedicated meeting diverse need atv enthusiast provide onestop destination atvrelate requirement


In [None]:
def jaccard_similarity(list1, list2):
    intersection = set(list1).intersection(set(list2))
    union = set(list1).union(set(list2))
    return len(intersection) / len(union)


# Compute similarity for each combination of NAICS and business description
import numpy as np

similarity_matrix = np.zeros((len(business_df), len(naics_df)))

for i, business_keywords in enumerate(business_df["keywords"]):
    for j, naics_keywords in enumerate(naics_df["keywords"]):
        similarity_matrix[i, j] = jaccard_similarity(business_keywords, naics_keywords)

ZeroDivisionError: division by zero

In [None]:
match_indices = np.argmax(similarity_matrix, axis=1)
business_df["matched_naics_label"] = naics_df["naics_label"].iloc[match_indices].values
business_df["similarity_score"] = np.max(similarity_matrix, axis=1)


In [None]:
business_df

Unnamed: 0,label,description,clean_description,keywords,keywords_str,matched_naics_label,similarity_score
0,ATVs Dealers & Services,Businesses categorized under ATVs Dealers & Se...,business categorize atv dealer service special...,[],,Crop Production,0.000000
1,Abortion Clinics,Abortion Clinics provide medical services rela...,abortion clinic provide medical service relate...,[abortion clinic provide medical service],abortion clinic provide medical service,Personal and Laundry Services,0.272744
2,Accounting & Bookkeeping Services,Accounting & Bookkeeping Services encompass a ...,accounting bookkeeping service encompass range...,[],,Crop Production,0.000000
3,Acupuncture clinic,An Acupuncture clinic is a healthcare facility...,acupuncture clinic healthcare facility special...,[chinese],chinese,Crop Production,0.000000
4,Adhesives & Sealants,Adhesives & Sealants encompass a diverse range...,adhesives sealant encompass diverse range prod...,[],,Crop Production,0.000000
...,...,...,...,...,...,...,...
598,Work Clothing & Protection Equipment,Work Clothing & Protection Equipment pertains ...,work clothing protection equipment pertain spe...,[],,Crop Production,0.000000
599,Writers & Copywriters,Writers & Copywriters are skilled professional...,writer copywriter skilled professional special...,[],,Crop Production,0.000000
600,YMCA Camps,YMCA Camps offer structured programs and activ...,ymca camp offer structured program activity de...,"[ymca, wellbee development]",ymca wellbee development,Crop Production,0.000000
601,Yoga Studios,Yoga Studios offer a tranquil space for indivi...,yoga studio offer tranquil space individual en...,[],,Crop Production,0.000000
