#### pip install spacy openpyxl
#### python -m spacy download en_core_web_lg


In [1]:
import pandas as pd

# Load your CSV files
categories_df = pd.read_excel("data/business_category_taxonomy.xlsx", engine='openpyxl')
domains_df = pd.read_excel("data/Naics3_taxonomy.xlsx", engine='openpyxl')
business_df = pd.read_csv("data/tournament_hints_data.csv")


In [3]:
clean_description_df = pd.read_csv("data/clean_description.csv")

In [2]:
business_df

Unnamed: 0,commercial_name,business_tags,short_description,description,main_business_category
0,White Horse,Tile Manufacturing | European Aesthetics Ceram...,White Horse is highly regarded as a tile trail...,White Horse Ceramic Singapore is a leading man...,Tile Store
1,Wealth Solution Partners,Super and SMSF Services | Financial Planning a...,"WSP, Wealth Solution Partners, Financial Plann...",Wealth Solution Partners Pty Ltd is an indepen...,Investment Consultants & Financial Advisors
2,PMG,Fire and Water Cleanup Services | Mold Remedia...,PMG General Solutions Inc. is an environmental...,PMG General Solutions Inc. is an environmental...,Damage Restoration & Mold Remediation
3,TMP Capital PLLC,Licensed in AL & FL | 203K Loans | 15-year Fix...,TMP Capital PLLC Consulting Company Franklin M...,"TMP Capital PLLC Consulting Company, also know...",Mortgage Brokers
4,Genertek Power,Industrial and Commercial Energy Storage | Ass...,Genertek Power Ltd a UK electricity systems & ...,Genertek Power Limited is a privately-owned UK...,Renewable energy companies
...,...,...,...,...,...
626241,Global Golf Tech Solutions,Manufacturing | Golf Academy | G Launch Monito...,The best personal golf launch monitor screen p...,Global Golf Tech Solutions is a company that s...,Golf Courses & Country Clubs
626242,Renko,Latest Processing Technologies | E Gaskets for...,EPDM Rubber products from Renko can be found o...,RENKO is a company that has been producing hig...,Fabricated Rubber Products
626243,Norstal,Residential Buildings | Custom Project Service...,Norstal produces a broad range of steel struct...,Norstal is a steel structure producer that spe...,Metal Fabrication Services
626244,Acoustic,Wood-based Acoustic Products Manufacturer | De...,"We are designed and manufactured in UAE, Acous...",Acoustic.ae is a member of a UAE-based group o...,Building Material Manufacturers


In [4]:
import spacy
import string
from sklearn.metrics.pairwise import cosine_similarity
import re

# Load the medium spaCy model
nlp = spacy.load("en_core_web_lg")


def remove_mentions_and_hashtags(text):
    text = re.sub(r"@\w+", "", text)
    text = re.sub(r"#\w+", "", text)
    return text


def remove_numbers(text):
    text = re.sub(r"\d+", "", text)
    return text


def remove_punctuation(text):

    translator = str.maketrans("", "", string.punctuation)

    # Remove punctuation using the translation table
    text_without_punct = text.translate(translator)

    return text_without_punct


def remove_stopwords(text):
    filtered_sentence = []
    doc = nlp(text)
    for token in doc:
        if token.is_stop == False:
            filtered_sentence.append(token.text)
    return " ".join(filtered_sentence)


def lemmatize(text):
    doc = nlp(text)
    text = " ".join([token.lemma_ for token in doc])
    return text


def clean_text(text, to_lemmatize: bool = True):
    # Standardize text
    # text = standardize_accented_chars(text)

    # Remove URLs
    text = re.sub(r"http\S+", "", text)

    # Remove mentions and hashtags
    text = remove_mentions_and_hashtags(text)

    # Lowercase
    text = text.lower()

    # Remove punctuation
    text = remove_punctuation(text)

    # Remove numbers
    text = remove_numbers(text)

    # Remove all the special characters
    text = re.sub(r"\W", " ", text)

    # Remove stopwords
    text = remove_stopwords(text)

    # Substituting multiple spaces with single space
    text = re.sub(r"\s+", " ", text, flags=re.I)

    if to_lemmatize:
        text = lemmatize(text)

    return text


categories_df["clean_description"] = categories_df["description"].apply(
    lambda x: clean_text(x)
)
domains_df["clean_description"] = domains_df["description"].apply(
    lambda x: clean_text(x)
)

categories_df["description_vector"] = categories_df["clean_description"].apply(nlp)
domains_df["description_vector"] = domains_df["clean_description"].apply(nlp)

In [5]:
categories_df.to_csv("data/final_categories.csv", index=False)
domains_df.to_csv("data/final_domains.csv", index=False)

In [6]:
def find_best_match(cat_desc_vec, label_vectors):
    # Calculate similarities and return the index of the highest similarity
    similarities = [
        cat_desc_vec.similarity(label_desc) for label_desc in label_vectors
    ]
    return similarities.index(max(similarities))


# Convert domain vectors to a list for efficiency
label_vectors = domains_df["description_vector"].tolist()

# Find the best match for each category
categories_df["best_match_label_index"] = categories_df["description_vector"].apply(
    lambda x: find_best_match(x, label_vectors)
)

# Map the index to the actual domain name
categories_df["best_match_label"] = categories_df["best_match_label_index"].apply(
    lambda x: domains_df.iloc[x]["naics_label"]
)

In [19]:
categories_df

Unnamed: 0,label,description,clean_description,description_vector,best_match_label_index,best_match_label
0,ATVs Dealers & Services,Businesses categorized under ATVs Dealers & Se...,business categorize atvs dealer service specia...,"(business, categorize, atvs, dealer, service, ...",68,Rental and Leasing Services
1,Abortion Clinics,Abortion Clinics provide medical services rela...,abortion clinic provide medical service relate...,"(abortion, clinic, provide, medical, service, ...",76,Hospitals
2,Accounting & Bookkeeping Services,Accounting & Bookkeeping Services encompass a ...,accounting bookkeeping service encompass range...,"(accounting, bookkeeping, service, encompass, ...",72,Administrative and Support Services
3,Acupuncture clinic,An Acupuncture clinic is a healthcare facility...,acupuncture clinic healthcare facility special...,"(acupuncture, clinic, healthcare, facility, sp...",75,Ambulatory Health Care Services
4,Adhesives & Sealants,Adhesives & Sealants encompass a diverse range...,adhesive sealant encompass diverse range produ...,"(adhesive, sealant, encompass, diverse, range,...",32,Miscellaneous Manufacturing
...,...,...,...,...,...,...
598,Work Clothing & Protection Equipment,Work Clothing & Protection Equipment pertains ...,work clothing protection equipment pertain spe...,"(work, clothing, protection, equipment, pertai...",84,Repair and Maintenance
599,Writers & Copywriters,Writers & Copywriters are skilled professional...,writer copywriter skilled professional special...,"(writer, copywriter, skilled, professional, sp...",57,Publishing Industries
600,YMCA Camps,YMCA Camps offer structured programs and activ...,ymca camp offer structured program activity de...,"(ymca, camp, offer, structured, program, activ...",85,Personal and Laundry Services
601,Yoga Studios,Yoga Studios offer a tranquil space for indivi...,yoga studio offer tranquil space individual en...,"(yoga, studio, offer, tranquil, space, individ...",79,"Performing Arts, Spectator Sports, and Related..."


In [20]:
business_df[:5]

Unnamed: 0,commercial_name,business_tags,short_description,description,main_business_category
0,White Horse,Tile Manufacturing | European Aesthetics Ceram...,White Horse is highly regarded as a tile trail...,White Horse Ceramic Singapore is a leading man...,Tile Store
1,Wealth Solution Partners,Super and SMSF Services | Financial Planning a...,"WSP, Wealth Solution Partners, Financial Plann...",Wealth Solution Partners Pty Ltd is an indepen...,Investment Consultants & Financial Advisors
2,PMG,Fire and Water Cleanup Services | Mold Remedia...,PMG General Solutions Inc. is an environmental...,PMG General Solutions Inc. is an environmental...,Damage Restoration & Mold Remediation
3,TMP Capital PLLC,Licensed in AL & FL | 203K Loans | 15-year Fix...,TMP Capital PLLC Consulting Company Franklin M...,"TMP Capital PLLC Consulting Company, also know...",Mortgage Brokers
4,Genertek Power,Industrial and Commercial Energy Storage | Ass...,Genertek Power Ltd a UK electricity systems & ...,Genertek Power Limited is a privately-owned UK...,Renewable energy companies


In [7]:
final_data = pd.merge(business_df, categories_df[['label','best_match_label_index','best_match_label']], left_on="main_business_category",right_on="label")
final_data.drop(columns=['label'], inplace=True)

In [8]:
final_data

Unnamed: 0,commercial_name,business_tags,short_description,description,main_business_category,best_match_label_index,best_match_label
0,White Horse,Tile Manufacturing | European Aesthetics Ceram...,White Horse is highly regarded as a tile trail...,White Horse Ceramic Singapore is a leading man...,Tile Store,31,Furniture and Related Product Manufacturing
1,Wealth Solution Partners,Super and SMSF Services | Financial Planning a...,"WSP, Wealth Solution Partners, Financial Plann...",Wealth Solution Partners Pty Ltd is an indepen...,Investment Consultants & Financial Advisors,64,"Securities, Commodity Contracts, and Other Fin..."
2,PMG,Fire and Water Cleanup Services | Mold Remedia...,PMG General Solutions Inc. is an environmental...,PMG General Solutions Inc. is an environmental...,Damage Restoration & Mold Remediation,84,Repair and Maintenance
3,TMP Capital PLLC,Licensed in AL & FL | 203K Loans | 15-year Fix...,TMP Capital PLLC Consulting Company Franklin M...,"TMP Capital PLLC Consulting Company, also know...",Mortgage Brokers,64,"Securities, Commodity Contracts, and Other Fin..."
4,Genertek Power,Industrial and Commercial Energy Storage | Ass...,Genertek Power Ltd a UK electricity systems & ...,Genertek Power Limited is a privately-owned UK...,Renewable energy companies,8,Utilities
...,...,...,...,...,...,...,...
626241,Global Golf Tech Solutions,Manufacturing | Golf Academy | G Launch Monito...,The best personal golf launch monitor screen p...,Global Golf Tech Solutions is a company that s...,Golf Courses & Country Clubs,81,"Amusement, Gambling, and Recreation Industries"
626242,Renko,Latest Processing Technologies | E Gaskets for...,EPDM Rubber products from Renko can be found o...,RENKO is a company that has been producing hig...,Fabricated Rubber Products,27,Machinery Manufacturing
626243,Norstal,Residential Buildings | Custom Project Service...,Norstal produces a broad range of steel struct...,Norstal is a steel structure producer that spe...,Metal Fabrication Services,27,Machinery Manufacturing
626244,Acoustic,Wood-based Acoustic Products Manufacturer | De...,"We are designed and manufactured in UAE, Acous...",Acoustic.ae is a member of a UAE-based group o...,Building Material Manufacturers,27,Machinery Manufacturing


In [23]:
final_data.to_csv('labelled_data.csv', index=False)


In [24]:
final_data[final_data['main_business_category'] == 'Beauty Salons']


Unnamed: 0,commercial_name,business_tags,short_description,description,main_business_category,best_match_label_index,best_match_label
243,CARIDAD VIDRO,On-location Services for Private Events | Brid...,"Welcome to CARIDAD VIDRO SHOP, wedding favors ...",Caridad VIDRO is a professional hair and makeu...,Beauty Salons,41,Health and Personal Care Retailers
680,Azalea Beauty Consultancy India,Sourcing and Purchasing Services | Layout Plan...,Initial and ongoing training and support for S...,Azalea Beauty Consultancy India is a company t...,Beauty Salons,41,Health and Personal Care Retailers
1469,BEAUTIFUL REMEDY,Waxing Services | Permanent Cosmetics | Facial...,BEAUTIFUL REMEDY is a beauty salon based in Ka...,Beautiful Remedy LLC is a beauty company that ...,Beauty Salons,41,Health and Personal Care Retailers
1674,Sogno The Salon Company,Personal Care Services | Hair Coloring Service...,Sogno The Salon Company is a beauty salon base...,Sogno The Salon Company is a beauty salon loca...,Beauty Salons,41,Health and Personal Care Retailers
2244,Eman Beauty Lab,Beauty Treatments | Beauty Services | Retail |...,"Welcome to Eman Beauty Lab, our Nail Salon in ...",Eman Beauty Lab is a nail salon located in Asc...,Beauty Salons,41,Health and Personal Care Retailers
...,...,...,...,...,...,...,...
625702,Mia's Beauty,Pedicure Services | Personal Care Services | W...,Mia's Beauty is the perfect Nail Salon for any...,Mia's Beauty is a nail salon located in the Ta...,Beauty Salons,41,Health and Personal Care Retailers
625792,Jenyosola Beauty Salon,Beauty Services | Personal Care Services,Jenyosola Beauty Salon is a beauty salon based...,Jenyosola Beauty Salon is a beauty salon locat...,Beauty Salons,41,Health and Personal Care Retailers
625972,Manhattan Nail Bar,Personal Care Services,Manhattan Nail Bar is a beauty salon based in ...,Manhattan Nail Bar is a nail salon that offers...,Beauty Salons,41,Health and Personal Care Retailers
626006,Grand Nails Lounge Ocotillo,Personal Care Services | Manicure and Pedicure...,Grand Nails Lounge Ocotillo at 3990 S Alma Sch...,Grand Nails Lounge Ocotillo is a beauty salon ...,Beauty Salons,41,Health and Personal Care Retailers


In [25]:
description = "NU U 2 Hair & Beauty is a professional hair and beauty salon located in Milton Regis High Street, Sittingbourne. They offer a variety of hair and body treatments, including hair cutting and styling, as well as other beauty treatments such as waxing, eye treatments, facials, massage, body treatments and permanent make-up."

In [27]:
final_data["clean_description"] = final_data["description"].apply(
    lambda x: clean_text(x)
)

final_data["description_vector"] = final_data["clean_description"].apply(nlp)

KeyboardInterrupt: 

In [None]:
final_data

In [2]:
test_df = pd.read_csv("data/data.csv")
test_df

Unnamed: 0.1,Unnamed: 0,company_id,round_1,answer,round_2,round_3,round_4,round_5
0,0,84,Advisory Care,621 Ambulatory Health Care Services,Hygiene Services | One-on-one Attention and Ca...,Advisory Care is a home health care services p...,Advisory Care Pty Ltd is a company that provid...,Home Health Care
1,1,115,PPK Mining Equipment,213 Support Activities for Mining,High & Low Seam Drills | Ventilation Solutions...,PPK Mining Equipment is the solution partner o...,PPK Mining Equipment is a company that special...,Mining & Gas Exploration
2,2,50,First Tracks Lawn Care,561 Administrative and Support Services,Weekly Mowing | Driveway Plowing Services | Mu...,"First Tracks Lawn Care, LLC prides itself on p...","First Tracks Lawn Care, LLC is a lawn care com...",Landscaping & Lawn Services
3,3,92,New Golden Furnishers,337 Furniture and Related Product Manufacturing,Staff Task Computer Chairs Supplier | Mesh Bac...,New Golden Furnishers Co - Manufacturer of Dou...,New Golden Furnishers Co is a furniture manufa...,Office Furniture
4,4,88,Lilac City Septic,562 Waste Management and Remediation Services,Clean-Out Installation | Line Replacement Serv...,Lilac City Septic services residential & comme...,Lilac City Septic Services Co is a family-owne...,Garbage Collection & Waste Disposal
...,...,...,...,...,...,...,...,...
101,101,28,Purple Pixl,811 Repair and Maintenance,Battery Replacement | Viruses Removal | Front ...,Purple Pixl offers fast & effective repair ser...,Purple Pixl is a technology and electronics re...,Phone Repair & Services
102,102,116,Pasargad Jonub,237 Heavy and Civil Engineering Construction,Infrastructure Project Management | Design and...,Having a brilliant background and well-trained...,Pasargad Jonub is a private joint stock compan...,Civil Engineering Services
103,103,6,Davis Business Machines,"423 Merchant Wholesalers, Durable Goods",Managed IT Services | Document Scanners | Fax ...,"Since May 13, 1963, Davis Business Machines ha...",Davis Business Machines is a company that has ...,Office Supplies
104,104,24,High Desert Tool,326 Plastics and Rubber Products Manufacturing,Aerospace Molds Supplier | Automotive Molds Di...,High Desert Tool specializes in large mold man...,High Desert Tool is a company that specializes...,Injection Molding Services


In [12]:
label_vectors = final_data["commercial_name"].apply(lambda x: clean_text(x)).tolist()
label_vectors

KeyboardInterrupt: 

In [3]:
clean_dataset = pd.read_csv("data/cleaned_dataset.csv")
clean_dataset

Unnamed: 0.1,Unnamed: 0,commercial_name,business_tags,short_description,description,main_business_category,commercial_name_clean,business_tags_clean,short_description_clean,description_clean,main_business_category_clean,naics_label
0,0,White Horse,Tile Manufacturing | European Aesthetics Ceram...,White Horse is highly regarded as a tile trail...,White Horse Ceramic Singapore is a leading man...,Tile Store,white horse,tile manufacturing european aesthetic ceramic ...,white horse highly regard tile trailblazer des...,white horse ceramic singapore lead manufacture...,tile store,Building Material and Garden Equipment and Sup...
1,1,Wealth Solution Partners,Super and SMSF Services | Financial Planning a...,"WSP, Wealth Solution Partners, Financial Plann...",Wealth Solution Partners Pty Ltd is an indepen...,Investment Consultants & Financial Advisors,wealth solution partner,super smsf service financial planning investme...,wsp wealth solution partner financial planning...,wealth solution partner pty ltd independent fi...,investment consultant financial advisor,"Funds, Trusts, and Other Financial Vehicles"
2,2,PMG,Fire and Water Cleanup Services | Mold Remedia...,PMG General Solutions Inc. is an environmental...,PMG General Solutions Inc. is an environmental...,Damage Restoration & Mold Remediation,pmg,fire water cleanup service mold remediation re...,pmg general solutions inc environmental remedi...,pmg general solutions inc environmental remedi...,damage restoration mold remediation,Waste Management and Remediation Services
3,3,TMP Capital PLLC,Licensed in AL & FL | 203K Loans | 15-year Fix...,TMP Capital PLLC Consulting Company Franklin M...,"TMP Capital PLLC Consulting Company, also know...",Mortgage Brokers,tmp capital pllc,license al fl 203k loan 15year fix mortgage mo...,tmp capital pllc consult company franklin mort...,tmp capital pllc consult company know franklin...,mortgage broker,Credit Intermediation and Related Activities
4,4,Genertek Power,Industrial and Commercial Energy Storage | Ass...,Genertek Power Ltd a UK electricity systems & ...,Genertek Power Limited is a privately-owned UK...,Renewable energy companies,genertek power,industrial commercial energy storage asset man...,genertek power ltd uk electricity system uk in...,genertek power limit privatelyowne uk investme...,renewable energy company,Utilities
...,...,...,...,...,...,...,...,...,...,...,...,...
613326,613326,Track Audio,Manufacturing | ISO 9001 Certification | Suppo...,Track Audio - Speaker Stands and Isolation Mou...,Track Audio Ltd is a British company based in ...,Electronics Stores,track audio,manufacture iso 9001 certification support sol...,track audio speaker stand isolation mount engi...,track audio ltd british company base amersham ...,electronic store,"Furniture, Home Furnishings, Electronics, and ..."
613327,613327,Global Golf Tech Solutions,Manufacturing | Golf Academy | G Launch Monito...,The best personal golf launch monitor screen p...,Global Golf Tech Solutions is a company that s...,Golf Courses & Country Clubs,global golf tech solution,manufacture golf academy g launch monitor scre...,good personal golf launch monitor screen prote...,global golf tech solution company specialize p...,golf course country club,"Amusement, Gambling, and Recreation Industries"
613328,613328,Renko,Latest Processing Technologies | E Gaskets for...,EPDM Rubber products from Renko can be found o...,RENKO is a company that has been producing hig...,Fabricated Rubber Products,renko,late processing technology e gasket facade joi...,epdm rubber product renko find roof window fac...,renko company produce highquality epdm rubber ...,fabricate rubber product,Plastics and Rubber Products Manufacturing
613329,613329,Norstal,Residential Buildings | Custom Project Service...,Norstal produces a broad range of steel struct...,Norstal is a steel structure producer that spe...,Metal Fabrication Services,norstal,residential building custom project service st...,norstal produce broad range steel structure re...,norstal steel structure producer specialize hi...,metal fabrication service,Fabricated Metal Product Manufacturing


In [15]:
clean_dataset["commercial_name_clean"] = clean_dataset["commercial_name"].astype(str)


In [16]:

clean_dataset["name_vector"] = clean_dataset["commercial_name_clean"].apply(nlp)

In [17]:
label_vectors = clean_dataset["name_vector"].tolist()
label_vectors

[White Horse,
 Wealth Solution Partners,
 PMG,
 TMP Capital PLLC,
 Genertek Power,
 HQ Machine Tech,
 iFleet,
 Strategic Benefit Resources,
 Billington Holdings,
 Haram Couture,
 Regina Dent,
 Flagg Consulting,
 Nerveless Nocks,
 Satatuote,
 H2O Vend,
 Chrome Horse Saloon,
 Emily Brown Physical Therapy,
 Tax Service,
 Wholesale Trailers,
 Shivashrit Foods,
 Preston Dental Care,
 BAHRAIN WORKSHOP COMPANY W.L.L,
 Partitions,
 Travell,
 LEISIDUN,
 Geological Service,
 3H DECOR,
 Al Rahimah,
 Al Amwaj,
 Adyjohns,
 Forschene,
 Black Oak Wilderness,
 SideKick,
 Hiap Shing,
 Virginia Eagle,
 Auto-jet,
 RAP Power Industries,
 Sheet Metal Crafters,
 They Poop,
 Preps Recruiting,
 East Calais Community Trust,
 The Fresh Toast,
 Ship Finance Maritime,
 NuGen Systems,
 VAI Capital,
 HIKS,
 Cleveland Mica,
 Green Bus Brewing,
 Teleli Golf Club,
 Ormond Aircraft,
 HITV Lab,
 Palermo Lubricants,
 Shri Bharat Agencies,
 Stoneage Granite and Marble,
 77 STORAGE,
 White Water Marine Hardware,
 Star of t

In [18]:
test_df["clean_r1"] = test_df["round_1"].apply(
    lambda x: clean_text(x)
)

test_df["description_vector"] = test_df["clean_r1"].apply(nlp)

In [19]:
test_df["best_match_label_index"] = test_df["description_vector"].apply(
    lambda x: find_best_match(x, label_vectors)
)

# Map the index to the actual domain name
test_df["best_match_label"] = test_df["best_match_label_index"].apply(
    lambda x: clean_dataset.iloc[x]["naics_label"]
)

  cat_desc_vec.similarity(label_desc) for label_desc in label_vectors


In [20]:
test_df

Unnamed: 0.1,Unnamed: 0,company_id,round_1,answer,round_2,round_3,round_4,round_5,clean_r1,description_vector,best_match_label_index,best_match_label
0,0,84,Advisory Care,621 Ambulatory Health Care Services,Hygiene Services | One-on-one Attention and Ca...,Advisory Care is a home health care services p...,Advisory Care Pty Ltd is a company that provid...,Home Health Care,advisory care,"(advisory, care)",233581,Utilities
1,1,115,PPK Mining Equipment,213 Support Activities for Mining,High & Low Seam Drills | Ventilation Solutions...,PPK Mining Equipment is the solution partner o...,PPK Mining Equipment is a company that special...,Mining & Gas Exploration,ppk mining equipment,"(ppk, mining, equipment)",488880,"Securities, Commodity Contracts, and Other Fin..."
2,2,50,First Tracks Lawn Care,561 Administrative and Support Services,Weekly Mowing | Driveway Plowing Services | Mu...,"First Tracks Lawn Care, LLC prides itself on p...","First Tracks Lawn Care, LLC is a lawn care com...",Landscaping & Lawn Services,track lawn care,"(track, lawn, care)",153186,Ambulatory Health Care Services
3,3,92,New Golden Furnishers,337 Furniture and Related Product Manufacturing,Staff Task Computer Chairs Supplier | Mesh Bac...,New Golden Furnishers Co - Manufacturer of Dou...,New Golden Furnishers Co is a furniture manufa...,Office Furniture,new golden furnisher,"(new, golden, furnisher)",445889,Plastics and Rubber Products Manufacturing
4,4,88,Lilac City Septic,562 Waste Management and Remediation Services,Clean-Out Installation | Line Replacement Serv...,Lilac City Septic services residential & comme...,Lilac City Septic Services Co is a family-owne...,Garbage Collection & Waste Disposal,lilac city septic,"(lilac, city, septic)",400570,Primary Metal Manufacturing
...,...,...,...,...,...,...,...,...,...,...,...,...
101,101,28,Purple Pixl,811 Repair and Maintenance,Battery Replacement | Viruses Removal | Front ...,Purple Pixl offers fast & effective repair ser...,Purple Pixl is a technology and electronics re...,Phone Repair & Services,purple pixl,"(purple, pixl)",75819,Machinery Manufacturing
102,102,116,Pasargad Jonub,237 Heavy and Civil Engineering Construction,Infrastructure Project Management | Design and...,Having a brilliant background and well-trained...,Pasargad Jonub is a private joint stock compan...,Civil Engineering Services,pasargad jonub,"(pasargad, jonub)",0,Building Material and Garden Equipment and Sup...
103,103,6,Davis Business Machines,"423 Merchant Wholesalers, Durable Goods",Managed IT Services | Document Scanners | Fax ...,"Since May 13, 1963, Davis Business Machines ha...",Davis Business Machines is a company that has ...,Office Supplies,davis business machines,"(davis, business, machines)",493113,Miscellaneous Manufacturing
104,104,24,High Desert Tool,326 Plastics and Rubber Products Manufacturing,Aerospace Molds Supplier | Automotive Molds Di...,High Desert Tool specializes in large mold man...,High Desert Tool is a company that specializes...,Injection Molding Services,high desert tool,"(high, desert, tool)",340300,Miscellaneous Manufacturing


# Round 2

In [21]:
test_df.to_csv("round_1_labelled.csv", index=False)

In [26]:
clean_dataset["business_tags_clean"] = clean_dataset["business_tags_clean"].astype(str)
clean_dataset["tags_vector"] = clean_dataset["business_tags_clean"].apply(nlp)
label_vectors = final_data["business_tags"].tolist()
label_vectors

In [23]:
test_df["clean_r2"] = test_df["round_2"].apply(lambda x: clean_text(x))

test_df["r2_vector"] = test_df["clean_r2"].apply(nlp)

In [24]:
test_df["best_match_label_index"] = test_df["r2_vector"].apply(
    lambda x: find_best_match(x, label_vectors)
)

# Map the index to the actual domain name
test_df["best_match_r2"] = test_df["best_match_label_index"].apply(
    lambda x: clean_dataset.iloc[x]["naics_label"]
)

AttributeError: 'str' object has no attribute 'vector_norm'