#### pip install spacy openpyxl
#### python -m spacy download en_core_web_lg


In [25]:
import pandas as pd

# Load your CSV files
categories_df = pd.read_excel("data/business_category_taxonomy.xlsx", engine='openpyxl')
domains_df = pd.read_excel("data/Naics3_taxonomy.xlsx", engine='openpyxl')
business_df = pd.read_csv("data/tournament_hints_data.csv", nrows=10000)

In [26]:
import spacy
import string
from sklearn.metrics.pairwise import cosine_similarity
import re

# Load the medium spaCy model
nlp = spacy.load("en_core_web_md")


def remove_mentions_and_hashtags(text):
    text = re.sub(r"@\w+", "", text)
    text = re.sub(r"#\w+", "", text)
    return text


def remove_numbers(text):
    text = re.sub(r"\d+", "", text)
    return text


def remove_punctuation(text):

    translator = str.maketrans("", "", string.punctuation)

    # Remove punctuation using the translation table
    text_without_punct = text.translate(translator)

    return text_without_punct


def remove_stopwords(text):
    filtered_sentence = []
    doc = nlp(text)
    for token in doc:
        if token.is_stop == False:
            filtered_sentence.append(token.text)
    return " ".join(filtered_sentence)


def lemmatize(text):
    doc = nlp(text)
    text = " ".join([token.lemma_ for token in doc])
    return text


def clean_text(text, to_lemmatize: bool = True):
    # Standardize text
    # text = standardize_accented_chars(text)

    # Remove URLs
    text = re.sub(r"http\S+", "", text)

    # Remove mentions and hashtags
    text = remove_mentions_and_hashtags(text)

    # Lowercase
    text = text.lower()

    # Remove punctuation
    text = remove_punctuation(text)

    # Remove numbers
    text = remove_numbers(text)

    # Remove all the special characters
    text = re.sub(r"\W", " ", text)

    # Remove stopwords
    text = remove_stopwords(text)

    # Substituting multiple spaces with single space
    text = re.sub(r"\s+", " ", text, flags=re.I)

    if to_lemmatize:
        text = lemmatize(text)

    return text


categories_df["clean_description"] = categories_df["description"].apply(
    lambda x: clean_text(x, to_lemmatize=False)
)
domains_df["clean_description"] = domains_df["description"].apply(
    lambda x: clean_text(x, to_lemmatize=False)
)

categories_df["description_vector"] = categories_df["clean_description"].apply(nlp)
domains_df["description_vector"] = domains_df["clean_description"].apply(nlp)

In [22]:
categories_df

Unnamed: 0,label,description,clean_description,description_vector
0,ATVs Dealers & Services,Businesses categorized under ATVs Dealers & Se...,businesses categorized atvs dealers services s...,"(businesses, categorized, atvs, dealers, servi..."
1,Abortion Clinics,Abortion Clinics provide medical services rela...,abortion clinics provide medical services rela...,"(abortion, clinics, provide, medical, services..."
2,Accounting & Bookkeeping Services,Accounting & Bookkeeping Services encompass a ...,accounting bookkeeping services encompass rang...,"(accounting, bookkeeping, services, encompass,..."
3,Acupuncture clinic,An Acupuncture clinic is a healthcare facility...,acupuncture clinic healthcare facility special...,"(acupuncture, clinic, healthcare, facility, sp..."
4,Adhesives & Sealants,Adhesives & Sealants encompass a diverse range...,adhesives sealants encompass diverse range pro...,"(adhesives, sealants, encompass, diverse, rang..."
...,...,...,...,...
598,Work Clothing & Protection Equipment,Work Clothing & Protection Equipment pertains ...,work clothing protection equipment pertains sp...,"(work, clothing, protection, equipment, pertai..."
599,Writers & Copywriters,Writers & Copywriters are skilled professional...,writers copywriters skilled professionals spec...,"(writers, copywriters, skilled, professionals,..."
600,YMCA Camps,YMCA Camps offer structured programs and activ...,ymca camps offer structured programs activitie...,"(ymca, camps, offer, structured, programs, act..."
601,Yoga Studios,Yoga Studios offer a tranquil space for indivi...,yoga studios offer tranquil space individuals ...,"(yoga, studios, offer, tranquil, space, indivi..."


In [5]:
def find_best_match(cat_desc_vec, label_vectors):
    # Calculate similarities and return the index of the highest similarity
    similarities = [
        cat_desc_vec.similarity(label_desc) for label_desc in label_vectors
    ]
    return similarities.index(max(similarities))


# Convert domain vectors to a list for efficiency
label_vectors = domains_df["description_vector"].tolist()

# Find the best match for each category
categories_df["best_match_label_index"] = categories_df["description_vector"].apply(
    lambda x: find_best_match(x, label_vectors)
)

# Map the index to the actual domain name
categories_df["best_match_label"] = categories_df["best_match_label_index"].apply(
    lambda x: domains_df.iloc[x]["naics_label"]
)

In [14]:
categories_df.to_csv("data/categories_with_best_match.csv", index=False)

In [7]:
business_df[:5]

Unnamed: 0,commercial_name,business_tags,short_description,description,main_business_category
0,White Horse,Tile Manufacturing | European Aesthetics Ceram...,White Horse is highly regarded as a tile trail...,White Horse Ceramic Singapore is a leading man...,Tile Store
1,Wealth Solution Partners,Super and SMSF Services | Financial Planning a...,"WSP, Wealth Solution Partners, Financial Plann...",Wealth Solution Partners Pty Ltd is an indepen...,Investment Consultants & Financial Advisors
2,PMG,Fire and Water Cleanup Services | Mold Remedia...,PMG General Solutions Inc. is an environmental...,PMG General Solutions Inc. is an environmental...,Damage Restoration & Mold Remediation
3,TMP Capital PLLC,Licensed in AL & FL | 203K Loans | 15-year Fix...,TMP Capital PLLC Consulting Company Franklin M...,"TMP Capital PLLC Consulting Company, also know...",Mortgage Brokers
4,Genertek Power,Industrial and Commercial Energy Storage | Ass...,Genertek Power Ltd a UK electricity systems & ...,Genertek Power Limited is a privately-owned UK...,Renewable energy companies


In [8]:
final_data = pd.merge(business_df, categories_df[['label','best_match_label_index','best_match_label']], left_on="main_business_category",right_on="label")
final_data.drop(columns=['label'], inplace=True)

In [9]:
final_data

Unnamed: 0,commercial_name,business_tags,short_description,description,main_business_category,best_match_label_index,best_match_label
0,White Horse,Tile Manufacturing | European Aesthetics Ceram...,White Horse is highly regarded as a tile trail...,White Horse Ceramic Singapore is a leading man...,Tile Store,31,Furniture and Related Product Manufacturing
1,Wealth Solution Partners,Super and SMSF Services | Financial Planning a...,"WSP, Wealth Solution Partners, Financial Plann...",Wealth Solution Partners Pty Ltd is an indepen...,Investment Consultants & Financial Advisors,64,"Securities, Commodity Contracts, and Other Fin..."
2,PMG,Fire and Water Cleanup Services | Mold Remedia...,PMG General Solutions Inc. is an environmental...,PMG General Solutions Inc. is an environmental...,Damage Restoration & Mold Remediation,84,Repair and Maintenance
3,TMP Capital PLLC,Licensed in AL & FL | 203K Loans | 15-year Fix...,TMP Capital PLLC Consulting Company Franklin M...,"TMP Capital PLLC Consulting Company, also know...",Mortgage Brokers,64,"Securities, Commodity Contracts, and Other Fin..."
4,Genertek Power,Industrial and Commercial Energy Storage | Ass...,Genertek Power Ltd a UK electricity systems & ...,Genertek Power Limited is a privately-owned UK...,Renewable energy companies,22,Chemical Manufacturing
...,...,...,...,...,...,...,...
9995,Racing Duel,Military Games | Handball Manager | Boxing Due...,Racing Duel is an online racing game where you...,Racing Duel is an online racing game that allo...,Race Tracks & Racing Supplies,81,"Amusement, Gambling, and Recreation Industries"
9996,Triton Graphite,Graphite Products Manufacturer | Graphite Rods...,Industrial Graphite Products manufacturers - T...,"Triton Graphite is a well-known manufacturer, ...","Minerals, Earths & Ores",24,Nonmetallic Mineral Product Manufacturing
9997,NW Diamond painting,Sealing Services | Drywall Repair Services | P...,In addition to standard interior and exterior ...,NW Diamond Painting LLC is a professional pain...,"Painting, Plastering & Wall Covering",31,Furniture and Related Product Manufacturing
9998,TMF Tires,Wheel-related Products and Services | Automoti...,Trying to find a Tire Dealer and Service Provi...,TMF Tires Inc is a tire dealer located in Stua...,"Breaks, Wheels & Tire Dealers",36,Motor Vehicle and Parts Dealers


In [10]:
final_data.to_csv('labelled_data.csv', index=False)


In [11]:
final_data[final_data['main_business_category'] == 'Beauty Salons']


Unnamed: 0,commercial_name,business_tags,short_description,description,main_business_category,best_match_label_index,best_match_label
243,CARIDAD VIDRO,On-location Services for Private Events | Brid...,"Welcome to CARIDAD VIDRO SHOP, wedding favors ...",Caridad VIDRO is a professional hair and makeu...,Beauty Salons,41,Health and Personal Care Retailers
680,Azalea Beauty Consultancy India,Sourcing and Purchasing Services | Layout Plan...,Initial and ongoing training and support for S...,Azalea Beauty Consultancy India is a company t...,Beauty Salons,41,Health and Personal Care Retailers
1469,BEAUTIFUL REMEDY,Waxing Services | Permanent Cosmetics | Facial...,BEAUTIFUL REMEDY is a beauty salon based in Ka...,Beautiful Remedy LLC is a beauty company that ...,Beauty Salons,41,Health and Personal Care Retailers
1674,Sogno The Salon Company,Personal Care Services | Hair Coloring Service...,Sogno The Salon Company is a beauty salon base...,Sogno The Salon Company is a beauty salon loca...,Beauty Salons,41,Health and Personal Care Retailers
2244,Eman Beauty Lab,Beauty Treatments | Beauty Services | Retail |...,"Welcome to Eman Beauty Lab, our Nail Salon in ...",Eman Beauty Lab is a nail salon located in Asc...,Beauty Salons,41,Health and Personal Care Retailers
2535,Khubsoorat,Shellac Pedicure | Lycon Precision Waxing | La...,After your Khubsoorat Hair & Beauty experience...,Khubsoorat Hair & Beauty is a salon located in...,Beauty Salons,41,Health and Personal Care Retailers
2577,LuLu Lavender nail,Quick Dry Top Coat | Massage Services | Person...,LuLu Lavender nail boutique - Best nail boutiq...,LuLu Lavender Nail Boutique is a luxury nail s...,Beauty Salons,41,Health and Personal Care Retailers
3064,ERVA,Signature Treatments | Beauty and Wellness Ser...,"Non-toxic, vegan and cruelty-free nails. Welco...",ERVA is a boutique nail salon located in Burle...,Beauty Salons,41,Health and Personal Care Retailers
3271,Move Manicure,Gel Pedicure Classic | Gel Manicure and Pedicu...,Move Manicure is a mobile nail salon located i...,Move Manicure is a mobile nail salon based in ...,Beauty Salons,41,Health and Personal Care Retailers
3337,GLOSSLAB,Personal Care Services | Performance-based Pol...,Glosslab is a membership-based nail studio ope...,Glosslab is a nail salon that offers unlimited...,Beauty Salons,41,Health and Personal Care Retailers


In [12]:
description = "NU U 2 Hair & Beauty is a professional hair and beauty salon located in Milton Regis High Street, Sittingbourne. They offer a variety of hair and body treatments, including hair cutting and styling, as well as other beauty treatments such as waxing, eye treatments, facials, massage, body treatments and permanent make-up."

In [13]:
final_data["clean_description"] = final_data["description"].apply(
    lambda x: clean_text(x)
)

final_data["description_vector"] = final_data["clean_description"].apply(nlp)

KeyboardInterrupt: 

In [None]:
final_data

In [8]:
import pandas as pd

# Load the datasets
test_df = pd.read_csv("data/data.csv",nrows=105)
dataset = pd.read_csv("data/tournament_hints_data.csv",nrows=10000)

# Drop or fill NaN values


In [9]:
from fuzzywuzzy import process


# Function to find best match for each item in round_1
def find_best_matches(row, choices):
    print("row: ", row)
    best_match = process.extractOne(row, choices)
    return best_match


# Apply the function to each element in the round_1 column
test_df["best_match"] = test_df["round_1"].apply(
    find_best_matches, choices=dataset["commercial_name"].tolist()
)

# Split the tuple of best match into separate columns if needed
test_df[["match_name", "match_score", "match_index"]] = pd.DataFrame(
    test_df["best_match"].tolist(), index=test_df.index
)

row:  Advisory Care
row:  PPK Mining Equipment
row:  First Tracks Lawn Care
row:  New Golden Furnishers
row:  Lilac City Septic
row:  TTT Industries
row:  Rocks on the River
row:  Cake Me Away
row:  Saadzoi Law
row:  Harlequin Designs
row:  OIC DEVAGIRI
row:  Reliable IT School
row:  Oblique Aqua
row:  Water N Fire Recovery
row:  Venus Cosmetic Surgery
row:  Lavish Looks
row:  Smart Training and Consultancy
row:  Kids & Giggles
row:  Fcoustic
row:  Accurate Profile Pipe Fabrication
row:  Dr. Stitch
row:  Maveric Medical
row:  Graham Tree Service
row:  The Spicy Shark
row:  Pedsplus Pediatrics
row:  Cahoots
row:  JPS International
row:  The Cotswold Roof Company
row:  The Cotswold Roof Company
row:  Maggie Music
row:  Extreme Auto
row:  Mineral Lake Lodge
row:  GOODBEATS
row:  Studio D
row:  Springfield Painting Services
row:  3 Brothers Electric
row:  Valley Cycle Locksmiths
row:  Skim Steezy
row:  Central Seafoods
row:  Avertro
row:  Hickory Knoll Architects
row:  Quest Connect
row:  

ValueError: Columns must be same length as key