In [41]:
import pandas as pd
import spacy
from sklearn.metrics.pairwise import cosine_similarity
import string

# Read the inputs: the first 1000 rows of the hints CSV and the taxonomy workbook
categories_df = pd.read_csv(
    "data/tournament_hints_data.csv",
    nrows=1000,
)
domains_df = pd.read_excel(
    "data/Naics3_taxonomy.xlsx",
    engine="openpyxl",
)

# Large English spaCy pipeline, loaded once and reused for all text processing
nlp = spacy.load("en_core_web_lg")

def clean_text(text):
    """Lowercase, tokenize and lemmatize ``text``, dropping noise tokens.

    The text is lowercased and run through the module-level spaCy pipeline
    ``nlp``. Stop words, punctuation and whitespace tokens are discarded and
    the lemmas of the remaining tokens are joined with single spaces.

    Args:
        text: The raw string to clean.

    Returns:
        The cleaned, space-separated lemma string. Anything that is not a
        ``str`` (for example the NaN pandas uses for a missing cell) is
        returned unchanged, so that a later ``dropna()`` can still detect it.
    """
    if not isinstance(text, str):
        # Missing cells arrive as float NaN / None; calling .lower() on them would raise.
        return text

    doc = nlp(text.lower())
    # Use spaCy's token flags rather than `token.text in string.punctuation`:
    # that was a substring test against one long string, so multi-character
    # punctuation such as "..." and non-ASCII quotes/dashes slipped through.
    return " ".join(
        token.lemma_
        for token in doc
        if not (token.is_stop or token.is_punct or token.is_space)
    )


# Normalize the taxonomy descriptions, overwriting the original column
domains_df["description"] = domains_df["description"].map(clean_text)

# Free-text columns of categories_df that receive the same normalization
text_columns = ["commercial_name", "business_tags", "short_description", "description"]

# Replace every listed column with its cleaned counterpart
for column in text_columns:
    categories_df[column] = categories_df[column].map(clean_text)

KeyboardInterrupt: 

In [33]:
# Peek at the first row of the cleaned taxonomy frame
domains_df.head(1)

Unnamed: 0,naics_code,naics_label,description
0,111,Crop Production,industry crop production subsector grow crop m...


In [32]:
# Peek at the first row of the cleaned categories frame
categories_df.head(1)

Unnamed: 0,commercial_name,business_tags,short_description,description,main_business_category
0,white horse,tile manufacturing european aesthetic ceramic ...,white horse highly regard tile trailblazer des...,white horse ceramic singapore lead manufacture...,tile store


In [34]:
# Function to preprocess and create vectors for different levels of detail
import numpy as np
def preprocess_and_vectorize(dataframe, columns):
    """Build a combined-text column and its spaCy document vector for each row.

    Args:
        dataframe: Source DataFrame; it is left untouched.
        columns: List of column labels whose values are concatenated per row.

    Returns:
        A copy of ``dataframe`` with two extra columns: ``combined_text``
        (the non-null values of ``columns`` joined by single spaces) and
        ``text_vector`` (``nlp(combined_text).vector``).
    """
    # Work on a copy. Writing into the caller's frame made every result of
    # this function an alias of the same object, so vectorizing a second
    # column set silently overwrote the columns of the first result.
    result = dataframe.copy()
    result["combined_text"] = result[columns].apply(
        lambda row: " ".join(row.dropna().astype(str)), axis=1
    )
    result["text_vector"] = result["combined_text"].apply(
        lambda text: nlp(text).vector
    )
    return result


# Pick the closest domain (by cosine similarity) for a single startup entry
def find_most_similar_domain(entry_vector, domain_vectors):
    """Return the position of the domain vector most similar to ``entry_vector``.

    Args:
        entry_vector: Embedding of one entry (assumed to be a 1-D vector).
        domain_vectors: Sequence of domain embeddings; the returned position
            refers to this sequence's order.

    Returns:
        Index (NumPy integer) of the domain with the highest cosine
        similarity; when several share the maximum, the first one wins.
    """
    # Stack the individual domain vectors into a single array
    candidates = np.array(domain_vectors)
    # The entry is wrapped in a list so it is scored as a one-row batch
    scores = cosine_similarity([entry_vector], candidates)
    best_index = scores[0].argmax()
    return best_index

In [35]:
def _keep_content_lemmas(text):
    """Join the lemmas of all tokens that are neither punctuation nor stop words."""
    return " ".join(
        token.lemma_ for token in nlp(text) if not (token.is_punct or token.is_stop)
    )


# Lemmatized form of every domain description
domains_df["lemmatized_description"] = domains_df["description"].map(
    _keep_content_lemmas
)
# spaCy document vector of each lemmatized description
domains_df["vector"] = domains_df["lemmatized_description"].map(
    lambda text: nlp(text).vector
)
# Plain list of the vectors, in row order, for the similarity lookups
domain_vectors = list(domains_df["vector"])

In [36]:
# Match on the commercial name alone
name_df = preprocess_and_vectorize(categories_df, ["commercial_name"])

# For every row, look up the label of the domain whose vector is most similar
name_df["domain_match"] = name_df["text_vector"].map(
    lambda vector: domains_df["naics_label"].iloc[
        find_most_similar_domain(vector, domain_vectors)
    ]
)
print(name_df[["commercial_name", "domain_match"]])

                     commercial_name  \
0                        white horse   
1            wealth solution partner   
2                                pmg   
3                   tmp capital pllc   
4                     genertek power   
..                               ...   
995                    recovery shop   
996             uncle sam healthcare   
997                  daughtry family   
998  mango computer solution pvt ltd   
999                       truck deck   

                                          domain_match  
0    Clothing, Clothing Accessories, Shoe, and Jewe...  
1          Funds, Trusts, and Other Financial Vehicles  
2                                      Crop Production  
3                    Monetary Authorities-Central Bank  
4    Electrical Equipment, Appliance, and Component...  
..                                                 ...  
995  Building Material and Garden Equipment and Sup...  
996                                 Private Households  
997   

In [37]:
name_df

Unnamed: 0,commercial_name,business_tags,short_description,description,main_business_category,combined_text,text_vector,domain_match
0,white horse,tile manufacturing european aesthetic ceramic ...,white horse highly regard tile trailblazer des...,white horse ceramic singapore lead manufacture...,tile store,white horse,"[-4.142585, 0.19731499, -4.223, 1.0798, 1.4784...","Clothing, Clothing Accessories, Shoe, and Jewe..."
1,wealth solution partner,super smsf service financial planning investme...,wsp wealth solution partner financial planning...,wealth solution partner pty ltd independent fi...,investment consultant financial advisor,wealth solution partner,"[0.42037, -0.99176663, -1.6956133, 1.1250666, ...","Funds, Trusts, and Other Financial Vehicles"
2,pmg,fire water cleanup service mold remediation re...,pmg general solutions inc environmental remedi...,pmg general solutions inc environmental remedi...,damage restoration mold remediation,pmg,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",Crop Production
3,tmp capital pllc,license al fl 203k loan 15 year fix mortgage m...,tmp capital pllc consult company franklin mort...,tmp capital pllc consult company know franklin...,mortgage broker,tmp capital pllc,"[-0.017940005, 0.539986, -0.56526, 1.4054074, ...",Monetary Authorities-Central Bank
4,genertek power,industrial commercial energy storage asset man...,genertek power ltd uk electricity systems uk i...,genertek power limited privately own uk invest...,renewable energy company,genertek power,"[-0.9067, 1.1018, -3.06825, 3.0983, 2.4848, 0....","Electrical Equipment, Appliance, and Component..."
...,...,...,...,...,...,...,...,...
995,recovery shop,leather coaster meditation book recovery item ...,recovery shop stop shop meet material recovery...,recovery shop company specialize provide meeti...,book store,recovery shop,"[-0.26645, -2.71437, -2.82695, 1.791085, 1.337...",Building Material and Garden Equipment and Sup...
996,uncle sam healthcare,benefit exploration consultation licensed medi...,uncle sam healthcare team independent medicare...,uncle sam healthcare team independent medicare...,health insurance agency,uncle sam healthcare,"[-0.35315666, 0.9658833, -2.7236664, -0.607056...",Private Households
997,daughtry family,furniture retailer lighting fixture retailer,daughtry family furniture store base miami flo...,daughtry family llc company specialize furnitu...,furniture store,daughtry family,"[-0.95025, -0.6224, -3.33935, -1.9578, 2.63755...",Private Households
998,mango computer solution pvt ltd,home automation office installation service ga...,mango computer solution pvt ltd computer repai...,mango computer solution pvt ltd computer repai...,phone repair service,mango computer solution pvt ltd,"[1.570874, -1.0392601, -0.46970206, 0.53459746...",Computer and Electronic Product Manufacturing


In [38]:
# Match on the commercial name together with the business tags
name_tags_df = preprocess_and_vectorize(categories_df, ["commercial_name", "business_tags"])
name_tags_df["domain_match"] = name_tags_df["text_vector"].apply(
    lambda x: domains_df.iloc[find_most_similar_domain(x, domain_vectors)][
        "naics_label"
    ]
)
# Display the frame built in this cell (previously `name_df`, a copy-paste slip)
name_tags_df

Unnamed: 0,commercial_name,business_tags,short_description,description,main_business_category,combined_text,text_vector,domain_match
0,white horse,tile manufacturing european aesthetic ceramic ...,white horse highly regard tile trailblazer des...,white horse ceramic singapore lead manufacture...,tile store,white horse tile manufacturing european aesthe...,"[-2.852624, -0.8655327, -2.2241113, 1.4550223,...",Wood Product Manufacturing
1,wealth solution partner,super smsf service financial planning investme...,wsp wealth solution partner financial planning...,wealth solution partner pty ltd independent fi...,investment consultant financial advisor,wealth solution partner super smsf service fin...,"[-0.9210007, -0.61809033, -1.6261929, 0.381453...",Personal and Laundry Services
2,pmg,fire water cleanup service mold remediation re...,pmg general solutions inc environmental remedi...,pmg general solutions inc environmental remedi...,damage restoration mold remediation,pmg fire water cleanup service mold remediatio...,"[-0.84096384, 0.29814675, -2.4269829, 1.271212...",Utilities
3,tmp capital pllc,license al fl 203k loan 15 year fix mortgage m...,tmp capital pllc consult company franklin mort...,tmp capital pllc consult company know franklin...,mortgage broker,tmp capital pllc license al fl 203k loan 15 ye...,"[-0.24779387, 0.11646733, -3.197701, -0.786332...",Credit Intermediation and Related Activities
4,genertek power,industrial commercial energy storage asset man...,genertek power ltd uk electricity systems uk i...,genertek power limited privately own uk invest...,renewable energy company,genertek power industrial commercial energy st...,"[-1.7179434, -0.80020165, -3.2156904, 1.434138...",Utilities
...,...,...,...,...,...,...,...,...
995,recovery shop,leather coaster meditation book recovery item ...,recovery shop stop shop meet material recovery...,recovery shop company specialize provide meeti...,book store,recovery shop leather coaster meditation book ...,"[-0.74802256, -0.7864275, -1.0723927, -0.15110...",Printing and Related Support Activities
996,uncle sam healthcare,benefit exploration consultation licensed medi...,uncle sam healthcare team independent medicare...,uncle sam healthcare team independent medicare...,health insurance agency,uncle sam healthcare benefit exploration consu...,"[-0.54308057, 0.58594203, -2.0252416, -0.24553...",Ambulatory Health Care Services
997,daughtry family,furniture retailer lighting fixture retailer,daughtry family furniture store base miami flo...,daughtry family llc company specialize furnitu...,furniture store,daughtry family furniture retailer lighting fi...,"[-1.8651143, -1.1720501, -3.2690628, -0.158781...",Building Material and Garden Equipment and Sup...
998,mango computer solution pvt ltd,home automation office installation service ga...,mango computer solution pvt ltd computer repai...,mango computer solution pvt ltd computer repai...,phone repair service,mango computer solution pvt ltd home automatio...,"[-0.51764786, 0.112583615, -1.4089758, 0.71620...","Furniture, Home Furnishings, Electronics, and ..."


In [39]:
# Match on the commercial name, business tags and short description
name_tags_short_df = preprocess_and_vectorize(
    categories_df, ["commercial_name", "business_tags", "short_description"]
)
name_tags_short_df["domain_match"] = name_tags_short_df["text_vector"].apply(
    lambda x: domains_df.iloc[find_most_similar_domain(x, domain_vectors)][
        "naics_label"
    ]
)
# Display the frame built in this cell (previously `name_df`, a copy-paste slip)
name_tags_short_df

Unnamed: 0,commercial_name,business_tags,short_description,description,main_business_category,combined_text,text_vector,domain_match
0,white horse,tile manufacturing european aesthetic ceramic ...,white horse highly regard tile trailblazer des...,white horse ceramic singapore lead manufacture...,tile store,white horse tile manufacturing european aesthe...,"[-2.7721999, -0.6059768, -2.059296, 0.91198707...",Furniture and Related Product Manufacturing
1,wealth solution partner,super smsf service financial planning investme...,wsp wealth solution partner financial planning...,wealth solution partner pty ltd independent fi...,investment consultant financial advisor,wealth solution partner super smsf service fin...,"[-1.0157038, -0.8720501, -2.1618762, 0.4531729...","Funds, Trusts, and Other Financial Vehicles"
2,pmg,fire water cleanup service mold remediation re...,pmg general solutions inc environmental remedi...,pmg general solutions inc environmental remedi...,damage restoration mold remediation,pmg fire water cleanup service mold remediatio...,"[-0.80679107, -0.28244975, -2.1626437, 0.92423...",Utilities
3,tmp capital pllc,license al fl 203k loan 15 year fix mortgage m...,tmp capital pllc consult company franklin mort...,tmp capital pllc consult company know franklin...,mortgage broker,tmp capital pllc license al fl 203k loan 15 ye...,"[-0.24307723, -0.06344261, -2.9510522, -0.6423...",Credit Intermediation and Related Activities
4,genertek power,industrial commercial energy storage asset man...,genertek power ltd uk electricity systems uk i...,genertek power limited privately own uk invest...,renewable energy company,genertek power industrial commercial energy st...,"[-1.3261807, -0.8805708, -2.9136176, 0.8535335...",Utilities
...,...,...,...,...,...,...,...,...
995,recovery shop,leather coaster meditation book recovery item ...,recovery shop stop shop meet material recovery...,recovery shop company specialize provide meeti...,book store,recovery shop leather coaster meditation book ...,"[-0.6439705, -0.9563486, -1.4970986, 0.2868953...","Merchant Wholesalers, Nondurable Goods"
996,uncle sam healthcare,benefit exploration consultation licensed medi...,uncle sam healthcare team independent medicare...,uncle sam healthcare team independent medicare...,health insurance agency,uncle sam healthcare benefit exploration consu...,"[-0.48073655, 1.039269, -2.1673675, -0.2278548...",Ambulatory Health Care Services
997,daughtry family,furniture retailer lighting fixture retailer,daughtry family furniture store base miami flo...,daughtry family llc company specialize furnitu...,furniture store,daughtry family furniture retailer lighting fi...,"[-1.8882245, -0.7471057, -2.9546554, 0.5615974...",Building Material and Garden Equipment and Sup...
998,mango computer solution pvt ltd,home automation office installation service ga...,mango computer solution pvt ltd computer repai...,mango computer solution pvt ltd computer repai...,phone repair service,mango computer solution pvt ltd home automatio...,"[-0.27219564, 0.114344835, -1.3002203, 0.58916...","Furniture, Home Furnishings, Electronics, and ..."


In [40]:
# Match on all four text columns: name, tags, short description and description
full_detail_df = preprocess_and_vectorize(
    categories_df,
    ["commercial_name", "business_tags", "short_description", "description"],
)

# For every row, look up the label of the domain whose vector is most similar
full_detail_df["domain_match"] = full_detail_df["text_vector"].map(
    lambda vector: domains_df["naics_label"].iloc[
        find_most_similar_domain(vector, domain_vectors)
    ]
)
full_detail_df

Unnamed: 0,commercial_name,business_tags,short_description,description,main_business_category,combined_text,text_vector,domain_match
0,white horse,tile manufacturing european aesthetic ceramic ...,white horse highly regard tile trailblazer des...,white horse ceramic singapore lead manufacture...,tile store,white horse tile manufacturing european aesthe...,"[-2.0064418, -0.668667, -2.095801, 1.0821338, ...",Furniture and Related Product Manufacturing
1,wealth solution partner,super smsf service financial planning investme...,wsp wealth solution partner financial planning...,wealth solution partner pty ltd independent fi...,investment consultant financial advisor,wealth solution partner super smsf service fin...,"[-0.6285864, -0.71007794, -2.2565114, 0.459973...","Funds, Trusts, and Other Financial Vehicles"
2,pmg,fire water cleanup service mold remediation re...,pmg general solutions inc environmental remedi...,pmg general solutions inc environmental remedi...,damage restoration mold remediation,pmg fire water cleanup service mold remediatio...,"[-0.53575677, -0.29605603, -1.6930506, 0.63993...",Repair and Maintenance
3,tmp capital pllc,license al fl 203k loan 15 year fix mortgage m...,tmp capital pllc consult company franklin mort...,tmp capital pllc consult company know franklin...,mortgage broker,tmp capital pllc license al fl 203k loan 15 ye...,"[-0.3509355, -0.057785336, -2.3863144, 0.01034...",Rental and Leasing Services
4,genertek power,industrial commercial energy storage asset man...,genertek power ltd uk electricity systems uk i...,genertek power limited privately own uk invest...,renewable energy company,genertek power industrial commercial energy st...,"[-1.1194688, -0.6294623, -2.7256947, 0.6430167...",Rental and Leasing Services
...,...,...,...,...,...,...,...,...
995,recovery shop,leather coaster meditation book recovery item ...,recovery shop stop shop meet material recovery...,recovery shop company specialize provide meeti...,book store,recovery shop leather coaster meditation book ...,"[-0.65147483, -1.1647545, -1.6992173, 0.411738...","Merchant Wholesalers, Nondurable Goods"
996,uncle sam healthcare,benefit exploration consultation licensed medi...,uncle sam healthcare team independent medicare...,uncle sam healthcare team independent medicare...,health insurance agency,uncle sam healthcare benefit exploration consu...,"[-0.27387372, 1.0215404, -2.5624285, -0.183149...",Health and Personal Care Retailers
997,daughtry family,furniture retailer lighting fixture retailer,daughtry family furniture store base miami flo...,daughtry family llc company specialize furnitu...,furniture store,daughtry family furniture retailer lighting fi...,"[-1.7076392, -1.299465, -2.9072094, 0.53019834...",Building Material and Garden Equipment and Sup...
998,mango computer solution pvt ltd,home automation office installation service ga...,mango computer solution pvt ltd computer repai...,mango computer solution pvt ltd computer repai...,phone repair service,mango computer solution pvt ltd home automatio...,"[-0.16302836, 0.20585741, -1.4637133, 0.602087...","Furniture, Home Furnishings, Electronics, and ..."
