# FULCRUM - TOURNAMENT


In [247]:
import pandas as pd
import spacy
import re
import string
import numpy as np
import time
import requests
import json

In [248]:
# PATHS
# Input files read by the cells below; every path is relative to ./data.

# luca
# JSON lookup tables. path_luca_json backs map_bussines_to_naics and
# path_luca_json_lower backs fuzzy_match_naics_round1; path_luca_json2 is not
# referenced by the code visible in this notebook.
path_luca_json = "./data/map_business_to_naics.json"
path_luca_json2 = "./data/map_company_naics.json"
path_luca_json_lower = "./data/map_company_naics_lower.json"

# alex
# CSV that supplies the 'naics_label' column returned by the round_* functions.
dataset_path = './data/cleaned_dataset.csv'

# Precomputed embedding matrices (.npy), one per text field, that the round_*
# functions compare query embeddings against.
embeddings_commercial_name_path = './data/embeddings_commercial_name_all.npy'
embeddings_short_description_path = './data/embeddings_short_description.npy'
embeddings_description_path = './data/embeddings_description_all.npy'

# vali
# CSV of NAICS codes/labels with a 'keywords' column used for round 2 matching.
NAICS_KEYWORDS_PATH = './data/naics_summary_keywords.csv'

## Alex

In [249]:
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
# Sentence-embedding model used by round_1 / round_3 / round_4 to encode hints.
model = SentenceTransformer('all-mpnet-base-v2')


# Embedding matrices loaded from disk. The round_* functions take the argmax
# over these rows and use it positionally in `data` (data.iloc[...]), so each
# matrix is assumed to be row-aligned with the dataset — TODO confirm.
loaded_embeddings_commercial_name = np.load(embeddings_commercial_name_path)
loaded_embeddings_short_description = np.load(embeddings_short_description_path)
loaded_embeddings_description = np.load(embeddings_description_path)

# Dataset whose 'naics_label' column is returned as the prediction.
data = pd.read_csv(dataset_path)

#! Commercial name - cred

def round_1(company_name):
    """Return the NAICS label of the row whose commercial-name embedding is closest.

    ``company_name`` is encoded with the module-level ``model`` and compared,
    by cosine similarity, against every row of
    ``loaded_embeddings_commercial_name``.

    Args:
        company_name: The text to match (a single string).

    Returns:
        The ``'naics_label'`` value of the row of ``data`` at the position of
        the highest similarity.
    """
    query_embedding = model.encode([company_name])[0]

    similarities = cosine_similarity(
        [query_embedding],
        loaded_embeddings_commercial_name,
    )

    # `similarities` has a single row, so the flat argmax is the position of
    # the best-matching embedding. It is used positionally against `data`,
    # which assumes the embedding rows are aligned with the dataset rows
    # — TODO confirm.
    best_position = similarities.argmax()
    return data.iloc[best_position]['naics_label']

#! Short description

def round_3(short_description):
    """Return the NAICS label of the row whose short-description embedding is closest.

    ``short_description`` is encoded with the module-level ``model`` and
    compared, by cosine similarity, against every row of
    ``loaded_embeddings_short_description``.

    Args:
        short_description: The text to match (a single string).

    Returns:
        The ``'naics_label'`` value of the row of ``data`` at the position of
        the highest similarity.
    """
    query_embedding = model.encode([short_description])[0]

    similarities = cosine_similarity(
        [query_embedding],
        loaded_embeddings_short_description,
    )

    # `similarities` has a single row, so the flat argmax is the position of
    # the best-matching embedding. It is used positionally against `data`,
    # which assumes the embedding rows are aligned with the dataset rows
    # — TODO confirm.
    best_position = similarities.argmax()
    return data.iloc[best_position]['naics_label']

#! Description

def round_4(description):
    """Return the NAICS label of the row whose description embedding is closest.

    ``description`` is encoded with the module-level ``model`` and compared,
    by cosine similarity, against every row of
    ``loaded_embeddings_description``.

    Args:
        description: The text to match (a single string).

    Returns:
        The ``'naics_label'`` value of the row of ``data`` at the position of
        the highest similarity.
    """
    query_embedding = model.encode([description])[0]

    similarities = cosine_similarity(
        [query_embedding],
        loaded_embeddings_description,
    )

    # `similarities` has a single row, so the flat argmax is the position of
    # the best-matching embedding. It is used positionally against `data`,
    # which assumes the embedding rows are aligned with the dataset rows
    # — TODO confirm.
    best_position = similarities.argmax()
    return data.iloc[best_position]['naics_label']


## Vali - keywords for Round 2

In [250]:
# Load the NAICS table (code, label, description, summary, keywords) and
# preview its first rows.
pd_naics_keyw = pd.read_csv(NAICS_KEYWORDS_PATH)
pd_naics_keyw.head()

Unnamed: 0.1,Unnamed: 0,naics_code,naics_label,description,summary,keywords
0,0,111,Crop Production,Industries in the Crop Production subsector gr...,The Crop Production subsector comprises establ...,"Crop Production, farms, orchards, greenhouses,..."
1,1,112,Animal Production and Aquaculture,Industries in the Animal Production and Aquacu...,The Animal Production and Aquaculture subsecto...,"Animal Production, Aquaculture, Ranches, Farms..."
2,2,113,Forestry and Logging,Industries in the Forestry and Logging subsect...,The forestry and logging industries harvest ti...,"Forestry, Logging, Timber Production, Reforest..."
3,3,114,"Fishing, Hunting and Trapping","Industries in the Fishing, Hunting and Trappin...","The Fishing, Hunting and Trapping industry rel...","Fishing, Hunting, Trapping, Harvest, Wild Anim..."
4,4,115,Support Activities for Agriculture and Forestry,Industries in the Support Activities for Agric...,The Support Activities for Agriculture and For...,"Support Activities, Agriculture, Forestry, Sup..."


In [251]:
# Shared spaCy pipeline used for stop-word filtering, lemmatization,
# tokenization and Doc.similarity throughout the cells below.
nlp = spacy.load("en_core_web_lg")

def remove_mentions_and_hashtags(text):
    """Strip ``@mention`` and ``#hashtag`` tokens from ``text``.

    A token is the marker character followed by one or more word characters.
    Mentions are removed first, then hashtags; surrounding whitespace is left
    as it was.
    """
    for marker_pattern in (r"@\w+", r"#\w+"):
        text = re.sub(marker_pattern, "", text)
    return text


def remove_numbers(text):
    """Return ``text`` with every run of digits deleted."""
    digit_run = re.compile(r"\d+")
    return digit_run.sub("", text)


def remove_punctuation(text):
    """Delete every character listed in ``string.punctuation`` from ``text``."""
    # Mapping a character to None in a translation table deletes it.
    deletion_table = str.maketrans(dict.fromkeys(string.punctuation))
    return text.translate(deletion_table)


def remove_stopwords(text):
    """Return ``text`` without the tokens spaCy flags as stop words.

    The text is tokenized with the module-level ``nlp`` pipeline and the
    surviving token texts are joined with single spaces.
    """
    kept_words = [token.text for token in nlp(text) if not token.is_stop]
    return " ".join(kept_words)


def clean_text(text, to_lemmatize: bool = True):
    """Normalize free text for keyword matching.

    The steps run in this order:
      1. drop URLs (``http`` followed by non-whitespace characters),
      2. drop ``@mentions`` and ``#hashtags``,
      3. lowercase,
      4. delete punctuation,
      5. delete digits,
      6. replace every remaining non-word character with a space,
      7. drop stop words,
      8. collapse whitespace runs into a single space,
      9. lemmatize, unless ``to_lemmatize`` is false.

    Args:
        text: The raw string to clean.
        to_lemmatize: When true (the default), the cleaned text is passed
            through ``lemmatize`` before being returned.

    Returns:
        The cleaned string.
    """
    without_urls = re.sub(r"http\S+", "", text)
    lowered = remove_mentions_and_hashtags(without_urls).lower()
    stripped = remove_numbers(remove_punctuation(lowered))
    word_chars_only = re.sub(r"\W", " ", stripped)
    content_words = remove_stopwords(word_chars_only)
    normalized = re.sub(r"\s+", " ", content_words, flags=re.I)

    return lemmatize(normalized) if to_lemmatize else normalized


def lemmatize(text):
    """Return ``text`` with each spaCy token replaced by its lemma.

    Tokens come from the module-level ``nlp`` pipeline and the lemmas are
    joined with single spaces.
    """
    return " ".join(token.lemma_ for token in nlp(text))

# Calculate similarity
def calculate_similarity(keywords, tokens):
    """Return the spaCy similarity between two collections of strings.

    Each collection is joined with single spaces and parsed by the
    module-level ``nlp`` pipeline. The result is ``Doc.similarity`` of the
    keyword document against the token document.
    """
    keyword_doc, token_doc = (
        nlp(" ".join(words)) for words in (keywords, tokens)
    )
    return keyword_doc.similarity(token_doc)


def predict_industry_label(keywords, industry_data):
    """Pick the industry row whose tokens are most similar to ``keywords``.

    Args:
        keywords: Strings describing the company; passed to
            ``calculate_similarity`` together with each row's tokens.
        industry_data: DataFrame with ``'tokens'``, ``'naics_label'`` and
            ``'naics_code'`` columns.

    Returns:
        A ``(label, code, similarity)`` tuple for the best-scoring row, with
        label and code converted to ``str``.
    """
    def score_row(row_tokens):
        return calculate_similarity(keywords, row_tokens)

    scores = industry_data['tokens'].apply(score_row)

    # idxmax yields the index label of the best row; the same label addresses
    # both the score series and the frame.
    best_row = scores.idxmax()
    best_score = scores[best_row]

    best_code = str(industry_data.at[best_row, 'naics_code'])
    best_label = str(industry_data.at[best_row, 'naics_label'])

    return best_label, best_code, best_score


In [252]:

# Work on the keyword table directly; the new columns are added to the same
# DataFrame object as pd_naics_keyw.
industry_data = pd_naics_keyw

# First cleaning pass: lowercase the raw keyword strings, then normalize them.
industry_data['processed_description'] = (
    industry_data['keywords'].str.lower().apply(clean_text)
)

# Spot-check one cleaned row.
print(industry_data['processed_description'][15])

# Second cleaning pass over the already-cleaned text.
# NOTE(review): this re-runs clean_text on its own output; whether the second
# pass is intentional is not evident here — confirm before removing it.
industry_data['processed_description'] = (
    industry_data['processed_description'].apply(clean_text)
)

# Tokenize each cleaned description with spaCy; predict_industry_label reads
# this 'tokens' column.
industry_data['tokens'] = industry_data['processed_description'].apply(
    lambda cleaned: [token.text for token in nlp(cleaned)]
)

textile product mills nonapparel textile product sheet towel cut sew textile industry textile product purchase fabric cutting sewing


## Luca

In [253]:
import json
# Load the lookup table consulted by map_bussines_to_naics. The context manager
# guarantees the file handle is closed as soon as parsing finishes, and the
# encoding is pinned so the result does not depend on the platform locale.
with open(path_luca_json, encoding="utf-8") as f:
    label_dict_l = json.load(f)

def map_bussines_to_naics(taxonomy:str):
    """Look up ``taxonomy`` in ``label_dict_l``.

    Returns the stored value, or the string ``"abstain"`` when the key is
    absent.
    """
    try:
        return label_dict_l[taxonomy]
    except KeyError:
        return "abstain"

In [254]:
from rapidfuzz import process, fuzz

def read_map(path_json: str):
    """Load and return the JSON document stored at ``path_json``.

    The file is opened in a ``with`` block so the handle is always closed,
    including when parsing fails, and it is decoded as UTF-8.

    Args:
        path_json: Path to a JSON file.

    Returns:
        The decoded JSON value.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the content is not valid JSON.
    """
    with open(path_json, encoding="utf-8") as f:
        return json.load(f)

# Lookup table whose keys are the fuzzy-match candidates in
# fuzzy_match_naics_round1 and whose values are returned as the prediction.
# Presumably the keys are lowercased company names (the path ends in
# "_lower.json" and the live caller lowercases its query) — verify against the
# file.
map_company_naics = read_map(path_luca_json_lower)

def fuzzy_match_naics_round1(company_name, threshold: int = 50):
    """Return the ``map_company_naics`` entry for the closest known company name.

    ``company_name`` is fuzzy-matched (``fuzz.WRatio``) against the keys of
    ``map_company_naics``.

    Args:
        company_name: The name to look up.
        threshold: Minimum ``WRatio`` score a candidate must reach to be
            accepted. It is forwarded to ``extractOne`` as ``score_cutoff``.

    Returns:
        The value stored in ``map_company_naics`` for the best-matching key,
        or ``"abstain"`` when no key reaches ``threshold`` (which includes the
        case of an empty map).
    """
    best_match = process.extractOne(
        company_name,
        map_company_naics.keys(),
        scorer=fuzz.WRatio,
        score_cutoff=threshold,
    )

    # extractOne returns None when nothing scores at least `score_cutoff`;
    # answer "abstain" (the same fallback as map_bussines_to_naics) instead of
    # failing on None[0].
    if best_match is None:
        return "abstain"

    matched_name = best_match[0]
    return map_company_naics[matched_name]

## API calls

In [255]:
# base_url = 'http://116.202.111.229:8000'
# api_key = 'VPLrVk4hSZMdGrW2wAP0GTpsV2Jsdx5Z'

# headers = {
#     'x-api-key': api_key
# }

# rounds = 5
# current_round = 1

# filename = 'extra_data.csv'
# data_columns = ['commercial_name', 'business_tags', 'short_descripti', 'description', 'main_business_category', 'naics']

# index = 0

# count_abstain = 0
# guesses = ["", "", "", "", ""]

# # Get a new hint for current company or get the first hint for a new company after calling /evaluate/reset
#     response = requests.get(f"{base_url}/evaluate/hint", headers=headers)

#     print(response.status_code, response.json())
    
#     # Get the hint
#     hint = response.json()['hint']

#     # predict based off given hint
#     time.sleep(1)
    
#     [final_answer, final_code, final_similarity] = ["abstain", "?", "?"]
    
#     if current_round == 1:
#         # final_answer = round_1(hint)
#         final_answer = fuzzy_match_naics_round1(hint)
#     elif current_round == 2:
#         input_keywords = hint.split("|")
    
#         print("Keywords: ", input_keywords)
#         [final_answer, final_code, final_similarity] = predict_industry_label(input_keywords, industry_data)
#     # elif current_round == 3:
#     #     final_answer = round_3(hint)
#     elif current_round == 3:
#         # get the first 3 noun chunks from the text using spacy
#         doc = nlp(hint)
#         input_keywords = [chunk.text for chunk in doc.noun_chunks][:3]
#         [final_answer, final_code, final_similarity] = predict_industry_label(input_keywords, industry_data)
#     elif current_round == 4:
#         # final_answer = round_4(hint)
#         final_answer = "abstain"
#     else:
#         final_answer = map_bussines_to_naics(hint)
        
#     guesses[current_round - 1] = final_answer
    
#     # if count_abstain == 0 and current_round >= 3 and guesses[current_round - 2] != final_answer:
#     #     count_abstain = count_abstain + 1
#     #     final_answer = "abstain"

#     # Post your answer for current hint
#     data_send = {
#         'answer': final_answer
#     }
    
#     print("Guess: ", final_answer)
#     if current_round == 2:
#         print("Code: ", final_code)
#         print("Similarity: ", final_similarity)
    
#     response = requests.post(f"{base_url}/evaluate/answer", json=data_send, headers=headers)

#     print(response.status_code, response.json())
    
#     print("")
    
#     current_round = current_round + 1
#     time.sleep(1)

# # Get hints about a new company
# current_round = 1
# index = index + 1

# response = requests.get(f"{base_url}/evaluate/reset", headers=headers)

# print(response.status_code, response.json())


In [355]:
# Scratch check of the two matchers on a hospital-like input.
# Fuzzy company-name lookup:
testtesttest = fuzzy_match_naics_round1("Hospitals & others")
print(testtesttest)
# Keyword-similarity lookup; only the predicted label is kept, the code and
# similarity score are discarded.
input_keywordszzz = ["Hospitals", "medic"]
[test2, _, __] = predict_industry_label(input_keywordszzz, industry_data)
print(test2)

Professional, Scientific, and Technical Services
Hospitals


In [372]:
# Evaluation server endpoint and credentials.
# NOTE(review): the API key is hard-coded in the notebook and the endpoint is
# plain HTTP — consider loading the key from the environment and rotating it
# if this file is shared.
base_url = 'http://116.202.111.229:8000'
api_key = 'VPLrVk4hSZMdGrW2wAP0GTpsV2Jsdx5Z'

# Sent with every request to the evaluation server.
headers = {
    'x-api-key': api_key
}

# Round bookkeeping: current_round selects the prediction strategy in the
# hint/answer cell and is advanced there after each answer.
rounds = 5
current_round = 1

# NOTE(review): 'short_descripti' looks truncated; confirm it matches the real
# column name before these two values are used.
filename = 'extra_data.csv'
data_columns = ['commercial_name', 'business_tags', 'short_descripti', 'description', 'main_business_category', 'naics']

index = 0

count_abstain = 0
# One slot per round; the hint/answer cell stores each round's answer here.
guesses = ["", "", "", "", ""]



In [377]:

# Fetch the next hint for the current company (or the first hint of a new
# company after /evaluate/reset has been called).
response = requests.get(f"{base_url}/evaluate/hint", headers=headers)

# Decode the body once and reuse it for both logging and hint extraction.
hint_payload = response.json()
print(response.status_code, hint_payload)

# NOTE(review): the sample response printed by this cell also carries a
# 'level' field, while the round is tracked locally in `current_round`; the
# two can drift if cells are re-run out of order — confirm whether 'level'
# should drive the dispatch below.
hint = hint_payload['hint']

# Pause before predicting (presumably to stay under the API rate limit).
time.sleep(1)

# Defaults for rounds that do not produce a code / similarity score.
final_answer, final_code, final_similarity = "abstain", "?", "?"

if current_round == 1:
    # Fuzzy-match the lowercased hint against the known company names
    # (round_1(hint) is the embedding-based alternative).
    final_answer = fuzzy_match_naics_round1(hint.lower())
elif current_round == 2:
    # The hint is a "|"-separated keyword list.
    input_keywords = hint.split("|")

    print("Keywords: ", input_keywords)
    final_answer, final_code, final_similarity = predict_industry_label(input_keywords, industry_data)
elif current_round == 3:
    # Use the first 3 noun chunks of the hint as keywords
    # (round_3(hint) is the embedding-based alternative).
    doc = nlp(hint)
    input_keywords = [chunk.text for chunk in doc.noun_chunks][:3]
    final_answer, final_code, final_similarity = predict_industry_label(input_keywords, industry_data)
elif current_round == 4:
    # Always abstain in this round (round_4(hint) is the embedding-based
    # alternative).
    final_answer = "abstain"
else:
    # Last round: direct dictionary lookup of the hint.
    final_answer = map_bussines_to_naics(hint)

guesses[current_round - 1] = final_answer

# Post the answer for the current hint.
data_send = {
    'answer': final_answer
}

response = requests.post(f"{base_url}/evaluate/answer", json=data_send, headers=headers)

print(response.status_code, response.json())

print("Guess: ", final_answer)
if current_round == 2:
    print("Code: ", final_code)
    print("Similarity: ", final_similarity)

# Advance to the next round, wrapping back to 1 after the last one.
current_round += 1
if current_round > rounds:
    current_round = 1


200 {'company_id': 38, 'level': 5, 'hint': 'Bakeries & Desserts'}
200 {'response': 'ack'}
Guess:  Food and Beverage Retailers


In [214]:

# Ask the evaluation server to reset, so the next /evaluate/hint call starts a
# new company.
# NOTE(review): `current_round` is not reset here, and the sample output shows
# the server can answer with a "Wait for N seconds" message instead —
# presumably a cooldown; confirm the local round counter stays in sync.
response = requests.get(f"{base_url}/evaluate/reset", headers=headers)

print(response.status_code, response.json())


200 {'response': 'Wait for 226.31919360160828 seconds'}
