In [49]:
import pandas as pd

# Mount Google Drive into the Colab filesystem; the data files read by the
# cells below live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [50]:
# Location of the insurance taxonomy workbook on the mounted Drive.
taxonomy_path = "/content/drive/MyDrive/Task_Data/insurance_taxonomy.xlsx"

# Load the workbook into a DataFrame; later cells read its `label` column.
taxonomy_df = pd.read_excel(taxonomy_path)

# Preview the first rows to confirm the file loaded as expected.
print(taxonomy_df.head())

                                label
0     Agricultural Equipment Services
1  Soil Nutrient Application Services
2      Pesticide Application Services
3          Ornamental Plant Nurseries
4                Landscaping Services


In [51]:
# Location of the company dataset on the mounted Drive. The same path is
# reused further down when the results are written back.
companies_path = "/content/drive/MyDrive/Task_Data/ml_insurance_challenge.csv"

# Load the companies; later cells use the `description`, `business_tags`,
# `sector`, `category` and `niche` columns.
companies_df = pd.read_csv(companies_path)

# Preview the first rows to confirm the file loaded as expected.
print(companies_df.head())

                                         description  \
0  Welchcivils is a civil engineering and constru...   
1  Kyoto Vegetable Specialists Uekamo, also known...   
2  Loidholdhof Integrative Hofgemeinschaft is a c...   
3  PATAGONIA Chapa Y Pintura is an auto body shop...   
4  Stanica WODNA PTTK Swornegacie is a cultural e...   

                                       business_tags         sector  \
0  ['Construction Services', 'Multi-utilities', '...       Services   
1  ['Wholesale', 'Dual-task Movement Products', '...  Manufacturing   
2  ['Living Forms', 'Farm Cafe', 'Fresh Coffee', ...  Manufacturing   
3  ['Automotive Body Repair Services', 'Interior ...       Services   
4  ['Cultural Activities', 'Accommodation Service...       Services   

                               category  \
0            Civil Engineering Services   
1  Fruit & Vegetable - Markets & Stores   
2        Farms & Agriculture Production   
3                       Auto Body Shops   
4                  Bo

In [52]:
import re
import string
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import nltk

# Fetch the NLTK resources used by clean_text; quiet=True suppresses the
# download log.
nltk.download('stopwords', quiet=True)
nltk.download('wordnet', quiet=True)

# Module-level helpers shared by every clean_text call, built once here
# rather than per row.
lemmatizer = WordNetLemmatizer()
# Stored as a set so the per-word membership test in clean_text is a hash lookup.
stop_words = set(stopwords.words('english'))

def clean_text(text):
    """Normalize free text into a space-separated string of lemmatized tokens.

    The text is lowercased, every character that is not an ASCII letter or
    whitespace is replaced by a space, English stop words are dropped, and the
    remaining words are lemmatized with the module-level ``lemmatizer``.

    Args:
        text: Raw text. Falsy values (``None``, ``""``) and any ``float``
            (pandas represents missing cells as float NaN) are treated as
            empty input.

    Returns:
        str: The cleaned tokens joined by single spaces, or ``""`` for empty
        input.
    """
    # Missing cells arrive from pandas as float NaN; treat them like empty text.
    if not text or isinstance(text, float):
        return ""

    # After lowercasing, anything outside a-z / whitespace (digits, punctuation,
    # accented letters) becomes a separator. This also removes every
    # punctuation character, so no separate punctuation pass is needed.
    text = re.sub(r"[^a-z\s]", " ", str(text).lower())

    # Filter stop words first, then lemmatize the survivors, in a single pass.
    return " ".join(
        lemmatizer.lemmatize(word)
        for word in text.split()
        if word not in stop_words
    )

# Clean every company description; missing descriptions become "" inside
# clean_text. The result is stored in a new column so the raw text is kept.
companies_df['description_cleaned'] = companies_df['description'].apply(clean_text)

# Preview the cleaned text.
print(companies_df['description_cleaned'].head())


0    welchcivils civil engineering construction com...
1    kyoto vegetable specialist uekamo also known i...
2    loidholdhof integrative hofgemeinschaft compan...
3    patagonia chapa pintura auto body shop located...
4    stanica wodna pttk swornegacie cultural establ...
Name: description_cleaned, dtype: object


In [53]:
from collections import Counter

def get_top_5_words(text):
    """Return the five most frequent cleaned tokens of ``text``.

    The text is first normalized with ``clean_text``; the resulting tokens are
    counted and the five most common are returned.

    Args:
        text: Raw or already-cleaned text.

    Returns:
        list[tuple[str, int]]: Up to five ``(word, count)`` pairs, most
        frequent first.
    """
    tokens = clean_text(text).split()
    return Counter(tokens).most_common(5)

# Per-company list of (word, count) pairs. The input column is already the
# output of clean_text, so get_top_5_words runs clean_text on it a second time.
companies_df['top_5_words'] = companies_df['description_cleaned'].apply(get_top_5_words)

In [58]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# How many times each text field is repeated inside `combined_text`.
# Repeating a field raises its term frequencies, and therefore its influence
# on the TF-IDF vectors built from `combined_text`.
FIELD_WEIGHTS = {
    'description_cleaned': 1,
    'business_tags': 2,
    'sector': 1,
    'category': 3,
    'niche': 1,
}


def _weighted_text(column, repeats):
    """Return a copy of ``companies_df[column]`` with each value repeated.

    Missing values are replaced by "" first so that string concatenation never
    hits a float NaN. Every repetition is followed by a single space so that
    adjacent tokens cannot fuse when the fields are concatenated.
    """
    return companies_df[column].fillna('').map(lambda value: (value + ' ') * repeats)


# Build weighted copies instead of overwriting the source columns: the cell is
# then idempotent (re-running it does not compound the repetition) and the
# original `business_tags` / `sector` / `category` / `niche` values stay intact.
weighted = {
    column: _weighted_text(column, repeats)
    for column, repeats in FIELD_WEIGHTS.items()
}

companies_df['combined_text'] = (
    weighted['description_cleaned'] +
    weighted['business_tags'] +
    weighted['sector'] + ' ' +
    weighted['category'] + ' ' +
    weighted['niche']
)

# Corpus used to learn the vocabulary and IDF weights: every taxonomy label
# followed by every company's combined text, so both sides share one feature
# space.
combined_text = list(taxonomy_df['label']) + list(companies_df['combined_text'])

# Fit once on the joint corpus (fit returns the fitted vectorizer itself).
vectorizer = TfidfVectorizer(stop_words='english').fit(combined_text)

# Project labels and companies into the shared TF-IDF space.
taxonomy_vectors = vectorizer.transform(taxonomy_df['label'])
company_vectors = vectorizer.transform(companies_df['combined_text'])

def assign_taxonomy_to_company(row):
    """Pick the taxonomy label most similar to one company row.

    The row's ``combined_text`` is extended with its top words (each repeated
    five times to emphasize them), vectorized with the fitted module-level
    ``vectorizer``, and compared against ``taxonomy_vectors`` by cosine
    similarity.

    Args:
        row: A row of ``companies_df`` providing ``combined_text`` (str) and
            ``top_5_words`` (list of ``(word, count)`` pairs).

    Returns:
        The ``label`` value of the taxonomy row with the highest similarity.
    """
    # Repeat the most frequent words so they weigh more in the TF-IDF vector.
    emphasized_words = ' '.join([word for word, _ in row['top_5_words']] * 5)
    boosted_text = row['combined_text'] + ' ' + emphasized_words

    scores = cosine_similarity(vectorizer.transform([boosted_text]), taxonomy_vectors)

    # argmax returns the first maximum, so a row with no vocabulary overlap
    # (all scores 0) resolves to the first taxonomy label.
    best_position = scores.argmax()
    return taxonomy_df.iloc[best_position]['label']

# Classify every company row-by-row (axis=1 passes each row to the function).
companies_df['assigned_taxonomy'] = companies_df.apply(assign_taxonomy_to_company, axis=1)

# Preview the raw description next to the label it was assigned.
print(companies_df[['description', 'assigned_taxonomy']].head())


                                         description  \
0  Welchcivils is a civil engineering and constru...   
1  Kyoto Vegetable Specialists Uekamo, also known...   
2  Loidholdhof Integrative Hofgemeinschaft is a c...   
3  PATAGONIA Chapa Y Pintura is an auto body shop...   
4  Stanica WODNA PTTK Swornegacie is a cultural e...   

                    assigned_taxonomy  
0  Multi-Family Construction Services  
1     Agricultural Equipment Services  
2       Community Engagement Services  
3            Interior Design Services  
4           Well Maintenance Services  


In [59]:
import pandas as pd

# Re-read the original file so the export contains the untouched source
# columns rather than the working copy in `companies_df`.
companies_df2 = pd.read_csv(companies_path)

# Column assignment aligns on the index: both frames are assumed to come from
# the same CSV with the same row order (TODO confirm no rows were dropped or
# reordered in between).
companies_df2['assigned_taxonomy'] = companies_df['assigned_taxonomy']

# NOTE(review): this overwrites the input file in place; after the first run
# the CSV on Drive already contains an `assigned_taxonomy` column.
companies_df2.to_csv(companies_path, index=False)

In [22]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# Diagnostic report: for the first ten companies, print the three taxonomy
# labels with the highest cosine similarity to the precomputed company vector.
for idx, row in companies_df.head(10).iterrows():
    # Guard clause: skip rows whose combined text is missing or blank.
    if row['combined_text'] is None or row['combined_text'].strip() == "":
        print(f"Company at index {idx} has no valid combined text.\n")
        continue

    # `idx` is the frame's index label, used here as a row position into the
    # sparse matrix (assumes the default 0..n-1 index).
    company_vector = company_vectors[idx]
    similarity_scores = cosine_similarity(company_vector, taxonomy_vectors)
    flat_scores = similarity_scores.flatten()

    # Sort ascending, reverse for descending, keep the best three.
    sorted_indices = np.argsort(flat_scores)[::-1]
    top_3_indices = sorted_indices[:3]
    top_3_labels = taxonomy_df.iloc[top_3_indices]['label'].values
    top_3_scores = flat_scores[top_3_indices]

    print(f"Company: {row['description']}")
    print("Top 3 Predicted Taxonomies with Similarity Scores:")
    for label, score in zip(top_3_labels, top_3_scores):
        print(f"{label}: {score:.4f}")

    print("\n")


Company: Welchcivils is a civil engineering and construction company that specializes in designing and building utility network connections across the UK. They offer multi-utility solutions that combine electricity, gas, water, and fibre optic installation into a single contract. Their design engineer teams are capable of designing electricity, water and gas networks from existing network connection points to meter locations at the development, as well as project management of reinforcements and diversions. They provide custom connection solutions that take into account any existing assets, maximize the usage of every trench, and meet project deadlines. Welchcivils has considerable expertise installing gas and electricity connections in a variety of market categories, including residential, commercial, and industrial projects, as as well.
Top 3 Predicted Taxonomies with Similarity Scores:
Multi-Family Construction Services: 0.1409
Gas Installation Services: 0.0954
Commercial Constructi