In [5]:
import numpy as np
import pandas as pd
from scipy.sparse import hstack, vstack

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans

import warnings
# Install a global "ignore" filter: with no category given it applies to every
# Warning subclass, so nothing is reported for the rest of the session.
# NOTE(review): this also hides library warnings that can point at real
# problems (e.g. pandas chained-assignment warnings) — consider narrowing the
# filter to specific categories.
warnings.simplefilter(action='ignore')

In [6]:
# Load the company records, discard every row that has a missing value in any
# column, and renumber the surviving rows from 0 (the old index is dropped).
data = (
    pd.read_csv("data/ml_insurance_challenge.csv")
    .dropna()
    .reset_index(drop=True)
)
data

Unnamed: 0,description,business_tags,sector,category,niche
0,Welchcivils is a civil engineering and constru...,"['Construction Services', 'Multi-utilities', '...",Services,Civil Engineering Services,Other Heavy and Civil Engineering Construction
1,"Kyoto Vegetable Specialists Uekamo, also known...","['Wholesale', 'Dual-task Movement Products', '...",Manufacturing,Fruit & Vegetable - Markets & Stores,"Frozen Fruit, Juice, and Vegetable Manufacturing"
2,Loidholdhof Integrative Hofgemeinschaft is a c...,"['Living Forms', 'Farm Cafe', 'Fresh Coffee', ...",Manufacturing,Farms & Agriculture Production,All Other Miscellaneous Crop Farming
3,PATAGONIA Chapa Y Pintura is an auto body shop...,"['Automotive Body Repair Services', 'Interior ...",Services,Auto Body Shops,"Automotive Body, Paint, and Interior Repair an..."
4,Stanica WODNA PTTK Swornegacie is a cultural e...,"['Cultural Activities', 'Accommodation Service...",Services,Boat Tours & Cruises,"Scenic and Sightseeing Transportation, Water"
...,...,...,...,...,...
9450,"Anhui Zhongxin Electric Co., Ltd. is a high-te...","['Automation Equipment', 'Technical Consulting...",Manufacturing,Electric Supplies & Power Generation,All Other Miscellaneous Electrical Equipment a...
9451,"TP Material Co.,Ltd. is a company based in the...","['Construction Materials Supplier', 'Construct...",Services,Construction Services,Commercial and Institutional Building Construc...
9452,Aladiner Cherag is a company that offers a var...,"['Fruit And Vegetables', 'Hand Wash Products',...",Manufacturing,Dairy Products - Farms & Stores,Fluid Milk Manufacturing
9453,Candor Eeg is a medical care company located i...,"['Stress Tests', 'Hyperventilation', 'Holter M...",Services,Radiology Clinic,Diagnostic Imaging Centers


In [7]:
# Read the taxonomy file and keep only its "label" column as a plain Python
# list; true_k is the number of entries in that list.
labels = list(pd.read_csv("data/insurance_taxonomy.csv")["label"])
true_k = len(labels)
labels

['Agricultural Equipment Services',
 'Soil Nutrient Application Services',
 'Pesticide Application Services',
 'Ornamental Plant Nurseries',
 'Landscaping Services',
 'Gardening Services',
 'Tree Services - Pruning / Removal',
 'Veterinary Services',
 'Veterinary Clinics',
 'Pet Boarding Services',
 'Animal Day Care Services',
 'Pet Grooming Services',
 'Animal Training Services',
 'Veterinary Health Centers',
 'Animal Trainers',
 'Livestock Dealer Services',
 'Timber Harvesting Operations',
 'Fishing and Hunting Services',
 'Well Maintenance Services',
 'Field Welding Services',
 'Sand and Gravel Mining',
 'Residential Driveway Construction',
 'Commercial Driveway Construction',
 'Fencing Construction Services',
 'Sidewalk Construction Services',
 'Commercial Irrigation Systems',
 'Residential Drainage Systems',
 'Residential Snow Removal',
 'Commercial Snow Removal',
 'General Snow Removal Services',
 'Land Leveling Services',
 'Residential Drain Cleaning',
 'Commercial Drain Cleanin

In [8]:
# Strip the list punctuation ("[", "]" and "'") from the stringified tag lists
# so that only the comma-separated tag text remains.
# One vectorized .str.replace assigned back to the column replaces the former
# row-by-row chained assignment (data[col].iloc[i] = ...): pandas does not
# guarantee that such a write reaches `data`, and under copy-on-write it never
# does, which would silently leave the column unchanged.
data["business_tags"] = data["business_tags"].str.replace(r"[\[\]']", "", regex=True)

### Vectorize each column

Fit a separate TF-IDF vectorizer on each text column, then concatenate the resulting sparse matrices horizontally into a single feature matrix.

In [5]:
# Text columns that each get their own TF-IDF vocabulary.
var_cols = ["description", "business_tags", "sector", "category", "niche"]

# Fit one independent TfidfVectorizer per column, exactly once each. (The
# earlier loop called fit_transform a second time for every column after the
# first and discarded the first result, doubling the work.)
tfidf_blocks = [
    TfidfVectorizer(stop_words="english").fit_transform(list(data[col]))
    for col in var_cols
]

# Stack the per-column matrices side by side, in var_cols order. The CSR
# format is requested explicitly so the result type does not depend on the
# installed SciPy version's default.
X_tfidf = hstack(tfidf_blocks, format="csr")

X_tfidf

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 617366 stored elements and shape (9455, 64250)>

In [6]:
# Cluster the TF-IDF rows into true_k clusters.
# With n_init=1 there is only a single random centroid initialisation, so an
# unseeded fit produces a different partition on every run; random_state pins
# the initialisation so the cluster assignments are reproducible.
kmeans = KMeans(
    n_clusters=true_k,
    max_iter=100,
    n_init=1,
    random_state=42,
).fit(X_tfidf)

In [7]:
# Distinct cluster ids that occur in the fitted assignment, together with the
# number of rows assigned to each of them.
cluster_ids, cluster_sizes = np.unique(
    kmeans.labels_,
    return_counts=True,
)
print("Number of elements assigned to each cluster: {}".format(cluster_sizes))

Number of elements assigned to each cluster: [ 13  10 123  85  10  19  10  66  80 102  36  10  10  50  25  29  30  50
  69  85  29  20  11  29  84  74  10  65  40  39  54  19  40  60  10  50
  10  50  10  26  83  58  47  38  10  29  10  45  20  65  31  41  61  34
  57  83  56  48  30  11  53  40  30  31  38  30  36  50  49  10  42  42
  46  10  22  45  30  81  14  15  46  29  46  52  49  20   9  64  10 179
  46  20  40  37  19  98  10  30  37  20  20  29 314  20  31  42  40  19
  91  51   1  30  30  20  20  32  30  40  49  20  37  35 354  32  28  33
  22  28  46  81  21  47  27  20  12 236  31  40  12  26  28  50  20  40
  10  38  22  25  22  92  67  38  89  20  39  29 109  39  27  41  20  24
  28 177  20  20 217 101  36  24  10  31  80  39  48  27  11  20  78  66
  10  20  36   9  21  18  36  17  22  20  41  10  10  20  10  56  49  30
  30  45  14  22  33  25  10  20 104 167  19  47  30 212  20  10  21  10
  30  31  17  10]


In [8]:
# kmeans.labels_ holds zero-based cluster ids (0 .. n_clusters-1), so each id
# indexes `labels` directly. The former `labels[idx-1]` sent cluster 0 to
# labels[-1] (the last taxonomy entry) via negative-index wraparound and
# shifted every other cluster down by one.
# NOTE(review): KMeans numbers its clusters arbitrarily, so pairing cluster i
# with taxonomy label i is only a positional placeholder — confirm whether a
# similarity-based cluster-to-label matching is intended instead.
pred_idx_labels = list(kmeans.labels_)
pred_labels = [labels[idx] for idx in pred_idx_labels]

In [9]:
# Attach the predicted taxonomy label to the company records and write them out.
# assign() returns a new DataFrame, so `data` is left untouched and the cell
# can be re-run safely. Previously `res = data` only aliased `data` (the new
# column was added to the input frame itself), and the empty pd.DataFrame()
# created just before it was discarded immediately.
res = data.assign(taxonomy=pred_labels)
res.to_csv("data/Taxonomy_companies.csv")