### Importing packages

In [31]:
import pandas as pd
import numpy as np

from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.metrics import silhouette_score
from scipy.spatial.distance import cdist
from difflib import SequenceMatcher, get_close_matches
import re
from tqdm import tqdm, trange
import time

import matplotlib.pyplot as plt
import seaborn as sns

pd.set_option('display.max_columns',None)

import warnings
warnings.filterwarnings('ignore')

In [40]:
prod_list = pd.read_excel('/home/natasha/Documents/Iprocure/Clustering-for-Product-Matching/data/data_v2/product_list.xlsx')
prod_list = prod_list[['Product Name', 'Type', 'Distributor', 'Product Id']]
prod_list = prod_list.drop_duplicates(subset=['Product Name', 'Type', 'Distributor']).reset_index()
prod_list.to_csv('prod_list.csv', index=False)
# prod_list

In [34]:
x = prod_list[prod_list.duplicated(subset=['Product Name', 'Type', 'Distributor'])]
x

Unnamed: 0,Category,Product Name,Type,Weight,Carton Size,Unit Cost,Distributor,Product Code,Product Id,Distributor Type,Unnamed: 10,Active,Manufacturer,Industry,Sub category,Shelf Life,Liquid/Solid,Seasonality,Season Dates,Consignment Life Span,Manufacturer Lead Time,Transport Constraints,Cold Chain,Classification,Lower Threshhold,Reordering Level,Average Stock Out Rate,Similar Product Codes,Warehouse Section,Warehouse Bin Location
2335,Veterinary Products,Cevamune 1CP,dose,0.00,500,,Unga Farmcare (EA) Ltd.,UNG058B,12221,1,,0,Unga Farmcare (EA) Ltd.,Agriculture,,,,,,,,,,,,,,,,
2449,Public Health,Mos-N-Roach,20ml,0.02,1,,Juanco SPS Ltd.,JUA030E,12352,1,,0,Juanco SPS Ltd.,Agriculture,,,,,,,,,,,,,,,,
2488,Agrochemicals,Wipeout,5lt,5.00,1,,Juanco SPS Ltd.,JUA047D,12392,1,,0,Juanco SPS Ltd.,Agriculture,Herbicide,,,,,,,,,,,,,,,
4080,Veterinary Products,Cevac IBD L,dose,0.00,1000,,Unga Farmcare (EA) Ltd.,UNG054B,15161,1,,0,Unga Farmcare (EA) Ltd.,Agriculture,,,,,,,,,,,,,,,,
4081,Veterinary Products,Cevac Gumbo L,dose,0.00,1000,,Unga Farmcare (EA) Ltd.,UNG055B,15162,1,,0,Unga Farmcare (EA) Ltd.,Agriculture,,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12500,Seeds,Onion - Red Summit F1,10gm,0.01,0,,Simlaw Seeds Company Ltd.,SMW203A,228366,1,,1,Simlaw Seeds Company Ltd.,Agriculture,,,,,,,,,,,,,,,,
12510,Agrochemicals,Katrin 25 EC,30ml,0.03,60,,Twiga Chemical Industries Ltd.,TWG016F,230190,1,,1,Twiga Chemical Industries Ltd.,Agriculture,,,,,,,,,,,,,,,,
12587,Hygiene,Trevin Dudu Dust,100gm,0.10,24,,HighChem Essentials Ltd.,HGC095B,418501,1,,1,HighChem Essentials Ltd.,Agriculture,,,,,,,,,,,,,,,,
12588,Hygiene,Trevin Dudu Dust,200gm,0.20,24,,HighChem Essentials Ltd.,HGC095C,418502,1,,1,HighChem Essentials Ltd.,Agriculture,,,,,,,,,,,,,,,,


In [35]:
prod_list[(prod_list['Product Name'] == 'Mos-N-Roach') & (prod_list['Type'] == '20ml')]

Unnamed: 0,Category,Product Name,Type,Weight,Carton Size,Unit Cost,Distributor,Product Code,Product Id,Distributor Type,Unnamed: 10,Active,Manufacturer,Industry,Sub category,Shelf Life,Liquid/Solid,Seasonality,Season Dates,Consignment Life Span,Manufacturer Lead Time,Transport Constraints,Cold Chain,Classification,Lower Threshhold,Reordering Level,Average Stock Out Rate,Similar Product Codes,Warehouse Section,Warehouse Bin Location
2448,Public Health,Mos-N-Roach,20ml,0.02,1,,Juanco SPS Ltd.,JUA030D,12351,1,,0,Juanco SPS Ltd.,Agriculture,,,,,,,,,,,,,,,,
2449,Public Health,Mos-N-Roach,20ml,0.02,1,,Juanco SPS Ltd.,JUA030E,12352,1,,0,Juanco SPS Ltd.,Agriculture,,,,,,,,,,,,,,,,


### Loading the data

In [2]:
products_df = pd.read_csv('/home/natasha/Documents/Iprocure/Clustering-for-Product-Matching/data/data_v2/subsequent_unmatched_products.csv')
products_df

Unnamed: 0,product_name,best_product_match,product_match_score
0,Capacitor,plantector,0.42
1,Starter Crumbs Fugo,broiler starter crumbs,0.68
2,Rewinding 1HP induction geared motor,biodistinction extra,0.36
3,pizza small chopping board,dairy meal economy okoa,0.49
4,Bamboo Chopping Bord,choline chloride,0.39
...,...,...,...
1403372,"aldrop yellow 6""",carophyll yellow 10%,0.67
1403373,Rocket,rocket 44 ec,0.67
1403374,Salad,seal,0.67
1403375,Heat inducer,hinder,0.67


In [4]:
products_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1403377 entries, 0 to 1403376
Data columns (total 3 columns):
 #   Column               Non-Null Count    Dtype  
---  ------               --------------    -----  
 0   product_name         1403377 non-null  object 
 1   best_product_match   1403377 non-null  object 
 2   product_match_score  1403377 non-null  float64
dtypes: float64(1), object(2)
memory usage: 32.1+ MB


### Data Preprocessing

In [6]:
# changing all strings to lowercase
products_df = products_df.applymap(lambda x: x.lower().strip() if isinstance(x, str) else x)
products_df.head()

Unnamed: 0,product_name,best_product_match,product_match_score
0,capacitor,plantector,0.42
1,starter crumbs fugo,broiler starter crumbs,0.68
2,rewinding 1hp induction geared motor,biodistinction extra,0.36
3,pizza small chopping board,dairy meal economy okoa,0.49
4,bamboo chopping bord,choline chloride,0.39


In [7]:
products_df['product_name'].nunique()

66005

In [9]:
# filtering for unique product names
unique_product_names = products_df['product_name'].unique()
len(unique_product_names)

66005

### Clustering

In [10]:
# vectorizing the product names
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(unique_product_names)

In [11]:
# clustering the data
num_clusters = 10000
kmeans = KMeans(n_clusters=num_clusters, random_state=42).fit(tfidf_matrix)
labels = kmeans.labels_

# creating a dataframe of the clusters
cluster_to_name = {}
for label in set(labels):
    indices = np.where(labels == label)[0]
    names = unique_product_names[indices].tolist()
    cluster_to_name[label] = names
    
# viewing the clusters
cluster_1_df = pd.DataFrame.from_dict(cluster_to_name.items())
cluster_1_df.rename(columns={0: 'cluster_id', 1: 'product_names'}, inplace=True)
cluster_1_df.set_index('cluster_id', inplace=True)
pd.set_option('display.max_colwidth', None)
cluster_1_df.head(5)

Unnamed: 0_level_0,product_names
cluster_id,Unnamed: 1_level_1
0,"[oraimo data cable, phone charging cable(oraimo), oraimo original cable, oraimo smart cable original, micro oraimo cable, oraimo cable]"
1,"[block, reva block red, reva block red block, reva red block]"
2,"[chick feeder small, feeder chick, chick feeder 24 slots, small chick feeder]"
3,"[dairy maxpro plus, farm dairy meal maxpro, dairy meal maxpro plus, maxpro dairy meal, maxpro plus dairy meal, maxpro dairy, pio maxpro plus dairy meal, dairy meal maxpro, maxpro dairy meal]"
4,"[highland stocklick- all purpose, all purpose stocklick, pharma all purpose stocklick, daima stocklick all purpose, hylick all purpose stocklick]"


In [16]:
cluster_1_df['cluster_size'] = cluster_1_df['product_names'].apply(lambda x: len(x))
cluster_1_df[cluster_1_df['cluster_size'] == 1]

Unnamed: 0_level_0,product_names,cluster_size
cluster_id,Unnamed: 1_level_1,Unnamed: 2_level_1
9,[kienyeji bora],1
10,[crop champion (npk) 20:20:20],1
11,[max pollard],1
22,[maize mixture],1
25,[green organic 1ltr],1
...,...,...
9923,[gazette],1
9962,[otc10%],1
9982,[cp/tg(silver)org steeringbar(007)],1
9989,[233cables],1


In [17]:
unique_names_df = pd.DataFrame({'product_name': unique_product_names,
                                'label': labels})
unique_names_df.head()
len(unique_names_df)

66005

### Topic Modeling

In [18]:
# extracting most common words from each cluster in order
cluster_word_freq = {}

for doc, cluster_label in zip(unique_product_names, labels):
    words = re.split(r'\s+|-|\(|\)|/|\\|\||,', doc)
    for word in words:
        if cluster_label in cluster_word_freq:
            cluster_word_freq[cluster_label][word] = cluster_word_freq[cluster_label].get(word, 0) + 1
        else:
            cluster_word_freq[cluster_label] = {word: 1}
    
for cluster_label in cluster_word_freq:
    cluster_word_freq[cluster_label] = sorted(cluster_word_freq[cluster_label].items(), key=lambda x: x[1], reverse=True)
    
cluster_word_freq_df = pd.DataFrame.from_dict(cluster_word_freq.items())
cluster_word_freq_df.rename(columns={0: 'cluster_id', 1: 'word_freq'}, inplace=True)
cluster_word_freq_df.head()

Unnamed: 0,cluster_id,word_freq
0,7273,"[(capacitor, 2), (200, 1)]"
1,2180,"[(crumbs, 7), (fugo, 7), (starter, 6), (, 5), (50kg, 2), (10kg, 1), (25kg, 1), (1kg, 1)]"
2,8491,"[(motor, 12), (rewinding, 9), (induction, 8), (geared, 3), (1.5hp, 3), (2hp, 2), (1hp, 1), (3phase, 1), (5.5hp, 1), (7.5hp, 1), (3ph, 1), (1, 1), (hp, 1), (0.75kw, 1), (30hp, 1)]"
3,1409,"[(chopping, 9), (board, 9), (small, 3), (marble, 2), (big, 2), (pizza, 1), (@400, 1), (medium, 1), (knife, 1), (set, 1), (with, 1)]"
4,7282,"[(bamboo, 3), (chopping, 3), (board, 2), (bord, 1), (small, 1), (big, 1)]"


In [28]:
cluster_word_freq_df['cluster_name'] = cluster_word_freq_df['word_freq'].apply(lambda x: ' '.join(word[0] for word in x[:3] if word[0] != ' '))
cluster_word_freq_df

Unnamed: 0,cluster_id,word_freq,cluster_name
0,7273,"[(capacitor, 2), (200, 1)]",capacitor 200
1,2180,"[(crumbs, 7), (fugo, 7), (starter, 6), (, 5), (50kg, 2), (10kg, 1), (25kg, 1), (1kg, 1)]",crumbs fugo starter
2,8491,"[(motor, 12), (rewinding, 9), (induction, 8), (geared, 3), (1.5hp, 3), (2hp, 2), (1hp, 1), (3phase, 1), (5.5hp, 1), (7.5hp, 1), (3ph, 1), (1, 1), (hp, 1), (0.75kw, 1), (30hp, 1)]",motor rewinding induction
3,1409,"[(chopping, 9), (board, 9), (small, 3), (marble, 2), (big, 2), (pizza, 1), (@400, 1), (medium, 1), (knife, 1), (set, 1), (with, 1)]",chopping board small
4,7282,"[(bamboo, 3), (chopping, 3), (board, 2), (bord, 1), (small, 1), (big, 1)]",bamboo chopping board
...,...,...,...
9995,2786,"[(yellow, 1), (insulating, 1), (tape, 1), (small, 1)]",yellow insulating tape
9996,7486,"[(nitronor, 2), (34%, 1)]",nitronor 34%
9997,2889,"[(flukenor, 1), (500, 1)]",flukenor 500
9998,581,"[(globe, 1), (master, 1), (50g, 1), (e.a, 1)]",globe master 50g


In [29]:
cluster_word_freq_df[:50]

Unnamed: 0,cluster_id,word_freq,cluster_name
0,7273,"[(capacitor, 2), (200, 1)]",capacitor 200
1,2180,"[(crumbs, 7), (fugo, 7), (starter, 6), (, 5), (50kg, 2), (10kg, 1), (25kg, 1), (1kg, 1)]",crumbs fugo starter
2,8491,"[(motor, 12), (rewinding, 9), (induction, 8), (geared, 3), (1.5hp, 3), (2hp, 2), (1hp, 1), (3phase, 1), (5.5hp, 1), (7.5hp, 1), (3ph, 1), (1, 1), (hp, 1), (0.75kw, 1), (30hp, 1)]",motor rewinding induction
3,1409,"[(chopping, 9), (board, 9), (small, 3), (marble, 2), (big, 2), (pizza, 1), (@400, 1), (medium, 1), (knife, 1), (set, 1), (with, 1)]",chopping board small
4,7282,"[(bamboo, 3), (chopping, 3), (board, 2), (bord, 1), (small, 1), (big, 1)]",bamboo chopping board
5,3677,"[(gypsum, 9), (screw, 4), (1, 2), (2, 1), (1"", 1), (charnnel, 1), (1½"", 1), (board, 1), (sturd, 1), (gypsum{skirmcoat}, 1), (20kg, 1), (plaster, 1), (powder, 1)]",gypsum screw 1
6,2085,"[(malezi, 7), (feeds, 5), (, 4), (dairy, 2), (growers, 2), (dog, 1), (food, 1), (layer, 1), (meal, 1), (kienyeji, 1), (mash, 1)]",malezi feeds
7,4560,"[(juugo, 4), (maziwa, 2), (nyama, 1), (dairy, 1)]",juugo maziwa nyama
8,4027,"[(pioneer, 8), (dairy, 7), (maxpro, 6), (meal, 5), (plus, 4), (, 2), (maxplus, 1)]",pioneer dairy maxpro
9,1730,"[(coldcap, 13), (caps, 3), (syrup, 3), (orginal, 1), (original, 1), (100mls, 1), (day, 1), (night, 1), (capsules, 1), (generic, 1), (srp, 1), (syr, 1), (100ml, 1), (syrp, 1)]",coldcap caps syrup


In [30]:
cluster_merge = cluster_word_freq_df[['cluster_id', 'cluster_name']]
cluster_merge = cluster_merge.rename(columns={'cluster_id': 'label'})

products_df = products_df.merge(unique_names_df, on='product_name', how='left')
products_df = products_df.merge(cluster_merge, on='label', how='left')

columns = products_df.columns.tolist()
columns.remove('cluster_name')
columns.insert(1, 'cluster_name')
products_df = products_df[columns]

products_df[:50]
# products_df.to_csv('clean_subsequent_products.csv', index=False)

### Clustered Data

In [2]:
products_df = pd.read_csv('/home/natasha/Documents/Iprocure/Clustering-for-Product-Matching/data/data_v2/clean_subsequent_products.csv')
unique_clustered_data = products_df[['product_name', 'cluster_name', 'best_product_match']].drop_duplicates(subset=['product_name'], keep='first').reset_index(drop=True)
unique_clustered_data

Unnamed: 0,product_name,cluster_name,best_product_match
0,capacitor,capacitor 200,plantector
1,starter crumbs fugo,crumbs fugo starter,broiler starter crumbs
2,rewinding 1hp induction geared motor,motor rewinding induction,biodistinction extra
3,pizza small chopping board,chopping board small,dairy meal economy okoa
4,bamboo chopping bord,bamboo chopping board,choline chloride
...,...,...,...
66000,48pages squared,48pages ruled squared,mason square
66001,bio-plus(milk block),bio plus block,milk block
66002,mr.clean,mr clean mr.clean,ultraclean
66003,scotts emusion orange flavor,scotts original 100ml,basco emulsion tango 4l


In [3]:
# cleanup function
def compare(row):
    comparison = {}
    i = row['product_name']
    prods_list = row[['cluster_name', 'best_product_match']].tolist()
    if isinstance(i, str):
       comparison.update({i: get_close_matches(i, prods_list, n=1, cutoff=0.1)})
    product_name = list(comparison.keys()) if comparison else None
    match = []
    score = []
    if comparison:
       for key, value in comparison.items():
           if value:
              match.append(value[0])
              score.append(round(SequenceMatcher(None, i, value[0]).ratio(), 2))
           else:
              match.append(None)
              score.append(None)
    else:
       match.append(None)
       score.append(None)
              
    return pd.Series([match, score], index = ['match', 'score'])

In [4]:
for i in tqdm(range(10), desc="Processing", ncols=80, bar_format="{l_bar}{bar}| {n_fmt}/{total_fmt}"):
    unique_clustered_data[['match', 'score']] = [compare(row) for _, row in unique_clustered_data.iterrows()]
    unique_clustered_data.head()

Processing: 100%|████████████████████████████████████████████████████████| 10/10


In [5]:
dnd_unique_clustered_data = unique_clustered_data.copy()

In [6]:
unique_clustered_data['match'] = unique_clustered_data['match'].apply(lambda x: x[0])
unique_clustered_data['score'] = unique_clustered_data['score'].apply(lambda x: x[0])

unique_clustered_data

Unnamed: 0,product_name,cluster_name,best_product_match,match,score
0,capacitor,capacitor 200,plantector,capacitor 200,0.82
1,starter crumbs fugo,crumbs fugo starter,broiler starter crumbs,broiler starter crumbs,0.68
2,rewinding 1hp induction geared motor,motor rewinding induction,biodistinction extra,motor rewinding induction,0.62
3,pizza small chopping board,chopping board small,dairy meal economy okoa,chopping board small,0.61
4,bamboo chopping bord,bamboo chopping board,choline chloride,bamboo chopping board,0.98
...,...,...,...,...,...
66000,48pages squared,48pages ruled squared,mason square,48pages ruled squared,0.83
66001,bio-plus(milk block),bio plus block,milk block,bio plus block,0.76
66002,mr.clean,mr clean mr.clean,ultraclean,ultraclean,0.67
66003,scotts emusion orange flavor,scotts original 100ml,basco emulsion tango 4l,basco emulsion tango 4l,0.67


In [10]:
unique_clustered_data['go_to_match'] = np.where(unique_clustered_data['score'] >= 0.65, unique_clustered_data['match'], unique_clustered_data['cluster_name'])
unique_clustered_data

Unnamed: 0,product_name,cluster_name,best_product_match,match,score,go_to_match
0,capacitor,capacitor 200,plantector,capacitor 200,0.82,capacitor 200
1,starter crumbs fugo,crumbs fugo starter,broiler starter crumbs,broiler starter crumbs,0.68,broiler starter crumbs
2,rewinding 1hp induction geared motor,motor rewinding induction,biodistinction extra,motor rewinding induction,0.62,motor rewinding induction
3,pizza small chopping board,chopping board small,dairy meal economy okoa,chopping board small,0.61,chopping board small
4,bamboo chopping bord,bamboo chopping board,choline chloride,bamboo chopping board,0.98,bamboo chopping board
...,...,...,...,...,...,...
66000,48pages squared,48pages ruled squared,mason square,48pages ruled squared,0.83,48pages ruled squared
66001,bio-plus(milk block),bio plus block,milk block,bio plus block,0.76,bio plus block
66002,mr.clean,mr clean mr.clean,ultraclean,ultraclean,0.67,ultraclean
66003,scotts emusion orange flavor,scotts original 100ml,basco emulsion tango 4l,basco emulsion tango 4l,0.67,basco emulsion tango 4l


### Cleaning against master list

In [12]:
# loading the master list
master_list_df = pd.read_csv('/home/natasha/Documents/Iprocure/Clustering-for-Product-Matching/data/data_v1/master_list.csv')
master_list_df

Unnamed: 0,combined_name,product_name,manufacturer_name
0,aviboost aqua block; vital animal health,aviboost aqua block,vital animal health
1,aviboost cl-x blue; vital animal health,aviboost cl-x blue,vital animal health
2,aviboost nutri block; vital animal health,aviboost nutri block,vital animal health
3,aviboost spectrum; vital animal health,aviboost spectrum,vital animal health
4,aviboost poultry tonic; vital animal health,aviboost poultry tonic,vital animal health
...,...,...,...
8774,jayhawk 500sc suspension concentrate ;,jayhawk 500sc suspension concentrate,
8775,cynara 72 wp wettable powder;,cynara 72 wp wettable powder,
8776,twiga-amine 720sl soluble concentrate;,twiga-amine 720sl soluble concentrate,
8777,ranman top 160 sc suspension concentrate;,ranman top 160 sc suspension concentrate,


In [13]:
matches_cache = {}
master_list = master_list_df['product_name'].to_list()

def get_closest_match(word, possibilities: list[str]):
    word = str(word).lower()
    if found := matches_cache.get(word):
        return found

    matches = get_close_matches(word, possibilities, n=1, cutoff=0.0)
    match = matches[0] if matches else ''
    score = round(SequenceMatcher(None, word, match).ratio(), 2)
    found = {'best_match': match, 'best_score': score}
    matches_cache[word] = found

    return found         # pd.Series([word, match, score], index = ['product_name', 'match', 'score'])

found_df = unique_clustered_data['go_to_match'].apply(lambda x: get_closest_match(x, master_list))

In [14]:
dnd_found_df = found_df.copy()
dnd_unique_clustered_data_2 = unique_clustered_data.copy()

In [15]:
found_df = found_df.apply(pd.Series)
found_df.head()

Unnamed: 0,best_match,best_score
0,partner 200 sl,0.59
1,broiler starter crumbs,1.0
2,medodin injection,0.62
3,stopper small,0.61
4,remoov dehorning paste,0.51


In [18]:
# len(found_df)
# len(unique_clustered_data)

66005

In [19]:
unique_clustered_data = pd.concat([unique_clustered_data, found_df], axis = 1)
unique_clustered_data = unique_clustered_data.reset_index(drop=True)
# unique_clustered_data = unique_clustered_data.loc[:, ~unique_clustered_data.columns.duplicated()]
unique_clustered_data.head(20)

Unnamed: 0,product_name,cluster_name,best_product_match,match,score,go_to_match,best_match,best_score
0,capacitor,capacitor 200,plantector,capacitor 200,0.82,capacitor 200,partner 200 sl,0.59
1,starter crumbs fugo,crumbs fugo starter,broiler starter crumbs,broiler starter crumbs,0.68,broiler starter crumbs,broiler starter crumbs,1.0
2,rewinding 1hp induction geared motor,motor rewinding induction,biodistinction extra,motor rewinding induction,0.62,motor rewinding induction,medodin injection,0.62
3,pizza small chopping board,chopping board small,dairy meal economy okoa,chopping board small,0.61,chopping board small,stopper small,0.61
4,bamboo chopping bord,bamboo chopping board,choline chloride,bamboo chopping board,0.98,bamboo chopping board,remoov dehorning paste,0.51
5,gypsum screw 1 1/2,gypsum screw 1,puma super ec 120,gypsum screw 1,0.88,gypsum screw 1,gypsum,0.6
6,malezi feeds- dog food,malezi feeds,l-valine feed grade,malezi feeds,0.74,malezi feeds,mola feeds,0.7
7,juugo,juugo maziwa nyama,humigold,humigold,0.46,juugo maziwa nyama,twigalick maziwa max,0.58
8,pioneer maxplus dairy meal,pioneer dairy maxpro,bora dairy meal,pioneer dairy maxpro,0.7,pioneer dairy maxpro,pembe dairy meal,0.67
9,coldcap caps orginal,coldcap caps syrup,dairy meal ordinary,coldcap caps syrup,0.74,coldcap caps syrup,royal cap fs,0.47


In [20]:
unique_clustered_data['correct_match'] = np.where(unique_clustered_data['best_score'] >= 0.75, unique_clustered_data['best_match'], unique_clustered_data['go_to_match'])
unique_clustered_data.head()

Unnamed: 0,product_name,cluster_name,best_product_match,match,score,go_to_match,best_match,best_score,correct_match
0,capacitor,capacitor 200,plantector,capacitor 200,0.82,capacitor 200,partner 200 sl,0.59,capacitor 200
1,starter crumbs fugo,crumbs fugo starter,broiler starter crumbs,broiler starter crumbs,0.68,broiler starter crumbs,broiler starter crumbs,1.0,broiler starter crumbs
2,rewinding 1hp induction geared motor,motor rewinding induction,biodistinction extra,motor rewinding induction,0.62,motor rewinding induction,medodin injection,0.62,motor rewinding induction
3,pizza small chopping board,chopping board small,dairy meal economy okoa,chopping board small,0.61,chopping board small,stopper small,0.61,chopping board small
4,bamboo chopping bord,bamboo chopping board,choline chloride,bamboo chopping board,0.98,bamboo chopping board,remoov dehorning paste,0.51,bamboo chopping board


In [23]:
unique_clustered_data[unique_clustered_data['best_score'] >= 0.75]

Unnamed: 0,product_name,cluster_name,best_product_match,match,score,go_to_match,best_match,best_score,correct_match
1,starter crumbs fugo,crumbs fugo starter,broiler starter crumbs,broiler starter crumbs,0.68,broiler starter crumbs,broiler starter crumbs,1.00,broiler starter crumbs
14,kienyeji mash (faida feeds),kienyeji feeds,kienyeji mash (kuku wa kienyeji),kienyeji mash (kuku wa kienyeji),0.68,kienyeji mash (kuku wa kienyeji),kienyeji mash (kuku wa kienyeji),1.00,kienyeji mash (kuku wa kienyeji)
42,trinity f1 hybrid mellon,trinity f1 gold,tendo f1 hybrid - water melon,tendo f1 hybrid - water melon,0.68,tendo f1 hybrid - water melon,tendo f1 hybrid - water melon,1.00,tendo f1 hybrid - water melon
64,bactrocera,bactrocera block trap,attracker,bactrocera block trap,0.65,bactrocera block trap,bactrocera block block,0.79,bactrocera block block
78,fuga-vit maziwa extra sachet,fuga vit maziwa,gns bovita maziwa premium,fuga vit maziwa,0.65,fuga vit maziwa,gns bovita maziwa,0.75,gns bovita maziwa
...,...,...,...,...,...,...,...,...,...
65998,water sprinkler big,sprinkler yellow,water retainer,water retainer,0.67,water retainer,water retainer,1.00,water retainer
65999,simple moisturiser,tabs cream,aspee motorised,aspee motorised,0.67,aspee motorised,aspee motorised,1.00,aspee motorised
66002,mr.clean,mr clean mr.clean,ultraclean,ultraclean,0.67,ultraclean,ultraclean,1.00,ultraclean
66003,scotts emusion orange flavor,scotts original 100ml,basco emulsion tango 4l,basco emulsion tango 4l,0.67,basco emulsion tango 4l,basco emulsion tango 4l,1.00,basco emulsion tango 4l


In [None]:
unique_clustered_data_merge = unique_clustered_data[['product_name', 'correct_match']]

clustered_data = clustered_data.merge(unique_clustered_data_merge, on='product_name', how='left')

columns_list = clustered_data.columns.tolist()
columns_list.remove('correct_match')
columns_list.insert(2, 'correct_match')
clustered_data = clustered_data[columns_list]

clustered_data.head()

In [22]:
unique_clustered_data.to_csv('final_clustered_data.csv', index = False)

In [2]:
clustered_data = pd.read_csv('/home/natasha/Documents/Iprocure/Clustering-for-Product-Matching/data/data_v2/final_clustered_data.csv')
clustered_data

Unnamed: 0,product_name,cluster_name,best_product_match,match,score,go_to_match,best_match,best_score,correct_match
0,capacitor,capacitor 200,plantector,capacitor 200,0.82,capacitor 200,partner 200 sl,0.59,capacitor 200
1,starter crumbs fugo,crumbs fugo starter,broiler starter crumbs,broiler starter crumbs,0.68,broiler starter crumbs,broiler starter crumbs,1.00,broiler starter crumbs
2,rewinding 1hp induction geared motor,motor rewinding induction,biodistinction extra,motor rewinding induction,0.62,motor rewinding induction,medodin injection,0.62,motor rewinding induction
3,pizza small chopping board,chopping board small,dairy meal economy okoa,chopping board small,0.61,chopping board small,stopper small,0.61,chopping board small
4,bamboo chopping bord,bamboo chopping board,choline chloride,bamboo chopping board,0.98,bamboo chopping board,remoov dehorning paste,0.51,bamboo chopping board
...,...,...,...,...,...,...,...,...,...
66000,48pages squared,48pages ruled squared,mason square,48pages ruled squared,0.83,48pages ruled squared,mason square,0.55,48pages ruled squared
66001,bio-plus(milk block),bio plus block,milk block,bio plus block,0.76,bio plus block,milk block,0.67,bio plus block
66002,mr.clean,mr clean mr.clean,ultraclean,ultraclean,0.67,ultraclean,ultraclean,1.00,ultraclean
66003,scotts emusion orange flavor,scotts original 100ml,basco emulsion tango 4l,basco emulsion tango 4l,0.67,basco emulsion tango 4l,basco emulsion tango 4l,1.00,basco emulsion tango 4l


In [12]:
# clustered_data[(clustered_data['best_score'] < 0.7) & (clustered_data['best_score'] >= 0.65)][:50]
# clustered_data[:50]
# clustered_data[clustered_data['best_score'] == 0.68][:50]
clustered_data['final_match'] = np.where(clustered_data['best_score'] >= 0.68, clustered_data['best_match'], clustered_data['go_to_match'])
# clustered_data.to_csv('subsequent_clustered_products.csv',index=False)

In [12]:
clustered_data = pd.read_csv('/home/natasha/Documents/Iprocure/Clustering-for-Product-Matching/data/data_v2/subsequent_clustered_products.csv')
clustered_data = clustered_data[['product_name', 'final_match']]
# clustered_data.to_csv('subsequent_clustered_products.csv',index=False)

In [13]:
data = pd.read_csv('/home/natasha/Documents/Iprocure/Clustering-for-Product-Matching/data/data_v2/subseq_products.csv')
data

Unnamed: 0,product_name,best_product_match,product_match_score
0,Baby skirts,fay white 4s,0.35
1,DUDU ACELAMECTIN 5%,dudu acelamectin,0.91
2,Capacitor,plantector,0.42
3,Baby jacket oldstock,baraki wax blocks,0.38
4,Diazon 60EC,dizon 60 ec,0.91
...,...,...,...
94882,Kapenstrep Injection,penistrep injection,0.92
94883,Potphos500SL,potphos 500 sl,0.92
94884,Super green1L,super green,0.92
94885,nrixin layers,nerixin layer,0.92


In [14]:
data['lower_prod'] = data['product_name'].apply(lambda x: x.lower().strip() if isinstance(x, str) else x)
data

Unnamed: 0,product_name,best_product_match,product_match_score,lower_prod
0,Baby skirts,fay white 4s,0.35,baby skirts
1,DUDU ACELAMECTIN 5%,dudu acelamectin,0.91,dudu acelamectin 5%
2,Capacitor,plantector,0.42,capacitor
3,Baby jacket oldstock,baraki wax blocks,0.38,baby jacket oldstock
4,Diazon 60EC,dizon 60 ec,0.91,diazon 60ec
...,...,...,...,...
94882,Kapenstrep Injection,penistrep injection,0.92,kapenstrep injection
94883,Potphos500SL,potphos 500 sl,0.92,potphos500sl
94884,Super green1L,super green,0.92,super green1l
94885,nrixin layers,nerixin layer,0.92,nrixin layers


In [15]:
clustered_data = clustered_data.rename(columns = {'product_name': 'lower_prod'})
clustered_data

Unnamed: 0,lower_prod,final_match
0,capacitor,capacitor 200
1,starter crumbs fugo,broiler starter crumbs
2,rewinding 1hp induction geared motor,motor rewinding induction
3,pizza small chopping board,chopping board small
4,bamboo chopping bord,bamboo chopping board
...,...,...
66000,48pages squared,48pages ruled squared
66001,bio-plus(milk block),bio plus block
66002,mr.clean,ultraclean
66003,scotts emusion orange flavor,basco emulsion tango 4l


In [16]:
data = data.merge(clustered_data, how='left', on='lower_prod')
data['final_match'] = np.where(data['final_match'].isna(), data['lower_prod'], data['final_match'])
data

Unnamed: 0,product_name,best_product_match,product_match_score,lower_prod,final_match
0,Baby skirts,fay white 4s,0.35,baby skirts,skirts adult material
1,DUDU ACELAMECTIN 5%,dudu acelamectin,0.91,dudu acelamectin 5%,dudu acelamectin 5%
2,Capacitor,plantector,0.42,capacitor,capacitor 200
3,Baby jacket oldstock,baraki wax blocks,0.38,baby jacket oldstock,jacket baby b74
4,Diazon 60EC,dizon 60 ec,0.91,diazon 60ec,diazon 60ec
...,...,...,...,...,...
94882,Kapenstrep Injection,penistrep injection,0.92,kapenstrep injection,kapenstrep injection
94883,Potphos500SL,potphos 500 sl,0.92,potphos500sl,potphos500sl
94884,Super green1L,super green,0.92,super green1l,super green1l
94885,nrixin layers,nerixin layer,0.92,nrixin layers,nrixin layers


In [17]:
data = data[['product_name', 'final_match']]
data.to_csv('subsequent_clustered_products.csv',index=False)