### **Clustering for Product Matching**

### Importing packages

In [3]:
import pandas as pd
import numpy as np

from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.metrics import silhouette_score
from scipy.spatial.distance import cdist
from difflib import SequenceMatcher, get_close_matches
import re
from tqdm import tqdm, trange
import time

import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

### Loading the data

In [2]:
data = pd.read_csv('/home/natasha/Documents/Iprocure/Clustering-for-Product-Matching/data/unmatched_stockist_transactions.csv')
data.head()

Unnamed: 0,product_name,best_product_match,product_match_score,manufacturer_name,best_manufacturer_match,manufacturer_match_score
0,laibuta foliar feeds,murphy foliar feed,0.68,laib,eabl,0.22
1,White Candles - Riva,white line - mint & coriander,0.49,Halar Industries Ltd,zaam industries ltd.,0.83
2,Ampiclo-dawa dry syrup,vitastar pet syrup,0.4,.,mea ltd.,0.2
3,carvedilol(vidol)6.25mg,clopidol,0.45,Cosmos ltd,cosmos ltd.,0.91
4,Amoxil 'O'125mg,mineral 1-5725,0.41,GSK,wellstock,0.31


In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8614931 entries, 0 to 8614930
Data columns (total 6 columns):
 #   Column                    Dtype  
---  ------                    -----  
 0   product_name              object 
 1   best_product_match        object 
 2   product_match_score       float64
 3   manufacturer_name         object 
 4   best_manufacturer_match   object 
 5   manufacturer_match_score  float64
dtypes: float64(2), object(4)
memory usage: 394.4+ MB


In [4]:
data.isna().sum()

product_name                    1
best_product_match              0
product_match_score             0
manufacturer_name           86649
best_manufacturer_match     83377
manufacturer_match_score        0
dtype: int64

### Data Preprocessing

In [5]:
# dropping null values in product_name column
data = data.dropna(subset = ['product_name'])

# cleaning the product name column
data['product_name'] = data['product_name'].apply(lambda x: x.strip())

# changing all strings to lowercase
data = data.applymap(lambda x: x.lower() if isinstance(x, str) else x)
data.head()

Unnamed: 0,product_name,best_product_match,product_match_score,manufacturer_name,best_manufacturer_match,manufacturer_match_score
0,laibuta foliar feeds,murphy foliar feed,0.68,laib,eabl,0.22
1,white candles - riva,white line - mint & coriander,0.49,halar industries ltd,zaam industries ltd.,0.83
2,ampiclo-dawa dry syrup,vitastar pet syrup,0.4,.,mea ltd.,0.2
3,carvedilol(vidol)6.25mg,clopidol,0.45,cosmos ltd,cosmos ltd.,0.91
4,amoxil 'o'125mg,mineral 1-5725,0.41,gsk,wellstock,0.31


In [6]:
data['product_name'].nunique()  

129071

In [7]:
# filtering for unique product names
unique_product_names = data['product_name'].unique()
len(unique_product_names)

129071

In [8]:
# vectorizing the product names
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(unique_product_names)

In [None]:
# # choosing the k-value using the elbow method
# clusters = [1000, 2000, 3000, 4000, 5000, 6000, 7000]
# silhouette_avg = []

# for cluster in clusters:
#     kmeans = KMeans(n_clusters = cluster, random_state = 42)
#     kmeans.fit(tfidf_matrix)
#     cluster_labels = kmeans.labels_
    
#     silhouette_avg.append(silhouette_score(tfidf_matrix, cluster_labels))
    
# plt.plot(clusters, silhouette_avg, 'bx-')
# plt.xlabel('Values of K') 
# plt.ylabel('Silhouette score') 
# plt.title('Silhouette analysis For Optimal k')
# plt.show()

In [None]:
# # clustering the vectorized data
# k_values = [400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000, 2200, 2400, 2600]
# ssd = []

# for k in k_values:
#     kmeans = KMeans(n_clusters=k, random_state=0).fit(tfidf_matrix)
#     ssd.append(kmeans.inertia_)

# # plotting the elbow curve
# plt.plot(k_values, ssd, 'bx-')
# plt.xlabel('No of clusters (k)')
# plt.ylabel('inertia')
# plt.title('The Elbow Method showing the optimal k')
# plt.show()

In [8]:
# clustering the data
num_clusters = 10000
kmeans = KMeans(n_clusters=num_clusters, random_state=42).fit(tfidf_matrix)
labels = kmeans.labels_

# creating a dataframe of the clusters
cluster_to_name = {}
for label in set(labels):
    indices = np.where(labels == label)[0]
    names = unique_product_names[indices].tolist()
    cluster_to_name[label] = names
    
# viewing the clusters
cluster_4_df = pd.DataFrame.from_dict(cluster_to_name.items())
cluster_4_df.rename(columns={0: 'cluster_id', 1: 'product_names'}, inplace=True)
cluster_4_df.set_index('cluster_id', inplace=True)
pd.set_option('display.max_colwidth', None)
cluster_4_df.head(5)

Unnamed: 0_level_0,product_names
cluster_id,Unnamed: 1_level_1
0,"[nozzle four holes, nozzle (4 holes), four holed nozzle]"
1,"[ord.nails 2"", ord.nails 3"", ord nails 5"", ord nails 3"", ord nails 2"", ord nails 4"", 3"" ord.nails 1/4, 1""ord. nails 1/4, ord. nails 2"", ord. nails 3"", ord. nails 4"", 1½'' ord. nails 1/4, 4"" ord. nails 1/4, 2"" ord. nails 1/4, 5"" ord. nails 1/4, 6"" ord. nails 1/4, ord.nails 4"", ord. nails 2 ½"", ord nails 6"", ord nails 2 1/2"", ord nails, ord nails 21/2""]"
2,"[kajamba growers, kajamba growers mash, growers kajamba]"
3,"[water heater black small, water heater black big, big black heater, water heater black]"
4,"[pliers large, pliers large]"


In [104]:
unique_names_df = pd.DataFrame({'product_name': unique_product_names,
                                'label': labels})
unique_names_df.head()
len(unique_names_df)

129071

### Topic modeling (1)

In [120]:
# extracting most common words from each cluster in order
cluster_word_freq = {}

for doc, cluster_label in zip(unique_product_names, labels):
    words = re.split(r'\s+|-|\(|\)|/|\\|\||,', doc)
    for word in words:
        if cluster_label in cluster_word_freq:
            cluster_word_freq[cluster_label][word] = cluster_word_freq[cluster_label].get(word, 0) + 1
        else:
            cluster_word_freq[cluster_label] = {word: 1}
    
for cluster_label in cluster_word_freq:
    cluster_word_freq[cluster_label] = sorted(cluster_word_freq[cluster_label].items(), key=lambda x: x[1], reverse=True)
    
cluster_word_freq_df = pd.DataFrame.from_dict(cluster_word_freq.items())
cluster_word_freq_df.rename(columns={0: 'cluster_id', 1: 'word_freq'}, inplace=True)
cluster_word_freq_df.head()

Unnamed: 0,cluster_id,word_freq
0,5598,"[(laibuta, 7), (foliar, 7), (feed, 3), (feeds, 2), (, 2), (liquid, 1)]"
1,1253,"[(, 18), (candles, 9), (riva, 9), (white, 2), (yellow, 2), (red, 2), (black, 1), (maroon, 1), (green, 1)]"
2,8257,"[(ampiclo, 16), (dawa, 14), (, 5), (ampicilin, 3), (dry, 2), (syrup, 2), (caps, 2), (neonatal, 1), (500mg, 1), (500, 1), (suspension, 1), (250mg, 1), (5ml, 1), (capsules, 1), (0.6ml, 1), (90mg, 1), (0.6, 1)]"
3,2806,"[(vidol, 8), (carvedilol, 6), (, 5), (6.25mg, 4), (tabs, 2), (28's, 1), (6.25mgs, 1), (12.5mgs, 1)]"
4,8278,"[(amoxil, 16), (, 4), (syrup, 3), (500mg, 2), ('o'125mg, 1), (syrp, 1), (amoxycillin, 1), (60ml, 1), (cap, 1), (500, 1), (100ml, 1), (capsules, 1), (dt, 1), (caps, 1), (dispersible, 1), (suspension, 1), (original, 1), (60mls, 1)]"


In [131]:
cluster_word_freq_df['cluster_name'] = cluster_word_freq_df['word_freq'].apply(lambda x: ' '.join(word[0] for word in x[:3]))
cluster_word_freq_df

Unnamed: 0,cluster_id,word_freq,cluster_name
0,5598,"[(laibuta, 7), (foliar, 7), (feed, 3), (feeds, 2), (, 2), (liquid, 1)]",laibuta foliar feed
1,1253,"[(, 18), (candles, 9), (riva, 9), (white, 2), (yellow, 2), (red, 2), (black, 1), (maroon, 1), (green, 1)]",candles riva
2,8257,"[(ampiclo, 16), (dawa, 14), (, 5), (ampicilin, 3), (dry, 2), (syrup, 2), (caps, 2), (neonatal, 1), (500mg, 1), (500, 1), (suspension, 1), (250mg, 1), (5ml, 1), (capsules, 1), (0.6ml, 1), (90mg, 1), (0.6, 1)]",ampiclo dawa
3,2806,"[(vidol, 8), (carvedilol, 6), (, 5), (6.25mg, 4), (tabs, 2), (28's, 1), (6.25mgs, 1), (12.5mgs, 1)]",vidol carvedilol
4,8278,"[(amoxil, 16), (, 4), (syrup, 3), (500mg, 2), ('o'125mg, 1), (syrp, 1), (amoxycillin, 1), (60ml, 1), (cap, 1), (500, 1), (100ml, 1), (capsules, 1), (dt, 1), (caps, 1), (dispersible, 1), (suspension, 1), (original, 1), (60mls, 1)]",amoxil syrup
...,...,...,...
9995,1745,"[(, 2), (4.0, 1), (single, 1), (core, 1), (red, 1), (black, 1), (yellow, 1)]",4.0 single
9996,6674,"[(fuel, 2), (filter, 2), (perkins, 2), (short, 1)]",fuel filter perkins
9997,7901,"[(copem, 2), (simlaw, 1)]",copem simlaw
9998,3288,"[(bestphos, 1), (5kg, 1)]",bestphos 5kg


In [None]:
cluster_merge = cluster_word_freq_df[['cluster_id', 'cluster_name']]
cluster_merge = cluster_merge.rename(columns={'cluster_id': 'label'})

data = data.merge(unique_names_df, on='product_name', how='left')
data = data.merge(cluster_merge, on='label', how='left')

columns = data.columns.tolist()
columns.remove('cluster_name')
columns.insert(1, 'cluster_name')
data = data[columns]

data[:50]

In [140]:
data[data['product_match_score'] > 0.75][10:20]

Unnamed: 0,product_name,cluster_name,best_product_match,product_match_score,manufacturer_name,best_manufacturer_match,manufacturer_match_score,label
87455,dyclon,syrup tabs,cyclone,0.77,d.d.c.l,"addiseo,france",0.27,88
87456,arimis,jelly milking arimis,agrimos,0.77,tcl,oic ltd,0.18,2080
87457,profile,profile box long,profen,0.77,greenlife ltd,green live ltd.,0.83,2165
87458,profile,profile box long,profen,0.77,greenlife ltd,green live ltd.,0.83,2165
87459,ansal,ansal f1_tomato tomatoe,antisalm,0.77,royal seed,royal seed,0.95,831
87460,vetmin,vetmin stocklick milk,ivermin,0.77,botachem enterprises,botachem enterprises ltd.,0.87,3812
87461,nelgra,nelgra 50 50mg,integra,0.77,cosmos ltd,cosmos ltd.,0.91,1627
87462,nelgra,nelgra 50 50mg,integra,0.77,cosmos ltd,cosmos ltd.,0.91,1627
87463,premix,premix gp,premium,0.77,laibuta chemical ltd,laibuta chemicals ltd.,0.93,210
87464,egocin 10% oxytetracycline injection,egocin 10% injection,oxytetracycline 10% injection,0.77,medisel (k) ltd,medisel (k) ltd.,0.94,1768


In [135]:
# data.to_csv('/home/natasha/Documents/Iprocure/Clustering-for-Product-Matching/data/clustered_stockist_transactions.csv', index=False)

### Topic modeling (2)

In [141]:
# creating vectorizers for each cluster
vectorizers = []

for k in range(0, num_clusters):
    vectorizers.append(CountVectorizer(lowercase=True))
    
vectorizers[0]

In [142]:
# vectorizing the data in each cluster
vectorized_data = []
fitted_vectorizers = []

for i, vectorizer in enumerate(vectorizers):
    indices = np.where(labels == i)[0]
    try:
        data = vectorizer.fit_transform(unique_product_names[indices])
        vectorized_data.append(data)
        fitted_vectorizers.append(vectorizer)
    except:
        print("Not enough instances in cluster {}".format(i))
        vectorized_data.append(None)

In [143]:
# getting topics per cluster
num_topics_per_cluster = 1

lda_models = []

for n in range(0, num_clusters):
    lda = LatentDirichletAllocation(n_components=num_topics_per_cluster, max_iter=10, 
                                    learning_method='online', verbose=False, random_state=0)
    lda_models.append(lda)
    
lda_models[0]

In [144]:
# fitting the lda models
cluster_lda_data = []

for l, lda in enumerate(lda_models):
    if vectorized_data[l] != None:
        cluster_lda_data.append(lda.fit_transform(vectorized_data[l]))
        
print(f'Done fitting LDA models on {num_clusters} clusters!')

Done fitting LDA models on 10000 clusters!


In [146]:
def selected_topics(model, vectorizer, top_n=4):
    current_words = []
    keywords = []

    for idx, topic in enumerate(model.components_):
        words = [(vectorizer.get_feature_names_out()[i], topic[i]) for i in topic.argsort()[:-top_n - 1:-1]]
        for word in words:
            if word[0] not in current_words:
                keywords.append(word)
                current_words.append(word[0])

    keywords.sort(key=lambda x: x[1], reverse=True)
    return_values = [word[0] for word in keywords]

    return return_values


all_keywords = []

for current_vectorizer, lda in enumerate(lda_models):    
    if vectorized_data[current_vectorizer] != None:
        all_keywords.append(selected_topics(lda, fitted_vectorizers[current_vectorizer]))
        
print(f'Done extracting keywords from {num_clusters} clusters!')

Done extracting keywords from 10000 clusters!


In [147]:
keywords_df = pd.DataFrame(all_keywords)
keywords_df.fillna('', inplace=True)
keywords_df['concat_name'] = keywords_df[0] + ' ' + keywords_df[1] + ' ' + keywords_df[2] + ' ' + keywords_df[3]
keywords_df.head()

Unnamed: 0,0,1,2,3,concat_name
0,nozzle,holes,four,holed,nozzle holes four holed
1,nails,ord,1½,21,nails ord 1½ 21
2,growers,kajamba,mash,,growers kajamba mash
3,heater,black,water,big,heater black water big
4,large,pliers,,,large pliers


### Loading Clustered Data

In [26]:
clustered_data = pd.read_csv('/home/natasha/Documents/Iprocure/Clustering-for-Product-Matching/data/clustered_stockist_transactions.csv')
clustered_data.head()

Unnamed: 0,product_name,cluster_name,best_product_match,product_match_score,manufacturer_name,best_manufacturer_match,manufacturer_match_score,label
0,laibuta foliar feeds,laibuta foliar feed,murphy foliar feed,0.68,laib,eabl,0.22,5598
1,white candles - riva,candles riva,white line - mint & coriander,0.49,halar industries ltd,zaam industries ltd.,0.83,1253
2,ampiclo-dawa dry syrup,ampiclo dawa,vitastar pet syrup,0.4,.,mea ltd.,0.2,8257
3,carvedilol(vidol)6.25mg,vidol carvedilol,clopidol,0.45,cosmos ltd,cosmos ltd.,0.91,2806
4,amoxil 'o'125mg,amoxil syrup,mineral 1-5725,0.41,gsk,wellstock,0.31,8278


In [5]:
unique_clustered_data = clustered_data[['product_name', 'cluster_name', 'best_product_match']].drop_duplicates(subset=['product_name'], keep='first')
unique_clustered_data

Unnamed: 0,product_name,cluster_name,best_product_match
0,laibuta foliar feeds,laibuta foliar feed,murphy foliar feed
1,white candles - riva,candles riva,white line - mint & coriander
2,ampiclo-dawa dry syrup,ampiclo dawa,vitastar pet syrup
3,carvedilol(vidol)6.25mg,vidol carvedilol,clopidol
4,amoxil 'o'125mg,amoxil syrup,mineral 1-5725
...,...,...,...
8610353,lyso- spray,spray body black,limoxin-25 spray
8611023,rivacia,syrup tabs,livacare
8611109,diproson oil,diproson cream creme,dinoprostone
8611784,panga(alligator),alligator panga,aligator


In [6]:
# cleanup function
def compare(row):
    comparison = {}
    i = row['product_name']
    prods_list = row[['cluster_name', 'best_product_match']].tolist()
    if isinstance(i, str):
       comparison.update({i: get_close_matches(i, prods_list, n=1, cutoff=0.1)})
    product_name = list(comparison.keys()) if comparison else None
    match = []
    score = []
    if comparison:
       for key, value in comparison.items():
           if value:
              match.append(value[0])
              score.append(round(SequenceMatcher(None, i, value[0]).ratio(), 2))
           else:
              match.append(None)
              score.append(None)
    else:
       match.append(None)
       score.append(None)
              
    return pd.Series([match, score], index = ['match', 'score'])

In [7]:
for i in tqdm(range(10), desc="Processing", ncols=80, bar_format="{l_bar}{bar}| {n_fmt}/{total_fmt}"):
    unique_clustered_data[['match', 'score']] = [compare(row) for _, row in unique_clustered_data.iterrows()]
    unique_clustered_data.head()

Processing: 100%|████████████████████████████████████████████████████████| 10/10


In [8]:
unique_clustered_data
dnt_unique_clustered_data = unique_clustered_data.copy()

In [9]:
unique_clustered_data['match'] = unique_clustered_data['match'].apply(lambda x: x[0])
unique_clustered_data['score'] = unique_clustered_data['score'].apply(lambda x: x[0])

unique_clustered_data

Unnamed: 0,product_name,cluster_name,best_product_match,match,score
0,laibuta foliar feeds,laibuta foliar feed,murphy foliar feed,laibuta foliar feed,0.97
1,white candles - riva,candles riva,white line - mint & coriander,candles riva,0.79
2,ampiclo-dawa dry syrup,ampiclo dawa,vitastar pet syrup,ampiclo dawa,0.69
3,carvedilol(vidol)6.25mg,vidol carvedilol,clopidol,vidol carvedilol,0.50
4,amoxil 'o'125mg,amoxil syrup,mineral 1-5725,amoxil syrup,0.50
...,...,...,...,...,...
8610353,lyso- spray,spray body black,limoxin-25 spray,limoxin-25 spray,0.67
8611023,rivacia,syrup tabs,livacare,livacare,0.67
8611109,diproson oil,diproson cream creme,dinoprostone,dinoprostone,0.67
8611784,panga(alligator),alligator panga,aligator,aligator,0.67


In [10]:
unique_clustered_data[unique_clustered_data['score'] >= 0.7].sort_values(by='score', ascending=False)

Unnamed: 0,product_name,cluster_name,best_product_match,match,score
144628,4way circular box,4way circular box,cargo box,4way circular box,1.0
274661,henke ject,henke ject,phenylject,henke ject,1.0
30980,tx 350 tuitor,tx 350 tuitor,tomato - terminator,tx 350 tuitor,1.0
946324,penikan p tube,penikan p tube,penikan,penikan p tube,1.0
895683,halisi cooking oil,halisi cooking oil,healing oil,halisi cooking oil,1.0
...,...,...,...,...,...
4051496,abanycin,syrup tabs,abamycin 10%,abamycin 10%,0.7
4051494,"2"" hinges","hinges t 3""","""hinges 3""""","""hinges 3""""",0.7
267477,finisher mash (fugo),finisher kienyeji fugo,pig finisher mash,pig finisher mash,0.7
4051257,esofag tabs,esofag kit esomeprazole,nest tabs,nest tabs,0.7


In [11]:
unique_clustered_data['go_to_match'] = np.where(unique_clustered_data['score'] >= 0.7, unique_clustered_data['match'], unique_clustered_data['cluster_name'])
unique_clustered_data

Unnamed: 0,product_name,cluster_name,best_product_match,match,score,go_to_match
0,laibuta foliar feeds,laibuta foliar feed,murphy foliar feed,laibuta foliar feed,0.97,laibuta foliar feed
1,white candles - riva,candles riva,white line - mint & coriander,candles riva,0.79,candles riva
2,ampiclo-dawa dry syrup,ampiclo dawa,vitastar pet syrup,ampiclo dawa,0.69,ampiclo dawa
3,carvedilol(vidol)6.25mg,vidol carvedilol,clopidol,vidol carvedilol,0.50,vidol carvedilol
4,amoxil 'o'125mg,amoxil syrup,mineral 1-5725,amoxil syrup,0.50,amoxil syrup
...,...,...,...,...,...,...
8610353,lyso- spray,spray body black,limoxin-25 spray,limoxin-25 spray,0.67,spray body black
8611023,rivacia,syrup tabs,livacare,livacare,0.67,syrup tabs
8611109,diproson oil,diproson cream creme,dinoprostone,dinoprostone,0.67,diproson cream creme
8611784,panga(alligator),alligator panga,aligator,aligator,0.67,alligator panga


In [12]:
unique_clustered_data

Unnamed: 0,product_name,cluster_name,best_product_match,match,score,go_to_match
0,laibuta foliar feeds,laibuta foliar feed,murphy foliar feed,laibuta foliar feed,0.97,laibuta foliar feed
1,white candles - riva,candles riva,white line - mint & coriander,candles riva,0.79,candles riva
2,ampiclo-dawa dry syrup,ampiclo dawa,vitastar pet syrup,ampiclo dawa,0.69,ampiclo dawa
3,carvedilol(vidol)6.25mg,vidol carvedilol,clopidol,vidol carvedilol,0.50,vidol carvedilol
4,amoxil 'o'125mg,amoxil syrup,mineral 1-5725,amoxil syrup,0.50,amoxil syrup
...,...,...,...,...,...,...
8610353,lyso- spray,spray body black,limoxin-25 spray,limoxin-25 spray,0.67,spray body black
8611023,rivacia,syrup tabs,livacare,livacare,0.67,syrup tabs
8611109,diproson oil,diproson cream creme,dinoprostone,dinoprostone,0.67,diproson cream creme
8611784,panga(alligator),alligator panga,aligator,aligator,0.67,alligator panga


### Cleaning matches against master list

In [13]:
# loading the master list
master_list_df = pd.read_csv('/home/natasha/Documents/Iprocure/Clustering-for-Product-Matching/data/master_list.csv')
master_list_df

Unnamed: 0,combined_name,product_name,manufacturer_name
0,aviboost aqua block; vital animal health,aviboost aqua block,vital animal health
1,aviboost cl-x blue; vital animal health,aviboost cl-x blue,vital animal health
2,aviboost nutri block; vital animal health,aviboost nutri block,vital animal health
3,aviboost spectrum; vital animal health,aviboost spectrum,vital animal health
4,aviboost poultry tonic; vital animal health,aviboost poultry tonic,vital animal health
...,...,...,...
8774,jayhawk 500sc suspension concentrate ;,jayhawk 500sc suspension concentrate,
8775,cynara 72 wp wettable powder;,cynara 72 wp wettable powder,
8776,twiga-amine 720sl soluble concentrate;,twiga-amine 720sl soluble concentrate,
8777,ranman top 160 sc suspension concentrate;,ranman top 160 sc suspension concentrate,


In [14]:
matches_cache = {}
master_list = master_list_df['product_name'].to_list()

def get_closest_match(word, possibilities: list[str]):
    word = str(word).lower()
    if found := matches_cache.get(word):
        return found

    matches = get_close_matches(word, possibilities, n=1, cutoff=0.0)
    match = matches[0] if matches else ''
    score = round(SequenceMatcher(None, word, match).ratio(), 2)
    found = {'best_match': match, 'best_score': score}
    matches_cache[word] = found

    return found         # pd.Series([word, match, score], index = ['product_name', 'match', 'score'])

found_df = unique_clustered_data['go_to_match'].apply(lambda x: get_closest_match(x, master_list))

In [15]:
len(matches_cache)

13659

In [16]:
dnt_found_df = found_df.copy()

In [17]:
found_df = found_df.apply(pd.Series)
found_df.head()

Unnamed: 0,best_match,best_score
0,murphy foliar feed,0.7
1,maxcare reviva,0.59
2,amicop 50 wp,0.56
3,vital farasi,0.48
4,raxil super 375,0.57


In [18]:
unique_clustered_data.head()
new_dnt_ucd_df = unique_clustered_data.copy()

In [19]:
unique_clustered_data = pd.concat([unique_clustered_data, found_df], axis = 1)
unique_clustered_data = unique_clustered_data.reset_index(drop=True)
unique_clustered_data = unique_clustered_data.loc[:, ~unique_clustered_data.columns.duplicated()]
unique_clustered_data.head(20)

Unnamed: 0,product_name,cluster_name,best_product_match,match,score,go_to_match,best_match,best_score
0,laibuta foliar feeds,laibuta foliar feed,murphy foliar feed,laibuta foliar feed,0.97,laibuta foliar feed,murphy foliar feed,0.7
1,white candles - riva,candles riva,white line - mint & coriander,candles riva,0.79,candles riva,maxcare reviva,0.59
2,ampiclo-dawa dry syrup,ampiclo dawa,vitastar pet syrup,ampiclo dawa,0.69,ampiclo dawa,amicop 50 wp,0.56
3,carvedilol(vidol)6.25mg,vidol carvedilol,clopidol,vidol carvedilol,0.5,vidol carvedilol,vital farasi,0.48
4,amoxil 'o'125mg,amoxil syrup,mineral 1-5725,amoxil syrup,0.5,amoxil syrup,raxil super 375,0.57
5,dryblack(n7)cheap (btty)battery,n7 battery,hyacinth beans (valore),n7 battery,0.48,n7 battery,battery cover,0.58
6,baraka fertilizer,fertilizer dap baraka,fanisi tea fertilizer,fanisi tea fertilizer,0.68,fertilizer dap baraka,frontline spray,0.56
7,omeprazole/ompra caps,omeprazole caps 20mg,beetle patch cap,omeprazole caps 20mg,0.73,omeprazole caps 20mg,soprano sc 250,0.59
8,1kg urea mea chapa,urea mea 46%n,hp supreme (can),urea mea 46%n,0.58,urea mea 46%n,furadan 4f,0.52
9,505,505 ws wh,hai 450,505 ws wh,0.5,505 ws wh,doxy-500 ws,0.5


In [20]:
unique_clustered_data = unique_clustered_data.reset_index(drop=True)
unique_clustered_data = unique_clustered_data.loc[:, ~unique_clustered_data.columns.duplicated()]
unique_clustered_data.head()

Unnamed: 0,product_name,cluster_name,best_product_match,match,score,go_to_match,best_match,best_score
0,laibuta foliar feeds,laibuta foliar feed,murphy foliar feed,laibuta foliar feed,0.97,laibuta foliar feed,murphy foliar feed,0.7
1,white candles - riva,candles riva,white line - mint & coriander,candles riva,0.79,candles riva,maxcare reviva,0.59
2,ampiclo-dawa dry syrup,ampiclo dawa,vitastar pet syrup,ampiclo dawa,0.69,ampiclo dawa,amicop 50 wp,0.56
3,carvedilol(vidol)6.25mg,vidol carvedilol,clopidol,vidol carvedilol,0.5,vidol carvedilol,vital farasi,0.48
4,amoxil 'o'125mg,amoxil syrup,mineral 1-5725,amoxil syrup,0.5,amoxil syrup,raxil super 375,0.57


In [69]:
# unique_clustered_data[(unique_clustered_data['best_score'] < 0.8) & (unique_clustered_data['best_score'] >= 0.75)].sort_values(by = 'best_score', ascending = False)[-200:-100]

In [21]:
unique_clustered_data['correct_match'] = np.where(unique_clustered_data['best_score'] >= 0.75, unique_clustered_data['best_match'], unique_clustered_data['go_to_match'])
unique_clustered_data.head()

Unnamed: 0,product_name,cluster_name,best_product_match,match,score,go_to_match,best_match,best_score,correct_match
0,laibuta foliar feeds,laibuta foliar feed,murphy foliar feed,laibuta foliar feed,0.97,laibuta foliar feed,murphy foliar feed,0.7,laibuta foliar feed
1,white candles - riva,candles riva,white line - mint & coriander,candles riva,0.79,candles riva,maxcare reviva,0.59,candles riva
2,ampiclo-dawa dry syrup,ampiclo dawa,vitastar pet syrup,ampiclo dawa,0.69,ampiclo dawa,amicop 50 wp,0.56,ampiclo dawa
3,carvedilol(vidol)6.25mg,vidol carvedilol,clopidol,vidol carvedilol,0.5,vidol carvedilol,vital farasi,0.48,vidol carvedilol
4,amoxil 'o'125mg,amoxil syrup,mineral 1-5725,amoxil syrup,0.5,amoxil syrup,raxil super 375,0.57,amoxil syrup


In [22]:
len(unique_clustered_data[unique_clustered_data['best_score'] >= 0.75])

27075

In [27]:
unique_clustered_data_merge = unique_clustered_data[['product_name', 'correct_match']]

clustered_data = clustered_data.merge(unique_clustered_data_merge, on='product_name', how='left')

columns_list = clustered_data.columns.tolist()
columns_list.remove('correct_match')
columns_list.insert(2, 'correct_match')
clustered_data = clustered_data[columns_list]

clustered_data.head()

Unnamed: 0,product_name,cluster_name,correct_match,best_product_match,product_match_score,manufacturer_name,best_manufacturer_match,manufacturer_match_score,label
0,laibuta foliar feeds,laibuta foliar feed,laibuta foliar feed,murphy foliar feed,0.68,laib,eabl,0.22,5598
1,white candles - riva,candles riva,candles riva,white line - mint & coriander,0.49,halar industries ltd,zaam industries ltd.,0.83,1253
2,ampiclo-dawa dry syrup,ampiclo dawa,ampiclo dawa,vitastar pet syrup,0.4,.,mea ltd.,0.2,8257
3,carvedilol(vidol)6.25mg,vidol carvedilol,vidol carvedilol,clopidol,0.45,cosmos ltd,cosmos ltd.,0.91,2806
4,amoxil 'o'125mg,amoxil syrup,amoxil syrup,mineral 1-5725,0.41,gsk,wellstock,0.31,8278


In [24]:
# clustered_data.to_csv('final_clustered_data.csv', index = False)