### Importing packages

In [8]:
import pandas as pd
import numpy as np

from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.metrics import silhouette_score
from scipy.spatial.distance import cdist
from difflib import SequenceMatcher, get_close_matches
import re
from tqdm import tqdm, trange
import time

import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

In [2]:
products_df = pd.read_csv('/home/natasha/Documents/Iprocure/Clustering-for-Product-Matching/data/data_v2/subsequent_unmatched_products.csv')
products_df

Unnamed: 0,product_name,best_product_match,product_match_score
0,Capacitor,plantector,0.42
1,Starter Crumbs Fugo,broiler starter crumbs,0.68
2,Rewinding 1HP induction geared motor,biodistinction extra,0.36
3,pizza small chopping board,dairy meal economy okoa,0.49
4,Bamboo Chopping Bord,choline chloride,0.39
...,...,...,...
1403372,"aldrop yellow 6""",carophyll yellow 10%,0.67
1403373,Rocket,rocket 44 ec,0.67
1403374,Salad,seal,0.67
1403375,Heat inducer,hinder,0.67


In [4]:
products_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1403377 entries, 0 to 1403376
Data columns (total 3 columns):
 #   Column               Non-Null Count    Dtype  
---  ------               --------------    -----  
 0   product_name         1403377 non-null  object 
 1   best_product_match   1403377 non-null  object 
 2   product_match_score  1403377 non-null  float64
dtypes: float64(1), object(2)
memory usage: 32.1+ MB


### Data Preprocessing

In [6]:
# changing all strings to lowercase
products_df = products_df.applymap(lambda x: x.lower().strip() if isinstance(x, str) else x)
products_df.head()

Unnamed: 0,product_name,best_product_match,product_match_score
0,capacitor,plantector,0.42
1,starter crumbs fugo,broiler starter crumbs,0.68
2,rewinding 1hp induction geared motor,biodistinction extra,0.36
3,pizza small chopping board,dairy meal economy okoa,0.49
4,bamboo chopping bord,choline chloride,0.39


In [7]:
products_df['product_name'].nunique()

66005

In [9]:
# filtering for unique product names
unique_product_names = products_df['product_name'].unique()
len(unique_product_names)

66005

### Clustering

In [10]:
# vectorizing the product names
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(unique_product_names)

In [11]:
# clustering the data
num_clusters = 10000
kmeans = KMeans(n_clusters=num_clusters, random_state=42).fit(tfidf_matrix)
labels = kmeans.labels_

# creating a dataframe of the clusters
cluster_to_name = {}
for label in set(labels):
    indices = np.where(labels == label)[0]
    names = unique_product_names[indices].tolist()
    cluster_to_name[label] = names
    
# viewing the clusters
cluster_1_df = pd.DataFrame.from_dict(cluster_to_name.items())
cluster_1_df.rename(columns={0: 'cluster_id', 1: 'product_names'}, inplace=True)
cluster_1_df.set_index('cluster_id', inplace=True)
pd.set_option('display.max_colwidth', None)
cluster_1_df.head(5)

Unnamed: 0_level_0,product_names
cluster_id,Unnamed: 1_level_1
0,"[oraimo data cable, phone charging cable(oraimo), oraimo original cable, oraimo smart cable original, micro oraimo cable, oraimo cable]"
1,"[block, reva block red, reva block red block, reva red block]"
2,"[chick feeder small, feeder chick, chick feeder 24 slots, small chick feeder]"
3,"[dairy maxpro plus, farm dairy meal maxpro, dairy meal maxpro plus, maxpro dairy meal, maxpro plus dairy meal, maxpro dairy, pio maxpro plus dairy meal, dairy meal maxpro, maxpro dairy meal]"
4,"[highland stocklick- all purpose, all purpose stocklick, pharma all purpose stocklick, daima stocklick all purpose, hylick all purpose stocklick]"


In [16]:
cluster_1_df['cluster_size'] = cluster_1_df['product_names'].apply(lambda x: len(x))
cluster_1_df[cluster_1_df['cluster_size'] == 1]

Unnamed: 0_level_0,product_names,cluster_size
cluster_id,Unnamed: 1_level_1,Unnamed: 2_level_1
9,[kienyeji bora],1
10,[crop champion (npk) 20:20:20],1
11,[max pollard],1
22,[maize mixture],1
25,[green organic 1ltr],1
...,...,...
9923,[gazette],1
9962,[otc10%],1
9982,[cp/tg(silver)org steeringbar(007)],1
9989,[233cables],1


In [17]:
unique_names_df = pd.DataFrame({'product_name': unique_product_names,
                                'label': labels})
unique_names_df.head()
len(unique_names_df)

66005

### Topic Modeling

In [18]:
# extracting most common words from each cluster in order
cluster_word_freq = {}

for doc, cluster_label in zip(unique_product_names, labels):
    words = re.split(r'\s+|-|\(|\)|/|\\|\||,', doc)
    for word in words:
        if cluster_label in cluster_word_freq:
            cluster_word_freq[cluster_label][word] = cluster_word_freq[cluster_label].get(word, 0) + 1
        else:
            cluster_word_freq[cluster_label] = {word: 1}
    
for cluster_label in cluster_word_freq:
    cluster_word_freq[cluster_label] = sorted(cluster_word_freq[cluster_label].items(), key=lambda x: x[1], reverse=True)
    
cluster_word_freq_df = pd.DataFrame.from_dict(cluster_word_freq.items())
cluster_word_freq_df.rename(columns={0: 'cluster_id', 1: 'word_freq'}, inplace=True)
cluster_word_freq_df.head()

Unnamed: 0,cluster_id,word_freq
0,7273,"[(capacitor, 2), (200, 1)]"
1,2180,"[(crumbs, 7), (fugo, 7), (starter, 6), (, 5), (50kg, 2), (10kg, 1), (25kg, 1), (1kg, 1)]"
2,8491,"[(motor, 12), (rewinding, 9), (induction, 8), (geared, 3), (1.5hp, 3), (2hp, 2), (1hp, 1), (3phase, 1), (5.5hp, 1), (7.5hp, 1), (3ph, 1), (1, 1), (hp, 1), (0.75kw, 1), (30hp, 1)]"
3,1409,"[(chopping, 9), (board, 9), (small, 3), (marble, 2), (big, 2), (pizza, 1), (@400, 1), (medium, 1), (knife, 1), (set, 1), (with, 1)]"
4,7282,"[(bamboo, 3), (chopping, 3), (board, 2), (bord, 1), (small, 1), (big, 1)]"


In [28]:
cluster_word_freq_df['cluster_name'] = cluster_word_freq_df['word_freq'].apply(lambda x: ' '.join(word[0] for word in x[:3] if word[0] != ' '))
cluster_word_freq_df

Unnamed: 0,cluster_id,word_freq,cluster_name
0,7273,"[(capacitor, 2), (200, 1)]",capacitor 200
1,2180,"[(crumbs, 7), (fugo, 7), (starter, 6), (, 5), (50kg, 2), (10kg, 1), (25kg, 1), (1kg, 1)]",crumbs fugo starter
2,8491,"[(motor, 12), (rewinding, 9), (induction, 8), (geared, 3), (1.5hp, 3), (2hp, 2), (1hp, 1), (3phase, 1), (5.5hp, 1), (7.5hp, 1), (3ph, 1), (1, 1), (hp, 1), (0.75kw, 1), (30hp, 1)]",motor rewinding induction
3,1409,"[(chopping, 9), (board, 9), (small, 3), (marble, 2), (big, 2), (pizza, 1), (@400, 1), (medium, 1), (knife, 1), (set, 1), (with, 1)]",chopping board small
4,7282,"[(bamboo, 3), (chopping, 3), (board, 2), (bord, 1), (small, 1), (big, 1)]",bamboo chopping board
...,...,...,...
9995,2786,"[(yellow, 1), (insulating, 1), (tape, 1), (small, 1)]",yellow insulating tape
9996,7486,"[(nitronor, 2), (34%, 1)]",nitronor 34%
9997,2889,"[(flukenor, 1), (500, 1)]",flukenor 500
9998,581,"[(globe, 1), (master, 1), (50g, 1), (e.a, 1)]",globe master 50g


In [29]:
cluster_word_freq_df[:50]

Unnamed: 0,cluster_id,word_freq,cluster_name
0,7273,"[(capacitor, 2), (200, 1)]",capacitor 200
1,2180,"[(crumbs, 7), (fugo, 7), (starter, 6), (, 5), (50kg, 2), (10kg, 1), (25kg, 1), (1kg, 1)]",crumbs fugo starter
2,8491,"[(motor, 12), (rewinding, 9), (induction, 8), (geared, 3), (1.5hp, 3), (2hp, 2), (1hp, 1), (3phase, 1), (5.5hp, 1), (7.5hp, 1), (3ph, 1), (1, 1), (hp, 1), (0.75kw, 1), (30hp, 1)]",motor rewinding induction
3,1409,"[(chopping, 9), (board, 9), (small, 3), (marble, 2), (big, 2), (pizza, 1), (@400, 1), (medium, 1), (knife, 1), (set, 1), (with, 1)]",chopping board small
4,7282,"[(bamboo, 3), (chopping, 3), (board, 2), (bord, 1), (small, 1), (big, 1)]",bamboo chopping board
5,3677,"[(gypsum, 9), (screw, 4), (1, 2), (2, 1), (1"", 1), (charnnel, 1), (1½"", 1), (board, 1), (sturd, 1), (gypsum{skirmcoat}, 1), (20kg, 1), (plaster, 1), (powder, 1)]",gypsum screw 1
6,2085,"[(malezi, 7), (feeds, 5), (, 4), (dairy, 2), (growers, 2), (dog, 1), (food, 1), (layer, 1), (meal, 1), (kienyeji, 1), (mash, 1)]",malezi feeds
7,4560,"[(juugo, 4), (maziwa, 2), (nyama, 1), (dairy, 1)]",juugo maziwa nyama
8,4027,"[(pioneer, 8), (dairy, 7), (maxpro, 6), (meal, 5), (plus, 4), (, 2), (maxplus, 1)]",pioneer dairy maxpro
9,1730,"[(coldcap, 13), (caps, 3), (syrup, 3), (orginal, 1), (original, 1), (100mls, 1), (day, 1), (night, 1), (capsules, 1), (generic, 1), (srp, 1), (syr, 1), (100ml, 1), (syrp, 1)]",coldcap caps syrup


In [30]:
cluster_merge = cluster_word_freq_df[['cluster_id', 'cluster_name']]
cluster_merge = cluster_merge.rename(columns={'cluster_id': 'label'})

products_df = products_df.merge(unique_names_df, on='product_name', how='left')
products_df = products_df.merge(cluster_merge, on='label', how='left')

columns = products_df.columns.tolist()
columns.remove('cluster_name')
columns.insert(1, 'cluster_name')
products_df = products_df[columns]

products_df[:50]
products_df.to_csv('clean_subsequent_products.csv', index=False)

### Clustered Data

In [None]:
unique_clustered_data = products_df[['product_name', 'cluster_name', 'best_product_match']].drop_duplicates(subset=['product_name'], keep='first').reset_index(drop=True)
unique_clustered_data

In [None]:
# cleanup function
def compare(row):
    comparison = {}
    i = row['product_name']
    prods_list = row[['cluster_name', 'best_product_match']].tolist()
    if isinstance(i, str):
       comparison.update({i: get_close_matches(i, prods_list, n=1, cutoff=0.1)})
    product_name = list(comparison.keys()) if comparison else None
    match = []
    score = []
    if comparison:
       for key, value in comparison.items():
           if value:
              match.append(value[0])
              score.append(round(SequenceMatcher(None, i, value[0]).ratio(), 2))
           else:
              match.append(None)
              score.append(None)
    else:
       match.append(None)
       score.append(None)
              
    return pd.Series([match, score], index = ['match', 'score'])

In [None]:
for i in tqdm(range(10), desc="Processing", ncols=80, bar_format="{l_bar}{bar}| {n_fmt}/{total_fmt}"):
    unique_clustered_data[['match', 'score']] = [compare(row) for _, row in unique_clustered_data.iterrows()]
    unique_clustered_data.head()

In [None]:
dnd_unique_clustered_data = unique_clustered_data.copy()

In [None]:
unique_clustered_data['match'] = unique_clustered_data['match'].apply(lambda x: x[0])
unique_clustered_data['score'] = unique_clustered_data['score'].apply(lambda x: x[0])

unique_clustered_data

In [None]:
unique_clustered_data['go_to_match'] = np.where(unique_clustered_data['score'] >= 0.7, unique_clustered_data['match'], unique_clustered_data['cluster_name'])
unique_clustered_data

### Cleaning against master list