In [1]:
import pandas as pd
import numpy as np
from difflib import SequenceMatcher, get_close_matches
from tqdm import tqdm
from collections import Counter
from fuzzywuzzy import fuzz
import re

import warnings
warnings.filterwarnings(action='ignore')

In [2]:
df_man = pd.read_csv('../../data/data_v2/subsequent_manufacturers.csv')
df_man = df_man.applymap(lambda x: x.strip().lower() if isinstance(x, str) else x)
df_manufacturer = df_man[df_man['manufacturer_match_score'] < 0.8]
df_manufacturer.head()

Unnamed: 0,manufacturer_name,best_manufacturer_match,manufacturer_match_score
5,local,oic ltd,0.15
16,fugo,pz cussons,0.27
22,local,oic ltd,0.15
31,mukurwe-ini dairy,zagro-india,0.41
33,xxxx,apex,0.22


In [3]:
df_non_dup = df_manufacturer.drop_duplicates(subset='manufacturer_name', keep='first')
df_non_dup['word_count'] = df_manufacturer['manufacturer_name'].apply(lambda x: len(x.split()) if isinstance(x, str) else 1)
df_non_dup['manufacturer_name'] = df_non_dup['manufacturer_name'].astype('str')

manufacturer_list = []

for index, row in df_non_dup.iterrows():
    word_count = row['word_count']
    manufacturer_name = row['manufacturer_name']
    
    if word_count in [1,2]:
        manufacturer_slice = manufacturer_name.strip().split()[:1]
        manufacturer_list.append(' '.join(manufacturer_slice))
    elif word_count in [3,4,5]:
        manufacturer_slice = manufacturer_name.strip().split()[:2]
        manufacturer_list.append(' '.join(manufacturer_slice))
    else:
        manufacturer_slice = manufacturer_name.strip().split()[:3]
        manufacturer_list.append(' '.join(manufacturer_slice))
           
df_non_dup['manufacturer_slice'] = manufacturer_list        
df_non_dup = df_non_dup.reset_index(drop=True)
df_non_dup.head()

Unnamed: 0,manufacturer_name,best_manufacturer_match,manufacturer_match_score,word_count,manufacturer_slice
0,local,oic ltd,0.15,1,local
1,fugo,pz cussons,0.27,1,fugo
2,mukurwe-ini dairy,zagro-india,0.41,2,mukurwe-ini
3,xxxx,apex,0.22,1,xxxx
4,g&g,hebei yuangzheng,0.2,1,g&g


In [4]:
df_non_dup.shape

(18619, 5)

In [5]:
df_manufacturer_slice = df_non_dup['manufacturer_slice'].drop_duplicates(keep='first').to_frame()
df_manufacturer_slice = df_manufacturer_slice.reset_index(drop=True)
df_manufacturer_slice

Unnamed: 0,manufacturer_slice
0,local
1,fugo
2,mukurwe-ini
3,xxxx
4,g&g
...,...
12693,gloria
12694,kiresi seed
12695,triachem tanzania
12696,tags


In [6]:
matched = []
def compare(i):
    compare = {}
    if i in matched:
        compare.update({i: ''})
    else:
        compare.update({i: get_close_matches(i, df_manufacturer_slice['manufacturer_slice'].to_list(), 20, 0.85)})
    # matched.extend(compare.values())
    matched.extend([item for sublist in compare.values() for item in sublist])
    manufacturer_slice = list(compare.keys())
    match = []
    for key, items in compare.items():
      match.append(items)
    return pd.Series([manufacturer_slice, match],index=['manufacturer_slice', 'match'])

      
cleaned_manufacturers_df = pd.DataFrame()
cleaned_manufacturers_df[['manufacturer_slice', 'match']] = df_manufacturer_slice['manufacturer_slice'].apply(lambda x: compare(x))
cleaned_manufacturers_df = cleaned_manufacturers_df.applymap(lambda x: x[0] if x else '')
cleaned_manufacturers_df.head()

Unnamed: 0,manufacturer_slice,match
0,local,"[local, localy, locals, locali, local., local,..."
1,fugo,"[fugo, fungo, fuo, fug]"
2,mukurwe-ini,[mukurwe-ini]
3,xxxx,"[xxxx, xxx]"
4,g&g,[g&g]


In [7]:
len(set(matched))

12698

In [8]:
dnd_cleaned_manufacturers_df = cleaned_manufacturers_df.copy()
cleaned_manufacturers_df['manufacturer_slice'] = cleaned_manufacturers_df['manufacturer_slice'].astype('str')

In [9]:
filtered_df = cleaned_manufacturers_df[cleaned_manufacturers_df['match'].apply(lambda x: len(x) == 0)]
filtered_df

Unnamed: 0,manufacturer_slice,match
84,somo,
131,intercheme,
151,xxx,
152,lcal,
192,loal,
...,...,...
12682,nexigen,
12684,amamy green,
12686,olympics,
12687,pt mandom,


In [10]:
for i, row in cleaned_manufacturers_df.iterrows():
    string = row['manufacturer_slice']
    lst = row['match']
    
    # Check if the string exists in any list of previous rows
    if not lst:
        for prev_i in range(i):
            prev_lst = cleaned_manufacturers_df.at[prev_i, 'match']
            if string in prev_lst:
                cleaned_manufacturers_df.at[i, 'match'] = prev_lst
                break  # Stop searching after finding the first match

# Reset the index of the DataFrame
# cleaned_manufacturers_df_subset = cleaned_manufacturers_df_subset.reset_index(drop=True)

pd.set_option('display.max_colwidth', None)
cleaned_manufacturers_df.head()

Unnamed: 0,manufacturer_slice,match
0,local,"[local, localy, locals, locali, local., local,, locl, loca, loal, lcal]"
1,fugo,"[fugo, fungo, fuo, fug]"
2,mukurwe-ini,[mukurwe-ini]
3,xxxx,"[xxxx, xxx]"
4,g&g,[g&g]


In [11]:
dnd_df_non_dup = df_non_dup.copy()

In [12]:
df_non_dup = pd.merge(df_non_dup, cleaned_manufacturers_df, how='left', on='manufacturer_slice')
df_non_dup.head()

Unnamed: 0,manufacturer_name,best_manufacturer_match,manufacturer_match_score,word_count,manufacturer_slice,match
0,local,oic ltd,0.15,1,local,"[local, localy, locals, locali, local., local,, locl, loca, loal, lcal]"
1,fugo,pz cussons,0.27,1,fugo,"[fugo, fungo, fuo, fug]"
2,mukurwe-ini dairy,zagro-india,0.41,2,mukurwe-ini,[mukurwe-ini]
3,xxxx,apex,0.22,1,xxxx,"[xxxx, xxx]"
4,g&g,hebei yuangzheng,0.2,1,g&g,[g&g]


In [13]:
# Initialize an empty list to store the similar strings
similar_strings = []

# Iterate over the rows of the DataFrame
for i, row in df_non_dup.iterrows():
    string = row['manufacturer_name']
    value = row['match']
    
    # Check if any other row has a similar value in 'col2'
    similar_rows = df_non_dup[df_non_dup['match'].apply(lambda x: x == value)]
    
    # Extract the strings from 'col1' in similar rows
    similar_strings.append(similar_rows['manufacturer_name'].tolist())

similar_strings[:5]

[['local',
  'lcal',
  'loal',
  'locl',
  'local,',
  'localy',
  'local manufacture',
  'local supplier',
  'local nairobi',
  'local manufacturer',
  'local artisan',
  'local juakali',
  'loca',
  'locali',
  'local providers',
  'local seller',
  'localy made',
  'local.',
  'local industries',
  'local manufactur',
  'local any',
  'local carpenters',
  'locals',
  'local ltd.'],
 ['fugo',
  'fug',
  'fugo company',
  'fuo',
  'fugo feeds',
  'fugo ltd',
  'fungo',
  'fugo vits',
  'fugo unga',
  'fugo feed',
  'fugo vit'],
 ['mukurwe-ini dairy'],
 ['xxxx', 'xxx'],
 ['g&g']]

In [14]:
df_similar_strings = pd.DataFrame({'similar_strings': similar_strings})
df_similar_strings = df_similar_strings['similar_strings'].drop_duplicates(keep='first').to_frame().reset_index(drop=True)
df_similar_strings.head()

Unnamed: 0,similar_strings
0,"[local, lcal, loal, locl, local,, localy, local manufacture, local supplier, local nairobi, local manufacturer, local artisan, local juakali, loca, locali, local providers, local seller, localy made, local., local industries, local manufactur, local any, local carpenters, locals, local ltd.]"
1,"[fugo, fug, fugo company, fuo, fugo feeds, fugo ltd, fungo, fugo vits, fugo unga, fugo feed, fugo vit]"
2,[mukurwe-ini dairy]
3,"[xxxx, xxx]"
4,[g&g]


In [15]:
df_unique_match = df_non_dup['match'].drop_duplicates(keep='first').to_frame().reset_index(drop=True)
df_unique_match = pd.concat([df_unique_match, df_similar_strings],axis = 1)
df_unique_match.head()

Unnamed: 0,match,similar_strings
0,"[local, localy, locals, locali, local., local,, locl, loca, loal, lcal]","[local, lcal, loal, locl, local,, localy, local manufacture, local supplier, local nairobi, local manufacturer, local artisan, local juakali, loca, locali, local providers, local seller, localy made, local., local industries, local manufactur, local any, local carpenters, locals, local ltd.]"
1,"[fugo, fungo, fuo, fug]","[fugo, fug, fugo company, fuo, fugo feeds, fugo ltd, fungo, fugo vits, fugo unga, fugo feed, fugo vit]"
2,[mukurwe-ini],[mukurwe-ini dairy]
3,"[xxxx, xxx]","[xxxx, xxx]"
4,[g&g],[g&g]


In [16]:
dnd_df_unique_match = df_unique_match.copy()

In [17]:
df_unique_match = dnd_df_unique_match.copy()
df_unique_match['average_length'] = df_unique_match['similar_strings'].apply(lambda x: round(sum(len(word.split()) for word in x) / len(x), 0))
df_unique_match.head()

Unnamed: 0,match,similar_strings,average_length
0,"[local, localy, locals, locali, local., local,, locl, loca, loal, lcal]","[local, lcal, loal, locl, local,, localy, local manufacture, local supplier, local nairobi, local manufacturer, local artisan, local juakali, loca, locali, local providers, local seller, localy made, local., local industries, local manufactur, local any, local carpenters, locals, local ltd.]",2.0
1,"[fugo, fungo, fuo, fug]","[fugo, fug, fugo company, fuo, fugo feeds, fugo ltd, fungo, fugo vits, fugo unga, fugo feed, fugo vit]",2.0
2,[mukurwe-ini],[mukurwe-ini dairy],2.0
3,"[xxxx, xxx]","[xxxx, xxx]",1.0
4,[g&g],[g&g],1.0


In [18]:
# df_unique_match['cluster_name'] = df_unique_match['similar_strings'].apply(lambda x: x[0])
# df_unique_match

In [19]:
for id, row in df_unique_match.iterrows():
    names = row['similar_strings']
    
    filtered_names = [item for item in names if item]
    counter = Counter(filtered_names)
    most_common_name = counter.most_common(1)[0][0]
    
    df_unique_match.at[id, 'common_name'] = most_common_name
    
df_unique_match.head()

Unnamed: 0,match,similar_strings,average_length,common_name
0,"[local, localy, locals, locali, local., local,, locl, loca, loal, lcal]","[local, lcal, loal, locl, local,, localy, local manufacture, local supplier, local nairobi, local manufacturer, local artisan, local juakali, loca, locali, local providers, local seller, localy made, local., local industries, local manufactur, local any, local carpenters, locals, local ltd.]",2.0,local
1,"[fugo, fungo, fuo, fug]","[fugo, fug, fugo company, fuo, fugo feeds, fugo ltd, fungo, fugo vits, fugo unga, fugo feed, fugo vit]",2.0,fugo
2,[mukurwe-ini],[mukurwe-ini dairy],2.0,mukurwe-ini dairy
3,"[xxxx, xxx]","[xxxx, xxx]",1.0,xxxx
4,[g&g],[g&g],1.0,g&g


In [20]:
# df_unique_match['similar_strings'] = df_unique_match['similar_strings'].apply(lambda x: ' '.join(x))

# # extracting most common words from each cluster in order
# cluster_word_freq = {}


# for id, row in df_unique_match.iterrows():
#     cluster = row['similar_strings']
    
#     words = re.split(r'\s+|-|\(|\)|/|\\|\||,', cluster)
#     for word in words:
#         if id in cluster_word_freq:
#             cluster_word_freq[id][word] = cluster_word_freq[id].get(word, 0) + 1
#         else:
#             cluster_word_freq[id] = {word: 1}
    
# for id in cluster_word_freq:
#     cluster_word_freq[id] = sorted(cluster_word_freq[id].items(), key=lambda x: x[1], reverse=True)
#     # cluster_word_freq[id] = list(cluster_word_freq[id].items())
#     # cluster_word_freq[id] = cluster_word_freq[id].items()

# cluster_word_freq_df = pd.DataFrame.from_dict(cluster_word_freq.items())
# cluster_word_freq_df.rename(columns={0: 'id', 1: 'word_freq'}, inplace=True)
# cluster_word_freq_df.head()

In [21]:
# # extracting most common words from each cluster in order
# cluster_word_freq = {}


# for id, row in df_unique_match.iterrows():
#     cluster = row['similar_strings']
    
#     words = re.split(r'\s+|-|\(|\)|/|\\|\||,', cluster)
#     for word in words:
#         if id in cluster_word_freq:
#             cluster_word_freq[id][word] = cluster_word_freq[id].get(word, 0) + 1
#         else:
#             cluster_word_freq[id] = {word: 1}
    
# for id in cluster_word_freq:
#     # cluster_word_freq[id] = sorted(cluster_word_freq[id].items(), key=lambda x: x[1], reverse=True)
#     cluster_word_freq[id] = list(cluster_word_freq[id].items())
#     # cluster_word_freq[id] = cluster_word_freq[id].items()

# cluster_word_freq_df = pd.DataFrame.from_dict(cluster_word_freq.items())
# cluster_word_freq_df.rename(columns={0: 'id', 1: 'word_freq'}, inplace=True)
# cluster_word_freq_df.head()

In [22]:
# cluster_word_freq_df = pd.concat([cluster_word_freq_df, df_unique_match[['average_length', 'common_name']]], axis=1)
# cluster_word_freq_df.head()

In [23]:
# dnd_cluster_word_freq_df = cluster_word_freq_df.copy()

In [24]:
# for i, row in cluster_word_freq_df.iterrows():
#     lst = row['word_freq']
#     number = int(row['average_length'])

#     cluster_name = ' '.join(word[0] for word in lst[:number])
#     cluster_word_freq_df.at[i, 'cluster_name'] = cluster_name
    
# cluster_word_freq_df.head()

In [25]:
df_unique_match.head()

Unnamed: 0,match,similar_strings,average_length,common_name
0,"[local, localy, locals, locali, local., local,, locl, loca, loal, lcal]","[local, lcal, loal, locl, local,, localy, local manufacture, local supplier, local nairobi, local manufacturer, local artisan, local juakali, loca, locali, local providers, local seller, localy made, local., local industries, local manufactur, local any, local carpenters, locals, local ltd.]",2.0,local
1,"[fugo, fungo, fuo, fug]","[fugo, fug, fugo company, fuo, fugo feeds, fugo ltd, fungo, fugo vits, fugo unga, fugo feed, fugo vit]",2.0,fugo
2,[mukurwe-ini],[mukurwe-ini dairy],2.0,mukurwe-ini dairy
3,"[xxxx, xxx]","[xxxx, xxx]",1.0,xxxx
4,[g&g],[g&g],1.0,g&g


In [26]:
# loading the master list
master_list_df = pd.read_csv('/home/natasha/Documents/Iprocure/Clustering-for-Product-Matching/data/data_v1/master_list.csv')
master_list_df.head()

Unnamed: 0,combined_name,product_name,manufacturer_name
0,aviboost aqua block; vital animal health,aviboost aqua block,vital animal health
1,aviboost cl-x blue; vital animal health,aviboost cl-x blue,vital animal health
2,aviboost nutri block; vital animal health,aviboost nutri block,vital animal health
3,aviboost spectrum; vital animal health,aviboost spectrum,vital animal health
4,aviboost poultry tonic; vital animal health,aviboost poultry tonic,vital animal health


In [27]:
df_unique_match['common_name'] = df_unique_match['common_name'].astype('str')
master_list_df['manufacturer_name'] = master_list_df['manufacturer_name'].astype('str')

In [28]:
matches_cache = {}
master_list = master_list_df['manufacturer_name'].to_list()

# def get_closest_match(word, possibilities: list[str]):
#     word = str(word).lower()
#     if found := matches_cache.get(word):
#         return found

#     matches = get_close_matches(word, possibilities, n=1, cutoff=0.0)
#     match = matches[0] if matches else ''
#     score = round(SequenceMatcher(None, word, match).ratio(), 2)
#     found = {'best_match': match, 'best_score': score}
#     matches_cache[word] = found

#     return found         # pd.Series([word, match, score], index = ['product_name', 'match', 'score'])

def get_closest_match(word, possibilities: list[str]):
    word = str(word).lower()
    if found := matches_cache.get(word):
        return found

    matches = [possibility for possibility in possibilities if fuzz.partial_ratio(word, possibility) > 0]
    match = max(matches, key=lambda x: fuzz.partial_ratio(word, x)) if matches else ''
    score = round(fuzz.partial_ratio(word, match), 2)
    found = {'best_match': match, 'best_score': score}
    matches_cache[word] = found

    return found

found_df = df_unique_match['common_name'].apply(lambda x: get_closest_match(x, master_list))

In [29]:
dnt_found_df = found_df.copy()

In [30]:
found_df = found_df.apply(pd.Series)
found_df.head()

Unnamed: 0,best_match,best_score
0,global nutrition solutions,80
1,elgon kenya ltd.,50
2,zagro-india,55
3,1. huvepharma-bulgaria 2. tianjin xinxing veterinary pharmaceuticals -china,50
4,export trading group,67


In [31]:
df_unique_match = pd.concat([df_unique_match, found_df], axis=1)
df_unique_match.head()

Unnamed: 0,match,similar_strings,average_length,common_name,best_match,best_score
0,"[local, localy, locals, locali, local., local,, locl, loca, loal, lcal]","[local, lcal, loal, locl, local,, localy, local manufacture, local supplier, local nairobi, local manufacturer, local artisan, local juakali, loca, locali, local providers, local seller, localy made, local., local industries, local manufactur, local any, local carpenters, locals, local ltd.]",2.0,local,global nutrition solutions,80
1,"[fugo, fungo, fuo, fug]","[fugo, fug, fugo company, fuo, fugo feeds, fugo ltd, fungo, fugo vits, fugo unga, fugo feed, fugo vit]",2.0,fugo,elgon kenya ltd.,50
2,[mukurwe-ini],[mukurwe-ini dairy],2.0,mukurwe-ini dairy,zagro-india,55
3,"[xxxx, xxx]","[xxxx, xxx]",1.0,xxxx,1. huvepharma-bulgaria 2. tianjin xinxing veterinary pharmaceuticals -china,50
4,[g&g],[g&g],1.0,g&g,export trading group,67


In [32]:
df_unique_match[(df_unique_match['best_score'] < 90) & (df_unique_match['best_score'] >= 85)][:50]

Unnamed: 0,match,similar_strings,average_length,common_name,best_match,best_score
29,"[intetchem, intechem, interchem]","[intetchem, intechem]",1.0,intetchem,interchemie werken ‘de adelaar’ b.v. holland,89
126,"[neo, neon, nemo]","[neo life, neo cell, nemo vets, neon]",2.0,neo life,neolife international ltd.,88
227,"[microp+, microp]","[microp+, microp yara]",2.0,microp+,cmi-cropcare (k) ltd.,86
278,"[jangara, sangara]","[jangara, sangara]",1.0,jangara,"veko care pvt. limited, mr. narayan undre, plot no. e-48&49, midc, ranjangaaon, district-pune, maharashtra, pin code: 412220. india",86
303,"[aishish, ashish, ashishi]","[aishish, ashish life, ashish company, ashishi, ashish, ashish ltd]",2.0,aishish,ashish life science pvt ltd.,86
311,"[sanofi, sanofil]","[sanofi pharmaceuticals, sanofi, sanofil, sanofi aventis, sanofi ltd]",2.0,sanofi pharmaceuticals,jiangsu ouke animal pharmaceuticals ltd china & sinochem ningbo ltd,86
313,"[laibota, laibuta, laibita]","[laibota, laibuta company, laibita ltd, laibuta ltd, laibuta, laibuta chem]",2.0,laibota,laibuta chemicals ltd.,86
330,"[india immunological, indian immunological, indian immunologicals]","[india immunological limited, indian immunologicals ltd, indian immunological ltd]",3.0,india immunological limited,"indian immunologicals ltd hyderabad, india",85
362,[hebei huaran],[hebei huaran pharmacy ltd],4.0,hebei huaran pharmacy ltd,"hebei hope harmony pharmaceutical co. ltd hebei huaran pharmacy co.ltd,china,hebei kexing pharmaceutical co. ltd-china",88
504,[envelope],[envelope],1.0,envelope,"1). chongqing fangtong animal pharmaceutical co.ltd, no.80, east part of changzhou road, rongchang, chongqing, china. (2). hebei bimeda pharmaceutical technology co. ltd, no 60 ganjiang road, shijiazhoung economic and technology development zone, china",88


In [33]:
cluster_word_freq_df['chosen_match'] = np.where(cluster_word_freq_df['best_score'] >= 0.83, cluster_word_freq_df['best_match'], cluster_word_freq_df['common_name'])
cluster_word_freq_df.head()

Unnamed: 0,id,word_freq,average_length,common_name,cluster_name,best_match,best_score,chosen_match
0,0,"[(local, 15), (localy, 2), (lcal, 1), (loal, 1), (locl, 1), (, 1), (manufacture, 1), (supplier, 1), (nairobi, 1), (manufacturer, 1), (artisan, 1), (juakali, 1), (loca, 1), (locali, 1), (providers, 1), (seller, 1), (made, 1), (local., 1), (industries, 1), (manufactur, 1), (any, 1), (carpenters, 1), (locals, 1), (ltd., 1)]",2.0,local,local localy,chrysal africa ltd.,0.45,local
1,1,"[(fugo, 8), (fug, 1), (company, 1), (fuo, 1), (feeds, 1), (ltd, 1), (fungo, 1), (vits, 1), (unga, 1), (feed, 1), (vit, 1)]",2.0,fugo,fugo fug,nutreco africa,0.36,fugo
2,2,"[(mukurwe, 1), (ini, 1), (dairy, 1)]",2.0,mukurwe-ini dairy,mukurwe ini,iprocure limited,0.44,mukurwe-ini dairy
3,3,"[(xxxx, 1), (xxx, 1)]",1.0,xxxx,xxxx,apex,0.25,xxxx
4,4,"[(g&g, 1)]",1.0,g&g,g&g,hebei yuangzheng,0.21,g&g


In [34]:
df_unique_match.head()

Unnamed: 0,match,similar_strings,average_length,common_name
0,"[local, localy, locals, locali, local., local,, locl, loca, loal, lcal]","local lcal loal locl local, localy local manufacture local supplier local nairobi local manufacturer local artisan local juakali loca locali local providers local seller localy made local. local industries local manufactur local any local carpenters locals local ltd.",2.0,local
1,"[fugo, fungo, fuo, fug]",fugo fug fugo company fuo fugo feeds fugo ltd fungo fugo vits fugo unga fugo feed fugo vit,2.0,fugo
2,[mukurwe-ini],mukurwe-ini dairy,2.0,mukurwe-ini dairy
3,"[xxxx, xxx]",xxxx xxx,1.0,xxxx
4,[g&g],g&g,1.0,g&g


In [35]:
df_unique_match = pd.concat([df_unique_match[['match', 'similar_strings']], cluster_word_freq_df], axis=1)
# df_unique_match.drop(['id', 'best_match', 'best_score', 'average_length'], axis=1, inplace=True)
# df_unique_match.drop(['word_freq', 'match', 'average_length', 'cluster_name'], axis=1, inplace=True)
df_unique_match.head()

Unnamed: 0,match,similar_strings,average_length,common_name,id,word_freq,average_length.1,common_name.1,cluster_name,best_match,best_score,chosen_match
0,"[local, localy, locals, locali, local., local,, locl, loca, loal, lcal]","local lcal loal locl local, localy local manufacture local supplier local nairobi local manufacturer local artisan local juakali loca locali local providers local seller localy made local. local industries local manufactur local any local carpenters locals local ltd.",2.0,local,0,"[(local, 15), (localy, 2), (lcal, 1), (loal, 1), (locl, 1), (, 1), (manufacture, 1), (supplier, 1), (nairobi, 1), (manufacturer, 1), (artisan, 1), (juakali, 1), (loca, 1), (locali, 1), (providers, 1), (seller, 1), (made, 1), (local., 1), (industries, 1), (manufactur, 1), (any, 1), (carpenters, 1), (locals, 1), (ltd., 1)]",2.0,local,local localy,chrysal africa ltd.,0.45,local
1,"[fugo, fungo, fuo, fug]",fugo fug fugo company fuo fugo feeds fugo ltd fungo fugo vits fugo unga fugo feed fugo vit,2.0,fugo,1,"[(fugo, 8), (fug, 1), (company, 1), (fuo, 1), (feeds, 1), (ltd, 1), (fungo, 1), (vits, 1), (unga, 1), (feed, 1), (vit, 1)]",2.0,fugo,fugo fug,nutreco africa,0.36,fugo
2,[mukurwe-ini],mukurwe-ini dairy,2.0,mukurwe-ini dairy,2,"[(mukurwe, 1), (ini, 1), (dairy, 1)]",2.0,mukurwe-ini dairy,mukurwe ini,iprocure limited,0.44,mukurwe-ini dairy
3,"[xxxx, xxx]",xxxx xxx,1.0,xxxx,3,"[(xxxx, 1), (xxx, 1)]",1.0,xxxx,xxxx,apex,0.25,xxxx
4,[g&g],g&g,1.0,g&g,4,"[(g&g, 1)]",1.0,g&g,g&g,hebei yuangzheng,0.21,g&g


In [36]:
len(df_non_dup)

18619

In [37]:
dnddd_df_unique_match = df_unique_match.copy()
dnddd_df_non_dup = df_non_dup.copy()

In [38]:
df_unique_match = dnddd_df_unique_match.copy()
df_non_dup = dnddd_df_non_dup.copy()

In [39]:
df_unique_match.columns

Index(['match', 'similar_strings', 'average_length', 'common_name', 'id',
       'word_freq', 'average_length', 'common_name', 'cluster_name',
       'best_match', 'best_score', 'chosen_match'],
      dtype='object')

In [40]:
df_unique_match['match'] = df_unique_match['match'].apply(lambda x: ' '.join(x))
df_non_dup['match'] = df_non_dup['match'].apply(lambda x: ' '.join(x))

In [41]:
df_non_dup = df_non_dup.merge(df_unique_match, how='left', on='match')
df_non_dup.head()

Unnamed: 0,manufacturer_name,best_manufacturer_match,manufacturer_match_score,word_count,manufacturer_slice,match,similar_strings,average_length,common_name,id,word_freq,average_length.1,common_name.1,cluster_name,best_match,best_score,chosen_match
0,local,oic ltd,0.15,1,local,"local localy locals locali local. local, locl loca loal lcal","local lcal loal locl local, localy local manufacture local supplier local nairobi local manufacturer local artisan local juakali loca locali local providers local seller localy made local. local industries local manufactur local any local carpenters locals local ltd.",2.0,local,0,"[(local, 15), (localy, 2), (lcal, 1), (loal, 1), (locl, 1), (, 1), (manufacture, 1), (supplier, 1), (nairobi, 1), (manufacturer, 1), (artisan, 1), (juakali, 1), (loca, 1), (locali, 1), (providers, 1), (seller, 1), (made, 1), (local., 1), (industries, 1), (manufactur, 1), (any, 1), (carpenters, 1), (locals, 1), (ltd., 1)]",2.0,local,local localy,chrysal africa ltd.,0.45,local
1,fugo,pz cussons,0.27,1,fugo,fugo fungo fuo fug,fugo fug fugo company fuo fugo feeds fugo ltd fungo fugo vits fugo unga fugo feed fugo vit,2.0,fugo,1,"[(fugo, 8), (fug, 1), (company, 1), (fuo, 1), (feeds, 1), (ltd, 1), (fungo, 1), (vits, 1), (unga, 1), (feed, 1), (vit, 1)]",2.0,fugo,fugo fug,nutreco africa,0.36,fugo
2,mukurwe-ini dairy,zagro-india,0.41,2,mukurwe-ini,mukurwe-ini,mukurwe-ini dairy,2.0,mukurwe-ini dairy,2,"[(mukurwe, 1), (ini, 1), (dairy, 1)]",2.0,mukurwe-ini dairy,mukurwe ini,iprocure limited,0.44,mukurwe-ini dairy
3,xxxx,apex,0.22,1,xxxx,xxxx xxx,xxxx xxx,1.0,xxxx,3,"[(xxxx, 1), (xxx, 1)]",1.0,xxxx,xxxx,apex,0.25,xxxx
4,g&g,hebei yuangzheng,0.2,1,g&g,g&g,g&g,1.0,g&g,4,"[(g&g, 1)]",1.0,g&g,g&g,hebei yuangzheng,0.21,g&g


In [42]:
# cleanup function
def compare(row):
    comparison = {}
    i = row['manufacturer_name']
    prods_list = row[['chosen_match', 'best_manufacturer_match']].tolist()
    if isinstance(i, str):
       comparison.update({i: get_close_matches(i, prods_list, n=1, cutoff=0.1)})
    product_name = list(comparison.keys()) if comparison else None
    match = []
    score = []
    if comparison:
       for key, value in comparison.items():
           if value:
              match.append(value[0])
              score.append(round(SequenceMatcher(None, i, value[0]).ratio(), 2))
           else:
              match.append(None)
              score.append(None)
    else:
       match.append(None)
       score.append(None)
              
    return pd.Series([match, score], index = ['match', 'score'])

In [43]:
def convert_to_string(value):
    return str(value)

columns_to_convert = ['manufacturer_name', 'chosen_match', 'best_manufacturer_match']
df_non_dup[columns_to_convert] = df_non_dup[columns_to_convert].applymap(convert_to_string)

In [45]:
for i in tqdm(range(10), desc="Processing", ncols=80, bar_format="{l_bar}{bar}| {n_fmt}/{total_fmt}"):
    df_non_dup[['final_match', 'score']] = df_non_dup.apply(lambda row: compare(row), axis=1)
    df_non_dup.head()

Processing: 100%|████████████████████████████████████████████████████████| 10/10


In [46]:
df_non_dup['final_match'] = df_non_dup['final_match'].apply(lambda x: ' '.join(x))
df_non_dup['score'] = df_non_dup['score'].apply(lambda x: x[0])

df_non_dup.head()

Unnamed: 0,manufacturer_name,best_manufacturer_match,manufacturer_match_score,word_count,manufacturer_slice,match,similar_strings,average_length,common_name,id,word_freq,average_length.1,common_name.1,cluster_name,best_match,best_score,chosen_match,final_match,score
0,local,oic ltd,0.15,1,local,"local localy locals locali local. local, locl loca loal lcal","local lcal loal locl local, localy local manufacture local supplier local nairobi local manufacturer local artisan local juakali loca locali local providers local seller localy made local. local industries local manufactur local any local carpenters locals local ltd.",2.0,local,0,"[(local, 15), (localy, 2), (lcal, 1), (loal, 1), (locl, 1), (, 1), (manufacture, 1), (supplier, 1), (nairobi, 1), (manufacturer, 1), (artisan, 1), (juakali, 1), (loca, 1), (locali, 1), (providers, 1), (seller, 1), (made, 1), (local., 1), (industries, 1), (manufactur, 1), (any, 1), (carpenters, 1), (locals, 1), (ltd., 1)]",2.0,local,local localy,chrysal africa ltd.,0.45,local,local,1.0
1,fugo,pz cussons,0.27,1,fugo,fugo fungo fuo fug,fugo fug fugo company fuo fugo feeds fugo ltd fungo fugo vits fugo unga fugo feed fugo vit,2.0,fugo,1,"[(fugo, 8), (fug, 1), (company, 1), (fuo, 1), (feeds, 1), (ltd, 1), (fungo, 1), (vits, 1), (unga, 1), (feed, 1), (vit, 1)]",2.0,fugo,fugo fug,nutreco africa,0.36,fugo,fugo,1.0
2,mukurwe-ini dairy,zagro-india,0.41,2,mukurwe-ini,mukurwe-ini,mukurwe-ini dairy,2.0,mukurwe-ini dairy,2,"[(mukurwe, 1), (ini, 1), (dairy, 1)]",2.0,mukurwe-ini dairy,mukurwe ini,iprocure limited,0.44,mukurwe-ini dairy,mukurwe-ini dairy,1.0
3,xxxx,apex,0.22,1,xxxx,xxxx xxx,xxxx xxx,1.0,xxxx,3,"[(xxxx, 1), (xxx, 1)]",1.0,xxxx,xxxx,apex,0.25,xxxx,xxxx,1.0
4,g&g,hebei yuangzheng,0.2,1,g&g,g&g,g&g,1.0,g&g,4,"[(g&g, 1)]",1.0,g&g,g&g,hebei yuangzheng,0.21,g&g,g&g,1.0


In [47]:
df_non_dup['go_to_match'] = np.where(df_non_dup['score'] >= 0.79, df_non_dup['final_match'], df_non_dup['chosen_match'])
df_non_dup.head()

Unnamed: 0,manufacturer_name,best_manufacturer_match,manufacturer_match_score,word_count,manufacturer_slice,match,similar_strings,average_length,common_name,id,word_freq,average_length.1,common_name.1,cluster_name,best_match,best_score,chosen_match,final_match,score,go_to_match
0,local,oic ltd,0.15,1,local,"local localy locals locali local. local, locl loca loal lcal","local lcal loal locl local, localy local manufacture local supplier local nairobi local manufacturer local artisan local juakali loca locali local providers local seller localy made local. local industries local manufactur local any local carpenters locals local ltd.",2.0,local,0,"[(local, 15), (localy, 2), (lcal, 1), (loal, 1), (locl, 1), (, 1), (manufacture, 1), (supplier, 1), (nairobi, 1), (manufacturer, 1), (artisan, 1), (juakali, 1), (loca, 1), (locali, 1), (providers, 1), (seller, 1), (made, 1), (local., 1), (industries, 1), (manufactur, 1), (any, 1), (carpenters, 1), (locals, 1), (ltd., 1)]",2.0,local,local localy,chrysal africa ltd.,0.45,local,local,1.0,local
1,fugo,pz cussons,0.27,1,fugo,fugo fungo fuo fug,fugo fug fugo company fuo fugo feeds fugo ltd fungo fugo vits fugo unga fugo feed fugo vit,2.0,fugo,1,"[(fugo, 8), (fug, 1), (company, 1), (fuo, 1), (feeds, 1), (ltd, 1), (fungo, 1), (vits, 1), (unga, 1), (feed, 1), (vit, 1)]",2.0,fugo,fugo fug,nutreco africa,0.36,fugo,fugo,1.0,fugo
2,mukurwe-ini dairy,zagro-india,0.41,2,mukurwe-ini,mukurwe-ini,mukurwe-ini dairy,2.0,mukurwe-ini dairy,2,"[(mukurwe, 1), (ini, 1), (dairy, 1)]",2.0,mukurwe-ini dairy,mukurwe ini,iprocure limited,0.44,mukurwe-ini dairy,mukurwe-ini dairy,1.0,mukurwe-ini dairy
3,xxxx,apex,0.22,1,xxxx,xxxx xxx,xxxx xxx,1.0,xxxx,3,"[(xxxx, 1), (xxx, 1)]",1.0,xxxx,xxxx,apex,0.25,xxxx,xxxx,1.0,xxxx
4,g&g,hebei yuangzheng,0.2,1,g&g,g&g,g&g,1.0,g&g,4,"[(g&g, 1)]",1.0,g&g,g&g,hebei yuangzheng,0.21,g&g,g&g,1.0,g&g


In [48]:
df_non_dup[df_non_dup['score'] < 0.8]

Unnamed: 0,manufacturer_name,best_manufacturer_match,manufacturer_match_score,word_count,manufacturer_slice,match,similar_strings,average_length,common_name,id,word_freq,average_length.1,common_name.1,cluster_name,best_match,best_score,chosen_match,final_match,score,go_to_match
142,baoding sunlight herb medicament co,shandong xiangrui pharmaceutical co. ltd,0.39,5,baoding sunlight,baoding sunlight boading sunlight boarding sunlight boading sunliht baoding sunlight herb,"baoding sunlight herb baoding sunlight herb medicament co boading sunlight herb boading sunliht herb medicament co.,ltd boading sunlight herb medicament co.ltd boarding sunlight herd baoding sunlight herb medicament co.ltd baoding sunlight herb medicament co ltd. baoding sunlight herb medicament co. ltd boading sunlight co ltd baoding sunlight herb medication co ltd boading sunlight herb medlcament co,ltd boarding sunlight herb baoding sunlight herb medicament ltd boading sunlight herb medicament ltd. baoding sunlight herb medcament co ltd boading sunlight herb mgt boading sunlight herb medicament baoding sunlight herb medicament",5.0,baoding sunlight herb,51,"[(sunlight, 18), (herb, 17), (medicament, 10), (baoding, 9), (boading, 8), (ltd, 7), (co, 6), (co., 2), (co.ltd, 2), (boarding, 2), (ltd., 2), (sunliht, 1), (herd, 1), (medication, 1), (medlcament, 1), (medcament, 1), (mgt, 1)]",5.0,baoding sunlight herb,sunlight herb medicament baoding boading,hebei yuangzheng & baoding jizhong,0.51,baoding sunlight herb,baoding sunlight herb,0.75,baoding sunlight herb
335,osho ferrox,fanisi fertilizer,0.41,2,osho,osho,osho osho ferrox osho industry osho ltd osho limited osho company osho chemical osho chemicals osho chemical.,2.0,osho,7,"[(osho, 9), (ferrox, 1), (industry, 1), (ltd, 1), (limited, 1), (company, 1), (chemical, 1), (chemicals, 1), (chemical., 1)]",2.0,osho,osho ferrox,fanisi fertilizer,0.43,osho,osho,0.53,osho
380,hollarnd,solagro kenya ltd.,0.30,1,hollarnd,holland hollarnd hollan holand polland hollond hollard,holland minerals hollarnd holland lndustry holand zaden holland ltd holland indus. holland supplement holland industry holland ltd hollan hollard hollond holland holand polland holland industries hollard industries,2.0,holland minerals,243,"[(holland, 9), (holand, 2), (ltd, 2), (hollard, 2), (industries, 2), (minerals, 1), (hollarnd, 1), (lndustry, 1), (zaden, 1), (indus., 1), (supplement, 1), (industry, 1), (hollan, 1), (hollond, 1), (polland, 1)]",2.0,holland minerals,holland holand,bidco land o'lakes,0.56,holland minerals,holland minerals,0.58,holland minerals
391,mavuno bora,miavit gmbh-germany,0.19,2,mavuno,mavuno,mavuno mavuno bora mavuno smart mavuno company mavuno feeds,2.0,mavuno,26,"[(mavuno, 5), (bora, 1), (smart, 1), (company, 1), (feeds, 1)]",2.0,mavuno,mavuno bora,miavit gmbh-germany,0.20,mavuno,mavuno,0.71,mavuno
533,flat 1.5×1.2mm,farmate ltd.,0.37,2,flat,flat,flat 3/4×1.2mm flat 1.5×1.2mm flat plastic flat iron,2.0,flat 3/4×1.2mm,400,"[(flat, 4), (3, 1), (4×1.2mm, 1), (1.5×1.2mm, 1), (plastic, 1), (iron, 1)]",2.0,flat 3/4×1.2mm,flat 3,farmate ltd.,0.44,flat 3/4×1.2mm,flat 3/4×1.2mm,0.79,flat 3/4×1.2mm
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18604,umoja ltd,mea ltd.,0.67,2,umoja,umoja,umoja umoja sons umoja ltd,2.0,umoja,4893,"[(umoja, 3), (sons, 1), (ltd, 1)]",2.0,umoja,umoja sons,pz cussons,0.50,umoja,umoja,0.71,umoja
18607,bellachem limited,glumar chemicals limited,0.67,2,bellachem,bell chem bellachem bell chemcal,bell chem africa ltd bellachem bellachem ltd. bellachem ltd bellachem limited,2.0,bell chem africa ltd,5481,"[(bellachem, 4), (ltd, 2), (bell, 1), (chem, 1), (africa, 1), (ltd., 1), (limited, 1)]",2.0,bell chem africa ltd,bellachem ltd,farmchem ltd.,0.69,bell chem africa ltd,glumar chemicals limited,0.68,bell chem africa ltd
18611,vital crop ltd,vetagro ltd.,0.67,3,vital crop,vital crop,vital crop science vital crop science inc vital crop ltd,3.0,vital crop science,4458,"[(vital, 3), (crop, 3), (science, 2), (inc, 1), (ltd, 1)]",3.0,vital crop science,vital crop science,stenna crop science ltd.,0.71,vital crop science,vetagro ltd.,0.69,vital crop science
18612,gloria ltd,oic ltd,0.67,2,gloria,lagloria gloria,lagloria gloria ltd,2.0,lagloria,1753,"[(lagloria, 1), (gloria, 1), (ltd, 1)]",2.0,lagloria,lagloria gloria,agripac africa ltd.,0.41,lagloria,oic ltd,0.71,lagloria


In [62]:
df_manufacturer

Unnamed: 0,manufacturer_name,best_manufacturer_match,manufacturer_match_score
5,local,oic ltd,0.15
16,fugo,pz cussons,0.27
22,local,oic ltd,0.15
31,mukurwe-ini dairy,zagro-india,0.41
33,xxxx,apex,0.22
...,...,...,...
25782,jojemi agri ventures ltd.,afri ventures,0.67
25783,triachem tanzania ltd,agrichem africa ltd.,0.67
25784,tags limted,chiapas limited,0.67
25785,regal pharmaceutical,hebei yuanzheng pharmaceutical,0.67


In [63]:
df_manufacturer = pd.merge(df_manufacturer, df_non_dup[['manufacturer_name', 'go_to_match']], how='left',on='manufacturer_name')
df_manufacturer.head()

Unnamed: 0,manufacturer_name,best_manufacturer_match,manufacturer_match_score,go_to_match
0,local,oic ltd,0.15,local localy
1,fugo,pz cussons,0.27,fugo fug
2,local,oic ltd,0.15,local localy
3,mukurwe-ini dairy,zagro-india,0.41,mukurwe ini
4,xxxx,apex,0.22,xxxx


In [64]:
df_manufacturer[:50]

Unnamed: 0,manufacturer_name,best_manufacturer_match,manufacturer_match_score,go_to_match
0,local,oic ltd,0.15,local localy
1,fugo,pz cussons,0.27,fugo fug
2,local,oic ltd,0.15,local localy
3,mukurwe-ini dairy,zagro-india,0.41,mukurwe ini
4,xxxx,apex,0.22,xxxx
5,g&g,hebei yuangzheng,0.2,g&g
6,rainbow,rainbow agro sciences ltd.,0.41,rainbow agro
7,ttt,aesthetics ltd.,0.32,ttt
8,osho,basco,0.2,osho ferrox
9,chloride exide,chemline africa ltd.,0.34,chloride exide


In [66]:
df_man = df_man.applymap(lambda x: x.strip().lower() if isinstance(x, str) else x)
df_man

Unnamed: 0,manufacturer_name,best_manufacturer_match,manufacturer_match_score
0,highchem essentials ltd.,highchem essentials ltd.,0.98
1,bimeda ltd.,bimeda ltd.,0.96
2,simlaw seeds company ltd.,simlaw seeds company ltd.,0.98
3,ultravetis east africa ltd.,ultravetis east africa ltd.,0.98
4,cooper k-brands ltd.,cooper k-brands ltd.,0.98
...,...,...,...
25782,jojemi agri ventures ltd.,afri ventures,0.67
25783,triachem tanzania ltd,agrichem africa ltd.,0.67
25784,tags limted,chiapas limited,0.67
25785,regal pharmaceutical,hebei yuanzheng pharmaceutical,0.67


In [67]:
dnd_df_man = df_man.copy()

In [68]:
df_man = pd.merge(df_man, df_manufacturer[['manufacturer_name', 'go_to_match']], how='left',on='manufacturer_name')
df_man

Unnamed: 0,manufacturer_name,best_manufacturer_match,manufacturer_match_score,go_to_match
0,highchem essentials ltd.,highchem essentials ltd.,0.98,
1,bimeda ltd.,bimeda ltd.,0.96,
2,simlaw seeds company ltd.,simlaw seeds company ltd.,0.98,
3,ultravetis east africa ltd.,ultravetis east africa ltd.,0.98,
4,cooper k-brands ltd.,cooper k-brands ltd.,0.98,
...,...,...,...,...
40092,tags limted,chiapas limited,0.67,tags limted
40093,regal pharmaceutical,hebei yuanzheng pharmaceutical,0.67,regal real
40094,regal pharmaceutical,hebei yuanzheng pharmaceutical,0.67,regal real
40095,regal pharmaceutical,hebei yuanzheng pharmaceutical,0.67,regal real


In [69]:
df_man['go_to_match'] = np.where(df_man['go_to_match'].isna(), df_man['best_manufacturer_match'], df_man['go_to_match'])
df_man

Unnamed: 0,manufacturer_name,best_manufacturer_match,manufacturer_match_score,go_to_match
0,highchem essentials ltd.,highchem essentials ltd.,0.98,highchem essentials ltd.
1,bimeda ltd.,bimeda ltd.,0.96,bimeda ltd.
2,simlaw seeds company ltd.,simlaw seeds company ltd.,0.98,simlaw seeds company ltd.
3,ultravetis east africa ltd.,ultravetis east africa ltd.,0.98,ultravetis east africa ltd.
4,cooper k-brands ltd.,cooper k-brands ltd.,0.98,cooper k-brands ltd.
...,...,...,...,...
40092,tags limted,chiapas limited,0.67,tags limted
40093,regal pharmaceutical,hebei yuanzheng pharmaceutical,0.67,regal real
40094,regal pharmaceutical,hebei yuanzheng pharmaceutical,0.67,regal real
40095,regal pharmaceutical,hebei yuanzheng pharmaceutical,0.67,regal real


In [71]:
df_man = df_man[['manufacturer_name', 'go_to_match']]
df_man = df_man.drop_duplicates(subset=['manufacturer_name'], keep='first')
len(df_man)

20051

In [72]:
df_man.to_csv('final_clustered_manufacturers.csv', index = False)

In [73]:
df_man.isna().sum()

manufacturer_name    1
go_to_match          1
dtype: int64