In [1]:
import pandas as pd
import numpy as np
import re
from difflib import SequenceMatcher, get_close_matches
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans

import warnings
warnings.filterwarnings("ignore")

In [3]:
# loading the product list
iprocure_prods_df = pd.read_excel('/home/natasha/Documents/Iprocure/Clustering-for-Product-Matching/data/data_v2/product_list.xlsx')
cols = ['Product Name', 'Type']
iprocure_prods_df = iprocure_prods_df[cols]

# dropping product_name duplicates
iprocure_prods_df = iprocure_prods_df.drop_duplicates(keep='first').reset_index(drop=True)

pd.set_option('display.max_columns',None)
iprocure_prods_df.head()

Unnamed: 0,Product Name,Type
0,Aviboost Aqua Block,1kg
1,Aviboost CL-X Blue,1lt
2,Aviboost CL-X Blue,100ml
3,Aviboost Nutri Block,1kg
4,Aviboost Spectrum,1lt


In [4]:
iprocure_prods_df.isna().sum()

Product Name    0
Type            0
dtype: int64

In [5]:
product_type_df = pd.read_csv('/home/natasha/Documents/Iprocure/Clustering-for-Product-Matching/data/data_v2/dirty_product_types.csv')
product_type_df.head()

Unnamed: 0,correct_product_match,product_type
0,pantoxy 20℅ 10%,250mls
1,pharmguard,5g
2,tabs cream,copies
3,uwezo dairy meal,copies
4,cargo box,3 way


In [6]:
# to lowercase
iprocure_prods_df[['Product Name', 'Type']] = iprocure_prods_df[['Product Name', 'Type']].applymap(lambda x: str(x).lower().strip())
product_type_df[['correct_product_match', 'product_type']] = product_type_df[['correct_product_match', 'product_type']].applymap(lambda x: str(x).lower().strip())

In [9]:
iprocure_prods_df.rename(columns={'Product Name': 'product_name'}, inplace=True)
product_type_df.rename(columns={'correct_product_match': 'product_name'}, inplace=True)
# product_type_df = product_type_df.merge(iprocure_prods_df, on='product_name', how='left')

In [10]:
product_type_df.shape

(3115105, 2)

In [11]:
product_type_df = product_type_df.drop_duplicates(subset=['product_name', 'product_type'], keep='first').reset_index(drop=True)
product_type_df.head()

Unnamed: 0,product_name,product_type
0,pantoxy 20℅ 10%,250mls
1,pharmguard,5g
2,tabs cream,copies
3,uwezo dairy meal,copies
4,cargo box,3 way


In [12]:
product_type_df.shape

(99983, 2)

In [13]:
product_type_df.isna().sum()

product_name    0
product_type    0
dtype: int64

In [14]:
# Function to extract the number and unit parts
def extract_parts(value):
    # pattern = r'(\d+(?:\.\d+)?|½|¼|¾|[1-9][0-9]*(?:\/[1-9][0-9]*)?)(\D+)'
    pattern = r'(\d+(?:\.\d+)?|½|¼|¾|\d+\/\d+)(\s*[a-zA-Z]+)'
    matches = re.match(pattern, value)

    if matches:
        digits_part = matches.group(1)
        letters_part = matches.group(2)
        return digits_part, letters_part.strip()
    else:
        return value, value

# Apply the function to the column and create new columns
product_type_df[['number', 'unit']] = product_type_df['product_type'].apply(lambda x: pd.Series(extract_parts(str(x))))
iprocure_prods_df[['number', 'unit']] = iprocure_prods_df['Type'].apply(lambda x: pd.Series(extract_parts(str(x))))

# Print the updated DataFrame
display(product_type_df.head())
display(iprocure_prods_df.head())

Unnamed: 0,product_name,product_type,number,unit
0,pantoxy 20℅ 10%,250mls,250,mls
1,pharmguard,5g,5,g
2,tabs cream,copies,copies,copies
3,uwezo dairy meal,copies,copies,copies
4,cargo box,3 way,3,way


Unnamed: 0,product_name,Type,number,unit
0,aviboost aqua block,1kg,1,kg
1,aviboost cl-x blue,1lt,1,lt
2,aviboost cl-x blue,100ml,100,ml
3,aviboost nutri block,1kg,1,kg
4,aviboost spectrum,1lt,1,lt


In [12]:
# cleaning category column
units = iprocure_prods_df['unit'].unique().tolist()

# wrong_units_df = product_type_df[~product_type_df['product_type'].isin(types)]
units_df = product_type_df['unit'].drop_duplicates(keep='first').reset_index(drop=True).to_frame()

# cleanup function
def compare(i):
    comparison = {}
    if isinstance(i, str):
       comparison.update({i: get_close_matches(i, units, n=1, cutoff=0.1)})
    unit = list(comparison.keys()) if comparison else None
    match = []
    score = []
    if comparison:
       for key, value in comparison.items():
           if value:
              match.append(value[0])
              score.append(round(SequenceMatcher(None, i, value[0]).ratio(), 2))
           else:
              match.append(None)
              score.append(None)
    else:
       match.append(None)
       score.append(None)

    return pd.Series([unit, match, score], index = ['type', 'match', 'score'])

cleaned_types_df = pd.DataFrame()
cleaned_types_df[['unit', 'match', 'score']] = units_df['unit'].apply(lambda x: compare(x))
cleaned_types_df = cleaned_types_df.applymap(lambda x: x[0] if x else '')
cleaned_types_df.head()

Unnamed: 0,unit,match,score
0,mls,ml,0.8
1,g,g,1.0
2,copies,pess,0.6
3,way,pair,0.29
4,ml,ml,1.0


In [31]:
# cleaned_types_df[(cleaned_types_df['score'] >= 0.67) & (cleaned_types_df['score'] < 0.7)][:50]

In [32]:
units_df.shape

(5902, 1)

In [33]:
to_cluster_df = cleaned_types_df[cleaned_types_df['score'] < 0.65]
to_cluster_df.head()

Unnamed: 0,unit,match,score
2,copies,pess,0.6
3,way,pair,0.29
7,assorted,seeds,0.46
9,tabs,tablet,0.6
10,assorted (3gm),seeds,0.32


In [34]:
matched = []
def compare(i):
    compare = {}
    if i in matched:
        compare.update({i: ''})
    else:
        compare.update({i: get_close_matches(i, to_cluster_df['unit'].to_list(), 20, 0.7)})
    # matched.extend(compare.values())
    matched.extend([item for sublist in compare.values() for item in sublist])
    manufacturer_slice = list(compare.keys())
    match = []
    for key, items in compare.items():
      match.append(items)
    return pd.Series([manufacturer_slice, match],index=['manufacturer_slice', 'match'])


cleaned__df = pd.DataFrame()
cleaned__df[['unit', 'match']] = to_cluster_df['unit'].apply(lambda x: compare(x))
cleaned__df = cleaned__df.applymap(lambda x: x[0] if x else '')
cleaned__df.head()

Unnamed: 0,unit,match
2,copies,[copies]
3,way,"[way, ways, waya, 4-way]"
7,assorted,"[assorted, assoerted, asorted, asdorted, arsor..."
9,tabs,"[tabs, tabs., tas, tsbs, tqbs, tbts, tans, tab..."
10,assorted (3gm),


In [35]:
pd.set_option('display.max_colwidth', None)
cleaned__df.reset_index(drop=True, inplace=True)
cleaned__df

Unnamed: 0,unit,match
0,copies,[copies]
1,way,"[way, ways, waya, 4-way]"
2,assorted,"[assorted, assoerted, asorted, asdorted, arsorted, asoterd, asstd, assrt, assorted 6pcs, seeds assorted, assorted (3gm), asotwd]"
3,tabs,"[tabs, tabs., tas, tsbs, tqbs, tbts, tans, tabd, taba, tabs bp, tabs 2', tab 24s]"
4,assorted (3gm),
...,...,...
4995,green 20litres,
4996,helper main fsr,"[helper main fsr, helper main frr, helper 2nd frr]"
4997,2654403,[2654403]
4998,3/4* 60fts,


In [36]:
cleaned__df['unit'] = cleaned__df['unit'].astype('str')
for i, row in cleaned__df.iterrows():
    string = row['unit']
    lst = row['match']

    # Check if the string exists in any list of previous rows
    if not lst:
        for prev_i in range(i):
            prev_lst = cleaned__df.at[prev_i, 'match']
            if string in prev_lst:
                cleaned__df.at[i, 'match'] = prev_lst
                break  # Stop searching after finding the first match

# Reset the index of the DataFrame
# cleaned_manufacturers_df_subset = cleaned_manufacturers_df_subset.reset_index(drop=True)

pd.set_option('display.max_colwidth', None)
cleaned__df

Unnamed: 0,unit,match
0,copies,[copies]
1,way,"[way, ways, waya, 4-way]"
2,assorted,"[assorted, assoerted, asorted, asdorted, arsorted, asoterd, asstd, assrt, assorted 6pcs, seeds assorted, assorted (3gm), asotwd]"
3,tabs,"[tabs, tabs., tas, tsbs, tqbs, tbts, tans, tabd, taba, tabs bp, tabs 2', tab 24s]"
4,assorted (3gm),"[assorted, assoerted, asorted, asdorted, arsorted, asoterd, asstd, assrt, assorted 6pcs, seeds assorted, assorted (3gm), asotwd]"
...,...,...
4995,green 20litres,"[green 10litres, green 20litres, ripple green 1litre]"
4996,helper main fsr,"[helper main fsr, helper main frr, helper 2nd frr]"
4997,2654403,[2654403]
4998,3/4* 60fts,"[3/4*60, 3/4""*60, 3/4*20, 3/4""*60ft, 3/4* 60fts, 3/4-60ft, 3/4 60ft]"


In [37]:
cleaned__df['match_concat'] = cleaned__df['match'].apply(lambda x:' '.join(x))
# cleaned__df['match_split'] = cleaned__df['match_concat'].str.split()
cleaned__df['match_split'] = cleaned__df['match_concat'].apply(lambda x: re.split(r'\s+|-|\(|\)|/|\\|\||,', x))

cleaned__df

Unnamed: 0,unit,match,match_concat,match_split
0,copies,[copies],copies,[copies]
1,way,"[way, ways, waya, 4-way]",way ways waya 4-way,"[way, ways, waya, 4, way]"
2,assorted,"[assorted, assoerted, asorted, asdorted, arsorted, asoterd, asstd, assrt, assorted 6pcs, seeds assorted, assorted (3gm), asotwd]",assorted assoerted asorted asdorted arsorted asoterd asstd assrt assorted 6pcs seeds assorted assorted (3gm) asotwd,"[assorted, assoerted, asorted, asdorted, arsorted, asoterd, asstd, assrt, assorted, 6pcs, seeds, assorted, assorted, , 3gm, , asotwd]"
3,tabs,"[tabs, tabs., tas, tsbs, tqbs, tbts, tans, tabd, taba, tabs bp, tabs 2', tab 24s]",tabs tabs. tas tsbs tqbs tbts tans tabd taba tabs bp tabs 2' tab 24s,"[tabs, tabs., tas, tsbs, tqbs, tbts, tans, tabd, taba, tabs, bp, tabs, 2', tab, 24s]"
4,assorted (3gm),"[assorted, assoerted, asorted, asdorted, arsorted, asoterd, asstd, assrt, assorted 6pcs, seeds assorted, assorted (3gm), asotwd]",assorted assoerted asorted asdorted arsorted asoterd asstd assrt assorted 6pcs seeds assorted assorted (3gm) asotwd,"[assorted, assoerted, asorted, asdorted, arsorted, asoterd, asstd, assrt, assorted, 6pcs, seeds, assorted, assorted, , 3gm, , asotwd]"
...,...,...,...,...
4995,green 20litres,"[green 10litres, green 20litres, ripple green 1litre]",green 10litres green 20litres ripple green 1litre,"[green, 10litres, green, 20litres, ripple, green, 1litre]"
4996,helper main fsr,"[helper main fsr, helper main frr, helper 2nd frr]",helper main fsr helper main frr helper 2nd frr,"[helper, main, fsr, helper, main, frr, helper, 2nd, frr]"
4997,2654403,[2654403],2654403,[2654403]
4998,3/4* 60fts,"[3/4*60, 3/4""*60, 3/4*20, 3/4""*60ft, 3/4* 60fts, 3/4-60ft, 3/4 60ft]","3/4*60 3/4""*60 3/4*20 3/4""*60ft 3/4* 60fts 3/4-60ft 3/4 60ft","[3, 4*60, 3, 4""*60, 3, 4*20, 3, 4""*60ft, 3, 4*, 60fts, 3, 4, 60ft, 3, 4, 60ft]"


In [38]:
# extracting most common words from each cluster in order
cluster_word_freq = {}


for id, row in cleaned__df.iterrows():
    cluster = row['match_concat']

    words = re.split(r'\s+|-|\(|\)|/|\\|\||,', cluster)
    words = [word for word in words if word.strip()]

    for word in words:
        if id in cluster_word_freq:
            cluster_word_freq[id][word] = cluster_word_freq[id].get(word, 0) + 1
        else:
            cluster_word_freq[id] = {word: 1}

for id in cluster_word_freq:
    cluster_word_freq[id] = sorted(cluster_word_freq[id].items(), key=lambda x: x[1], reverse=True)
    # cluster_word_freq[id] = list(cluster_word_freq[id].items())
    # cluster_word_freq[id] = cluster_word_freq[id].items()

cluster_word_freq_df = pd.DataFrame.from_dict(cluster_word_freq.items())
cluster_word_freq_df.rename(columns={0: 'id', 1: 'word_freq'}, inplace=True)
cluster_word_freq_df.head()

Unnamed: 0,id,word_freq
0,0,"[(copies, 1)]"
1,1,"[(way, 2), (ways, 1), (waya, 1), (4, 1)]"
2,2,"[(assorted, 4), (assoerted, 1), (asorted, 1), (asdorted, 1), (arsorted, 1), (asoterd, 1), (asstd, 1), (assrt, 1), (6pcs, 1), (seeds, 1), (3gm, 1), (asotwd, 1)]"
3,3,"[(tabs, 3), (tabs., 1), (tas, 1), (tsbs, 1), (tqbs, 1), (tbts, 1), (tans, 1), (tabd, 1), (taba, 1), (bp, 1), (2', 1), (tab, 1), (24s, 1)]"
4,4,"[(assorted, 4), (assoerted, 1), (asorted, 1), (asdorted, 1), (arsorted, 1), (asoterd, 1), (asstd, 1), (assrt, 1), (6pcs, 1), (seeds, 1), (3gm, 1), (asotwd, 1)]"


In [39]:
cluster_word_freq_df['cluster_name'] = cluster_word_freq_df['word_freq'].apply(lambda x: ''.join(word[0] for word in x[:1]))
cluster_word_freq_df[60:70]

Unnamed: 0,id,word_freq,cluster_name
60,61,"[(tabs, 8), (5mg, 2), (375mg, 1), (75mg, 1), (tabs500mg, 1), (20mg, 1), (50mg, 1), (15mg, 1), (0.135mg, 1), (tab, 1), (360mg, 1)]",tabs
61,62,"[(6's, 1), (6', 1)]",6's
62,63,"[(cap, 1), (caps, 1), (capd, 1), (cp, 1), (ca, 1), (cheap, 1)]",cap
63,64,"[(foliar, 5), (1ltr, 1), (5ltrs, 1), (3ltrs, 1), (2ltrs, 1)]",foliar
64,65,"[(continental, 1), (container, 1)]",continental
65,66,"[(tabs, 3), (tabs., 1), (tas, 1), (tsbs, 1), (tqbs, 1), (tbts, 1), (tans, 1), (tabd, 1), (taba, 1), (bp, 1), (2', 1), (tab, 1), (24s, 1)]",tabs
66,67,"[(fertilizer, 5), (i, 1), (kg, 1), (1kg, 1), (frertilizer, 1), (fertilizers, 1), (fertiliser, 1), (fartilizer, 1), (500ml, 1), (100ml, 1)]",fertilizer
67,68,"[(dairy, 18), (meal, 17), (20kgs, 3), (50kgs, 3), (10kgs, 3), (tuvune, 3), (70kg, 2), (50kg, 2), (10kg, 2), (fugo, 2), (empire, 2), (dairymeal, 1), (25kgs, 1), (max, 1), (1kg, 1), (20kg, 1)]",dairy
68,69,"[(cc, 1)]",cc
69,70,"[(special, 2), (pcs, 1), (spiral, 1), (pencil, 1)]",special


In [40]:
cluster_names = cluster_word_freq_df['cluster_name'].to_list()

def find_cluster_name(string):
    for i in cluster_names:
        if i in string:
            return i

cleaned__df['cluster_name'] = cleaned__df['match_split'].apply(find_cluster_name)
cleaned__df[60:70]

Unnamed: 0,unit,match,match_concat,match_split,cluster_name
60,"4*6""","[4*6, 4*6"", 4*26, 46]","4*6 4*6"" 4*26 46","[4*6, 4*6"", 4*26, 46]",4*6
61,tabs-375mg,"[tabs-375mg, tabs 75mg, tabs 5mg, tabs500mg, tabs-20mg, tabs 50mg, tabs 15mg, tabs 0.135mg, tab 5mg, tabs 360mg]",tabs-375mg tabs 75mg tabs 5mg tabs500mg tabs-20mg tabs 50mg tabs 15mg tabs 0.135mg tab 5mg tabs 360mg,"[tabs, 375mg, tabs, 75mg, tabs, 5mg, tabs500mg, tabs, 20mg, tabs, 50mg, tabs, 15mg, tabs, 0.135mg, tab, 5mg, tabs, 360mg]",tabs
62,6's,"[6's, 6']",6's 6',"[6's, 6']",6's
63,caps,"[cap, caps, capd, cp, ca, cheap]",cap caps capd cp ca cheap,"[cap, caps, capd, cp, ca, cheap]",cap
64,foliar 1ltr,"[foliar 1ltr, foliar 5ltrs, foliar 3ltrs, foliar 2ltrs, foliar]",foliar 1ltr foliar 5ltrs foliar 3ltrs foliar 2ltrs foliar,"[foliar, 1ltr, foliar, 5ltrs, foliar, 3ltrs, foliar, 2ltrs, foliar]",foliar
65,continental,"[continental, container]",continental container,"[continental, container]",continental
66,tabd,"[tabs, tabs., tas, tsbs, tqbs, tbts, tans, tabd, taba, tabs bp, tabs 2', tab 24s]",tabs tabs. tas tsbs tqbs tbts tans tabd taba tabs bp tabs 2' tab 24s,"[tabs, tabs., tas, tsbs, tqbs, tbts, tans, tabd, taba, tabs, bp, tabs, 2', tab, 24s]",tabs
67,fertilizer i kg,"[fertilizer i kg, fertilizer 1kg, fertilizer, frertilizer, fertilizers, fertiliser, fartilizer, fertilizer 500ml, fertilizer 100ml]",fertilizer i kg fertilizer 1kg fertilizer frertilizer fertilizers fertiliser fartilizer fertilizer 500ml fertilizer 100ml,"[fertilizer, i, kg, fertilizer, 1kg, fertilizer, frertilizer, fertilizers, fertiliser, fartilizer, fertilizer, 500ml, fertilizer, 100ml]",fertilizer
68,dairy meal (70kg),"[dairy meal (70kg), dairy meal (50kg), dairy meal (10kg), dairy meal (20kgs), dairy meal 10kg, dairy meal 50kgs, dairy meal 10kgs, dairymeal 50kg, dairy meal 25kgs, max dairy meal 20kgs, dairy meal, dairy 70kg, fugo dairy meal 10kgs, dairy meal fugo 1kg, empire dairy meal 20kg, tuvune dairy meal 50kgs, tuvune dairy meal 20kgs, tuvune dairy meal 10kgs, empire dairy meal 50kgs]",dairy meal (70kg) dairy meal (50kg) dairy meal (10kg) dairy meal (20kgs) dairy meal 10kg dairy meal 50kgs dairy meal 10kgs dairymeal 50kg dairy meal 25kgs max dairy meal 20kgs dairy meal dairy 70kg fugo dairy meal 10kgs dairy meal fugo 1kg empire dairy meal 20kg tuvune dairy meal 50kgs tuvune dairy meal 20kgs tuvune dairy meal 10kgs empire dairy meal 50kgs,"[dairy, meal, , 70kg, , dairy, meal, , 50kg, , dairy, meal, , 10kg, , dairy, meal, , 20kgs, , dairy, meal, 10kg, dairy, meal, 50kgs, dairy, meal, 10kgs, dairymeal, 50kg, dairy, meal, 25kgs, max, dairy, meal, 20kgs, dairy, meal, dairy, 70kg, fugo, dairy, meal, 10kgs, dairy, meal, fugo, 1kg, empire, dairy, meal, 20kg, tuvune, dairy, meal, 50kgs, tuvune, dairy, meal, 20kgs, tuvune, dairy, meal, 10kgs, empire, dairy, meal, 50kgs]",fugo
69,cc,[cc],cc,[cc],cc


In [41]:
cleaned__df[cleaned__df['cluster_name'].isna()]

Unnamed: 0,unit,match,match_concat,match_split,cluster_name
40,-,[-],-,"[, ]",
4433,--,"[--, ---]",-- ---,"[, , , , , , ]",
4576,---,"[--, ---]",-- ---,"[, , , , , , ]",


In [49]:
clean_ones = cleaned_types_df[cleaned_types_df['score'] >= 0.65]
clean_ones

Unnamed: 0,unit,match,score
0,mls,ml,0.80
1,g,g,1.00
4,ml,ml,1.00
5,gms,gm,0.80
6,50,50,1.00
...,...,...,...
5869,bolusf,bolus,0.91
5887,tablet 25mg,tablet,0.71
5888,datacable,tablet,0.67
5891,xps,pcs,0.67


In [45]:
# cleaned_types_df[(cleaned_types_df['score'] >= 0.6) & (cleaned_types_df['score'] < 0.65)][:50]

In [50]:
new_product_type_df = product_type_df.merge(clean_ones[['unit', 'match']], how='left', on='unit')
new_product_type_df

Unnamed: 0,product_name,product_type,number,unit,match
0,pantoxy 20℅ 10%,250mls,250,mls,ml
1,pharmguard,5g,5,g,g
2,tabs cream,copies,copies,copies,
3,uwezo dairy meal,copies,copies,copies,
4,cargo box,3 way,3,way,
...,...,...,...,...,...
99990,wuxal basis,500mls,500,mls,ml
99991,tzex 500,tablet,tablet,tablet,tablet
99992,becomplex,tablet,tablet,tablet,tablet
99993,bran,tablet,tablet,tablet,tablet


In [51]:
new_product_type_df = new_product_type_df.merge(cleaned__df[['unit', 'cluster_name']], on='unit', how='left')
new_product_type_df

Unnamed: 0,product_name,product_type,number,unit,match,cluster_name
0,pantoxy 20℅ 10%,250mls,250,mls,ml,
1,pharmguard,5g,5,g,g,
2,tabs cream,copies,copies,copies,,copies
3,uwezo dairy meal,copies,copies,copies,,copies
4,cargo box,3 way,3,way,,way
...,...,...,...,...,...,...
99990,wuxal basis,500mls,500,mls,ml,
99991,tzex 500,tablet,tablet,tablet,tablet,
99992,becomplex,tablet,tablet,tablet,tablet,
99993,bran,tablet,tablet,tablet,tablet,


In [53]:
new_product_type_df[:50]

Unnamed: 0,product_name,product_type,number,unit,match,cluster_name
0,pantoxy 20℅ 10%,250mls,250,mls,ml,
1,pharmguard,5g,5,g,g,
2,tabs cream,copies,copies,copies,,copies
3,uwezo dairy meal,copies,copies,copies,,copies
4,cargo box,3 way,3,way,,way
5,force one plus,135ml,135,ml,ml,
6,weed master 500 sc,250gms,250,gms,gm,
7,fugo layers compleat meal,50,50,50,50,
8,starter j,50,50,50,50,
9,chick & duck mash,50,50,50,50,


In [54]:
new_product_type_df['match'] = np.where(new_product_type_df['match'].isna(), new_product_type_df['cluster_name'], new_product_type_df['match'])
new_product_type_df[:50]

Unnamed: 0,product_name,product_type,number,unit,match,cluster_name
0,pantoxy 20℅ 10%,250mls,250,mls,ml,
1,pharmguard,5g,5,g,g,
2,tabs cream,copies,copies,copies,copies,copies
3,uwezo dairy meal,copies,copies,copies,copies,copies
4,cargo box,3 way,3,way,way,way
5,force one plus,135ml,135,ml,ml,
6,weed master 500 sc,250gms,250,gms,gm,
7,fugo layers compleat meal,50,50,50,50,
8,starter j,50,50,50,50,
9,chick & duck mash,50,50,50,50,


In [55]:
new_product_type_df.isna().sum()

product_name        0
product_type        0
number              0
unit                0
match             615
cluster_name    84499
dtype: int64

In [58]:
new_product_type_df['match'] = np.where(new_product_type_df['match'].isna(), new_product_type_df['unit'], new_product_type_df['match'])
new_product_type_df[:50]

Unnamed: 0,product_name,product_type,number,unit,match,cluster_name
0,pantoxy 20℅ 10%,250mls,250,mls,ml,
1,pharmguard,5g,5,g,g,
2,tabs cream,copies,copies,copies,copies,copies
3,uwezo dairy meal,copies,copies,copies,copies,copies
4,cargo box,3 way,3,way,way,way
5,force one plus,135ml,135,ml,ml,
6,weed master 500 sc,250gms,250,gms,gm,
7,fugo layers compleat meal,50,50,50,50,
8,starter j,50,50,50,50,
9,chick & duck mash,50,50,50,50,


In [59]:
new_product_type_df.isna().sum()

product_name        0
product_type        0
number              0
unit                0
match               0
cluster_name    84499
dtype: int64

In [60]:
def add_correct_product_type_column(df):
    # df['correct_product_type'] = df.apply(lambda row: row['number'] + row['match'] if row['number'] != row['match'] and row['number'].isalpha() == False  else row['match'], axis=1)
    # df['correct_product_type'] = df.apply(lambda row: row['number'] + row['match'] if row['number'].isdigit() != row['match'].isdigit() else row['match'], axis=1)
    def is_numeric(value):
        pattern = r'^\d+(\.\d+)?|½|¼|¾|\d+\/\d+$'
        return bool(re.match(pattern, value))

    df['correct_product_type'] = df.apply(lambda row: row['number'] + row['match'] if row['number'] != row['match'] and is_numeric(row['number']) and row['match'].isalpha() else row['match'], axis=1)

    # df['correct_product_type'] = df.apply(lambda row: row['number'] + row['match'] if row['number'] != row['match'] else row['match'], axis=1)

new_product_type_df_copy = new_product_type_df.copy()

new_product_type_df_copy['number'] = new_product_type_df_copy['number'].astype('str')
new_product_type_df_copy['match'] = new_product_type_df_copy['match'].astype('str')

# new_product_type_df['correct_product_type'] = new_product_type_df['number'] + new_product_type_df['match']
# new_product_type_df[:50]


add_correct_product_type_column(new_product_type_df_copy)

In [61]:
new_product_type_df_copy[:50]

Unnamed: 0,product_name,product_type,number,unit,match,cluster_name,correct_product_type
0,pantoxy 20℅ 10%,250mls,250,mls,ml,,250ml
1,pharmguard,5g,5,g,g,,5g
2,tabs cream,copies,copies,copies,copies,copies,copies
3,uwezo dairy meal,copies,copies,copies,copies,copies,copies
4,cargo box,3 way,3,way,way,way,3way
5,force one plus,135ml,135,ml,ml,,135ml
6,weed master 500 sc,250gms,250,gms,gm,,250gm
7,fugo layers compleat meal,50,50,50,50,,50
8,starter j,50,50,50,50,,50
9,chick & duck mash,50,50,50,50,,50


In [63]:
new_product_type_df_copy = new_product_type_df_copy[['product_name', 'product_type', 'correct_product_type']]
new_product_type_df_copy

Unnamed: 0,product_name,product_type,correct_product_type
0,pantoxy 20℅ 10%,250mls,250ml
1,pharmguard,5g,5g
2,tabs cream,copies,copies
3,uwezo dairy meal,copies,copies
4,cargo box,3 way,3way
...,...,...,...
99990,wuxal basis,500mls,500ml
99991,tzex 500,tablet,tablet
99992,becomplex,tablet,tablet
99993,bran,tablet,tablet


In [64]:
new_product_type_df_copy = new_product_type_df_copy.drop_duplicates(subset=['product_type', 'correct_product_type'], keep='first')
new_product_type_df_copy.shape

(10428, 3)

In [65]:
new_product_type_df_copy.to_csv('clean_product_types_new.csv', index = False)

In [79]:
data = pd.read_csv('../../clean_prod_datasets/cleaned_products.csv')
iprocure_product_df = pd.read_excel('../../data/data_v2/product_list.xlsx')

In [80]:
product_list_df = iprocure_product_df[['Product Name', 'Type']].\
                                applymap(lambda x: str(x).lower().strip()).\
                                drop_duplicates(subset=['Product Name', 'Type'], keep='first').\
                                reset_index(drop=True)
                                
df = data[['correct_product_match',	'product_type']].\
                                applymap(lambda x: str(x).lower().strip()).\
                                drop_duplicates(subset=['correct_product_match', 'product_type'], keep='first').\
                                reset_index(drop=True)

In [81]:
df[df['correct_product_match'] == 'gasket clutch block']

Unnamed: 0,correct_product_match,product_type
2822,gasket clutch block,loose
29052,gasket clutch block,cg 150(complete)
29059,gasket clutch block,cg150 incomplete
29627,gasket clutch block,bm 150
29651,gasket clutch block,bj 100
30962,gasket clutch block,unit
37444,gasket clutch block,piece
40874,gasket clutch block,1unit
41729,gasket clutch block,1pc
42066,gasket clutch block,pcs


In [82]:
def extract_parts(value):
    pattern = r'(\d+(?:\.\d+)?|½|¼|¾|\d+\/\d+)(\s*[a-zA-Z]+)'
    matches = re.match(pattern, value)

    if matches:
        digits_part = matches.group(1)
        letters_part = matches.group(2)
        return digits_part, letters_part.strip()
    else:
        return value, value

In [83]:
# applying the function to the column and create new columns
product_list_df[['number', 'unit']] = product_list_df['Type'].apply(lambda x: pd.Series(extract_parts(str(x))))
df[['number', 'unit']] = df['product_type'].apply(lambda x: pd.Series(extract_parts(str(x))))

units = product_list_df['unit'].unique().tolist()
units_df = df['unit'].drop_duplicates(keep='first').reset_index(drop=True).to_frame()

In [84]:
df[df['correct_product_match'] == 'gasket clutch block']

Unnamed: 0,correct_product_match,product_type,number,unit
2822,gasket clutch block,loose,loose,loose
29052,gasket clutch block,cg 150(complete),cg 150(complete),cg 150(complete)
29059,gasket clutch block,cg150 incomplete,cg150 incomplete,cg150 incomplete
29627,gasket clutch block,bm 150,bm 150,bm 150
29651,gasket clutch block,bj 100,bj 100,bj 100
30962,gasket clutch block,unit,unit,unit
37444,gasket clutch block,piece,piece,piece
40874,gasket clutch block,1unit,1,unit
41729,gasket clutch block,1pc,1,pc
42066,gasket clutch block,pcs,pcs,pcs


In [85]:
# cleanup function to clean units
def compare(i):
    comparison = {}
    if isinstance(i, str):
        comparison.update({i: get_close_matches(i, units, n=1, cutoff=0.1)})
    unit = list(comparison.keys()) if comparison else None
    match = []
    score = []
    if comparison:
        for key, value in comparison.items():
            if value:
                match.append(value[0])
                score.append(round(SequenceMatcher(None, i, value[0]).ratio(), 2))
            else:
                match.append(None)
                score.append(None)
    else:
        match.append(None)
        score.append(None)

    return pd.Series([unit, match, score], index = ['unit', 'match', 'score'])

In [86]:
cleaned_types_df = pd.DataFrame()
cleaned_types_df[['unit', 'match', 'score']] = units_df['unit'].apply(lambda x: compare(x))
cleaned_types_df = cleaned_types_df.applymap(lambda x: x[0] if x else '')

In [87]:
cleaned_types_df[cleaned_types_df['unit'] == 'ks/bm150']

Unnamed: 0,unit,match,score
4596,ks/bm150,50,0.4


In [88]:
# clustering unmatched units
to_cluster_df = cleaned_types_df[cleaned_types_df['score'] < 0.7].reset_index(drop=True)

matched = []
def compare(i):
    compare = {}
    if i in matched:
        compare.update({i: ''})
    else:
        compare.update({i: get_close_matches(i, to_cluster_df['unit'].to_list(), 20, 0.7)})
    matched.extend([item for sublist in compare.values() for item in sublist])
    unit = list(compare.keys())
    match = []
    for key, items in compare.items():
        match.append(items)
    return pd.Series([unit, match], index=['unit', 'match'])

In [89]:
cluster_cleaned_df = pd.DataFrame()
cluster_cleaned_df[['unit', 'match']] = to_cluster_df['unit'].apply(lambda x: compare(x))
cluster_cleaned_df = cluster_cleaned_df.applymap(lambda x: x[0] if x else '')

cluster_cleaned_df['unit'] = cluster_cleaned_df['unit'].astype('str')

for i, row in cluster_cleaned_df.iterrows():
    string = row['unit']
    lst = row['match']

    # checking if the string exists in any list of previous rows
    if not lst:
        for prev_i in range(i):
            prev_lst = cluster_cleaned_df.at[prev_i, 'match']
            if string in prev_lst:
                cluster_cleaned_df.at[i, 'match'] = prev_lst
                break  

In [90]:
cluster_cleaned_df[cluster_cleaned_df['unit'] == 'ks/bm150']

Unnamed: 0,unit,match
4285,ks/bm150,"[bm 150, bm150, bm_150, bm 100, pm 1500, max 1..."


In [91]:
cluster_cleaned_df['match_concat'] = cluster_cleaned_df['match'].apply(lambda x:' '.join(x))
cluster_cleaned_df['match_split'] = cluster_cleaned_df['match_concat'].apply(lambda x: re.split(r'\s+|-|\(|\)|/|\\|\||,', x))

In [92]:
cluster_cleaned_df[cluster_cleaned_df['unit'] == 'ks/bm150']

Unnamed: 0,unit,match,match_concat,match_split
4285,ks/bm150,"[bm 150, bm150, bm_150, bm 100, pm 1500, max 1...",bm 150 bm150 bm_150 bm 100 pm 1500 max 150 bm ...,"[bm, 150, bm150, bm_150, bm, 100, pm, 1500, ma..."


In [68]:
# # extracting most common words from each cluster in order
# cluster_word_freq = {}

# for id, row in cluster_cleaned_df.iterrows():
#     cluster = row['match_concat']

#     words = re.split(r'\s+|-|\(|\)|/|\\|\||,', cluster)
#     words = [word for word in words if word.strip()]

#     for word in words:
#         if id in cluster_word_freq:
#             cluster_word_freq[id][word] = cluster_word_freq[id].get(word, 0) + 1
#         else:
#             cluster_word_freq[id] = {word: 1}

# for id in cluster_word_freq:
#     cluster_word_freq[id] = sorted(cluster_word_freq[id].items(), key=lambda x: x[1], reverse=True)
    
# cluster_word_freq_df = pd.DataFrame.from_dict(cluster_word_freq.items())
# cluster_word_freq_df.rename(columns={0: 'id', 1: 'word_freq'}, inplace=True)

# cluster_word_freq_df['cluster_name'] = cluster_word_freq_df['word_freq'].apply(lambda x: ''.join(word[0] for word in x[:1]))

In [93]:
cluster_cleaned_df['cluster_name'] = cluster_cleaned_df['match'].apply(lambda x: x[0])
cluster_cleaned_df

Unnamed: 0,unit,match,match_concat,match_split,cluster_name
0,,"[nan, nn, na]",nan nn na,"[nan, nn, na]",
1,bag,"[bag, bags, bg, ba, ag, kgbag, 1_bag]",bag bags bg ba ag kgbag 1_bag,"[bag, bags, bg, ba, ag, kgbag, 1_bag]",bag
2,x,[x],x,[x],x
3,3/4*1/2,"[3/4*1/2, 3/4* 1/2, 3/4*1/2ft, 3/4""*1/2"", 3/4×...","3/4*1/2 3/4* 1/2 3/4*1/2ft 3/4""*1/2"" 3/4×1/2 3...","[3, 4*1, 2, 3, 4*, 1, 2, 3, 4*1, 2ft, 3, 4""*1,...",3/4*1/2
4,"1""","[1"", l1"", 1½"", 1¼"", 19"", 18"", 17"", 16"", 15"", 1...","1"" l1"" 1½"" 1¼"" 19"" 18"" 17"" 16"" 15"" 14"" 12"" 11""...","[1"", l1"", 1½"", 1¼"", 19"", 18"", 17"", 16"", 15"", 1...","1"""
...,...,...,...,...,...
22095,"vitzs, belta,linco","[vitzs, belta,linco]","vitzs, belta,linco","[vitzs, , belta, linco]","vitzs, belta,linco"
22096,"apol3,pickup","[apol3,pickup]","apol3,pickup","[apol3, pickup]","apol3,pickup"
22097,restoration,"[restoration, castration]",restoration castration,"[restoration, castration]",restoration
22098,holts cold start,[holts cold start],holts cold start,"[holts, cold, start]",holts cold start


In [30]:
# cluster_names = cluster_word_freq_df['cluster_name'].to_list()

# def find_cluster_name(string):
#     for i in cluster_names:
#         if i in string:
#             return i

# cluster_cleaned_df['cluster_name'] = cluster_cleaned_df['match_split'].apply(find_cluster_name)

In [94]:
cluster_cleaned_df[cluster_cleaned_df['cluster_name'].isna()]

Unnamed: 0,unit,match,match_concat,match_split,cluster_name


In [95]:
clean_types = cleaned_types_df[cleaned_types_df['score'] >= 0.7]
df = df.merge(clean_types[['unit', 'match']], how='left', on='unit')
df = df.merge(cluster_cleaned_df[['unit', 'cluster_name']], how='left', on='unit')
df['match'] = np.where(df['match'].isna(), df['cluster_name'], df['match'])
df['match'] = np.where(df['match'].isna(), df['unit'], df['match'])
# df['match'] = np.where(df['match'].isna(), df['product_type'], df['match'])

In [96]:
df[df['correct_product_match'] == 'gasket clutch block']

Unnamed: 0,correct_product_match,product_type,number,unit,match,cluster_name
2822,gasket clutch block,loose,loose,loose,loose,loose
29052,gasket clutch block,cg 150(complete),cg 150(complete),cg 150(complete),cg 150 (cooler),cg 150 (cooler)
29059,gasket clutch block,cg150 incomplete,cg150 incomplete,cg150 incomplete,cg150 incomplete,cg150 incomplete
29627,gasket clutch block,bm 150,bm 150,bm 150,bm 150,bm 150
29651,gasket clutch block,bj 100,bj 100,bj 100,bj 100,bj 100
30962,gasket clutch block,unit,unit,unit,unit,
37444,gasket clutch block,piece,piece,piece,pce,
40874,gasket clutch block,1unit,1,unit,unit,
41729,gasket clutch block,1pc,1,pc,pcs,
42066,gasket clutch block,pcs,pcs,pcs,pcs,


In [97]:
def add_correct_product_type_column(df):
    def is_numeric(value):
        pattern = r'^\d+(\.\d+)?|½|¼|¾|\d+\/\d+$'
        return bool(re.match(pattern, value))

    df['correct_product_type'] = df.apply(lambda row: row['number'] + row['match'] if row['number'] != row['match'] and is_numeric(row['number']) and row['match'].isalpha() else row['match'], axis=1)

df[['number', 'match']] = df[['number', 'match']].astype(str)
add_correct_product_type_column(df)

In [98]:
df[df['correct_product_match'] == 'gasket clutch block']

Unnamed: 0,correct_product_match,product_type,number,unit,match,cluster_name,correct_product_type
2822,gasket clutch block,loose,loose,loose,loose,loose,loose
29052,gasket clutch block,cg 150(complete),cg 150(complete),cg 150(complete),cg 150 (cooler),cg 150 (cooler),cg 150 (cooler)
29059,gasket clutch block,cg150 incomplete,cg150 incomplete,cg150 incomplete,cg150 incomplete,cg150 incomplete,cg150 incomplete
29627,gasket clutch block,bm 150,bm 150,bm 150,bm 150,bm 150,bm 150
29651,gasket clutch block,bj 100,bj 100,bj 100,bj 100,bj 100,bj 100
30962,gasket clutch block,unit,unit,unit,unit,,unit
37444,gasket clutch block,piece,piece,piece,pce,,pce
40874,gasket clutch block,1unit,1,unit,unit,,1unit
41729,gasket clutch block,1pc,1,pc,pcs,,1pcs
42066,gasket clutch block,pcs,pcs,pcs,pcs,,pcs


In [99]:
df = df[['correct_product_match', 'product_type', 'correct_product_type']].\
            drop_duplicates(subset=['correct_product_match', 'product_type', 'correct_product_type'], keep='first')
            
data[['correct_product_match', 'product_type']] = data[['correct_product_match', 'product_type']].applymap(lambda x: str(x).lower().strip())
data = data.merge(df[['correct_product_match', 'product_type', 'correct_product_type']], how='left', on=['correct_product_match', 'product_type'])

data[['correct_product_match', 'product_type', 'correct_product_type_y']][-50:]

Unnamed: 0,correct_product_match,product_type,correct_product_type_y
959551,hub bearing rubber,ks/bm150,bm 150
959552,kick start,ks/bm150,bm 150
959553,axle front half,unit,unit
959554,clutch cable tvs,unit,unit
959555,bulb holder small,unit,unit
959556,battery solar,12n-2.5,12n
959557,sulfamycin k sk,100gm,100gm
959558,rope polytwisted,2.5g,2.5g
959559,syringe ml,5ml,5ml
959560,syringe ml,10ml,10ml


In [100]:
data[data['product_type'] == 'ks/bm150'][['correct_product_match', 'product_type', 'correct_product_type_y']]

Unnamed: 0,correct_product_match,product_type,correct_product_type_y
81225,sprocket complete chain,ks/bm150,bm 150
81228,engine valve,ks/bm150,bm 150
112746,cylinder head cg,ks/bm150,bm 150
161343,cap wig plug,ks/bm150,bm 150
164167,main switch stand,ks/bm150,bm 150
257099,brake pedal,ks/bm150,bm 150
257286,gasket head cylinder,ks/bm150,bm 150
258048,gasket clutch block,ks/bm150,bm 150
258052,motorcycle bearing chain,ks/bm150,bm 150
258057,gasket clutch block,ks/bm150,bm 150
