# Matching Product Names and Descriptions using FuzzyWuzzy

In [85]:
import pandas as pd
import numpy as np
import time
import logging
from fuzzywuzzy import fuzz,process
 
# DataFrame display: no limit on the number of columns shown, at most 100 rows.
pd.options.display.max_columns = None
pd.options.display.max_rows = 100

# Module-named logger; basicConfig requests INFO level on the root logger.
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)

Import datasets

In [86]:
# Shopee catalogue, loaded as-is.
shopeeList = pd.read_csv('datasets/shp_data.csv')

# Shop mapping: only the first two columns are kept, both cast to int.
mappingList = pd.read_csv('datasets/shop_mapping.csv')
mappingList = mappingList.iloc[:, :2].astype(int)

# Lazada catalogue, left-joined to the mapping on 'LZD Shop ID'
# (presumably this is where 'SHP Shop ID' comes from -- confirm against the CSVs).
lazadaList = pd.merge(
  pd.read_csv('datasets/lzd_data.csv'),
  mappingList,
  how='left',
  on='LZD Shop ID',
)

# Distinct, non-null Shopee shop ids present on the merged Lazada rows.
shopIdList = lazadaList['SHP Shop ID'].dropna().drop_duplicates()

# Per-shop slices of each catalogue, keyed by Shopee shop id.
lzdDict = {
  shopId: lazadaList.loc[lazadaList['SHP Shop ID'] == shopId]
  for shopId in shopIdList
}
shpDict = {
  shopId: shopeeList.loc[shopeeList['SHP Shop ID'] == shopId]
  for shopId in shopIdList
}

# Ground-truth table: Lazada item id plus the expected Shopee item name,
# with the name column relabelled so it cannot clash with the matcher output.
correctMatch = (
  pd.read_csv('datasets/correct_match.csv')[['LZD Item ID', 'SHP Item Name']]
  .rename(columns={'SHP Item Name': 'Correct Match - SHP Item Name'})
)

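New Matching Logic: applies both the Token Sort and Token Set ratios from the FuzzyWuzzy package to the item name and, when the Lazada item has a description, to the item description; the four scores are summed and the highest-scoring Shopee item is taken as the match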

In [87]:
def newFuzzMatcher(lzdInfo,shpInfoDf):
  """Pick the best-scoring Shopee candidate for one Lazada item.

  Every row of ``shpInfoDf`` is scored against the Lazada item with
  ``fuzz.token_sort_ratio`` and ``fuzz.token_set_ratio`` on the item name and,
  when the Lazada description is present, on the item description as well.
  The four scores are summed into an overall score; the candidate with the
  highest overall score is returned, and the earliest row wins a tie.

  Parameters
  ----------
  lzdInfo : list
    One Lazada row, read positionally: indices 0-4 are shop id, item id,
    item name, item description and mapped Shopee shop id.
  shpInfoDf : pandas.DataFrame
    Candidate Shopee rows, read positionally: columns 1-3 are item id,
    item name and item description.

  Returns
  -------
  list or None
    ``[lzdShopId, lzdItemId, lzdItemName, lzdItemDesc, shpShopId, shpItemId,
    shpItemName, shpItemDesc, tknSortNameScore, tknSetNameScore,
    tknSortDescScore, tknSetDescScore, overallScore]`` for the winning
    candidate, or ``None`` when ``shpInfoDf`` has no rows.
  """
  lzdShopId, lzdItemId, lzdItemName, lzdItemDesc, shpShopId = lzdInfo[:5]

  candidates = shpInfoDf.values.tolist()
  if not candidates:
    # Nothing to compare against: report "no match" instead of raising IndexError.
    return None

  # A description is missing when it is the literal text 'None' or a null value
  # (NaN/None/NA); a null must not be fuzzy-compared as if it were real text.
  # pd.isna is checked first so the string comparison never sees pd.NA.
  hasDesc = not (pd.isna(lzdItemDesc) or lzdItemDesc == 'None')

  bestMatch = None
  for shpRow in candidates:
    shpItemId, shpItemName, shpItemDesc = shpRow[1], shpRow[2], shpRow[3]

    tknSortNameScore = fuzz.token_sort_ratio(lzdItemName, shpItemName)
    tknSetNameScore = fuzz.token_set_ratio(lzdItemName, shpItemName)
    if hasDesc:
      tknSortDescScore = fuzz.token_sort_ratio(lzdItemDesc, shpItemDesc)
      tknSetDescScore = fuzz.token_set_ratio(lzdItemDesc, shpItemDesc)
    else:
      tknSortDescScore = 0
      tknSetDescScore = 0
    overallScore = tknSortNameScore + tknSetNameScore + tknSortDescScore + tknSetDescScore

    # Strict '>' keeps the earliest candidate when overall scores tie.
    if bestMatch is None or overallScore > bestMatch[-1]:
      bestMatch = [lzdShopId,lzdItemId,lzdItemName,lzdItemDesc,shpShopId,shpItemId,shpItemName,shpItemDesc,
                   tknSortNameScore,tknSetNameScore,tknSortDescScore,tknSetDescScore,overallScore]

  return bestMatch

startTime = time.time()
# Matching: for every mapped shop, score each Lazada item against that shop's Shopee items.
appendList1 = []
logger.info('Start matching...')
for shopId in shopIdList:
  logger.info('Matching Shop ID: {}'.format(shopId))
  lzdDf = lzdDict[shopId]
  shpDf = shpDict[shopId]

  for lzdIndex in range(len(lzdDf)):
    lzdInfo = lzdDf.iloc[lzdIndex].values.tolist()
    appendList1.append(newFuzzMatcher(lzdInfo, shpDf))

# Keep only real matches. An explicit `is not None` test replaces
# `filter(None.__ne__, ...)`, which depended on the truthiness of NotImplemented
# (deprecated since Python 3.9, a TypeError from Python 3.14).
toList = [matchRow for matchRow in appendList1 if matchRow is not None]
logger.info('Done matching...')
newLogicDf = pd.DataFrame(toList,columns=['LZD Shop ID','LZD Item ID','LZD Item Name','LZD Item Description','SHP Shop ID','SHP Item ID','SHP Item Name','SHP Item Description',
                                          'Token Sort Name Score','Token Set Name Score','Token Sort Desc Score','Token Set Desc Score','Overall Score'])

endTime = time.time()
totalTime = endTime - startTime
logger.info(f'Time Elapsed: {time.strftime("%H:%M:%S", time.gmtime(totalTime))}')

# Attach the expected Shopee name to each match and grade it:
# 'Pass' when the matched name equals the expected one, otherwise 'Fail'
# (items without an expected name therefore grade as 'Fail').
newLogicResult = newLogicDf.merge(correctMatch,how='left',on='LZD Item ID')
newLogicResult['Test Result'] = np.where(newLogicResult['SHP Item Name'] == newLogicResult['Correct Match - SHP Item Name'], 'Pass', 'Fail')

INFO:__main__:Start matching...
INFO:__main__:Matching Shop ID: 26704352
INFO:__main__:Matching Shop ID: 48225853
INFO:__main__:Matching Shop ID: 17492625
INFO:__main__:Done matching...
INFO:__main__:Time Elapsed: 00:01:02


Old Matching Logic: applies only the Token Sort ratio from the FuzzyWuzzy package, and only to the item name (descriptions are not scored)

In [88]:
def oldFuzzMatcher(lzdInfo,shpInfoDf):
  """Pick the best Shopee candidate for one Lazada item using the name only.

  Every row of ``shpInfoDf`` is scored with ``fuzz.token_sort_ratio`` on the
  item name. The token-set and description scores are fixed at 0 so the
  result has the same layout as ``newFuzzMatcher``. The candidate with the
  highest overall score is returned, and the earliest row wins a tie.

  Parameters
  ----------
  lzdInfo : list
    One Lazada row, read positionally: indices 0-4 are shop id, item id,
    item name, item description and mapped Shopee shop id.
  shpInfoDf : pandas.DataFrame
    Candidate Shopee rows, read positionally: columns 1-3 are item id,
    item name and item description.

  Returns
  -------
  list or None
    ``[lzdShopId, lzdItemId, lzdItemName, lzdItemDesc, shpShopId, shpItemId,
    shpItemName, shpItemDesc, tknSortNameScore, tknSetNameScore,
    tknSortDescScore, tknSetDescScore, overallScore]`` for the winning
    candidate, or ``None`` when ``shpInfoDf`` has no rows.
  """
  lzdShopId, lzdItemId, lzdItemName, lzdItemDesc, shpShopId = lzdInfo[:5]

  candidates = shpInfoDf.values.tolist()
  if not candidates:
    # Nothing to compare against: report "no match" instead of raising IndexError.
    return None

  # Scores this logic never computes; kept so the output columns stay aligned.
  tknSetNameScore = 0
  tknSortDescScore = 0
  tknSetDescScore = 0

  bestMatch = None
  for shpRow in candidates:
    shpItemId, shpItemName, shpItemDesc = shpRow[1], shpRow[2], shpRow[3]
    tknSortNameScore = fuzz.token_sort_ratio(lzdItemName, shpItemName)
    overallScore = tknSortNameScore + tknSetNameScore + tknSortDescScore + tknSetDescScore

    # Strict '>' keeps the earliest candidate when overall scores tie.
    if bestMatch is None or overallScore > bestMatch[-1]:
      bestMatch = [lzdShopId,lzdItemId,lzdItemName,lzdItemDesc,shpShopId,shpItemId,shpItemName,shpItemDesc,
                   tknSortNameScore,tknSetNameScore,tknSortDescScore,tknSetDescScore,overallScore]

  return bestMatch

startTime = time.time()
# Matching: for every mapped shop, score each Lazada item against that shop's Shopee items.
appendList2 = []
logger.info('Start matching...')
for shopId in shopIdList:
  logger.info('Matching Shop ID: {}'.format(shopId))
  lzdDf = lzdDict[shopId]
  shpDf = shpDict[shopId]

  for lzdIndex in range(len(lzdDf)):
    lzdInfo = lzdDf.iloc[lzdIndex].values.tolist()
    appendList2.append(oldFuzzMatcher(lzdInfo, shpDf))

# Keep only real matches. An explicit `is not None` test replaces
# `filter(None.__ne__, ...)`, which depended on the truthiness of NotImplemented
# (deprecated since Python 3.9, a TypeError from Python 3.14).
toList = [matchRow for matchRow in appendList2 if matchRow is not None]
logger.info('Done matching...')
oldLogicDf = pd.DataFrame(toList,columns=['LZD Shop ID','LZD Item ID','LZD Item Name','LZD Item Description','SHP Shop ID','SHP Item ID','SHP Item Name','SHP Item Description',
                                          'Token Sort Name Score','Token Set Name Score','Token Sort Desc Score','Token Set Desc Score','Overall Score'])
endTime = time.time()
totalTime = endTime - startTime
logger.info(f'Time Elapsed: {time.strftime("%H:%M:%S", time.gmtime(totalTime))}')

# Attach the expected Shopee name to each match and grade it:
# 'Pass' when the matched name equals the expected one, otherwise 'Fail'
# (items without an expected name therefore grade as 'Fail').
oldLogicResult = oldLogicDf.merge(correctMatch,how='left',on='LZD Item ID')
oldLogicResult['Test Result'] = np.where(oldLogicResult['SHP Item Name'] == oldLogicResult['Correct Match - SHP Item Name'], 'Pass', 'Fail')

INFO:__main__:Start matching...
INFO:__main__:Matching Shop ID: 26704352
INFO:__main__:Matching Shop ID: 48225853
INFO:__main__:Matching Shop ID: 17492625
INFO:__main__:Done matching...
INFO:__main__:Time Elapsed: 00:00:01


Comparison between New Logic and Old Logic

In [90]:
# Whole-number percentage of 'Pass' rows for each logic.
# NOTE(review): the denominator 52 is hard-coded -- presumably the number of
# test items; confirm it matches the evaluated data before reusing this cell.
newLogic_succesRate = round(newLogicResult['Test Result'].eq('Pass').sum() / 52 * 100)
oldLogic_succesRate = round(oldLogicResult['Test Result'].eq('Pass').sum() / 52 * 100)

# Gap between the two rounded rates.
perc_diff = newLogic_succesRate - oldLogic_succesRate

print(
  'New Logic Matching Succes Rate: {}%'.format(newLogic_succesRate),
  'Old Logic Matching Succes Rate: {}%'.format(oldLogic_succesRate),
  'Conclusion: New Logic is more accurate by {}% against Old Logic'.format(perc_diff),
  sep='\n',
)

New Logic Matching Succes Rate: 87%
Old Logic Matching Succes Rate: 71%
Conclusion: New Logic is more accurate by 16% against Old Logic
