In [401]:
import pandas as pd
import numpy as np
import re
import Levenshtein as lev
from fuzzywuzzy import fuzz
import time
import sys

from static import *

# To calculate: TF-IDF & Cosine Similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import csr_matrix
import sparse_dot_topn.sparse_dot_topn as ct

import warnings
warnings.filterwarnings("ignore")

In [402]:
# useful functions import
from static import *

In [403]:
import nltk.corpus
nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/gonzalooportus/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# CODE

In [404]:
# Initial time
t_initial = time.time()

## 1. Pre-processing

In [405]:
# parameters definition
country = 'uk'
parent_chain = 'booker' # lower case and "clean"
parent_chain_column = 'parent_chain_name'
item_column = 'item_name'
language_ = 'en'
threshold_products = 85
threshold_package = 75
parent_chain_use = True

In [406]:
# reading raw data
data = pd.read_csv('data/uk_booker_products.csv')

In [407]:
if parent_chain_use:
    # cleaning parent chain name as it has duplicated entries
    df = clean_text(data, parent_chain_column, '{}_{}'.format(parent_chain_column, 'norm'))
    # chain selection and columns to work on
    df_nlp = df[df['parent_chain_name_norm'] == parent_chain]
    df_nlp = df_nlp.loc[:, ['parent_chain_name_norm', item_column]].reset_index(drop=True)
else:
    df_nlp = data.loc[:, [item_column]].drop_duplicates().reset_index(drop=True)

In [408]:
# item name standardization
df_nlp.rename(columns={'sku_name': 'item_name'}, inplace=True)

In [409]:
print(f"Initial products: {len(list(set(df_nlp['item_name'].unique())))}")

Initial products: 50587


## 2. NLP Aplication

In [410]:
if language_ == 'en':
    stop_words = stopwords.words('english')
elif language_ == 'es':
    stop_words = stopwords.words('spanish')

In [411]:
regex_clean = r'(pm \d+\w+)|(pm \d+\.\d+)|(pm\d+\.\d+)|(\d+ pmp)|(pm\d+)|( \.+)|(pmp\d+.\d+)|(\d+pmp)|(pmp \d+)|(\d+.\d+ pm)'


In [412]:
df_nlp = nlp_cleaning(df_nlp, stop_words, regex_clean)

In [413]:
print(f'Percentage of unique products after NLP: {round(len(df_nlp.product_name.unique())/len(df_nlp.item_name.unique()), 3)}')

Percentage of unique products after NLP: 0.827


In [414]:
df_nlp[:2]

Unnamed: 0,parent_chain_name_norm,item_name,item_name_norm,item_name_norm_stop,item_name_token,item_name_token_lemma,product_name
0,booker,\tAunt Bessie's Hearty & Homely Dumpling Mix 140g,aunt bessies hearty homely dumpling mix 140g,aunt bessies hearty homely dumpling mix 140g,"[aunt, bessies, hearty, homely, dumpling, mix,...","[aunt, bessies, hearty, homely, dumpling, mix,...",aunt bessies hearty homely dumpling mix 140g
1,booker,\tBC Choc Fudge Brownie,bc choc fudge brownie,bc choc fudge brownie,"[bc, choc, fudge, brownie]","[bc, choc, fudge, brownie]",bc choc fudge brownie


### Creating mapping between source item_name & product_name (post NLP)

In [415]:
df_back_propagation = df_nlp.loc[:, ['item_name', 'product_name']]

In [416]:
df_back_propagation.to_csv(f'back_propagation/groups_{country}_back_propagation.csv', index=False)

## 3. TF-IDF Application

### Creating a tf-idf matrix

In [417]:
# preparing set for TF-IDF
df_tf = df_nlp.loc[:, ['product_name']]
df_tf = df_tf.drop_duplicates().reset_index(drop=True)
df_tf['id'] = range(1, len(df_tf) + 1)

### Applying method

In [418]:
# create object
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1,2), max_df=0.9, min_df=2, token_pattern='(\S+)')

In [419]:
# get tf-idf values
tf_idf_matrix = tfidf_vectorizer.fit_transform(df_tf['product_name'])

In [420]:
tf_idf_matrix.shape

(41819, 26321)

## 4. Computing cosine similarity

In [421]:
matches = cosine_similarity(tf_idf_matrix, tf_idf_matrix.transpose(), 25, 0)

In [422]:
matches.shape

(41819, 41819)

### Create a match table to show the similarity scores

In [423]:
matches_df = pd.DataFrame()
matches_df = get_matches_df(matches, df_tf['product_name'], top=False)

In [424]:
matches_df = matches_df.drop_duplicates().reset_index(drop=True)

In [425]:
matches.shape

(41819, 41819)

In [426]:
matches_df.head()

Unnamed: 0,product_name,match,similarity_score
0,aunt bessies hearty homely dumpling mix 140g,aunt bessies hearty homely dumpling mix 140g,1.0
1,aunt bessies hearty homely dumpling mix 140g,aunt bessies dumpling mix 140g,0.909142
2,aunt bessies hearty homely dumpling mix 140g,aunt bessies dumpling mix,0.747339
3,aunt bessies hearty homely dumpling mix 140g,aunt bessies rosties,0.575958
4,aunt bessies hearty homely dumpling mix 140g,aunt bessies,0.575958


### Products without a match

In [427]:
prod_list = list(df_tf.product_name.unique())
match_list = list(pd.unique(matches_df[['product_name', 'match']].values.ravel('K')))

In [428]:
not_match = []
for prod_ in prod_list:
    if prod_ not in match_list:
        not_match.append(prod_)

In [429]:
print(f'Number of products without match: {len(not_match)}')
print(f'Percentage of products without match: {round(len(not_match)/len(prod_list), 6)}')

Number of products without match: 84
Percentage of products without match: 0.002009


### Who are they?

In [430]:
not_match[:4]

['70clbacardi', 'amareto', 'andrexwashlets', 'asparagus']

## 5. Grouping products

### 5.1 Fuzzy ratios calculation

In [431]:
matches_df['fuzz_ratio'] = matches_df.apply(lambda x: fuzz.token_sort_ratio(x['product_name'], x['match']), axis=1)

In [432]:
matches_df.head()

Unnamed: 0,product_name,match,similarity_score,fuzz_ratio
0,aunt bessies hearty homely dumpling mix 140g,aunt bessies hearty homely dumpling mix 140g,1.0,100
1,aunt bessies hearty homely dumpling mix 140g,aunt bessies dumpling mix 140g,0.909142,81
2,aunt bessies hearty homely dumpling mix 140g,aunt bessies dumpling mix,0.747339,72
3,aunt bessies hearty homely dumpling mix 140g,aunt bessies rosties,0.575958,50
4,aunt bessies hearty homely dumpling mix 140g,aunt bessies,0.575958,43


### 5.2 Keeping products with high similarity

In [433]:
print(f'Product Threshold: {threshold_products}')

Product Threshold: 85


In [434]:
df_similars = matches_df[matches_df['fuzz_ratio'] >= threshold_products].\
                        drop_duplicates(subset=['product_name', 'match']).reset_index(drop=True)

### 5.3 Logic to aggregate

In [435]:
df_similars = df_similars.sort_values(by=['product_name', 'match']).reset_index(drop=True)

### a) Extending similarities

In [436]:
df_similars_copy = df_similars.drop(columns=['similarity_score', 'fuzz_ratio'], axis=1).copy()

In [437]:
df_similars_copy.rename(columns={'match': 'extended_match', 'product_name': 'match'}, inplace=True)

In [438]:
# extending
df_similars_mrg = df_similars.merge(df_similars_copy, how='inner', on='match')

In [439]:
df_similars_mrg.drop('similarity_score', axis=1, inplace=True)

In [440]:
# melt dataframe
df_melt = df_similars_mrg.melt(id_vars=['product_name', 'fuzz_ratio'], var_name='which_match', value_name='candidate')

In [441]:
df_melt = df_melt.drop('which_match', axis=1)[['product_name', 'candidate', 'fuzz_ratio']]

In [442]:
df_similars_ext = df_melt.drop_duplicates(['product_name', 'candidate']).sort_values(by=['product_name', 'candidate'])\
            .reset_index(drop=True)


In [443]:
df_similars_ext.head()

Unnamed: 0,product_name,candidate,fuzz_ratio
0,1 kg baker adult beef dog food,1 kg baker adult beef dog food,100
1,1 kg happy shopper plain flour,1 kg happy shopper plain flour,100
2,1 kg happy shopper plain flour,happy shopper plain flour 1kg,92
3,1 kg happy shopper plain flour,happy shopper plain flour 500g,87
4,1 kg happy shopper plain flour,happy shopper selfraising flour 1kg,92


### b) Package similarity

In [444]:
reg_package = r'(\d+x\d+\w+)|(\d+ x \d+\w+)|(\d+\.+\d+\w+)|(\d+\.+\d+ \w+)|(\d+ ml)|(\d+ g)|(\d+\w+)|(\d+ \w+)'

In [445]:
# extracting package
df_similars_ext['package'] = package_extract(df_similars_ext, 'product_name', reg_package)
df_similars_ext['package_candidate'] = package_extract(df_similars_ext, 'candidate', reg_package)

In [446]:
# package similarity
df_similars_ext['package_ratio'] = df_similars_ext.apply(lambda x: fuzz.token_sort_ratio(x['package'],\
                                                                                x['package_candidate']), axis=1)

In [447]:
df_similars_ext.head()

Unnamed: 0,product_name,candidate,fuzz_ratio,package,package_candidate,package_ratio
0,1 kg baker adult beef dog food,1 kg baker adult beef dog food,100,1 kg,1 kg,100
1,1 kg happy shopper plain flour,1 kg happy shopper plain flour,100,1 kg,1 kg,100
2,1 kg happy shopper plain flour,happy shopper plain flour 1kg,92,1 kg,1kg,86
3,1 kg happy shopper plain flour,happy shopper plain flour 500g,87,1 kg,500g,25
4,1 kg happy shopper plain flour,happy shopper selfraising flour 1kg,92,1 kg,1kg,86


### c) Tansforming product names into integers (easier to compare)

In [448]:
product_index_dict = dict(zip(df_tf['product_name'], df_tf.index))
index_product_dict = dict(zip(df_tf.index, df_tf['product_name']))

In [449]:
for col in ['product_name', 'candidate']:
    df_similars_ext[col] = df_similars_ext[col].map(product_index_dict)

In [450]:
df_similars_ext.head()

Unnamed: 0,product_name,candidate,fuzz_ratio,package,package_candidate,package_ratio
0,867,867,100,1 kg,1 kg,100
1,81,81,100,1 kg,1 kg,100
2,81,18972,92,1 kg,1kg,86
3,81,18973,87,1 kg,500g,25
4,81,19009,92,1 kg,1kg,86


### d) Package filter + Column selection

In [451]:
print(f'Package Threshold: {threshold_package}')

Package Threshold: 75


In [452]:
df_clean = df_similars_ext[df_similars_ext['package_ratio'] > threshold_package].reset_index(drop=True)

In [453]:
df_clean = df_clean.loc[:, ['product_name', 'candidate']]

In [454]:
df_clean.head()

Unnamed: 0,product_name,candidate
0,867,867
1,81,81
2,81,18972
3,81,19009
4,80,80


### e ) Functions

In [455]:
def product_name_replacement(df, dic_):
    df['product_name'] = df['product_name'].map(dic_)
    df['candidate'] = df['candidate'].map(dic_)
    return df

### f) Procedure: for each product

In [456]:
clean_leaders = df_clean['product_name'].unique()

In [457]:
len(clean_leaders), len(df_similars['match'].unique())

(41735, 41735)

In [458]:
df_clean.head()

Unnamed: 0,product_name,candidate
0,867,867
1,81,81
2,81,18972
3,81,19009
4,80,80


In [459]:
# time before
t_bef_group = time.time()

In [460]:
# dataframe definition
groups_df = pd.DataFrame(columns=['group_id', 'leader', 'member'])
track_df = pd.DataFrame(columns=['group_id', 'member'])

for leader in clean_leaders:
    select_df = df_clean[df_clean['product_name'] == leader] 
    applicants_list = list(pd.unique(select_df[['product_name', 'candidate']].values.ravel('K')))
    groups_df, track_df = verify_and_concat_groups(groups_df, track_df, leader, applicants_list)

In [461]:
# time run
t_run = time.time()-t_bef_group
print(f'Time to run procedure: {round(t_run/60, 3)} minutes!')

Time to run procedure: 11.351 minutes!


In [462]:
print(f'Number of groups: {len(groups_df["group_id"].unique())}')

Number of groups: 24037


In [463]:
# replacing product names
groups_df['leader'] = groups_df['leader'].map(index_product_dict)
groups_df['member'] = groups_df['member'].map(index_product_dict)

In [464]:
groups_df.head()

Unnamed: 0,group_id,leader,member
0,0,1 kg baker adult beef dog food,1 kg baker adult beef dog food
1,2,1 kg happy shopper self raising flour,1 kg happy shopper self raising flour
2,12,1 ltr carter low cal tonic,1 ltr carter low cal tonic
3,14,1 ltr chekov,1 ltr chekov
4,18,1 ltr e energy drink,1 ltr e energy drink


In [465]:
# Complete run time
t_complete = time.time()-t_initial
print(f'Time to run it all: {round(t_complete/60, 3)} minutes!')

Time to run it all: 14.614 minutes!


## 6. Validation

### 6.1 Products added

In [466]:
original_products = df_tf['product_name'].unique()
len(original_products)

41819

In [467]:
added_products = pd.unique(groups_df[['leader', 'member']].values.ravel('K'))
len(added_products)

41735

In [468]:
not_added = []
for prod_ in original_products:
    if prod_ not in added_products:
        not_added.append(prod_)

In [469]:
print(f'Number of products without group: {len(not_added)}')

Number of products without group: 84


### Who are they?

In [470]:
not_added[-10:]

['sodastream cylinder',
 'tiger640ml',
 'tights',
 'toothpick',
 'ultratape sellotape',
 'yazoobanana400ml',
 'yazoochocolate400ml',
 'yazoostrawberry400ml',
 'glenmoange',
 'starbust']

### 6.2 Duplicated leaders / members ?? (in 2 or more groups)

In [471]:
# uniques: group_id - leader
leaders_df = groups_df[['group_id', 'leader']].drop_duplicates().reset_index(drop=True)

In [472]:
# duplicated leaders
leaders_df[leaders_df['leader'].duplicated() == True]

Unnamed: 0,group_id,leader


In [473]:
# uniques: group_id - member
members_df = groups_df[['group_id', 'member']].drop_duplicates().reset_index(drop=True)

In [474]:
# duplicated members
members_df[members_df['member'].duplicated() == True]

Unnamed: 0,group_id,member


### 6.3 Adding not matched products

Products not added to the groups dataframe are because previously they demonstrated low similarity on the clusters generated with TF-IDF + Cosine Similarity layer. This why they are added as "individual groups".

In [475]:
max_id = groups_df['group_id'].max()

In [476]:
not_added_df = pd.DataFrame(data={
                    'group_id': range(max_id, max_id + len(not_added)),
                    'leader': not_added,
                    'member': not_added})

In [477]:
# concat to groups_df
groups_df = pd.concat([groups_df, not_added_df], axis=0).reset_index(drop=True)

In [478]:
# concat to track df
track_df = pd.concat([track_df, not_added_df.loc[:, ['group_id', 'member']]], axis=0).reset_index(drop=True)

### 6.4 Saving results

In [479]:
groups_df = groups_df.sort_values(by=['leader', 'member']).reset_index(drop=True)

In [480]:
groups_df.to_csv(f'outputs/groups_{country}_{threshold_products}_{threshold_package}.csv', index=False)

### 6.5 Samples

In [481]:
len(groups_df['leader'].unique()), len(groups_df['member'].unique())

(24121, 41819)

In [482]:
groups_df[(groups_df['leader'].str.contains('coca'))|(groups_df['member'].str.contains('coca'))][:60]

Unnamed: 0,group_id,leader,member
651,1211,70 cl cococariba,70 cl cococariba
6239,6433,can cocacola 330ml,can cocacola 330ml
8100,8296,coca cola,coca cola
8101,8296,coca cola,coca cola
8102,8298,coca cola 1.25 litre,coca cola 1.25 litre
8103,8300,coca cola 1.75lt 2.29,coca cola 1.75lt 2.29
8104,8302,coca cola 10 x 330ml,coca cola 10 x 330ml
8105,8304,coca cola 2l,coca cola 2l
8106,8306,coca cola 500 ml,coca cola 500 ml
8107,8310,coca cola 6pck,coca cola 6pck


### 6.6 Are all leaders in members?

In [484]:
groups_df.head()

Unnamed: 0,group_id,leader,member
0,0,1 kg baker adult beef dog food,1 kg baker adult beef dog food
1,2,1 kg happy shopper self raising flour,1 kg happy shopper self raising flour
2,12,1 ltr carter low cal tonic,1 ltr carter low cal tonic
3,14,1 ltr chekov,1 ltr chekov
4,18,1 ltr e energy drink,1 ltr e energy drink


In [488]:
leaders_list = list(set(groups_df.leader))
members_list = list(set(groups_df.member))

len(leaders_list), len(members_list)

(24121, 41819)

In [491]:
len(list(set(~groups_df[groups_df['member'].isin(leaders_list)]['member'])))

TypeError: bad operand type for unary ~: 'str'

In [489]:
not_member = []
for leader_ in leaders_list:
    if leader_ not in members_list:
        not_member.append(leader_)

In [490]:
len(not_member)

0