In [317]:
import pandas as pd
import numpy as np
import re
import Levenshtein as lev
from fuzzywuzzy import fuzz
import time
import sys

from static import *

# To calculate: TF-IDF & Cosine Similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import csr_matrix
import sparse_dot_topn.sparse_dot_topn as ct

import warnings
warnings.filterwarnings("ignore")

In [318]:
# useful functions import
from static import *

In [319]:
import nltk.corpus
nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/gonzalooportus/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# CODE

In [320]:
# Initial time
t_initial = time.time()

## 1. Pre-processing

In [321]:
# parameters definition: every value a reader may want to tune lives in this cell.
# Country code; used to build the names of the CSV files written by this notebook.
country = 'cr'
# Chain to keep when `parent_chain_use` is True; it is compared against the normalised
# chain name, hence the requirement below.
parent_chain = 'booker' # lower case and "clean"
# Column of the raw data holding the parent-chain name.
parent_chain_column = 'parent_chain_name'
# Column of the raw data holding the item (SKU) name.
item_column = 'sku_name'
# Language of the item names; selects the NLTK stop-word list ('en' or 'es').
language_ = 'es'
# Minimum fuzz ratio for a product/match pair to be kept as similar (applied with `>=`).
threshold_products = 85
# Package-ratio cut-off; a pair is kept only when its ratio is strictly above it (`>`).
threshold_package = 75
# When False the parent-chain filter is skipped and all distinct item names are used.
parent_chain_use = False

In [322]:
# reading raw data
data = pd.read_csv('data/CR_products.csv')

In [323]:
# Name of the normalised parent-chain column. It is derived from `parent_chain_column`
# rather than hard-coded, so changing that parameter keeps the filter below working.
parent_chain_norm_column = '{}_{}'.format(parent_chain_column, 'norm')

if parent_chain_use:
    # cleaning parent chain name as it has duplicated entries
    # NOTE(review): clean_text is not defined in this notebook (presumably it comes from
    # `static`); it is expected to add the normalised column named by its third argument.
    df = clean_text(data, parent_chain_column, parent_chain_norm_column)
    # chain selection and columns to work on
    df_nlp = df[df[parent_chain_norm_column] == parent_chain]
    df_nlp = df_nlp.loc[:, [parent_chain_norm_column, item_column]].reset_index(drop=True)
else:
    # No chain filter: work on the distinct item names of the whole data set.
    df_nlp = data.loc[:, [item_column]].drop_duplicates().reset_index(drop=True)

In [324]:
# item name standardization
# Rename whichever column `item_column` points at (instead of a hard-coded 'sku_name') so
# this cell follows the parameters cell; rebinding avoids mutating the frame in place.
df_nlp = df_nlp.rename(columns={item_column: 'item_name'})

In [325]:
# Number of distinct raw item names entering the pipeline.
initial_products = len(df_nlp['item_name'].unique())
print(f"Initial products: {initial_products}")

Initial products: 2782


## 2. NLP Application

In [326]:
# Pick the NLTK stop-word list matching the configured language.
if language_ == 'en':
    stop_words = stopwords.words('english')
elif language_ == 'es':
    stop_words = stopwords.words('spanish')
else:
    # Fail here with a clear message instead of a NameError on `stop_words` further down.
    raise ValueError(f"Unsupported language_ {language_!r}: expected 'en' or 'es'")

In [327]:
# Pattern handed to nlp_cleaning together with the stop words. Its alternatives target
# "pm"/"pmp" tokens next to numbers (e.g. "pm 150g", "pm1.50", "2 pmp") and runs of dots
# preceded by a space.
# NOTE(review): the `.` in `(pmp\d+.\d+)` and `(\d+.\d+ pm)` is unescaped, so it matches any
# character rather than only a decimal point — confirm whether `\.` was intended.
regex_clean = r'(pm \d+\w+)|(pm \d+\.\d+)|(pm\d+\.\d+)|(\d+ pmp)|(pm\d+)|( \.+)|(pmp\d+.\d+)|(\d+pmp)|(pmp \d+)|(\d+.\d+ pm)'


In [328]:
df_nlp = nlp_cleaning(df_nlp, stop_words, regex_clean)

In [329]:
print(f'Percentage of unique products after NLP: {round(len(df_nlp.product_name.unique())/len(df_nlp.item_name.unique()), 3)}')

Percentage of unique products after NLP: 0.458


In [330]:
df_nlp[:2]

Unnamed: 0,item_name,item_name_norm,item_name_norm_stop,item_name_token,item_name_token_lemma,product_name
0,Chocolate Guayabita Gallito 35 g 0293,chocolate guayabita gallito 35 g 0293,chocolate guayabita gallito 35 g 0293,"[chocolate, guayabita, gallito, 35, g, 0293]","[chocolate, guayabita, gallito, 35, g, 0293]",chocolate guayabita gallito 35 g 0293
1,2 Dos Pinos Trits Pie Limón 100 g 25% Desc,2 dos pinos trits pie limn 100 g 25% desc,2 dos pinos trits pie limn 100 g 25% desc,"[2, dos, pinos, trits, pie, limn, 100, g, 25, ...","[2, do, pinos, trits, pie, limn, 100, g, 25, %...",2 do pinos trits pie limn 100 g 25 % desc


### Creating mapping between source item_name & product_name (post NLP)

In [331]:
df_back_propagation = df_nlp.loc[:, ['item_name', 'product_name']]

In [332]:
df_back_propagation.to_csv(f'back_propagation/groups_{country}_back_propagation.csv', index=False)

## 3. TF-IDF Application

### Creating a tf-idf matrix

In [333]:
# preparing set for TF-IDF: one row per distinct cleaned product name, plus a 1-based id
df_tf = (
    df_nlp[['product_name']]
    .drop_duplicates()
    .reset_index(drop=True)
    .assign(id=lambda frame: range(1, len(frame) + 1))
)

### Applying method

In [334]:
# create object
# Unigrams and bigrams over whitespace-delimited tokens; terms found in more than 90% of
# the product names, or in fewer than 2 of them, are ignored.
# The token pattern is a raw string so that `\S` is not parsed as an (invalid) string escape.
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1,2), max_df=0.9, min_df=2, token_pattern=r'(\S+)')

In [335]:
# get tf-idf values
tf_idf_matrix = tfidf_vectorizer.fit_transform(df_tf['product_name'])

In [336]:
tf_idf_matrix.shape

(1273, 2026)

## 4. Computing cosine similarity

In [337]:
# Similarity of the TF-IDF matrix against its own transpose, i.e. product vs. product.
# NOTE(review): `cosine_similarity` is not imported by name in this notebook, so it
# presumably comes from `from static import *`; the positional arguments 25 and 0 look like
# a top-n limit and a lower similarity bound — confirm against the helper's signature and
# consider naming them in the parameters cell.
matches = cosine_similarity(tf_idf_matrix, tf_idf_matrix.transpose(), 25, 0)

In [338]:
matches.shape

(1273, 1273)

### Create a match table to show the similarity scores

In [339]:
# Match table built directly from the helper's return value; no empty placeholder frame is
# needed because the name is bound by this single assignment.
# NOTE(review): get_matches_df is defined outside this notebook; judging by the displayed
# head() it returns the columns product_name, match and similarity_score.
matches_df = get_matches_df(matches, df_tf['product_name'], top=False)

In [340]:
matches_df = matches_df.drop_duplicates().reset_index(drop=True)

In [341]:
# NOTE(review): this repeats the `matches.shape` check made before the match table was
# built; right after de-duplicating `matches_df`, `matches_df.shape` may have been the
# intended check — confirm.
matches.shape

(1273, 1273)

In [342]:
matches_df.head()

Unnamed: 0,product_name,match,similarity_score
0,chocolate guayabita gallito 35 g 0293,chocolate guayabita gallito 35 g 0293,1.0
1,chocolate guayabita gallito 35 g 0293,chocolate guayabita gallito 35 g 0293 1,0.981587
2,chocolate guayabita gallito 35 g 0293,3 gallito chocolate guayabita 35 g 33 % desc,0.394446
3,chocolate guayabita gallito 35 g 0293,chocolate guayabita gallito 117 g 0263,0.372202
4,chocolate guayabita gallito 35 g 0293,chocolate guayabita gallito 117 g 0263 5,0.359773


### Products without a match

In [343]:
# Every distinct product fed to TF-IDF ...
prod_list = list(df_tf['product_name'].unique())
# ... and every name that shows up on either side of a row of the match table.
matched_names = matches_df[['product_name', 'match']].values.ravel('K')
match_list = list(pd.unique(matched_names))

In [344]:
# Products that never appear in the match table. Membership is tested against a set so the
# scan stays linear in the number of products instead of rescanning `match_list` each time.
matched_products = set(match_list)
not_match = [prod_ for prod_ in prod_list if prod_ not in matched_products]

In [345]:
print(f'Number of products without match: {len(not_match)}')
print(f'Percentage of products without match: {round(len(not_match)/len(prod_list), 6)}')

Number of products without match: 0
Percentage of products without match: 0.0


### Who are they?

In [346]:
not_match[:4]

[]

## 5. Grouping products

### 5.1 Fuzzy ratios calculation

In [347]:
def _token_sort_similarity(row):
    """Return the fuzzywuzzy token-sort ratio between a row's product name and its match."""
    return fuzz.token_sort_ratio(row['product_name'], row['match'])


# Row-wise fuzzy score for every (product_name, match) pair.
matches_df['fuzz_ratio'] = matches_df.apply(_token_sort_similarity, axis=1)

In [348]:
matches_df.head()

Unnamed: 0,product_name,match,similarity_score,fuzz_ratio
0,chocolate guayabita gallito 35 g 0293,chocolate guayabita gallito 35 g 0293,1.0,100
1,chocolate guayabita gallito 35 g 0293,chocolate guayabita gallito 35 g 0293 1,0.981587,97
2,chocolate guayabita gallito 35 g 0293,3 gallito chocolate guayabita 35 g 33 % desc,0.394446,86
3,chocolate guayabita gallito 35 g 0293,chocolate guayabita gallito 117 g 0263,0.372202,91
4,chocolate guayabita gallito 35 g 0293,chocolate guayabita gallito 117 g 0263 5,0.359773,91


### 5.2 Keeping products with high similarity

In [349]:
print(f'Product Threshold: {threshold_products}')

Product Threshold: 85


In [350]:
# Keep the pairs whose fuzzy ratio reaches the product threshold, one row per pair.
is_similar = matches_df['fuzz_ratio'] >= threshold_products
df_similars = (
    matches_df.loc[is_similar]
    .drop_duplicates(subset=['product_name', 'match'])
    .reset_index(drop=True)
)

### 5.3 Logic to aggregate

In [351]:
df_similars = df_similars.sort_values(by=['product_name', 'match']).reset_index(drop=True)

### a) Extending similarities

In [353]:
# Pair-only view of the similar products (scores removed), used to extend the matches.
# `columns=` already names the axis, so no `axis` argument is passed, and `drop` returns a
# new frame, so no extra `.copy()` is needed.
df_similars_copy = df_similars.drop(columns=['similarity_score', 'fuzz_ratio'])

In [354]:
df_similars_copy.rename(columns={'match': 'extended_match', 'product_name': 'match'}, inplace=True)

In [355]:
# extending
df_similars_mrg = df_similars.merge(df_similars_copy, how='inner', on='match')

In [357]:
df_similars_mrg.drop('similarity_score', axis=1, inplace=True)

In [358]:
# melt dataframe
df_melt = df_similars_mrg.melt(id_vars=['product_name', 'fuzz_ratio'], var_name='which_match', value_name='candidate')

In [359]:
df_melt = df_melt.drop('which_match', axis=1)[['product_name', 'candidate', 'fuzz_ratio']]

In [360]:
df_similars_ext = df_melt.drop_duplicates(['product_name', 'candidate']).sort_values(by=['product_name', 'candidate'])\
            .reset_index(drop=True)


In [361]:
df_similars_ext.head()

Unnamed: 0,product_name,candidate,fuzz_ratio
0,1 do pinos helado combinado 1 tosty mix snack ...,1 do pinos helado combinado 1 tosty mix snack ...,100
1,1 do pinos helado combinado 1 tosty mix snack ...,1 do pinos helado combinado 1 tosty mix snack ...,90
2,1 do pinos helado combinado 1 tosty mix snack ...,1 do pinos helado combinado 1 tosty mix snack ...,90
3,1 do pinos helado combinado 1 tosty mix snack ...,1 do pinos helado combinado 1 tosty mix snack ...,100
4,1 helado combinado do pinos 14 gl 499 grs1 pas...,1 helado combinado do pinos 14 gl 499 grs1 pas...,100


### b) Package similarity

In [362]:
# Pattern passed to package_extract to pull a package/size token out of a product name.
# Alternatives, in order: "6x250ml", "6 x 250ml", decimals such as "1.5l" and "1.5 l",
# "<n> ml", "<n> g", a number glued to word characters, and a number followed by any word.
# NOTE(review): the last alternative `(\d+ \w+)` accepts any word after a number; the
# recorded sample output shows "1 do" and "1 helado" extracted as packages — confirm this
# is acceptable for the package-similarity filter.
reg_package = r'(\d+x\d+\w+)|(\d+ x \d+\w+)|(\d+\.+\d+\w+)|(\d+\.+\d+ \w+)|(\d+ ml)|(\d+ g)|(\d+\w+)|(\d+ \w+)'

In [363]:
# extracting package
df_similars_ext['package'] = package_extract(df_similars_ext, 'product_name', reg_package)
df_similars_ext['package_candidate'] = package_extract(df_similars_ext, 'candidate', reg_package)

In [364]:
def _package_token_sort_ratio(row):
    """Return the fuzzywuzzy token-sort ratio between the two extracted package strings."""
    return fuzz.token_sort_ratio(row['package'], row['package_candidate'])


# package similarity
df_similars_ext['package_ratio'] = df_similars_ext.apply(_package_token_sort_ratio, axis=1)

In [365]:
df_similars_ext.head()

Unnamed: 0,product_name,candidate,fuzz_ratio,package,package_candidate,package_ratio
0,1 do pinos helado combinado 1 tosty mix snack ...,1 do pinos helado combinado 1 tosty mix snack ...,100,1 do,1 do,100
1,1 do pinos helado combinado 1 tosty mix snack ...,1 do pinos helado combinado 1 tosty mix snack ...,90,1 do,1 do,100
2,1 do pinos helado combinado 1 tosty mix snack ...,1 do pinos helado combinado 1 tosty mix snack ...,90,1 do,1 do,100
3,1 do pinos helado combinado 1 tosty mix snack ...,1 do pinos helado combinado 1 tosty mix snack ...,100,1 do,1 do,100
4,1 helado combinado do pinos 14 gl 499 grs1 pas...,1 helado combinado do pinos 14 gl 499 grs1 pas...,100,1 helado,1 helado,100


### c) Transforming product names into integers (easier to compare)

In [366]:
# Two-way lookup between a product name and its row label in df_tf.
product_index_dict = {}
index_product_dict = {}
for _idx, _name in df_tf['product_name'].items():
    product_index_dict[_name] = _idx
    index_product_dict[_idx] = _name

In [367]:
for col in ['product_name', 'candidate']:
    df_similars_ext[col] = df_similars_ext[col].map(product_index_dict)

In [368]:
df_similars_ext.head()

Unnamed: 0,product_name,candidate,fuzz_ratio,package,package_candidate,package_ratio
0,489,489,100,1 do,1 do,100
1,489,825,90,1 do,1 do,100
2,825,489,90,1 do,1 do,100
3,825,825,100,1 do,1 do,100
4,853,853,100,1 helado,1 helado,100


### d) Package filter + Column selection

In [369]:
print(f'Package Threshold: {threshold_package}')

Package Threshold: 75


In [370]:
# Keep only the pairs whose extracted package strings are similar enough.
# NOTE(review): this cut-off is exclusive (`>`), whereas the product threshold is applied
# with `>=` — confirm the asymmetry is intended.
package_is_similar = df_similars_ext['package_ratio'] > threshold_package
df_clean = df_similars_ext.loc[package_is_similar].reset_index(drop=True)

In [371]:
df_clean = df_clean.loc[:, ['product_name', 'candidate']]

In [372]:
df_clean.head()

Unnamed: 0,product_name,candidate
0,489,489
1,489,825
2,825,489
3,825,825
4,853,853


### e ) Functions

In [373]:
def product_name_replacement(df, dic_):
    """Translate the 'product_name' and 'candidate' columns of `df` through `dic_`.

    Both columns are overwritten in place with `Series.map(dic_)`, so values that are not
    keys of `dic_` become NaN. The same (mutated) frame is returned.
    """
    for column in ('product_name', 'candidate'):
        df[column] = df[column].map(dic_)
    return df

### f) Procedure: for each product

In [374]:
clean_leaders = df_clean['product_name'].unique()

In [375]:
len(clean_leaders), len(df_similars['match'].unique())

(1273, 1273)

In [376]:
df_clean.head()

Unnamed: 0,product_name,candidate
0,489,489
1,489,825
2,825,489
3,825,825
4,853,853


In [377]:
# time before
t_bef_group = time.time()

In [378]:
# dataframe definition: group assignments and a per-member tracker, both starting empty
track_df = pd.DataFrame(columns=['group_id', 'member'])
groups_df = pd.DataFrame(columns=['group_id', 'leader', 'member'])

for leader in clean_leaders:
    # Rows in which this product is the reference side of a pair.
    select_df = df_clean.loc[df_clean['product_name'] == leader]
    # The distinct ids appearing in those rows: the leader itself and its candidates.
    applicants = select_df[['product_name', 'candidate']].values.ravel('K')
    applicants_list = list(pd.unique(applicants))
    # NOTE(review): verify_and_concat_groups is defined outside this notebook; it receives
    # both accumulators and returns their updated versions.
    groups_df, track_df = verify_and_concat_groups(groups_df, track_df, leader, applicants_list)

In [379]:
# time run
t_run = time.time()-t_bef_group
print(f'Time to run procedure: {round(t_run/60, 3)} minutes!')

Time to run procedure: 0.095 minutes!


In [380]:
print(f'Number of groups: {len(groups_df["group_id"].unique())}')

Number of groups: 701


In [381]:
# replacing product names
groups_df['leader'] = groups_df['leader'].map(index_product_dict)
groups_df['member'] = groups_df['member'].map(index_product_dict)

In [382]:
groups_df.head()

Unnamed: 0,group_id,leader,member
0,0,1 do pinos helado combinado 1 tosty mix snack ...,1 do pinos helado combinado 1 tosty mix snack ...
1,0,1 do pinos helado combinado 1 tosty mix snack ...,1 do pinos helado combinado 1 tosty mix snack ...
2,1,1 helado do pinos chocolate 14 gl 499 grs1 pas...,1 helado combinado do pinos 14 gl 499 grs1 pas...
3,1,1 helado do pinos chocolate 14 gl 499 grs1 pas...,1 helado do pinos chocochips 14 gl 499 grs1 pa...
4,1,1 helado do pinos chocolate 14 gl 499 grs1 pas...,1 helado combinado do pinos 14 gl 499 grs1 pas...


In [383]:
# Complete run time
t_complete = time.time()-t_initial
print(f'Time to run it all: {round(t_complete/60, 3)} minutes!')

Time to run it all: 0.273 minutes!


## 6. Validation

### 6.1 Products added

In [384]:
original_products = df_tf['product_name'].unique()
len(original_products)

1273

In [385]:
added_products = pd.unique(groups_df[['leader', 'member']].values.ravel('K'))
len(added_products)

1273

In [386]:
# Products that ended up in no group, neither as leader nor as member. A set gives
# constant-time membership instead of scanning the `added_products` array per product.
added_lookup = set(added_products)
not_added = [prod_ for prod_ in original_products if prod_ not in added_lookup]

In [387]:
print(f'Number of products without group: {len(not_added)}')

Number of products without group: 0


### Who are they?

In [388]:
not_added[-10:]

[]

### 6.2 Duplicated leaders / members ?? (in 2 or more groups)

In [389]:
# uniques: group_id - leader
leaders_df = groups_df[['group_id', 'leader']].drop_duplicates().reset_index(drop=True)

In [390]:
# duplicated leaders: a leader listed under more than one group_id
# (`duplicated()` already returns a boolean mask, so it is used directly)
leaders_df[leaders_df['leader'].duplicated()]

Unnamed: 0,group_id,leader


In [391]:
# uniques: group_id - member
members_df = groups_df[['group_id', 'member']].drop_duplicates().reset_index(drop=True)

In [392]:
# duplicated members: a member listed under more than one group_id
# (`duplicated()` already returns a boolean mask, so it is used directly)
members_df[members_df['member'].duplicated()]

Unnamed: 0,group_id,member


### 6.3 Adding not matched products

Products that were not added to the groups dataframe showed low similarity within the clusters generated by the TF-IDF + Cosine Similarity layer. This is why they are added as "individual groups".

In [393]:
max_id = groups_df['group_id'].max()

In [394]:
# Stand-alone groups for the products that never joined a group: each one is its own
# leader and only member. Ids start right after the highest existing group_id, so the
# first new group does not reuse the id of the last existing group.
first_new_id = max_id + 1
not_added_df = pd.DataFrame(data={
                    'group_id': range(first_new_id, first_new_id + len(not_added)),
                    'leader': not_added,
                    'member': not_added})

In [395]:
# concat to groups_df
groups_df = pd.concat([groups_df, not_added_df], axis=0).reset_index(drop=True)

In [396]:
# concat to track df
track_df = pd.concat([track_df, not_added_df.loc[:, ['group_id', 'member']]], axis=0).reset_index(drop=True)

### 6.4 Saving results

In [397]:
groups_df = groups_df.sort_values(by=['leader', 'member']).reset_index(drop=True)

In [398]:
groups_df.to_csv(f'outputs/groups_{country}_{threshold_products}_{threshold_package}.csv', index=False)

### 6.5 Samples

In [399]:
len(groups_df['leader'].unique()), len(groups_df['member'].unique())

(701, 1273)

In [400]:
# Sample: up to 60 rows whose leader or member mentions "coca". `na=False` makes missing
# names count as non-matching, so the boolean mask never contains NaN.
mentions_coca = (
    groups_df['leader'].str.contains('coca', na=False)
    | groups_df['member'].str.contains('coca', na=False)
)
groups_df[mentions_coca][:60]

Unnamed: 0,group_id,leader,member
