In [142]:
import pandas as pd
import numpy as np
import re
import Levenshtein as lev
from fuzzywuzzy import fuzz
import time
import sys

import warnings
warnings.filterwarnings("ignore")

In [143]:
import nltk.corpus
nltk.download('stopwords')
from nltk.corpus import stopwords

nltk.download('punkt')
from nltk.tokenize import word_tokenize

nltk.download('wordnet')
nltk.download('omw-1.4')

from nltk.stem import WordNetLemmatizer

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/gonzalooportus/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/gonzalooportus/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/gonzalooportus/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/gonzalooportus/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


# CODE

In [144]:
# Initial time
t_initial = time.time()

## 1. Pre-processing

In [145]:
# parameters definition
country = 'cr'
parent_chain = 'booker' # lower case and "clean"
parent_chain_column = 'parent_chain_name'
item_column = 'sku_name'
language_ = 'es'
threshold_products = 85
threshold_package = 75
parent_chain_use = False
# to fix top in get_matches_df function
stop_ = False

In [146]:
# reading raw data
data = pd.read_csv('data/CR_products.csv')

In [147]:
def clean_text(df, col_name, new_col_name):
    # column values to lower case
    df[new_col_name] = df[col_name].str.lower().str.strip()
    # removes special characters
    df[new_col_name] = df[new_col_name].apply(lambda x: re.sub(r"(@[A-Za-z0-9]+)|([^0-9A-Za-z.% \t])", "", x))
    return df

In [148]:
if parent_chain_use:
    # cleaning parent chain name as it has duplicated entries
    df = clean_text(data, parent_chain_column, '{}_{}'.format(parent_chain_column, 'norm'))
    # chain selection and columns to work on
    df_nlp = df[df['parent_chain_name_norm'] == parent_chain]
    df_nlp = df_nlp.loc[:, ['parent_chain_name_norm', item_column]].reset_index(drop=True)
else:
    df_nlp = data.loc[:, [item_column]].drop_duplicates().reset_index(drop=True)

In [149]:
# item name standardization
df_nlp.rename(columns={'sku_name': 'item_name'}, inplace=True)

In [150]:
print(f"Initial products: {len(list(set(df_nlp['item_name'].unique())))}")

Initial products: 2782


## 2. NLP Aplication

In [151]:
if language_ == 'en':
    stop_words = stopwords.words('english')
elif language_ == 'es':
    stop_words = stopwords.words('spanish')

In [152]:
def replace_stop_words(df, col, stop_list):
    df['{}_stop'.format(col)] = df[col].apply(lambda x: ' '.join([word for word in x.split() if x not in stop_list]))
    return df

In [153]:
def word_lemmatizer(text):
    text_lemma = [WordNetLemmatizer().lemmatize(word) for word in text]
    return text_lemma

In [154]:
regex_clean = r'(pm \d+\w+)|(pm \d+\.\d+)|(pm\d+\.\d+)|(\d+ pmp)|(pm\d+)|( \.+)|(pmp\d+.\d+)|(\d+pmp)|(pmp \d+)|(\d+.\d+ pm)'


In [155]:
def nlp_cleaning(df, stop_words, regex_clean):
    # normalization
    df = clean_text(df, 'item_name', 'item_name_norm')
    # remove stop words
    df = replace_stop_words(df, 'item_name_norm', stop_words)
    # tokenize text
    df['item_name_token'] = df['item_name_norm_stop'].apply(lambda x: word_tokenize(x))
    # lemmatization
    df['item_name_token_lemma'] = df['item_name_token'].apply(lambda x: word_lemmatizer(x))
    # joining lemmas
    df['product_name'] = df['item_name_token_lemma'].apply(lambda list_: ' '.join([word for word in list_]))
    # cleaning product names with regex
    df['product_name'] = df['product_name'].apply(lambda x: re.sub(regex_clean, "", x))
    return df

In [156]:
df_nlp = nlp_cleaning(df_nlp, stop_words, regex_clean)

In [157]:
# unique items
len(df_nlp.item_name.unique()), len(df_nlp.product_name.unique())

(2782, 1273)

In [158]:
df_nlp[:2]

Unnamed: 0,item_name,item_name_norm,item_name_norm_stop,item_name_token,item_name_token_lemma,product_name
0,Chocolate Guayabita Gallito 35 g 0293,chocolate guayabita gallito 35 g 0293,chocolate guayabita gallito 35 g 0293,"[chocolate, guayabita, gallito, 35, g, 0293]","[chocolate, guayabita, gallito, 35, g, 0293]",chocolate guayabita gallito 35 g 0293
1,2 Dos Pinos Trits Pie Limón 100 g 25% Desc,2 dos pinos trits pie limn 100 g 25% desc,2 dos pinos trits pie limn 100 g 25% desc,"[2, dos, pinos, trits, pie, limn, 100, g, 25, ...","[2, do, pinos, trits, pie, limn, 100, g, 25, %...",2 do pinos trits pie limn 100 g 25 % desc


## 3. TF-IDF Application

In [159]:
# importing module
from sklearn.feature_extraction.text import TfidfVectorizer

### Creating a tf-idf matrix

In [160]:
df_tf = df_nlp.loc[:, ['product_name']]

In [161]:
df_tf = df_tf.drop_duplicates().reset_index(drop=True)

In [162]:
df_tf['id'] = range(1, len(df_tf) + 1)

In [163]:
# create object
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1,2), max_df=0.9, min_df=2, token_pattern='(\S+)')

In [164]:
# get tf-idf values
tf_idf_matrix = tfidf_vectorizer.fit_transform(df_tf['product_name'])

In [165]:
tf_idf_matrix.shape

(1273, 2026)

## 4. Computing cosine similarity

We'll implement an approach from here: https://bergvca.github.io/2017/10/14/super-fast-string-matching.html

The reason is that calculates the similarities much faster.

In [166]:
from scipy.sparse import csr_matrix
import sparse_dot_topn.sparse_dot_topn as ct

In [167]:
def cosine_similarity(A, B, ntop, lower_bound=0):
    # force A and B as a compressed sparse row (CSR) matrix.
    # CSR --> efficient operations, fast matrix vector products
    # If they have already been CSR, there is no overhead
    A = A.tocsr()
    B = B.tocsr()
    M, _ = A.shape
    _, N = B.shape
 
    idx_dtype = np.int32
 
    nnz_max = M*ntop
 
    indptr = np.zeros(M+1, dtype=idx_dtype)
    indices = np.zeros(nnz_max, dtype=idx_dtype)
    data = np.zeros(nnz_max, dtype=A.dtype)
    ct.sparse_dot_topn(
            M, N, np.asarray(A.indptr, dtype=idx_dtype),
            np.asarray(A.indices, dtype=idx_dtype),
            A.data,
            np.asarray(B.indptr, dtype=idx_dtype),
            np.asarray(B.indices, dtype=idx_dtype),
            B.data,
            ntop,
            lower_bound,
            indptr, indices, data)
    
    return csr_matrix((data,indices,indptr),shape=(M,N))

In [168]:
matches = cosine_similarity(tf_idf_matrix, tf_idf_matrix.transpose(), 20, 0)

In [169]:
matches.shape

(1273, 1273)

### Create a match table to show the similarity scores

In [170]:
def get_matches_df(sparse_matrix, name_vector, top=100):
    non_zeros = sparse_matrix.nonzero()
    
    sparserows = non_zeros[0]
    sparsecols = non_zeros[1]
    
    if top:
        nr_matches = top
    else:
        nr_matches = sparsecols.size
    
    left_side = np.empty([nr_matches], dtype=object)
    right_side = np.empty([nr_matches], dtype=object)
    similairity = np.zeros(nr_matches)
    
    for index in range(0, nr_matches):
        left_side[index] = name_vector[sparserows[index]]
        right_side[index] = name_vector[sparsecols[index]]
        similairity[index] = sparse_matrix.data[index]
    
    return pd.DataFrame({'product_name': left_side,
                          'match': right_side,
                           'similarity_score': similairity})

In [171]:
# HOW TO AUTOMATE TOP SELECTION?
matches_df = pd.DataFrame()
matches_df = get_matches_df(matches, df_tf['product_name'], top=False) #uk: 830934

In [172]:
if stop_:
    print('VERIFY THAT "TOP" PARAMETER ON get_matches_df EXECUTED ABOVE IS THE MAXIMUM.')
    print('HOW? - write a huge number and watch for the length in the error')
    sys.exit()

In [173]:
matches_df = matches_df.drop_duplicates().reset_index(drop=True)

In [174]:
matches.shape

(1273, 1273)

In [175]:
matches_df.shape

(25362, 3)

In [176]:
matches_df.head()

Unnamed: 0,product_name,match,similarity_score
0,chocolate guayabita gallito 35 g 0293,chocolate guayabita gallito 35 g 0293,1.0
1,chocolate guayabita gallito 35 g 0293,chocolate guayabita gallito 35 g 0293 1,0.981587
2,chocolate guayabita gallito 35 g 0293,3 gallito chocolate guayabita 35 g 33 % desc,0.394446
3,chocolate guayabita gallito 35 g 0293,chocolate guayabita gallito 117 g 0263,0.372202
4,chocolate guayabita gallito 35 g 0293,chocolate guayabita gallito 117 g 0263 5,0.359773


### Products without a match

In [177]:
prod_list = list(df_tf.product_name.unique())

In [178]:
match_list = list(matches_df[['product_name', 'match']].values.ravel('K'))

In [179]:
not_match = []
for prod_ in prod_list:
    if prod_ not in match_list:
        not_match.append(prod_)

In [180]:
print(f'Number of products without match: {len(not_match)}')
print(f'Percentage of products without match: {round(len(not_match)/len(prod_list), 6)}')

Number of products without match: 0
Percentage of products without match: 0.0


### Who are they?

In [181]:
not_match[:4]

[]

## 5. Grouping products

In [182]:
matches_df.shape

(25362, 3)

### 5.1 Fuzzy ratios calculation

In [183]:
matches_df['fuzz_ratio'] = matches_df.apply(lambda x: fuzz.token_sort_ratio(x['product_name'], x['match']), axis=1)

In [184]:
matches_df.head()

Unnamed: 0,product_name,match,similarity_score,fuzz_ratio
0,chocolate guayabita gallito 35 g 0293,chocolate guayabita gallito 35 g 0293,1.0,100
1,chocolate guayabita gallito 35 g 0293,chocolate guayabita gallito 35 g 0293 1,0.981587,97
2,chocolate guayabita gallito 35 g 0293,3 gallito chocolate guayabita 35 g 33 % desc,0.394446,86
3,chocolate guayabita gallito 35 g 0293,chocolate guayabita gallito 117 g 0263,0.372202,91
4,chocolate guayabita gallito 35 g 0293,chocolate guayabita gallito 117 g 0263 5,0.359773,91


### 5.3 Splitting matches with high-low fuzzy ratios

In [185]:
print(f'Product Threshold: {threshold_products}')

Product Threshold: 85


In [186]:
df_similars = matches_df[matches_df['fuzz_ratio'] >= threshold_products].\
                        drop_duplicates(subset=['product_name', 'match']).reset_index(drop=True)

In [187]:
df_not_similars = matches_df[matches_df['fuzz_ratio'] < threshold_products].\
                        drop_duplicates(subset=['product_name', 'match']).reset_index(drop=True)

PREGUNTA:
    
    * Como ir formando los grupos extrayendo la totalidad de ellos?
    * Agrego los similares de los similares?

### 5.4 Logic to aggregate

In [188]:
df_similars = df_similars.sort_values(by=['product_name', 'match']).reset_index(drop=True)

### a) Extending similarities

In [189]:
df_similars.shape

(4033, 4)

In [190]:
df_similars_copy = df_similars.drop(columns=['similarity_score', 'fuzz_ratio'], axis=1).copy()

In [191]:
df_similars_copy.rename(columns={'match': 'extended_match', 'product_name': 'match'}, inplace=True)

In [192]:
# extending
df_similars_mrg = df_similars.merge(df_similars_copy, how='inner', on='match')

In [193]:
df_similars_mrg.shape

(19209, 5)

In [194]:
df_similars_mrg.drop('similarity_score', axis=1, inplace=True)

In [195]:
# melt dataframe
df_melt = df_similars_mrg.melt(id_vars=['product_name', 'fuzz_ratio'], var_name='which_match', value_name='candidate')

In [196]:
df_melt = df_melt.drop('which_match', axis=1)[['product_name', 'candidate', 'fuzz_ratio']]

In [197]:
df_similars_ext = df_melt.drop_duplicates(['product_name', 'candidate']).sort_values(by=['product_name', 'candidate'])\
            .reset_index(drop=True)


In [198]:
df_similars_ext.head()

Unnamed: 0,product_name,candidate,fuzz_ratio
0,1 do pinos helado combinado 1 tosty mix snack ...,1 do pinos helado combinado 1 tosty mix snack ...,100
1,1 do pinos helado combinado 1 tosty mix snack ...,1 do pinos helado combinado 1 tosty mix snack ...,90
2,1 do pinos helado combinado 1 tosty mix snack ...,1 do pinos helado combinado 1 tosty mix snack ...,90
3,1 do pinos helado combinado 1 tosty mix snack ...,1 do pinos helado combinado 1 tosty mix snack ...,100
4,1 helado combinado do pinos 14 gl 499 grs1 pas...,1 helado combinado do pinos 14 gl 499 grs1 pas...,100


### b) Package similarity

In [199]:
def package_extract(df, column, regex_):
    """
    Extracts the package from a product name. Uses a regular expression for these.
    
    Inputs:
    - df: dataframe
    - column: product name column where to look for packages
    - regex_: regular expression formula to match patterns
    
    Output: a column with the package of the specified product name column
    """
    packs = df[column].str.extract(regex_)
    packs['package'] = packs[packs.columns[0:]].apply(lambda x: ','.join(x.dropna()), axis=1)
    packs = packs.loc[:, ['package']]
    return packs.loc[:, ['package']]

In [200]:
reg_package = r'(\d+x\d+\w+)|(\d+ x \d+\w+)|(\d+\.+\d+\w+)|(\d+\.+\d+ \w+)|(\d+ ml)|(\d+ g)|(\d+\w+)|(\d+ \w+)'

In [201]:
# extracting package
df_similars_ext['package'] = package_extract(df_similars_ext, 'product_name', reg_package)
df_similars_ext['package_candidate'] = package_extract(df_similars_ext, 'candidate', reg_package)

In [202]:
# package similarity
df_similars_ext['package_ratio'] = df_similars_ext.apply(lambda x: fuzz.token_sort_ratio(x['package'],\
                                                                                x['package_candidate']), axis=1)

In [203]:
df_similars_ext.head()

Unnamed: 0,product_name,candidate,fuzz_ratio,package,package_candidate,package_ratio
0,1 do pinos helado combinado 1 tosty mix snack ...,1 do pinos helado combinado 1 tosty mix snack ...,100,1 do,1 do,100
1,1 do pinos helado combinado 1 tosty mix snack ...,1 do pinos helado combinado 1 tosty mix snack ...,90,1 do,1 do,100
2,1 do pinos helado combinado 1 tosty mix snack ...,1 do pinos helado combinado 1 tosty mix snack ...,90,1 do,1 do,100
3,1 do pinos helado combinado 1 tosty mix snack ...,1 do pinos helado combinado 1 tosty mix snack ...,100,1 do,1 do,100
4,1 helado combinado do pinos 14 gl 499 grs1 pas...,1 helado combinado do pinos 14 gl 499 grs1 pas...,100,1 helado,1 helado,100


### c) Tansforming product names into integers (easier to compare)

In [204]:
product_index_dict = dict(zip(df_tf['product_name'], df_tf.index))
index_product_dict = dict(zip(df_tf.index, df_tf['product_name']))

In [205]:
for col in ['product_name', 'candidate']:
    df_similars_ext[col] = df_similars_ext[col].map(product_index_dict)

In [206]:
df_similars_ext.head()

Unnamed: 0,product_name,candidate,fuzz_ratio,package,package_candidate,package_ratio
0,489,489,100,1 do,1 do,100
1,489,825,90,1 do,1 do,100
2,825,489,90,1 do,1 do,100
3,825,825,100,1 do,1 do,100
4,853,853,100,1 helado,1 helado,100


### d) Package filter + Column selection

In [207]:
print(f'Package Threshold: {threshold_package}')

Package Threshold: 75


In [208]:
df_clean = df_similars_ext[df_similars_ext['package_ratio'] > threshold_package].reset_index(drop=True)

In [209]:
df_clean = df_clean.loc[:, ['product_name', 'candidate']]

In [210]:
df_clean.head()

Unnamed: 0,product_name,candidate
0,489,489
1,489,825
2,825,489
3,825,825
4,853,853


### e ) Functions

In [211]:
def create_group_track_df(groups_df, track_df, product, applicants_list):
    if groups_df.shape[0] == 0:
        group_id = 0
    else:
        group_id = groups_df['group_id'].max() + 1
    if track_df.shape[0] == 0:
        track_id = 0
    else:
        track_id = track_df['group_id'].max() + 1
        
    df_temp_group = pd.DataFrame({
        'group_id': group_id,
        'leader': product,
        'member': applicants_list
        })
    df_temp_track = pd.DataFrame({
        'group_id': track_id,
        'member': applicants_list
        })
    
    return df_temp_group, df_temp_track

In [212]:
def verify_and_concat_groups(groups_df, track_df, index_, applicants_list):
    # verify if any of the applicants is already assigned to a group, if not:    
    if track_df[track_df['member'].isin(applicants_list)].shape[0] == 0:
        # create df for the group
        tmp_group_df, tmp_track_df = create_group_track_df(groups_df, track_df, index_, applicants_list)
        # concat group to the global groups df
        groups_df = pd.concat([groups_df, tmp_group_df], axis=0).reset_index(drop=True)
        # concat track group to track global groups df
        track_df = pd.concat([track_df, tmp_track_df], axis=0).reset_index(drop=True)
    else:
        # get the group ids where all of the candidates are assigned
        groups_id_list = list(track_df[track_df['member'].isin(applicants_list)]['group_id'].unique())
        # locate where the group is
        select_df = groups_df[groups_df['group_id'].isin(groups_id_list)]
        # list of actual members of the group
        already_members = list(pd.unique(select_df[['leader', 'member']].values.ravel('K')))
        # union of already members + apliccants list --> idea: get a unique selection of a wider spectrum
        concatenated_list = list(set(already_members + applicants_list))
        # remove group from global groups and track dataframes
        groups_df = groups_df[~groups_df['group_id'].isin(groups_id_list)].copy()
        track_df = track_df[~track_df['group_id'].isin(groups_id_list)]
        # re-create both: groups & track - global dfs
        tmp_group_df, tmp_track_df = create_group_track_df(groups_df, track_df, index_, concatenated_list)
        # add the new set to both: groups & track - global dfs
        groups_df = pd.concat([groups_df, tmp_group_df], axis=0).reset_index(drop=True)
        track_df = pd.concat([track_df, tmp_track_df], axis=0).reset_index(drop=True)
    return groups_df, track_df

In [213]:
def product_name_replacement(df, dic_):
    df['product_name'] = df['product_name'].map(dic_)
    df['candidate'] = df['candidate'].map(dic_)
    return df

### f) Procedure: for each product

In [214]:
clean_leaders = df_clean['product_name'].unique()

In [215]:
len(clean_leaders), len(df_similars['match'].unique())

(1273, 1273)

In [216]:
df_clean.head()

Unnamed: 0,product_name,candidate
0,489,489
1,489,825
2,825,489
3,825,825
4,853,853


In [217]:
# time before
t_bef_group = time.time()

In [218]:
# dataframe definition
groups_df = pd.DataFrame(columns=['group_id', 'leader', 'member'])
track_df = pd.DataFrame(columns=['group_id', 'member'])

for leader in clean_leaders:
    select_df = df_clean[df_clean['product_name'] == leader] 
    applicants_list = list(pd.unique(select_df[['product_name', 'candidate']].values.ravel('K')))
    groups_df, track_df = verify_and_concat_groups(groups_df, track_df, leader, applicants_list)

In [219]:
# time run
t_run = time.time()-t_bef_group
print(f'Time to run procedure: {round(t_run/60, 3)} minutes!')

Time to run procedure: 0.125 minutes!


In [220]:
print(f'Number of groups: {len(groups_df["group_id"].unique())}')

Number of groups: 704


In [221]:
# replacing product names
groups_df['leader'] = groups_df['leader'].map(index_product_dict)
groups_df['member'] = groups_df['member'].map(index_product_dict)

In [222]:
groups_df.head()

Unnamed: 0,group_id,leader,member
0,0,1 do pinos helado combinado 1 tosty mix snack ...,1 do pinos helado combinado 1 tosty mix snack ...
1,0,1 do pinos helado combinado 1 tosty mix snack ...,1 do pinos helado combinado 1 tosty mix snack ...
2,1,1 helado do pinos chocolate 14 gl 499 grs1 pas...,1 helado combinado do pinos 14 gl 499 grs1 pas...
3,1,1 helado do pinos chocolate 14 gl 499 grs1 pas...,1 helado do pinos chocochips 14 gl 499 grs1 pa...
4,1,1 helado do pinos chocolate 14 gl 499 grs1 pas...,1 helado combinado do pinos 14 gl 499 grs1 pas...


In [223]:
# Complete run time
t_complete = time.time()-t_initial
print(f'Time to run it all: {round(t_complete/60, 3)} minutes!')

Time to run it all: 0.371 minutes!


## 6. Validation

### 6.1 Products added

In [224]:
original_products = df_tf['product_name'].unique()
len(original_products)

1273

In [225]:
added_products = pd.unique(groups_df[['leader', 'member']].values.ravel('K'))
len(added_products)

1273

In [226]:
not_added = []
for prod_ in original_products:
    if prod_ not in added_products:
        not_added.append(prod_)

In [227]:
print(f'Number of products without group: {len(not_added)}')

Number of products without group: 0


### Who are they?

In [228]:
not_added[-10:]

[]

### 6.2 Duplicated leaders / members ?? (in 2 or more groups)

In [229]:
# uniques: group_id - leader
leaders_df = groups_df[['group_id', 'leader']].drop_duplicates().reset_index(drop=True)

In [230]:
# duplicated leaders
leaders_df[leaders_df['leader'].duplicated() == True]

Unnamed: 0,group_id,leader


In [231]:
# uniques: group_id - member
members_df = groups_df[['group_id', 'member']].drop_duplicates().reset_index(drop=True)

In [232]:
# duplicated members
members_df[members_df['member'].duplicated() == True]

Unnamed: 0,group_id,member


### 6.3 Adding not matched products

Products not added to the groups dataframe are because previously they demonstrated low similarity on the clusters generated with TF-IDF + Cosine Similarity layer. This why they are added as "individual groups".

In [233]:
max_id = groups_df['group_id'].max()

In [234]:
not_added_df = pd.DataFrame(data={
                    'group_id': range(max_id, max_id + len(not_added)),
                    'leader': not_added,
                    'member': not_added})

In [235]:
# concat to groups_df
groups_df = pd.concat([groups_df, not_added_df], axis=0).reset_index(drop=True)

In [236]:
# concat to track df
track_df = pd.concat([track_df, not_added_df.loc[:, ['group_id', 'member']]], axis=0).reset_index(drop=True)

### 6.4 Saving results

In [237]:
groups_df = groups_df.sort_values(by=['leader', 'member']).reset_index(drop=True)

In [238]:
groups_df.to_csv(f'outputs/groups_{country}_{threshold_products}_{threshold_package}.csv', index=False)

### 6.5 Samples

In [239]:
len(groups_df['leader'].unique()), len(groups_df['member'].unique())

(704, 1273)

In [240]:
groups_df[(groups_df['leader'].str.contains('coca'))|(groups_df['member'].str.contains('coca'))][:60]

Unnamed: 0,group_id,leader,member
