In [184]:
import pandas as pd
import numpy as np
import re
import Levenshtein as lev
from fuzzywuzzy import fuzz
import time
import sys

# To calculate: TF-IDF & Cosine Similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import csr_matrix
import sparse_dot_topn.sparse_dot_topn as ct

import warnings
warnings.filterwarnings("ignore")

In [185]:
import nltk.corpus
nltk.download('stopwords')
from nltk.corpus import stopwords

nltk.download('punkt')
from nltk.tokenize import word_tokenize

nltk.download('wordnet')
nltk.download('omw-1.4')

from nltk.stem import WordNetLemmatizer

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/gonzalooportus/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/gonzalooportus/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/gonzalooportus/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/gonzalooportus/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


# CODE

In [186]:
# Initial time
t_initial = time.time()

## 1. Pre-processing

In [187]:
# parameters definition
# Country code; only used to build the names of the CSV files written by this notebook.
country = 'cr'
# Chain kept when parent_chain_use is True. It is compared against the chain name
# after clean_text() has normalized it, so it must already be in that form.
parent_chain = 'booker' # lower case and "clean"
# Column of the raw data that holds the chain name.
parent_chain_column = 'parent_chain_name'
# Column of the raw data that holds the product (SKU) name.
item_column = 'sku_name'
# Stop-word language: 'en' selects the English list, 'es' the Spanish one.
language_ = 'es'
# Minimum fuzz_ratio (inclusive, >=) for a pair to be kept as "similar".
threshold_products = 85
# package_ratio must be strictly greater (>) than this for a pair to be grouped.
threshold_package = 75
# True: restrict the run to `parent_chain`; False: use every distinct item name.
parent_chain_use = False
# to fix top in get_matches_df function
# NOTE(review): `stop_` is not referenced by any visible cell — confirm whether it is still needed.
stop_ = False

In [188]:
# reading raw data
data = pd.read_csv('data/CR_products.csv')

In [189]:
def clean_text(df, col_name, new_col_name):
    """Add a normalized copy of a text column to ``df``.

    The text is lower-cased, stripped of surrounding whitespace, and then
    every ``@handle``-like token and every character outside
    ``0-9 A-Z a-z . % space tab`` is removed.

    Missing values (and non-string values, which the ``.str`` accessor turns
    into NaN) become empty strings instead of making ``re.sub`` raise a
    ``TypeError``.

    NOTE(review): the character class is ASCII-only, so accented letters and
    "ñ" are deleted as well — confirm this is intended for Spanish names.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to extend. It is modified in place.
    col_name : str
        Name of the source text column.
    new_col_name : str
        Name of the column that receives the normalized text.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with ``new_col_name`` added or overwritten.
    """
    # column values to lower case
    normalized = df[col_name].str.lower().str.strip()
    # NaN cannot be passed to re.sub; treat missing text as empty text
    normalized = normalized.fillna("")
    # removes special characters
    df[new_col_name] = normalized.apply(
        lambda x: re.sub(r"(@[A-Za-z0-9]+)|([^0-9A-Za-z.% \t])", "", x))
    return df

In [190]:
if parent_chain_use:
    # Name of the normalized chain column, derived from the configured column so that
    # changing `parent_chain_column` does not silently break the filter below.
    parent_chain_norm_column = '{}_{}'.format(parent_chain_column, 'norm')
    # cleaning parent chain name as it has duplicated entries
    df = clean_text(data, parent_chain_column, parent_chain_norm_column)
    # chain selection and columns to work on
    df_nlp = df[df[parent_chain_norm_column] == parent_chain]
    df_nlp = df_nlp.loc[:, [parent_chain_norm_column, item_column]].reset_index(drop=True)
else:
    # no chain filter: work on every distinct item name
    df_nlp = data.loc[:, [item_column]].drop_duplicates().reset_index(drop=True)

In [191]:
# item name standardization
# Rename whatever column was configured as `item_column` (not a hard-coded 'sku_name'),
# and rebind instead of mutating in place so the cell can be re-run safely.
df_nlp = df_nlp.rename(columns={item_column: 'item_name'})

In [192]:
print(f"Initial products: {len(list(set(df_nlp['item_name'].unique())))}")

Initial products: 2782


## 2. NLP Application

In [193]:
# Pick the NLTK stop-word list that matches the configured language.
if language_ == 'en':
    stop_words = stopwords.words('english')
elif language_ == 'es':
    stop_words = stopwords.words('spanish')
else:
    # Fail here with a clear message instead of a NameError on `stop_words` later on.
    raise ValueError(f"Unsupported language_ {language_!r}; expected 'en' or 'es'")

In [194]:
def replace_stop_words(df, col, stop_list):
    """Add a ``<col>_stop`` column holding ``df[col]`` without stop words.

    Each value is split on whitespace, every word found in ``stop_list`` is
    dropped, and the remaining words are re-joined with single spaces.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to extend. It is modified in place.
    col : str
        Name of the text column to filter.
    stop_list : iterable of str
        Words to remove.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the ``'<col>_stop'`` column added.
    """
    # A set gives constant-time membership tests for every word of every row.
    stop_set = set(stop_list)
    # The membership test must be done per word; testing the whole string
    # would keep every stop word in place.
    df['{}_stop'.format(col)] = df[col].apply(
        lambda x: ' '.join(word for word in x.split() if word not in stop_set))
    return df

In [195]:
def word_lemmatizer(text):
    """Return the WordNet lemma of every element of ``text``.

    ``text`` is iterated element by element (in this notebook it receives a
    list of tokens) and each element is passed to
    ``WordNetLemmatizer.lemmatize`` with its default arguments.

    NOTE(review): presumably WordNet lemmatization targets English — confirm
    it is useful when ``language_`` is 'es'.

    Returns
    -------
    list
        The lemmas, in the same order as the input elements.
    """
    lemmatizer = WordNetLemmatizer()
    return list(map(lemmatizer.lemmatize, text))

In [196]:
# Patterns removed from the cleaned product names by nlp_cleaning(): "pm" / "pmp" markers
# attached to or next to digits (e.g. "pm 125", "pm1.99", "2 pmp"), and runs of dots
# preceded by a space.
# NOTE(review): presumably "pm"/"pmp" are price-mark labels — confirm against the data.
# NOTE(review): `pmp\d+.\d+` and `\d+.\d+ pm` use an unescaped `.`, which matches any
# character; confirm that `\.` was not intended.
regex_clean = r'(pm \d+\w+)|(pm \d+\.\d+)|(pm\d+\.\d+)|(\d+ pmp)|(pm\d+)|( \.+)|(pmp\d+.\d+)|(\d+pmp)|(pmp \d+)|(\d+.\d+ pm)'


In [197]:
def nlp_cleaning(df, stop_words, regex_clean):
    """Run the full text-cleaning pipeline on ``df['item_name']``.

    Columns added, in order:

    - ``item_name_norm``: output of ``clean_text``
    - ``item_name_norm_stop``: output of ``replace_stop_words``
    - ``item_name_token``: ``word_tokenize`` applied to the previous column
    - ``item_name_token_lemma``: ``word_lemmatizer`` applied to the tokens
    - ``product_name``: lemmas joined with spaces, with every match of
      ``regex_clean`` removed

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain an ``item_name`` column. It is modified in place.
    stop_words : iterable of str
        Words handed to ``replace_stop_words``.
    regex_clean : str
        Regular expression whose matches are deleted from the joined lemmas.

    Returns
    -------
    pandas.DataFrame
        The frame returned by the helpers, extended with the columns above.
    """
    # normalize, then strip stop words
    df = clean_text(df, 'item_name', 'item_name_norm')
    df = replace_stop_words(df, 'item_name_norm', stop_words)

    # tokens and their lemmas
    df['item_name_token'] = df['item_name_norm_stop'].apply(word_tokenize)
    df['item_name_token_lemma'] = df['item_name_token'].apply(word_lemmatizer)

    # rebuild one string per product and delete the regex matches from it
    df['product_name'] = df['item_name_token_lemma'].apply(
        lambda lemmas: re.sub(regex_clean, "", ' '.join(lemmas)))
    return df

In [198]:
df_nlp = nlp_cleaning(df_nlp, stop_words, regex_clean)

In [None]:
# unique items
len(df_nlp.item_name.unique()), len(df_nlp.product_name.unique())

In [None]:
df_nlp[:2]

### Creating mapping between source item_name & product_name (post NLP)

In [None]:
df_back_propagation = df_nlp.loc[:, ['item_name', 'product_name']]

In [None]:
df_back_propagation.to_csv(f'back_propagation/groups_{country}_back_propagation.csv', index=False)

## 3. TF-IDF Application

### Creating a tf-idf matrix

In [None]:
# TF-IDF input: one row per distinct cleaned product name, plus a 1-based id
df_tf = df_nlp[['product_name']].drop_duplicates().reset_index(drop=True)
df_tf['id'] = range(1, df_tf.shape[0] + 1)

In [None]:
# create object
# Tokens are runs of non-whitespace characters; unigrams and bigrams are used.
# The pattern is a raw string: '\S' in a normal string literal is an invalid escape
# sequence and triggers a warning on recent Python versions.
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1,2), max_df=0.9, min_df=2, token_pattern=r'(\S+)')

In [None]:
# get tf-idf values
tf_idf_matrix = tfidf_vectorizer.fit_transform(df_tf['product_name'])

In [None]:
tf_idf_matrix.shape

## 4. Computing cosine similarity

In [None]:
def cosine_similarity(A, B, ntop, lower_bound=0):
    """Sparse product of ``A`` and ``B`` computed through ``ct.sparse_dot_topn``.

    Both operands are converted to CSR, output buffers sized for ``ntop``
    entries per row of ``A`` are allocated, and ``ct.sparse_dot_topn`` fills
    them in place.

    NOTE(review): presumably the helper keeps, for each row, the ``ntop``
    largest products above ``lower_bound`` — confirm against the
    sparse_dot_topn documentation. The result is a cosine similarity only if
    the rows of ``A`` and the columns of ``B`` are L2-normalized.

    Parameters
    ----------
    A, B : scipy sparse matrices
        Left and right operands; anything exposing ``tocsr()``.
    ntop : int
        Number of result slots reserved per row of ``A``.
    lower_bound : number, default 0
        Threshold forwarded to ``ct.sparse_dot_topn``.

    Returns
    -------
    scipy.sparse.csr_matrix
        Matrix of shape ``(A.shape[0], B.shape[1])`` built from the buffers.
    """
    # CSR is what the helper consumes; tocsr() is free when already CSR
    left = A.tocsr()
    right = B.tocsr()
    n_rows = left.shape[0]
    n_cols = right.shape[1]

    index_type = np.int32
    capacity = n_rows * ntop

    # output buffers, written in place by the helper
    out_indptr = np.zeros(n_rows + 1, dtype=index_type)
    out_indices = np.zeros(capacity, dtype=index_type)
    out_values = np.zeros(capacity, dtype=left.dtype)

    ct.sparse_dot_topn(
        n_rows,
        n_cols,
        np.asarray(left.indptr, dtype=index_type),
        np.asarray(left.indices, dtype=index_type),
        left.data,
        np.asarray(right.indptr, dtype=index_type),
        np.asarray(right.indices, dtype=index_type),
        right.data,
        ntop,
        lower_bound,
        out_indptr,
        out_indices,
        out_values,
    )

    return csr_matrix((out_values, out_indices, out_indptr), shape=(n_rows, n_cols))

In [None]:
matches = cosine_similarity(tf_idf_matrix, tf_idf_matrix.transpose(), 25, 0)

In [None]:
matches.shape

### Create a match table to show the similarity scores

In [None]:
def get_matches_df(sparse_matrix, name_vector, top=100):
    """Turn a sparse similarity matrix into a long table of matches.

    Every stored non-zero entry ``(i, j, value)`` becomes one row
    ``(name_vector[i], name_vector[j], value)``, in the matrix's storage
    order.

    Parameters
    ----------
    sparse_matrix : scipy sparse matrix
        Similarity scores; rows and columns refer to positions in
        ``name_vector``.
    name_vector : sequence of str (list, array or pandas Series)
        Names, addressed by position.
    top : int or falsy, default 100
        Maximum number of rows to return. A falsy value (``False``, ``None``,
        ``0``) returns every non-zero entry. Values larger than the number of
        non-zero entries are clamped instead of raising ``IndexError``.

    Returns
    -------
    pandas.DataFrame
        Columns ``product_name``, ``match`` and ``similarity_score``.
    """
    # Take rows, columns and values from the same COO view so they always stay
    # aligned, even if the matrix stores explicit zeros.
    coo = sparse_matrix.tocoo()
    keep = coo.data != 0
    rows = coo.row[keep]
    cols = coo.col[keep]
    scores = coo.data[keep]

    if top:
        # never ask for more rows than there are matches
        nr_matches = min(int(top), rows.size)
        rows = rows[:nr_matches]
        cols = cols[:nr_matches]
        scores = scores[:nr_matches]

    # Positional lookup: matrix row i corresponds to the i-th name, whatever
    # index labels a Series may carry.
    names = np.asarray(name_vector, dtype=object)

    return pd.DataFrame({'product_name': names[rows],
                         'match': names[cols],
                         'similarity_score': scores.astype(float)})

In [None]:
# Long table with every (product_name, match, similarity_score) found by the cosine step.
# top=False asks get_matches_df for all non-zero entries; no empty placeholder frame is
# needed because the name is bound directly to the result.
matches_df = get_matches_df(matches, df_tf['product_name'], top=False)

In [None]:
matches_df = matches_df.drop_duplicates().reset_index(drop=True)

In [None]:
matches.shape

In [None]:
matches_df.head()

### Products without a match

In [None]:
prod_list = list(df_tf.product_name.unique())
match_list = list(pd.unique(matches_df[['product_name', 'match']].values.ravel('K')))

In [None]:
# Products that never show up on either side of a match.
# A set makes each membership test constant-time instead of scanning the whole list.
matched_names = set(match_list)
not_match = [prod_ for prod_ in prod_list if prod_ not in matched_names]

In [None]:
print(f'Number of products without match: {len(not_match)}')
print(f'Percentage of products without match: {round(len(not_match)/len(prod_list), 6)}')

### Who are they?

In [None]:
not_match[:4]

## 5. Grouping products

### 5.1 Fuzzy ratios calculation

In [None]:
matches_df['fuzz_ratio'] = matches_df.apply(lambda x: fuzz.token_sort_ratio(x['product_name'], x['match']), axis=1)

In [None]:
matches_df.head()

### 5.2 Splitting matches with high-low fuzzy ratios

In [None]:
print(f'Product Threshold: {threshold_products}')

In [None]:
df_similars = matches_df[matches_df['fuzz_ratio'] >= threshold_products].\
                        drop_duplicates(subset=['product_name', 'match']).reset_index(drop=True)

### 5.3 Logic to aggregate

In [None]:
df_similars = df_similars.sort_values(by=['product_name', 'match']).reset_index(drop=True)

### a) Extending similarities

In [None]:
df_similars.shape

In [None]:
df_similars_copy = df_similars.drop(columns=['similarity_score', 'fuzz_ratio'], axis=1).copy()

In [None]:
df_similars_copy.rename(columns={'match': 'extended_match', 'product_name': 'match'}, inplace=True)

In [None]:
# extending
# df_similars_copy holds every (product_name, match) pair relabelled as
# (match, extended_match), so this inner join attaches to each pair P -> M all the
# matches E of M, i.e. the second-order matches of P.
df_similars_mrg = df_similars.merge(df_similars_copy, how='inner', on='match')

In [None]:
df_similars_mrg.shape

In [None]:
df_similars_mrg.drop('similarity_score', axis=1, inplace=True)

In [None]:
# melt dataframe
# Stacks the 'match' and 'extended_match' columns into one 'candidate' column.
# 'fuzz_ratio' is an id column, so it is carried over unchanged: for rows coming from
# 'extended_match' it is still the ratio of the original (product_name, match) pair.
df_melt = df_similars_mrg.melt(id_vars=['product_name', 'fuzz_ratio'], var_name='which_match', value_name='candidate')

In [None]:
df_melt = df_melt.drop('which_match', axis=1)[['product_name', 'candidate', 'fuzz_ratio']]

In [None]:
df_similars_ext = df_melt.drop_duplicates(['product_name', 'candidate']).sort_values(by=['product_name', 'candidate'])\
            .reset_index(drop=True)


In [None]:
df_similars_ext.head()

### b) Package similarity

In [None]:
def package_extract(df, column, regex_):
    """
    Pull the package description out of a product-name column.

    ``regex_`` is applied with ``Series.str.extract``; the capture groups that
    matched in a row are joined with commas into a single string (an empty
    string when nothing matched).

    Inputs:
    - df: dataframe
    - column: name of the product-name column to search
    - regex_: regular expression with one or more capture groups

    Output: a one-column dataframe named 'package', aligned with ``df``
    """
    captured = df[column].str.extract(regex_)
    # one comma-separated string per row, built from the groups that matched
    joined = captured.apply(lambda row: ','.join(row.dropna()), axis=1)
    captured['package'] = joined
    return captured[['package']]

In [None]:
# Alternatives tried, left to right, to locate a package description in a product name:
# multipacks ("2x500ml", "2 x 500ml"), decimals with a unit ("1.5l", "1.5 l"),
# "<digits> ml", "<digits> g", and finally any digits followed by word characters.
# It is used through Series.str.extract, so only the first match in each name is captured.
reg_package = r'(\d+x\d+\w+)|(\d+ x \d+\w+)|(\d+\.+\d+\w+)|(\d+\.+\d+ \w+)|(\d+ ml)|(\d+ g)|(\d+\w+)|(\d+ \w+)'

In [None]:
# extracting package
df_similars_ext['package'] = package_extract(df_similars_ext, 'product_name', reg_package)
df_similars_ext['package_candidate'] = package_extract(df_similars_ext, 'candidate', reg_package)

In [None]:
# package similarity
df_similars_ext['package_ratio'] = df_similars_ext.apply(lambda x: fuzz.token_sort_ratio(x['package'],\
                                                                                x['package_candidate']), axis=1)

In [None]:
df_similars_ext.head()

### c) Transforming product names into integers (easier to compare)

In [None]:
# Two-way lookup between a product name and its row label in df_tf
product_index_dict = {name: idx for idx, name in df_tf['product_name'].items()}
index_product_dict = {idx: name for idx, name in df_tf['product_name'].items()}

In [None]:
for col in ['product_name', 'candidate']:
    df_similars_ext[col] = df_similars_ext[col].map(product_index_dict)

In [None]:
df_similars_ext.head()

### d) Package filter + Column selection

In [None]:
print(f'Package Threshold: {threshold_package}')

In [None]:
# Keep only the pairs whose package strings are similar enough.
# NOTE(review): the comparison is strict (>) here, while the product threshold is applied
# with >= — confirm the asymmetry is intended.
df_clean = df_similars_ext[df_similars_ext['package_ratio'] > threshold_package].reset_index(drop=True)

In [None]:
df_clean = df_clean.loc[:, ['product_name', 'candidate']]

In [None]:
df_clean.head()

### e) Functions

In [None]:
def create_group_track_df(groups_df, track_df, product, applicants_list):
    """Build the rows describing one new group.

    The new id is 0 when the corresponding global frame is empty, otherwise
    the frame's largest ``group_id`` plus one. The id is computed separately
    for ``groups_df`` and ``track_df``.

    Parameters
    ----------
    groups_df : pandas.DataFrame
        Global groups table; only its ``group_id`` column is read.
    track_df : pandas.DataFrame
        Global tracking table; only its ``group_id`` column is read.
    product : scalar
        Leader of the new group, repeated on every row.
    applicants_list : list
        Members of the new group, one row each.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(group_rows, track_rows)`` with columns
        ``group_id, leader, member`` and ``group_id, member``.
    """
    def _next_id(frame):
        # empty frame -> numbering starts at 0, otherwise continue after the max
        return frame['group_id'].max() + 1 if len(frame) else 0

    group_rows = pd.DataFrame({
        'group_id': _next_id(groups_df),
        'leader': product,
        'member': applicants_list,
    })
    track_rows = pd.DataFrame({
        'group_id': _next_id(track_df),
        'member': applicants_list,
    })
    return group_rows, track_rows

In [None]:
def verify_and_concat_groups(groups_df, track_df, index_, applicants_list):
    """Add a group led by ``index_``, merging it with any overlapping groups.

    If none of the applicants is already tracked, a new group made of
    ``applicants_list`` is appended. Otherwise every existing group that
    contains one of the applicants is removed from both tables, and a single
    new group is appended whose members are the union of the removed groups'
    leaders and members and the applicants.

    Parameters
    ----------
    groups_df : pandas.DataFrame
        Global groups table (``group_id, leader, member``).
    track_df : pandas.DataFrame
        Global tracking table (``group_id, member``).
    index_ : scalar
        Leader recorded for the group that is appended.
    applicants_list : list
        Candidate members for the group.

    Returns
    -------
    tuple of pandas.DataFrame
        The updated ``(groups_df, track_df)``, both with a fresh 0..n-1 index.
    """
    # rows of the tracking table whose member is one of the applicants
    overlap_mask = track_df['member'].isin(applicants_list)

    members = applicants_list
    if overlap_mask.any():
        # ids of every existing group touched by the applicants
        touched_ids = list(track_df.loc[overlap_mask, 'group_id'].unique())
        touched_groups = groups_df[groups_df['group_id'].isin(touched_ids)]
        # everyone already in those groups, leaders included
        current_members = list(pd.unique(touched_groups[['leader', 'member']].values.ravel('K')))
        # union of current members and applicants, without repetitions
        members = list(set(current_members + applicants_list))
        # drop the touched groups; they are rebuilt below as one merged group
        groups_df = groups_df[~groups_df['group_id'].isin(touched_ids)].copy()
        track_df = track_df[~track_df['group_id'].isin(touched_ids)]

    new_group_rows, new_track_rows = create_group_track_df(groups_df, track_df, index_, members)
    groups_df = pd.concat([groups_df, new_group_rows], axis=0).reset_index(drop=True)
    track_df = pd.concat([track_df, new_track_rows], axis=0).reset_index(drop=True)
    return groups_df, track_df

In [None]:
def product_name_replacement(df, dic_):
    """Translate the ``product_name`` and ``candidate`` columns through ``dic_``.

    Both columns are replaced in place with ``Series.map(dic_)``, so values
    missing from ``dic_`` become NaN.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object.
    """
    for column in ('product_name', 'candidate'):
        df[column] = df[column].map(dic_)
    return df

### f) Procedure: for each product

In [None]:
clean_leaders = df_clean['product_name'].unique()

In [None]:
len(clean_leaders), len(df_similars['match'].unique())

In [None]:
df_clean.head()

In [None]:
# time before
t_bef_group = time.time()

In [None]:
# dataframe definition
# groups_df: one row per (group, member) with the group's leader;
# track_df: one row per (group, member), used to look up where a member already is.
groups_df = pd.DataFrame(columns=['group_id', 'leader', 'member'])
track_df = pd.DataFrame(columns=['group_id', 'member'])

# Leaders are processed in order of first appearance; each iteration either creates a new
# group or merges the leader's candidates into the groups they already belong to, so the
# result depends on this order.
for leader in clean_leaders:
    # every pair in which this product is on the left-hand side
    select_df = df_clean[df_clean['product_name'] == leader] 
    # the leader plus all of its candidates, without repetitions
    applicants_list = list(pd.unique(select_df[['product_name', 'candidate']].values.ravel('K')))
    groups_df, track_df = verify_and_concat_groups(groups_df, track_df, leader, applicants_list)

In [None]:
# time run
t_run = time.time()-t_bef_group
print(f'Time to run procedure: {round(t_run/60, 3)} minutes!')

In [None]:
print(f'Number of groups: {len(groups_df["group_id"].unique())}')

In [None]:
# replacing product names
groups_df['leader'] = groups_df['leader'].map(index_product_dict)
groups_df['member'] = groups_df['member'].map(index_product_dict)

In [None]:
groups_df.head()

In [None]:
# Complete run time
t_complete = time.time()-t_initial
print(f'Time to run it all: {round(t_complete/60, 3)} minutes!')

## 6. Validation

### 6.1 Products added

In [None]:
original_products = df_tf['product_name'].unique()
len(original_products)

In [None]:
added_products = pd.unique(groups_df[['leader', 'member']].values.ravel('K'))
len(added_products)

In [None]:
# Products that ended up in no group at all.
# Testing membership against a set avoids scanning the whole numpy array for every product.
added_lookup = set(added_products)
not_added = [prod_ for prod_ in original_products if prod_ not in added_lookup]

In [None]:
print(f'Number of products without group: {len(not_added)}')

### Who are they?

In [None]:
not_added[-10:]

### 6.2 Duplicated leaders / members? (in 2 or more groups)

In [None]:
# uniques: group_id - leader
leaders_df = groups_df[['group_id', 'leader']].drop_duplicates().reset_index(drop=True)

In [None]:
# duplicated leaders
leaders_df[leaders_df['leader'].duplicated() == True]

In [None]:
# uniques: group_id - member
members_df = groups_df[['group_id', 'member']].drop_duplicates().reset_index(drop=True)

In [None]:
# duplicated members
members_df[members_df['member'].duplicated() == True]

### 6.3 Adding not matched products

Products missing from the groups dataframe were left out because they showed low similarity in the clusters generated by the TF-IDF + cosine-similarity layer. This is why they are added as "individual groups".

In [None]:
max_id = groups_df['group_id'].max()

In [None]:
# Singleton groups for the products that were not grouped.
# Ids continue AFTER the current maximum: starting at max_id itself would give the first
# singleton the same id as an existing group. If no group exists yet (max_id is NaN),
# numbering starts at 0.
first_new_id = 0 if pd.isna(max_id) else int(max_id) + 1
not_added_df = pd.DataFrame(data={
                    'group_id': range(first_new_id, first_new_id + len(not_added)),
                    'leader': not_added,
                    'member': not_added})

In [None]:
# concat to groups_df
groups_df = pd.concat([groups_df, not_added_df], axis=0).reset_index(drop=True)

In [None]:
# concat to track df
# track_df identifies members by their integer index (the values of product_index_dict),
# whereas not_added_df holds product names; map the names back so the 'member' column
# does not end up mixing integers and strings.
not_added_track_df = not_added_df.loc[:, ['group_id', 'member']].assign(
    member=lambda frame: frame['member'].map(product_index_dict))
track_df = pd.concat([track_df, not_added_track_df], axis=0).reset_index(drop=True)

### 6.4 Saving results

In [None]:
groups_df = groups_df.sort_values(by=['leader', 'member']).reset_index(drop=True)

In [None]:
groups_df.to_csv(f'outputs/groups_{country}_{threshold_products}_{threshold_package}.csv', index=False)

### 6.5 Samples

In [None]:
len(groups_df['leader'].unique()), len(groups_df['member'].unique())

In [None]:
groups_df[(groups_df['leader'].str.contains('coca'))|(groups_df['member'].str.contains('coca'))][:60]