In [1]:
import pandas as pd
import numpy as np
import re
import Levenshtein as lev
from fuzzywuzzy import fuzz
import time

In [2]:
import nltk.corpus
nltk.download('stopwords')
from nltk.corpus import stopwords

nltk.download('punkt')
from nltk.tokenize import word_tokenize

nltk.download('wordnet')
nltk.download('omw-1.4')

from nltk.stem import WordNetLemmatizer

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/gonzalooportus/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/gonzalooportus/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/gonzalooportus/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/gonzalooportus/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


# CODE

## 1. Pre-processing

In [3]:
# parameters definition
parent_chain = 'booker' # lower case and "clean"
parent_chain_column = 'parent_chain_name'
item_column = 'item_name'
language_ = 'en'
threshold_ = 82
parent_chain_use = True

In [4]:
# reading raw data
data = pd.read_csv('uk_booker_products.csv')

In [5]:
def clean_text(df, col_name, new_col_name):
    # column values to lower case
    df[new_col_name] = df[col_name].str.lower().str.strip()
    # removes special characters
    df[new_col_name] = df[new_col_name].apply(lambda x: re.sub(r"(@[A-Za-z0-9]+)|([^0-9A-Za-z.% \t])", "", x))
    return df

In [6]:
if parent_chain_use:
    # cleaning parent chain name as it has duplicated entries
    df = clean_text(data, parent_chain_column, '{}_{}'.format(parent_chain_column, 'norm'))
    # chain selection and columns to work on
    df_nlp = df[df['parent_chain_name_norm'] == parent_chain]
    df_nlp = df_nlp.loc[:, ['parent_chain_name_norm', item_column]].reset_index(drop=True)
else:
    df_nlp = data.loc[:, [item_column]].drop_duplicates().reset_index(drop=True)

In [7]:
# item name standardization
df_nlp.rename(columns={'sku_name': 'item_name'}, inplace=True)

In [8]:
print(f"Initial products: {len(list(set(df_nlp['item_name'].unique())))}")

Initial products: 50587


## 2. NLP Aplication

In [9]:
if language_ == 'en':
    stop_words = stopwords.words('english')
elif language_ == 'es':
    stop_words = stopwords.words('spanish')

In [10]:
def replace_stop_words(df, col, stop_list):
    df['{}_stop'.format(col)] = df[col].apply(lambda x: ' '.join([word for word in x.split() if x not in stop_list]))
    return df

In [11]:
def word_lemmatizer(text):
    text_lemma = [WordNetLemmatizer().lemmatize(word) for word in text]
    return text_lemma

In [12]:
regex_clean = r'(pm \d+\w+)|(pm \d+\.\d+)|(pm\d+\.\d+)|(\d+ pmp)|(pm\d+)|( \.+)|(pmp\d+.\d+)|(\d+pmp)|(pmp \d+)|(\d+.\d+ pm)'


In [13]:
def nlp_cleaning(df, stop_words, regex_clean):
    # normalization
    df = clean_text(df, 'item_name', 'item_name_norm')
    # remove stop words
    df = replace_stop_words(df, 'item_name_norm', stop_words)
    # tokenize text
    df['item_name_token'] = df['item_name_norm_stop'].apply(lambda x: word_tokenize(x))
    # lemmatization
    df['item_name_token_lemma'] = df['item_name_token'].apply(lambda x: word_lemmatizer(x))
    # joining lemmas
    df['product_name'] = df['item_name_token_lemma'].apply(lambda list_: ' '.join([word for word in list_]))
    # cleaning product names with regex
    df['product_name'] = df['product_name'].apply(lambda x: re.sub(regex_clean, "", x))
    return df

In [14]:
df_nlp = nlp_cleaning(df_nlp, stop_words, regex_clean)

In [15]:
# unique items
len(df_nlp.item_name.unique()), len(df_nlp.product_name.unique())

(50587, 41819)

In [16]:
df_nlp[:2]

Unnamed: 0,parent_chain_name_norm,item_name,item_name_norm,item_name_norm_stop,item_name_token,item_name_token_lemma,product_name
0,booker,\tAunt Bessie's Hearty & Homely Dumpling Mix 140g,aunt bessies hearty homely dumpling mix 140g,aunt bessies hearty homely dumpling mix 140g,"[aunt, bessies, hearty, homely, dumpling, mix,...","[aunt, bessies, hearty, homely, dumpling, mix,...",aunt bessies hearty homely dumpling mix 140g
1,booker,\tBC Choc Fudge Brownie,bc choc fudge brownie,bc choc fudge brownie,"[bc, choc, fudge, brownie]","[bc, choc, fudge, brownie]",bc choc fudge brownie


## 3. Extending df

In [17]:
df_filter = df_nlp.loc[:, ['product_name']]

In [18]:
df_filter.head()

Unnamed: 0,product_name
0,aunt bessies hearty homely dumpling mix 140g
1,bc choc fudge brownie
2,batchelors big super noodle chicken flavour 100g
3,batchelors condensed soup cream of chicken 295g
4,batchelors cream of tomato condensed soup 295g


In [19]:
# extendemos cada registro segun las palabras que contenga
df_split = df_filter.assign(name_split=df_filter['product_name'].str.split(' ')).\
                explode('name_split').loc[lambda x: x.name_split!='']

In [20]:
df_split.head()

Unnamed: 0,product_name,name_split
0,aunt bessies hearty homely dumpling mix 140g,aunt
0,aunt bessies hearty homely dumpling mix 140g,bessies
0,aunt bessies hearty homely dumpling mix 140g,hearty
0,aunt bessies hearty homely dumpling mix 140g,homely
0,aunt bessies hearty homely dumpling mix 140g,dumpling


In [21]:
# copy to merge
df_split_copy = df_split.copy()

In [22]:
df_split_copy.rename(columns={'product_name': 'match'}, inplace=True)

In [23]:
# merging with itself: products with at least 1 word in common will align
df_split_mrg = df_split.merge(df_split_copy, how='inner', on='name_split')

In [24]:
df_split_mrg.shape

(121422171, 3)

In [25]:
df_compare = df_split_mrg.drop_duplicates(subset=['product_name', 'match']).reset_index(drop=True)

In [26]:
df_compare.drop('name_split', axis=1, inplace=True)

In [27]:
df_compare.shape

(59603087, 2)

In [28]:
df_compare.head(10)

Unnamed: 0,product_name,match
0,aunt bessies hearty homely dumpling mix 140g,aunt bessies hearty homely dumpling mix 140g
1,aunt bessies hearty homely dumpling mix 140g,aunt bessies yorkshire 10 pack
2,aunt bessies hearty homely dumpling mix 140g,aunt bessies yorkshire pudding 12pk
3,aunt bessies hearty homely dumpling mix 140g,220 g aunt bessies yorkshire pudding
4,aunt bessies hearty homely dumpling mix 140g,aunt bessies 10 yorkshire pudding
5,aunt bessies hearty homely dumpling mix 140g,aunt bessies mashed potato 650g
6,aunt bessies hearty homely dumpling mix 140g,aunt bessies yorkshire pudding 190g
7,aunt bessies hearty homely dumpling mix 140g,aunt bessie yorkshire pudding
8,aunt bessies hearty homely dumpling mix 140g,aunt bessie yorkshire pudding 270g
9,aunt bessies hearty homely dumpling mix 140g,aunt bessies 10 glorious golden yorkshire pudd...


## 4. Functions for package similarity

In [29]:
def package_extract(df, column, regex_):
    """
    Extracts the package from a product name. Uses a regular expression for these.
    
    Inputs:
    - df: dataframe
    - column: product name column where to look for packages
    - regex_: regular expression formula to match patterns
    
    Output: a column with the package of the specified product name column
    """
    packs = df[column].str.extract(regex_)
    packs['package'] = packs[packs.columns[0:]].apply(lambda x: ','.join(x.dropna()), axis=1)
    packs = packs.loc[:, ['package']]
    return packs.loc[:, ['package']]

In [30]:
reg_package = r'(\d+x\d+\w+)|(\d+ x \d+\w+)|(\d+\.+\d+\w+)|(\d+\.+\d+ \w+)|(\d+ ml)|(\d+ g)|(\d+\w+)|(\d+ \w+)'

In [31]:
def clean_group(group_df, regex_, threshold_=85):
    """
    From a group of products which are similar, compares if they share the same/similar package. For this, uses 
    "package_extract" function to extract the package of the "leader" product, and its similars; and then uses
    fuzzywuzzy to compare the similarity of the packages. Finally, keeps the packages with similarity over a threshold
    of 75 or the one specified by the user.
    
    Inputs:
    - group_df: dataframe with a group of similar products
    - regex_: regex formula to extract the package
    - threshold_: threshold of similarity to compare package
    
    Output: a clean group of similar candidates
    """
    group_df['package'] = package_extract(group_df, 'product_name', regex_)
    group_df['package_candidate'] = package_extract(group_df, 'candidate', regex_)
    group_df['package_ratio'] = group_df.apply(lambda x: fuzz.token_sort_ratio(x['package'], x['package_candidate']), axis=1)
    group_df = group_df[group_df['package_ratio'] >= threshold_].copy()
    group_df = group_df.loc[:, ['product_name', 'candidate', 'fuzz_ratio']]
    group_df.reset_index(drop=True, inplace=True)
    return group_df


## 5. Groups definition

In [32]:
df_compare.head()

Unnamed: 0,product_name,match
0,aunt bessies hearty homely dumpling mix 140g,aunt bessies hearty homely dumpling mix 140g
1,aunt bessies hearty homely dumpling mix 140g,aunt bessies yorkshire 10 pack
2,aunt bessies hearty homely dumpling mix 140g,aunt bessies yorkshire pudding 12pk
3,aunt bessies hearty homely dumpling mix 140g,220 g aunt bessies yorkshire pudding
4,aunt bessies hearty homely dumpling mix 140g,aunt bessies 10 yorkshire pudding


In [33]:
product_name_list = df_compare['product_name'].unique()

### 5.1 Fuzzy ratios

In [48]:
t_before = time.time()

In [34]:
df_compare['fuzzy_ratios'] = df_compare.apply(lambda x: fuzz.token_sort_ratio(x.product_name, x.match), axis=1)

In [49]:
t_run = time.time()-t_before
t_run

1.2229108810424805

In [50]:
df_compare.head()

Unnamed: 0,product_name,match,fuzzy_ratios
0,aunt bessies hearty homely dumpling mix 140g,aunt bessies hearty homely dumpling mix 140g,100
1,aunt bessies hearty homely dumpling mix 140g,aunt bessies,100
2,aunt bessies hearty homely dumpling mix 140g,aunt bessies dumpling mix 140g,100
3,aunt bessies hearty homely dumpling mix 140g,aunt bessies dumpling mix,100
4,aunt bessies yorkshire 10 pack,aunt bessies yorkshire 10 pack,100


### 5.2 Removing "bad" matches

In [51]:
df_80 = df_compare[df_compare['fuzzy_ratios'] > 80].reset_index(drop=True)

### 5.3 Identifying groups

In [37]:
def package_extract(df, column, regex_):
    """
    Extracts the package from a product name. Uses a regular expression for these.
    
    Inputs:
    - df: dataframe
    - column: product name column where to look for packages
    - regex_: regular expression formula to match patterns
    
    Output: a column with the package of the specified product name column
    """
    packs = df[column].str.extract(regex_)
    packs['package'] = packs[packs.columns[0:]].apply(lambda x: ','.join(x.dropna()), axis=1)
    packs = packs.loc[:, ['package']]
    return packs.loc[:, ['package']]

In [38]:
reg_package = r'(\d+x\d+\w+)|(\d+ x \d+\w+)|(\d+\.+\d+\w+)|(\d+\.+\d+ \w+)|(\d+ ml)|(\d+ g)|(\d+\w+)|(\d+ \w+)'

In [39]:
def clean_group(group_df, regex_, threshold_=85):
    """
    From a group of products which are similar, compares if they share the same/similar package. For this, uses 
    "package_extract" function to extract the package of the "leader" product, and its similars; and then uses
    fuzzywuzzy to compare the similarity of the packages. Finally, keeps the packages with similarity over a threshold
    of 75 or the one specified by the user.
    
    Inputs:
    - group_df: dataframe with a group of similar products
    - regex_: regex formula to extract the package
    - threshold_: threshold of similarity to compare package
    
    Output: a clean group of similar candidates
    """
    group_df['package'] = package_extract(group_df, 'product_name', regex_)
    group_df['package_candidate'] = package_extract(group_df, 'candidate', regex_)
    group_df['package_ratio'] = group_df.apply(lambda x: fuzz.token_sort_ratio(x['package'], x['package_candidate']), axis=1)
    group_df = group_df[group_df['package_ratio'] >= threshold_].copy()
    group_df = group_df.loc[:, ['product_name', 'candidate', 'fuzz_ratio']]
    group_df.reset_index(drop=True, inplace=True)
    return group_df

In [40]:
def create_group_track_df(groups_df, track_df, product, applicants_list):
    if groups_df.shape[0] == 0:
        group_id = 0
    else:
        group_id = groups_df['group_id'].max() + 1
    if track_df.shape[0] == 0:
        track_id = 0
    else:
        track_id = track_df['group_id'].max() + 1
        
    df_temp_group = pd.DataFrame({
        'group_id': group_id,
        'leader': product,
        'member': applicants_list
        })
    df_temp_track = pd.DataFrame({
        'group_id': track_id,
        'member': applicants_list
        })
    
    return df_temp_group, df_temp_track

In [41]:
def verify_and_concat_groups(groups_df, track_df, index_, applicants_list):
    # verify if any of the applicants is already assigned to a group, if not:    
    if track_df[track_df['member'].isin(applicants_list)].shape[0] == 0:
        # create df for the group
        tmp_group_df, tmp_track_df = create_group_track_df(groups_df, track_df, index_, applicants_list)
        # concat group to the global groups df
        groups_df = pd.concat([groups_df, tmp_group_df], axis=0).reset_index(drop=True)
        # concat track group to track global groups df
        track_df = pd.concat([track_df, tmp_track_df], axis=0).reset_index(drop=True)
    else:
        # get the group ids where all of the candidates are assigned
        groups_id_list = list(track_df[track_df['member'].isin(applicants_list)]['group_id'].unique())
        # locate where the group is
        select_df = groups_df[groups_df['group_id'].isin(groups_id_list)]
        # list of actual members of the group
        already_members = list(pd.unique(select_df[['leader', 'member']].values.ravel('K')))
        # union of already members + apliccants list --> idea: get a unique selection of a wider spectrum
        concatenated_list = list(set(already_members + applicants_list))
        # remove group from global groups and track dataframes
        groups_df = groups_df[~groups_df['group_id'].isin(groups_id_list)].copy()
        track_df = track_df[~track_df['group_id'].isin(groups_id_list)]
        # re-create both: groups & track - global dfs
        tmp_group_df, tmp_track_df = create_group_track_df(groups_df, track_df, index_, concatenated_list)
        # add the new set to both: groups & track - global dfs
        groups_df = pd.concat([groups_df, tmp_group_df], axis=0).reset_index(drop=True)
        track_df = pd.concat([track_df, tmp_track_df], axis=0).reset_index(drop=True)
    return groups_df, track_df

In [42]:
product_ = product_name_list[0]
product_

'aunt bessies hearty homely dumpling mix 140g'

In [56]:
# dataframe definition
groups_df = pd.DataFrame(columns=['group_id', 'leader', 'member'])
track_df = pd.DataFrame(columns=['group_id', 'member'])

for product_ in product_name_list:
    df_group = df_80[df_80['product_name'] == product_].copy()
    df_group.rename(columns={'match': 'candidate', 'fuzzy_ratios': 'fuzz_ratio'}, inplace=True)
    clean_group_df = clean_group(df_group, reg_package, threshold_=85)
    applicants_list = list(pd.unique(clean_group_df[['product_name', 'candidate']].values.ravel('K')))
    groups_df, track_df = verify_and_concat_groups(groups_df, track_df, product_, applicants_list)

In [59]:
groups_df = groups_df.sort_values(by=['leader', 'member']).reset_index(drop=True)

In [62]:
groups_df[(groups_df['leader'].str.contains('coca'))|(groups_df['member'].str.contains('coca'))][20:80]

Unnamed: 0,group_id,leader,member
3065,28493,budweisr 6x300ml,coca cola can 330ml
3066,28493,budweisr 6x300ml,coca cola can cherry 330ml
3067,28493,budweisr 6x300ml,coca cola can zero 330ml
3068,28493,budweisr 6x300ml,coca cola cherry 330ml
3069,28493,budweisr 6x300ml,coca cola cherry 330ml can
3070,28493,budweisr 6x300ml,coca cola cherry 330ml x2
3071,28493,budweisr 6x300ml,coca cola classic 330ml
3072,28493,budweisr 6x300ml,coca cola classic cherry 330ml
3073,28493,budweisr 6x300ml,coca cola classic cherry 330ml
3074,28493,budweisr 6x300ml,coca cola diet can 10x330ml
