In [1]:
import pandas as pd
import numpy as np
import re
import Levenshtein as lev
from fuzzywuzzy import fuzz
import time

In [None]:
import nltk.corpus
nltk.download('stopwords')
from nltk.corpus import stopwords

nltk.download('punkt')
from nltk.tokenize import word_tokenize

nltk.download('wordnet')
nltk.download('omw-1.4')

from nltk.stem import WordNetLemmatizer

# CODE

## 1. Pre-processing

In [2]:
# parameters definition
parent_chain = 'booker' # lower case and "clean"
parent_chain_column = 'parent_chain_name'
item_column = 'item_name'
language_ = 'en'
threshold_ = 82
parent_chain_use = True

In [3]:
# reading raw data
data = pd.read_csv('uk_booker_products.csv')

In [4]:
def clean_text(df, col_name, new_col_name):
    # column values to lower case
    df[new_col_name] = df[col_name].str.lower().str.strip()
    # removes special characters
    df[new_col_name] = df[new_col_name].apply(lambda x: re.sub(r"(@[A-Za-z0-9]+)|([^0-9A-Za-z.% \t])", "", x))
    return df

In [5]:
if parent_chain_use:
    # cleaning parent chain name as it has duplicated entries
    df = clean_text(data, parent_chain_column, '{}_{}'.format(parent_chain_column, 'norm'))
    # chain selection and columns to work on
    df_nlp = df[df['parent_chain_name_norm'] == parent_chain]
    df_nlp = df_nlp.loc[:, ['parent_chain_name_norm', item_column]].reset_index(drop=True)
else:
    df_nlp = data.loc[:, [item_column]].drop_duplicates().reset_index(drop=True)

In [6]:
# item name standardization
df_nlp.rename(columns={'sku_name': 'item_name'}, inplace=True)

In [7]:
print(f"Initial products: {len(list(set(df_nlp['item_name'].unique())))}")

Initial products: 50587


## 2. NLP Aplication

In [None]:
if language_ == 'en':
    stop_words = stopwords.words('english')
elif language_ == 'es':
    stop_words = stopwords.words('spanish')

In [None]:
def replace_stop_words(df, col, stop_list):
    df['{}_stop'.format(col)] = df[col].apply(lambda x: ' '.join([word for word in x.split() if x not in stop_list]))
    return df

In [None]:
def word_lemmatizer(text):
    text_lemma = [WordNetLemmatizer().lemmatize(word) for word in text]
    return text_lemma

In [None]:
regex_clean = r'(pm \d+\w+)|(pm \d+\.\d+)|(pm\d+\.\d+)|(\d+ pmp)|(pm\d+)|( \.+)|(pmp\d+.\d+)|(\d+pmp)|(pmp \d+)|(\d+.\d+ pm)'


In [None]:
def nlp_cleaning(df, stop_words, regex_clean):
    # normalization
    df = clean_text(df, 'item_name', 'item_name_norm')
    # remove stop words
    df = replace_stop_words(df, 'item_name_norm', stop_words)
    # tokenize text
    df['item_name_token'] = df['item_name_norm_stop'].apply(lambda x: word_tokenize(x))
    # lemmatization
    df['item_name_token_lemma'] = df['item_name_token'].apply(lambda x: word_lemmatizer(x))
    # joining lemmas
    df['product_name'] = df['item_name_token_lemma'].apply(lambda list_: ' '.join([word for word in list_]))
    # cleaning product names with regex
    df['product_name'] = df['product_name'].apply(lambda x: re.sub(regex_clean, "", x))
    return df

In [None]:
df_nlp = nlp_cleaning(df_nlp, stop_words, regex_clean)

In [None]:
# unique items
len(df_nlp.item_name.unique()), len(df_nlp.product_name.unique())

In [None]:
df_nlp[:2]

## 3. Levenshtein Ratio Calculation

### 3.1 Data preparation

In [22]:
# uselful columns selection (item selected just for 'product_name' not be treated as array)
df_lev = df_nlp.loc[:, ['product_name']]
df_lev = df_lev.drop_duplicates('product_name').reset_index(drop=True)

In [23]:
df_lev.head()

Unnamed: 0,product_name
0,aunt bessies hearty homely dumpling mix 140g
1,bc choc fudge brownie
2,batchelors big super noodle chicken flavour 100g
3,batchelors condensed soup cream of chicken 295g
4,batchelors cream of tomato condensed soup 295g


In [24]:
len_ = df_lev.shape[0]

In [25]:
# matrix creation --> null values
matrix_ = np.zeros((len_, len_))

In [29]:
matrix_.shape

(41819, 41819)

### 3.2 Applying Levenshtein Method

In [27]:
t1 = time.time()
t1

1659027592.2706292

In [28]:
for i, product_ in enumerate(df_lev['product_name']):
    for j, match_ in enumerate(df_lev['product_name']):
        matrix_[i][j] = fuzz.token_sort_ratio(match_, product_)

KeyboardInterrupt: 

In [None]:
t = time.time()-t1
t

In [None]:
print(f'Direct Levenshtein for 50K: {t/60} minutes')

In [None]:
# matrix copy
matrix_copy = matrix_

In [None]:
matrix_

### 4. Matrix to dataframe

IDEA:

* podria hacer un diccionario entre: product-index
* por cada producto, tomo su fila
* filtro por el threshold
* traigo los similares de los matches actuales usando su index
* integro al dataframe bajo condiciones ya establecidas

In [None]:
product_name_list = list(df_lev['product_name'].values)

In [None]:
# dataframe with lev ratios
df_ratios = pd.DataFrame(matrix_)

In [None]:
df_ratios.head()

In [None]:
def create_group_track_df(groups_df, track_df, product, applicants_list):
    if groups_df.shape[0] == 0:
        group_id = 0
    else:
        group_id = groups_df['group_id'].max() + 1
    if track_df.shape[0] == 0:
        track_id = 0
    else:
        track_id = track_df['group_id'].max() + 1
        
    df_temp_group = pd.DataFrame({
        'group_id': group_id,
        'leader': product,
        'member': applicants_list
        })
    df_temp_track = pd.DataFrame({
        'group_id': track_id,
        'member': applicants_list
        })
    
    return df_temp_group, df_temp_track

### Package similarity

In [None]:
def package_extract(df, column, regex_):
    """
    Extracts the package from a product name. Uses a regular expression for these.
    
    Inputs:
    - df: dataframe
    - column: product name column where to look for packages
    - regex_: regular expression formula to match patterns
    
    Output: a column with the package of the specified product name column
    """
    packs = df[column].str.extract(regex_)
    packs['package'] = packs[packs.columns[0:]].apply(lambda x: ','.join(x.dropna()), axis=1)
    packs = packs.loc[:, ['package']]
    return packs.loc[:, ['package']]

In [None]:
reg_package = r'(\d+x\d+\w+)|(\d+ x \d+\w+)|(\d+\.+\d+\w+)|(\d+\.+\d+ \w+)|(\d+ ml)|(\d+ g)|(\d+\w+)|(\d+ \w+)'

In [None]:
def clean_group(group_df, regex_, threshold_=85):
    """
    From a group of products which are similar, compares if they share the same/similar package. For this, uses 
    "package_extract" function to extract the package of the "leader" product, and its similars; and then uses
    fuzzywuzzy to compare the similarity of the packages. Finally, keeps the packages with similarity over a threshold
    of 75 or the one specified by the user.
    
    Inputs:
    - group_df: dataframe with a group of similar products
    - regex_: regex formula to extract the package
    - threshold_: threshold of similarity to compare package
    
    Output: a clean group of similar candidates
    """
    group_df['package'] = package_extract(group_df, 'product_name', regex_)
    group_df['package_match'] = package_extract(group_df, 'match', regex_)
    group_df['package_ratio'] = group_df.apply(lambda x: fuzz.token_sort_ratio(x['package'], x['package_match']), axis=1)
    group_df = group_df[group_df['package_ratio'] >= threshold_].copy()
    group_df = group_df.loc[:, ['product_name', 'match', 'lev_ratio']]
    group_df.reset_index(drop=True, inplace=True)
    return group_df


### 5. Identifying groups

In [None]:
def verify_and_concat_groups(groups_df, track_df, index_, applicants_list):
    # verify if any of the applicants is already assigned to a group, if not:    
    if track_df[track_df['member'].isin(applicants_list)].shape[0] == 0:
        # create df for the group
        tmp_group_df, tmp_track_df = create_group_track_df(groups_df, track_df, index_, applicants_list)
        # concat group to the global groups df
        groups_df = pd.concat([groups_df, tmp_group_df], axis=0).reset_index(drop=True)
        # concat track group to track global groups df
        track_df = pd.concat([track_df, tmp_track_df], axis=0).reset_index(drop=True)
    else:
        # get the group ids where all of the candidates are assigned
        groups_id_list = list(track_df[track_df['member'].isin(applicants_list)]['group_id'].unique())
        # locate where the group is
        select_df = groups_df[groups_df['group_id'].isin(groups_id_list)]
        # list of actual members of the group
        already_members = list(pd.unique(select_df[['leader', 'member']].values.ravel('K')))
        # union of already members + apliccants list --> idea: get a unique selection of a wider spectrum
        concatenated_list = list(set(already_members + applicants_list))
        # remove group from global groups and track dataframes
        groups_df = groups_df[~groups_df['group_id'].isin(groups_id_list)].copy()
        track_df = track_df[~track_df['group_id'].isin(groups_id_list)]
        # re-create both: groups & track - global dfs
        tmp_group_df, tmp_track_df = create_group_track_df(groups_df, track_df, index_, concatenated_list)
        # add the new set to both: groups & track - global dfs
        groups_df = pd.concat([groups_df, tmp_group_df], axis=0).reset_index(drop=True)
        track_df = pd.concat([track_df, tmp_track_df], axis=0).reset_index(drop=True)
    return groups_df, track_df

In [None]:
# dictionary to match product names with index
product_index_dict = dict(zip(product_name_list, df_ratios.columns))

In [None]:
# dictionary to match indexes with product_names
index_product_dict = dict(zip(df_ratios.columns, product_name_list))

In [None]:
# time before
t_bef_group = time.time()

In [None]:
# dataframe definition
groups_df = pd.DataFrame(columns=['group_id', 'leader', 'member'])
track_df = pd.DataFrame(columns=['group_id', 'member'])

# iterating for all products and defining final groups
for product_, index_ in product_index_dict.items():
    try:
        # gets all levenshtein ratios of the product and the set
        df_values = df_ratios.iloc[index_, :]
        # we get the indexes of the products that have similarity measure over the threshold
        similar_indexes_list = list(np.where(df_values > 0.8)[0])
        # get the selection of products that are similar (extended) and also their levenshtein ratios
        df_extended = df_ratios.iloc[similar_indexes_list, :]
        # adds product name column to melt
        df_extended.insert(0, 'product_name', df_extended.index)
        # melt dataframe
        df_melt = df_extended.melt(id_vars='product_name', var_name='match', value_name='lev_ratio')
        # filter product lev ratios by threshold         
        df_group = df_melt[df_melt['lev_ratio'] > 0.8].reset_index(drop=True)
        # replacing product name to clarify the direction of similarities (product tha's being processed)
        df_group['product_name'] = index_
        
        # cleaning group by applying package comparison (regex)
        df_group['product_name'] = df_group['product_name'].map(index_product_dict)
        df_group['match'] = df_group['match'].map(index_product_dict)
        df_clean_group = clean_group(df_group, reg_package, threshold_=85)
        # place back int values for better performance
        df_clean_group['product_name'] = df_clean_group['product_name'].map(product_index_dict)
        df_clean_group['match'] = df_clean_group['match'].map(product_index_dict)
        
        # removing duplicates (matches)
        df_clean_group = df_clean_group.drop_duplicates(subset=['product_name', 'match']).reset_index(drop=True)
        # applicants list (all products - including "leader")
        applicants_list = list(pd.unique(df_clean_group[['product_name', 'match']].values.ravel('K')))
        groups_df, track_df = verify_and_concat_groups(groups_df, track_df, index_, applicants_list)
    except Exception as e: 
        print(e)
        print(f'Failed product: {product_}; Index: {index_}')
        break

In [None]:
# time to run
t_run = time.time()-t_bef_group

In [None]:
print(f'Time to run: {round(t_run/60, 2)} minutes!')

### 6. Validation

#### 6.1 Verify if any leader leads more than 1 group

In [None]:
leaders_df = groups_df.loc[:, ['group_id', 'leader']].drop_duplicates().reset_index(drop=True)

In [None]:
leaders_df[leaders_df['leader'].duplicated() == True]

In [None]:
if leaders_df[leaders_df['leader'].duplicated() == True].shape[0] == 0:
    print('Leaders lead correctly!')
else:
    print('Verify leaders please! They are repeated!')

#### 6.2 Verify if any product belongs to more than 1 group

In [None]:
members_df = groups_df.loc[:, ['group_id', 'member']].drop_duplicates().reset_index(drop=True)

In [None]:
members_df[members_df['member'].duplicated() == True]

In [None]:
if members_df[members_df['member'].duplicated() == True].shape[0] == 0:
    print('Members are assigned correctly!')
else:
    print('Verify members please! Some are assigned to more than 1 group!')

#### 6.3 All products have been assigned

Hipotesis:
    
* Se estan filtrando en el threshold, no tienen similares

Confirmado:
* No se estan agregando grupos!!! 

##### on groups DF

In [None]:
all_products_list = list(index_product_dict.keys())
all_products_list[:2]

In [None]:
groups_assigned_products = pd.unique(groups_df[['leader', 'member']].values.ravel('K'))

In [None]:
not_in_groups = []
for i in all_products_list:
    if i not in groups_assigned_products:
        not_in_groups.append(i)

In [None]:
print(f'Number of products not assigned in groups_df: {len(not_in_groups)}')

##### On track DF

In [None]:
track_assigned_products = pd.unique(track_df[['group_id', 'member']].values.ravel('K'))

In [None]:
not_in_track = []
for i in all_products_list:
    if i not in track_assigned_products:
        not_in_track.append(i)

In [None]:
print(f'Number of products not assigned in track_df: {len(not_in_track)}')

##### Why not assigned?

In [None]:
for indice_ in not_in_groups[:5]:
    print(f'{indice_}: {index_product_dict[indice_]}')

In [None]:
for indice_ in not_in_track[:5]:
    print(f'{indice_}: {index_product_dict[indice_]}')

### 7. Visualizing groups

In [None]:
print(f'Number of unique groups: {len(groups_df["group_id"].unique())}')

#### 7.1 Replacing int product names with the actual name

In [None]:
groups_df['leader'] = groups_df['leader'].map(index_product_dict)
groups_df['member'] = groups_df['member'].map(index_product_dict)

In [None]:
groups_df = groups_df.sort_values(by=['leader', 'member']).reset_index(drop=True)

In [None]:
groups_df

##### Coca DF

In [None]:
coca_df = groups_df[(groups_df['leader'].str.contains('coca'))|(groups_df['member'].str.contains('coca'))]

In [None]:
coca_df[60:120]

In [None]:
groups_df

### Single Example

In [None]:
# gets all levenshtein ratios of the product and the set
df_values = df_ratios.iloc[100, :]

In [None]:
# we get the indexes of the products that have similarity measure over the threshold
similar_indexes_list = list(np.where(df_values > 0.8)[0])
similar_indexes_list

In [None]:
# get the selection of products that are similar (extended) and also their levenshtein ratios
df_extended = df_ratios.iloc[similar_indexes_list, :]

In [None]:
df_extended

In [None]:
# adds product name column to melt
df_extended.insert(0, 'product_name', df_extended.index)

In [None]:
# melt dataframe
df_melt = df_extended.melt(id_vars='product_name', var_name='match', value_name='lev_ratio')

In [None]:
# filter product lev ratios by threshold
df_group = df_melt[df_melt['lev_ratio'] > 0.8].reset_index(drop=True)

In [None]:
# replacing product name to clarify the direction of similarities (product tha's being processed)
df_group['product_name'] = index_

In [None]:
# removing duplicates (matches)
df_clean_group = df_group.drop_duplicates(subset=['product_name', 'match']).reset_index(drop=True)

In [None]:
df_clean_group

In [None]:
zz