In [10]:
import pandas as pd
from fuzzywuzzy import fuzz

* `canonical` - The item name used as the query. For each canonical item, the dataset provides its top 25 matching items.
* `item_name` - The items that are the closest matches to the canonical item.
* `rank` - An integer from 1 to 25, where 1 marks the `item_name` that is the closest match to the canonical item.

In [3]:
# Load the raw matches table from the CSV in the working directory.
df = pd.read_csv(filepath_or_buffer='UK_booker_canonical_dataset.csv')

In [4]:
# Keep only the three columns used below, order rows by query item and then by
# match rank, and renumber the index from 0.
df = (
    df.loc[:, ['canonical', 'item_name', 'rank']]
    .sort_values(by=['canonical', 'rank'])
    .reset_index(drop=True)
)

### Structure

In [5]:
# Distinct query (canonical) names, in order of first appearance.
canonical_products = list(pd.unique(df['canonical']))

In [6]:
# Distinct matched item names, in order of first appearance.
item_products = list(pd.unique(df['item_name']))

In [7]:
# (number of distinct canonical names, number of distinct item names)
tuple(len(names) for names in (canonical_products, item_products))

(45527, 48927)

In [9]:
# Positional rows 20-27: the tail of one canonical group and the head of the next.
df.iloc[20:28]

Unnamed: 0,canonical,item_name,rank
20,1 Kg Bakers Adult Beef Dog Food,Bakers Adult Dry Dog Food Chicken and Veg 1kg,21
21,1 Kg Bakers Adult Beef Dog Food,Pet food BAKERS ADULT Beef with Vegetables Dry...,22
22,1 Kg Bakers Adult Beef Dog Food,BAKERS ADULT Beef with Vegetables Dry Dog Food...,23
23,1 Kg Bakers Adult Beef Dog Food,BUTCHERS TRIPE DOG FOOD 1200G,24
24,1 Kg Bakers Adult Beef Dog Food,Bakers Dog Food Chicken and Veg 1kg,25
25,1 Kg Silver Spoon Caster Sugar,Silver Spoon Caster Sugar 1kg,1
26,1 Kg Silver Spoon Caster Sugar,Silver Spoon British Caster Sugar 1kg,2
27,1 Kg Silver Spoon Caster Sugar,Silver Spoon White Sugar 1kg,3


# 1. Grouping products

## 1.1 Fuzzy ratios calculation

In [11]:
# Score every (canonical, item_name) pair with fuzzywuzzy's token-sort ratio.
# Iterating the two columns directly avoids the per-row Series that
# df.apply(..., axis=1) constructs; the scores and their row order are unchanged,
# and an empty frame simply yields an empty column.
df['fuzz_ratio'] = [
    fuzz.token_sort_ratio(canonical, item_name)
    for canonical, item_name in zip(df['canonical'], df['item_name'])
]

In [12]:
# Preview the first five rows, now including the fuzz_ratio column.
df.head(n=5)

Unnamed: 0,canonical,item_name,rank,fuzz_ratio
0,1 Kg Bakers Adult Beef Dog Food,Bakers Beef and Vegetable Dry Dog Food 1kg,1,74
1,1 Kg Bakers Adult Beef Dog Food,Bakers Beef and Vegetable Dry Dog Food 1kg,2,74
2,1 Kg Bakers Adult Beef Dog Food,Bakers ADULT Small Dog Beef with Vegetables Dr...,3,65
3,1 Kg Bakers Adult Beef Dog Food,BAKERS Meaty Meals Adult Beef Dry Dog Food 1kg,4,75
4,1 Kg Bakers Adult Beef Dog Food,Bakers Meaty Meals Adult Beef Dry Dog Food 1kg,5,75


## 1.2 Filtering similarities

In [13]:
# Minimum fuzz_ratio a (canonical, item_name) pair must reach to be kept as a
# "similar" pair in the filtering step below (the comparison is inclusive, >=).
threshold_products = 85

In [17]:
# Keep pairs scoring at or above the threshold, then collapse repeated
# (canonical, item_name) pairs so each pair appears once (first occurrence wins).
df_similars = (
    df.loc[df['fuzz_ratio'].ge(threshold_products)]
    .drop_duplicates(subset=['canonical', 'item_name'])
    .reset_index(drop=True)
)

In [19]:
# Number of distinct product names (canonical or item) that survive the filter.
pd.unique(df_similars[['canonical', 'item_name']].to_numpy().ravel(order='K')).size

37655

In [20]:
# Number of distinct product names (canonical or item) in the full table, for comparison.
pd.unique(df[['canonical', 'item_name']].to_numpy().ravel(order='K')).size

49105