# Feature engineering for ingredients list

_________________________________________________________________________________

**Reference file:**
- combined_data.json

__________________________________________________________________________________

## 1.0 Loading file

In [1]:
#Import necessary libraries
import json 
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
with open('../data/processed_data/combined_data.json', 'r') as file:
    data= json.load(file)
df=pd.DataFrame.from_dict(data)

In [3]:
df.head().T

Unnamed: 0,0,1,2,3,4
brand,Glow Recipe,Tatcha,goop,CLINIQUE,Tata Harper
product_name,Glow Recipe Watermelon Glow PHA +BHA Pore-Tigh...,Tatcha Pure One Step Camellia Oil Cleanser,goop GOOPGLOW Microderm Instant Glow Exfoliator,CLINIQUE Take The Day Off Makeup Remover For L...,Tata Harper Regenerating Exfoliating Cleanser
product_type,toners,face wash and cleansers,exfoliators and peels,face wash and cleansers,face wash and cleansers
num_likes,125100,107600,12900,76700,31000
rating,4.5,4.5,4.5,4.5,4.5
num_reviews,1900,1700,1200,3100,567
sensitive_type,0,1,0,0,0
combination_type,1,1,1,0,1
oily_type,1,1,1,0,0
normal_type,1,1,1,0,0


_________________________

## 2.1 Approach 1: Fuzzymatch

In [4]:
df_new=df.copy()

### Clean data

In [5]:
#Import necessary libraries
#!pip install textsearch
#!pip install contractions
from nltk.tokenize import sent_tokenize
import re
import unicodedata
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import AffinityPropagation

In [6]:
def clean_ingr(text):
    '''
    text pre-processing for ingredients list
    '''
    text = str(text)
    text= unicodedata.normalize("NFKD", text)
    text= re.sub(r'[/\()-]',' ', text)
    text= re.sub(r'[^a-zA-Z\s,]', '', text)
    text= re.sub(r'\s*([,])\s*', ', ',text)
    text= text.lower().replace('\n','').replace('  ', ' ')
    
    new_text= re.split(', ', str(text))
    
    new_text= [i.strip() for i in new_text if i!='']
    
    #return list
    return new_text

In [7]:
df_new['ingr_list']=df_new['ingr_list'].apply(lambda x: clean_ingr(x))

In [8]:
df_new['ingr_list']

0       [opuntia ficus indica cactus extract, citrullu...
1       [cetyl ethylhexanoate, oryza sativa rice bran ...
2       [aqua, alumina, glycerin, cetearyl alcohol, gl...
3       [water, isohexadecane, dimethicone, cyclopenta...
4       [hordeum vulgare leaf juice, cetearyl alcohol,...
                              ...                        
1333    [rosa canina rosehip fruit oil, helianthus ann...
1334    [water aqua eau, butylene glycol, dimethicone,...
1335    [water, butylene glycol, glycerin, caprylic ca...
1336    [badaptive superfoodstm microcapsule complex, ...
1337    [aqua water, butylene glycol, glycerin, propan...
Name: ingr_list, Length: 1338, dtype: object

### Simplify ingredients list using Fuzzywuzzy

In [9]:
#! pip install fuzzywuzzy
#! pip install python-Levenshtein

In [10]:
import fuzzywuzzy
from fuzzywuzzy import process
from fuzzywuzzy import fuzz
from itertools import chain

In [11]:
#Create complete list of unique ingredients
ingr = list(df_new['ingr_list'])
ingr_list = list(set(chain(*ingr)))

In [12]:
print(f'{len(ingr_list)} unique ingredients before fuzzy match')

5963 unique ingredients before fuzzy match


________________

**1. Calculating similarities**

Used fuzzy token sort ratio but other metrics can also be used (i.e. Levin)

In [13]:
ingr_df = pd.DataFrame(ingr_list)
ingr_df.columns = ['ingredients']


In [14]:
fuzz.token_sort_ratio('aha glycolic acid', 'aha acid')

64

In [15]:
ct= pd.crosstab(ingr_df['ingredients'], ingr_df['ingredients'])
ct= ct.apply(lambda col: [fuzz.token_sort_ratio(col.name, x) for x in col.index])

In [16]:
#Fuzz ratio taking so long! 
ct.head()

ingredients,a,a blend of essential amino acids and calcium,abies pectinata leaf oil,abies sibirica oil,acacia dealbata flower stem extract,acacia decurrens flower wax,acacia decurrens jojoba sunflower seed wax polyglyceryl esters,acacia farnesiana flower wax,acacia honey,acacia senegal gum,...,zingiber officinale ginger root extract,zingiber officinale ginger root oil,zingiber officinale ginger water,zingiber officinale root extract,zingiber officinale root oil,zingiber officinale root oil ginger root oil,zingiber zerumbet extract,ziziphus jujuba fruit extract,zizyphus jujuba fruit extract,zizyphus jujuba seed extract
ingredients,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
a,100,4,8,11,6,7,3,7,15,11,...,5,6,6,6,7,4,8,7,7,7
a blend of essential amino acids and calcium,4,100,38,32,41,39,36,39,29,45,...,34,30,32,32,31,32,29,27,27,31
abies pectinata leaf oil,8,38,100,57,37,35,33,35,33,38,...,35,44,36,36,42,38,29,26,26,27
abies sibirica oil,11,32,57,100,26,36,28,35,27,33,...,39,38,40,36,43,35,37,30,26,26
acacia dealbata flower stem extract,6,41,37,26,100,58,47,57,38,42,...,38,31,33,42,35,30,40,38,38,35


**2. Replace 100s since this is a comparison to itself**

In [17]:
ct[ct==100]=0

**3. Create a dictionary to be used later for replacing values**

In [18]:
def create_mapping(df, threshold=83):
    '''
    
    '''
    replaced=[]
    mapping={}
    for i in df:
        #returns col 
        col =df[i]
        cur_ingr = col.name
        close_strings= list(col[col>threshold].index)

        #check if ingr has already been replaced, if so, then check dictionary for ultimate word 
        #print(cur_ingr, )
        #print(close_strings)
        if cur_ingr in replaced:
            #print('Ingredient already replaced')
            for value in close_strings:
                mapping[value] = mapping[cur_ingr]
                replaced.append(value)
        else:
            #print('Replacing ', close_strings, ' with ', cur_ingr)
            for value in close_strings:
                mapping[value]=cur_ingr
                replaced.append(value)
        #print('\n')       
    #print('Full list of values replaced: \n', replaced)
    #print(f'{len(mapping)} ingredients to be mapped')
    return mapping

In [19]:
mapping = create_mapping(ct, 83)

In [29]:
mapping

{'acer rubrum extract': 'acer rubrum bark extract',
 'acer rubrum bark extract': 'acer rubrum bark extract',
 'acer saccharum sugar maple extract': 'acer saccarum sugar maple extract',
 'acer saccarum sugar maple extract': 'acer saccarum sugar maple extract',
 'acetyl glutamine': 'acetyl glucosamine',
 'acetyl glucosamine': 'acetyl glucosamine',
 'acetyl hexapeptide': 'heptapeptide',
 'acetyl octapeptide': 'acetyl heptapeptide',
 'acetyl tetrapeptide': 'acetyl heptapeptide',
 'acetyl heptapeptide': 'acetyl heptapeptide',
 'acetyl hexapeptide  amide': 'acetyl heptapeptide',
 'hexapeptide hcl': 'heptapeptide',
 'achillea millefolium flower extract': 'achillea millefolium extract',
 'achillea millefolium flower extroct': 'achillea millefolium extract',
 'achillea millefolium yarrow extract': 'achillea millefolium extract',
 'achillea millefolium extract': 'achillea millefolium extract',
 'spilanthes acmella flower extract': 'spilanthes acmella flower bud extract',
 'hydroxyethyl acrylate 

In [31]:
mapped_df= pd.DataFrame.from_dict(mapping, orient='index')

In [33]:
mapped_df.reset_index(inplace=True)
mapped_df.columns=['old', 'new']

In [34]:
mapped_df

Unnamed: 0,old,new
0,acer rubrum extract,acer rubrum bark extract
1,acer rubrum bark extract,acer rubrum bark extract
2,acer saccharum sugar maple extract,acer saccarum sugar maple extract
3,acer saccarum sugar maple extract,acer saccarum sugar maple extract
4,acetyl glutamine,acetyl glucosamine
...,...,...
2452,zingiber officinale ginger water,zingiber officinale ginger extract
2453,zingiber officinale root oil,zingiber officinale ginger extract
2454,zingiber officinale root oil ginger root oil,zingiber officinale ginger extract
2455,zizyphus jujuba fruit extract,ziziphus jujuba fruit extract


**4. Replaced values in the ingredients list**

In [35]:
def replace_ingr(alist, dict_map):
    '''
    Takes in a list and replaces each value in the list according to the dictionary
    Returns comma separated doc/string
    '''
    for i in range(len(alist)):
        if alist[i] in dict_map.keys():
            alist[i] = dict_map[alist[i]]
    
    new_text = ", ".join(alist)
    return new_text

In [36]:
df_new['ingr_list']= df_new['ingr_list'].apply(lambda x: replace_ingr(x, mapping))
df_new['ingr_list']

0       opuntia ficus indica cactus extract, citrullus...
1       cetearyl ethylhexanoate, oryza sativa rice bra...
2       aqua, alumina, diglycerin, isostearyl alcohol,...
3       water, isohexadecane, dimethcone, cyclopen tas...
4       hordeum vulgare leaf juice, isostearyl alcohol...
                              ...                        
1333    rosa canina fruit oil, helianthus annuus seed ...
1334    water aqua eau, butylene glycol, dimethcone, d...
1335    water, butylene glycol, diglycerin, capric  ca...
1336    badaptive superfoodstm microcapsule complex, w...
1337    water aqua, butylene glycol, diglycerin, propa...
Name: ingr_list, Length: 1338, dtype: object

## 3.0 Compute similarities

In [37]:
#Import necessary libraries
#!pip install textsearch
#!pip install contractions
from nltk.tokenize import sent_tokenize
import re
import unicodedata
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [38]:
#Defined custom tokenizer
def separate_ingr(text):
    '''
    Custom tokenizer: Returns a list of ingredients by partitioning text based on commas 
    '''
    new_text= re.split(', ', str(text))
    new_text= [i.strip() for i in new_text if i!=''] 
    #recreate the document from filtered tokens
    return new_text

In [39]:
ingr_list = list(df_new.ingr_list)

**1. Count Vectorizer**

In [40]:
cv= CountVectorizer(ingr_list, tokenizer= separate_ingr)
sparse_matrix= cv.fit_transform(ingr_list)
print(cv.get_feature_names())





In [41]:
listed_ingr=cv.get_feature_names
listed_ingr

<bound method CountVectorizer.get_feature_names of CountVectorizer(input=['opuntia ficus indica cactus extract, citrullus lanatus '
                       'watermelon fruit, diglycerin, alguronic acid, '
                       'gluconolactone, sodium acetate, betaine salicylate, '
                       'salix alba bark extract, melaleuca alternafolia tea '
                       'tree leaf oil, hibiscus sabdariffa flower extract, '
                       'lactobacillus papaya fruit ferment extract, cucumis '
                       'sativus cucumber extract, saccharum officinarum...
                       'potassium sorbate, ethylparaben, disodium edta, '
                       'niacinamide, tin oxide',
                       'water aqua, propylene glycol, poloxamer, pheno '
                       'xyethano, glyceryl cocoate, disodium edta, sodium '
                       'acetate, sodium coceth sulfate, butylene glycol, '
                       'sodium hyaluronate, fragrance  parf um,

In [42]:
ingr_term_matrix = sparse_matrix.todense()
ingr_df = pd.DataFrame(ingr_term_matrix, columns= cv.get_feature_names())

**2. TF-IDF Features**

In [43]:
from sklearn.feature_extraction.text import TfidfVectorizer

tf = TfidfVectorizer(ingr_list, tokenizer= separate_ingr)
tfidf_matrix= tf.fit_transform(ingr_list)
tfidf_matrix.shape 

(1338, 4293)

**3. Compute pairwise similarity using cosine**

In [44]:
cos_sim_df = pd.DataFrame(cosine_similarity(tfidf_matrix))
cos_sim_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1328,1329,1330,1331,1332,1333,1334,1335,1336,1337
0,1.0,0.003963,0.017651,0.0,0.046054,0.109445,0.02253,0.022737,0.002881,0.0,...,0.012674,0.042273,0.018831,0.013537,0.051416,0.0,0.050651,0.032521,0.034975,0.045955
1,0.003963,1.0,0.019313,0.020621,0.026983,0.007682,0.011986,0.025589,0.022963,0.036298,...,0.004053,0.023076,0.028413,0.134987,0.061349,0.0,0.037084,0.016872,0.006375,0.023632
2,0.017651,0.019313,1.0,0.0,0.057269,0.028768,0.07972,0.0658,0.021412,0.023965,...,0.06236,0.063033,0.027289,0.050553,0.016856,0.053015,0.044223,0.022107,0.021179,0.056699
3,0.0,0.020621,0.0,1.0,0.006529,0.033514,0.034054,0.0,0.03647,0.013665,...,0.013293,0.033402,0.036634,0.0,0.042442,0.0,0.042244,0.02351,0.012564,0.01156
4,0.046054,0.026983,0.057269,0.006529,1.0,0.004155,0.025838,0.008955,0.041167,0.014103,...,0.017357,0.01119,0.056591,0.02975,0.053685,0.027847,0.013108,0.004067,0.019288,0.02422


**4. Sample recommender**

In [45]:
df_new.product_name.values

array(['Glow Recipe Watermelon Glow PHA +BHA Pore-Tight Toner',
       'Tatcha Pure One Step Camellia Oil Cleanser',
       'goop GOOPGLOW Microderm Instant Glow Exfoliator', ...,
       'Dr. Dennis Gross Skincare Hyaluronic Marine Dew It Right Eye Gel',
       'Dr. Dennis Gross Skincare Stress SOS Eye Cream™ with Niacinamide',
       'Guerlain Abeille Royale Anti-Aging Eye Cream'], dtype=object)

In [46]:
def prod_recommender(product, products_list=df_new.product_name.values, doc_sims=cos_sim_df):
    # find product id
    prod_idx = np.where(products_list == product)[0][0]
    # get movie similarities
    product_similarities = doc_sims.iloc[prod_idx].values
    # get top 5 similar movie IDs
    similar_prod_idxs = np.argsort(-product_similarities)[1:6]
    # get top 5 movies
    similar_prod = products_list[similar_prod_idxs]
    # return the top 5 movies
    return similar_prod

In [50]:
for i in df_new.product_name.values[23:34]:
    print('Recommendations for:', i)
    print('Top recommended: \n', prod_recommender(product=i, products_list = df_new.product_name.values, doc_sims = cos_sim_df))
    print('\n')

Recommendations for: Peter Thomas Roth Water Drench® Hyaluronic Cloud Hydrating Toner Mist
Top recommended: 
 ['Peter Thomas Roth 8% Glycolic Solutions Toner'
 'Mario Badescu Witch Hazel & Lavender Toner'
 'Drunk Elephant B-Hydra™ Intensive Hydration Serum'
 'Alpha-H Vitamin B Serum with Niacinamide'
 'CLINIQUE Clarifying Lotion 1.0 Twice A Day Exfoliator']


Recommendations for: belif Cleansing Gel Oil Enriched
Top recommended: 
 ['belif Creamy Cleansing Foam Moist '
 'belif Milky Hydra Balancing Moisturizer '
 'belif Hungarian Water Essence' 'belif Witch Hazel Herbal Extract Toner'
 'belif The True Cream Aqua Bomb']


Recommendations for: Estée Lauder Gentle Eye Makeup Remover
Top recommended: 
 ['CLINIQUE Rinse-Off Eye Makeup Solvent'
 "Paula's Choice Skin Perfecting BHA 9 Treatment"
 'Dior Capture Totale High-Performance Treatment Serum-Lotion'
 'Lancôme MOUSSE RADIANCE Clarifying Self-Foaming Cleanser'
 'CLINIQUE Moisture Surge™ Hydrating Lotion']


Recommendations for: Shiseido W

________________________________________

## 2.2 Approach 2: Cosine similarity, no fuzzymatching

Ingredients similarity without FuzzyMatching

In [51]:
#Import necessary libraries
#!pip install textsearch
#!pip install contractions
from nltk.tokenize import sent_tokenize
import re
import unicodedata
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [52]:
#Defined custom tokenizer
def separate_ingr(text):
    '''
    Custom tokenizer: Returns a list of ingredients by partitioning text based on commas 
    '''
    #cleaning
    text = str(text)
    text= unicodedata.normalize("NFKD", text)
    text= re.sub(r'[/\()-]',' ', text)
    text= re.sub(r'[^a-zA-Z\s,]', '', text)
    text= re.sub(r'\s*([,])\s*', ', ',text)
    text= text.lower().replace('\n','').replace('  ', ' ')
    
    #tokenize document
    new_text= re.split(', ', str(text))
    
    new_text= [i.strip() for i in new_text if i!='']
    
    #recreate the document from filtered tokens
    return new_text

In [54]:
ingr_list = list(df.ingr_list)

**Count Vectorizer**

In [55]:
cv= CountVectorizer(ingr_list, tokenizer= separate_ingr)
sparse_matrix= cv.fit_transform(ingr_list)
listed_ingr=cv.get_feature_names



In [56]:
ingr_term_matrix = sparse_matrix.todense()
ingr_df = pd.DataFrame(ingr_term_matrix, columns= cv.get_feature_names())

**TF-IDF Features**

In [57]:
from sklearn.feature_extraction.text import TfidfVectorizer

tf = TfidfVectorizer(ingr_list, tokenizer= separate_ingr)
tfidf_matrix= tf.fit_transform(ingr_list)
tfidf_matrix.shape 

(1338, 5963)

**Pairwise similarity using cosine**

In [58]:
cos_sim_df = pd.DataFrame(cosine_similarity(tfidf_matrix))
cos_sim_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1328,1329,1330,1331,1332,1333,1334,1335,1336,1337
0,1.0,0.003108,0.00152,0.0,0.022494,0.031655,0.002085,0.0,0.002355,0.0,...,0.002092,0.02957,0.016141,0.002139,0.03832,0.0,0.049407,0.020632,0.026383,0.024891
1,0.003108,1.0,0.002459,0.017306,0.024211,0.006246,0.009653,0.0,0.018399,0.012868,...,0.003384,0.019685,0.007588,0.055934,0.007731,0.0,0.01174,0.014189,0.005323,0.022002
2,0.00152,0.002459,1.0,0.0,0.056178,0.012825,0.041965,0.022591,0.012163,0.005951,...,0.063024,0.043721,0.026721,0.053446,0.01673,0.039541,0.027546,0.02028,0.022599,0.039865
3,0.0,0.017306,0.0,1.0,0.005783,0.030101,0.030265,0.0,0.031978,0.0139,...,0.0,0.032779,0.034295,0.0,0.041087,0.0,0.049397,0.022596,0.003276,0.0
4,0.022494,0.024211,0.056178,0.005783,1.0,0.003572,0.021994,0.007624,0.034531,0.0043,...,0.017996,0.011444,0.015756,0.015505,0.050126,0.010802,0.0,0.003609,0.0,0.015736


**Sample recommender**

In [59]:
df.product_name.values

array(['Glow Recipe Watermelon Glow PHA +BHA Pore-Tight Toner',
       'Tatcha Pure One Step Camellia Oil Cleanser',
       'goop GOOPGLOW Microderm Instant Glow Exfoliator', ...,
       'Dr. Dennis Gross Skincare Hyaluronic Marine Dew It Right Eye Gel',
       'Dr. Dennis Gross Skincare Stress SOS Eye Cream™ with Niacinamide',
       'Guerlain Abeille Royale Anti-Aging Eye Cream'], dtype=object)

In [60]:
def prod_recommender(product, products_list=df.product_name.values, doc_sims=cos_sim_df):
    # find product id
    prod_idx = np.where(products_list == product)[0][0]
    # get movie similarities
    product_similarities = doc_sims.iloc[prod_idx].values
    # get top 5 similar movie IDs
    similar_prod_idxs = np.argsort(-product_similarities)[1:6]
    # get top 5 movies
    similar_prod = products_list[similar_prod_idxs]
    # return the top 5 movies
    return similar_prod

In [61]:
for i in df.product_name.values[23:34]:
    print('Recommendations for:', i)
    print('Top recommended:', prod_recommender(product=i, products_list = df.product_name.values, doc_sims = cos_sim_df))
    print('\n')

Recommendations for: Peter Thomas Roth Water Drench® Hyaluronic Cloud Hydrating Toner Mist
Top recommended: ['Peter Thomas Roth 8% Glycolic Solutions Toner'
 'Alpha-H Vitamin B Serum with Niacinamide'
 'Mario Badescu Witch Hazel & Lavender Toner'
 'Drunk Elephant B-Hydra™ Intensive Hydration Serum'
 'CLINIQUE Clarifying Lotion 1.0 Twice A Day Exfoliator']


Recommendations for: belif Cleansing Gel Oil Enriched
Top recommended: ['belif Creamy Cleansing Foam Moist '
 'belif Problem Solution Cleansing Foam' 'belif The True Cream Aqua Bomb'
 'belif Cleansing Herb Water' 'belif Milky Hydra Balancing Moisturizer ']


Recommendations for: Estée Lauder Gentle Eye Makeup Remover
Top recommended: ['CLINIQUE Rinse-Off Eye Makeup Solvent'
 "Paula's Choice Skin Perfecting BHA 9 Treatment"
 'Dior Capture Totale High-Performance Treatment Serum-Lotion'
 'CLINIQUE Moisture Surge™ Hydrating Lotion'
 'NARS Gentle Oil-Free Eye Makeup Remover']


Recommendations for: Shiseido WASO: Soft & Cushy Polishing 

________________________________________

___________________

In [None]:
words = np.asarray(words) #So that indexing with a list will work
lev_similarity = -1*np.array([[distance.levenshtein(w1,w2) for w1 in words] for w2 in words])

affprop = AffinityPropagation(affinity="precomputed", damping=0.5)
affprop.fit(lev_similarity)
for cluster_id in np.unique(affprop.labels_):
    exemplar = words[affprop.cluster_centers_indices_[cluster_id]]
    cluster = np.unique(words[np.nonzero(affprop.labels_==cluster_id)])
    cluster_str = ", ".join(cluster)
    print(" - *%s:* %s" % (exemplar, cluster_str))

In [None]:
! pip install distance

In [None]:
import numpy as np
from sklearn.cluster import AffinityPropagation
import distance
    
words = ingr_list
lev_similarity = -1*np.array([[distance.levenshtein(w1,w2) for w1 in words] for w2 in words])

affprop = AffinityPropagation(affinity="precomputed", damping=0.5)
affprop.fit(lev_similarity)
for cluster_id in np.unique(affprop.labels_):
    exemplar = words[affprop.cluster_centers_indices_[cluster_id]]
    cluster = np.unique(words[np.nonzero(affprop.labels_==cluster_id)])
    cluster_str = ", ".join(cluster)
    print(" - *%s:* %s" % (exemplar, cluster_str))

In [None]:
AffinityPropagation?

In [None]:
affprop = AffinityPropagation(affinity="precomputed", damping=.5, random_state=123)
affprop.fit(ct)
addr= np.asarray(ingr_list)
for cluster_id in np.unique(affprop.labels_):
    exemplar = addr[affprop.cluster_centers_indices_[cluster_id]]
    cluster = np.unique(addr[np.nonzero(affprop.labels_==cluster_id)])
    cluster_str = ", ".join(cluster)
    print(" - *%s:* %s" % (exemplar, cluster))

**IGNORE BELOW**

**Fuzzywuzzy**

In [None]:
#! pip install fuzzywuzzy
#! pip install python-Levenshtein

In [None]:
import fuzzywuzzy
from fuzzywuzzy import process

In [None]:
ingr= cv.get_feature_names()

In [None]:
def fuzzy_match_replace(data, choices, score=fuzzywuzzy.fuzz.token_sort_ratio, min_ratio=83):
    '''
    
    '''
    #Get a list of unique strings
    for i in data:
        #Get closest matches
        matches = process.extract(string_to_match, chioces, limit=10, scorer = score)
        #Apply criteria
        close_matches = [matches[0] for matches in matches if matches[1]>=min_ratio]
        
        #Replace rows with close matches with the input match
    new_i = ''
    return new_i
    print('Done.')
    

In [None]:
mapping_ingr={}
for ingr in ingr_list[500:510]:
    tokenized = re.split(', ', ingr)
    for text in tokenized: 
        #Get closest matches
        matches = process.extract(text, ingr_list, limit=10, scorer = fuzzywuzzy.fuzz.token_sort_ratio)
        #Apply criteria
        close_matches = [matches[0] for matches in matches if matches[1]>=83]
        print(text, '------', close_matches)

In [None]:
#Defined custom tokenizer
def separate_ingr(text):
    '''
    Custom tokenizer: Returns a list of ingredients by partitioning text based on commas 
    '''
    #cleaning
    text = str(text)
    text= unicodedata.normalize("NFKD", text)
    text= re.sub(r'[/\()-]',' ', text)
    text= re.sub(r'[^a-zA-Z\s,]', '', text)
    text= re.sub(r'\s*([,])\s*', ', ',text)
    text= text.lower().replace('\n','').replace('  ', ' ')
    
    #tokenize document - returns list of separate ingredients
    new_text= re.split(', ', str(text))
    new_text= [i.strip() for i in new_text if i!='']
    
    #fuzzywuzzy here
    for i in new_text
    
    

    
    #recreate the document from filtered tokens
    return new_text

In [None]:
sample=separate_ingr(df.ingr_list[755])
sample

In [None]:
#Need to figure out how to have just one that represents all the similar ones; removing possible duplications
simplified_list=[]
for i in sample:
    matches = process.extract(i, ingr_list, limit=10, scorer = fuzzywuzzy.fuzz.token_sort_ratio)
    #Apply criteria
    close_matches = [matches[0] for matches in matches if matches[1]>=83]
    
    print(i, '------', close_matches)

In [None]:
fuzzy_match_replace(df.ingr_list[0], ingr_list)

Methodologies
1) Fuzzy matching - We will use 'Levenshtein Similarity' for finding similarity score between two words.

2) Clustering - We will use 'Affinity Propagation Clustering' for grouping similar words against a standard form.


Note: Several other methods are available to perform fuzzy matching. Also, the standard form corresponding to a particular group of similar words (a cluster) is basically going to be the longest common substring across all the words in that cluster.

Steps:
    
1) Generate similarity matrix 
2) replace self comparisons with 0 
3) Apply a cut off to retain relevant similarities

In [None]:
def fuzzy_match_replace(data, choices= ingr_list, score=fuzzywuzzy.fuzz.token_sort_ratio, min_ratio=83):
    '''
    
    '''
    #Get a list of unique strings
    for i in data:
        #Get closest matches
        matches = process.extract(string_to_match, chioces, limit=10, scorer = score)
        #Apply criteria
        close_matches = [matches[0] for matches in matches if matches[1]>=min_ratio]
        
        #Replace rows with close matches with the input match
    new_i = ''
    return new_i
    print('Done.')    

In [None]:
mapping_ingr={}
for ingr in ingr_list[500:510]:
    tokenized = re.split(', ', ingr)
    for text in tokenized: 
        #Get closest matches
        matches = process.extract(text, ingr_list, limit=10, scorer = fuzzywuzzy.fuzz.token_sort_ratio)
        #Apply criteria
        close_matches = [matches[0] for matches in matches if matches[1]>=83]
        print(text, '------', close_matches)

https://www.kaggle.com/sushantpekar/string-similarity-fuzzy-matching-clustering

http://jonathansoma.com/lede/algorithms-2017/classes/fuzziness-matplotlib/fuzzing-matching-in-pandas-with-fuzzywuzzy/

https://stats.stackexchange.com/questions/123060/clustering-a-long-list-of-strings-words-into-similarity-groups

https://stackoverflow.com/questions/53261214/python-fuzzy-string-matching-as-correlation-style-table-matrix

_________________