In [238]:
import pandas as pd
import re

import re

#Split only on commas + space
def smart_split(text):
    return [i.strip() for i in re.split(r',\s+', text)]

allergens = pd.read_csv('allergens/allergen_synonyms.csv', sep=';', engine='python')
allergens.sort_values(by=['Name'], ascending=True, inplace=True)

print(list(allergens.Name))

['Alpha-Tocopherol', 'Alpha-Tocopherol-Acetate', 'Amidoamine', 'Bacitracin', 'Balsam of Peru', 'Balsam of Peru', 'Benzalkonium Chloride', 'Benzocaine', 'Black Rubber Mix', 'Bromo-2-Nitropropaine-1,3 Diol', 'Budesonide', 'Carba Mix', 'Chloroxylenol (PCMX)', 'Cinnamic Aldehyde', 'Cobalt Dichloride', 'Cocamindopropyl Betaine', 'Coconut Diethanolamide', 'Colophony', 'DMDM Hydantoin', 'Diazolidinyl Urea', 'Dibucaine hydrochloride', 'Dimethylol Dihydroxyethylene Urea', 'Epoxy Resin', 'Ethyl Acrylate', 'Ethylene Urea', 'Ethylenediamine Dihydrochloride (EDD)', 'Euxyl K400', 'Formaldehyde', 'Formaldehyde', 'Fragrance Mix', 'Glutaraldehyde', 'Glyceryl Thioglycolate', 'Gold Sodium Thiosulfate', 'Hydrocortisone-17-Butyrate', 'Hydroxy-4-Methoxybenzophenone', 'Imidazolidnyl Urea', 'Iodopropyl Butylcarbamate', 'Jasmine Absolute', 'Lanolin Alcohol', 'Lidocaine', 'Mercapto Mix', 'Mercaptobenzothiazole', 'Methyl Methacrylate', 'Methylchloroisothiazolinone', 'Methyldibromoglutaronitrile / Phenoxyethanol'

In [239]:
#Read in the products
products = pd.read_csv('sephora_scraper/products_small_batch.csv')
products.head(5)

Unnamed: 0,Product Name,Brand,Ingredients
0,Summer Fridays,Lip Butter Balm for Hydration & Shine,Phytosteryl/Isostearyl/Cetyl/Stearyl/Behenyl D...
1,Dr. Jart+,Premium BB Tinted Moisturizer with Niacinamide...,"WATER/AQUA/EAU, CYCLOPENTASILOXANE, CYCLOHEXAS..."
2,Zari Eyes Long-Lasting Crease-Proof Cream Eyes...,Kulfi,"Isododecane, Mica, Silica, Trimethylsiloxysili..."
3,Drunk Elephant,T.L.C. Sukari Babyfacial™ AHA + BHA Mask,"Water, Glycolic Acid, Hydroxyethyl Acrylate/So..."
4,Kiehl's Since 1851,Ultra Facial Refillable Moisturizing Cream wit...,"Aqua / Water , Glycerin , Dimethicone , Squala..."


In [240]:
#Lowercase the ingredients list and take a peek 
products['Ingredients'] = products.Ingredients.str.lower()
products.head(10)

Unnamed: 0,Product Name,Brand,Ingredients
0,Summer Fridays,Lip Butter Balm for Hydration & Shine,phytosteryl/isostearyl/cetyl/stearyl/behenyl d...
1,Dr. Jart+,Premium BB Tinted Moisturizer with Niacinamide...,"water/aqua/eau, cyclopentasiloxane, cyclohexas..."
2,Zari Eyes Long-Lasting Crease-Proof Cream Eyes...,Kulfi,"isododecane, mica, silica, trimethylsiloxysili..."
3,Drunk Elephant,T.L.C. Sukari Babyfacial™ AHA + BHA Mask,"water, glycolic acid, hydroxyethyl acrylate/so..."
4,Kiehl's Since 1851,Ultra Facial Refillable Moisturizing Cream wit...,"aqua / water , glycerin , dimethicone , squala..."
5,Tower 28 Beauty,MakeWaves Lengthening + Volumizing Mascara,"water/aqua/eau, copernicia cerifera (carnauba)..."
6,Sol de Janeiro,Cheirosa 62 Bum Bum Hair & Body Perfume Mist,"alcohol denat., aqua (water, eau), parfum (fra..."
7,NARS,Soft Matte Complete Full Coverage Longwear Con...,"orthesin, dimethicone, water, glycerin, butyle..."
8,Patrick Ta,Major Skin Hydra-Luxe Luminous Skin Perfecting...,"cananga oil, water (aqua, eau), glycerin, ison..."


In [241]:
#Actually convert it to a list and flatten 
allergens['Synonyms'] = allergens['Synonyms'].apply(smart_split)
allergen_synonyms = [item for sublist in allergens['Synonyms'] for item in sublist]
allergen_synonyms = [i.lower().strip() for i in allergen_synonyms if len(i) > 2]

In [242]:
#Remove some garbage
stop_phrases = ["possible occupational exposure", "cement workers", "potters", "artists", "nurses", "laboratory technologists"]
allergen_synonyms = [i for i in allergen_synonyms if not any(stop in i for stop in stop_phrases)]

In [243]:
#Better to do it as list comprehension in case we encounter NAs 
#products['ingred_list'] = products['Ingredients'].str.split(',')
#products['ingred_list'] = [x.split(',') for x in products['Ingredients']]
#products['ingred_list'] = products['Ingredients'].apply(lambda x: [i.strip() for i in x.split(',')])

#Due to complexity in ingredient names, we have to pivot and define a function
def match_ingredients(text):
    matches = [allergen for allergen in allergen_synonyms if allergen in text]
    return matches

In [244]:
products['matched_allergens'] = products['Ingredients'].apply(match_ingredients)

In [246]:
products['matched_allergens']

0                             [tocopherol, tocopherol]
1    [tocopherol, tocopherol, phenoxyethanol, propy...
2             [tocopherol, tocopherol, phenoxyethanol]
3                     [ethyl acrylate, phenoxyethanol]
4             [tocopherol, tocopherol, phenoxyethanol]
5                             [tocopherol, tocopherol]
6                                                   []
7    [tocopherol, tocopherol, orthesin, phenoxyetha...
8    [tocopherol, tocopherol, phenoxyethanol, canan...
Name: matched_allergens, dtype: object