# Ingredient matching from product photo using OCR 
---

## Libraries

In [1]:
import numpy as np
import pandas as pd
from unidecode import unidecode
import re
import string

from nltk.metrics import jaccard_distance

import os
import cv2
import easyocr
import pytesseract

from PIL import Image
import subprocess

import warnings
warnings.filterwarnings('ignore')

## Reading the datasets

In [2]:
ingredients = pd.read_excel('../Data/ingredients.xlsx')

In [None]:
# Adding new rows for most common misspellings
def create_new_rows(generic_name, synonym_values):
    # Find the ingredientID based on the generic_name
    ingredient_id = ingredients.loc[ingredients['generic_name'] == generic_name, 'ingredientID'].iloc[0]

    # Create new rows with the found ingredientID
    new_rows = {'ingredientID': [ingredient_id] * len(synonym_values),
                'generic_name': [generic_name] * len(synonym_values),
                'synonym': synonym_values}
    return new_rows

new_rows1 = create_new_rows('Alcohol Denat.', ['alchol dent', 'alcoholdenat', 'alcoholdent', 'alchol', 'alkohol'])
new_rows2 = create_new_rows('Alpha-Isomethyl Ionone', ['alpha, isomethyl ionon', 'alpha, isomethyl ionone', 'alpha isomethyl ionone', 'alpha,isomethyl ionon', 'alpha-isomethylionone'])
new_rows3 = create_new_rows('Dimethicone', ['dimthicone'])
new_rows4 = create_new_rows('Linalool', ['linalol'])

new_rows_list = [new_rows1, new_rows2, new_rows3, new_rows4]
for new_rows in new_rows_list:
    ingredients = pd.concat([ingredients, pd.DataFrame(new_rows)], ignore_index=True)

In [None]:
synonyms = ingredients['synonym'].tolist()

In [3]:
extracted_dir = '../Data/pictures'
os.makedirs(extracted_dir, exist_ok=True)

## EasyOCR reader

In [4]:
# Files conversion
files = [f for f in os.listdir(extracted_dir) if f.endswith('.heic') or f.endswith('.heif')]

for filename in files:
    heic_path = os.path.join(extracted_dir, filename)
    jpeg_path = os.path.join(extracted_dir, os.path.splitext(filename)[0] + '.jpg')

    subprocess.run(['convert', heic_path, jpeg_path])

    os.remove(heic_path)

    print(f"Converted {filename} to {os.path.basename(jpeg_path)}")

In [5]:
reader = easyocr.Reader(['en'])

results = []

for filename in os.listdir(extracted_dir):
    if filename.endswith('.jpg') or filename.endswith('.png') or filename.endswith('.jpeg') or filename.endswith('.webp'):
        image_path = os.path.join(extracted_dir, filename)

        try:
            image = cv2.imread(image_path)
            if image is not None:
                result = reader.readtext(image, detail=0)
                results.append({'filename': filename, 'text': result})
            else:
                print(f"Failed to load image: {image_path}")
        except Exception as e:
            print(f"Error processing {filename}: {str(e)}")

df_pictures = pd.DataFrame(results)

Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster with a GPU.


## Cleaning functions

In [6]:
# function for unicode transliteration and lower case for ingredient_list
def clean_text(s):               
    if isinstance(s, str):
        s = unidecode(s)    
        s = s.lower()
        return s

In [7]:
# Custom function for text cleaning
def clean_text_before_ingredients(lst):
    cleaned_list = []
    found_ingredients = False

    for item in lst:
        if isinstance(item, str):
            if not found_ingredients and "ingredients" in item.lower():
                found_ingredients = True
                start_index = item.lower().find("ingredients")
                end_index = item.find(":", start_index)
                if start_index != -1 and end_index != -1:
                    cleaned_item = item[end_index + 1:].lstrip()
                    cleaned_list.append(unidecode(cleaned_item))
            elif found_ingredients:
                # Check for '.' and stop processing the list
                dot_index = item.find('.')
                if dot_index != -1:
                    cleaned_item = item[:dot_index].lstrip()
                    cleaned_list.append(unidecode(cleaned_item))
                    break
                else:
                    cleaned_list.append(unidecode(item))
        else:
            cleaned_list.append(None)  # Remove the entire list

    return [item for item in cleaned_list if item is not None]

In [8]:
# Remove unclosed brackets
def remove_unclosed_brackets(input_str):
    if input_str is None or not isinstance(input_str, str):
        return input_str

    stack = []
    result = list(input_str)

    for i, char in enumerate(input_str):
        if char in ['(', '[']:
            stack.append(i)
        elif char in [')', ']']:
            if stack:
                stack.pop()
            else:
                result[i] = ' '

    # Replace unclosed open brackets with an empty space
    for index in stack:
        result[index] = ' '

    return ''.join(result)

## Preform cleaning after feading the files with EasyOCR reader

In [9]:
def perform_first_ocr_cleaning(products):
    
    products['full_ingredient_list'] = products['text_from_ocr'].apply(lambda x: ' '.join(map(str, x)))
    products['ingredient_list'] = products['full_ingredient_list']
    products['ingredient_list'] = products['ingredient_list'].apply(clean_text)
    products['ingredient_list'] = products['ingredient_list'].str.split(r',|;|\|')
    products.reset_index(drop=True, inplace=True)
    products['ingredient_list'] = products['ingredient_list'].apply(clean_text_before_ingredients)
    products = products.explode('ingredient_list')
    products['ingredient_list'] = products['ingredient_list'].apply(remove_unclosed_brackets)
    products['ingredient_list'] = products['ingredient_list'].str.replace('[', '(').str.replace(']', ')')
    products['ingredient_list'] = products['ingredient_list'].str.replace('\\', '/')
    products['ingredient_list'] = products['ingredient_list'].str.lstrip(',')
    products['ingredient_list'] = products['ingredient_list'].str.replace(' ,', ', ')
    products['ingredient_list'] = products['ingredient_list'].str.lstrip('/')
    products['ingredient_list'] = products['ingredient_list'].str.rstrip(string.punctuation.replace(')', ''))
    
    special_characters = ['*', '$', '?', '!', '@', '}', '{', '--', '>', '<', '~', '&', '=', '"']
    for char in special_characters:
        products['ingredient_list'] = products['ingredient_list'].astype(str).str.replace(char, ' ')

    products['ingredient_list'] = products['ingredient_list'].str.replace(r'cl (\d{5})', r'ci \1')              # zamena na cl XXXXX so ci XXXXX    
    products['ingredient_list'] = products['ingredient_list'].replace(r',+', ',', regex=True)
    products['ingredient_list'] = products['ingredient_list'].str.strip().str.replace(r'\s+', ' ')
    products['ingredient_list'] = products['ingredient_list'].str.replace(r'\s*/\s*', '/')
    products['ingredient_list'] = products['ingredient_list'].str.replace(r'\s*-\s*', '-')
    products['ingredient_list'] = products['ingredient_list'].str.replace(r'\s+\.', '.')
    products['ingredient_list'] = products['ingredient_list'].str.replace(r'\(\s*', '(').str.replace(r'\s*\)', ')')
    products['ingredient_list'] = products['ingredient_list'].str.replace('f,i,l,', 'fil')
    products['ingredient_list'] = products['ingredient_list'].str.strip()
    products = products[~products['ingredient_list'].astype(str).str.match(r'^[\d\W]+$')]
    products = products[~products['ingredient_list'].str.strip(string.punctuation).eq('')]
    products = products[~products['ingredient_list'].apply(lambda x: pd.to_numeric(x, errors='coerce')).notna() | (products['ingredient_list'] == '')]
    products = products[~products['ingredient_list'].str.len() < 3]   
    products['ingredient_list'] = products['ingredient_list'].str.strip()   
    products['ingredient_list'] = products['ingredient_list'].str.replace(r'\s+', ' ', regex=True)                      # Brisenje na prazni stringovi i NaN
    products.dropna(subset=['ingredient_list'], inplace=True)
    products.reset_index(drop=True, inplace=True)
    
    return products

products1 = df_pictures.copy()
products1.rename(columns={'text': 'text_from_ocr'}, inplace=True)
products1.rename(columns={'filename': 'product_name'}, inplace=True)

products_cleaned_first_ocr = perform_first_ocr_cleaning(products1)                                                      # Call the function to perform the cleaning

## Tesseract OCR reader

In [10]:
filtered_products = products_cleaned_first_ocr[
    (products_cleaned_first_ocr['ingredient_list'].str.len() > 100)      # Gi zemame redovite so dolgi stringovi koi prviot algoritam ne gi separira               
]

# Check if there are any products that meet the condition
if not filtered_products.empty:
    pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'

    results = []

    for filename in os.listdir(extracted_dir):
        if filename.endswith('.jpg') or filename.endswith('.png') or filename.endswith('.jpeg') or filename.endswith('.webp'):
            image_path = os.path.join(extracted_dir, filename)

            try:
                image = cv2.imread(image_path)
                if image is not None:
                    # Use pytesseract.image_to_string to extract text
                    result = pytesseract.image_to_string(image)
                    results.append({'filename': filename, 'text': result})
                else:
                    print(f"Failed to load image: {image_path}")
            except Exception as e:
                print(f"Error processing {filename}: {str(e)}")

    # Create a DataFrame from the OCR results
    df_pictures_tesseract = pd.DataFrame(results)
    
    # Merge the OCR results with the filtered_products DataFrame
    df_pictures_tesseract = pd.merge(filtered_products, df_pictures_tesseract, how='inner', left_on='product_name', right_on='filename')
else:
    # If there are no products that meet the condition, create an empty DataFrame
    df_pictures_tesseract = pd.DataFrame()

## Preform cleaning after feading the files with Tesseract OCR reader

In [11]:
def perform_second_ocr_cleaning(products):

        products['full_ingredient_list'] = products['text_from_ocr']
        products['full_ingredient_list'] = products['full_ingredient_list'].apply(lambda x: x.split('\n'))          # Split each paragraph into a list
        products['full_ingredient_list'] = products['full_ingredient_list'].apply(lambda x: [paragraph.strip() for paragraph in x if paragraph.strip()])    # Remove empty paragraphs if any
        products['full_ingredient_list'] = products['full_ingredient_list'].apply(lambda x: ' '.join(map(str, x)))
        products['ingredient_list'] = products['full_ingredient_list']                                              # Making unicode transliteration, lower case and tokenization

        split_characters = [',', '|', '*', '«', '»', '-', ':', '+']        
        for char in split_characters:
            products['ingredient_list'] = products['ingredient_list'].apply(lambda x: ', '.join(x.split(char)))     # Iterate over each split character and perform the split [',', '|', '*', '«', '»', '-', ':', '+'] 
        products.reset_index(drop=True, inplace=True)

        products['ingredient_list'] = products['ingredient_list'].apply(clean_text)
        products['ingredient_list'] = products['ingredient_list'].apply(lambda x: x.split(','))
        products['ingredient_list'] = products['ingredient_list'].tolist()        
        products['ingredient_list'] = products['ingredient_list'].apply(clean_text_before_ingredients)              # Apply the cleaning function
        products = products.explode('ingredient_list')        
        products['ingredient_list'] = products['ingredient_list'].apply(remove_unclosed_brackets)
        products['ingredient_list'] = products['ingredient_list'].astype(str)
        products['ingredient_list'] = products['ingredient_list'].str.replace('[', '(').str.replace(']', ')')       # zamena na aglesti zagradi so obicni
        products['ingredient_list'] = products['ingredient_list'].str.replace('\\', '/')                            # zamena na \ so /
        products['ingredient_list'] = products['ingredient_list'].str.lstrip(',')                                   # brisenje na ',' ako stringot pocnuva so ','
        products['ingredient_list'] = products['ingredient_list'].str.replace(' ,', ', ')                           # zamena na ' ,' so ', '
        products['ingredient_list'] = products['ingredient_list'].str.lstrip('/')                                   # brisenje / na pocetok na string
        products['ingredient_list'] = products['ingredient_list'].str.rstrip(string.punctuation.replace(')', ''))   # brisenje punktuacija na kraj na sting, no bez zagradite

        special_characters = ['*', '$', '?', '!', '@', '}', '{', '--', '>', '<', '~', '&', '=', '"']
        for char in special_characters:
            products['ingredient_list'] = products['ingredient_list'].astype(str).str.replace(char, ' ')            # brisenje na specijalnite znaci bilo kade vo stringot
        
        products['ingredient_list'] = products['ingredient_list'].str.replace(r'cl (\d{5})', r'ci \1')              # zamena na cl XXXXX so ci XXXXX
        products['ingredient_list'] = products['ingredient_list'].replace(r',+', ',', regex=True)                   # brisenje na povekje od 1 posledovatelni zapirki
        products['ingredient_list'] = products['ingredient_list'].str.strip().str.replace(r'\s+', ' ')              # brisenje na povekje od 1 posledovatelni prazni mesta
        products['ingredient_list'] = products['ingredient_list'].str.replace(r'\s*/\s*', '/')                      # brisenje na prazni mesta pred i posle '/'
        products['ingredient_list'] = products['ingredient_list'].str.replace(r'\s*-\s*', '-')                      # brisenje na prazni mesta pred i posle '-'
        products['ingredient_list'] = products['ingredient_list'].str.replace(r'\s+\.', '.')                        # brisenje na prazno mesto pred '.'
        products['ingredient_list'] = products['ingredient_list'].str.replace(r'\(\s*', '(').str.replace(r'\s*\)', ')') # brisenje na prazni mesta posle otvorena zagrada i pred zatvorena zagrada
        products['ingredient_list'] = products['ingredient_list'].str.replace('f,i,l,', 'fil')
        products['ingredient_list'] = products['ingredient_list'].str.strip()                                       # trim
        products = products[~products['ingredient_list'].astype(str).str.match(r'^[\d\W]+$')]                       # brisenje na red koj sodrzi samo broj i punktuacija
        products = products[~products['ingredient_list'].str.strip(string.punctuation).eq('')]                      # brisenje na red koj sodrzi samo punktuacija
        products = products[~products['ingredient_list'].apply(lambda x: pd.to_numeric(x, errors='coerce')).notna() | (products['ingredient_list'] == '')]  # samo broj
        mask_to_drop = (products['ingredient_list'].str.len() < 3) | (products['ingredient_list'].str.len() > 80)   # Brisenje na red koj sodrzi pomalku od 3 ili povekje od 70 karakteri 
        products = products[~mask_to_drop]
        products['ingredient_list'] = products['ingredient_list'].str.strip()   
        products['ingredient_list'] = products['ingredient_list'].str.replace(r'\s+', ' ', regex=True)              # Brisenje na prazni stringovi i NaN
        products.dropna(subset=['ingredient_list'], inplace=True)
        products.reset_index(drop=True, inplace=True)

        return products


products2 = df_pictures_tesseract.copy()
columns_to_drop = ['product_name', 'text_from_ocr', 'full_ingredient_list', 'ingredient_list']                      # Columns to drop from first ocr read
products2.drop(columns=columns_to_drop, inplace=True, errors='ignore')
products2.rename(columns={'text': 'text_from_ocr'}, inplace=True)                                                   # Renaming the columns 
products2.rename(columns={'filename': 'product_name'}, inplace=True)

products_cleaned_second_ocr = perform_second_ocr_cleaning(products2)                                                # Call the function to perform the cleaning   

In [12]:
mask_to_drop = products_cleaned_first_ocr['ingredient_list'].str.len() > 100                                         # Brisenje na redovite koi sodrzat povekje od 80 karakteri
rows_to_drop = products_cleaned_first_ocr[mask_to_drop]
display(rows_to_drop)
products_to_drop = products_cleaned_first_ocr.loc[mask_to_drop, 'product_name'].unique()                            # Identify the product names for which the condition is met
products_cleaned_first_ocr = products_cleaned_first_ocr[~products_cleaned_first_ocr['product_name'].isin(products_to_drop)] # Brisenje na recordite od products_cleaned_first_ocr, bidejki se vcituvaat od drugata biblioteka
products_cleaned_first_ocr.reset_index(drop=True, inplace=True)

Unnamed: 0,product_name,text_from_ocr,full_ingredient_list,ingredient_list
139,1000092570.jpg,"[INGREDIENTS, AqeWater) Squalane; Caprvlic/Cap...",INGREDIENTS AqeWater) Squalane; Caprvlic/Capri...,benzyl alco-giucosaminen citric aoic srbitotor...
172,20231204_141323.jpg,"[PCOoc|, lontano dalla luce diretta del sole: ...",PCOoc| lontano dalla luce diretta del sole: In...,alcohol methorvoieenzorimethare ethvlhexy sali...
173,20231204_141323.jpg,"[PCOoc|, lontano dalla luce diretta del sole: ...",PCOoc| lontano dalla luce diretta del sole: In...,coperica gertenaheen phenvlbenzimidazole sullo...
218,20231204_142537.jpg,"[2, ne:, 17, 3 =, na y, 1, 1, 1, 0, %t, 6, 0, ...",2 ne: 17 3 = na y 1 1 1 0 %t 6 0 G; rater 2 Z7...,potassiomurorbaleride goramidrort hydroxde tor...
220,20231204_142537.jpg,"[2, ne:, 17, 3 =, na y, 1, 1, 1, 0, %t, 6, 0, ...",2 ne: 17 3 = na y 1 1 1 0 %t 6 0 G; rater 2 Z7...,corinedefarme com used the end of : la4 pa0420...
...,...,...,...,...
1719,IMG_6968.jpg,"[3258484750, L1S, Mirrobiom schutz FORMEL, FOR...",3258484750 L1S Mirrobiom schutz FORMEL FORMULe...,eur 42090 anti-schuppen shampoo gchtbare schup...
1720,IMG_6968.jpg,"[3258484750, L1S, Mirrobiom schutz FORMEL, FOR...",3258484750 L1S Mirrobiom schutz FORMEL FORMULe...,%s wvoyznts et desadditk atculs peg bases sur ...
1721,IMG_6968.jpg,"[3258484750, L1S, Mirrobiom schutz FORMEL, FOR...",3258484750 L1S Mirrobiom schutz FORMEL FORMULe...,fgey %oxgneusement avec de keau: awverienze: e...
1722,IMG_6968.jpg,"[3258484750, L1S, Mirrobiom schutz FORMEL, FOR...",3258484750 L1S Mirrobiom schutz FORMEL FORMULe...,scizcquare abbondantemente con acqua hodtet lg...


In [None]:
mask_to_drop2 = products_cleaned_second_ocr['ingredient_list'].str.len() > 70                                         # Brisenje na redovite koi sodrzat povekje od 70 karakteri vo imeto na sostojkite
rows_to_drop2 = products_cleaned_second_ocr[mask_to_drop]
display(rows_to_drop2)
products_to_drop2 = products_cleaned_second_ocr.loc[mask_to_drop2, 'product_name'].unique()                            # Identify the product names for which the condition is met
products_cleaned_second_ocr = products_cleaned_second_ocr[~products_cleaned_second_ocr['product_name'].isin(products_to_drop2)] # Brisenje na recordite od products_cleaned_first_ocr, bidejki se vcituvaat od drugata biblioteka
products_cleaned_second_ocr.reset_index(drop=True, inplace=True)

## Concatanating dataframes from both readers

In [13]:
products = pd.concat([products_cleaned_first_ocr, products_cleaned_second_ocr], axis=0)                             # Concatenate the two DataFrames along the columns
products = products.sort_values(by='product_name')

products['text_from_ocr'] = products['text_from_ocr'].apply(tuple)                                                  # Brisenje duplikati
products.drop_duplicates(inplace=True)
products.reset_index(drop=True, inplace=True)

products['productID'] = products.groupby('product_name').ngroup()                                                   # Dodavanje index colona za sekoj proizvod
products.insert(0, 'productID', products.pop('productID'))                                                          # Move 'productID' as first column

In [26]:
products_cleaned_first_ocr.to_excel('first_ocr.xlsx')
products_cleaned_second_ocr.to_excel('second_ocr.xlsx')
products.to_excel('ocr.xlsx')

In [15]:
duplicates_to_remove = products[products.duplicated()]                          # Filtriranje na dupli redovi
display(duplicates_to_remove)

products.drop_duplicates(inplace=True)                                          # Brisenje na dupli redovi

Unnamed: 0,productID,product_name,text_from_ocr,full_ingredient_list,ingredient_list


In [16]:
rows_with_string_nan = products[products['ingredient_list'].eq('nan')]          # Filtriranje na redovi so string nan
display(rows_with_string_nan)

products = products[~products['ingredient_list'].eq('nan')]                     # Brisenje na redovi so string nan

Unnamed: 0,productID,product_name,text_from_ocr,full_ingredient_list,ingredient_list


In [17]:
rows_with_NaN = products[~products['ingredient_list'].notna()]                  # Filtriranje na redovi so NaN
display(rows_with_NaN)
products = products[products['ingredient_list'].notna()]                        # Brisenje na redovi so NaN

Unnamed: 0,productID,product_name,text_from_ocr,full_ingredient_list,ingredient_list


In [18]:
products.to_excel('ingredient_list_for_matching.xlsx')

In [19]:
# Converting to list
ingredient_names = products['ingredient_list'].tolist()

## Jaccard distance Matching

In [20]:
def find_first_match(ingredient, candidates, threshold=0.50):
    ingredient_set = set(re.split(r'\W+', ingredient))

    # Set to keep track of matched ingredients
    matched_ingredients = set()

    for candidate in candidates:
        # Skip if candidate ingredient has already been matched
        if pd.notna(candidate):  # Check if candidate is not NaN
            if candidate in matched_ingredients:
                continue

            if isinstance(candidate, str):  # Check if candidate is a string
                candidate_set = set(re.split(r'\W+', candidate))
                union_size = len(ingredient_set.union(candidate_set))

                if union_size != 0:
                    distance = jaccard_distance(ingredient_set, candidate_set)
                    similarity = 1 - distance

                    if similarity > threshold:
                        # Add the matched ingredient to the set
                        matched_ingredients.add(candidate)
                        
                        # Return the first match and exit the function
                        return candidate
    return None

In [21]:

matches_df = pd.DataFrame(columns=['Matching Ingredient', 'Synonym'])

dfs = []

for ingredient in ingredient_names:
    if isinstance(ingredient, str):
        direct_match = [synonym for synonym in synonyms if synonym == ingredient] or [None]

        if direct_match[0] == None:
            match = find_first_match(ingredient, synonyms)
        else:
            match = direct_match[0]

        df = pd.DataFrame({'Matching Ingredient': [ingredient], 'Synonym': [match]})

        dfs.append(df)

matches_df = pd.concat(dfs, ignore_index=True)


In [22]:
synonym_to_generic = dict(zip(ingredients['synonym'], ingredients['generic_name']))

# Add a new column to matches_df to store the corresponding generic name
matches_df['Generic Name'] = None

# Loop through each ingredient
for index, row in matches_df.iterrows():
    ingredient = row['Matching Ingredient']
    match = row['Synonym']

    # Check if the match is not None
    if match is not None:
        # Use the dictionary to find the corresponding generic name
        generic_name = synonym_to_generic.get(match)

        # Update the 'Generic_Name' column in matches_df
        matches_df.at[index, 'Generic Name'] = generic_name

## Results

In [23]:
none_count = 0
for value in matches_df['Synonym']:
    if str(value).strip().lower() == 'none':
        none_count += 1

matched = round((len(products) - none_count)/len(products), 4)*100
not_matched = round((100-matched), 4)
exact_matches = products['ingredient_list'][products['ingredient_list'].isin(ingredients['synonym'].values)].tolist()
exact_matches = round(len(exact_matches)/len(products['ingredient_list']), 2)*100

print(f'MATCHED ingredients:               {round(matched, 2)} % \t/ {len(ingredient_names) - none_count}')
print(f'NOT MATCHED ingredients:           {not_matched} % \t/ {none_count}')
print()
print(f'EXACT MATCH of ingredients:        {exact_matches} %')
print(f'JACCARD DISTANCE MATCH of ing.:    {round(matched-exact_matches, 2)} %')

matches_df

MATCHED ingredients:               73.74 % 	/ 1536
NOT MATCHED ingredients:           26.26 % 	/ 547

EXACT MATCH of ingredients:        63.0 %
JACCARD DISTANCE MATCH of ing.:    10.74 %


Unnamed: 0,Matching Ingredient,Synonym,Generic Name
0,aqua water,aqua water,Water
1,bidens pilosa extract,bidens pilosa extract,Bidens Pilosa Extract
2,hditrimethylol hexyllactone crosspolymer butyr...,,
3,withania somnifera root extract,withania somnifera root extract,Withania Somnifera Root Extract
4,helianthus annuus (sunflower) seed oil,helianthus annuus seed,Helianthus Annuus Seed
...,...,...,...
2078,disodium edta,disodium edta,Disodium EDTA
2079,copper,,
2080,hydroxyethylcellulose lauricacid,,
2081,laurylalcohol diphosphonic acid,,


In [24]:
matches_df[['Generic Name', 'Synonym', 'Matching Ingredient']].to_excel('matches_0.50.xlsx', index=False)

In [25]:
# Analysis of unmatched ingredients
filtered_df = matches_df[matches_df['Generic Name'].isna()]
value_counts = filtered_df['Matching Ingredient'].value_counts()

# Print the first 20 most frequent values and their frequencies
for value, frequency in value_counts.head(30).iteritems():
    print(f"Value: {value}, Frequency: {frequency}")

Value: peg, Frequency: 8
Value: propanediol, Frequency: 8
Value: alpha, Frequency: 7
Value: betaine, Frequency: 6
Value: alpha-isomethyl lonone, Frequency: 5
Value: c12, Frequency: 4
Value: pentaerythrityl tetra, Frequency: 3
Value: coco, Frequency: 3
Value: ppg, Frequency: 3
Value: glycerides, Frequency: 3
Value: ceteareth, Frequency: 3
Value: cl 19140, Frequency: 3
Value: cl 77492, Frequency: 2
Value: aquo, Frequency: 2
Value: 1-methylhydan-toin-2-imide, Frequency: 2
Value: cetearylalcohol, Frequency: 2
Value: tocophery acetate, Frequency: 2
Value: glucoside, Frequency: 2
Value: caprylic/capric trigly-ceride, Frequency: 2
Value: polyglyceryl, Frequency: 2
Value: cerin, Frequency: 2
Value: cl 14700, Frequency: 2
Value: porfum, Frequency: 2
Value: lsomethy lonone, Frequency: 2
Value: gly, Frequency: 2
Value: ethylhexyiglycerin, Frequency: 2
Value: cl 77891, Frequency: 2
Value: fragra nce, Frequency: 1
Value: alcoh ol denat glycerin, Frequency: 1
Value: phenoxyetha-nol, Frequency: 1
