In [1]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt


In [2]:
data = pd.read_csv('data/cleaned_data.csv')
data.head()

Unnamed: 0,product_name,brands,categories_en,labels_en,ingredients_text,allergens_en,additives_en,nutrition_grade_fr,energy_100g,fat_100g,...,cocoa_100g,carbon-footprint_100g,nutrition-score-fr_100g,nutrition-score-uk_100g,category_level_1,category_level_2,category_level_3,category_level_4,category_level_5,category_level_6
0,Banana Chips Sweetened (Whole),not mentioned,Not Mentioned,Labels are missing,"Bananas, vegetable oil (coconut oil, corn oil ...",unknown,No additives,d,2243.0,28.57,...,0.0,no information,14.0,14.0,Not Mentioned,Not Specified,Not Specified,Not Specified,Not Specified,Not Specified
1,Peanuts,torn & glasser,Not Mentioned,Labels are missing,"Peanuts, wheat flour, sugar, rice flour, tapio...","soy, wheat, peanuts",No additives,b,1941.0,17.86,...,0.0,no information,0.0,0.0,Not Mentioned,Not Specified,Not Specified,Not Specified,Not Specified,Not Specified
2,Organic Salted Nut Mix,grizzlies,Not Mentioned,Labels are missing,"Organic hazelnuts, organic cashews, organic wa...",unknown,No additives,d,2540.0,57.14,...,0.0,no information,12.0,12.0,Not Mentioned,Not Specified,Not Specified,Not Specified,Not Specified,Not Specified
3,Organic Polenta,bob's red mill,Not Mentioned,Labels are missing,Organic polenta,unknown,No additives,not given,1552.0,1.43,...,0.0,no information,not given,not given,Not Mentioned,Not Specified,Not Specified,Not Specified,Not Specified,Not Specified
4,Breadshop Honey Gone Nuts Granola,unfi,Not Mentioned,Labels are missing,"Rolled oats, grape concentrate, expeller press...",sesame,No additives,not given,1933.0,18.27,...,0.0,no information,not given,not given,Not Mentioned,Not Specified,Not Specified,Not Specified,Not Specified,Not Specified


In [3]:
us_data = pd.read_csv('data/us_data.csv')
us_data.shape

(171521, 98)

In [4]:
from modules.ingredients import clean_ingredients

data['ingredients_clean'] = data['ingredients_text'].apply(clean_ingredients)
us_data['ingredients_clean'] = us_data['ingredients_text'].apply(clean_ingredients)

In [5]:
data = data.drop(columns=['ingredients_text', 'categories_en'])
us_data = us_data.drop(columns=['ingredients_text', 'categories_en'])

In [6]:
data.columns.values

array(['product_name', 'brands', 'labels_en', 'allergens_en',
       'additives_en', 'nutrition_grade_fr', 'energy_100g', 'fat_100g',
       'saturated-fat_100g', '-caprylic-acid_100g', '-capric-acid_100g',
       '-lauric-acid_100g', '-myristic-acid_100g', '-palmitic-acid_100g',
       '-stearic-acid_100g', '-arachidic-acid_100g',
       '-montanic-acid_100g', 'monounsaturated-fat_100g',
       'polyunsaturated-fat_100g', 'omega-3-fat_100g',
       '-alpha-linolenic-acid_100g', '-eicosapentaenoic-acid_100g',
       '-docosahexaenoic-acid_100g', 'omega-6-fat_100g',
       '-linoleic-acid_100g', '-arachidonic-acid_100g',
       '-gamma-linolenic-acid_100g', 'omega-9-fat_100g',
       '-oleic-acid_100g', 'trans-fat_100g', 'cholesterol_100g',
       'carbohydrates_100g', 'sugars_100g', '-sucrose_100g',
       '-glucose_100g', '-fructose_100g', '-lactose_100g',
       '-maltose_100g', '-maltodextrins_100g', 'starch_100g',
       'polyols_100g', 'fiber_100g', 'proteins_100g', 'casein_100g',

In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from fuzzywuzzy import process

In [10]:
# Sample data
ingredients = data['ingredients_clean'].values

# Step 1: Vectorize ingredients using TF-IDF
vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = vectorizer.fit_transform(ingredients)

In [11]:
# Step 2: Define a function to find similar products
def find_similar_products(input_product_name, data, tfidf_matrix, vectorizer, top_n=5):
    # Fuzzy match the input product name
    matches = process.extract(input_product_name, data['product_name'], limit=1)
    if not matches:
        return None  # No match found
    
    matched_product_name, _ = matches[0]
    matched_index = data[data['product_name'] == matched_product_name].index[0]
    
    # Get the ingredients of the matched product
    matched_ingredients = data.loc[matched_index, 'ingredients_clean']
    
    # Compute cosine similarity between the matched product and all others
    matched_vector = vectorizer.transform([matched_ingredients])
    cosine_similarities = cosine_similarity(matched_vector, tfidf_matrix).flatten()
    
    # Rank products based on similarity
    data['similarity'] = cosine_similarities
    data = data.sort_values(by='similarity', ascending=False)
    
    return data.head(top_n)


In [12]:
# Step 3: Define a function to filter allergens and flag additives
def filter_and_flag(data, allergens_en, additives_en):
    # Filter out products containing allergens
    data['contains_allergens'] = data['ingredients_clean'].apply(lambda x: any(allergen in x for allergen in allergens))
    filtered_df = data[~data['contains_allergens']]
    
    # Flag additives
    filtered_df['additives'] = filtered_df['ingredients_clean'].apply(lambda x: [additive for additive in additives if additive in x])
    
    return filtered_df

In [13]:
# Step 4: User input and recommendations
input_product_name = "Banana Chips"  # Example user input
user_allergens = ['peanuts', 'gluten']  # Example user allergens
additives = ['sugar', 'soy sauce', 'natural flavor']  # Example list of additives

In [14]:
# Find similar products
similar_products = find_similar_products(input_product_name, data, tfidf_matrix, vectorizer, top_n=5)

if similar_products is not None:
    # Filter allergens and flag additives
    filtered_products = filter_and_flag(similar_products, user_allergens, additives)
    
    # Display recommendations
    print(f"Top Recommendations for '{input_product_name}':")
    for idx, row in filtered_products.iterrows():
        print(f"Product: {row['product_name']}")
        print(f"Ingredients: {row['ingredients']}")
        print(f"Similarity: {row['similarity']:.4f}")
        print(f"Additives: {row['additives']}")
        print("-" * 40)
else:
    print("No matching product found.")

ValueError: too many values to unpack (expected 2)