In [1]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt


In [2]:
data = pd.read_csv('data/cleaned_data.csv')
data.head()

Unnamed: 0,product_name,brands,categories_en,labels_en,ingredients_text,allergens_en,additives_en,nutrition_grade_fr,energy_100g,fat_100g,...,cocoa_100g,carbon-footprint_100g,nutrition-score-fr_100g,nutrition-score-uk_100g,category_level_1,category_level_2,category_level_3,category_level_4,category_level_5,category_level_6
0,Banana Chips Sweetened (Whole),not mentioned,Not Mentioned,Labels are missing,"Bananas, vegetable oil (coconut oil, corn oil ...",unknown,No additives,d,2243.0,28.57,...,0.0,no information,14.0,14.0,Not Mentioned,Not Specified,Not Specified,Not Specified,Not Specified,Not Specified
1,Peanuts,torn & glasser,Not Mentioned,Labels are missing,"Peanuts, wheat flour, sugar, rice flour, tapio...","soy, wheat, peanuts",No additives,b,1941.0,17.86,...,0.0,no information,0.0,0.0,Not Mentioned,Not Specified,Not Specified,Not Specified,Not Specified,Not Specified
2,Organic Salted Nut Mix,grizzlies,Not Mentioned,Labels are missing,"Organic hazelnuts, organic cashews, organic wa...",unknown,No additives,d,2540.0,57.14,...,0.0,no information,12.0,12.0,Not Mentioned,Not Specified,Not Specified,Not Specified,Not Specified,Not Specified
3,Organic Polenta,bob's red mill,Not Mentioned,Labels are missing,Organic polenta,unknown,No additives,not given,1552.0,1.43,...,0.0,no information,not given,not given,Not Mentioned,Not Specified,Not Specified,Not Specified,Not Specified,Not Specified
4,Breadshop Honey Gone Nuts Granola,unfi,Not Mentioned,Labels are missing,"Rolled oats, grape concentrate, expeller press...",sesame,No additives,not given,1933.0,18.27,...,0.0,no information,not given,not given,Not Mentioned,Not Specified,Not Specified,Not Specified,Not Specified,Not Specified


In [3]:
us_data = pd.read_csv('data/us_data.csv')
us_data.shape

(171521, 98)

In [4]:
from modules.ingredients import clean_ingredients

data['ingredients'] = data['ingredients_text'].apply(clean_ingredients)
us_data['ingredients'] = us_data['ingredients_text'].apply(clean_ingredients)

In [5]:
data = data.drop(columns=['ingredients_text', 'categories_en'])
us_data = us_data.drop(columns=['ingredients_text', 'categories_en'])

### Pre-processing 

In [6]:
# Clean and standardize text columns
def clean_text(text):
    if isinstance(text, str):
        return text.lower().strip()
    return text

data['product_name'] = data['product_name'].apply(clean_text)
data['ingredients'] = data['ingredients'].apply(clean_text)
data['allergens_en'] = data['allergens_en'].apply(clean_text)
data['category_level_1'] = data['category_level_1'].apply(clean_text)
data['category_level_2'] = data['category_level_2'].apply(clean_text)

In [7]:
# Replace placeholders with NaN or empty lists
data['allergens_en'] = data['allergens_en'].replace('unknown', np.nan)
data['ingredients'] = data['ingredients'].replace('ingredients are missing', np.nan)

In [8]:
# Split columns into lists
data['ingredients'] = data['ingredients'].str.split(', ')
data['allergens_en'] = data['allergens_en'].str.split(', ')

In [9]:
# Create binary flags for allergens
allergens = data.allergens_en.explode().unique()

for allergen in allergens:
    data[f'contains_{allergen}'] = data['allergens_en'].apply(lambda x: allergen in x if isinstance(x, list) else False)

### Implement product matching

In [10]:
from fuzzywuzzy import process

# Sample user input
user_input = "banana crisps"

# Function to find closest matches
def find_closest_match(user_input, choices, limit=5):
    matches = process.extract(user_input, choices, limit=limit)
    return matches

# Get closest matches
product_names = data['product_name'].tolist()
matches = find_closest_match(user_input, product_names)

print("Top matches for user input:", matches)

Top matches for user input: [('banana crisps', 100), ('crisps', 90), ('crisps', 90), ('crisps', 90), ('banana', 90)]


Approach 1 

In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Convert ingredients list back to strings for TF-IDF
data['ingredients_str'] = data['ingredients'].apply(lambda x: ', '.join(x) if isinstance(x, list) else '')

# Create TF-IDF matrix
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(data['ingredients_str'])

In [12]:
# Function to get similar products for a subset
def get_similar_products(product_name, tfidf_matrix, df, top_n=5):
    # Get the index of the product
    idx = df[df['product_name'] == product_name].index[0]
    
    # Compute cosine similarity between the query product and all others
    cosine_sim = cosine_similarity(tfidf_matrix[idx:idx+1], tfidf_matrix).flatten()
    
    # Get top N similar products
    sim_scores = list(enumerate(cosine_sim))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:top_n+1]  # Exclude the product itself
    product_indices = [i[0] for i in sim_scores]
    
    return df.iloc[product_indices]

In [13]:
def filter_by_allergens(recommendations, allergens_to_avoid):
    # Check if the product contains any of the allergens to avoid
    for allergen in allergens_to_avoid:
        if f'contains_{allergen}' in recommendations.columns:
            recommendations = recommendations[~recommendations[f'contains_{allergen}']]
    return recommendations

In [14]:
def recommend_products(user_input, allergens_to_avoid, df, tfidf_matrix, top_n=5):
    # Step 1: Find closest match
    matches = find_closest_match(user_input, df['product_name'].tolist(), limit=1)
    closest_match = matches[0][0]  # Get the top match
    
    # Step 2: Get category of the closest match
    closest_match_categories = df[df['product_name'] == closest_match][['category_level_1', 'category_level_2']].values[0]
    
    # Step 3: Filter products in the same category
    same_category_products = df[
        (df['category_level_1'] == closest_match_categories[0]) & 
        (df['category_level_2'] == closest_match_categories[1])
    ]
    
    # Step 4: Get similar products within the same category
    if not same_category_products.empty:
        # Get indices of products in the same category
        same_category_indices = same_category_products.index
        
        # Compute similarity scores for products in the same category
        query_index = df[df['product_name'] == closest_match].index[0]
        cosine_sim = cosine_similarity(tfidf_matrix[query_index:query_index+1], tfidf_matrix[same_category_indices]).flatten()
        
        # Get top N similar products
        sim_scores = list(enumerate(cosine_sim))
        sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
        sim_scores = sim_scores[1:top_n+1]  # Exclude the product itself
        product_indices = [same_category_indices[i[0]] for i in sim_scores]
        
        recommendations = df.iloc[product_indices]
    else:
        # If no products in the same category, fall back to global similarity
        recommendations = get_similar_products(closest_match, tfidf_matrix, df, top_n)
    
    # Step 5: Filter by allergens
    recommendations = filter_by_allergens(recommendations, allergens_to_avoid)
    
    return recommendations

# Example: User inputs "banana crisps" and wants to avoid "soy" and "peanuts"
user_input = "banana crisps"
allergens_to_avoid = ['soy', 'peanuts']
recommendations = recommend_products(user_input, allergens_to_avoid, data, tfidf_matrix)

print("Final recommendations:")
print(recommendations[['product_name', 'ingredients', 'category_level_1', 'category_level_2']])

Final recommendations:
                                             product_name  \
115754  nud, organic paleo vegan raw snacks, cacao banana   
115753                            spirulina banana crisps   
139722  dr. mcdougall's, organic oatmeal hot cereal, c...   
139721                     organic might omega hot cereal   

                                              ingredients category_level_1  \
115754  [organic fresh banana, organic sesame seed, or...    not mentioned   
115753  [organic fresh banana, organic sesame seed, or...    not mentioned   
139722  [organic thick cut whole grain oats, organic s...    not mentioned   
139721  [organic thick cut whole grain oats, organic s...    not mentioned   

       category_level_2  
115754    not specified  
115753    not specified  
139722    not specified  
139721    not specified  
