In [36]:
import pandas as pd
from ast import literal_eval
from collections import defaultdict

# Step 1: Read the CSV data into a DataFrame
df = pd.read_csv('newdataset/RAW_merged_top_smallest.csv')

# Step 2: Convert strings representing lists to actual lists
df['ingredients'] = df['ingredients'].apply(literal_eval)
df['techniques_list'] = df['techniques_list'].apply(literal_eval)

# Step 3: Initialize dictionaries to store counts
ingredient_counts = defaultdict(int)
technique_counts = defaultdict(int)
ingredient_technique_counts = defaultdict(lambda: defaultdict(int))

# Step 4: Iterate through each row and update counts
for idx, row in df.iterrows():
    ingredients = row['ingredients']
    techniques = row['techniques_list']
    
    for ingredient in ingredients:
        ingredient_counts[ingredient] += 1
        for technique in techniques:
            technique_counts[technique] += 1
            ingredient_technique_counts[ingredient][technique] += 1

# Step 5: Calculate correlations (e.g., using Pearson correlation coefficient)
# Here's a simplified example of how you might calculate a correlation score:
correlation_scores = defaultdict(lambda: defaultdict(float))

for ingredient in ingredient_counts:
    for technique in technique_counts:
        if ingredient_technique_counts[ingredient][technique] > 0:
            observed = ingredient_technique_counts[ingredient][technique]
            expected = (ingredient_counts[ingredient] * technique_counts[technique]) / len(df)
            correlation_scores[ingredient][technique] = (observed - expected) / (expected + 1)

# Step 6: Output or further analyze correlation scores as needed
for ingredient in correlation_scores:
    for technique in correlation_scores[ingredient]:
        print(f"Correlation between '{ingredient}' and '{technique}': {correlation_scores[ingredient][technique]}")

Correlation between 'roma tomatoes' and 'boil': -0.8567027810628314
Correlation between 'roma tomatoes' and 'grill': -0.5063350925108322
Correlation between 'roma tomatoes' and 'marinate': -0.18284102110880723
Correlation between 'roma tomatoes' and 'simmer': -0.7997856829551322
Correlation between 'roma tomatoes' and 'toss': -0.6851095043487373
Correlation between 'roma tomatoes' and 'bake': -0.9242021041798567
Correlation between 'roma tomatoes' and 'blend': -0.9178653182548089
Correlation between 'roma tomatoes' and 'combine': -0.9106911995910519
Correlation between 'roma tomatoes' and 'pour': -0.953490711684975
Correlation between 'roma tomatoes' and 'dice': -0.5181978515311849
Correlation between 'roma tomatoes' and 'broil': -0.1421944692239072
Correlation between 'roma tomatoes' and 'melt': -0.7729112912802686
Correlation between 'roma tomatoes' and 'toast': -0.3410569402821431
Correlation between 'roma tomatoes' and 'smooth': -0.8225114191211215
Correlation between 'roma tomatoe

In [77]:
# Step 6: Normalize correlation scores
min_score = float('inf')
max_score = float('-inf')

# Find the min and max scores
for ingredient in correlation_scores:
    for technique in correlation_scores[ingredient]:
        score = correlation_scores[ingredient][technique]
        if score < min_score:
            min_score = score
        if score > max_score:
            max_score = score

# Normalize the scores
for ingredient in correlation_scores:
    for technique in correlation_scores[ingredient]:
        score = correlation_scores[ingredient][technique]
        normalized_score = (score - min_score) / (max_score - min_score)
        correlation_scores[ingredient][technique] = normalized_score

# Step 7: Predict techniques based on input ingredients
input_ingredients = ['spaghetti', 'eggs', 'pancetta', 'Parmesan', 'cheese', 'black', 'pepper', 'salt']

technique_scores = defaultdict(float)

for ingredient in input_ingredients:
    if ingredient in correlation_scores:
        for technique, score in correlation_scores[ingredient].items():
            technique_scores[technique] += score
            
            
print(technique_scores.items())

# Step 8: Get the top predicted techniques
top_techniques = sorted(technique_scores.items(), key=lambda x: x[1], reverse=True)[:3]

print("Top 3 predicted techniques:")
for technique, score in top_techniques:
    print(f"- {technique} (Score: {score})")

dict_items([('boil', 0.16225292071402192), ('marinate', 0.06655258988378195), ('simmer', 0.14360809058263632), ('toss', 0.22771882176704114), ('bake', 0.08840205431055065), ('combine', 0.07196042294196753), ('pour', 0.1271748182354368), ('refrigerate', 0.06770763286318202), ('dice', 0.08018565899327149), ('drain', 0.2517982869543794), ('melt', 0.08952352479905289), ('smooth', 0.07496402236559539), ('thicken', 0.20767635983529012), ('saute', 0.15942778898363322), ('skillet', 0.17283071716709103), ('whisk', 0.09536996537836946), ('grate', 0.34963571108338487), ('crush', 0.301754442750984), ('shred', 0.1918303458530303), ('fry', 0.09388136855178747), ('scramble', 0.41526157040098965), ('microwave', 0.13408761697617583), ('freeze', 0.12525006497792043), ('slow cook', 0.1184239153882685), ('blanch', 0.26861072893800153), ('grill', 0.04596196919783274), ('blend', 0.05524420061506939), ('broil', 0.059829596200088216), ('toast', 0.08606016168294292), ('roast', 0.051718246489807176), ('mash', 0

In [74]:
import pandas as pd
import ast


# Read the CSV into a pandas DataFrame
df = pd.read_csv('newdataset/RAW_merged_top_smallest.csv')

# Convert string representations of lists to actual lists
df['ingredients'] = df['ingredients'].apply(ast.literal_eval)
df['techniques_list'] = df['techniques_list'].apply(ast.literal_eval)

# Explode the lists into separate rows for each ingredient-technique pair
df_exploded = df.explode('ingredients').explode('techniques_list')


# Calculate frequency of co-occurrence of each ingredient and technique
ingredient_technique_counts = df_exploded.groupby(['ingredients', 'techniques_list']).size().reset_index(name='count')

# Pivot the table for easier correlation calculation
pivot_table = ingredient_technique_counts.pivot(index='ingredients', columns='techniques_list', values='count').fillna(0)

    
print(pivot_table.head())



def predict_techniques(input_ingredients, pivot_table, top_n=3):
    techniques_scores = pivot_table.loc[input_ingredients].sum().sort_values(ascending=False)
    top_techniques = techniques_scores.head(top_n).index.tolist()
    return top_techniques

# Example usage:
input_ingredients = ['tomatoes', 'cheese', 'basil', 'olive', 'oil', 'vinegar', 'salt', 'pepper']
predicted_techniques = predict_techniques(input_ingredients, pivot_table)
print(f"Predicted techniques for {input_ingredients}: {predicted_techniques}")

techniques_list          bake  barbecue  blanch  blend  boil  braise  brine  \
ingredients                                                                   
1% low-fat milk           4.0       0.0     0.0    2.0   2.0     0.0    0.0   
10-inch flour tortillas   2.0       0.0     0.0    0.0   1.0     0.0    0.0   
10-minute success rice    1.0       0.0     0.0    0.0   0.0     0.0    0.0   
12-inch flour tortillas   0.0       0.0     0.0    0.0   0.0     0.0    0.0   
12-inch pizza crust       1.0       0.0     0.0    0.0   1.0     0.0    0.0   

techniques_list          broil  caramelize  combine  ...  soak  steam  stew  \
ingredients                                          ...                      
1% low-fat milk            0.0         0.0      4.0  ...   0.0    0.0   0.0   
10-inch flour tortillas    0.0         0.0      3.0  ...   0.0    0.0   0.0   
10-minute success rice     0.0         0.0      0.0  ...   0.0    0.0   0.0   
12-inch flour tortillas    0.0         0.0      1.0

In [72]:
import pandas as pd
from ast import literal_eval
from collections import defaultdict
import nltk
from nltk.corpus import wordnet as wn
from nltk.tokenize import word_tokenize
import itertools

# Download NLTK resources if not already downloaded
nltk.download('wordnet')

# Step 1: Read the CSV data into a DataFrame
df = pd.read_csv('newdataset/RAW_merged_top_smallest.csv')

# Step 2: Convert strings representing lists to actual lists
df['ingredients'] = df['ingredients'].apply(literal_eval)
df['techniques_list'] = df['techniques_list'].apply(literal_eval)

# Step 3: Initialize dictionaries to store counts
ingredient_counts = defaultdict(int)
technique_counts = defaultdict(int)
ingredient_technique_counts = defaultdict(lambda: defaultdict(int))

# Step 4: Iterate through each row and update counts
for idx, row in df.iterrows():
    ingredients = row['ingredients']
    techniques = row['techniques_list']
    
    for ingredient in ingredients:
        ingredient_counts[ingredient] += 1
        for technique in techniques:
            technique_counts[technique] += 1
            ingredient_technique_counts[ingredient][technique] += 1

# Step 5: Calculate correlations (Pearson correlation coefficient)
correlation_scores = defaultdict(lambda: defaultdict(float))

for ingredient in ingredient_counts:
    for technique in technique_counts:
        if ingredient_technique_counts[ingredient][technique] > 0:
            observed = ingredient_technique_counts[ingredient][technique]
            expected = (ingredient_counts[ingredient] * technique_counts[technique]) / len(df)
            correlation_scores[ingredient][technique] = (observed - expected) / (expected + 1)

# Step 6: Normalize correlation scores
min_score = float('inf')
max_score = float('-inf')

# Find the min and max scores
for ingredient in correlation_scores:
    for technique in correlation_scores[ingredient]:
        score = correlation_scores[ingredient][technique]
        if score < min_score:
            min_score = score
        if score > max_score:
            max_score = score

# Normalize the scores
for ingredient in correlation_scores:
    for technique in correlation_scores[ingredient]:
        score = correlation_scores[ingredient][technique]
        normalized_score = (score - min_score) / (max_score - min_score)
        correlation_scores[ingredient][technique] = normalized_score

# Step 7: Predict techniques based on input ingredients using NLTK for similarity

# Example input ingredients
input_ingredients = ['tomatoes', 'mozzarella', 'cheese', 'fresh', 'basil', 'olive', 'oil', 'balsamic', 'vinegar', 'salt', 'pepper']

# Function to calculate WordNet similarity between two words
def wordnet_similarity(word1, word2):
    synsets1 = wn.synsets(word1)
    synsets2 = wn.synsets(word2)
    if synsets1 and synsets2:
        return max(s1.path_similarity(s2) or 0 for s1, s2 in itertools.product(synsets1, synsets2))
    else:
        return 0
    
# Threshold for WordNet similarity score
similarity_threshold = 0.5

# Find techniques associated with input ingredients
technique_scores = defaultdict(float)

for ingredient in input_ingredients:
    for dataset_ingredient in ingredient_counts:
        similarity_score = wordnet_similarity(ingredient, dataset_ingredient)
        if similarity_score > similarity_threshold:
            for technique, score in correlation_scores[dataset_ingredient].items():
                technique_scores[technique] += score * similarity_score

# Step 8: Get the top predicted techniques
top_techniques = sorted(technique_scores.items(), key=lambda x: x[1], reverse=True)[:3]

print("Top 3 predicted techniques:")
for technique, score in top_techniques:
    print(f"- {technique} (Score: {score})")

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/matteorigat/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Top 3 predicted techniques:
- strain (Score: 0.7794909072059331)
- scald (Score: 0.4473543079776914)
- marinate (Score: 0.4017217073895403)
