In [384]:
# Import libraries and load data

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer


In [385]:

uzbek_food = pd.read_csv("matched_uzb_with_FlavorDB.csv")
eng_food = pd.read_csv("matched_eng_with_FlavorDB.csv")
kor_food = pd.read_csv("matched_kor_with_FlavorDB.csv")


In [386]:
def drop_no_match_rows(df):
    # Dropping rows where 'Matched ingredient' is 'no match' or 'No Match', case insensitive
    return df[~df['Matched ingredient'].str.lower().isin(['no match'])]


# Applying the function to each DataFrame
eng_food = drop_no_match_rows(eng_food)
kor_food = drop_no_match_rows(kor_food)
uzbek_food = drop_no_match_rows(uzbek_food)

In [None]:
uzbek_food

In [None]:
eng_food

In [389]:

# Reading the unique flavor profiles from the .txt file
with open("/content/unique_flavors (2).txt", 'r') as f:
    unique_flavor_profiles = [line.strip() for line in f.readlines()]

print(f"Unique Flavor Profiles: {unique_flavor_profiles}")


vectorizer = TfidfVectorizer(vocabulary=unique_flavor_profiles)
tfidf_matrix = vectorizer.fit_transform(uzbek_food['Flavor profiles'])

tfidf_array = tfidf_matrix.toarray()

# Convert the array to a comma-separated string and add to the 'Tf-idf' column
uzbek_food['Tf-idf'] = ['[' + ', '.join(map(str, row)) + ']' for row in tfidf_array]

# Save to CSV
uzbek_food.to_csv('uzbek_food_with_tfidf.csv', index=False)

uzbek_food


Unique Flavor Profiles: ['absolute', 'acacia', 'acetic', 'acetoin', 'acetone', 'acetophenone', 'acid', 'acidic', 'acrid', 'acrylate', 'acrylic', 'alcohol', 'alcoholic', 'aldehydic', 'alkaline', 'alkane', 'alliaceous', 'allspice', 'almond', 'almond shell', 'amber', 'ambergris', 'amine', 'ammonia', 'ammoniacal', 'angelica', 'animal', 'anise', 'aniseed', 'anisic', 'apple', 'apple peel', 'apple skin', 'apricot', 'aromatic', 'arrack', 'asprin', 'bacon', 'baked', 'balsam', 'balsamic', 'banana', 'banana peel', 'barley', 'basil', 'bay oil', 'bean', 'beany', 'beef', 'beefy', 'beer', 'beet', 'bell', 'benzaldehyde', 'benzyl acetate', 'benzyl propionate', 'bergamot', 'berry', 'biscuit', 'bitter', 'bitter almond', 'black currant', 'black tea', 'blackberry', 'blackcurrant', 'bland', 'bloody', 'blossom', 'blueberry', 'boiled shrimp', 'boiled vegetable', 'bois de rose', 'borneol', 'bouillon', 'box tree', 'brandy', 'bread', 'bread crust', 'bready', 'broccoli', 'broom', 'brown', 'buchu', 'burnt', 'burnt

Unnamed: 0,Russian name,Matched ingredient,Flavor profiles,Tf-idf
0,капуста китайская,cabbage,"sweet, new mown hay, green, tonka, bitter, pru...","[0.0, 0.0, 0.0, 0.0, 0.023854699225454374, 0.0..."
1,мякоть задней ноги барана (окорок),mutton,"lard, oily, fatty, fruity, cheese, caramellic,...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0402087764276..."
2,рыба,fish,"fragrant, bread, woody, sweet, baked, almond, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0332334206331..."
3,зелень кинзы,coriander,"sweet, new mown hay, green, tonka, bitter, her...","[0.0, 0.0, 0.0, 0.0, 0.01811265713607929, 0.0,..."
4,яблоко,apple,"bland, powdery, vanilla, bean, milky, sweet, c...","[0.0, 0.0, 0.0, 0.0, 0.010798475193004066, 0.0..."
...,...,...,...,...
413,мёд,honey,"fragrant, bread, woody, sweet, baked, almond, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0264244462180..."
414,рыжики солёные,mushroom,"caramellic, spicy, peach, vegetable, herbal, a...","[0.0, 0.0, 0.0, 0.0, 0.016837085583335014, 0.0..."
415,брюква,rutabaga,"sweet, new mown hay, green, tonka, bitter, bit...","[0.0, 0.0, 0.03364365694573482, 0.0, 0.0350442..."
416,пшеница,wheat,"bland, sweet, new mown hay, green, tonka, bitt...","[0.0, 0.0, 0.0, 0.0, 0.03480666770255092, 0.0,..."


In [None]:
vectorizer = TfidfVectorizer(vocabulary=unique_flavor_profiles)
tfidf_matrix = vectorizer.fit_transform(eng_food['Flavor profiles'])

tfidf_array = tfidf_matrix.toarray()

# Convert the array to a comma-separated string and add to the 'Tf-idf' column
eng_food['Tf-idf'] = ['[' + ', '.join(map(str, row)) + ']' for row in tfidf_array]

# Save to CSV
eng_food.to_csv('eng_food_with_tfidf.csv', index=False)

eng_food


In [None]:
vectorizer = TfidfVectorizer(vocabulary=unique_flavor_profiles)
tfidf_matrix = vectorizer.fit_transform(kor_food['Flavor profiles'])

tfidf_array = tfidf_matrix.toarray()

# Convert the array to a comma-separated string and add to the 'Tf-idf' column
kor_food['Tf-idf'] = ['[' + ', '.join(map(str, row)) + ']' for row in tfidf_array]

# Save to CSV
kor_food.to_csv('korean_food_with_tfidf.csv', index=False)

kor_food


In [392]:
import numpy as np
import ast

def string_to_list(embedding_str):
    """Converts the string representation of the embeddings into a list."""
    try:
        return ast.literal_eval(embedding_str)
    except (ValueError, SyntaxError):
        return []

# Convert the 'Tf-idf' column into actual lists for easy computation
eng_food['Tf-idf'] = eng_food['Tf-idf'].apply(string_to_list)
uzbek_food['Tf-idf'] = uzbek_food['Tf-idf'].apply(string_to_list)
kor_food['Tf-idf'] = kor_food['Tf-idf'].apply(string_to_list)




In [393]:
uzbek_dishes = pd.read_csv("/content/final_uzb_recipe.csv")
english_dishes = pd.read_csv("/content/final_eng_recipe.csv")
korean_dishes = pd.read_csv("/content/final_kor_recipe.csv")

In [None]:

# Count and print the number of duplicates in korean_traditional_dishes
num_duplicates = korean_dishes.duplicated().sum()
print(f"Number of duplicates in korean_traditional_dishes: {num_duplicates}")

# Drop duplicates from korean_traditional_dishes
korean_dishes = korean_dishes.drop_duplicates()

# Count and print the number of duplicates in korean_traditional_dishes
num_duplicates = english_dishes.duplicated().sum()
print(f"Number of duplicates in korean_traditional_dishes: {num_duplicates}")

# Drop duplicates from korean_traditional_dishes
english_dishes = english_dishes.drop_duplicates()

num_duplicates = uzbek_dishes.duplicated().sum()
print(f"Number of duplicates in uzbek_traditional_dishes: {num_duplicates}")

# Drop duplicates from korean_traditional_dishes
uzbek_dishes = uzbek_dishes.drop_duplicates()


In [395]:
# Create a dictionary for faster lookup with case-insensitive keys
tfidf_russian_dict = {row["Russian name"].lower().strip(): row['Tf-idf'] for _, row in uzbek_food.iterrows()}
tfidf_english_dict = {row["English name"].lower().strip(): row['Tf-idf'] for _, row in eng_food.iterrows()}
tfidf_korean_dict = {row["Korean name"].lower().strip(): row['Tf-idf'] for _, row in kor_food.iterrows()}

def compute_mean_embedding(dish_row, tfidf_dict, column_name):
    ingredients = dish_row[column_name]

    # Check if the ingredients is not a string or list; return None if so
    if not isinstance(ingredients, (str, list)):
        return None

    # Convert ingredient list to a Python list if it's a string
    if isinstance(ingredients, str) and ingredients.startswith("["):
        ingredients = ast.literal_eval(ingredients)
    elif isinstance(ingredients, str):
        ingredients = [ing.strip().lower() for ing in ingredients.split(',')]

    embeddings = [tfidf_dict[ing] for ing in ingredients if ing in tfidf_dict]

    # Calculate the mean embedding across all embeddings for this dish
    if embeddings:
        mean_embedding = np.mean(embeddings, axis=0).tolist()
        return mean_embedding
    return None


def process_dataset(dish_df, tfidf_dict, column_name):
    dish_df['Mean Embedding'] = dish_df.apply(compute_mean_embedding, axis=1, args=(tfidf_dict, column_name))
    return dish_df

# Process each dataset
uzbek_dishes = process_dataset(uzbek_dishes, tfidf_russian_dict, "Ingredients")
english_dishes = process_dataset(english_dishes, tfidf_english_dict, "Ingredients without Quantity")
korean_dishes = process_dataset(korean_dishes, tfidf_korean_dict, "Ingredients_without_quantity")


In [None]:
import pandas as pd

# 1. Rename columns for consistency
korean_dishes = korean_dishes.rename(columns={
    'Name': 'Korean Name',
    'Link to the dish': 'Link',
    'Ingredients': 'Ingredients with Quantity',
    'Ingredients_without_quantity': 'Ingredients without Quantity',
    'Making Steps': 'Making Steps',
    'Mean Embedding': 'Mean Embedding'
})

english_dishes = english_dishes.rename(columns={
    'Link': 'Link',
    'Ingredients with Quantity': 'Ingredients with Quantity',
    'Ingredients without Quantity': 'Ingredients without Quantity',
    'Making Steps': 'Making Steps',
    'Mean Embedding': 'Mean Embedding'
})

# 2. Add columns that don't exist in one of the datasets and fill with NaN
korean_dishes['English Name'] = None

# 3. Concatenate the two datasets
korean_traditional_dishes = pd.concat([korean_dishes, english_dishes], ignore_index=True, sort=False)

# Ensure the new dataset is properly merged
korean_traditional_dishes.head()


In [397]:
len(korean_traditional_dishes)

768

In [398]:
import pandas as pd

# Assuming you have a DataFrame named korean_traditional_dishes
# and you want to delete rows with specific values in the "Korean name" column
values_to_delete = ["보쌈김치", "나박김치", "갓김치", "통마늘장아찌","꽃빵","갈치조림","고등어조림", '다시마볶음','오이동치미','곱창전골','동태전']

# Create a boolean mask to identify rows with the specified values
mask = korean_traditional_dishes["Korean Name"].isin(values_to_delete)

# Use the mask to filter the DataFrame and keep only the rows you want
filtered_df = korean_traditional_dishes[~mask]

In [399]:
korean_traditional_dishes = filtered_df

In [400]:
#Save mean embedding if necessary
#korean_traditional_dishes.to_csv("Mean Embeddings_korean.csv")
#russian_dishes.to_csv("Mean Embeddings_uzbek.csv")

In [401]:
uzbek_dishes = uzbek_dishes.rename(columns={'Ingredients': 'Ingredients without Quantity'})
import ast

# Convert the string representation of ingredients into an actual list
uzbek_dishes['Ingredients without Quantity'] = uzbek_dishes['Ingredients without Quantity'].apply(ast.literal_eval)


In [402]:
uzbek_dishes_copy = uzbek_dishes.copy()
korean_dishes_copy = korean_traditional_dishes.copy()


In [None]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import ast



# Define a function to check if a dish meets the criteria
def meets_criteria(dish_row, n_ingredients=7, n_non_zero=10):
    ingredients = dish_row['Ingredients without Quantity']
    mean_embedding = dish_row['Mean Embedding']

    if not isinstance(ingredients, (list, str)):
        return False
    if isinstance(ingredients, str) and ingredients.startswith("["):
        ingredients = ast.literal_eval(ingredients)
    elif isinstance(ingredients, str):
        ingredients = [ing.strip().lower() for ing in ingredients.split(',')]

    count_ingredients = len(ingredients)
    count_non_zero = np.sum(np.array(mean_embedding) != 0)

    return count_ingredients > n_ingredients and count_non_zero >= n_non_zero

# Filter the dishes that meet the criteria
korean_filtered_dishes = korean_traditional_dishes[korean_traditional_dishes.apply(meets_criteria, axis=1)].copy()  # Using copy to avoid SettingWithCopyWarning
russian_filtered_dishes = uzbek_dishes[uzbek_dishes.apply(meets_criteria, axis=1)].copy()  # Using copy to avoid SettingWithCopyWarning

korean_filtered_dishes['Mean Embedding'] = korean_filtered_dishes['Mean Embedding'].apply(lambda x: np.array(x) if isinstance(x, list) else np.array([]))
russian_filtered_dishes['Mean Embedding'] = russian_filtered_dishes['Mean Embedding'].apply(lambda x: np.array(x) if isinstance(x, list) else np.array([]))

# Extract embeddings as arrays and stack them
korean_embeddings = np.vstack(korean_filtered_dishes['Mean Embedding'])
russian_embeddings = np.vstack(russian_filtered_dishes['Mean Embedding'])

# Calculate cosine similarities
cosine_similarities = cosine_similarity(russian_embeddings, korean_embeddings)
sorted_indices = np.argsort(cosine_similarities, axis=None)

# Extract top 10 most similar and dissimilar pairs
similar_pairs = [np.unravel_index(idx, cosine_similarities.shape) for idx in sorted_indices[-10:]][::-1]
dissimilar_pairs = [np.unravel_index(idx, cosine_similarities.shape) for idx in sorted_indices[:10]]

# Define sets to keep track of printed dish names separately for similar and dissimilar dishes
printed_similar_dishes = set()
printed_dissimilar_dishes = set()


def print_similar_and_dissimilar_dishes(similar_indices, dissimilar_indices, russian_data, korean_data, num_results=10):
    printed_russian_similar = set()
    printed_korean_similar = set()

    printed_russian_dissimilar = set()
    printed_korean_dissimilar = set()

    print("Top 10 Most Similar Dishes:")
    for idx in similar_indices:
        r_idx, k_idx = np.unravel_index(idx, cosine_similarities.shape)
        if len(printed_russian_similar) >= num_results:
            break

        russian_dish_name = russian_data.iloc[r_idx]['Name']
        korean_dish_name = korean_data.iloc[k_idx]['Korean Name']

        if russian_dish_name not in printed_russian_similar and korean_dish_name not in printed_korean_similar:
            similarity = cosine_similarities[r_idx, k_idx]
            print(f"Russian Dish: {russian_dish_name} - Korean Dish: {korean_dish_name} (Similarity: {similarity:.4f})")
            printed_russian_similar.add(russian_dish_name)
            printed_korean_similar.add(korean_dish_name)

    print("\nTop 10 Most Dissimilar Dishes:")
    for idx in dissimilar_indices:
        r_idx, k_idx = np.unravel_index(idx, cosine_similarities.shape)
        if len(printed_russian_dissimilar) >= num_results:
            break

        russian_dish_name = russian_data.iloc[r_idx]['Name']
        korean_dish_name = korean_data.iloc[k_idx]['Korean Name']

        if russian_dish_name not in printed_russian_dissimilar and korean_dish_name not in printed_korean_dissimilar:
            similarity = cosine_similarities[r_idx, k_idx]
            print(f"Russian Dish: {russian_dish_name} - Korean Dish: {korean_dish_name} (Similarity: {similarity:.4f})")
            printed_russian_dissimilar.add(russian_dish_name)
            printed_korean_dissimilar.add(korean_dish_name)

# Extract indices for most similar and dissimilar pairs
similar_indices = np.argsort(cosine_similarities, axis=None)[::-1]  # We need to reverse because argsort sorts in ascending order
dissimilar_indices = np.argsort(cosine_similarities, axis=None)

# Print the similar and dissimilar dishes
print_similar_and_dissimilar_dishes(similar_indices, dissimilar_indices, russian_filtered_dishes, korean_filtered_dishes)

