In [2]:
# %% Cell 1: Import Libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import pickle


In [3]:
# %% Cell 2: Load Datasets
international_cuisine_df = pd.read_csv('RAW_recipes.csv')
african_cuisine_df = pd.read_csv('African_recipes.csv')
nutrition_df = pd.read_csv('nutritions.csv')


# Data Cleaning

In [4]:
# Nutrition Dataset
nutrition_df.drop(columns=['Unnamed: 0'], errors='ignore', inplace=True)
num_cols = ['calories', 'total_fat', 'saturated_fat', 'cholesterol', 'sodium', 'choline', 'folate']
text_cols = ['name', 'serving_size']

#Filling in all the numerical columns with 0 in missing values
nutrition_df[num_cols] = nutrition_df[num_cols].fillna(0)

#Filling in all the text columns with 'unknown' in missing values
nutrition_df[text_cols] = nutrition_df[text_cols].fillna('Unknown')

# Removing the metric 'g' in the serving size column
if 'serving_size' in nutrition_df.columns:
    nutrition_df['serving_size'] = nutrition_df['serving_size'].str.replace('g', '', regex=True).astype(float)
# Removing the metric 'g' in the 'total_fat' and 'saturated_fat' columns
for col in ['total_fat', 'saturated_fat']:
    if col in nutrition_df.columns:
        nutrition_df[col] = nutrition_df[col].replace('g', '', regex=True).astype(float)

# Dropping the missing values in the nutrition dataset
nutrition_df.dropna(inplace=True)

#Dropping the duplicates in the nutrition dataset
nutrition_df.drop_duplicates(inplace=True)


In [5]:
#  African Recipes
# Dropping the missing values
african_cuisine_df.dropna(inplace=True)

# Dropping all the duplicates
african_cuisine_df.drop_duplicates(inplace=True)

# Cleaning the columns 'ingredients' and 'steps' 
for col in ['ingredients', 'steps']:
    if col in african_cuisine_df.columns:
        african_cuisine_df[col] = african_cuisine_df[col].astype(str).str.replace('\n', ',')


In [6]:
#Data Cleaning - International Recipes
# Droppiing all the missing values
international_cuisine_df.dropna(inplace=True)

#Dropping all the duplicate values
international_cuisine_df.drop_duplicates(inplace=True)


# Feature Engineering

In [7]:
#  International Recipes
# Fetching the type of cuisin each recipe is from the tag column
international_cuisine_df['cuisine'] = international_cuisine_df['tags'].apply(
    lambda tag: tag.split()[0].replace('-style', '') if isinstance(tag, str) and '-style' in tag else None
)

# Fetching the type of dietary prefference each recie is according to the tag columns
international_cuisine_df['dietary_preference'] = international_cuisine_df['tags'].apply(
    lambda tag: tag.split()[0].replace('-friendly', '') if isinstance(tag, str) and '-friendly' in tag else None
)

# Creating Nutrition column that will fect informatiion from the 'Nutrition' column in the International dataset
nutr_cols = ['calories', 'total_fat', 'sugar', 'sodium', 'protein', 'saturated_fat', 'carbohydrates']
for col in nutr_cols:
    international_cuisine_df[col] = None
for idx, row in international_cuisine_df.iterrows():
    if isinstance(row['nutrition'], str):
        values = row['nutrition'].strip('[]').split(',')
        for i, col in enumerate(nutr_cols):
            if i < len(values):
                international_cuisine_df.at[idx, col] = float(values[i])
international_cuisine_df.drop(columns=['nutrition'], inplace=True)

# Scaling the new nutrition columns
scaler = MinMaxScaler()
international_cuisine_df[nutr_cols] = scaler.fit_transform(international_cuisine_df[nutr_cols])


In [8]:
# Feature Engineering - African Recipes
# Getting the number of ingridient needed on each recipe
african_cuisine_df['num_ingredients'] = african_cuisine_df['ingredients'].apply(len)

# Scaling the calory column
if 'calories' in african_cuisine_df.columns:
    african_cuisine_df['calories'] = scaler.fit_transform(african_cuisine_df[['calories']])


# Vectorization

In [9]:
# TF-IDF Vectorization for Recommendations
# Create combined text fields for both datasets
african_cuisine_df['text'] = african_cuisine_df['ingredients'] + ' ' + african_cuisine_df['steps']
international_cuisine_df['text'] = international_cuisine_df['ingredients'] + ' ' + international_cuisine_df['steps']

# Initializing the reccomendation system
vectorizer = TfidfVectorizer()

# Fitting the vectorizer on the important column in the important feature i.e 'ingridients' and 'steps'
african_features = vectorizer.fit_transform(african_cuisine_df['text'])
international_features = vectorizer.transform(international_cuisine_df['text'])


In [10]:
# Set User Inputs (Simulated Inline)
user_preferences = "I want a low-calorie meal with chicken"
calories_limit = 100          # For text-based filtering (max calories)
max_cooking_time = 30         # Maximum cooking time in minutes
top_n = 5                     # Number of recommendations
desired_calories_african = 500        # Collaborative filtering target for African recipes
desired_calories_international = 600   # Collaborative filtering target for International recipes


In [11]:
# Convert 'minutes' column to numeric for both datasets (if needed)
african_cuisine_df['minutes'] = pd.to_numeric(african_cuisine_df['minutes'], errors='coerce')
international_cuisine_df['minutes'] = pd.to_numeric(international_cuisine_df['minutes'], errors='coerce')

# Text-Based Recommendations
user_pref_features = vectorizer.transform([user_preferences])

# African recipes text-based recommendation
african_sim = cosine_similarity(user_pref_features, african_features)
african_top_idxs = african_sim.argsort()[0][-top_n:][::-1]
african_recs = african_cuisine_df.loc[
    (african_cuisine_df['calories'] <= calories_limit) &
    (african_cuisine_df['minutes'] <= max_cooking_time) &
    (african_cuisine_df.index.isin(african_top_idxs)),
    ['name', 'ingredients', 'steps']
].reset_index(drop=True)
print("Text‑Based African Recipe Recommendations:")
print(african_recs)

# International recipes text-based recommendation
intl_sim = cosine_similarity(user_pref_features, international_features)
intl_top_idxs = intl_sim.argsort()[0][-top_n:][::-1]
international_recs = international_cuisine_df.loc[
    (international_cuisine_df['minutes'] <= max_cooking_time) &
    (international_cuisine_df.index.isin(intl_top_idxs)),
    ['name', 'ingredients', 'steps']
].reset_index(drop=True)
print("\nText‑Based International Recipe Recommendations:")
print(international_recs)


Text‑Based African Recipe Recommendations:
                           name                     ingredients  \
0  Ugali the kenyan staple food  3 cups water,3 cups maize meal   

                                               steps  
0  Step 1,In a sufuria bring water to boil. Add t...  

Text‑Based International Recipe Recommendations:
                            name  \
0  creamy tomato soup with herbs   
1  skip the machine peach gelato   
2            tom kha kai point 2   

                                         ingredients  \
0  ['olive oil', 'onion', 'garlic', 'diced tomato...   
1                     ['peaches', 'sugar', 'yogurt']   
2  ['broth', 'red pepper', 'mushroom', 'green oni...   

                                               steps  
0  ['heat oil in 4 1 / 2 qt dutch oven or soup po...  
1  ['cut peaches into very small pieces', 'arrang...  
2  ['bring broth to a boil , add chicken', 'while...  


# Collaborative Filtering Reccomendation

In [12]:
# African Recipes
african_collab_df = african_cuisine_df[['name', 'minutes', 'ingredients', 'steps', 'calories']].copy()
african_collab_df['calories_norm'] = scaler.fit_transform(african_collab_df[['calories']])
filtered_african = african_collab_df[
    (african_collab_df['calories'] <= desired_calories_african) &
    (african_collab_df['minutes'] <= max_cooking_time)
]
mean_cal_african = filtered_african['calories'].mean()
mean_cal_norm_african = scaler.transform([[mean_cal_african]])[0][0]
african_collab_sim = cosine_similarity([[mean_cal_norm_african]], african_collab_df[['calories_norm']])[0]
african_collab_top = african_collab_sim.argsort()[-top_n:][::-1]
african_collab_recs = african_collab_df.loc[african_collab_top, ['name', 'minutes', 'ingredients', 'steps', 'calories']].reset_index(drop=True)
print("\nCollaborative Filtering African Recipe Recommendations:")
print(african_collab_recs)

# International Recipes
international_collab_df = international_cuisine_df[['name', 'minutes', 'ingredients', 'steps', 'calories']].copy()
filtered_international = international_collab_df[
    (international_collab_df['calories'] <= desired_calories_international) &
    (international_collab_df['minutes'] <= max_cooking_time)
]
filtered_international['calories_diff'] = abs(filtered_international['calories'] - desired_calories_international)
international_collab_recs = filtered_international.nsmallest(top_n, 'calories_diff')[['name', 'minutes', 'ingredients', 'steps', 'calories']]
print("\nCollaborative Filtering International Recipe Recommendations:")
print(international_collab_recs)



Collaborative Filtering African Recipe Recommendations:
                                             name  minutes  \
0                              Pigeon Peas Mukimo    120.0   
1                                     AFRICAN TEA     20.0   
2  Wukunu (Sweet Potatoes & Dehulled Black Beans)    120.0   
3                  African brewed lemon grass tea     10.0   
4                            African Chicken Stew    180.0   

                                         ingredients  \
0  1.5 Cups Pigeon Peas /Mbaazi,1 Red onion,500 g...   
1                  1 tea spoonful per cup,milk,water   
2  Ingredients,2¼ cups (384 g) black beans, dolic...   
3  1 Cup milk,1 cup water,1/2 tsp Kenyan tea,Some...   
4  1362 g chicken cut pieces,5-6 tomatoes,62.5 g ...   

                                               steps  calories  
0  Step 1,Soak the pigeon peas for 8+ hours then ...  0.858242  
1  Step 1,Put clean water in a pot (sufuria) and ...  0.429670  
2  Preparation 15 minutes | Cooking 2 




Collaborative Filtering International Recipe Recommendations:
                                        name  minutes  \
209453                   tennessee moonshine       20   
165504                powdered hot cocoa mix       10   
183541            seasoned goldfish crackers       30   
187719  sinfully delicious hot chocolate mix        5   
77971        easy sesame tempura green beans       15   

                                              ingredients  \
209453  ['cornmeal', 'natural bran', 'sugar', 'yeast',...   
165504  ['dry milk', 'nestles quik', 'powdered sugar',...   
183541  ['goldfish crackers', 'ranch dressing mix', 'd...   
187719  ['quik chocolate milk mix', 'coffee-mate', 'po...   
77971   ['all-purpose flour', 'sesame seeds', 'beer', ...   

                                                    steps  calories  
209453  ['to boiled cornmeal add yeast and lots of sug...  1.000000  
165504  ['mix all ingredients together in an airtight ...  0.105003  
183541  ['place c

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_international['calories_diff'] = abs(filtered_international['calories'] - desired_calories_international)


# Content Based Reccomendation system

In [13]:
# Split the data into training and test sets
train_data, test_data, train_target, test_target = train_test_split(
    african_cuisine_df['text'], african_cuisine_df['calories'], test_size=0.2, random_state=42
)

# Create and fit a new TF‑IDF vectorizer on the training data
vectorizer_content = TfidfVectorizer()
train_features = vectorizer_content.fit_transform(train_data)

# Train a Linear Regression model
model = LinearRegression()
model.fit(train_features, train_target)

# Evaluate on the test set
test_features = vectorizer_content.transform(test_data)
predictions = model.predict(test_features)
mse = mean_squared_error(test_target, predictions)
print(f"\nContent-Based Model Mean Squared Error (MSE): {mse}")



Content-Based Model Mean Squared Error (MSE): 0.12138262894735313


In [14]:
#Save the Model and Vectorizer
with open('content_user_model.pkl', 'wb') as f:
    pickle.dump(model, f)
with open('tfidf_vectorizer.pkl', 'wb') as f:
    pickle.dump(vectorizer_content, f)


In [15]:
international_cuisine_df.to_csv('cleaned_International_recipies.csv', index=False)

In [16]:
african_cuisine_df.to_csv('cleaned_African_recipies.csv', index=False)