In [7]:
import numpy as np
import pandas as pd
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
# Load the food table; its nutrient columns form the feature space for the neighbour search.
file_path = 'foodrecmergedallergen.xlsx'
data = pd.read_excel(file_path)

nutrient_columns = [
    'Calories', 'Fats', 'Proteins', 'Iron', 'Calcium', 'Sodium',
    'Potassium', 'Carbohydrates', 'Fibre', 'VitaminD', 'Sugars',
]

# Standardise each nutrient so that no single column dominates the Euclidean distance.
scaler = StandardScaler()
X_numerical = scaler.fit_transform(data[nutrient_columns])

# Index the standardised rows; each query returns the 9 closest rows of `data`.
knn = NearestNeighbors(n_neighbors=9, metric='euclidean').fit(X_numerical)


def recommend_recipes(input_features, user_allergy):
    """Return the foods closest to a nutrient profile, minus those flagged for an allergen.

    Uses the module-level ``scaler``, ``knn`` and ``data`` created earlier in this cell.
    ``knn`` was fitted on the rows of ``data`` in order, so the neighbour indices it
    returns are valid positions for ``data.iloc``.

    Parameters
    ----------
    input_features : sequence of numbers
        Target nutrient values, in the same column order the scaler was fitted on.
    user_allergy : str
        Allergen name to exclude. It is matched as a literal, case-insensitive
        substring of ``Food_allergy``. An empty value disables the filter.

    Returns
    -------
    pandas.DataFrame
        One column, ``Food_items``, ordered from nearest to farthest neighbour.
    """
    # Scale the query with the statistics learned from the dataset.
    input_features_scaled = scaler.transform([input_features])

    # Neighbour indices are positions into `data`.
    _, indices = knn.kneighbors(input_features_scaled)
    recommendations = data.iloc[indices[0]]

    # An empty pattern matches every string and would discard all rows, so skip it.
    if user_allergy:
        # regex=False: labels such as "Nuts (Cashews)" contain regex metacharacters
        # and must be compared literally. Rows without an allergen label are kept.
        has_allergen = recommendations['Food_allergy'].str.contains(
            user_allergy, case=False, na=False, regex=False
        )
        recommendations = recommendations[~has_allergen]
    return recommendations[['Food_items']]
# Example query: one target value per nutrient, in the scaler's column order.
input_features = [540, 16, 10, 400, 10, 7, 90, 80, 10, 2, 90]
# Nearest foods with gluten-flagged items removed; the bare name displays the frame.
recommendations = recommend_recipes(input_features, user_allergy='Gluten')
recommendations
# this is good



Unnamed: 0,Food_items
187,Macroni n Cheese
328,Uttapam
75,Cereals-Corn Flakes
169,Idli
41,Berries
73,Cashew Nuts
251,Protein Powder
82,Chia seeds


In [8]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from imblearn.over_sampling import SMOTE

# Load the dataset
file_path = 'foodrecmergedallergen.xlsx'
recipe_df = pd.read_excel(file_path)

# Example User Allergy List
user_allergies = ['Gluten']  # Replace with actual user allergies

# Label rows with no recorded allergen as 'Unknown'. The result is assigned back to the
# column: fillna(..., inplace=True) on a column selection acts on an intermediate object,
# which pandas warns about and which no longer updates the frame in pandas 3.0.
recipe_df['Food_allergy'] = recipe_df['Food_allergy'].fillna('Unknown')

# TF-IDF features from the food names
vectorizer = TfidfVectorizer()
X_ingredients = vectorizer.fit_transform(recipe_df['Food_items'])

# Normalize Numerical Features
scaler = StandardScaler()
X_numerical = scaler.fit_transform(recipe_df[['Calories', 'Fats', 'Proteins', 'Iron', 'Calcium', 'Sodium', 'Potassium', 'Carbohydrates', 'Fibre', 'VitaminD', 'Sugars']])

# Combine Features: standardised nutrients followed by the dense TF-IDF columns
X_combined = np.hstack([X_numerical, X_ingredients.toarray()])
y = recipe_df['Food_allergy']

# Split the data into training and testing sets
# NOTE(review): the scaler and vectorizer above are fitted on all rows before this split,
# so the test rows influence the scaling statistics and the vocabulary.
X_train, X_test, y_train, y_test = train_test_split(X_combined, y, test_size=0.2, random_state=42)
# smote = SMOTE(sampling_strategy='auto', random_state=42)
# X_train_res, y_train_res = smote.fit_resample(X_train, y_train)

# Train KNN Classifier Model
knn_classifier = KNeighborsClassifier(n_neighbors=10, metric='euclidean') #57%accuracy
#knn_classifier = KNeighborsClassifier(n_neighbors=5, metric='manhattan', weights='distance') 63% accuracy

knn_classifier.fit(X_train, y_train)

# Predict on the test set
knn_predictions = knn_classifier.predict(X_test)

# Calculate accuracy
knn_accuracy = accuracy_score(y_test, knn_predictions)
print(f"KNN Model Accuracy: {knn_accuracy:.2f}")


KNN Model Accuracy: 0.57


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  recipe_df['Food_allergy'].fillna('Unknown', inplace=True)


In [9]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from imblearn.over_sampling import SMOTE
from sklearn.decomposition import PCA

# Resampling-aware pipeline, imported here because only this evaluation needs it.
from imblearn.pipeline import Pipeline as ImbPipeline

# Load the dataset
file_path = 'foodrecmergedallergen.xlsx'
recipe_df = pd.read_excel(file_path)

# Example User Allergy List
user_allergies = ['Gluten']  # Replace with actual user allergies

# Fill NaN values in 'Food_allergy' with 'Unknown'
recipe_df['Food_allergy'] = recipe_df['Food_allergy'].fillna('Unknown')

# Check the class distribution
print(recipe_df['Food_allergy'].value_counts())

# Remove classes with very few samples (e.g., less than 5)
min_class_count = 5  # Set the threshold for the minimum number of samples per class
class_counts = recipe_df['Food_allergy'].value_counts()
minority_classes = class_counts[class_counts < min_class_count].index
recipe_df = recipe_df[~recipe_df['Food_allergy'].isin(minority_classes)]

# TF-IDF features from the food names
vectorizer = TfidfVectorizer()
X_ingredients = vectorizer.fit_transform(recipe_df['Food_items'])

# Normalize Numerical Features
scaler = StandardScaler()
X_numerical = scaler.fit_transform(recipe_df[['Calories', 'Fats', 'Proteins', 'Iron', 'Calcium', 'Sodium', 'Potassium', 'Carbohydrates', 'Fibre', 'VitaminD', 'Sugars']])

# Combine Features: one row per remaining recipe, in recipe_df order
X_combined = np.hstack([X_numerical, X_ingredients.toarray()])
y = recipe_df['Food_allergy']

# Hold out the test rows BEFORE resampling. Oversampling first would put synthetic
# points interpolated from test rows into the training set and inflate the score.
# stratify=y keeps every class on both sides of the split so SMOTE has enough
# neighbours in the training part.
X_train_raw, X_test_raw, y_train_raw, y_test = train_test_split(
    X_combined, y, test_size=0.2, random_state=42, stratify=y
)

# Balance the classes using training rows only
smote = SMOTE(sampling_strategy='auto', random_state=42, k_neighbors=3)
X_resampled, y_resampled = smote.fit_resample(X_train_raw, y_train_raw)

# Fit PCA (95% of the variance) on the training rows; the test rows are only projected
pca = PCA(n_components=0.95)
X_resampled = pca.fit_transform(X_resampled)
X_train, y_train = X_resampled, y_resampled
X_test = pca.transform(X_test_raw)

# Train KNN Classifier Model
knn_classifier = KNeighborsClassifier(n_neighbors=5, metric='manhattan', weights='distance')
knn_classifier.fit(X_train, y_train)

# Predict on the untouched test rows
knn_predictions = knn_classifier.predict(X_test)

# Calculate accuracy
knn_accuracy = accuracy_score(y_test, knn_predictions)
print(f"KNN Model Accuracy: {knn_accuracy:.2f}")

# Cross-validate on the original rows. The pipeline re-fits SMOTE and PCA inside each
# training fold, so every validation fold contains real, unseen recipes only.
cv_pipeline = ImbPipeline([
    ('smote', SMOTE(sampling_strategy='auto', random_state=42, k_neighbors=3)),
    ('pca', PCA(n_components=0.95)),
    ('knn', KNeighborsClassifier(n_neighbors=5, metric='manhattan', weights='distance')),
])
cv_scores = cross_val_score(cv_pipeline, X_combined, y, cv=5, scoring='accuracy')
print(f"Cross-validation scores: {cv_scores}")
print(f"Mean cross-validation accuracy: {cv_scores.mean():.2f}")

def recommend_recipes(input_features, user_allergy, scaler, vectorizer, knn_classifier, pca, recipe_df):
    """Recommend the recipes in ``recipe_df`` closest to a nutrient profile.

    The query and every recipe are projected through the same scaler, TF-IDF and PCA
    steps, and the recipes are ranked by distance to the query. Neighbours are NOT
    looked up with ``knn_classifier.kneighbors``: that returns row numbers of the
    matrix the classifier was fitted on, which are not positions in ``recipe_df``.

    Parameters
    ----------
    input_features : sequence of numbers
        Target nutrient values in the scaler's column order.
    user_allergy : str
        Allergen to exclude, matched as a literal, case-insensitive substring of
        ``Food_allergy``. An empty value disables the filter.
    scaler, vectorizer, pca
        Fitted transformers exposing ``transform``.
    knn_classifier
        Only its ``n_neighbors`` and ``metric`` settings are read; they give the
        number of candidates and the distance used ('manhattan', else Euclidean).
    recipe_df : pandas.DataFrame
        Recipes with the nutrient columns, ``Food_items`` and ``Food_allergy``.

    Returns
    -------
    pandas.DataFrame
        ``Food_items`` and ``Food_allergy`` of the surviving candidates, nearest first.
    """
    nutrient_columns = ['Calories', 'Fats', 'Proteins', 'Iron', 'Calcium', 'Sodium',
                        'Potassium', 'Carbohydrates', 'Fibre', 'VitaminD', 'Sugars']

    # Nothing to rank: return an empty frame with the usual columns.
    if len(recipe_df) == 0:
        return recipe_df[['Food_items', 'Food_allergy']]

    # Query point: scaled nutrients plus the TF-IDF row of an empty name (no food text given).
    nutritional_input = np.asarray(input_features, dtype=float).reshape(1, -1)
    nutritional_input_scaled = scaler.transform(nutritional_input)
    ingredients_input = vectorizer.transform(['']).toarray()
    query_point = pca.transform(np.hstack([nutritional_input_scaled, ingredients_input]))

    # Recipe points: the same transformations applied to each row, so row i of
    # `recipe_points` is row i of `recipe_df`.
    recipe_numeric = scaler.transform(recipe_df[nutrient_columns])
    recipe_text = vectorizer.transform(recipe_df['Food_items']).toarray()
    recipe_points = pca.transform(np.hstack([recipe_numeric, recipe_text]))

    # Rank recipes with the distance the classifier is configured for.
    offsets = recipe_points - query_point
    if getattr(knn_classifier, 'metric', 'euclidean') in ('manhattan', 'cityblock', 'l1'):
        distances = np.abs(offsets).sum(axis=1)
    else:
        distances = np.sqrt((offsets ** 2).sum(axis=1))

    n_candidates = min(getattr(knn_classifier, 'n_neighbors', 5), len(recipe_df))
    nearest_positions = np.argsort(distances, kind='stable')[:n_candidates]
    recommendations = recipe_df.iloc[nearest_positions]

    # An empty pattern matches every string and would discard all rows, so skip it.
    if user_allergy:
        # regex=False: labels like "Dairy, Gluten (pasta)" contain regex metacharacters.
        has_allergen = recommendations['Food_allergy'].str.contains(
            user_allergy, case=False, na=False, regex=False
        )
        recommendations = recommendations[~has_allergen]

    return recommendations[['Food_items', 'Food_allergy']]

# Alternative query kept for reference:
# input_features = [500, 160, 1, 300, 110, 679, 8, 8, 1, 2, 90]  # Nutritional features (without ingredients)
# user_allergy = 'Dairy'  # Allergy input from the user
input_features = [540, 16, 10, 400, 10, 7, 90, 80, 10, 2, 90]  # one value per nutrient column
user_allergy = 'Gluten'  # allergen to exclude

# Uses the transformers and classifier fitted earlier in this cell.
recommendations = recommend_recipes(
    input_features,
    user_allergy,
    scaler=scaler,
    vectorizer=vectorizer,
    knn_classifier=knn_classifier,
    pca=pca,
    recipe_df=recipe_df,
)
print(recommendations)


#KNN MODEL 


Food_allergy
Unknown                                  172
Dairy                                     53
Gluten                                    38
Dairy, Gluten                             33
Soy                                        7
Shellfish                                  6
Gluten, Dairy                              4
Dairy, Gluten (pasta)                      3
Fish                                       3
Nuts (if pesto contains nuts)              2
Nuts (if pesto contains nuts), Gluten      2
Dairy, Gluten (wrap)                       2
Nuts                                       1
Nuts                                       1
Nuts (Cashews)                             1
Dairy, Gluten (noodles)                    1
Garlic                                     1
Dairy, Nuts (if cashews used)              1
Dairy, Gluten (croutons)                   1
Fish, Gluten                               1
Eggs, Dairy, Gluten (bread)                1
Dairy, Gluten (crust)                     



In [10]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from imblearn.over_sampling import SMOTE
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier  # Import Random Forest

# accuracy_score is used below but is missing from this cell's import list; import it
# here so the cell does not depend on an earlier cell having been run.
from sklearn.metrics import accuracy_score

# Load the dataset
file_path = 'foodrecmergedallergen.xlsx'
recipe_df = pd.read_excel(file_path)

# Example User Allergy List
user_allergies = ['Gluten']  # Replace with actual user allergies

# Fill NaN values in 'Food_allergy' with 'Unknown'
recipe_df['Food_allergy'] = recipe_df['Food_allergy'].fillna('Unknown')

# Check the class distribution
print(recipe_df['Food_allergy'].value_counts())

# Remove classes with very few samples (e.g., less than 5)
min_class_count = 5  # Set the threshold for the minimum number of samples per class
class_counts = recipe_df['Food_allergy'].value_counts()
minority_classes = class_counts[class_counts < min_class_count].index
recipe_df = recipe_df[~recipe_df['Food_allergy'].isin(minority_classes)]

# TF-IDF features from the food names
vectorizer = TfidfVectorizer()
X_ingredients = vectorizer.fit_transform(recipe_df['Food_items'])

# Normalize Numerical Features
scaler = StandardScaler()
X_numerical = scaler.fit_transform(recipe_df[['Calories', 'Fats', 'Proteins', 'Iron', 'Calcium', 'Sodium', 'Potassium', 'Carbohydrates', 'Fibre', 'VitaminD', 'Sugars']])

# Combine Features: one row per remaining recipe, in recipe_df order
X_combined = np.hstack([X_numerical, X_ingredients.toarray()])
y = recipe_df['Food_allergy']

# NOTE(review): SMOTE and PCA are applied before the train/test split, so held-out rows
# can include synthetic samples interpolated from training rows and the scores printed
# below are likely optimistic. Left as-is because recommend_recipes in this cell reads
# X_resampled directly.

# Apply SMOTE for class balancing (works because the small classes were removed above)
smote = SMOTE(sampling_strategy='auto', random_state=42, k_neighbors=3)
X_resampled, y_resampled = smote.fit_resample(X_combined, y)

# Apply PCA for dimensionality reduction
pca = PCA(n_components=0.95)  # Keep 95% of the variance
X_resampled = pca.fit_transform(X_resampled)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

# Train Random Forest Classifier Model
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)  # Random Forest with 100 trees
rf_classifier.fit(X_train, y_train)

# Predict on the test set using Random Forest
rf_predictions = rf_classifier.predict(X_test)

# Calculate accuracy for Random Forest
rf_accuracy = accuracy_score(y_test, rf_predictions)
print(f"Random Forest Model Accuracy: {rf_accuracy:.2f}")

# Cross-validation on the resampled matrix (see the NOTE above about optimistic scores)
rf_cv_scores = cross_val_score(rf_classifier, X_resampled, y_resampled, cv=5, scoring='accuracy')
print(f"Random Forest Cross-validation scores: {rf_cv_scores}")
print(f"Mean Random Forest cross-validation accuracy: {rf_cv_scores.mean():.2f}")


def recommend_recipes(input_features, user_allergy, scaler, rf_classifier, pca, recipe_df, recipe_features=None):
    """Recommend up to 10 recipes whose features lie closest to a nutrient profile.

    Only real recipes are ranked: ``recipe_features`` is projected with ``pca`` and
    compared with the query. Ranking against the SMOTE-resampled matrix would let
    synthetic rows occupy top-10 slots that then have to be thrown away.

    Parameters
    ----------
    input_features : sequence of numbers
        Target nutrient values in the scaler's column order.
    user_allergy : str
        Allergen to exclude, matched as a literal, case-insensitive substring of
        ``Food_allergy``. An empty value disables the filter.
    scaler, pca
        Fitted transformers exposing ``transform``.
    rf_classifier
        Fitted classifier; its predicted class and probabilities for the query are
        printed for information and do not influence the ranking.
    recipe_df : pandas.DataFrame
        Recipes with ``Food_items`` and ``Food_allergy`` columns.
    recipe_features : array-like, optional
        Pre-PCA feature matrix with one row per row of ``recipe_df`` (scaled
        nutrients followed by the TF-IDF columns). Defaults to the module-level
        ``X_combined`` built in this cell.

    Returns
    -------
    pandas.DataFrame
        ``Food_items`` and ``Food_allergy`` of the surviving candidates, nearest first.

    Raises
    ------
    ValueError
        If ``recipe_features`` and ``recipe_df`` have different numbers of rows.
    """
    # Nothing to rank: return an empty frame with the usual columns.
    if len(recipe_df) == 0:
        return recipe_df[['Food_items', 'Food_allergy']]

    if recipe_features is None:
        recipe_features = X_combined  # matrix built from recipe_df earlier in this cell
    recipe_features = np.asarray(recipe_features)
    if recipe_features.shape[0] != len(recipe_df):
        raise ValueError(
            f"recipe_features has {recipe_features.shape[0]} rows but recipe_df has {len(recipe_df)}"
        )

    # Query point: scaled nutrients padded with zeros for the text columns (no food name given).
    nutritional_input = np.asarray(input_features, dtype=float).reshape(1, -1)
    nutritional_input_scaled = scaler.transform(nutritional_input)
    n_text_columns = recipe_features.shape[1] - nutritional_input_scaled.shape[1]
    dummy_ingredient = np.zeros((1, n_text_columns))
    combined_input = np.hstack([nutritional_input_scaled, dummy_ingredient])
    combined_input_pca = pca.transform(combined_input)

    # Informational only: which allergy class the forest assigns to the query.
    rf_prediction = rf_classifier.predict(combined_input_pca)
    rf_probabilities = rf_classifier.predict_proba(combined_input_pca)
    print(f"Random Forest Prediction: {rf_prediction[0]}")
    print(f"Class Probabilities: {rf_probabilities[0]}")

    # Rank the real recipes by Euclidean distance in PCA space and keep the 10 nearest.
    recipe_points = pca.transform(recipe_features)
    distances = np.linalg.norm(recipe_points - combined_input_pca, axis=1)
    nearest_positions = np.argsort(distances, kind='stable')[:10]
    recommendations = recipe_df.iloc[nearest_positions]

    # An empty pattern matches every string and would discard all rows, so skip it.
    if user_allergy:
        # regex=False: labels like "Dairy, Gluten (pasta)" contain regex metacharacters.
        has_allergen = recommendations['Food_allergy'].str.contains(
            user_allergy, case=False, na=False, regex=False
        )
        recommendations = recommendations[~has_allergen]

    return recommendations[['Food_items', 'Food_allergy']]

# Example query: nutrient targets only, no food name
input_features = [540, 16, 10, 400, 10, 7, 90, 80, 10, 2, 90]  # one value per nutrient column
user_allergy = 'Gluten'  # allergen to exclude

# Uses the scaler, forest and PCA fitted earlier in this cell.
recommendations = recommend_recipes(
    input_features,
    user_allergy,
    scaler=scaler,
    rf_classifier=rf_classifier,
    pca=pca,
    recipe_df=recipe_df,
)
print(recommendations)
# random forest


Food_allergy
Unknown                                  172
Dairy                                     53
Gluten                                    38
Dairy, Gluten                             33
Soy                                        7
Shellfish                                  6
Gluten, Dairy                              4
Dairy, Gluten (pasta)                      3
Fish                                       3
Nuts (if pesto contains nuts)              2
Nuts (if pesto contains nuts), Gluten      2
Dairy, Gluten (wrap)                       2
Nuts                                       1
Nuts                                       1
Nuts (Cashews)                             1
Dairy, Gluten (noodles)                    1
Garlic                                     1
Dairy, Nuts (if cashews used)              1
Dairy, Gluten (croutons)                   1
Fish, Gluten                               1
Eggs, Dairy, Gluten (bread)                1
Dairy, Gluten (crust)                     



In [11]:
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline

# Read the food table
file_path = 'foodrecmergedallergen.xlsx'
data = pd.read_excel(file_path)

# Nutrient columns used as clustering features
nutrient_columns = [
    'Calories', 'Fats', 'Proteins', 'Iron', 'Calcium', 'Sodium',
    'Potassium', 'Carbohydrates', 'Fibre', 'VitaminD', 'Sugars',
]

# Standardise the nutrients before clustering
scaler = StandardScaler()
X_numerical = scaler.fit_transform(data[nutrient_columns])

# Group the foods into 5 clusters (adjustable) and store each row's cluster id
kmeans = KMeans(n_clusters=5, random_state=42)
cluster_labels = kmeans.fit_predict(X_numerical)
data['Cluster'] = cluster_labels

def recommend_recipes(input_features, user_allergy):
    """Return every food in the query's KMeans cluster that is not flagged for an allergen.

    Uses the module-level ``scaler``, ``kmeans`` and ``data`` created earlier in this
    cell; ``data['Cluster']`` holds the cluster id assigned to each row.

    Parameters
    ----------
    input_features : sequence of numbers
        Target nutrient values, in the same column order the scaler was fitted on.
    user_allergy : str
        Allergen to exclude, matched as a literal, case-insensitive substring of
        ``Food_allergy``. An empty value disables the filter.

    Returns
    -------
    pandas.DataFrame
        One column, ``Food_items``, for the remaining foods of the cluster.
    """
    # Scale the query with the statistics learned from the dataset.
    input_features_scaled = scaler.transform([input_features])

    # Cluster the query falls into.
    input_cluster = kmeans.predict(input_features_scaled)[0]

    # All foods assigned to that cluster.
    recommendations = data[data['Cluster'] == input_cluster]

    # An empty pattern matches every string and would discard all rows, so skip it.
    if user_allergy:
        # regex=False: labels such as "Nuts (Cashews)" contain regex metacharacters
        # and must be compared literally. Rows without an allergen label are kept.
        has_allergen = recommendations['Food_allergy'].str.contains(
            user_allergy, case=False, na=False, regex=False
        )
        recommendations = recommendations[~has_allergen]

    return recommendations[['Food_items']]

# Example query: one target value per nutrient column
input_features = [540, 16, 10, 400, 10, 7, 90, 80, 10, 2, 90]
user_allergy = 'Gluten'  # allergen to exclude

# Foods from the query's cluster, with gluten-flagged items removed
recommendations = recommend_recipes(input_features, user_allergy=user_allergy)

# Show the result
print(recommendations)
# kmeans clustering

            Food_items
187  Macroni n Cheese 
328            Uttapam




In [12]:
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

# Read the food table
file_path = 'foodrecmergedallergen.xlsx'
data = pd.read_excel(file_path)

# Nutrient columns used as clustering features
nutrient_columns = [
    'Calories', 'Fats', 'Proteins', 'Iron', 'Calcium', 'Sodium',
    'Potassium', 'Carbohydrates', 'Fibre', 'VitaminD', 'Sugars',
]

# Standardise the nutrients before clustering
scaler = StandardScaler()
X_numerical = scaler.fit_transform(data[nutrient_columns])

# Fit 5 clusters (adjustable) and record each row's cluster id
kmeans = KMeans(n_clusters=5, random_state=42)
cluster_labels = kmeans.fit_predict(X_numerical)
data['Cluster'] = cluster_labels

def recommend_recipes(input_features, user_allergy):
    """Return the allergen-free foods of the query's KMeans cluster, and the cluster id.

    Uses the module-level ``scaler``, ``kmeans`` and ``data`` created earlier in this
    cell; ``data['Cluster']`` holds the cluster id assigned to each row.

    Parameters
    ----------
    input_features : sequence of numbers
        Target nutrient values, in the same column order the scaler was fitted on.
    user_allergy : str
        Allergen to exclude, matched as a literal, case-insensitive substring of
        ``Food_allergy``. An empty value disables the filter.

    Returns
    -------
    tuple
        ``(foods, cluster)``: a one-column ``Food_items`` DataFrame and the cluster id
        predicted for the query.
    """
    # Scale the query with the statistics learned from the dataset.
    input_features_scaled = scaler.transform([input_features])

    # Cluster the query falls into.
    input_cluster = kmeans.predict(input_features_scaled)[0]

    # All foods assigned to that cluster.
    recommendations = data[data['Cluster'] == input_cluster]

    # An empty pattern matches every string and would discard all rows, so skip it.
    if user_allergy:
        # regex=False: labels such as "Nuts (Cashews)" contain regex metacharacters
        # and must be compared literally. Rows without an allergen label are kept.
        has_allergen = recommendations['Food_allergy'].str.contains(
            user_allergy, case=False, na=False, regex=False
        )
        recommendations = recommendations[~has_allergen]

    return recommendations[['Food_items']], input_cluster

def calculate_accuracy(input_features_list, user_allergy):
    """Fraction of queries whose recommended cluster matches a direct KMeans lookup.

    For each query, the cluster reported by ``recommend_recipes`` is compared with the
    cluster ``kmeans`` predicts for the same, standardised features. Both sides come
    from the same fitted model, so this is a consistency check of the recommendation
    path, not a measure of recommendation quality: there is no independent ground
    truth in this comparison.

    Parameters
    ----------
    input_features_list : iterable of sequences of numbers
        One nutrient profile per query, in the scaler's column order.
    user_allergy : str
        Allergen passed through to ``recommend_recipes``.

    Returns
    -------
    float
        Share of matching queries in [0, 1]; ``0.0`` when no queries are given.
    """
    predicted_clusters = []
    true_clusters = []

    for input_features in input_features_list:
        _, predicted_cluster = recommend_recipes(input_features, user_allergy)

        # KMeans was fitted on standardised nutrients, so the reference lookup must
        # scale the raw values first; predicting on raw values compares against
        # centroids that live in a different scale.
        actual_cluster = kmeans.predict(scaler.transform([input_features]))[0]

        predicted_clusters.append(predicted_cluster)
        true_clusters.append(actual_cluster)

    # Cluster ids are integers from kmeans.predict, so no NaN filtering is needed.
    if not true_clusters:
        return 0.0
    return accuracy_score(true_clusters, predicted_clusters)

# Example queries: one nutrient profile per row
input_features_list = [
    [540, 16, 10, 400, 10, 7, 90, 80, 10, 2, 90],
    [300, 20, 15, 300, 8, 4, 70, 60, 8, 1, 85],
    # Add more input features here as needed for testing
]

user_allergy = 'Gluten'  # allergen to exclude

# Score the example queries
accuracy = calculate_accuracy(
    input_features_list=input_features_list,
    user_allergy=user_allergy,
)

# Report the score as a percentage
print(f"Accuracy of the clustering-based recommendation system: {accuracy * 100:.2f}%")


Accuracy of the clustering-based recommendation system: 100.00%


