In [2]:
import pandas as pd

In [3]:
df=pd.read_csv('final_dataset.csv')

In [4]:
data=df.drop('Unnamed: 0',axis=1)

In [5]:
data=data[0:18000]

In [6]:
data.head()

Unnamed: 0,recipe_id,recipe_name,aver_rate,review_nums,prep_time,cook_time,calories,fat,carbohydrates,protein,cholesterol,sodium,fiber,ingredients_list
0,222388,Homemade Bacon,5.0,3,8,8,15,36,1,42,21,81,2,"['pork belly', 'smoked paprika', 'kosher salt'..."
1,240488,"Pork Loin, Apples, and Sauerkraut",4.76,29,180,180,19,18,10,73,33,104,41,"['sauerkraut drained', 'Granny Smith apples sl..."
2,218939,Foolproof Rosemary Chicken Wings,4.57,12,5,5,17,36,2,48,24,31,4,"['chicken wings', 'sprigs rosemary', 'head gar..."
3,87211,Chicken Pesto Paninis,4.62,163,5,5,32,45,20,65,20,43,18,"['focaccia bread quartered', 'prepared basil p..."
4,245714,Potato Bacon Pizza,4.5,2,10,10,8,12,5,14,7,8,3,"['red potatoes', 'strips bacon', 'Sauce:', 'he..."


In [7]:
# prep_time and cook_time shows
# almost similar values so we can drop one of them
data=data.drop('cook_time',axis=1)

In [8]:
data.head()

Unnamed: 0,recipe_id,recipe_name,aver_rate,review_nums,prep_time,calories,fat,carbohydrates,protein,cholesterol,sodium,fiber,ingredients_list
0,222388,Homemade Bacon,5.0,3,8,15,36,1,42,21,81,2,"['pork belly', 'smoked paprika', 'kosher salt'..."
1,240488,"Pork Loin, Apples, and Sauerkraut",4.76,29,180,19,18,10,73,33,104,41,"['sauerkraut drained', 'Granny Smith apples sl..."
2,218939,Foolproof Rosemary Chicken Wings,4.57,12,5,17,36,2,48,24,31,4,"['chicken wings', 'sprigs rosemary', 'head gar..."
3,87211,Chicken Pesto Paninis,4.62,163,5,32,45,20,65,20,43,18,"['focaccia bread quartered', 'prepared basil p..."
4,245714,Potato Bacon Pizza,4.5,2,10,8,12,5,14,7,8,3,"['red potatoes', 'strips bacon', 'Sauce:', 'he..."


In [9]:
import numpy as np
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer

In [10]:
#Preprocess the Ingredients list

vectorizer=TfidfVectorizer()
x_ingredients=vectorizer.fit_transform(data['ingredients_list'])    

In [11]:
#Normalize the numerical values

scalar=StandardScaler()
x_numericals=scalar.fit_transform(data[['prep_time','calories','fat','carbohydrates','protein','cholesterol','sodium','fiber']])

In [12]:
x_numericals

array([[-0.3235504 , -0.38178444,  0.12222404, ..., -0.28597342,
         0.90630105, -0.79179985],
       [ 1.15317786, -0.05108005, -0.56919946, ...,  0.21237478,
         1.34943452,  2.44067529],
       [-0.34930729, -0.21643225,  0.12222404, ..., -0.16138637,
        -0.05703256, -0.6260319 ],
       ...,
       [-0.39223543, -1.12586933, -0.91491122, ..., -0.9919667 ,
        -0.61576606, -0.79179985],
       [-0.39223543, -0.87784104, -0.8764988 , ..., -0.78432162,
        -0.55796604, -0.54314792],
       [-0.26345099, -1.04319323, -1.03014847, ..., -1.07502473,
        -0.61576606, -0.21161201]])

In [13]:
# Combining both Standardized numercials and 
# the vectorized ingredients
x_combined=np.hstack([x_numericals,x_ingredients.toarray()])

In [14]:
x_combined

array([[-0.3235504 , -0.38178444,  0.12222404, ...,  0.        ,
         0.        ,  0.        ],
       [ 1.15317786, -0.05108005, -0.56919946, ...,  0.        ,
         0.        ,  0.        ],
       [-0.34930729, -0.21643225,  0.12222404, ...,  0.        ,
         0.        ,  0.        ],
       ...,
       [-0.39223543, -1.12586933, -0.91491122, ...,  0.        ,
         0.        ,  0.        ],
       [-0.39223543, -0.87784104, -0.8764988 , ...,  0.        ,
         0.        ,  0.        ],
       [-0.26345099, -1.04319323, -1.03014847, ...,  0.        ,
         0.        ,  0.        ]])

## Train the model

### With KNN

In [15]:
# Train the model with KNN
KNN=NearestNeighbors(n_neighbors=3,metric='euclidean')
KNN.fit(x_combined)

#### Now Cross-validate KNN

In [16]:

# Import Necessary Libraries
from sklearn.model_selection import GridSearchCV


In [17]:
#Define the parameter grid
param_grid = {
    'n_neighbors': [3, 5],  # Number of neighbors
    'algorithm': ['auto'],  # Algorithm for computing neighbors
    'leaf_size': [10,20],  # Leaf size for tree-based algorithms
    'metric': [ 'euclidean', 'manhattan'],  # Distance metric
}

In [18]:
KNN=NearestNeighbors()

In [19]:
grid_search = GridSearchCV(estimator=KNN, param_grid=param_grid, cv=5, n_jobs=-1, scoring='neg_mean_squared_error')


In [20]:
#Fit the GridSearchCV
grid_search.fit(x_combined)



In [21]:
print("Best parameters found: ", grid_search.best_params_)


Best parameters found:  {'algorithm': 'auto', 'leaf_size': 10, 'metric': 'euclidean', 'n_neighbors': 3}


In [22]:
KNN=NearestNeighbors(algorithm='auto',leaf_size=10,metric='euclidean',n_neighbors=3)
KNN.fit(x_combined)

In [23]:
# recommendation function

def recommend_recipe(input_features):
    input_features_scaled=scalar.transform([input_features[:8]])
    input_ingredients_transformed=vectorizer.transform([input_features[8]])
    input_combined=np.hstack([input_features_scaled,input_ingredients_transformed.toarray()])
    distance,indices=KNN.kneighbors(input_combined)
    recommendations=data.iloc[indices[0]]
    return recommendations[['recipe_name','ingredients_list','prep_time']]

In [24]:
input_features=[5,30,13,30,25,42,14,20,'chicken, salt,butter']
sample_recommendation=recommend_recipe(input_features)




In [25]:
sample_recommendation

Unnamed: 0,recipe_name,ingredients_list,prep_time
9422,"Spinach, Egg, and Pancetta with Linguine","['uncooked linguine pasta', 'olive oil', 'panc...",10
5567,Pork Fried Rice,"['butter', 'boneless pork loin chop', 'chopped...",10
17996,Sweet Potato Pudding,"['large sweet potatoes', 'butter', 'dark brown...",40


#### Find most similar Recipe 

In [44]:
# Example query item
query_index = 0  
query_item = x_combined[query_index].reshape(1, -1)

In [28]:
# Find nearest neighbors
distances, indices = KNN.kneighbors(query_item, n_neighbors=5)  

In [29]:
# you can use indices to retrieve and display the recipe names or other details
similar_recipes = data.iloc[indices[0]]
print("Similar recipes:\n", similar_recipes[['recipe_name', 'ingredients_list', 'prep_time']])


Similar recipes:
                            recipe_name  \
0                       Homemade Bacon   
9975                     Pork Carnitas   
1926                       Adobo Twist   
3507  Char Siu (Chinese Barbeque Pork)   
6895                 Lucy's Ham Spread   

                                       ingredients_list  prep_time  
0     ['pork belly', 'smoked paprika', 'kosher salt'...          8  
9975  ['vegetable oil', 'pork shoulder', 'kosher sal...         10  
1926  ['vegetable oil', 'onion', 'head garlic', 'por...         15  
3507  ['honey', 'Shaoxing cooking wine', 'hoisin sau...          5  
6895  ['smoked ham cut into chunks', 'tomato sauce',...         15  


## Using K-mean Clustering to group togather items

In [45]:
from sklearn.cluster import KMeans
from sklearn.model_selection import GridSearchCV

In [46]:
# Define the parameter grid
param_grid = {
    'n_clusters': [2000,2500,3000,3500,5000],  # Different number of clusters to try
}


In [47]:
# Initialize the KMeans model
kmeans = KMeans(random_state=0)

In [48]:
# Set up GridSearchCV
grid_search = GridSearchCV(estimator=kmeans, param_grid=param_grid, cv=5, n_jobs=-1, scoring='neg_mean_squared_error')

In [49]:
# Fit GridSearchCV to your data
grid_search.fit(x_combined)



In [50]:
query_item

array([[-0.3235504 , -0.38178444,  0.12222404, ...,  0.        ,
         0.        ,  0.        ]])

In [51]:
# Get the best KMeans model
best_kmeans = grid_search.best_estimator_

# Predict cluster for the query item using the best KMeans model
query_cluster = best_kmeans.predict(query_item)

In [52]:
# Find items in the same cluster
cluster_items = data[best_kmeans.labels_ == query_cluster[0]]

# Display results
print("Items in the same cluster:\n", cluster_items[['recipe_name', 'aver_rate', 'review_nums']])


Items in the same cluster:
                               recipe_name  aver_rate  review_nums
0                          Homemade Bacon       5.00            3
3431                Savory Oregano Salami       3.00            1
9975                        Pork Carnitas       4.85          274
10708                      Picnic Sausage       3.33            1
10782  Lamb and Rice Stuffed Grape Leaves       5.00           10


In [53]:
len(cluster_items)

5

In [54]:
grid_search.best_params_

{'n_clusters': 2000}

In [56]:
grid_search.best_score_

nan

In [57]:
KMeans=KMeans(n_clusters=2000)
KMeans.fit(x_combined)

In [58]:
from sklearn.metrics import silhouette_score

# Predict cluster labels
cluster_labels = KMeans.labels_

# Compute silhouette score
silhouette_avg = silhouette_score(x_combined, cluster_labels, metric='euclidean')
print("Silhouette Score:", silhouette_avg)


Silhouette Score: 0.0314645827149885


### DBSCAN

In [59]:
from sklearn.cluster import DBSCAN
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, silhouette_score
import numpy as np



In [60]:
# Define a custom scorer for silhouette score
def silhouette_scorer(estimator, X):
    labels = estimator.labels_
    if len(set(labels)) > 1 and len(set(labels)) < len(X):
        return silhouette_score(X, labels, metric='euclidean')
    else:
        return -1  # Return a negative score if silhouette score cannot be computed

In [61]:
# Create parameter grid
param_grid = {
    'eps': [0.3, 0.5, 0.7],  # Distance parameter
    'min_samples': [5, 10, 15]  # Minimum number of samples in a neighborhood
}

In [62]:
# Initialize DBSCAN
dbscan = DBSCAN()

# Set up GridSearchCV
grid_search = GridSearchCV(estimator=dbscan, param_grid=param_grid, scoring=make_scorer(silhouette_scorer), cv=3, n_jobs=-1)

# Fit GridSearchCV
grid_search.fit(x_combined)





In [63]:
# Get the best parameters and scores
best_params = grid_search.best_params_
best_score = grid_search.best_score_

print("Best parameters found:", best_params)
print("Best silhouette score:", best_score)



Best parameters found: {'eps': 0.3, 'min_samples': 5}
Best silhouette score: nan


In [64]:
# Fit DBSCAN with the best parameters
best_dbscan = DBSCAN(eps=best_params['eps'], min_samples=best_params['min_samples'])
best_dbscan.fit(x_combined)



In [66]:
# Analyze results
cluster_labels = best_dbscan.labels_
n_clusters = len(set(cluster_labels)) - (1 if -1 in cluster_labels else 0)
n_noise = list(cluster_labels).count(-1)

print(f"Number of clusters: {n_clusters}")
print(f"Number of noise points: {n_noise}")

Number of clusters: 0
Number of noise points: 18000


# We can see that clustering algorithms like DBSCAN and Kmean clustering are not performing well in the case of this Problem 
# so we can go with the KNN algorithms which is more suitable for this problem