<a href="https://colab.research.google.com/github/peteray-dev/Food_Hybrid_Recommender_System/blob/master/MSc_Food_Hybrid_Recommeder_system.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Importing Libraries

In [None]:
from google.colab import userdata
import os

os.environ["KAGGLE_KEY"] = userdata.get('KAGGLE_KEY')
os.environ["KAGGLE_USERNAME"] = userdata.get('KAGGLE_USERNAME')


In [None]:
!kaggle datasets download -d irkaal/foodcom-recipes-and-reviews



In [None]:
!mkdir Food-recipe-and-review.zip

!mv /content/foodcom-recipes-and-reviews.zip* /content/Food-recipe-and-review.zip


In [None]:
! unzip /content/Food-recipe-and-review.zip/foodcom-recipes-and-reviews.zip

In [None]:
!pip install --upgrade scikit-learn==1.5.0 -q
!pip install sentence-transformers -q
!pip install --upgrade scikit-learn==1.4.0 -q
!pip install networkx -q
!pip install torch_geometric -q
!pip install captum -q


In [None]:
import sklearn
# this makes is easier for getting dataframes by default as input/output of
# sklearn pipelines
sklearn.set_config(transform_output="pandas")
%matplotlib inline
import numpy as np
import pandas as pd
import ast
import networkx as nx
import matplotlib.pyplot as plt
import seaborn as sns
%config InlineBackend.figure_format = 'retina'
sns.set(
    { "figure.figsize": (6, 4) },
    style='ticks',
    color_codes=True,
    font_scale=0.8
)
import warnings
warnings.filterwarnings('ignore')



In [None]:
import nltk

from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from nltk.stem import WordNetLemmatizer
from sentence_transformers import SentenceTransformer


from scipy.sparse import csr_matrix

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import sigmoid_kernel, cosine_similarity
from sklearn.neighbors import NearestNeighbors
from sklearn.decomposition import PCA

from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
import numpy as np

import random

import torch
import torch.nn as nn
from torch import Tensor
from torch.nn import Embedding, ModuleList, Linear
import torch.nn.functional as F

import torch_geometric
import torch_geometric.nn as pyg_nn
from torch_geometric.data import Data
from torch_geometric.transforms import RandomLinkSplit
from torch_geometric.nn import GCNConv, GATConv, SAGEConv, to_hetero
from torch.nn.modules.loss import _Loss

from torch_geometric.nn.conv import LGConv, GATConv, SAGEConv
from torch_geometric.typing import Adj, OptTensor, SparseTensor
from torch_geometric.explain import Explainer, GNNExplainer, CaptumExplainer

# from torch_lr_finder import LRFinder

# This is for the progress bar.
from tqdm.auto import tqdm
# This is for ploting
# import matplotlib.pyplot as plt

### Importing the dataset

In [None]:
recipes = pd.read_csv('/content/recipes.csv')
reviews = pd.read_csv('/content/reviews.csv')

In [None]:
# recipes

In [None]:
# reviews

Analysing Recipe

In [None]:
recipes.info()

In [None]:
recipes.isnull().sum()

In [None]:
# recipes.transpose()[0]

In [None]:
# recipes.shape

In [None]:
recipes.describe()

In [None]:
# Work on a copy so the raw frame stays untouched.
# `recipes` is intentionally NOT deleted here: later cells (the Pearson and KNN
# recommenders) still look up recipe names and ingredients in the raw frame,
# and the notebook frees it explicitly once the collaborative section is done.
recipes_cp = recipes.copy()

In [None]:
recipes_cp['PrepTime']=recipes_cp['PrepTime'].astype(str)
recipes_cp['CookTime']=recipes_cp['CookTime'].astype(str)
recipes_cp['TotalTime']=recipes_cp['TotalTime'].astype(str)

In [None]:
#extracting the time utilized to make the food
import re
# ISO-8601 duration prefix: optional day part, then an optional time part with
# hours and/or minutes (e.g. "PT45M", "PT1H30M", "P1DT2H"). Compiled once.
_ISO_DURATION = re.compile(r'P(?:(\d+)D)?(?:T(?:(\d+)H)?(?:(\d+)M)?)?')

def duration_iso(duration):
    """Convert an ISO-8601 duration string to a whole number of minutes.

    Parameters
    ----------
    duration : str
        Value such as ``"PT1H30M"``. Days, hours and minutes are understood;
        any other component (e.g. seconds) is ignored.

    Returns
    -------
    int
        Total minutes, or 0 when the value is not a string or does not start
        with an ISO-8601 duration (for example the text ``"nan"``).
    """
    # Missing values may arrive as float NaN when the column was not cast to str.
    if not isinstance(duration, str):
        return 0
    match = _ISO_DURATION.match(duration)
    if not match:
        return 0
    days, hours, minutes = (int(part) if part else 0 for part in match.groups())
    return (days * 24 + hours) * 60 + minutes

recipes_cp['TotalTime'] = recipes_cp['TotalTime'].apply(duration_iso)
recipes_cp['PrepTime'] = recipes_cp['PrepTime'].apply(duration_iso)
recipes_cp['CookTime'] = recipes_cp['CookTime'].apply(duration_iso)



In [None]:
recipes_cp.head()

In [None]:
# recipes_cp.isnull().sum()

In [None]:
recipes_cp['ReviewCount'].max()

recipes_cp[['RecipeId', 'Name']][recipes_cp['ReviewCount']==3063]
#Lets check if the number of count for the review is the same as the one in review
#the id is 45809 with count of 3063

In [None]:
recipes_cp['ReviewCount'].min()
recipes_cp[['RecipeId', 'Name']][recipes_cp['ReviewCount']==1]


In [None]:
#getting the recipeID of all with Null
recipes_cp[['RecipeId', 'ReviewCount']].loc[recipes_cp['ReviewCount'].isnull()]

In [None]:
# let's check for the number of review count for the recipe Id 38, which should be 4 as the same as the review data frame(check below)

recipes_cp[['RecipeId','ReviewCount']].loc[recipes_cp['RecipeId']==38]

#This reveals that the the reviewcount in recipe dataframe, is gotten from the review data frame,

### Basic work on review dataset

In [None]:
reviews.isnull().sum()

In [None]:
#Let's remove the ones without reviews
reviews.dropna(subset='Review', inplace=True)

In [None]:
reviews.info()

In [None]:
reviews[reviews['RecipeId']==45809]
# Inside the review dataframe, the id with the max review count from the recipe dataframe has 2892 rows;
# that is a difference of (3063 - 2892) = 171

In [None]:
reviews[reviews['RecipeId']==53]
# From Analysis, it is seen that the count correlates with what is in the recipes['reviewcount']

In [None]:
review_count = reviews.groupby('RecipeId').size().reset_index(name='ReviewCount')
# review_count

In [None]:
# Based on the above analysis, some RecipeIds were neither reviewed nor rated.

# I will remove the aggregated rating and the rating count.
# "Aggregate rating" is a phrase commonly used for an average score or rating
# calculated from many individual reviews. It gives a brief summary of different opinions,
# while the rating count only tells how many times a recipe id was rated.

In [None]:
recipes_cp.drop(columns=['AggregatedRating', 'ReviewCount', 'RecipeYield', 'RecipeServings', 'Images'], inplace=True)

In [None]:
reviews.drop(columns=['DateSubmitted', 'DateModified'], inplace=True)

In [None]:
#Merging the 2 dataset together (review and recipe)
Merged_df = pd.merge(recipes_cp,reviews, how="outer", left_on = 'RecipeId', right_on= 'RecipeId')
Merged_df.info()


In [None]:
# Merge AuthorId_x and AuthorId_y and also Merge AuthorName_x and AuthorName_y
Merged_df['AuthorId'] = Merged_df['AuthorId_x'].combine_first(Merged_df['AuthorId_y'])
Merged_df['AuthorName'] = Merged_df['AuthorName_x'].combine_first(Merged_df['AuthorName_y'])

# Drop the original columns
Merged_df.drop(columns=['AuthorId_x', 'AuthorId_y', 'AuthorName_x', 'AuthorName_y'], inplace=True)


Merged_df.info()


In [None]:
Merged_df.dropna(inplace=True)

In [None]:
# Merged_df.info()

In [None]:
Merged_df['RecipeIngredientParts'][1]

In [None]:
# One element of a deparsed R character vector: either a double-quoted string
# (backslash escapes allowed, commas allowed inside) or a bare token such as NA.
_R_VECTOR_ITEM = re.compile(r'\s*(?:"((?:[^"\\]|\\.)*)"|([^,"\s][^,]*))')

def convert_to_list(string):
    """Parse an R-style ``c("a", "b")`` string into a Python list of strings.

    Elements are tokenised rather than split on ``', '`` so that quoted
    elements containing commas (typical for instruction sentences) stay in one
    piece and elements separated by a comma plus line break are still split.

    Parameters
    ----------
    string : str
        Raw cell value, e.g. ``'c("flour", "sugar")'``, a single ``'"salt"'``
        or R's empty vector ``'character(0)'``.

    Returns
    -------
    list of str
        The elements with surrounding quotes removed; ``[]`` for an empty vector.
    """
    text = string.strip()
    if text == 'character(0)':
        return []
    # Strip the wrapper only when it really is one, so a trailing ')' that
    # belongs to the data is kept.
    if text.startswith('c(') and text.endswith(')'):
        text = text[2:-1]
    items = []
    for match in _R_VECTOR_ITEM.finditer(text):
        quoted, bare = match.group(1), match.group(2)
        items.append(quoted if quoted is not None else bare.strip())
    return items

In [None]:
Merged_df['RecipeIngredientParts'] = Merged_df['RecipeIngredientParts'].apply(convert_to_list)
Merged_df['RecipeInstructions'] = Merged_df['RecipeInstructions'].apply(convert_to_list)
Merged_df['Keywords'] = Merged_df['Keywords'].apply(convert_to_list)

#All columns converted to List


### Exploratory Data Analysis

In [None]:
num_col = ['CookTime', 'PrepTime', 'TotalTime', 'Calories', 'FatContent', 'SaturatedFatContent', 'CholesterolContent',
           'SodiumContent', 'CarbohydrateContent', 'FiberContent', 'SugarContent', 'ProteinContent']

fig, axis = plt.subplots(4,3, figsize=(15,12))
axis = axis.ravel()
for i, ax in enumerate(axis):
  sns.boxplot(data=Merged_df[num_col[i]], ax=ax)
  ax.set(title=num_col[i])

plt.tight_layout()
plt.show()

There are a lot of outliers in this data set, which also indicates that these features are not normally distributed.

Now i'm going to remove the outliers

In [None]:
# Remove IQR outliers column by column. The running result is filtered at each
# step, so every column's bounds are applied and accumulate; filtering the
# original frame each time would keep only the last column's filter.
# As in the later recipe_df clean-up cell, the quartiles of each column are
# computed on the rows that survived the previous columns.
Merged_df2 = Merged_df
for col in num_col:
  Q1 = Merged_df2[col].quantile(0.25)
  Q3 = Merged_df2[col].quantile(0.75)
  IQR = Q3 - Q1
  lower_limit = Q1-1.5*IQR
  upper_limit = Q3+1.5*IQR
  # between() is inclusive on both ends, matching the >= / <= comparison.
  Merged_df2 = Merged_df2[Merged_df2[col].between(lower_limit, upper_limit)]
  print(f'lower_limit for {col}: {lower_limit}, upper_limit{col}: {upper_limit}')

In [None]:
fig, axis = plt.subplots(4,3, figsize=(15,12))
axis = axis.ravel()
for i, ax in enumerate(axis):
  sns.boxplot(data=Merged_df2[num_col[i]], ax=ax)
  ax.set(title=num_col[i])

plt.tight_layout()
plt.show()

In [None]:
# print(f'the Author unique name')
Merged_df2['AuthorName'].nunique() # I can do stratified sampling using RecipeId

In [None]:
col_unique_list = ['RecipeId', 'Name', 'RecipeCategory', 'Rating', 'AuthorId', 'AuthorName']
for col in col_unique_list:
    unique_values = Merged_df2[col].nunique()
    print(f'The {col} column has {unique_values} unique values')


In [None]:
Merged_df2['Rating'].value_counts()

In [None]:
# lets take a user Id (44642) and see the item he/she interated with
# This user rated 29 items,
Merged_df2[Merged_df2['AuthorId']==44642]

problems:
Data sparsity: the problem of having insufficient or missing ratings or interactions between users and items. I will address this by building a model that helps with prediction.

In [None]:
# Merged_df2.columns


In [None]:
# Merged_df2

### Exploratory Data Analysis 2

In [None]:
recipe_col = ['RecipeId', 'Name', 'AuthorId', 'Description','RecipeCategory', 'Keywords',  'RecipeIngredientParts', 'Calories', 'FatContent',
       'SaturatedFatContent', 'CholesterolContent', 'SodiumContent', 'CarbohydrateContent', 'FiberContent', 'SugarContent', 'ProteinContent', 'DatePublished' ,]
review_col=['RecipeId','ReviewId', 'AuthorId', 'Rating', 'Review']
# Take explicit copies: the following cells add and overwrite columns on these
# frames, which must not write through to (or warn about) the parent frames.
recipe_df = recipes_cp[recipe_col].copy()
review_df = reviews[review_col].copy()

# `recipes_cp` and `reviews` are intentionally kept alive here: the
# collaborative-filtering section further down reads `reviews` again, and the
# notebook frees both frames explicitly once that section is finished.

In [None]:
# working with data between 2013 and 2020
recipe_df['DatePublished'] = pd.to_datetime(recipe_df['DatePublished'])
recipe_df['DatePublished'].dt.year.value_counts()
recipe_df = recipe_df[recipe_df['DatePublished'].dt.year.between(2013, 2020)]
# recipe_df

In [None]:
recipe_df['RecipeCategory'].value_counts()

In [None]:
# # i will drop the duplicate in the recipe df
recipe_df = recipe_df.dropna()
recipe_df['Name'].nunique()
# recipe_df = recipe_df.sample(50000, random_state=0) # i can seelect them based on time for prepartion

In [None]:
recipe_df.isnull().sum()

In [None]:
# num_col = ['Calories', 'FatContent', 'SaturatedFatContent', 'CholesterolContent',
#            'SodiumContent', 'CarbohydrateContent', 'FiberContent', 'SugarContent', 'ProteinContent']

# fig, axis = plt.subplots(3,3, figsize=(15,12))
# axis = axis.ravel()
# for i, ax in enumerate(axis):
#   sns.boxplot(data=recipe_df[num_col[i]], ax=ax)
#   ax.set(title=num_col[i])

# plt.tight_layout()
# plt.show()

In [None]:
# Remove outliers
num_col = ['Calories', 'FatContent', 'SaturatedFatContent', 'CholesterolContent',
           'SodiumContent', 'CarbohydrateContent', 'FiberContent', 'SugarContent', 'ProteinContent']

for col in num_col:
  Q3 = recipe_df[col].quantile(0.75)
  Q1 = recipe_df[col].quantile(0.25)
  IQR =  Q3 - Q1
  recipe_df = recipe_df[(recipe_df[col]>=Q1-1.5*IQR) & (recipe_df[col]<=Q3+1.5*IQR) ]
  print(f'lower_limit for {col}: {Q1-1.5*IQR}, upper_limit{col}: {Q3+1.5*IQR}')

In [None]:
num_col = ['Calories', 'FatContent', 'SaturatedFatContent', 'CholesterolContent',
           'SodiumContent', 'CarbohydrateContent', 'FiberContent', 'SugarContent', 'ProteinContent']
fig, axis = plt.subplots(3,3, figsize=(15,12))
axis = axis.ravel()
for i, ax in enumerate(axis):
  sns.histplot(data=recipe_df[num_col[i]], ax=ax)
  ax.set(title=num_col[i])

plt.tight_layout()
plt.show()

In [None]:
# recipe_df.nunique() #31088

In [None]:
recipe_df = recipe_df.reset_index(drop=True)

In [None]:
recipe_df[recipe_df['Name']=='Chicken Paprikash']

### Text Preprocessing





In [None]:
# recipe_df.head()

In [None]:
recipe_df['Keywords'].iloc[9]

#  the 'c' is used used for concatenation in R

In [None]:
def convert_to_list(col):
  """Strip the R ``c( ... )`` wrapper from a deparsed vector string.

  Despite its name this returns a *string* (the comma-separated, still quoted
  elements), not a list; callers parse or clean the result afterwards.

  Only a leading ``c(`` together with a trailing ``)`` is removed, so
  parentheses or the letters ``c(`` inside an element are left intact.

  Parameters
  ----------
  col : str
      Raw cell value such as ``'c("Dessert", "Easy")'``.

  Returns
  -------
  str
      The text between the wrapper, or the stripped input when there is no wrapper.
  """
  text = col.strip()
  if text.startswith('c(') and text.endswith(')'):
    return text[2:-1]
  return text

In [None]:
recipe_df['Keywords']=recipe_df['Keywords'].apply(convert_to_list)
recipe_df['RecipeIngredientParts']=recipe_df['RecipeIngredientParts'].apply(convert_to_list)
# recipe_df['Keywords']=recipe_df['Keywords'].apply(convert_to_list).to_list()

In [None]:
recipe_df.head()

In [None]:
type(recipe_df['RecipeIngredientParts'][1])

In [None]:
# Convert the quoted, comma-separated keyword text into a real list so the
# columns can be combined and preprocessed together.
# - ast.literal_eval only parses literals; unlike eval it cannot execute code
#   that happens to be present in the data.
# - Wrapping the text in [...] always yields a list, even for a single keyword
#   (a bare '"Dessert"' would otherwise parse to a str, which ', '.join() then
#   splits into single characters), and tolerates line breaks between elements.
recipe_df['Keywords']=recipe_df['Keywords'].apply(lambda text: ast.literal_eval(f'[{text}]'))
# Ingredients stay as one plain string; only the double quotes are removed.
recipe_df['RecipeIngredientParts']=recipe_df['RecipeIngredientParts'].str.replace('"', '')

In [None]:
recipe_df.columns

In [None]:
# combining 3 columns (Description, Recipecatergory, keywords)
recipe_df['Merged_recipe_info'] = recipe_df.apply(
    lambda row: f"{row['Description']} {row['RecipeIngredientParts']} {row['RecipeCategory']}  {', '.join(row['Keywords'])}",
    axis=1
)

In [None]:
recipe_df['Merged_recipe_info'].iloc[0]

Let's remove stop words using the Natural Language Toolkit (NLTK).

In [None]:
nltk.download('stopwords')
nltk.download('wordnet')
print(stopwords.words('english'))
stp=['Food.com.']
stop_words = set(stopwords.words('english'))
stop_words.update(stp)

stop_words

# Using this alone, I might lose necessary information, so let me go for language processing that will preserve the meaning of the text

In [None]:
#  now using snowball stemmer (reducing a word to its base word or stem in such a way that the words of similar kind lie under a common stem, e.g run,ran, running can be reduce  to the base word 'run')
# snowball_stemmer = SnowballStemmer()
# Lemmatizatio also give context to the word and does stemming too, so lets process the merged column using lemmatization instead
word_lemma = WordNetLemmatizer()

def preprocess(word):
  """Normalise one recipe text for TF-IDF: expand, clean, filter and lemmatise.

  Steps: lower-case the text; expand symbols and contractions so that their
  meaning survives stop-word removal; replace every run of non-alphanumeric
  characters with a space; drop stop words; lemmatise the remaining tokens.

  All replacement patterns are lower-case because they run *after*
  ``.lower()`` - capitalised patterns such as "Food.com" or "Mins" could never
  match at that point.

  Reads the module-level ``stop_words`` set and ``word_lemma`` lemmatizer.

  Parameters
  ----------
  word : object
      Text to clean; converted with ``str()`` first.

  Returns
  -------
  str
      Space-separated lemmatised tokens.
  """
  word = str(word).lower()
  # Order matters: specific contractions must be expanded before the generic
  # "n't" / "'s" rules.
  replacements = (
      ('<', 'lesser than'), ('>', 'greater than'), ('food.com', ''),
      ("won't", 'will not'), ('cannot', 'can not'), ("can't", 'can not'),
      ("n't", ' not'), ("what's", 'what is'), ("it's", 'it is'),
      ("'ve", ' have'), ("i'm", 'i am'), ("'re", ' are'),
      ("he's", 'he is'), ("she's", 'she is'), ("'s", ' own'),
      ('%', ' percent '), ('€', ' euro '), ("'ll", ' will'),
  )
  for old, new in replacements:
    word = word.replace(old, new)
  # Whole-word match only, so that e.g. "vitamins" is left alone.
  word = re.sub(r'\bmins\b', 'minutes', word)

  # Replace any remaining symbols with a space.
  word = re.sub('[^A-Za-z0-9]+', ' ', word)
  # Drop stop words, then lemmatise what is left.
  lemma_word = ' '.join(word_lemma.lemmatize(w) for w in word.split() if w not in stop_words)

  return lemma_word


In [None]:
recipe_df['recipe_process_info'] = recipe_df['Merged_recipe_info'].apply(preprocess)

In [None]:
recipe_df['recipe_process_info'].iloc[10]

In [None]:
recipe_df['Merged_recipe_info'].iloc[10]

In [None]:
# recipe_df

### Content Based Recommendation

In [None]:
tfvec = TfidfVectorizer(min_df=3, ngram_range=(1,2))

In [None]:
tfvec_mtrx = tfvec.fit_transform(recipe_df['recipe_process_info'])

In [None]:
tfvec_mtrx.shape

Sigmoid Kernel:

from sklearn doc: (Note that the tf-idf functionality in sklearn.feature_extraction.text can produce normalized vectors, in which case cosine_similarity is equivalent to linear_kernel, only slower.)

In [None]:
sig_mat = sigmoid_kernel(tfvec_mtrx, tfvec_mtrx)

In [None]:
sig_mat

In [None]:
ind = pd.Series(recipe_df.index, index=recipe_df['Name'])
ind

In [None]:
# cos_mat[484727]

In [None]:
# getting similar food recipe
def get_food_recipe_sig_content_based(indices, top_n=5):
  """Return the recipes most similar to one recipe under the sigmoid kernel.

  Reads the module-level ``sig_mat`` (pairwise kernel matrix) and
  ``recipe_df``; row ``i`` of the matrix is assumed to describe
  ``recipe_df.iloc[i]``.

  The query recipe is excluded explicitly instead of assuming it is ranked
  first: when another recipe ties with the query's self-similarity, the sort
  may place that recipe first, which would otherwise mislabel the query and
  recommend the query to itself.

  Parameters
  ----------
  indices : int
      Positional index of the query recipe in ``recipe_df``.
  top_n : int, default 5
      Number of similar recipes to return.

  Returns
  -------
  pandas.DataFrame
      Up to ``top_n`` rows of ``recipe_df``, most similar first.
  """
  row_scores = np.asarray(sig_mat[indices]).ravel()
  # Stable descending sort: ties keep ascending positional order.
  order = np.argsort(-row_scores, kind='stable')
  neighbours = order[order != indices][:top_n]

  print(f'Recommendation for RecipeID {indices},RecipeId:{recipe_df.iloc[indices]["RecipeId"]}, Name:{recipe_df.iloc[indices]["Name"]}')

  return recipe_df.iloc[neighbours]


In [None]:
ind['Chicken Paprikash']

In [None]:
get_food_recipe_sig_content_based(4)
# [4, 8806, 17595, 28020, 27679, 7443]

In [None]:
# without the usage of nutritional content, thsi wwas the similarities selected
# [4, 8806, 17595, 28020, 27679, 7443]
# let's check why it dropped last 3
recipe_df[recipe_df.index == 7539 ]
# recipe_df[recipe_df.index == 13901 ]

Cosine Similarity

In [None]:
cos_mat = cosine_similarity(tfvec_mtrx, tfvec_mtrx)

In [None]:
cos_mat[0]

In [None]:
# cos_sim = list(enumerate(cosine_similarity[436511]))
# sorted(cos_mat[0].max())

sort = sorted(cos_mat[40])
sort[-2]

In [None]:
# getting similar food recipe
def get_food_recipe_cos_content_based(indices, top_n=5):
  """Return the recipes most similar to one recipe by TF-IDF cosine similarity.

  Reads the module-level ``cos_mat`` (pairwise cosine matrix) and
  ``recipe_df``; row ``i`` of the matrix is assumed to describe
  ``recipe_df.iloc[i]``, so all look-ups are positional (``iloc``) and do not
  depend on the frame's index labels.

  The query recipe is excluded explicitly instead of assuming it is ranked
  first, which is not guaranteed when another recipe has an identical vector.

  Parameters
  ----------
  indices : int
      Positional index of the query recipe in ``recipe_df``.
  top_n : int, default 5
      Number of similar recipes to return.

  Returns
  -------
  pandas.DataFrame
      Selected recipe columns plus a ``Score`` column, most similar first.
  """
  row_scores = np.asarray(cos_mat[indices]).ravel()
  # Stable descending sort: ties keep ascending positional order.
  order = np.argsort(-row_scores, kind='stable')
  neighbours = order[order != indices][:top_n]

  print(f"Recommendation for recipe name:{recipe_df.iloc[indices]['Name']}")

  columns = ['RecipeId', 'Name', 'AuthorId', 'Description', 'RecipeCategory', 'Keywords', 'RecipeIngredientParts', 'Calories']
  # Copy before adding a column so the parent frame is never written to.
  similar_recipe = recipe_df.iloc[neighbours][columns].copy()
  similar_recipe['Score'] = row_scores[neighbours]
  return similar_recipe

In [None]:
res = get_food_recipe_cos_content_based(4)
res

### Collaborative Filtering - Item-Item based collaborative

In [None]:
reviews.head()

In [None]:
reviews.nunique()

In [None]:
reviews_rating_df = (
    reviews.groupby('RecipeId')['Rating']
    .agg(['mean', 'count'])
    .rename(columns={'mean': 'AvgRating', 'count': 'RatingCount'})
    .sort_values(by='RatingCount', ascending=False)
    .reset_index()  # This line ensures the RecipeId is part of the DataFrame
)

In [None]:
review_df = pd.merge(reviews, reviews_rating_df, left_on='RecipeId', right_on='RecipeId', how='left')

In [None]:
review_df.sort_values(by='RatingCount', ascending=False)

In [None]:
review_df.nunique()

In [None]:
# Let's create a rating_df, this df will have 'RecipeId', 'AuthorId', 'AvgRating', 'RatingCount

rating_df = review_df[['RecipeId', 'AuthorId','Rating', 'AvgRating', 'RatingCount']]
rating_df.head()

In [None]:
rating_df.drop_duplicates(inplace=True)

In [None]:
rating_df.sort_values(by='RatingCount', ascending=False)

In [None]:
rating_df.describe()

In [None]:
col = ['Rating', 'AvgRating', 'RatingCount']

fig, axis = plt.subplots(3,1, figsize=(10,8))
axis = axis.ravel()
for i, ax in enumerate(axis):
  sns.histplot(data=rating_df[col[i]], ax=ax, kde=True)
  ax.set(title=col[i])

plt.tight_layout()
plt.show()
# for x in col:
#   sns.histplot(x=rating_df[x], kde=True)

In [None]:
# I will prefer to remove the lesser count of ratingcount
# let's select the recipeId with the highest numbe rof ratiing, indicating that they are the most popular recipes
# that most user interact with and this will be a better use for recommendation, and this will best solve the
# problem of cold start by suggesting popular item to the new user

rating_df = rating_df[rating_df['RatingCount']>=100].sort_values(by="RecipeId")

In [None]:
rating_df.nunique()

In [None]:
# user-item matrix
rating_matrix = rating_df.pivot(index='AuthorId', columns='RecipeId', values='Rating').fillna(0)

rating_matrix

In [None]:
rating_matrix.shape
# users = 86723
# recipe = 1065

Recommendation using pearson correlation (Item-based Collaborative Filtering)

In [None]:
corr_matrix = rating_matrix.corr(method='pearson')

corr_matrix

In [None]:
new = rating_matrix.loc[1535]
new[new>0]
# this user only rated 81 items so 984 items was not rated by this user

In [None]:
corr_matrix[56].sort_values()

In [None]:
def recommend_item_pearson(user_id, user_item_matrix, item_corr_matrix, n_recomm):
  """Item-based collaborative filtering with Pearson item-item correlations.

  For every recipe the user has not rated (stored as 0), the predicted score is
  the correlation-weighted average of the user's own ratings:

      score(j) = sum_i corr(i, j) * rating(i) / sum_i |corr(i, j)|

  where ``i`` runs over the recipes the user rated. The absolute value in the
  denominator keeps the score on the rating scale when positive and negative
  correlations cancel out. Undefined (NaN) correlations are ignored.

  Reads the module-level ``reviews`` (for the author's name) and ``recipes``
  (for recipe names and ingredients).

  Parameters
  ----------
  user_id : hashable
      Row label of the user in ``user_item_matrix``.
  user_item_matrix : pandas.DataFrame
      Users x recipes ratings; 0 means "not rated".
  item_corr_matrix : pandas.DataFrame
      Recipes x recipes correlation matrix with matching labels.
  n_recomm : int
      Maximum number of recommendations.

  Returns
  -------
  pandas.DataFrame
      Columns ``RecipeId``, ``Score``, ``Name``, ``RecipeIngredientParts``,
      highest score first.
  """
  # The user's ratings, split into rated and not-yet-rated recipes.
  rating = user_item_matrix.loc[user_id]
  rated_item = rating[rating > 0].index
  unrated = rating[rating == 0].index
  similar_rating = rating[rated_item]

  # Rows: recipes the user rated; columns: candidate (unrated) recipes.
  corr_block = item_corr_matrix.loc[rated_item, unrated]
  # sum() skips NaN, so a recipe pair without a defined correlation is ignored.
  num = corr_block.mul(similar_rating, axis=0).sum(axis=0)
  denom = corr_block.abs().sum(axis=0)
  has_signal = denom > 0
  score = (num[has_signal] / denom[has_signal]).sort_values(ascending=False, kind='stable').head(n_recomm)

  recommended_recipes = pd.DataFrame({
      'RecipeId': score.index,
      'Score': score.values
  })

  # Fall back to a placeholder instead of failing when the user has no row in `reviews`.
  author_names = reviews.loc[reviews['AuthorId'] == user_id, 'AuthorName']
  author_label = author_names.iloc[0] if not author_names.empty else 'unknown'
  print(f"""The recommended recipes for user with ID {user_id}: {author_label}""")

  rp = pd.merge(recommended_recipes, recipes[['Name', 'RecipeIngredientParts', 'RecipeId']], on='RecipeId')
  return rp


In [None]:
user_id = 1535
n_recomm= 10
recommend = recommend_item_pearson(user_id, rating_matrix, corr_matrix, n_recomm)
recommend

### Collaborative Filtering User-Item using KNN

The prediction of recipes is based on user interactions: this method compares what the user has interacted with against users with similar interactions, and then recommends what those similar users have rated that the user has not yet rated.
It can be deduced that the user and similar users have rated similar items in the past.

The KNN model finds recipes that have similar user ratings to this recipe.

User Ratings: The similarity between recipes is based on how users rated them. If two recipes are rated similarly by many users, they are considered similar.

Cosine Similarity: This metric measures the angle between two rating vectors (one for each recipe). Smaller angles (closer to zero) mean the recipes are more similar.

KNN Model: This model identifies the nearest neighbors (similar recipes) based on the cosine similarity of their ratings.

Print Recommendations: The model prints out the names of these similar recipes, along with a similarity score.

In [None]:
combined_df=pd.merge(rating_df, recipes[['RecipeId', 'Name']], on='RecipeId' )
combined_df.isnull().sum()

In [None]:
combined_df.nunique()

In [None]:
combined_df_pivot = combined_df.pivot(index='Name', columns='AuthorId', values='Rating').fillna(0)
# the matrix is highhly sparse, so i can introduce csr sparse matrix
combined_df_matrix = csr_matrix(combined_df_pivot.values)
combined_df_matrix.shape

In [None]:
combined_df_pivot


In [None]:
print("Shape of sparse matrix:", combined_df_matrix.shape)
print("Nonzero values:", combined_df_matrix.data)
print("Column indices of nonzero values:", combined_df_matrix.indices)
print("Index pointer array:", combined_df_matrix.indptr)

In [None]:
knn = NearestNeighbors(metric='cosine', algorithm='brute')
knn.fit(combined_df_matrix)


In [None]:
query_index = np.random.choice(combined_df_pivot.shape[0])
print(query_index)
# combined_df_pivot[1533]
# comb = combined_df_pivot.transpose()
# comb
combined_df_pivot.shape

In [None]:
distance, indices = knn.kneighbors(combined_df_pivot.iloc[query_index,:].values.reshape(1,-1), n_neighbors=10)


In [None]:
distance

In [None]:
for i in range(0,len(distance.flatten())):
  if i==0:
    print(f'Recommmendation for {combined_df_pivot.index[query_index]}')
  else:
    print(f'{i}: {combined_df_pivot.index[indices.flatten()[i]]} has a distance of {distance.flatten()[i]} ')

In [None]:
print(f'let us get the other users that also rated {combined_df_pivot.index[query_index]} '  )
combined_df_pivot.iloc[query_index, :][combined_df_pivot.iloc[query_index, :] > 0]

In [None]:
def recommend_knn(matrix, pivot_table, query_index, n_neighbors=10):
  """Print the nearest neighbours (cosine distance) of one row of ``pivot_table``.

  A brute-force ``NearestNeighbors`` model is fitted on ``matrix`` and queried
  with row ``query_index`` of ``pivot_table``. The first neighbour returned is
  reported as the query itself; the remaining ones are printed with their
  rank and distance. Nothing is returned.
  """
  model = NearestNeighbors(metric='cosine', algorithm='brute')
  model.fit(matrix)

  query_vector = pivot_table.iloc[query_index,:].values.reshape(1,-1)
  distance, indices = model.kneighbors(query_vector, n_neighbors=n_neighbors)
  flat_distance = distance.flatten()
  flat_indices = indices.flatten()

  for rank, (dist, row) in enumerate(zip(flat_distance, flat_indices)):
    if rank == 0:
      # Rank 0 is the header line for the query row.
      print(f'Recommmendation for {pivot_table.index[query_index]}')
      continue
    print(f'{rank}: {pivot_table.index[row]} has a distance of {dist} ')

  return

In [None]:
# query_index = np.random.choice(combined_df_pivot.shape[0])

recommend_knn(combined_df_matrix, combined_df_pivot, query_index)


In [None]:
recipes[recipes['Name']=='Crock Pot Ravioli']

In [None]:
def recommend_knn(matrix, pivot_table, query_index, n_neighbors=10):
    """Print the nearest neighbours of one row and plot them in 2-D PCA space.

    A brute-force cosine ``NearestNeighbors`` model is fitted on ``matrix`` and
    queried with row ``query_index`` of ``pivot_table``. The neighbours are
    printed with rank and distance, then ``matrix`` is projected to two PCA
    components and the query point (red) and its neighbours (blue, annotated
    with their rank) are drawn. Nothing is returned.
    """
    model = NearestNeighbors(metric='cosine', algorithm='brute')
    model.fit(matrix)

    query_vector = pivot_table.iloc[query_index, :].values.reshape(1, -1)
    distance, indices = model.kneighbors(query_vector, n_neighbors=n_neighbors)
    flat_distance = distance.flatten()
    flat_indices = indices.flatten()

    for rank, (dist, row) in enumerate(zip(flat_distance, flat_indices)):
        if rank == 0:
            # Rank 0 is the header line for the query row.
            print(f'Recommendation for {pivot_table.index[query_index]}')
            continue
        print(f'{rank}: {pivot_table.index[row]} has a distance of {dist}')

    # Project every row to two dimensions; 'arpack' is the solver chosen so
    # that a sparse matrix can be passed in.
    projector = PCA(n_components=2, svd_solver='arpack')
    coords = pd.DataFrame(projector.fit_transform(matrix), columns=['pca0', 'pca1'])

    # 2-D coordinates of the query row and of the returned neighbours.
    query_point = coords.iloc[query_index]
    neighbour_points = coords.iloc[flat_indices]

    plt.figure(figsize=(10, 8))

    # Query row in red.
    plt.scatter(query_point['pca0'], query_point['pca1'], c='red', label='Query Point')

    # Neighbours in blue; the first returned neighbour is skipped because it
    # is treated as the query itself.
    plt.scatter(neighbour_points['pca0'][1:], neighbour_points['pca1'][1:], c='blue', label='Nearest Neighbors')

    # Label each neighbour with its rank.
    for rank in range(1, len(flat_distance)):
        plt.annotate(f'{rank}', (neighbour_points['pca0'].iloc[rank], neighbour_points['pca1'].iloc[rank]))

    plt.xlabel('PCA Component 1')
    plt.ylabel('PCA Component 2')
    plt.legend()
    plt.title('KNN Visualization using PCA')
    plt.show()


# query_index = np.random.choice(combined_df_pivot.shape[0])

recommend_knn(combined_df_matrix, combined_df_pivot, query_index)

### Hybrid recommendation (Content Based + Collaborative )


In [None]:
def hybrid_recommendation(user_id, item_id, rating_matrix, corr_matrix, combined_df_matrix, combined_df_pivot, knn, n_recomm=5, top_n=15):
    """Combine content-based and item-based collaborative recommendations.

    The content-based neighbours of ``item_id`` come first, followed by the
    Pearson item-based recommendations for ``user_id``. Duplicate recipe names
    are removed while keeping that order, so the result is deterministic and
    truncation to ``top_n`` always drops the lowest-priority entries (a plain
    ``set`` would return the names in an arbitrary order).

    Parameters
    ----------
    user_id : hashable
        User whose ratings drive the collaborative part.
    item_id : int
        Positional index of the seed recipe for the content-based part.
    rating_matrix, corr_matrix : pandas.DataFrame
        Passed through to ``recommend_item_pearson``.
    combined_df_matrix, combined_df_pivot, knn
        Currently unused; kept so existing calls keep working.
    n_recomm : int, default 5
        Number of collaborative recommendations requested.
    top_n : int, default 15
        Maximum number of names returned.

    Returns
    -------
    list of str
        Unique recipe names, content-based first.
    """
    # Content-based part.
    content_based_recommendations = get_food_recipe_cos_content_based(item_id)

    # Collaborative part (item-item Pearson).
    item_based_recommendations = recommend_item_pearson(user_id, rating_matrix, corr_matrix, n_recomm)

    # dict.fromkeys removes duplicates and preserves first-seen order.
    all_names = content_based_recommendations['Name'].tolist() + item_based_recommendations['Name'].tolist()
    combined_recommendations = list(dict.fromkeys(all_names))

    return combined_recommendations[:top_n]

user_id = 1535
item_id = 4
recommendations = hybrid_recommendation(user_id, item_id, rating_matrix, corr_matrix, combined_df_matrix, combined_df_pivot, knn)

# Print the recommendations outside the function
print(f"Hybrid recommendations for user {user_id} and item {item_id}:")
for i, recommendation in enumerate(recommendations, start=1):
    print(f"{i}. {recommendation}")


### Graph neural network


In [None]:
del reviews
del recipes_cp
del recipes


In [None]:
# A User can review multiple time
Merged_df2[Merged_df2['RecipeId'] == 56]

In [None]:
# Restrict the merged data to recent recipes: the filter below keeps rows whose
# DatePublished is on or after 2014-01-01 and not later than 2020-12-31.

# Parse DatePublished so it can be compared against the date bounds below.
Merged_df2['DatePublished'] = pd.to_datetime(Merged_df2['DatePublished'])
# AuthorId is cast to a 64-bit integer.
Merged_df2['AuthorId'] = Merged_df2['AuthorId'].astype('int64')
# Select recipes published from 2014-01-01 through 2020-12-31.
# NOTE(review): the upper bound is the bare date '2020-12-31'; if DatePublished
# carries a time of day, rows later on that day may be excluded - confirm.
filtered_df = Merged_df2[(Merged_df2['DatePublished'] >= '2014-01-01') & (Merged_df2['DatePublished'] <= '2020-12-31')]
filtered_df.info()

In [None]:
filtered_df['Rating'].value_counts()

In [None]:
rec = filtered_df[['RecipeId', 'Name', 'RecipeCategory', 'Description', 'Calories', 'AuthorId'  ]]
rev = filtered_df[['AuthorId', 'RecipeId', 'ReviewId', 'Rating']]

In [None]:
rating_df = rev
recipe_fil_df = rec

In [None]:
# let's select the rows that has the same recipe ID as the recipe_df

# merged_temp = recipe_df.merge(reviews, on='RecipeId', how='left')
# merged_temp.sort_values(by='RecipeId')
# rating_df = merged_temp[['AuthorId', 'RecipeId', 'Rating']]
# rating_df

In [None]:
subset_rating = (
    review_df.groupby('AuthorId')['Rating']
    .agg(['count'])
    .rename(columns={'count': 'AutCount'})
    .sort_values(by='AutCount', ascending=False)
    .reset_index()  # This line ensures the RecipeId is part of the DataFrame
)

In [None]:
rating_df.nunique()

In [None]:
# Distinct recipe categories in order of first appearance.
# Series.unique() returns the same values in the same order as appending each
# unseen item to a list, without the quadratic `item not in list` scan.
categories = recipe_fil_df['RecipeCategory'].unique().tolist()
print(categories)

In [None]:
len(categories)

In [None]:
categories = list(set(categories))

In [None]:
# rating_df.groupby('RecipeCategory').agg('count')[5].plot(kind='bar')

category_counts = recipe_fil_df.groupby('RecipeCategory').size().sort_values(ascending=False)

# Plot the top 20 as a bar chart
category_counts.head(20).plot(kind='bar', figsize=(10, 6))

plt.xlabel('Recipe Category')
plt.ylabel('Count')
plt.title('Count of Recipes by Category')
plt.xticks(rotation=75)
plt.show()

In [None]:
label_encoder = LabelEncoder()
# onehot_encoder = OneHotEncoder(sparse_output=False)
label_enc_cat = label_encoder.fit_transform(categories)

In [None]:
recipe_fil_df['RecipeCategoryEnc'] = label_encoder.transform(recipe_fil_df['RecipeCategory'])
recipe_fil_df

In [None]:
enc_shape = label_enc_cat.reshape(len(label_enc_cat), 1)
enc_shape

In [None]:
ohe = OneHotEncoder(sparse_output=False)
ohe_encode = ohe.fit_transform(enc_shape)
# ohe.set_output(transform="default")
np.array(ohe_encode)

In [None]:
categories


In [None]:
def vector_to_cat(vector) -> str:
  """Map a one-hot (or score) vector back to its category name.

  The position of the largest entry is decoded with the module-level
  ``label_encoder``.
  """
  hot_position = np.argmax(vector)
  decoded = label_encoder.inverse_transform([hot_position])
  return decoded[0]

# Convert a category name to its one-hot feature vector.
def cat_to_vector(category):
  """Encode one category name as a one-hot vector.

  The name is first mapped to its integer label with the module-level
  ``label_encoder`` and that label is then expanded by the module-level
  one-hot encoder ``ohe``. The first (only) encoded row is returned as a
  NumPy array.
  """
  label_column = label_encoder.transform([category]).reshape(-1, 1)
  one_hot_rows = np.array(ohe.transform(label_column))
  return one_hot_rows[0]


In [None]:
class TextEncoder:
  """Callable wrapper around a SentenceTransformer model.

  Calling an instance encodes ``value`` with gradients disabled and returns the
  resulting tensor moved to the CPU.
  """

  def __init__(self, model='all-MiniLM-L6-v2', device=None):
    # The same device is given to the model constructor and to every encode call.
    self.device = device
    self.model = SentenceTransformer(model, device=self.device)

  def __call__(self, value: list):
    """Encode ``value`` and return the embedding as a CPU tensor."""
    with torch.no_grad():
      embedding = self.model.encode(
          value,
          show_progress_bar=True,
          convert_to_tensor=True,
          device=self.device,
      )
    return embedding.cpu()


device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f'Device: {device}')

encoder = TextEncoder(device=device)

In [None]:
rating = rating_df[['AuthorId', 'RecipeId', 'Rating']].reset_index(drop=True)
rating

In [None]:
# recipe_df = recipe_df[['RecipeId', 'Name', 'RecipeCategory', 'Description', 'Keywords', 'Calories'  ]]

In [None]:
# Build the user lookup table: every distinct AuthorId found in `rating` is
# paired with a consecutive integer (mappedId) that is used as the user's node
# index in the graph.
user_ids = rating['AuthorId'].unique()
unique_user_id = pd.DataFrame({
    'userId': user_ids,
    'mappedId': pd.RangeIndex(len(user_ids)),
})

unique_user_id

In [None]:
# Recipe lookup table: every distinct RecipeId gets a consecutive mappedId
# (its node index in the graph).
recipe_ids = recipe_fil_df['RecipeId'].unique()
unique_recipe_id = pd.DataFrame({
    'recipeId': recipe_ids,
    'mappedId': pd.RangeIndex(len(recipe_ids)),
})

# Attach the encoded category of each recipe, then drop the duplicated key
# column and any fully identical rows produced by the join.
unique_recipe_id = (
    unique_recipe_id
    .merge(
        recipe_fil_df[['RecipeId', 'RecipeCategoryEnc']],
        left_on='recipeId',
        right_on='RecipeId',
        how='left',
    )
    .drop(columns='RecipeId')
    .drop_duplicates()
    .reset_index(drop=True)
)
unique_recipe_id

In [None]:
print(f'The number of unique users: {unique_user_id.shape[0]}')
print(f'The number of unique recipes: {unique_recipe_id.shape[0]}')

In [None]:
# creating a vector for each of the category
recipe_fil_df['RecipeCategoryvector'] = recipe_fil_df['RecipeCategory'].apply(cat_to_vector)



In [None]:
len(recipe_fil_df['RecipeCategoryvector'])

In [None]:
# creating a recipe feature torch.tensor

recipe_features = torch.zeros(len(unique_recipe_id), len(ohe_encode))
recipe_features.shape

In [None]:
recipe_fil_df.isnull().sum()

In [None]:
# Creating the recipe feature tensor: one row per mapped recipe, holding the
# sentence embedding of the description followed by the one-hot category vector.

# The embedding width is read from the sentence-transformer model instead of
# being hard-coded (it was the literal 384 before).
embedding_dim = encoder.model.get_sentence_embedding_dimension()
recipe_features = torch.zeros(len(unique_recipe_id), embedding_dim + len(ohe_encode))


def convert_to_embed(x):
  """Fill the ``recipe_features`` row that belongs to one recipe.

  Args:
    x: one row of ``recipe_fil_df``; the fields 'RecipeId', 'Description' and
      'RecipeCategoryvector' are read.

  Returns:
    None. The notebook-level ``recipe_features`` tensor is updated in place.

  Raises:
    IndexError: if the row's RecipeId has no entry in ``unique_recipe_id``.
  """
  recipe = x['RecipeId']
  # Row index of this recipe inside recipe_features.
  mapped_id = unique_recipe_id.loc[unique_recipe_id['recipeId'] == recipe, 'mappedId'].values[0]
  one_hot = torch.tensor(x['RecipeCategoryvector'])

  description = x['Description']
  # Empty descriptions were already encoded as the placeholder 'None'; missing
  # values (NaN/None) and other non-string entries now get the same treatment
  # instead of being handed to the encoder.
  if not isinstance(description, str) or description == '':
    description = 'None'

  recipe_features[mapped_id] = torch.cat((encoder(description), one_hot), -1)

In [None]:
recipe_fil_df.apply(lambda x: convert_to_embed(x), axis=1)

In [None]:
recipe_features[1233]

In [None]:
# I will also try creating user-user edges:
# for each user, collect the list of other users who rated the same product.

In [None]:
# creating user-recipe edge tensor
#
# Raw ids are translated to node indices with two lookup Series instead of one
# boolean-mask search per rating row.
user_lookup = unique_user_id.set_index('userId')['mappedId']
# Keep the first mapping per recipe, which is the entry `.values[0]` selected.
recipe_lookup = (
    unique_recipe_id
    .drop_duplicates(subset='recipeId')
    .set_index('recipeId')['mappedId']
)

mapped_users = rating['AuthorId'].map(user_lookup)
mapped_recipes = rating['RecipeId'].map(recipe_lookup)

# A rating whose user or recipe has no mappedId cannot become an edge.
unmapped = mapped_users.isna() | mapped_recipes.isna()
if unmapped.any():
  raise KeyError(
      f'{int(unmapped.sum())} rating rows reference a user or recipe without a mappedId'
  )

# Row 0 holds the user indices and row 1 the recipe indices: shape (2, n_ratings).
user_recipe_edges = torch.tensor(
    np.vstack([
        mapped_users.astype('int64').to_numpy(),
        mapped_recipes.astype('int64').to_numpy(),
    ]),
    dtype=torch.long,
)
user_recipe_ratings = torch.tensor(rating['Rating'].to_numpy(), dtype=torch.float)

print(user_recipe_edges.shape)
print(user_recipe_ratings.shape)

In [None]:
unique_recipe_id.sort_values(by='RecipeCategoryEnc', inplace=True)
# unique_recipe_id.reset_index(drop=True, inplace=True)
unique_recipe_id

In [None]:

from collections import defaultdict
from itertools import combinations


# Create a mapping from RecipeCategoryEnc to the list of mappedIds in that category
category_to_recipes = defaultdict(list)
for category, mapped_id in zip(unique_recipe_id['RecipeCategoryEnc'], unique_recipe_id['mappedId']):
    category_to_recipes[category].append(mapped_id)

# Generate edges: every unordered pair of recipes that share a category.
# NOTE: the number of pairs grows quadratically with the size of a category.
recipe_recipe_edges = [
    [first, second]
    for recipes in category_to_recipes.values()
    for first, second in combinations(recipes, 2)
]

# Convert to a (2, n_edges) tensor. The reshape keeps a well-formed (2, 0)
# result when no pair exists at all.
recipe_recipe_edges = (
    torch.tensor(recipe_recipe_edges, dtype=torch.long)
    .reshape(-1, 2)
    .t()
    .contiguous()
)
recipe_recipe_edges.shape


In [None]:
# recipe_recipe_edge = []
# for recipe_info in recipe_df.itertuples():
#   recipe_recipe_edge.append([ unique_recipe_id.loc[unique_recipe_id['recipeId'] == recipe_info.RecipeId, 'mappedId'].values[0],
#                              unique_recipe_id.loc[unique_recipe_id['RecipeCategoryEnc'] == recipe_info.RecipeCategoryEnc, 'mappedId'].values[0]])
# recipe_recipe_edge = torch.t(torch.tensor(recipe_recipe_edge, dtype=torch.long))
# recipe_recipe_edge.shape


In [None]:
recipe_features.shape

In [None]:
# creating a heterogeneous graph
from  torch_geometric.data import HeteroData
import torch_geometric.transforms as T

data = HeteroData()

data["User"].node_id = torch.arange(len(unique_user_id))
data["Recipe"].node_id = torch.arange(len(unique_recipe_id))
# Adding node features to the HeteroData object
data['Recipe'].x = recipe_features
# Users get one-hot identity features. The matrix is sized from unique_user_id,
# the same table that defines the User node ids above and the user indices in
# user_recipe_edges; it was previously sized from recipe_fil_df['AuthorId'],
# which is a different column of a different frame.
data['User'].x = torch.eye(len(unique_user_id))

# Adding edges indices to the HeteroData object
data['User', 'RATING', 'Recipe'].edge_index = user_recipe_edges
data['User', 'RATING', 'Recipe'].edge_label = user_recipe_ratings
data['Recipe', 'CATEGORY', 'Recipe'].edge_index= recipe_recipe_edges

data = T.ToUndirected()(data)

# The reverse rating edges must not carry the rating labels.
del data['Recipe', 'rev_RATING', 'User'].edge_label
# del data['Recipe', 'rev_CATEGORY', 'Recipe'].edge_label

data

# the increase in the edge index to [2, 61324] is because of the reverse edges created

In [None]:
data['Recipe', 'CATEGORY', 'Recipe'].edge_index.shape

In [None]:
from torch_geometric.transforms import RandomLinkSplit

rating_edge_type = ('User', 'RATING', 'Recipe')
category_edge_type = ('Recipe', 'CATEGORY', 'Recipe')

# Ratings contain no fractional values (the range is 1 to 5), so the labels
# are stored as integers.
data[rating_edge_type].edge_label = data[rating_edge_type].edge_label.long()

# Split both edge types into train / validation / test edges; no negative
# edges are sampled.
# NOTE(review): ToUndirected appears to merge the reverse edges of a
# Recipe-to-Recipe relation into the original edge type, so a
# ('Recipe', 'rev_CATEGORY', 'Recipe') type may not exist in `data` - confirm.
split_options = dict(
    num_val=0.2,
    num_test=0.1,
    disjoint_train_ratio=0.3,
    neg_sampling_ratio=0.0,
    add_negative_train_samples=False,
    edge_types=[rating_edge_type, category_edge_type],
    rev_edge_types=[('Recipe', 'rev_RATING', 'User'), ('Recipe', 'rev_CATEGORY', 'Recipe')],
)
transform = T.RandomLinkSplit(**split_options)
train_data, val_data, test_data = transform(data)

In [None]:
train_data

In [None]:
G = nx.Graph()

# User and Recipe node indices both start at 0, so raw indices would merge
# user i and recipe i into one node. Every node is therefore keyed by the
# initial of its node type plus its index (e.g. 'U3', 'R3').
for edge_type in data.edge_types:
    if edge_type == ('Recipe', 'rev_RATING', 'User'):
        continue  # reverse copy of the RATING edges; redundant in an undirected graph
    src_type, _, dst_type = edge_type
    src, dst = data[edge_type].edge_index
    G.add_edges_from(
        (f'{src_type[0]}{s}', f'{dst_type[0]}{d}')
        for s, d in zip(src.tolist(), dst.tolist())
    )

# Plotting the graph
plt.figure(figsize=(12, 8))  # Adjust the figure size as needed

# Users and recipes are drawn in different colours.
node_colors = ['skyblue' if node.startswith('U') else 'lightgreen' for node in G.nodes]
nx.draw(G, with_labels=True, node_size=300, font_size=10, node_color=node_colors, edge_color='gray', linewidths=0.5, font_color='black')

plt.title('Heterogeneous Graph User_Recipe')
plt.show()

#### Model Building

In [None]:
# ENCODER
# hidden_channels is the dimensionality learned for each node
# NOTE: the next cell defines GNNEncoder again; when that cell is run it
# replaces this definition.
class GNNEncoder(torch.nn.Module):
  """Two-layer SAGEConv encoder producing node embeddings.

  The GNNModel class in this cell wraps an instance with ``to_hetero``.
  """
  def __init__(self, hidden_channels, out_channels):
    super().__init__()
    # (-1, -1) presumably asks PyG to infer the input feature sizes lazily -
    # confirm against the SAGEConv documentation.
    self.conv1 = SAGEConv((-1, -1), hidden_channels)
    self.conv2 = SAGEConv((-1, -1), out_channels)

  def forward(self, x, edge_index):
    """Apply conv1 followed by ReLU, then conv2 without an activation."""
    x = self.conv1(x, edge_index).relu()
    x = self.conv2(x, edge_index)
    return x

# DECODER
class EdgeDecoder(torch.nn.Module):
  """Two-layer MLP that scores (User, Recipe) pairs from their embeddings."""
  def __init__(self, hidden_channels):
    super().__init__()
    # The input is the user embedding concatenated with the recipe embedding.
    concat_width = 2 * hidden_channels
    self.lin1 = torch.nn.Linear(concat_width, hidden_channels)
    self.lin2 = torch.nn.Linear(hidden_channels, 1)

  def forward(self, z_dict, edge_label_index):
    """Return one score per edge in ``edge_label_index`` as a flat tensor."""
    user_idx, recipe_idx = edge_label_index
    pair_embedding = torch.cat(
        (z_dict['User'][user_idx], z_dict['Recipe'][recipe_idx]), dim=-1
    )
    hidden = torch.relu(self.lin1(pair_embedding))
    return self.lin2(hidden).view(-1)

# ENCODER-DECODER MODEL
class GNNModel(torch.nn.Module):
  """Heterogeneous GNN encoder followed by an edge-level rating decoder.

  Args:
    hidden_channels: embedding width produced by the encoder and consumed by
      the decoder.
    metadata: ``(node_types, edge_types)`` handed to ``to_hetero``. When left
      as ``None`` the metadata of the notebook-level ``data`` graph is used,
      which is what the class was previously hard-wired to.
  """
  def __init__(self, hidden_channels, metadata=None):
    super().__init__()
    if metadata is None:
      metadata = data.metadata()
    self.encoder = GNNEncoder(hidden_channels, hidden_channels)
    self.encoder = to_hetero(self.encoder, metadata, aggr='mean')
    self.decoder = EdgeDecoder(hidden_channels)

  def forward(self, x_dict, edge_index_dict, edge_label_index):
    """Encode all nodes, then score the edges listed in ``edge_label_index``."""
    # Every edge index is cast to long and viewed as (2, n_edges) before encoding.
    edge_index_dict = {k: v.to(torch.long).view(2, -1) for k, v in edge_index_dict.items()}
    z_dict = self.encoder(x_dict, edge_index_dict)
    return self.decoder(z_dict, edge_label_index)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = GNNModel(hidden_channels=32).to(device)
print(model)

In [None]:
import torch
import torch.nn.functional as F
from torch_geometric.nn import GATConv, to_hetero

# ENCODER
# hidden_channels is the dimensionality learned for each node
class GNNEncoder(torch.nn.Module):
    """Three-layer SAGEConv encoder producing node embeddings.

    Layer widths are hidden_channels // 4, hidden_channels // 2 and
    out_channels. The GNNModel class in this cell wraps an instance with
    ``to_hetero``.
    """
    def __init__(self, hidden_channels, out_channels):
        super().__init__()
        # (-1, -1) presumably asks PyG to infer the input feature sizes lazily -
        # confirm against the SAGEConv documentation.
        self.conv1 = SAGEConv((-1, -1), hidden_channels//4)
        self.conv2 = SAGEConv((-1, -1), hidden_channels//2)
        self.conv3 = SAGEConv((-1, -1), out_channels)

    def forward(self, x, edge_index):
        """Apply the three convolutions with ReLU after the first two."""
        x = self.conv1(x, edge_index).relu()
        x = self.conv2(x, edge_index).relu()
        x = self.conv3(x, edge_index)
        return x

# DECODER
class EdgeDecoder(torch.nn.Module):
    """Three-layer MLP that scores (User, Recipe) pairs from their embeddings."""
    def __init__(self, hidden_channels):
        super().__init__()
        # The input is the user embedding concatenated with the recipe embedding.
        concat_width = 2 * hidden_channels
        half_width = hidden_channels // 2
        self.lin1 = torch.nn.Linear(concat_width, hidden_channels)
        self.lin2 = torch.nn.Linear(hidden_channels, half_width)
        self.lin3 = torch.nn.Linear(half_width, 1)

    def forward(self, z_dict, edge_label_index):
        """Return one score per edge in ``edge_label_index`` as a flat tensor."""
        user_idx, recipe_idx = edge_label_index
        hidden = torch.cat(
            (z_dict['User'][user_idx], z_dict['Recipe'][recipe_idx]), dim=-1
        )
        for layer in (self.lin1, self.lin2):
            hidden = torch.relu(layer(hidden))
        return self.lin3(hidden).view(-1)

# ENCODER-DECODER MODEL
class GNNModel(torch.nn.Module):
    """Heterogeneous GNN encoder followed by an edge-level rating decoder.

    Args:
        hidden_channels: embedding width produced by the encoder and consumed
            by the decoder.
        metadata: ``(node_types, edge_types)`` handed to ``to_hetero``. When
            left as ``None`` the metadata of the notebook-level ``data`` graph
            is used, which is what the class was previously hard-wired to.
    """
    def __init__(self, hidden_channels, metadata=None):
        super().__init__()
        if metadata is None:
            metadata = data.metadata()
        self.encoder = GNNEncoder(hidden_channels, hidden_channels)
        self.encoder = to_hetero(self.encoder, metadata, aggr='mean')
        self.decoder = EdgeDecoder(hidden_channels)

    def forward(self, x_dict, edge_index_dict, edge_label_index):
        """Encode all nodes, then score the edges listed in ``edge_label_index``."""
        # Every edge index is cast to long and viewed as (2, n_edges) before encoding.
        edge_index_dict = {k: v.to(torch.long).view(2, -1) for k, v in edge_index_dict.items()}
        z_dict = self.encoder(x_dict, edge_index_dict)
        return self.decoder(z_dict, edge_label_index)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = GNNModel(hidden_channels=32).to(device)
print(model)


In [None]:
# import torch
# import torch.nn.functional as F
# import torch.optim as optim

# Define the optimizer
optimizer = torch.optim.Adam(model.parameters(), lr=0.0001)
# optimizer = torch.optim.SGD(model.parameters(), lr=0.0001, momentum=0.9)

# Define the training function
def train():
    """Run one full-batch optimisation step on ``train_data``.

    The loss is the square root of the MSE between the predicted and the true
    ratings of the supervision edges. Returns the loss as a Python float.
    """
    model.train()
    optimizer.zero_grad()
    rating_edges = train_data['User', 'Recipe']
    predictions = model(
        train_data.x_dict,
        train_data.edge_index_dict,
        rating_edges.edge_label_index,
    )
    rmse_loss = F.mse_loss(predictions, rating_edges.edge_label.float()).sqrt()
    rmse_loss.backward()
    optimizer.step()
    return float(rmse_loss)

# Define the test function
def test(data):
    """Return the RMSE of ``model`` on the rating edges of ``data``.

    ``data`` is moved to ``device``, the model is put in eval mode and the
    predictions are clamped to the range [0, 5] before the error is computed.
    """
    with torch.no_grad():
        data = data.to(device)
        model.eval()
        rating_edges = data['User', 'Recipe']
        predictions = model(
            data.x_dict,
            data.edge_index_dict,
            rating_edges.edge_label_index,
        ).clamp(min=0, max=5)
        rmse = F.mse_loss(predictions, rating_edges.edge_label.float()).sqrt()
        return float(rmse)

# Training loop
num_epochs = 1000
train_loss = []   # per-epoch RMSE on the training split (measured in eval mode)
valid_loss = []   # per-epoch RMSE on the validation split
best_val_rmse = float('inf')
best_epoch = -1

# Device placement does not change between epochs, so it is done once here
# instead of on every iteration of the loop.
train_data = train_data.to(device)
val_data = val_data.to(device)

for epoch in range(1, num_epochs + 1):
    loss = train()
    train_rmse = test(train_data)
    val_rmse = test(val_data)
    train_loss.append(train_rmse)
    valid_loss.append(val_rmse)

    if val_rmse < best_val_rmse:
        best_val_rmse = val_rmse
        best_epoch = epoch
        # Save the best model (lowest validation RMSE so far)
        torch.save(model.state_dict(), 'SAGE_best_model.pth')

    print(f'Epoch: {epoch:03d}, Loss: {loss:.4f}, Train_RMSE: {train_rmse:.4f}, '
          f'Val_RMSE: {val_rmse:.4f}')

print(f'Best Validation RMSE: {best_val_rmse:.4f} at Epoch: {best_epoch}')


In [None]:
import matplotlib.pyplot as plt

# Per-epoch RMSE curves recorded by the training loop.
for curve in (train_loss, valid_loss):
    plt.plot(curve)
plt.title('model loss')
plt.xlabel('epoch')
plt.ylabel('loss')
plt.legend(['Train', 'Validation'], loc='upper left')
plt.show()


In [None]:


# Load the best model's state_dict. map_location lets a checkpoint that was
# saved on a GPU be restored on a CPU-only runtime as well.
model.load_state_dict(torch.load('SAGE_best_model.pth', map_location=device))
# Move model to the appropriate device
model.to(device)
# Set model to evaluation mode
model.eval()
# Move test data to device
test_data = test_data.to(device)

# Make predictions
with torch.no_grad():
    pred = model(test_data.x_dict, test_data.edge_index_dict,
                 test_data['User', 'Recipe'].edge_label_index)
    pred = pred.clamp(min=0, max=5)  # Ensure predictions are within valid rating range
    target = test_data['User', 'Recipe'].edge_label.float()
    rmse = F.mse_loss(pred, target).sqrt()
    print(f'Test RMSE: {rmse:.4f}')

    # Extract the user and recipe of every test edge. These are the mapped
    # node indices used in the graph, not the raw AuthorId / RecipeId values.
    userId = test_data['User', 'Recipe'].edge_label_index[0].cpu().numpy()
    recipeId = test_data['User', 'Recipe'].edge_label_index[1].cpu().numpy()

    # Convert predictions and targets to numpy
    pred = pred.cpu().numpy()
    target = target.cpu().numpy()

    # Create a DataFrame with the results
    result = pd.DataFrame({'UserId': userId, 'RecipeId': recipeId, 'PredictedRating': pred, 'ActualRating': target})

result


In [None]:
# with torch.no_grad():
#   test_data = test_data.to(device)
#   pred = model(test_data.x_dict, test_data.edge_index_dict,
#                test_data['User', 'Recipe'].edge_label_index)
#   pred = pred.clamp(min=0, max=5)
#   target = test_data['User', 'Recipe'].edge_label.float()
#   rmse = F.mse_loss(pred, target).sqrt()
#   print(f'Test RMSE: {rmse:.4f}')

# userId = test_data['User', 'Recipe'].edge_label_index[0].cpu().numpy()
# recipeId = test_data['User', 'Recipe'].edge_label_index[1].cpu().numpy()

# pred = pred.cpu().numpy()
# target = target.cpu().numpy()

# result = pd.DataFrame({'UserId': userId, 'RecipeId': recipeId, 'PredictedRating': pred, 'ActualRating': target})
# result

#### GAT (Graph Attention Network)

In [None]:
# import torch
# import torch.nn.functional as F
# from torch_geometric.nn import GATConv, to_hetero

# ENCODER
# hidden_channels is the dimensionality learned for each node
class GNNEncoder(torch.nn.Module):
    """Three-layer GATConv encoder producing node embeddings.

    The first two layers use two attention heads and the last layer one; all
    three are created with add_self_loops=False. The GNNModel class in this
    cell wraps an instance with ``to_hetero``.
    """
    def __init__(self, hidden_channels, out_channels):
        super().__init__()
        # (-1, -1) presumably asks PyG to infer the input feature sizes lazily -
        # confirm against the GATConv documentation.
        self.conv1 = GATConv((-1, -1), hidden_channels//4, heads=2, add_self_loops=False)
        self.conv2 = GATConv((-1, -1), hidden_channels//2, heads=2, add_self_loops=False)
        self.conv3 = GATConv((-1, -1), out_channels, heads=1, add_self_loops=False)

    def forward(self, x, edge_index):
        """Apply the three convolutions with ReLU after the first two."""
        x = self.conv1(x, edge_index).relu()
        x = self.conv2(x, edge_index).relu()
        x = self.conv3(x, edge_index)
        return x

# DECODER
class EdgeDecoder(torch.nn.Module):
    """Four-layer MLP that scores (User, Recipe) pairs from their embeddings."""
    def __init__(self, hidden_channels):
        super().__init__()
        # The input is the user embedding concatenated with the recipe embedding.
        concat_width = 2 * hidden_channels
        half_width = hidden_channels // 2
        quarter_width = hidden_channels // 4
        self.lin1 = torch.nn.Linear(concat_width, hidden_channels)
        self.lin2 = torch.nn.Linear(hidden_channels, half_width)
        self.lin3 = torch.nn.Linear(half_width, quarter_width)
        self.lin4 = torch.nn.Linear(quarter_width, 1)

    def forward(self, z_dict, edge_label_index):
        """Return one score per edge in ``edge_label_index`` as a flat tensor."""
        user_idx, recipe_idx = edge_label_index
        hidden = torch.cat(
            (z_dict['User'][user_idx], z_dict['Recipe'][recipe_idx]), dim=-1
        )
        for layer in (self.lin1, self.lin2, self.lin3):
            hidden = torch.relu(layer(hidden))
        return self.lin4(hidden).view(-1)

# ENCODER-DECODER MODEL
class GNNModel(torch.nn.Module):
    """Heterogeneous GNN encoder followed by an edge-level rating decoder.

    Args:
        hidden_channels: embedding width produced by the encoder and consumed
            by the decoder.
        metadata: ``(node_types, edge_types)`` handed to ``to_hetero``. When
            left as ``None`` the metadata of the notebook-level ``data`` graph
            is used, which is what the class was previously hard-wired to.
    """
    def __init__(self, hidden_channels, metadata=None):
        super().__init__()
        if metadata is None:
            metadata = data.metadata()
        self.encoder = GNNEncoder(hidden_channels, hidden_channels)
        self.encoder = to_hetero(self.encoder, metadata, aggr='mean')
        self.decoder = EdgeDecoder(hidden_channels)

    def forward(self, x_dict, edge_index_dict, edge_label_index):
        """Encode all nodes, then score the edges listed in ``edge_label_index``."""
        # Every edge index is cast to long and viewed as (2, n_edges) before encoding.
        edge_index_dict = {k: v.to(torch.long).view(2, -1) for k, v in edge_index_dict.items()}
        z_dict = self.encoder(x_dict, edge_index_dict)
        return self.decoder(z_dict, edge_label_index)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
GATmodel = GNNModel(hidden_channels=32).to(device)
print(GATmodel)


In [None]:
# Define the optimizer
optimizer = torch.optim.Adam(GATmodel.parameters(), lr=0.0001)
# optimizer = torch.optim.SGD(GATmodel.parameters(), lr=0.0001, momentum=0.9)


# Define the training function
def train():
    """Run one full-batch optimisation step of ``GATmodel`` on ``train_data``.

    The loss is the square root of the MSE between the predicted and the true
    ratings of the supervision edges. Returns the loss as a Python float.
    """
    GATmodel.train()
    optimizer.zero_grad()
    rating_edges = train_data['User', 'Recipe']
    predictions = GATmodel(
        train_data.x_dict,
        train_data.edge_index_dict,
        rating_edges.edge_label_index,
    )
    rmse_loss = F.mse_loss(predictions, rating_edges.edge_label.float()).sqrt()
    rmse_loss.backward()
    optimizer.step()
    return float(rmse_loss)

# Define the test function
def test(data):
    """Return the RMSE of ``GATmodel`` on the rating edges of ``data``.

    ``data`` is moved to ``device``, the model is put in eval mode and the
    predictions are clamped to the range [0, 5] before the error is computed.
    """
    with torch.no_grad():
        data = data.to(device)
        GATmodel.eval()
        rating_edges = data['User', 'Recipe']
        predictions = GATmodel(
            data.x_dict,
            data.edge_index_dict,
            rating_edges.edge_label_index,
        ).clamp(min=0, max=5)
        rmse = F.mse_loss(predictions, rating_edges.edge_label.float()).sqrt()
        return float(rmse)

# Training loop
num_epochs = 1500
train_loss = []   # per-epoch RMSE on the training split (measured in eval mode)
valid_loss = []   # per-epoch RMSE on the validation split
best_val_rmse = float('inf')
best_epoch = -1

# Device placement does not change between epochs, so it is done once here
# instead of on every iteration of the loop.
train_data = train_data.to(device)
val_data = val_data.to(device)

for epoch in range(1, num_epochs + 1):
    loss = train()
    train_rmse = test(train_data)
    val_rmse = test(val_data)
    train_loss.append(train_rmse)
    valid_loss.append(val_rmse)

    if val_rmse < best_val_rmse:
        best_val_rmse = val_rmse
        best_epoch = epoch
        # Save the best model (lowest validation RMSE so far)
        torch.save(GATmodel.state_dict(), 'GAT_best_model.pth')

    print(f'Epoch: {epoch:03d}, Loss: {loss:.4f}, Train_RMSE: {train_rmse:.4f}, '
          f'Val_RMSE: {val_rmse:.4f}')

print(f'Best Validation RMSE: {best_val_rmse:.4f} at Epoch: {best_epoch}')


In [None]:
import matplotlib.pyplot as plt

# Per-epoch RMSE curves recorded by the GAT training loop.
for curve in (train_loss, valid_loss):
    plt.plot(curve)
plt.title('model loss')
plt.xlabel('epoch')
plt.ylabel('loss')
plt.legend(['Train', 'Validation'], loc='upper left')
plt.show()



In [None]:
# Load the best model's state_dict. map_location lets a checkpoint that was
# saved on a GPU be restored on a CPU-only runtime as well.
GATmodel.load_state_dict(torch.load('GAT_best_model.pth', map_location=device))
# Move model to the appropriate device
GATmodel.to(device)
# Set model to evaluation mode
GATmodel.eval()
# Move test data to device
test_data = test_data.to(device)

# Make predictions
with torch.no_grad():
    pred = GATmodel(test_data.x_dict, test_data.edge_index_dict,
                 test_data['User', 'Recipe'].edge_label_index)
    pred = pred.clamp(min=0, max=5)  # Ensure predictions are within valid rating range
    target = test_data['User', 'Recipe'].edge_label.float()
    rmse = F.mse_loss(pred, target).sqrt()
    print(f'Test RMSE: {rmse:.4f}')

    # Extract the user and recipe of every test edge. These are the mapped
    # node indices used in the graph, not the raw AuthorId / RecipeId values.
    userId = test_data['User', 'Recipe'].edge_label_index[0].cpu().numpy()
    recipeId = test_data['User', 'Recipe'].edge_label_index[1].cpu().numpy()

    # Convert predictions and targets to numpy
    pred = pred.cpu().numpy()
    target = target.cpu().numpy()

    # Create a DataFrame with the results
    result = pd.DataFrame({'UserId': userId, 'RecipeId': recipeId, 'PredictedRating': pred, 'ActualRating': target})

result


In [None]:
max_user_review = rating_df.groupby('AuthorId')['ReviewId'].count().sort_values(ascending=False)
# max_recipe_review = rating_df.groupby('RecipeId').size().max()
max_user_review

In [None]:

max_user_id = max_user_review.idxmax()
max_user_id

In [None]:
# Let's rate a product not yet rated by the user.
# Node index of the user with the most reviews.
is_max_user = unique_user_id['userId'] == max_user_id
mapped_user_id = unique_user_id.loc[is_max_user, 'mappedId'].values[0]
print('the mapped id of user', max_user_id, 'who has the maximum review is' , mapped_user_id)

# Recipes this user has not rated, joined with their mapped node index.
recipe_rated = rating_df.loc[rating_df['AuthorId'] == max_user_id]
already_rated = recipe_fil_df['RecipeId'].isin(recipe_rated['RecipeId'])
recipe_unrated = recipe_fil_df.loc[~already_rated].merge(
    unique_recipe_id, left_on='RecipeId', right_on='recipeId', how='inner'
)
print('--------The sampled recipe that ha not been rated by the user-------------------')
# One unrated recipe drawn at random (no seed is set, so it changes per run).
recipe_item = recipe_unrated.sample(1)
recipe_item

In [None]:

recipe_item = recipe_item['mappedId'].item()
recipe_item

In [None]:
# The recipe has not been rated by this user yet, so let us predict the rating
# the user would give it.
# The edge is stored in the usual (2, n_edges) layout - row 0 the user index,
# row 1 the recipe index - and created on the same device as the data and the
# model; it used to be a flat CPU tensor of length 2.
edge_label_index = torch.tensor(
    [[mapped_user_id], [recipe_item]], dtype=torch.long, device=device
)
test_data = test_data.to(device)
# NOTE(review): this uses `model` (the SAGE network), not `GATmodel` - confirm
# that this is the intended model.
with torch.no_grad():
  pred = model(test_data.x_dict, test_data.edge_index_dict, edge_label_index)
  pred = pred.clamp(min=0, max=5).cpu().numpy()
print(pred.item())

In [None]:
# Integrated-Gradients explanation (via Captum) of the model output for the
# single edge in `edge_label_index`; only edge masks are requested.
model_config = dict(mode='regression', task_level='edge', return_type='raw')

explainer = Explainer(
    model=model,
    algorithm=CaptumExplainer('IntegratedGradients'),
    explanation_type='model',
    model_config=model_config,
    node_mask_type=None,
    edge_mask_type='object',
)

explanation = explainer(
    test_data.x_dict,
    test_data.edge_index_dict,
    index=0,
    edge_label_index=edge_label_index,
)
explanation = explanation.cpu().detach()

explanation

In [None]:
# Raw id and title of the recipe whose rating was predicted.
is_target_recipe = unique_recipe_id['mappedId'] == recipe_item
recipe_id = unique_recipe_id.loc[is_target_recipe, 'recipeId'].values[0]
recipe_title = recipe_fil_df.loc[recipe_fil_df['RecipeId'] == recipe_id, 'Name'].values[0]

# User -> Recipe edges with their attribution value.
user_to_recipe = explanation['User', 'Recipe'].edge_index.numpy().T
user_to_recipe_attr = explanation['User', 'Recipe'].edge_mask.numpy().T
user_to_recipe_df = pd.DataFrame(
    np.column_stack([user_to_recipe, user_to_recipe_attr]),
    columns=['mappedUserId', 'mappedRecipeId', 'attr'],
)

# Recipe -> User (reverse) edges with their attribution value.
recipe_to_user = explanation['Recipe', 'User'].edge_index.numpy().T
recipe_to_user_attr = explanation['Recipe', 'User'].edge_mask.numpy().T
recipe_to_user_df = pd.DataFrame(
    np.column_stack([recipe_to_user, recipe_to_user_attr]),
    columns=['mappedRecipeId', 'mappedUserId', 'attr'],
)

# Both directions in one table; stacking with the float attributions turned
# the ids into floats, so they are cast back to integers.
explanation_df = pd.concat([user_to_recipe_df, recipe_to_user_df]).astype(
    {'mappedUserId': int, 'mappedRecipeId': int}
)

print(f"Attribtion for all edges towards prediction of Recipe rating of Recipe: \n {recipe_title}")
print(explanation_df.sort_values(by='attr'))

In [None]:
print(mapped_user_id)

In [None]:
unique_recipe_id

In [None]:
explanation_df

In [None]:
def recipe_recommendation_GNN(explanation_df, recipe_fil_df, unique_recipe_id, mapped_user_id):
    """Return the five recipes with the largest absolute attribution for a user.

    Args:
        explanation_df: edge attributions with the columns 'mappedUserId',
            'mappedRecipeId' and 'attr'.
        recipe_fil_df: recipe table; 'RecipeId', 'Name', 'Calories' and
            'RecipeCategory' are read (duplicate RecipeId rows are dropped).
        unique_recipe_id: lookup table with the columns 'recipeId' and 'mappedId'.
        mapped_user_id: node index of the user whose edges are kept.

    Returns:
        DataFrame with the columns RecipeId, Name, Calories, RecipeCategory and
        attr, ordered by decreasing absolute attribution (at most five rows).

    Side effects:
        Sets the global pandas float display format to nine decimals and
        prints a heading line.
    """
    # Sum the attributions of this user's edges per recipe.
    belongs_to_user = explanation_df['mappedUserId'] == mapped_user_id
    per_recipe = explanation_df[belongs_to_user].groupby('mappedRecipeId').sum()

    # Mapped index -> raw recipe id -> recipe details.
    with_raw_ids = per_recipe.merge(
        unique_recipe_id, left_on='mappedRecipeId', right_on='mappedId', how='inner'
    )
    recipe_df_unique = recipe_fil_df.drop_duplicates(subset=['RecipeId'])
    detailed = with_raw_ids.merge(
        recipe_df_unique, left_on='recipeId', right_on='RecipeId', how='inner'
    )

    pd.options.display.float_format = "{:,.9f}".format

    print("Top products that contributed to the prediction")
    shown_columns = ['RecipeId', 'Name', 'Calories', 'RecipeCategory', 'attr']
    ranked = detailed.sort_values(by='attr', ascending=False, key=abs)
    return ranked[shown_columns].head(5)

recommendation = recipe_recommendation_GNN(explanation_df, recipe_fil_df, unique_recipe_id, mapped_user_id)
recommendation


### Hybrid Recommendation System (GNN + Content based)

In [None]:
# def hybrid_recipe_recommendation(explanation_df, recipe_df, unique_recipe_id, mapped_user_id, cos_mat):
#     gnn_recommendations = recipe_recommendation_GNN(explanation_df, recipe_df, unique_recipe_id, mapped_user_id)
#     content_based_recommendations = get_food_recipe_cos_content_based(gnn_recommendations.index)

#     # Assign weights to each system (adjust weights as needed)
#     gnn_weight = 0.7
#     content_based_weight = 0.3

#     # Combine recommendations using weighted average
#     combined_recommendations = gnn_recommendations.copy()
#     combined_recommendations['score'] = gnn_recommendations['attr'] * gnn_weight + content_based_recommendations['score'] * content_based_weight
#     combined_recommendations = combined_recommendations.sort_values(by='score', ascending=False)

#     return combined_recommendations

In [None]:
def hybrid_recipe_recommendation(explanation_df, recipe_fil_df, unique_recipe_id, mapped_user_id, recipe_title):
    """Combine the GNN-based and the content-based recommendations.

    The GNN part comes from ``recipe_recommendation_GNN``; the content-based
    part from ``get_food_recipe_cos_content_based``, looked up through the
    notebook-level ``ind`` mapping for ``recipe_title``. Both tables are
    stacked, the first row per RecipeId is kept and missing values are
    replaced by 0.

    Returns:
        DataFrame with one row per recommended RecipeId.
    """
    gnn_recommendations = recipe_recommendation_GNN(
        explanation_df, recipe_fil_df, unique_recipe_id, mapped_user_id
    )

    shown_columns = ['RecipeId', 'Name', 'Calories', 'RecipeCategory', 'Score']
    content_based_recommendations = get_food_recipe_cos_content_based(ind[recipe_title])[shown_columns]

    # Raw user id, used for the heading only.
    is_user = unique_user_id['mappedId'] == mapped_user_id
    user_id = unique_user_id.loc[is_user, 'userId'].values[0]
    print(f'Recommendation of Recipe to user with id {user_id} after interacting witth "{recipe_title}"')

    combined = pd.concat([gnn_recommendations, content_based_recommendations])
    return combined.drop_duplicates(subset=['RecipeId']).fillna(0)


In [None]:
recommendations = hybrid_recipe_recommendation(explanation_df, recipe_fil_df, unique_recipe_id, mapped_user_id, recipe_title)
recommendations