In [33]:
dining_file = './preprocessed-data/alldininghalls.csv'
ratings_file = './preprocessed-data/dining_ratings.csv'
users_file = './preprocessed-data/randomUserDataset.csv'

In [34]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn

In [35]:
df = pd.read_csv(dining_file)
ratings = pd.read_csv(ratings_file)
users = pd.read_csv(users_file)

In [36]:
# height ft-inch to inch
users['Height'] = users['Height'].apply(lambda x:int(x.split('\'')[0])*12+int(x.split('\'')[1]))

In [37]:
#### collaborative filtering
## find similar users
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.metrics.pairwise import cosine_similarity

# Function to find similar users
def find_similar_users(new_user_df, users_df, k=25):    
    # Compute cosine similarity between new user and existing users
    new_user_df.drop(columns = ['UserID'])
    users_df.drop(columns = ['UserID'])
    similarities = cosine_similarity(new_user_df, users_df)[0]
    
    # Find the top-k similar users
    top_k_similar_users_indices = similarities.argsort()[-k-1:-1][::-1]
    top_k_similar_users = users.iloc[top_k_similar_users_indices]
    
    return top_k_similar_users

# Normalize numerical features
numerical_features = ['Weight (lbs)', 'Height']
numerical_data = users[numerical_features]
scaler = StandardScaler()
numerical_data_scaled = scaler.fit_transform(numerical_data)
users[numerical_features] = numerical_data_scaled

# similar users in a dataframe
one_user = users[:1].copy() ## new user data frame
similar_users = find_similar_users(one_user, users)
display(similar_users)

Unnamed: 0,UserID,Weight (lbs),Height,Fish,Soybeans,Wheat,Gluten,Milk,Tree Nuts,Eggs,...,Vegetarian,Contains Nuts,gelfenbien,kosher,north,northwest,McMahon,putnam,south,whitney
3,4,0.718346,0.970583,0,1,1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,1
1,2,-0.926104,-0.726486,1,1,0,1,1,1,0,...,1,0,0,1,0,0,0,0,0,0
6,7,-0.033402,-1.242985,0,0,1,0,1,1,0,...,0,0,0,0,0,0,1,0,0,0
4,5,0.060566,1.708439,0,1,1,1,1,1,0,...,1,1,0,0,0,0,0,0,0,1
8,9,0.6557,0.749226,0,0,1,0,0,0,1,...,0,1,0,0,0,0,0,0,0,1
13,14,0.827976,1.487082,1,1,1,0,1,0,0,...,0,0,0,0,1,0,0,0,0,0
14,15,0.279826,1.487082,1,1,1,1,1,0,0,...,1,0,0,0,0,0,1,0,0,0
9,10,-0.753828,1.413296,1,0,1,0,0,0,0,...,1,1,0,0,0,0,1,0,0,0
2,3,-0.550229,1.634653,0,1,0,0,1,0,0,...,1,1,0,0,0,0,0,0,1,0
12,13,-1.192348,0.380298,1,1,0,1,1,1,1,...,1,1,0,0,0,1,0,0,0,0


In [38]:
# avg_ratings_users = ratings.groupby(by = ['userId','foodItem']).agg({
#     'rating':np.mean
# }).reset_index()

# ratings_similar_users = ratings[ratings['userId'].isin(similar_users['UserID'].tolist())].copy()
# avg_ratings_similar_users = ratings_similar_users.groupby(by = ['userId','foodItem']).agg({
#     'rating':np.mean
# }).reset_index()

In [39]:
# Define the model
class MatrixFactorization(nn.Module):
    def __init__(self, n_users, n_movies, n_factors=20):
        super(MatrixFactorization, self).__init__()
        self.user_factors = nn.Embedding(n_users, n_factors)
        self.movie_factors = nn.Embedding(n_movies, n_factors)
        # initializing our matrices with a positive number generally will yield better results
        self.user_factors.weight.data.uniform_(0, 0.5)
        self.movie_factors.weight.data.uniform_(0, 0.5)
        
    def forward(self, user, movie):
        return (self.user_factors(user) * self.movie_factors(movie)).sum(1)

In [40]:
# Load the MovieLens dataset
diningHalls = df.copy()
diningRates = ratings.copy()

# Preprocess the data
n_users = diningRates.userId.unique().shape[0]
n_foodItems = diningRates.foodItem.unique().shape[0]

# Convert movieId and userId into unique integers
user_map = {u: i for i, u in enumerate(diningRates.userId.unique())}
diningRates['user_id'] = diningRates['userId'].map(user_map)

dining_map = {m: i for i, m in enumerate(diningRates.foodItem.unique())}
diningRates['food_item'] = diningRates['foodItem'].map(dining_map)

# Create a matrix with users as rows and movies as columns
matrix = torch.zeros((n_users, n_foodItems))
for i, row in diningRates.iterrows():
    matrix[int(row.user_id), int(row.food_item)] = row.rating

In [41]:
model = MatrixFactorization(n_users, n_foodItems)
criterion = nn.MSELoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)

# Train the model
for i in range(50):
    optimizer.zero_grad()
    user = torch.LongTensor(diningRates.user_id)
    movie = torch.LongTensor(diningRates.food_item)
    rating = torch.FloatTensor(diningRates.rating)
    predictions = model(user, movie)
    loss = criterion(predictions, rating)
    loss.backward()
    optimizer.step()
    
    if i % 10 == 0:
        print(loss)

tensor(6.2892, grad_fn=<MseLossBackward0>)
tensor(6.2741, grad_fn=<MseLossBackward0>)
tensor(6.2592, grad_fn=<MseLossBackward0>)
tensor(6.2443, grad_fn=<MseLossBackward0>)
tensor(6.2295, grad_fn=<MseLossBackward0>)


In [42]:
# Make recommendations for a given user
def recommend_food(model, user_id, num_recommendations):
    with torch.no_grad():
        user = torch.LongTensor([user_map[user_id]])
        movies = torch.arange(n_foodItems)
        ratings = model(user, movies).detach().numpy()
    movie_ids = ratings.argsort()[-num_recommendations:][::-1]
    recommended_food = [movies[i] for i in movie_ids]
    return recommended_food

In [43]:
# Get recommendations for a user with user_id 1
def getRecs(model, user_id, num_recs):
    recommended_food = recommend_food(model, user_id, num_recs)

    # Convert tensors to Int
    val = []
    for i in range(num_recs):
        val.append(int(recommended_food[i]))

    for id in val:
        row = diningHalls.loc[diningHalls['foodId'] == id]
        movie = row.values.tolist()
        if len(movie) == 0:
            continue
    return val

In [44]:
# recommendation for that particular user:

def getSimilarUserRecs(model, userID, num_recs = 5):
    recs = []
    recs += getRecs(model, userID, num_recs)
    # recommendation for similar users:
    for user in similar_users['UserID'].tolist():
        recs += getRecs(model, user, 1)
    recs = set(recs)
    food_recs = df[df['foodId'].isin(recs)]
        
    return food_recs

In [45]:
# Now do content filtering

def contentFiltering(df, dHall_pref, allergens, diet_restr, meal):     
        
    # filter by meal
    recs = df.loc[(df[meal] == 1)] # works
    recs = recs.drop_duplicates(subset="Food Item") # remove duplicates
        
    # filter by dining 
    if len(dHall_pref) == 0:
        pass
    else:
        for index, row in recs.iterrows():
            L = []
            actual_index = row["foodId"]
            for pref in dHall_pref:  

                if row[pref] == 0: L.append(0)
                else: L.append(1)

            all_zeros = all(val == 0 for val in L)
            if all_zeros == True: 
                recs = recs.drop(actual_index)
            
    # filter by allergens 
    if len(allergens) == 0:
        pass
    else:
        for index, row in recs.iterrows():
            L = []
            actual_index = row["foodId"]
            for allergen in allergens:  

                if row[allergen] == 1: L.append(1)
                else: L.append(0)

            all_zeros = all(val == 0 for val in L)
            # If there is an allergen present, a 1 in the list, drop it
            if all_zeros == False: 
                recs = recs.drop(actual_index)

    
    if len(diet_restr) == 0:
        pass
    else:
        for index, row in recs.iterrows():
            L = []
            actual_index = row["foodId"]
            for restr in diet_restr:  

                if row[restr] == 0: L.append(0)
                else: L.append(1)

            all_zeros = all(val == 0 for val in L)
            # If there is an allergen present, a 1 in the list, drop it
            if all_zeros == True: 
                recs = recs.drop(actual_index)
                     
    return recs

In [46]:
# meals = ['Breakfast', 'Lunch', 'Dinner']
# allergens = ['Fish', 'Soybeans', 'Wheat', 'Gluten', 'Milk', 'Tree Nuts', 'Eggs', 'Sesame', 'Crustacean Shellfish']
# dietary_restrictions = ['Gluten Friendly', 'Less Sodium', 'Smart Check', 'Vegan', 'Vegetarian', 'Contains Nuts']
# dHalls = ["gelfenbien", "kosher", "north", "northwest", "McMahon", "putnam", "south", "whitney"]

df = getSimilarUserRecs(model, 1, 10) # dataframe
dHallPref = []
allergens = []
dietary_restrictions = []
meal = "Lunch"

recs = contentFiltering(df, dHallPref, allergens, dietary_restrictions, meal)
display(recs)

Unnamed: 0,foodId,Food Item,Breakfast,Lunch,Dinner,Fish,Soybeans,Wheat,Gluten,Milk,...,Vegetarian,Contains Nuts,gelfenbien,kosher,north,northwest,McMahon,putnam,south,whitney
36,36,Special Pizza,0,1,0,0,0,1,0,1,...,0,0,1,0,0,0,0,0,0,0
370,370,Grilled Chicken Fajita Wrap,0,1,0,0,1,1,0,1,...,0,0,1,0,0,0,0,0,0,0
1511,1511,Flour Tortillas,0,1,0,0,1,1,0,0,...,1,0,0,1,0,0,0,0,0,0
2542,2542,French Toast,0,1,0,0,1,1,0,1,...,1,0,0,0,1,0,0,0,0,0
2870,2870,Potato Chips,0,1,0,0,0,0,0,0,...,1,0,0,0,1,0,0,0,0,0
5423,5423,Grilled Vegetables,0,1,0,0,0,0,0,0,...,1,0,0,0,0,0,1,0,0,0
7342,7342,Marinara Sauce,0,1,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,1,0
