# Part I: Find healthy alternatives
The first part of the recommendation system has the aim to find healthy products (Nutri-Score A or B) which are similar to the unhealthier (Nutri-Score C or lower) original product. 

### Libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib as plt
import difflib
from spacy.lang.nl.stop_words import STOP_WORDS as nl_stop
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
from sklearn.feature_extraction.text import CountVectorizer 
from sklearn.metrics.pairwise import cosine_similarity
from ast import literal_eval 

### Inspect data

In [None]:
df1 = pd.read_excel(r"C:\Users\ninaj\Documents\Data-Driven Design Y2\Graduation Project\Week 9\data\labelled_products.xlsx")
df1.head()

In [None]:
df1.info()

In [None]:
df1['Description'].head

### Convert words from the product description into vectors
The vectors will be computed by Term Frequency-Inverse Document Frequency. This is a statistical measure to evaluate how important a word is in the collection of all words. Term frequency can be defined as the number of times a word is counted in the collection of all words. Inverse Dcoument Frequency is a logarithm which measure how important a word is. 

In [64]:
# Remove all Dutch stop words
stopwords_list = list(nl_stop)

# Define the Term Frequency-Inverse Document Frequency object. 
tfidf = TfidfVectorizer(stop_words=stopwords_list)

# Construct the required TF-TDF matrix by fitting and transforming the data
tfidf_matrx = tfidf.fit_transform(df1['Description'].astype(str))

# Output is the shape of tfidf_matrix
tfidf_matrx.shape

(3400, 3470)

### Cosine similarity 
The tfidf_matrix will be put into a cosine similarity made with the linear_kernel function.


In [65]:
# Calculate the cosine similarity matrix
cosine_sim = linear_kernel(tfidf_matrx, tfidf_matrx)

### Reverse mapping
A top 10 of most similar products will be made. Therefore, we need to make a function for the reverse mapping of the product titles and indices. This is needed to identify the index of a product.

In [66]:
# Construct a reverse map of indices and products
indices = pd.Series(df1.index, index=df1['Title'])

In [67]:
# A function for creating a top 10 of similar products
def get_recommendations(title, cosine=cosine_sim):
    # Get the index of the product which matches the title
    idx = indices[title]
    
    # Get the pairwise similarity scores of all products with that product
    sim_scores = list(enumerate(cosine_sim[idx]))
    
    # Sort the products on the similarity scores
    sim_scores= sorted(sim_scores, key=lambda x: x[1], reverse=True)
    
    # Get the scores of the 10 most similar products
    sim_scors = sim_scores[1:11]
    
    # Get the product indices
    product_indices = [i[0] for i in sim_scores]
    
    # Return the top 10 most similar products
    return df1.iloc[product_indices]

### CountVectorizer
The CountVectorizer can be used for couting the number of times a certain Nutri-Score appears

In [76]:
# Import CountVectorizer and create the count matrix
count = CountVectorizer()
count_matrix = count.fit_transform(df1['Nutri-Score'].values.astype('U'))

In [77]:
# Compute the cosine similarity matrix based on the count_matrix
cosine_sim2 = cosine_similarity(count_matrix, count_matrix)

In [75]:
# Reset the index of our main DataFrame and construct reverse mapping
df1 = df1.reset_index()
indices = pd.Series(df1.index, index=df1['Title'])

ValueError: cannot insert level_0, already exists

In [74]:
# Test the recommendations
get_recommendations('AH Griekse stijl portie')[0:11].sort_values(['Nutri-Score'], ascending=[True]).head(10)

Unnamed: 0.1,level_0,index,Unnamed: 0,Title,Nutri-Score,Description,Price,Price_cat,BIO,Low_sugar,Low_fat,Plant_based,Glutenfree,Fairtrade,UTZ,Vegan,Vegetarian,Nutrients
1435,1435,1435,1435,AH Yoghurt Griekse stijl 0% vet,Nutri-Score A,"Yoghurt*0,1% vet*Van weidemelk.Deze Albert Hei...",1.19,Low budget,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,SoortPer 100 GramEnergie389 kJ (93 kcal)Vet5 g...
1405,1405,1405,1405,De Zaanse Hoeve Magere yoghurt,Nutri-Score A,magere yoghurt**Van weidemelk.Deze Albert Heij...,0.69,Low budget,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,SoortPer 100 GramEnergie544 kJ (131 kcal)Vet9....
1456,1456,1456,1456,AH Smikkeltoetje,Nutri-Score A,Magere yoghurt**Van weidemelk.Deze Albert Heij...,1.79,Medium budget,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,SoortPer 100 MilliliterEnergie143 kJ (34 kcal)...
1451,1451,1451,1451,De Zaanse Hoeve Halfvolle yoghurt naturel,Nutri-Score A,Halfvolle yoghurt**Van weidemelk.Deze Albert H...,0.89,Low budget,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,SoortPer 100 GramRI*Energie215 kJ (52 kcal)Vet...
1436,1436,1436,1436,AH Witte vrije uitloop eieren,Nutri-Score A,Volle yoghurt**Van weidemelk.Deze Albert Heijn...,1.19,Low budget,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,SoortPer 100 GramRI*Energie286 kJ (68 kcal)Vet...
1514,1514,1514,1514,De Zaanse Hoeve Volle yoghurt,Nutri-Score B,volle yoghurt**Van weidemelk.Deze Albert Heijn...,0.95,Low budget,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,SoortPer 100 MilliliterRI*Energie295 kJ (70 kc...
1586,1586,1586,1586,AH Milde Hollandse hangop,Nutri-Score B,Ingedikte yoghurt*6% vet*Van weidemelk.Deze Al...,1.49,Low budget,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,SoortPer 100 MilliliterRI*Energie2590 kJ (630 ...
1611,1611,1611,1611,AH Griekse stijl portie,Nutri-Score C,Yoghurt* 10% vet *Van weidemelk.Deze Albert He...,0.59,Low budget,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,SoortPer 100 MilliliterEnergie405 kJ (96 kcal)...
1634,1634,1634,1634,AH Yoghurt Griekse stijl,Nutri-Score C,Yoghurt* 10% vet*Van weidemelk.Deze Albert Hei...,1.79,Medium budget,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,SoortPer 100 GramEnergie2794 kJ (679 kcal)Vet7...
1626,1626,1626,1626,AH Yoghurt Griekse stijl 10% vet,Nutri-Score C,Yoghurt10% vet*Van weidemelk.Deze Albert Heijn...,1.35,Low budget,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,SoortPer 100 MilliliterEnergie509 kJ (124 kcal...


### User input

The user input, in this case, can be seen as a grcoery list where users need to add products to a grocery list. 

In [133]:
grocery_list = []

In [None]:
user_input = input("Voeg (eetbare) producten toe aan je boodschappenlijstje.")

In [None]:
grocery_list.append(user_input)

In [None]:
print(grocery_list)

### Find similar products with DiffLib.get_close_matches

In [None]:
# With the library difflib.get_close_matches, user input will be matched with existing products from the dataset
import difflib

In [None]:
words = df1['Title']

match_list = difflib.get_close_matches(user_input, words)

In [None]:
# The user has to choose from one of the suggestions above in real life 
chosen_item = match_list
print(chosen_item)

In [None]:
get_recommendations(chosen_item[1])[1:11].sort_values(['Nutri-Score'], ascending=[True]).head(10)


In [None]:
# The user has to choose from one of the recommendations above
healthy_products = [df1.loc[2224], df1.loc[19]]

print(healthy_products)

# Part II: Recognize patterns and improve recommendations
In order to recommend personalized products, a model has to be build to recognize patterns. Figure 1 illustrates the workflow.  

![alt text](personalized_system_flowchart.png "Personalized system")

In [None]:
# User adds a product 

In [None]:
# Check if product contains Nutri-Score label C or lower

In [None]:
# Recommend healthy alternative based on Nutri-Score (A or B)

# Condition  Nutri-Score = A or B

# AND previously bought product 
product_history = [] # Somee list from part 1
# AND labels 
labels = df1[['Price_cat', 'BIO', 'Low_sugar', 'Low_fat', 'Plant_based', 'Glutenfree', 'Fairtrade', 'UTZ', 'Vegan', 'Vegetarian'].head()

In [None]:
# Find similar products (Cosine Similarity?)