# Finding similair healthy products

Inspired by P. Sharma (2021)

#### Import the libraries

In [2]:
import numpy as np
import pandas as pd
import matplotlib as plt
from spacy.lang.nl.stop_words import STOP_WORDS as nl_stop
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
from sklearn.feature_extraction.text import CountVectorizer 
from sklearn.metrics.pairwise import cosine_similarity


#### Read the dataset

In [4]:
df1 = pd.read_excel(r"C:\Users\ninaj\Documents\Data-Driven Design Y2\Graduation Project\Week 7\RS\Webscraping\ah_products_extended_Large_V2.xlsx")
df1.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,Title,Nutri-Score,Description,Price
0,0,0,AH Verse tagliatelle all'uovo,Nutri-Score A,Verse eierpasta,2.09
1,1,1,AH Verspakket erwtensoep,Nutri-Score A,Kookpakket voor het maken van erwtensoepmet ee...,2.99
2,2,2,AH Italiaanse lasagne verspakket,Nutri-Score A,Kookpakket voor het maken van lasagne met 10% ...,4.99
3,3,3,AH Verse spaghetti all'uovo,Nutri-Score A,Verse eierpasta,2.09
4,4,4,AH Indiase curry madras verspakket,Nutri-Score A,Kookpakket voor curry met een pakje kokosmelk ...,4.99


#### Clean the dataset


In [5]:
# Remove duplicates
df1.sort_values("Title", inplace = True)
 
# dropping ALL duplicate values
df1.drop_duplicates(subset ="Title",
                     keep = False, inplace = True)

df1.head(10)

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,Title,Nutri-Score,Description,Price
1927,1927,218,AH 1 Minuut basmati rijst,Nutri-Score B,Gekookte basmati rijst,1.55
1948,1948,239,AH 1 Minuut witte rijst,Nutri-Score B,Gekookte witte rijst,1.25
3321,3321,128,AH 100% Haver volkoren,Nutri-Score A,Gepofte volkoren haver,1.59
3260,3260,67,AH 100% Pindakaas extra grof,Nutri-Score A,100% pindakaas,2.35
3311,3311,118,AH 100% Pindakaas stukjes,Nutri-Score A,Pindakaas met stukjes pinda,4.99
3252,3252,59,AH 100% pindakaas,Nutri-Score A,Pindakaas,4.99
3274,3274,81,AH 100% volkoren durumtarwe gepoft,Nutri-Score A,Gepofte volkoren durumtarwe,1.59
3253,3253,60,AH 100% volkoren spelt gepoft,Nutri-Score A,Gepofte volkoren spelt,1.59
2819,2819,1338,AH 3V5 Wasabi bollen,Nutri-Score D,"Pittige, gecoate pinda's met 0,1% wasabipoeder",2.15
3445,3445,280,AH 4 vruchten fruitspread minder suiker,Nutri-Score C,"FruitspreadBereid met 20 g aardbei, 17 g kers,...",1.99


#### Get an overview of the description column
We want to find a similiarty between the description in order to predict a similar product.

In [6]:
df1['Description'].head(10)

1927                               Gekookte basmati rijst
1948                                 Gekookte witte rijst
3321                               Gepofte volkoren haver
3260                                       100% pindakaas
3311                          Pindakaas met stukjes pinda
3252                                            Pindakaas
3274                          Gepofte volkoren durumtarwe
3253                               Gepofte volkoren spelt
2819       Pittige, gecoate pinda's met 0,1% wasabipoeder
3445    FruitspreadBereid met 20 g aardbei, 17 g kers,...
Name: Description, dtype: object

#### Converting words into vectors
We want to convert words into vector in order to get meaningul insights from the data. Based on the similarity, the words will be put together. Each word will be getting a magnitude and a direction in a 3D-space, which will determine how similar words are. 

The vectors will be computed by Term Frequency-Inverse Document Frequency. This is a statistical measure to evaluate how important a word is in the collection of all words. Term frequency can be defined as the number of times a word is counted in the collection of all words. Inverse Dcoument Frequency is a logarithm which measure how important a word is. 

In [7]:
# Remove all Dutch stop words
stopwords_list = list(nl_stop)

# Define the Term Frequency-Inverse Document Frequency object. 
tfidf = TfidfVectorizer(stop_words=stopwords_list)

# Construct the required TF-TDF matrix by fitting and transforming the data
tfidf_matrx = tfidf.fit_transform(df1['Description'].astype(str))

# Output is the shape of tfidf_matrix
tfidf_matrx.shape

(2287, 2843)

#### Cosine similarity 
The tfidf_matrix will be put into a cosine similarity made with the linear_kernel function. 

In [8]:
# Compute the cosine similarity matrix
cosine_sim = linear_kernel(tfidf_matrx, tfidf_matrx)

#### Reverse mapping
A top 10 of most similar products will be made. Therefore, we need to make a function for the reverse mapping of the product titles and indices. This is needed to identify the index of a product. 

In [9]:
# Construct a reverse map of indices and products
indices = pd.Series(df1.index, index=df1['Title']).drop_duplicates()

In [11]:
# A function for creating a top 10 of similar products
def get_recommendations(title, cosine=cosine_sim):
    # Get the index of the product which matches the title
    idx = indices[title]
    
    # Get the pairwise similarity scores of all products with that product
    sim_scores = list(enumerate(cosine_sim[idx]))
    
    # Sort the products on the similarity scores
    sim_scores= sorted(sim_scores, key=lambda x: x[1], reverse=True)
    
    # Get the scores of the 10 most similar products
    sim_scors = sim_scores[1:11]
    
    # Get the product indices
    product_indices = [i[0] for i in sim_scores]
    
    # Return the top 10 most similar products
    return df1[['Title', 'Nutri-Score', 'Price']].iloc[product_indices]

#### CountVectorizer
We use the CountVectorizer to count the number of times a certain Nutri-Score appears. 

In [13]:
# Import CountVectorizer and create the count matrix
count = CountVectorizer()
count_matrix = count.fit_transform(df1['Nutri-Score'])

In [14]:
# Compute the cosine similarity matrix based on the count_matrix
cosine_sim2 = cosine_similarity(count_matrix, count_matrix)

In [15]:
# Reset the indexof our main DataFrame and construct reverse mapping
df1 = df1.reset_index()
indices = pd.Series(df1.index, index=df1['Title'])

In [16]:
# Make price readable
df1['Price'].head(10)

0    1.55
1    1.25
2    1.59
3    2.35
4    4.99
5    4.99
6    1.59
7    1.59
8    2.15
9    1.99
Name: Price, dtype: float64

In [17]:
df1['Price'] = np.float32(df1['Price'])

In [18]:
get_recommendations('AH Roomboter ongezouten')[1:11].sort_values(['Nutri-Score'], ascending=[True]).head(10)

Unnamed: 0,Title,Nutri-Score,Price
1099,AH Liefde & Passie Beurre (ongesneden),Nutri-Score A,1.39
60,AH Appeltaartpunten,Nutri-Score D,2.5
1033,AH Kokoskransen in toefzak roomboter,Nutri-Score D,1.39
2144,De Zaanse Hoeve Roomboter ongezouten,Nutri-Score E,2.09
2143,De Zaanse Hoeve Roomboter gezouten,Nutri-Score E,2.09
1188,AH Marmercake,Nutri-Score E,3.19
1559,AH Roomboter marmercake,Nutri-Score E,1.99
243,AH Biologisch Roomboter ongezouten,Nutri-Score E,2.75
159,AH Biologisch Gezouten roomboter,Nutri-Score E,2.75
1974,AH Zaanlander Kaasvlinder,Nutri-Score E,2.69


## Adding interactions

Now we can make a function for interaction between the potential user and this functionallity. Therefore, we need to:

- Create a function for the user input; 
- Find products from the dataset similar to the users' input
- Suggest healthy products, including the Nutri-Score label and price.  

#### User input

The user input, in this case, can be seen as a grcoery list where users need to add products to a grocery list. 

In [19]:
grocery_list = []

In [None]:
user_input = input("Voeg (eetbare) producten toe aan je boodschappenlijstje.")

In [None]:
grocery_list.append(user_input)

In [None]:
print(grocery_list)

#### Find similar products

In [22]:
# With the library difflib.get_close_matches, user input will be matched with existing products from the dataset
import difflib

In [23]:
words = df1['Title']

match_list = difflib.get_close_matches(user_input, words)

In [24]:
# The user has to choose from one of the suggestions above in real life 
chosen_item = match_list
print(chosen_item)

[]


In [25]:
get_recommendations(chosen_item[1])[1:11].sort_values(['Nutri-Score'], ascending=[True]).head(10)



IndexError: list index out of range

#### Suggesting healthy products

In [41]:
# The user has to choose from one of the recommendations above
# print(df1.loc[2224])
healthy_products = [df1.loc[2224], df1.loc[19]]

print(healthy_products)

[index                                                        1088
Unnamed: 0                                                   1088
Unnamed: 0.1                                                 1268
Title                                       Isey Gezouten karamel
Nutri-Score                                         Nutri-Score A
Description     Skyr licht gezouten caramel. Bevat suiker en z...
Price                                                        2.29
soup                                    N u t r i - S c o r e   A
Name: 2224, dtype: object, index                                            3391
Unnamed: 0                                       3391
Unnamed: 0.1                                      214
Title            AH Aardbei fruitspread minder suiker
Nutri-Score                             Nutri-Score C
Description     Aardbei fruitspread met minder suiker
Price                                            1.99
soup                        N u t r i - S c o r e   C
Name: 19, dt

# Literature
Sharma, P. (2021, November 17). Content Based Recommender System: Part 2. Machine Mantra. Retrieved 4 January 2022, from https://machinemantra.in/content-based-recommender-system/