## Preperation

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
import numpy as np

## Data Wragling

In [6]:
product = pd.read_csv("./dataset/final_perfume_data.csv", encoding="ISO-8859-1")
product.head()

Unnamed: 0,Name,Brand,Description,Notes,Image URL
0,Tihota Eau de Parfum,Indult,"Rapa Nui for sugar, Tihota is, quite simply, ...","Vanilla bean, musks",https://static.luckyscent.com/images/products/...
1,Sola Parfum,Di Ser,A tribute to the expanse of space extending f...,"Lavender, Yuzu, Lemongrass, Magnolia, Geraniu...",https://static.luckyscent.com/images/products/...
2,Kagiroi Parfum,Di Ser,An aromatic ode to the ancient beauty of Japa...,"Green yuzu, green shikuwasa, sansho seed, cor...",https://static.luckyscent.com/images/products/...
3,Velvet Fantasy Eau de Parfum,Montale,Velvet Fantasy is a solar fragrance where cit...,"tangerine, pink pepper, black coffee, leat...",https://static.luckyscent.com/images/products/...
4,A Blvd. Called Sunset Eau de Parfum,A Lab on Fire,There's no way A Lab On Fire could relocate t...,"Bergamot, almond, violet, jasmine, leather, s...",https://static.luckyscent.com/images/products/...


In [7]:
product.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2191 entries, 0 to 2190
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Name         2191 non-null   object
 1   Brand        2191 non-null   object
 2   Description  2191 non-null   object
 3   Notes        2111 non-null   object
 4   Image URL    2191 non-null   object
dtypes: object(5)
memory usage: 85.7+ KB


In [8]:
product.isnull().sum()

Name            0
Brand           0
Description     0
Notes          80
Image URL       0
dtype: int64

In [9]:
product[product["Notes"].isnull()]

Unnamed: 0,Name,Brand,Description,Notes,Image URL
641,Concrete Eau de Parfum,Comme des Garcons,The maestros of the unexpected have again tac...,,https://static.luckyscent.com/images/products/...
643,Escentric 01 Eau de Toilette,Escentric Molecules,"Second only to its sister scent Molecule 01, ...",,https://static.luckyscent.com/images/products/...
686,Molecule 02 Eau de Toilette,Escentric Molecules,Ambroxan is the ingredient around which Molec...,,https://static.luckyscent.com/images/products/...
687,Escentric 02 Eau de Toilette,Escentric Molecules,While its sister scent Molecule 02 is built a...,,https://static.luckyscent.com/images/products/...
720,Molecule 03 Eau de Toilette,Escentric Molecules,"As with the first two pairs, launched in 2006...",,https://static.luckyscent.com/images/products/...
...,...,...,...,...,...
2026,Absinth Parfum Extrait,Nasomatto,"""The fragrance aims to evoke degrees of hyste...",,https://static.luckyscent.com/images/products/...
2027,Narcotic V Parfum Extrait,Nasomatto,"""The fragrance is the result of a quest for t...",,https://static.luckyscent.com/images/products/...
2028,Silver Musk Parfum Extrait,Nasomatto,"""The fragrance aims to evoke superhero magnet...",,https://static.luckyscent.com/images/products/...
2029,Duro Parfum Extrait,Nasomatto,"""The fragrance aims to enhance all the manife...",,https://static.luckyscent.com/images/products/...


In [10]:
print("Count of duplicated: ", product.duplicated().sum())
product.describe()

Count of duplicated:  0


Unnamed: 0,Name,Brand,Description,Notes,Image URL
count,2191,2191,2191,2111,2191
unique,2184,249,2167,2053,2191
top,New York Intense Eau de Parfum,TOM FORD Private Blend,Dedicated to the cradle of the great civiliza...,"Bergamot, lemon, neroli, african marigold, bu...",https://static.luckyscent.com/images/products/...
freq,2,39,2,3,1


In [11]:
product = product.dropna()
product.isnull().sum()

Name           0
Brand          0
Description    0
Notes          0
Image URL      0
dtype: int64

## Exploratory Data Analaysis

In [12]:
product[product["Name"] == "New York Intense Eau de Parfum"]

Unnamed: 0,Name,Brand,Description,Notes,Image URL
409,New York Intense Eau de Parfum,Fragrance du Bois,It's not easy to capture one of the most dive...,"Cinnamon, Coriander, Orange, Blackberry, Bay ...",https://static.luckyscent.com/images/products/...
1573,New York Intense Eau de Parfum,PARFUMS DE NICOLAI,A classic from Nicolai now available in INTEN...,"Bergamot, Sicilian lemon, cloves, thyme, cinn...",https://static.luckyscent.com/images/products/...


In [13]:
product.groupby(by="Brand").Name.nunique().sort_values(ascending=False).head(10).reset_index()

Unnamed: 0,Brand,Name
0,TOM FORD Private Blend,36
1,Serge Lutens,36
2,L'Artisan Parfumeur,34
3,BYREDO,34
4,Montale,33
5,PARFUMS DE NICOLAI,33
6,Xerjoff,33
7,Profumum,32
8,Le Labo,32
9,Fragrance du Bois,31


In [14]:
note_list = product["Notes"].str.split(",")

unique_values = list(set().union(*note_list))

print("Count of notes: ", len(unique_values))
print(f"5 list of notes: {unique_values[:5]}")

Count of notes:  4003
5 list of notes: [' Black Ink Accord', '', ' Marine accords', ' iris cedre', ' Moroccan atlas cedar and cashmere wood']


In [15]:
note_list = [[note for note in sublist if note.strip() != ''] for sublist in note_list]

unique_values = list(set().union(*note_list))

unique_values[:5]

[' Black Ink Accord',
 ' Marine accords',
 ' iris cedre',
 ' Moroccan atlas cedar and cashmere wood',
 ' Bay Laurel']

In [16]:
product["Notes"] = note_list
product["Notes"] = product["Notes"].apply(lambda x: ", ".join(x))
product.head()

Unnamed: 0,Name,Brand,Description,Notes,Image URL
0,Tihota Eau de Parfum,Indult,"Rapa Nui for sugar, Tihota is, quite simply, ...","Vanilla bean, musks",https://static.luckyscent.com/images/products/...
1,Sola Parfum,Di Ser,A tribute to the expanse of space extending f...,"Lavender, Yuzu, Lemongrass, Magnolia, Ger...",https://static.luckyscent.com/images/products/...
2,Kagiroi Parfum,Di Ser,An aromatic ode to the ancient beauty of Japa...,"Green yuzu, green shikuwasa, sansho seed, ...",https://static.luckyscent.com/images/products/...
3,Velvet Fantasy Eau de Parfum,Montale,Velvet Fantasy is a solar fragrance where cit...,"tangerine, pink pepper, black coffee, l...",https://static.luckyscent.com/images/products/...
4,A Blvd. Called Sunset Eau de Parfum,A Lab on Fire,There's no way A Lab On Fire could relocate t...,"Bergamot, almond, violet, jasmine, leathe...",https://static.luckyscent.com/images/products/...


## Data Preperation

In [17]:
data = product[["Name", "Brand", "Notes"]].copy()
data.head()

Unnamed: 0,Name,Brand,Notes
0,Tihota Eau de Parfum,Indult,"Vanilla bean, musks"
1,Sola Parfum,Di Ser,"Lavender, Yuzu, Lemongrass, Magnolia, Ger..."
2,Kagiroi Parfum,Di Ser,"Green yuzu, green shikuwasa, sansho seed, ..."
3,Velvet Fantasy Eau de Parfum,Montale,"tangerine, pink pepper, black coffee, l..."
4,A Blvd. Called Sunset Eau de Parfum,A Lab on Fire,"Bergamot, almond, violet, jasmine, leathe..."


In [18]:
tf = TfidfVectorizer()

tf.fit(data["Notes"])

tf.get_feature_names_out()

array(['10', '100', '100k', ..., 'zanzibar', 'zdravetz', 'zest'],
      dtype=object)

In [19]:
tfidf_matrix = tf.fit_transform(data["Notes"])
tfidf_matrix.shape

(2111, 1855)

In [20]:
tfidf_matrix.todense()

matrix([[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]])

In [21]:
pd.DataFrame(
    tfidf_matrix.todense(),
    columns=tf.get_feature_names_out(),
    index=data.Notes
).sample(10, axis=1).sample(10, axis=0)

Unnamed: 0_level_0,matã,by,jam,diabolically,aqua,bush,good,33,five,balls
Notes,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
"almond, orange, rose, candyfloss, hazelnut, vanilla, amber, musk",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"Calabrese bergamot, Florida orange, Southern Italian neroli, Paraguayan petitgrain, rose wood, resins, oud wood, palisander, tonka, amber",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"Orange, lemon, bergamot, fruit notes, amber, white musk, vanilla",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"Bergamot, red currant, apple, pineapple, patchouli, birch, jasmine, thyme, incense, agarwood, musk, cedar",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"blackcurrant, tomato leaves, verbena, fig leaves, white musk",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"Bergamot, mandarin, cardamom, saffron, whisky, amberwood, jasmine, rose, tiramisu, patchouli leaves, cacao, coffee, vanilla, oakmoss, cedarwood, white musk",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"Blackberry, Ink, Rockrose, Opoponax, Iris, Davana, Myrrh, Cardamom, Ambergris, Storax, Sandalwood, Labdanum, Civet, Atlas Cedarwood, Birch, Musk",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"Cherry blossom nectar, rose liquor, ylang ylang, amaretto, vanilla, tonka bean, sandalwood, cashmeran",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"Lemon, muscat, mimosa, muguet, almond, sugar, honey, vanilla, iris, guaiacwood",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"Lemon, nutmeg, geranium, TaÃ¯f rose essential oil, rose Damas absolute, rose musk",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [22]:
cosine_sim = cosine_similarity(tfidf_matrix)
cosine_sim

array([[1.        , 0.        , 0.        , ..., 0.06772933, 0.02279778,
        0.03412006],
       [0.        , 1.        , 0.10308801, ..., 0.04072224, 0.        ,
        0.        ],
       [0.        , 0.10308801, 1.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.06772933, 0.04072224, 0.        , ..., 1.        , 0.01124594,
        0.01683112],
       [0.02279778, 0.        , 0.        , ..., 0.01124594, 1.        ,
        0.00566538],
       [0.03412006, 0.        , 0.        , ..., 0.01683112, 0.00566538,
        1.        ]])

In [23]:
cosine_sim_df = pd.DataFrame(cosine_sim, index=data["Name"], columns=data["Name"])
print("Shape: ", cosine_sim_df.shape)

cosine_sim_df.sample(5, axis=1).sample(5, axis=0)

Shape:  (2111, 2111)


Name,Not a Perfume Eau de Parfum,Deja le Printemps Eau de Parfum,Dot Eau de Parfum,Dolceacqua Eau de Parfum,Rose de Siwa Eau de Parfum
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Oud Silk Mood - Extrait de Parfum,0.0,0.0,0.0,0.00987,0.0
Framboise Noire Eau de Parfum,0.0,0.0,0.0,0.020402,0.0
Colonia No. 1 Eau de Cologne,0.0,0.063358,0.0,0.0,0.0
Guimauve de Noel Eau de Parfum,0.0,0.140562,0.080595,0.0,0.0
Tempo Eau de Parfum,0.0,0.0,0.050947,0.0,0.060769


## Model Development

In [84]:
def product_recommendation(product_name, similarity_data=cosine_sim_df, items=data, k=5):
    index = similarity_data.loc[:, product_name].to_numpy().argpartition(
        range(-1, -k, -1)
    )

    closest = similarity_data.columns[index[-1:-(k+2):-1]]

    closest = closest.drop(product_name, errors="ignore")

    return pd.DataFrame(closest).merge(items).head(k)

In [78]:
data[data.Name.eq("Sola Parfum")]

Unnamed: 0,Name,Brand,Notes
1,Sola Parfum,Di Ser,"Lavender, Yuzu, Lemongrass, Magnolia, Ger..."


In [85]:
product_recommendation("Sola Parfum")

Unnamed: 0,Name,Brand,Notes
0,Bayolea Eau de Toilette,Penhaligons,"Lemongrass, mandarin, tangerine, cardamom,..."
1,Ambre Mythique Eau de Parfum,Maitre Parfumeur et Gantier,"Bergamot, geranium, coriander, incense, m..."
2,Shiragoromo Parfum,Di Ser,"Agarwood, yuzu, jasmine sambac, rose"
3,Eau My Soul Eau de Parfum,4160 Tuesdays,"Bergamot, lavender, honey, frankincense, ..."
4,Ishtar Eau de Toilette,Rogue Perfumery,"Frankincense, benzoin, juniper, myrrh, li..."
