## Preperation

In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
import numpy as np

## Data Wragling

In [2]:
product = pd.read_csv("./dataset/final_perfume_data.csv", encoding="ISO-8859-1")
product.head()

Unnamed: 0,Name,Brand,Description,Notes,Image URL
0,Tihota Eau de Parfum,Indult,"Rapa Nui for sugar, Tihota is, quite simply, ...","Vanilla bean, musks",https://static.luckyscent.com/images/products/...
1,Sola Parfum,Di Ser,A tribute to the expanse of space extending f...,"Lavender, Yuzu, Lemongrass, Magnolia, Geraniu...",https://static.luckyscent.com/images/products/...
2,Kagiroi Parfum,Di Ser,An aromatic ode to the ancient beauty of Japa...,"Green yuzu, green shikuwasa, sansho seed, cor...",https://static.luckyscent.com/images/products/...
3,Velvet Fantasy Eau de Parfum,Montale,Velvet Fantasy is a solar fragrance where cit...,"tangerine, pink pepper, black coffee, leat...",https://static.luckyscent.com/images/products/...
4,A Blvd. Called Sunset Eau de Parfum,A Lab on Fire,There's no way A Lab On Fire could relocate t...,"Bergamot, almond, violet, jasmine, leather, s...",https://static.luckyscent.com/images/products/...


In [3]:
product.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2191 entries, 0 to 2190
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Name         2191 non-null   object
 1   Brand        2191 non-null   object
 2   Description  2191 non-null   object
 3   Notes        2111 non-null   object
 4   Image URL    2191 non-null   object
dtypes: object(5)
memory usage: 85.7+ KB


In [4]:
print("Count of duplicated: ", product.duplicated().sum())
product.describe()

Count of duplicated:  0


Unnamed: 0,Name,Brand,Description,Notes,Image URL
count,2191,2191,2191,2111,2191
unique,2184,249,2167,2053,2191
top,New York Intense Eau de Parfum,TOM FORD Private Blend,Dedicated to the cradle of the great civiliza...,"Bergamot, lemon, neroli, african marigold, bu...",https://static.luckyscent.com/images/products/...
freq,2,39,2,3,1


## Exploratory Data Analaysis

In [5]:
product[product["Name"] == "New York Intense Eau de Parfum"]

Unnamed: 0,Name,Brand,Description,Notes,Image URL
409,New York Intense Eau de Parfum,Fragrance du Bois,It's not easy to capture one of the most dive...,"Cinnamon, Coriander, Orange, Blackberry, Bay ...",https://static.luckyscent.com/images/products/...
1573,New York Intense Eau de Parfum,PARFUMS DE NICOLAI,A classic from Nicolai now available in INTEN...,"Bergamot, Sicilian lemon, cloves, thyme, cinn...",https://static.luckyscent.com/images/products/...


In [6]:
product.groupby(by="Brand").Name.nunique().sort_values(ascending=False).head(10).reset_index()

Unnamed: 0,Brand,Name
0,TOM FORD Private Blend,39
1,Profumum,38
2,Serge Lutens,36
3,BYREDO,35
4,L'Artisan Parfumeur,34
5,Xerjoff,34
6,Montale,33
7,PARFUMS DE NICOLAI,33
8,Le Labo,32
9,Fragrance du Bois,31


In [7]:
note_list = product["Notes"].astype(str).str.split(",")

unique_values = list(set().union(*note_list))

print("Count of notes: ", len(unique_values))
print(f"5 list of notes: {unique_values[:5]}")

Count of notes:  4004
5 list of notes: [' red thyme', '', '  oakmoss', ' Blond Tobacco', ' black African olibanum']


## Data Preperation

In [8]:
notes = product["Notes"].astype(str).str.split(",")
notes = notes.apply(lambda x: [item for item in x if item != ""])
notes

0                                 [ Vanilla bean,  musks]
1       [ Lavender,  Yuzu,  Lemongrass,  Magnolia,  Ge...
2       [ Green yuzu,  green shikuwasa,  sansho seed, ...
3       [ tangerine,   pink pepper,   black coffee,   ...
4       [ Bergamot,  almond,  violet,  jasmine,  leath...
                              ...                        
2186    [ top: lemon,  bergamot base: musk,  vanilla, ...
2187    [ amber,  jasmine tea,  mother of pearl hibisc...
2188    [ bergamot,  mandarine,  cinnamon,  jasmine,  ...
2189    [ Tahitian gardenia,  French muguet du bois,  ...
2190    [ exotic woods,  spice,  raisin,  vine flowers...
Name: Notes, Length: 2191, dtype: object

In [9]:
unique_values = []

for sublist in notes:
    for note in sublist:
        if note not in unique_values:
            unique_values.append(note)

len(unique_values)

4003

In [10]:
data = product[["Name", "Brand"]].copy()
data["Notes"] = notes.apply(lambda x: ", ".join(x))
data.head()

Unnamed: 0,Name,Brand,Notes
0,Tihota Eau de Parfum,Indult,"Vanilla bean, musks"
1,Sola Parfum,Di Ser,"Lavender, Yuzu, Lemongrass, Magnolia, Ger..."
2,Kagiroi Parfum,Di Ser,"Green yuzu, green shikuwasa, sansho seed, ..."
3,Velvet Fantasy Eau de Parfum,Montale,"tangerine, pink pepper, black coffee, l..."
4,A Blvd. Called Sunset Eau de Parfum,A Lab on Fire,"Bergamot, almond, violet, jasmine, leathe..."


In [11]:
tf = TfidfVectorizer()

tf.fit(data["Notes"])

tf.get_feature_names_out()

array(['10', '100', '100k', ..., 'zanzibar', 'zdravetz', 'zest'],
      dtype=object)

In [12]:
tfidf_matrix = tf.fit_transform(data["Notes"])
tfidf_matrix.shape

(2191, 1855)

In [13]:
tfidf_matrix.todense()

matrix([[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]])

In [14]:
pd.DataFrame(
    tfidf_matrix.todense(),
    columns=tf.get_feature_names_out(),
    index=data.Notes
).sample(10, axis=1).sample(10, axis=0)

Unnamed: 0_level_0,crushed,irian,timbersilk,adrenaline,sumi,east,been,grasse,bulgarian,kola
Notes,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
"Grapefruit, bergamot, lemon, mate, gurjum balsam, amber wood, tolu, benzoin, labdanum, red tea, tonka, styrax, vanilla, musk",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"fig, fig leaf, milk of almond, sandalwood, coconut",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"Lime, Coriander, ginger, sandalwood, cedarwood, musc, vanilla, amber wood",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"White oud, cistus labdanum, patchouli, olibanum, pink pepper, gurjun balsam, amber, moss, vetiver",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"Ambergris, frankincense, myrrh, labdanum, castoreum, civet, leather, oud, musk",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"orange, pepper, coriander, cardamom, cumin, olibanum, benzoin, myrrh, cistus, sandalwood, cedar, vetiver, everlasting flower, oakmoss, musk",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"Jasmine, rose essence, ylang ylang, orris, ivy greens, galbanumb, vanilla, vetiver.",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"raisin, coffee, copaiba, cinnamon, dried fruits (prune and cherry), rum absolute, mandarin, amber, tobacco, oakwood, vetiver, resins, labdanum absolute, tea rose, leather, seaweed absolute",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"Papaya, ylang ylang, tangerine, green teas, white flowers",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"Bergamot, lemon, saffron, Japanese plum blossom, rose, violet, oud, patchouli, sandalwood, moss, frankincense, white musk, labdanum, ambergris",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [15]:
cosine_sim = cosine_similarity(tfidf_matrix)
cosine_sim

array([[1.        , 0.        , 0.        , ..., 0.06871499, 0.02321562,
        0.03469295],
       [0.        , 1.        , 0.10294877, ..., 0.0412545 , 0.        ,
        0.        ],
       [0.        , 0.10294877, 1.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.06871499, 0.0412545 , 0.        , ..., 1.        , 0.0114697 ,
        0.01714009],
       [0.02321562, 0.        , 0.        , ..., 0.0114697 , 1.        ,
        0.00579084],
       [0.03469295, 0.        , 0.        , ..., 0.01714009, 0.00579084,
        1.        ]])

In [16]:
cosine_sim_df = pd.DataFrame(cosine_sim, index=data["Name"], columns=data["Name"])
print("Shape: ", cosine_sim_df.shape)

cosine_sim_df.sample(5, axis=1).sample(5, axis=0)

Shape:  (2191, 2191)


Name,Douce Amere Eau de Parfum,Craft Eau De Parfum,Tenebrae Eau de Parfum,L'Air Des Alpes Suisses Eau de Parfum,Mile High Eau de Parfum
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Noir de Noir Travel Atomizer,0.0,0.007698,0.0,0.0,0.013834
Homme Eau de Parfum,0.063654,0.032843,0.0,0.101594,0.059024
Humus Eau de Parfum,0.0,0.0,0.0,0.0,0.0
Bertrand Duchaufour Eau de Toilette,0.0,0.0,0.0,0.078834,0.0
Akanesasu Eau de Parfum,0.047824,0.0,0.0,0.0,0.0


## Model Development

In [36]:
def product_recommendation(product_name, similarity_data=cosine_sim_df, items=data, k=5):
    index = similarity_data.loc[:, product_name].to_numpy().argpartition(
        range(-1, -k, -1)
    )

    closest = similarity_data.columns[index[-1:-(k+2):-1]]

    closest = closest.drop(product_name, errors="ignore")

    return pd.DataFrame(closest).merge(items).head(k)

In [37]:
data[data.Name.eq("Sola Parfum")]

Unnamed: 0,Name,Brand,Notes
1,Sola Parfum,Di Ser,"Lavender, Yuzu, Lemongrass, Magnolia, Ger..."


In [38]:
product_recommendation("Sola Parfum")

Unnamed: 0,Name,Brand,Notes
0,Bayolea Eau de Toilette,Penhaligons,"Lemongrass, mandarin, tangerine, cardamom,..."
1,Ambre Mythique Eau de Parfum,Maitre Parfumeur et Gantier,"Bergamot, geranium, coriander, incense, m..."
2,Shiragoromo Parfum,Di Ser,"Agarwood, yuzu, jasmine sambac, rose"
3,Eau My Soul Eau de Parfum,4160 Tuesdays,"Bergamot, lavender, honey, frankincense, ..."
4,Frankincense - Myrrh - Rose Maroc perfume oil,Regina Harris,"frankincense, myrrh, rose maroc"
