In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import re
import pickle

In [11]:
# Load the raw perfume catalogue and preview its first rows.
# NOTE(review): 'unicode_escape' also interprets backslash sequences in the text;
# confirm it is the intended codec rather than e.g. latin-1 / cp1252.
df = pd.read_csv(
    "perfume_data.csv",
    encoding='unicode_escape',
)
df.head(5)

Unnamed: 0,Name,Brand,Description,Notes,Image URL
0,Tihota Eau de Parfum,Indult,"Rapa Nui for sugar, Tihota is, quite simply, ...","Vanilla bean, musks",https://static.luckyscent.com/images/products/...
1,Sola Parfum,Di Ser,A tribute to the expanse of space extending f...,"Lavender, Yuzu, Lemongrass, Magnolia, Geraniu...",https://static.luckyscent.com/images/products/...
2,Kagiroi Parfum,Di Ser,An aromatic ode to the ancient beauty of Japa...,"Green yuzu, green shikuwasa, sansho seed, cor...",https://static.luckyscent.com/images/products/...
3,Velvet Fantasy Eau de Parfum,Montale,Velvet Fantasy is a solar fragrance where cit...,"tangerine, pink pepper, black coffee, leat...",https://static.luckyscent.com/images/products/...
4,A Blvd. Called Sunset Eau de Parfum,A Lab on Fire,There's no way A Lab On Fire could relocate t...,"Bergamot, almond, violet, jasmine, leather, s...",https://static.luckyscent.com/images/products/...


In [3]:
# Column dtypes and non-null counts, used to spot columns with missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2191 entries, 0 to 2190
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Name         2191 non-null   object
 1   Brand        2191 non-null   object
 2   Description  2191 non-null   object
 3   Notes        2111 non-null   object
 4   Image URL    2191 non-null   object
dtypes: object(5)
memory usage: 85.7+ KB


In [4]:
# Per-column count / unique / top / freq; a `unique` below `count` means repeated values.
df.describe()

Unnamed: 0,Name,Brand,Description,Notes,Image URL
count,2191,2191,2191,2111,2191
unique,2184,249,2167,2053,2191
top,New York Intense Eau de Parfum,TOM FORD Private Blend,Dedicated to the cradle of the great civiliza...,"Bergamot, lemon, neroli, african marigold, bu...",https://static.luckyscent.com/images/products/...
freq,2,39,2,3,1


In [5]:
# Keep only the first row for every perfume name (keep='first' is the pandas default).
df = df.drop_duplicates(subset='Name')

In [6]:
# Number of missing values in each column.
df.isna().sum()

Name            0
Brand           0
Description     0
Notes          80
Image URL       0
dtype: int64

In [7]:
# Discard rows with any missing value, then renumber the rows 0..n-1.
df = df.dropna().reset_index(drop=True)

In [8]:
# Lower-case the free-text columns so later matching is case-insensitive.
df = df.assign(
    Description=df['Description'].str.lower(),
    Notes=df['Notes'].str.lower(),
    Name=df['Name'].str.lower(),
)

In [9]:
# Product-type phrases that are stripped from perfume names to form the lookup key.
perfume_types = ['eau de parfum', 'extrait de parfum', 'parfum', 'extrait', 'perfume oil', 'fragrance oil', 
                'eau de toilette', 'parfum intense', 'parfum extrait', 'eau de cologne', 'roll-on perfume oil', 
                'pure oud oil', 'eau fraiche', 'cologne absolute']

# Regex alternation takes the first alternative that matches, so a longer phrase
# must be listed before any phrase that is a prefix of it. In list order, 'parfum'
# shadowed 'parfum intense' and "x parfum intense" was cleaned to "x  intense".
# Sorting longest-first lets the most specific phrase win.
pattern = '|'.join(sorted(perfume_types, key=len, reverse=True))
# Also remove the punctuation characters . / ' -
pattern += '|[./\'-]'
df['Name'] = df['Name'].str.replace(pattern, '', regex=True).str.strip()

# Names were de-duplicated before cleaning, so different editions of one perfume
# may now collapse to the same cleaned name. Names are later used as row/column
# labels of the similarity matrix, where repeated labels make label-based lookups
# ambiguous, so keep the first occurrence and renumber the rows.
df = df.drop_duplicates(subset=['Name'], keep='first').reset_index(drop=True)

In [10]:
# Persist the cleaned catalogue; the row index is written as the first (unnamed) column.
df.to_csv('modified_perfume_data.csv', index=True)

In [188]:
# One text document per perfume: its description followed by its notes.
subset = df['Description'].str.cat(df['Notes'], sep=" ")

In [189]:
# Fit TF-IDF on the combined text; the documents are passed as a NumPy unicode array.
vectorizer = TfidfVectorizer()
subset_vectors = vectorizer.fit_transform(subset.to_numpy().astype(str))

In [190]:
# Pairwise cosine similarity between all perfumes (Y defaults to X when omitted).
similarity_matrix = cosine_similarity(subset_vectors)

In [191]:
# Label both axes with the cleaned perfume names so scores can be looked up by name.
similarity_df = pd.DataFrame(
    data=similarity_matrix,
    index=df['Name'],
    columns=df['Name'],
)

In [192]:
# Preview the first rows of the name-by-name similarity table.
similarity_df.head()

Name,tihota,sola,kagiroi,velvet fantasy,a blvd called sunset,freckled and beautiful,exit the king,eshu,saringkarn,arsalan,...,dzing!,perfect nectar,coquette tropique,kai,to twirl all girly,perfect veil,scent,bronze,monyette paris,1270
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
tihota,1.0,0.101516,0.130306,0.163248,0.153183,0.136134,0.111867,0.067208,0.095389,0.077135,...,0.110072,0.035319,0.083446,0.100173,0.121669,0.085657,0.07361,0.127799,0.134325,0.121618
sola,0.101516,1.0,0.238143,0.103038,0.094591,0.087879,0.052017,0.058251,0.084172,0.069721,...,0.099547,0.054322,0.070937,0.037472,0.066311,0.055667,0.065064,0.087397,0.023428,0.073861
kagiroi,0.130306,0.238143,1.0,0.162245,0.107602,0.110604,0.068694,0.078763,0.103995,0.078716,...,0.106781,0.110484,0.078007,0.050514,0.098511,0.076095,0.085585,0.105515,0.044598,0.101578
velvet fantasy,0.163248,0.103038,0.162245,1.0,0.145281,0.134627,0.10435,0.092629,0.143212,0.106164,...,0.106703,0.131469,0.091258,0.059338,0.089989,0.064916,0.081136,0.130597,0.090888,0.130989
a blvd called sunset,0.153183,0.094591,0.107602,0.145281,1.0,0.171945,0.097853,0.057861,0.116642,0.076095,...,0.086061,0.042821,0.054239,0.100544,0.097178,0.066219,0.076696,0.099107,0.052165,0.084327


In [223]:
def clean_perfume_name(name, pattern):
    """Remove every case-insensitive match of ``pattern`` from ``name``.

    Args:
        name: Perfume name to clean.
        pattern: Regular expression describing the text to remove.

    Returns:
        ``name`` with all matches deleted and surrounding whitespace trimmed.
    """
    matcher = re.compile(pattern, re.IGNORECASE)
    return matcher.sub('', name).strip()

In [224]:
def recommend_perfumes(liked_perfumes, similarity_df, num=5):
    """Recommend the perfumes most similar to the ones a user likes.

    Each liked name is lower-cased and cleaned with ``clean_perfume_name`` using
    the module-level ``pattern``, so both names must be defined before this
    function is called.

    Args:
        liked_perfumes: Iterable of perfume names, or a single name as a string.
        similarity_df: Square DataFrame of similarity scores whose index and
            columns are cleaned perfume names.
        num: Maximum number of recommendations to return.

    Returns:
        A list of perfume names ordered from highest to lowest summed
        similarity, excluding the liked perfumes. Empty when nothing is liked.

    Raises:
        ValueError: If any cleaned liked name is not in ``similarity_df.index``.
    """
    # A bare string would otherwise be iterated character by character.
    if isinstance(liked_perfumes, str):
        liked_perfumes = [liked_perfumes]

    cleaned = [clean_perfume_name(perfume.lower(), pattern) for perfume in liked_perfumes]
    # Different spellings can clean to the same name; count each perfume once.
    # dict.fromkeys de-duplicates while keeping first-seen order.
    liked_perfumes = list(dict.fromkeys(cleaned))

    # With nothing liked every score is zero and the ranking would be arbitrary.
    if not liked_perfumes:
        return []

    missing = [perfume for perfume in liked_perfumes if perfume not in similarity_df.index]
    if missing:
        raise ValueError(
            f"one or more liked perfumes are not in the similarity matrix: {missing}"
        )

    # Sum the similarity rows of all liked perfumes into one score per candidate.
    agg_scores = similarity_df.loc[liked_perfumes].sum(axis=0)
    # A liked perfume must never be recommended back to the user.
    agg_scores = agg_scores.drop(labels=liked_perfumes)
    recomms = agg_scores.sort_values(ascending=False).head(num)

    return recomms.index.tolist()

In [225]:
# Example: the five perfumes closest to a single liked perfume.
recommend_perfumes(
    liked_perfumes=["A Blvd. Called Sunset"],
    similarity_df=similarity_df,
    num=5,
)

['los angeles',
 'city of jasmine',
 'musc moschus',
 'new york',
 'almond harmony']

In [218]:
# Serialise the similarity table to disk with pickle.
with open('similarity_df.pkl', mode='wb') as pkl_file:
    pickle.dump(similarity_df, pkl_file)