# I. Import Library

In [80]:
import pandas as pd
import numpy as np
import string
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity, linear_kernel
from sklearn.compose import make_column_transformer
import pickle

# II. Data Loading and Preprocessing

In [2]:
# Load the raw product catalogue; the path is relative to the notebook's working directory.
df = pd.read_csv('sephora_website_dataset.csv')

In [3]:
# Preview the first two rows of the raw frame.
df.head(2)

Unnamed: 0,id,brand,category,name,size,rating,number_of_reviews,love,price,value_price,...,MarketingFlags,MarketingFlags_content,options,details,how_to_use,ingredients,online_only,exclusive,limited_edition,limited_time_offer
0,2218774,Acqua Di Parma,Fragrance,Blu Mediterraneo MINIATURE Set,5 x 0.16oz/5mL,4.0,4,3002,66.0,75.0,...,True,online only,no options,This enchanting set comes in a specially handc...,Suggested Usage:-Fragrance is intensified by t...,Arancia di Capri Eau de Toilette: Alcohol Dena...,1,0,0,0
1,2044816,Acqua Di Parma,Cologne,Colonia,0.7 oz/ 20 mL,4.5,76,2700,66.0,66.0,...,True,online only,- 0.7 oz/ 20 mL Spray - 1.7 oz/ 50 mL Eau d...,An elegant timeless scent filled with a fresh-...,no instructions,unknown,1,0,0,0


In [4]:
# Print each column's dtype and non-null count.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9168 entries, 0 to 9167
Data columns (total 21 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   id                      9168 non-null   int64  
 1   brand                   9168 non-null   object 
 2   category                9168 non-null   object 
 3   name                    9168 non-null   object 
 4   size                    9168 non-null   object 
 5   rating                  9168 non-null   float64
 6   number_of_reviews       9168 non-null   int64  
 7   love                    9168 non-null   int64  
 8   price                   9168 non-null   float64
 9   value_price             9168 non-null   float64
 10  URL                     9168 non-null   object 
 11  MarketingFlags          9168 non-null   bool   
 12  MarketingFlags_content  9168 non-null   object 
 13  options                 9168 non-null   object 
 14  details                 9168 non-null   

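Tidak ditemukan missing value (null) pada data. Namun, beberapa kolom memakai nilai pengganti berupa teks seperti 'unknown', 'no instructions', 'no size', dan 'no options', sehingga informasi pada baris tersebut sebenarnya tidak tersedia.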

In [74]:
# Count products per category and show the 15 largest categories.
df['category'].value_counts().nlargest(15)

Perfume                      665
Moisturizers                 451
Face Serums                  384
Value & Gift Sets            378
Face Wash & Cleansers        247
Face Masks                   230
Rollerballs & Travel Size    228
Hair Styling Products        224
Eye Palettes                 202
Lipstick                     191
Eye Creams & Treatments      191
Shampoo                      186
Face Brushes                 183
Highlighter                  169
Foundation                   163
Name: category, dtype: int64

Hanya dipilih lima category dengan jumlah barang terbanyak, yaitu Perfume, Moisturizers, Face Serums, Face Wash & Cleansers, dan Face Masks. Category Value & Gift Sets (378 barang, peringkat ke-4) tidak diikutsertakan.

In [5]:
# Keep only the rows whose category is one of the five chosen product categories.
data = df[df['category'].isin([
    'Perfume',
    'Moisturizers',
    'Face Serums',
    'Face Wash & Cleansers',
    'Face Masks',
])]
data

Unnamed: 0,id,brand,category,name,size,rating,number_of_reviews,love,price,value_price,...,MarketingFlags,MarketingFlags_content,options,details,how_to_use,ingredients,online_only,exclusive,limited_edition,limited_time_offer
2,1417567,Acqua Di Parma,Perfume,Arancia di Capri,5 oz/ 148 mL,4.5,26,2600,180.0,180.0,...,True,online only,- 1oz/30mL Eau de Toilette - 2.5 oz/ 74 mL E...,Fragrance Family: Fresh Scent Type: Fresh Citr...,no instructions,Alcohol Denat.- Water- Fragrance- Limonene- Li...,1,0,0,0
3,1417617,Acqua Di Parma,Perfume,Mirto di Panarea,2.5 oz/ 74 mL,4.5,23,2900,120.0,120.0,...,True,online only,- 1 oz/ 30 mL Eau de Toilette Spray - 2.5 oz/...,Panarea near Sicily is an an island suspended ...,no instructions,unknown,1,0,0,0
5,1417609,Acqua Di Parma,Perfume,Fico di Amalfi,5 oz/ 148 mL,4.5,79,2600,180.0,180.0,...,True,online only,- 1oz/30mL Eau de Toilette - 2.5 oz/ 74 mL E...,Fragrance Family: Floral Scent Type: Fruity Fl...,no instructions,unknown,1,0,0,0
6,1638832,Acqua Di Parma,Perfume,Rosa Nobile,3.4 oz/ 101 mL,4.5,79,5000,210.0,210.0,...,True,online only,"- 0.7 oz, 20 mL Eau de Parfum Spray - 1.7 oz/...",Fragrance Family: Floral\r\n\r\nScent Type: Cl...,no instructions,unknown,1,0,0,0
9,2221596,Acqua Di Parma,Perfume,Rosa Nobile Hair Mist,1.7oz/50mL,3.0,5,2100,58.0,58.0,...,True,exclusive · online only,- 1.7oz/50mL Hair Mist,Fragrance Family: Floral\r\n\r\nScent Type: Cl...,Suggested Usage:-Spray over your hair to leave...,unknown,1,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9131,2208494,SEPHORA COLLECTION,Face Masks,The Blue Mask,no size,3.5,20,4100,6.0,6.0,...,True,limited edition · exclusive,no options,What it is: A limited-edition- ultra-toning fo...,Suggested Usage:-Unfold the mask.-Apply the ma...,-Sapphire Extract: Smooths tired-looking skin....,0,1,1,0
9141,2210284,SEPHORA COLLECTION,Face Masks,The Golden Mask,no size,4.0,22,5000,6.0,6.0,...,True,limited edition · exclusive,no options,What it is: A limited-edition- ultra-moisturiz...,Suggested Usage:-Unfold the mask.-Apply the ma...,-Honey Extract: Known for its moisturizing pow...,0,1,1,0
9148,1617489,SEPHORA COLLECTION,Face Masks,Mud Mask Purifying & Mattifying,no size,4.5,1000,48200,20.0,20.0,...,True,exclusive,no options,What it is: \r\nA mud mask featuring zinc and ...,Suggested Usage:-Use one to two times per week...,-Zinc and Copper: Purify.\r\n-White Clay: Sof...,0,1,0,0
9152,1662410,SEPHORA COLLECTION,Face Wash & Cleansers,Supreme Cleansing Oil,6.4 oz/ 190 mL,4.5,357,15800,15.0,15.0,...,True,exclusive,no options,What it is:\r\nA cleansing oil with the effect...,Suggested Usage:\r\n-Dispense one dose from th...,-Cottonseed Oil: Nourishes and provides smoot...,0,1,0,0


In [8]:
# Renumber the filtered rows 0..n-1 so row labels match row positions.
# Rebinding the name (instead of inplace=True) leaves the filtered frame
# untouched and keeps this cell safe to re-run.
data = data.reset_index(drop=True)
data

Unnamed: 0,id,brand,category,name,size,rating,number_of_reviews,love,price,value_price,...,MarketingFlags,MarketingFlags_content,options,details,how_to_use,ingredients,online_only,exclusive,limited_edition,limited_time_offer
0,1417567,Acqua Di Parma,Perfume,Arancia di Capri,5 oz/ 148 mL,4.5,26,2600,180.0,180.0,...,True,online only,- 1oz/30mL Eau de Toilette - 2.5 oz/ 74 mL E...,Fragrance Family: Fresh Scent Type: Fresh Citr...,no instructions,Alcohol Denat.- Water- Fragrance- Limonene- Li...,1,0,0,0
1,1417617,Acqua Di Parma,Perfume,Mirto di Panarea,2.5 oz/ 74 mL,4.5,23,2900,120.0,120.0,...,True,online only,- 1 oz/ 30 mL Eau de Toilette Spray - 2.5 oz/...,Panarea near Sicily is an an island suspended ...,no instructions,unknown,1,0,0,0
2,1417609,Acqua Di Parma,Perfume,Fico di Amalfi,5 oz/ 148 mL,4.5,79,2600,180.0,180.0,...,True,online only,- 1oz/30mL Eau de Toilette - 2.5 oz/ 74 mL E...,Fragrance Family: Floral Scent Type: Fruity Fl...,no instructions,unknown,1,0,0,0
3,1638832,Acqua Di Parma,Perfume,Rosa Nobile,3.4 oz/ 101 mL,4.5,79,5000,210.0,210.0,...,True,online only,"- 0.7 oz, 20 mL Eau de Parfum Spray - 1.7 oz/...",Fragrance Family: Floral\r\n\r\nScent Type: Cl...,no instructions,unknown,1,0,0,0
4,2221596,Acqua Di Parma,Perfume,Rosa Nobile Hair Mist,1.7oz/50mL,3.0,5,2100,58.0,58.0,...,True,exclusive · online only,- 1.7oz/50mL Hair Mist,Fragrance Family: Floral\r\n\r\nScent Type: Cl...,Suggested Usage:-Spray over your hair to leave...,unknown,1,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1972,2208494,SEPHORA COLLECTION,Face Masks,The Blue Mask,no size,3.5,20,4100,6.0,6.0,...,True,limited edition · exclusive,no options,What it is: A limited-edition- ultra-toning fo...,Suggested Usage:-Unfold the mask.-Apply the ma...,-Sapphire Extract: Smooths tired-looking skin....,0,1,1,0
1973,2210284,SEPHORA COLLECTION,Face Masks,The Golden Mask,no size,4.0,22,5000,6.0,6.0,...,True,limited edition · exclusive,no options,What it is: A limited-edition- ultra-moisturiz...,Suggested Usage:-Unfold the mask.-Apply the ma...,-Honey Extract: Known for its moisturizing pow...,0,1,1,0
1974,1617489,SEPHORA COLLECTION,Face Masks,Mud Mask Purifying & Mattifying,no size,4.5,1000,48200,20.0,20.0,...,True,exclusive,no options,What it is: \r\nA mud mask featuring zinc and ...,Suggested Usage:-Use one to two times per week...,-Zinc and Copper: Purify.\r\n-White Clay: Sof...,0,1,0,0
1975,1662410,SEPHORA COLLECTION,Face Wash & Cleansers,Supreme Cleansing Oil,6.4 oz/ 190 mL,4.5,357,15800,15.0,15.0,...,True,exclusive,no options,What it is:\r\nA cleansing oil with the effect...,Suggested Usage:\r\n-Dispense one dose from th...,-Cottonseed Oil: Nourishes and provides smoot...,0,1,0,0


# III. Recommendation Using Cosine Similarity

In [10]:
# Word-level TF-IDF with English stop words removed.
tf = TfidfVectorizer(stop_words='english', analyzer='word')

# Vectorise the 'ingredients' and 'category' text columns separately; the
# column transformer places the two TF-IDF blocks side by side in one matrix.
ct = make_column_transformer(
    (tf, 'ingredients'),
    (tf, 'category'),
)
matrix = ct.fit_transform(data)

In [11]:
# Pairwise cosine similarity between every pair of rows of the feature matrix.
cos_sim = cosine_similarity(X=matrix, Y=matrix)

In [12]:
# Similarity of row 0 to every row; the first entry is row 0 compared with itself.
cos_sim[0]

array([1.        , 0.5       , 0.5       , ..., 0.00210024, 0.00631874,
       0.00258247])

In [13]:
# Make row labels equal to row positions so they can index cos_sim directly.
# drop=True avoids adding a redundant 'index' column (which would otherwise be
# written to the exported CSV) and keeps this cell safe to re-run.
data = data.reset_index(drop=True)
names = data['name']

# Map product name -> row position. If a name occurs more than once, keep only
# its first position so that indices[name] always yields a single integer.
indices = pd.Series(data.index, index=data['name'])
indices = indices[~indices.index.duplicated(keep='first')]

In [76]:
def get_recommendations(name, top_n=5):
    """Return the names of the products most similar to ``name``.

    Relies on the notebook-level objects ``indices`` (product name -> row
    position), ``cos_sim`` (square similarity matrix) and ``names`` (product
    names in row order).

    Parameters
    ----------
    name : str
        Product name to look up in ``indices``.
    top_n : int, default 5
        Maximum number of recommendations to return.

    Returns
    -------
    pandas.Series
        Up to ``top_n`` product names, most similar first. The queried row is
        never part of the result.

    Raises
    ------
    KeyError
        If ``name`` is not present in ``indices``.
    """
    idx = indices[name]
    # A duplicated product name makes the lookup return several positions;
    # fall back to the first one so a single similarity row is selected.
    if not np.isscalar(idx):
        idx = idx.iloc[0]
    idx = int(idx)

    scores = np.asarray(cos_sim[idx])
    # Stable descending order: equal scores keep their original row order.
    order = np.argsort(-scores, kind='stable')
    # Drop the queried row explicitly. Assuming it sorts first is wrong when
    # another product has an identical vector and therefore ties at 1.0.
    order = order[order != idx][:top_n]
    return names.iloc[order]

In [78]:
# Example lookup by product name.
get_recommendations('Rose Cleansing Foam')

741     Soy Makeup Removing Face Wash
762           Soy Face Cleansing Milk
1061            Ultra Facial Cleanser
167               Lathering Tube Soap
67           Treatment Cleansing Foam
Name: name, dtype: object

In [None]:
# Persist the similarity matrix. The context manager guarantees the file is
# flushed and closed even if pickling raises.
with open('cos_sim.pkl', 'wb') as cos_sim_file:
    pickle.dump(cos_sim, cos_sim_file)

# IV. Simple Recommendation by Filtering

In [21]:
# Narrow the frame to the columns explored in this section.
df_new = data.loc[:, ['brand', 'name', 'category', 'rating', 'love']]
df_new

Unnamed: 0,brand,name,category,rating,love
0,Acqua Di Parma,Arancia di Capri,Perfume,4.5,2600
1,Acqua Di Parma,Mirto di Panarea,Perfume,4.5,2900
2,Acqua Di Parma,Fico di Amalfi,Perfume,4.5,2600
3,Acqua Di Parma,Rosa Nobile,Perfume,4.5,5000
4,Acqua Di Parma,Rosa Nobile Hair Mist,Perfume,3.0,2100
...,...,...,...,...,...
1972,SEPHORA COLLECTION,The Blue Mask,Face Masks,3.5,4100
1973,SEPHORA COLLECTION,The Golden Mask,Face Masks,4.0,5000
1974,SEPHORA COLLECTION,Mud Mask Purifying & Mattifying,Face Masks,4.5,48200
1975,SEPHORA COLLECTION,Supreme Cleansing Oil,Face Wash & Cleansers,4.5,15800


In [22]:
# Count how many products have each rating value.
df_new['rating'].value_counts()

4.5    934
4.0    646
5.0    162
3.5    149
3.0     40
0.0     30
2.5      8
2.0      5
1.0      3
Name: rating, dtype: int64

In [23]:
# Count products per category in the filtered data.
df_new['category'].value_counts()

Perfume                  665
Moisturizers             451
Face Serums              384
Face Wash & Cleansers    247
Face Masks               230
Name: category, dtype: int64

In [24]:
# Summary statistics of the 'love' count within each category.
df_new.groupby('category')['love'].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Face Masks,230.0,21469.904348,36835.692508,0.0,2425.0,7550.0,23225.0,256200.0
Face Serums,384.0,15602.101562,30937.864606,0.0,1900.0,5250.0,13850.0,225500.0
Face Wash & Cleansers,247.0,14975.421053,27317.720434,0.0,2500.0,5100.0,14200.0,195000.0
Moisturizers,451.0,12365.017738,25056.247135,0.0,1700.0,4300.0,10250.0,233700.0
Perfume,665.0,6953.996992,12732.625647,0.0,1300.0,3100.0,7600.0,144700.0


In [67]:
def get_recommendations_new(input, rating=4.5, min_love=80000, top_n=5):
    """Return product names in a category that pass fixed popularity filters.

    Reads the notebook-level DataFrame ``data``. Matching rows are returned in
    the order they appear in ``data``; they are not ranked.

    Parameters
    ----------
    input : str
        Category to match exactly against ``data['category']``. The name
        shadows the ``input`` builtin; it is kept so existing keyword callers
        continue to work.
    rating : float, default 4.5
        Rating a product must have (exact match).
    min_love : int, default 80000
        Minimum ``love`` count, inclusive.
    top_n : int, default 5
        Maximum number of names to return.

    Returns
    -------
    numpy.ndarray
        Up to ``top_n`` product names; empty when nothing matches.
    """
    mask = (
        (data['category'] == input)
        & (data['rating'] == rating)
        & (data['love'] >= min_love)
    )
    return data.loc[mask, 'name'].to_numpy()[:top_n]

In [68]:
# Example: filtered picks for the 'Face Wash & Cleansers' category.
get_recommendations_new('Face Wash & Cleansers')

array(['MakeUp-BreakUp Cool Cleansing Oil',
       'Detoxifying Black Charcoal Cleanser',
       'Take The Day Off Cleansing Balm',
       'Green Clean Makeup Removing Cleansing Balm',
       'Pure Skin Face Cleanser'], dtype=object)

In [69]:
# Example: filtered picks for the 'Face Masks' category.
get_recommendations_new('Face Masks')

array(['T.L.C. Sukari Babyfacial™  25% AHA + 2% BHA Mask',
       'Umbrian Clay Pore Purifying Face Mask',
       'THIRSTYMUD™ Hydrating Treatment Mask',
       'Blue Tansy Fruit Enzyme Resurfacing Clarity Mask',
       'Water Sleeping Mask'], dtype=object)

In [70]:
# Example: filtered picks for the 'Face Serums' category.
get_recommendations_new('Face Serums')

array(['T.L.C. Framboos™ Glycolic Resurfacing Night Serum',
       'B-Hydra™ Intensive Hydration Serum', 'Lactic Acid 10% + HA',
       'Granactive Retinoid* 2% Emulsion',
       'Violet-C Brightening Serum 20% Vitamin C + 10% AHA'], dtype=object)

In [71]:
# Example: filtered picks for the 'Moisturizers' category.
get_recommendations_new('Moisturizers')

array(['The True Cream Moisturizing Bomb', 'The True Cream Aqua Bomb',
       'Vitamin Enriched Face Base Priming Moisturizer',
       'Dramatically Different Moisturizing Gel',
       'Ultra Repair® Cream Intense Hydration'], dtype=object)

In [72]:
# Example: filtered picks for the 'Perfume' category.
get_recommendations_new('Perfume')

array(['COCO MADEMOISELLE Eau de Parfum', 'Chloé Eau de Parfum', 'Daisy',
       'Bright Crystal', 'Flowerbomb'], dtype=object)

# V. Data Export to CSV

In [74]:
# Write the filtered frame to disk; index=False leaves the row labels out of the file.
data.to_csv('data_clean.csv', index=False)