# Exploratory Data Analysis on Sephora Product Reviews

### Product Info Analysis

In [9]:
import pandas as pd
import numpy as np
import os
import plotly.express as px


In [18]:
# Relative directory that holds the raw CSV files.
DATA_PATH = '../data/raw/'
# Let pandas use up to 2000 characters per line when printing frames.
pd.set_option('display.width', 2000)

# Load the product catalogue and print its shape, column names and first rows.
product_info_path = os.path.join(DATA_PATH, 'product_info.csv')
product_info_df = pd.read_csv(product_info_path)
for preview in (product_info_df.shape, product_info_df.columns, product_info_df.head()):
    print(preview)

(8494, 27)
Index(['product_id', 'product_name', 'brand_id', 'brand_name', 'loves_count', 'rating', 'reviews', 'size', 'variation_type', 'variation_value', 'variation_desc', 'ingredients', 'price_usd', 'value_price_usd', 'sale_price_usd', 'limited_edition', 'new', 'online_only', 'out_of_stock', 'sephora_exclusive', 'highlights', 'primary_category', 'secondary_category', 'tertiary_category', 'child_count', 'child_max_price', 'child_min_price'], dtype='object')
  product_id               product_name  brand_id brand_name  loves_count  rating  reviews            size                      variation_type variation_value  ... online_only out_of_stock  sephora_exclusive                                         highlights  primary_category  secondary_category  tertiary_category  child_count  child_max_price  child_min_price
0    P473671    Fragrance Discovery Set      6342      19-69         6320  3.6364     11.0             NaN                                 NaN             NaN  ...           

In [3]:
def _read_review_chunk(file_name):
    """Read one review CSV located under DATA_PATH.

    The first CSV column becomes the index and `author_id` is parsed as str.
    """
    return pd.read_csv(
        os.path.join(DATA_PATH, file_name),
        index_col=0,
        dtype={'author_id': str},
    )


# The reviews are split across five files; read each one with the same options.
reviews_df1 = _read_review_chunk('reviews_0-250.csv')
reviews_df2 = _read_review_chunk('reviews_250-500.csv')
reviews_df3 = _read_review_chunk('reviews_500-750.csv')
reviews_df4 = _read_review_chunk('reviews_750-1250.csv')
reviews_df5 = _read_review_chunk('reviews_1250-end.csv')

# Stack the chunks into a single frame with a fresh 0..n-1 index.
all_reviews_df = pd.concat(
    [reviews_df1, reviews_df2, reviews_df3, reviews_df4, reviews_df5],
    ignore_index=True,
)
print(all_reviews_df.shape)
print(all_reviews_df.head())

(1094411, 18)
     author_id  rating  is_recommended  helpfulness  total_feedback_count  total_neg_feedback_count  total_pos_feedback_count submission_time                                        review_text                      review_title skin_tone eye_color    skin_type hair_color product_id                                       product_name brand_name  price_usd
0   1741593524       5             1.0          1.0                     2                         0                         2      2023-02-01  I use this with the Nudestix “Citrus Clean Bal...  Taught me how to double cleanse!       NaN     brown          dry      black    P504322                     Gentle Hydra-Gel Face Cleanser   NUDESTIX       19.0
1  31423088263       1             0.0          NaN                     0                         0                         0      2023-03-21  I bought this lip mask after reading the revie...                      Disappointed       NaN       NaN          NaN        NaN    P4

In [4]:
# Summary statistics for every column of the product table, not only the numeric ones.
product_info_summary = product_info_df.describe(include='all')
product_info_summary

Unnamed: 0,product_id,product_name,brand_id,brand_name,loves_count,rating,reviews,size,variation_type,variation_value,...,online_only,out_of_stock,sephora_exclusive,highlights,primary_category,secondary_category,tertiary_category,child_count,child_max_price,child_min_price
count,8494,8494,8494.0,8494,8494.0,8216.0,8216.0,6863,7050,6896,...,8494.0,8494.0,8494.0,6287,8494,8486,7504,8494.0,2754.0,2754.0
unique,8494,8415,,304,,,,2055,7,2729,...,,,,4417,9,41,118,,,
top,P473671,Fragrance Discovery Set,,SEPHORA COLLECTION,,,,1.7 oz/ 50 mL,Size,1.7 oz/ 50 mL,...,,,,"['Layerable Scent', 'Floral Scent']",Skincare,Women,Perfume,,,
freq,1,3,,352,,,,500,4043,374,...,,,,64,2420,875,568,,,
mean,,,5422.440546,,29179.57,4.194513,448.545521,,,,...,0.219096,0.073699,0.279374,,,,,1.631622,53.792023,39.665802
std,,,1709.595957,,66092.12,0.516694,1101.982529,,,,...,0.413658,0.261296,0.448718,,,,,5.37947,58.765894,38.68572
min,,,1063.0,,0.0,1.0,1.0,,,,...,0.0,0.0,0.0,,,,,0.0,3.0,3.0
25%,,,5333.0,,3758.0,3.981725,26.0,,,,...,0.0,0.0,0.0,,,,,0.0,22.0,19.0
50%,,,6157.5,,9880.0,4.28935,122.0,,,,...,0.0,0.0,0.0,,,,,0.0,32.0,28.0
75%,,,6328.0,,26841.25,4.530525,418.0,,,,...,0.0,0.0,1.0,,,,,1.0,59.0,42.0


In [5]:
# Select the product_info_df columns that all_reviews_df does NOT already have,
# then add 'product_id' back to the list (it is present in both frames, so the
# difference drops it).
cols_to_use = [
    *product_info_df.columns.difference(all_reviews_df.columns),
    'product_id',
]
print(cols_to_use)

['brand_id', 'child_count', 'child_max_price', 'child_min_price', 'highlights', 'ingredients', 'limited_edition', 'loves_count', 'new', 'online_only', 'out_of_stock', 'primary_category', 'reviews', 'sale_price_usd', 'secondary_category', 'sephora_exclusive', 'size', 'tertiary_category', 'value_price_usd', 'variation_desc', 'variation_type', 'variation_value', 'product_id']


In [6]:
# Attach the product-level columns to every review row.
# how='left' keeps every review even when its product_id has no match.
# validate='many_to_one' makes pandas raise MergeError if product_id is
# duplicated in the product table; without the check, such duplicates would
# silently multiply review rows.
reviews_df = pd.merge(
    all_reviews_df,
    product_info_df[cols_to_use],
    how='left',
    on='product_id',
    validate='many_to_one',
)
print(reviews_df.shape)
print(reviews_df.head())

(1094411, 40)
     author_id  rating  is_recommended  helpfulness  total_feedback_count  total_neg_feedback_count  total_pos_feedback_count submission_time                                        review_text                      review_title  ...  reviews sale_price_usd      secondary_category sephora_exclusive            size tertiary_category value_price_usd  variation_desc  variation_type  variation_value
0   1741593524       5             1.0          1.0                     2                         0                         2      2023-02-01  I use this with the Nudestix “Citrus Clean Bal...  Taught me how to double cleanse!  ...      1.0            NaN               Cleansers                 0  2.4 oz / 70 ml               NaN             NaN             NaN            Size   2.4 oz / 70 ml
1  31423088263       1             0.0          NaN                     0                         0                         0      2023-03-21  I bought this lip mask after reading the revie...

In [7]:
# Plain-text summary statistics for every column of the merged review frame.
reviews_summary = reviews_df.describe(include='all')
print(reviews_summary)

         author_id        rating  is_recommended    helpfulness  total_feedback_count  total_neg_feedback_count  total_pos_feedback_count submission_time                                        review_text review_title  ...       reviews sale_price_usd secondary_category sephora_exclusive           size tertiary_category value_price_usd                   variation_desc  variation_type  variation_value
count      1094411  1.094411e+06   926423.000000  532819.000000          1.094411e+06              1.094411e+06              1.094411e+06         1094411                                            1092967       783757  ...  1.094411e+06    9753.000000            1094411      1.094411e+06        1051048            933155    30877.000000                             8283         1042956          1030978
unique      503216           NaN             NaN            NaN                   NaN                       NaN                       NaN            5317                                       

In [24]:
import plotly.express as px
import pandas as pd

# Tally products per primary category (missing categories are not counted).
category_counts = (
    product_info_df['primary_category']
    .value_counts(dropna=True)
    .rename_axis('primary_category')
    .reset_index(name='count')
)
total_products = category_counts['count'].sum()
category_counts['percent'] = category_counts['count'] / total_products

# Categories whose share of products is below this fraction are pooled together.
threshold = 0.03
is_main = category_counts['percent'] >= threshold
is_minor = category_counts['percent'] < threshold
main_cats = category_counts[is_main]
others_sum = category_counts.loc[is_minor, 'count'].sum()

# Start from the large categories and append one 'Others' row only when
# something was actually pooled.
category_final = main_cats
if others_sum > 0:
    others_row = pd.DataFrame({
        'primary_category': ['Others'],
        'count': [others_sum],
        'percent': [others_sum / total_products],
    })
    category_final = pd.concat([main_cats, others_row], ignore_index=True)

# Donut chart of the grouped category counts.
fig_cat = px.pie(
    category_final,
    values='count',
    names='primary_category',
    hole=0.3,
    title='Distribution of Products by Primary Category (Grouped)',
    height=700,
    width=700,
)
fig_cat.update_traces(textinfo='percent+label', textposition='inside')
fig_cat.show()

In [None]:
# Coerce 'reviews' to a numeric dtype; values that cannot be parsed become NaN.
reviews_numeric = pd.to_numeric(product_info_df['reviews'], errors='coerce')
product_info_df['reviews'] = reviews_numeric

# Rank products by review count (highest first) and keep the first 20 rows.
ranked_by_reviews = product_info_df.sort_values('reviews', ascending=False)
top_products = ranked_by_reviews.iloc[:20]

# One bar per product, coloured by brand, with extra product fields on hover.
fig = px.bar(
    top_products,
    x='product_name',
    y='reviews',
    color='brand_name',
    hover_data=['product_id', 'rating', 'loves_count'],
    title='Top 20 Most Reviewed Sephora Products',
)

# Tilt the x tick labels, label both axes, set the gap between bars and the
# figure height.
layout_options = dict(
    xaxis_tickangle=-45,
    xaxis_title='Product Name',
    yaxis_title='Number of Reviews',
    bargap=0.3,
    height=700,
)
fig.update_layout(**layout_options)

fig.show()