# Exploratory Data Analysis on Sephora Product Reviews

### Product Info Analysis

In [2]:
import pandas as pd
import numpy as np
import os
import plotly.express as px


In [3]:
# Folder that holds the raw CSV exports.
DATA_PATH = '../data/raw/'
# Allow very wide console output for the printed frames below.
pd.set_option('display.width', 2000)

# Load the product catalogue, then print its shape, column index and first rows.
product_info_path = os.path.join(DATA_PATH, 'product_info.csv')
product_info_df = pd.read_csv(product_info_path)
print(
    product_info_df.shape,
    product_info_df.columns,
    product_info_df.head(),
    sep='\n',
)

(8494, 27)
Index(['product_id', 'product_name', 'brand_id', 'brand_name', 'loves_count', 'rating', 'reviews', 'size', 'variation_type', 'variation_value', 'variation_desc', 'ingredients', 'price_usd', 'value_price_usd', 'sale_price_usd', 'limited_edition', 'new', 'online_only', 'out_of_stock', 'sephora_exclusive', 'highlights', 'primary_category', 'secondary_category', 'tertiary_category', 'child_count', 'child_max_price', 'child_min_price'], dtype='object')
  product_id               product_name  brand_id brand_name  loves_count  rating  reviews            size                      variation_type variation_value  ... online_only out_of_stock  sephora_exclusive                                         highlights  primary_category  secondary_category  tertiary_category  child_count  child_max_price  child_min_price
0    P473671    Fragrance Discovery Set      6342      19-69         6320  3.6364     11.0             NaN                                 NaN             NaN  ...           

In [4]:
def _read_review_chunk(file_name):
    """Read one review CSV from DATA_PATH.

    The first CSV column is used as the index and `author_id` is kept as text.
    """
    return pd.read_csv(
        os.path.join(DATA_PATH, file_name),
        index_col=0,
        dtype={'author_id': str},
    )


# The reviews are split across five files; load each one, then stack them
# into a single frame with a fresh 0..n-1 index.
reviews_df1 = _read_review_chunk('reviews_0-250.csv')
reviews_df2 = _read_review_chunk('reviews_250-500.csv')
reviews_df3 = _read_review_chunk('reviews_500-750.csv')
reviews_df4 = _read_review_chunk('reviews_750-1250.csv')
reviews_df5 = _read_review_chunk('reviews_1250-end.csv')
all_reviews_df = pd.concat(
    [reviews_df1, reviews_df2, reviews_df3, reviews_df4, reviews_df5],
    ignore_index=True,
)
print(
    all_reviews_df.shape,
    all_reviews_df.columns,
    all_reviews_df.head(),
    sep='\n',
)

(1094411, 18)
Index(['author_id', 'rating', 'is_recommended', 'helpfulness', 'total_feedback_count', 'total_neg_feedback_count', 'total_pos_feedback_count', 'submission_time', 'review_text', 'review_title', 'skin_tone', 'eye_color', 'skin_type', 'hair_color', 'product_id', 'product_name', 'brand_name', 'price_usd'], dtype='object')
     author_id  rating  is_recommended  helpfulness  total_feedback_count  total_neg_feedback_count  total_pos_feedback_count submission_time                                        review_text                      review_title skin_tone eye_color    skin_type hair_color product_id                                       product_name brand_name  price_usd
0   1741593524       5             1.0          1.0                     2                         0                         2      2023-02-01  I use this with the Nudestix “Citrus Clean Bal...  Taught me how to double cleanse!       NaN     brown          dry      black    P504322                     Gentle Hy

In [5]:
# Summary statistics for every column (numeric and non-numeric alike).
product_info_summary = product_info_df.describe(include='all')
product_info_summary

Unnamed: 0,product_id,product_name,brand_id,brand_name,loves_count,rating,reviews,size,variation_type,variation_value,...,online_only,out_of_stock,sephora_exclusive,highlights,primary_category,secondary_category,tertiary_category,child_count,child_max_price,child_min_price
count,8494,8494,8494.0,8494,8494.0,8216.0,8216.0,6863,7050,6896,...,8494.0,8494.0,8494.0,6287,8494,8486,7504,8494.0,2754.0,2754.0
unique,8494,8415,,304,,,,2055,7,2729,...,,,,4417,9,41,118,,,
top,P473671,Fragrance Discovery Set,,SEPHORA COLLECTION,,,,1.7 oz/ 50 mL,Size,1.7 oz/ 50 mL,...,,,,"['Layerable Scent', 'Floral Scent']",Skincare,Women,Perfume,,,
freq,1,3,,352,,,,500,4043,374,...,,,,64,2420,875,568,,,
mean,,,5422.440546,,29179.57,4.194513,448.545521,,,,...,0.219096,0.073699,0.279374,,,,,1.631622,53.792023,39.665802
std,,,1709.595957,,66092.12,0.516694,1101.982529,,,,...,0.413658,0.261296,0.448718,,,,,5.37947,58.765894,38.68572
min,,,1063.0,,0.0,1.0,1.0,,,,...,0.0,0.0,0.0,,,,,0.0,3.0,3.0
25%,,,5333.0,,3758.0,3.981725,26.0,,,,...,0.0,0.0,0.0,,,,,0.0,22.0,19.0
50%,,,6157.5,,9880.0,4.28935,122.0,,,,...,0.0,0.0,0.0,,,,,0.0,32.0,28.0
75%,,,6328.0,,26841.25,4.530525,418.0,,,,...,0.0,0.0,1.0,,,,,1.0,59.0,42.0


In [6]:
# Select the product_info_df columns that all_reviews_df does not already
# contain, and add 'product_id' back so the two frames can be joined on it.
cols_to_use = [
    *product_info_df.columns.difference(all_reviews_df.columns),
    'product_id',
]
print(cols_to_use)

['brand_id', 'child_count', 'child_max_price', 'child_min_price', 'highlights', 'ingredients', 'limited_edition', 'loves_count', 'new', 'online_only', 'out_of_stock', 'primary_category', 'reviews', 'sale_price_usd', 'secondary_category', 'sephora_exclusive', 'size', 'tertiary_category', 'value_price_usd', 'variation_desc', 'variation_type', 'variation_value', 'product_id']


In [7]:
# Left-join the extra product columns onto every review via product_id,
# so each review row is kept even when no product row matches.
product_extra_cols = product_info_df[cols_to_use]
reviews_df = all_reviews_df.merge(product_extra_cols, how='left', on='product_id')
print(reviews_df.shape, reviews_df.head(), sep='\n')

(1094411, 40)
     author_id  rating  is_recommended  helpfulness  total_feedback_count  total_neg_feedback_count  total_pos_feedback_count submission_time                                        review_text                      review_title  ...  reviews sale_price_usd      secondary_category sephora_exclusive            size tertiary_category value_price_usd  variation_desc  variation_type  variation_value
0   1741593524       5             1.0          1.0                     2                         0                         2      2023-02-01  I use this with the Nudestix “Citrus Clean Bal...  Taught me how to double cleanse!  ...      1.0            NaN               Cleansers                 0  2.4 oz / 70 ml               NaN             NaN             NaN            Size   2.4 oz / 70 ml
1  31423088263       1             0.0          NaN                     0                         0                         0      2023-03-21  I bought this lip mask after reading the revie...

In [8]:
# Summary statistics for every column of the merged review frame.
reviews_summary = reviews_df.describe(include='all')
print(reviews_summary)

         author_id        rating  is_recommended    helpfulness  total_feedback_count  total_neg_feedback_count  total_pos_feedback_count submission_time                                        review_text review_title  ...       reviews sale_price_usd secondary_category sephora_exclusive           size tertiary_category value_price_usd                   variation_desc  variation_type  variation_value
count      1094411  1.094411e+06   926423.000000  532819.000000          1.094411e+06              1.094411e+06              1.094411e+06         1094411                                            1092967       783757  ...  1.094411e+06    9753.000000            1094411      1.094411e+06        1051048            933155    30877.000000                             8283         1042956          1030978
unique      503216           NaN             NaN            NaN                   NaN                       NaN                       NaN            5317                                       

In [9]:
import plotly.express as px
import pandas as pd

# Number of products per primary category (missing categories are not counted).
category_counts = (
    product_info_df['primary_category']
    .value_counts()
    .rename_axis('primary_category')
    .reset_index(name='count')
)
total_products = category_counts['count'].sum()
category_counts['percent'] = category_counts['count'] / total_products

# Categories whose share of all products is below this threshold are folded
# into a single "Others" slice.
threshold = 0.03
is_main = category_counts['percent'] >= threshold
main_cats = category_counts[is_main]
others_sum = category_counts.loc[~is_main, 'count'].sum()

# Append the "Others" row only when at least one product fell below the threshold.
category_final = main_cats
if others_sum > 0:
    others_row = pd.DataFrame({
        'primary_category': ['Others'],
        'count': [others_sum],
        'percent': [others_sum / total_products],
    })
    category_final = pd.concat([main_cats, others_row], ignore_index=True)

# Donut chart of the grouped categories, labelled inside each slice.
fig_cat = px.pie(
    category_final,
    names='primary_category',
    values='count',
    title='Distribution of Products by Primary Category (Grouped)',
    hole=0.3,
    width=700,
    height=700,
).update_traces(textposition='inside', textinfo='percent+label')
fig_cat.show()

In [129]:
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go

# Stacked bar chart of good vs. bad review counts for the most-reviewed products.

# How many of the most-reviewed products to plot, and the two review labels.
TOP_N = 40
GOOD_LABEL = 'Good (≥ 4⭐)'
BAD_LABEL = 'Bad (< 4⭐)'

# Steps 1-2: coerce ratings to numbers, drop reviews without a rating or a
# product id, and label each review. The frame is re-bound rather than mutated
# column by column, and the label is computed vectorised instead of with a
# per-row lambda over ~1M rows.
all_reviews_df = (
    all_reviews_df
    .assign(rating=lambda d: pd.to_numeric(d['rating'], errors='coerce'))
    .dropna(subset=['rating', 'product_id'])
    .assign(review_type=lambda d: np.where(d['rating'] >= 4, GOOD_LABEL, BAD_LABEL))
)

# Step 3: the TOP_N products with the most reviews, most-reviewed first.
review_totals = all_reviews_df.groupby('product_id').size().reset_index(name='total_reviews')
top_x_ids = review_totals.sort_values(by='total_reviews', ascending=False).head(TOP_N)

# Step 4: keep only the reviews of those products.
filtered_reviews = all_reviews_df[all_reviews_df['product_id'].isin(top_x_ids['product_id'])]

# Step 5: good/bad review counts per product.
review_counts = (
    filtered_reviews
    .groupby(['product_id', 'review_type'], observed=True)
    .size()
    .reset_index(name='count')
)

# Step 6: one display label per product, in ranking order.
# - a left join keeps products that are missing from product_info_df;
# - such products fall back to their id as label;
# - names shared by several top products get the id appended, because the
#   categorical axis below requires unique categories (pd.Categorical raises
#   on duplicates) and identical labels would merge distinct products.
product_order = top_x_ids.merge(
    product_info_df[['product_id', 'product_name', 'brand_name']].drop_duplicates('product_id'),
    on='product_id',
    how='left',
)
product_order['product_name'] = product_order['product_name'].fillna(product_order['product_id'])
has_shared_name = product_order['product_name'].duplicated(keep=False)
product_order.loc[has_shared_name, 'product_name'] = (
    product_order.loc[has_shared_name, 'product_name']
    + ' ('
    + product_order.loc[has_shared_name, 'product_id']
    + ')'
)
ordered_names = product_order['product_name'].tolist()

# Step 7: attach labels and brands to the counts and fix the x-axis order.
review_counts = review_counts.merge(
    product_order[['product_id', 'product_name', 'brand_name']],
    on='product_id',
    how='left',
)
review_counts['product_name'] = pd.Categorical(
    review_counts['product_name'],
    categories=ordered_names,
    ordered=True,
)

# Step 8: stacked bar chart.
fig = px.bar(
    review_counts.sort_values('product_name'),
    x='product_name',
    y='count',
    color='review_type',
    text='count',
    hover_data=['brand_name'],
    title=f'Top {TOP_N} Most Reviewed Products — Good vs. Bad Review Count',
    barmode='stack',
    category_orders={"product_name": ordered_names},
    color_discrete_map={
        GOOD_LABEL: 'rgba(0, 0, 0, 0.95)',
        BAD_LABEL: 'rgba(255, 0, 0, 0.95)'
    }
)

# --- Background stripes ---
# The tallest stacked bar equals the largest per-product review total, so the
# axis ceiling is that total plus 10% headroom.
y_max = top_x_ids['total_reviews'].max() * 1.1
band_count = 7
band_height = y_max / band_count

# Shade every other horizontal band across the full plot width.
for i in range(0, band_count, 2):
    fig.add_shape(
        type='rect',
        xref='paper',
        yref='y',
        x0=0,
        x1=1,
        y0=i * band_height,
        y1=(i + 1) * band_height,
        fillcolor='rgba(0, 0, 0, 0.07)',
        line_width=0,
        layer='below'
    )

# --- Final styling ---
fig.update_layout(
    xaxis=dict(showgrid=False, tickangle=-45, title='Product Name'),
    yaxis=dict(
        title='Number of Reviews',
        range=[0, y_max],
        showgrid=False
    ),
    plot_bgcolor='white',
    height=800,
    margin=dict(t=80, b=60, l=60, r=60),
    font=dict(family='Arial', size=14),
    showlegend=True
)

fig.show()





In [None]:
import plotly.graph_objects as go
import pandas as pd

# Loves count vs. price, one category at a time (dropdown), with a rolling-mean trend.

# Number of consecutive products (ordered by price) averaged for the trend line.
ROLLING_WINDOW = 50

# Keep only products that have a price, a loves count and a primary category.
df = product_info_df.dropna(subset=['price_usd', 'loves_count', 'primary_category']).copy()
categories = df['primary_category'].unique()

# Shared y-axis ceiling with 5% headroom. A rolling mean can never exceed the
# largest raw value, so the raw maximum alone bounds both the points and the trend.
y_max_loves = df['loves_count'].max() * 1.05

# Two traces per category (points, trend) and one dropdown button per category.
traces = []
buttons = []

for i, cat in enumerate(categories):
    df_cat = df[df['primary_category'] == cat].sort_values(by='price_usd')
    smooth_loves = df_cat['loves_count'].rolling(window=ROLLING_WINDOW, min_periods=1).mean()

    # Heart glyphs (semi-transparent red) drawn as text markers; the hover
    # header shows the product name instead of repeating the glyph.
    trace_points = go.Scatter(
        x=df_cat['price_usd'],
        y=df_cat['loves_count'],
        mode='text',
        name='Loves Count',
        text=['♥'] * len(df_cat),
        customdata=df_cat['product_name'],
        textfont=dict(size=16, color='rgba(255, 50, 50, 0.6)'),
        hovertemplate='<b>%{customdata}</b><br>Price: $%{x}<br>Loves: %{y}<extra></extra>',
        visible=(i == 0)
    )

    # Rolling-mean trend line over price-ordered products.
    trace_line = go.Scatter(
        x=df_cat['price_usd'],
        y=smooth_loves,
        mode='lines',
        name='Loves Trend',
        line=dict(color='black', width=2),
        hoverinfo='skip',
        visible=(i == 0)
    )

    traces.extend([trace_points, trace_line])

    # Visibility mask: only this category's two traces are shown.
    vis = [False] * len(categories) * 2
    vis[i * 2] = True
    vis[i * 2 + 1] = True

    buttons.append(dict(
        label=cat,
        method='update',
        args=[{'visible': vis},
              {'title': f'Loves Count vs. Price — {cat}'}]
    ))

# Alternating horizontal background bands spanning the full plot width.
band_count = 10
band_height = y_max_loves / band_count
band_shapes = [
    dict(
        type='rect',
        xref='paper', yref='y',
        x0=0, x1=1,
        y0=i * band_height,
        y1=(i + 1) * band_height,
        fillcolor='rgba(0, 0, 0, 0.1)',
        layer='below',
        line_width=0
    )
    for i in range(0, band_count, 2)
]

# Build the figure; the first category is the one initially visible.
fig1 = go.Figure(data=traces)
fig1.update_layout(
    updatemenus=[dict(
        type='dropdown',
        buttons=buttons,
        active=0,
        x=1.05,
        xanchor='left',
        y=1.15,
        yanchor='top'
    )],
    title=f'Loves Count vs. Price — {categories[0]}',
    xaxis=dict(title='Price (USD)', showgrid=False),
    yaxis=dict(title='Loves Count', range=[0, y_max_loves], showgrid=False),
    shapes=band_shapes,
    plot_bgcolor='white',
    font=dict(family='Arial', size=14),
    height=600,
    margin=dict(t=80, b=60, l=60, r=60)
)

fig1.show()

In [None]:
import plotly.graph_objects as go
import pandas as pd

# Rating vs. price, one category at a time (dropdown), with a rolling-mean trend.

# Number of consecutive products (ordered by price) averaged for the trend line.
ROLLING_WINDOW = 50
# Upper limit for the y-axis ceiling, slightly above the top of the rating scale.
RATING_AXIS_CAP = 5.2

# Keep only products that have a price, a rating and a primary category.
df_rating = product_info_df.dropna(subset=['price_usd', 'rating', 'primary_category']).copy()
categories = df_rating['primary_category'].unique()

# Two traces per category (points, trend), one dropdown button per category,
# and the y-axis range each category uses.
traces = []
buttons = []
y_ranges = []

for i, cat in enumerate(categories):
    df_cat = df_rating[df_rating['primary_category'] == cat].sort_values(by='price_usd')
    smooth_rating = df_cat['rating'].rolling(window=ROLLING_WINDOW, min_periods=1).mean()

    # Semi-transparent amber stars drawn as text markers; the hover header
    # shows the product name instead of repeating the glyph.
    trace_points = go.Scatter(
        x=df_cat['price_usd'],
        y=df_cat['rating'],
        mode='text',
        name='Rating',
        text=['★'] * len(df_cat),
        customdata=df_cat['product_name'],
        textfont=dict(size=10, color='rgba(252, 175, 43, 0.6)'),
        textposition='middle center',
        hovertemplate='<b>%{customdata}</b><br>Price: $%{x}<br>Rating: %{y}<extra></extra>',
        visible=(i == 0)
    )

    # Rolling-mean trend line over price-ordered products.
    trace_line = go.Scatter(
        x=df_cat['price_usd'],
        y=smooth_rating,
        mode='lines',
        name='Rating Trend',
        line=dict(color='black', width=2),
        hoverinfo='skip',
        visible=(i == 0)
    )

    traces.extend([trace_points, trace_line])

    # Per-category ceiling: highest rating plus 5% headroom, capped. The trend
    # is a mean of the ratings, so the raw maximum already bounds it.
    y_max_cat = min(float(df_cat['rating'].max()) * 1.05, RATING_AXIS_CAP)
    y_ranges.append([0, y_max_cat])

    # Visibility mask: only this category's two traces are shown.
    vis = [False] * len(categories) * 2
    vis[i * 2] = True
    vis[i * 2 + 1] = True

    buttons.append(dict(
        label=cat,
        method='update',
        args=[
            {'visible': vis},
            {
                'title': f'Rating vs. Price — {cat}',
                # Dotted attribute path: changes only the range. A nested
                # {'yaxis': {...}} object can replace the whole axis spec
                # and drop the axis title.
                'yaxis.range': [0, y_max_cat]
            }
        ]
    ))

# Alternating horizontal background bands across the 0-5 rating scale.
band_count = 10
band_height = 5 / band_count
band_shapes = [
    dict(
        type='rect',
        xref='paper', yref='y',
        x0=0, x1=1,
        y0=i * band_height,
        y1=(i + 1) * band_height,
        fillcolor='rgba(0, 0, 0, 0.1)',
        layer='below',
        line_width=0
    )
    for i in range(0, band_count, 2)
]

# Build the figure. The initial y-range is the first category's own range, so
# the first view matches what its dropdown entry would apply.
fig2 = go.Figure(data=traces)
fig2.update_layout(
    updatemenus=[dict(
        type='dropdown',
        buttons=buttons,
        active=0,
        x=1.05,
        xanchor='left',
        y=1.15,
        yanchor='top'
    )],
    title=f'Rating vs. Price — {categories[0]}',
    xaxis=dict(title='Price (USD)', showgrid=False),
    yaxis=dict(title='Average Rating', range=y_ranges[0]),
    shapes=band_shapes,
    plot_bgcolor='white',
    font=dict(family='Arial', size=14),
    height=600,
    margin=dict(t=80, b=60, l=60, r=60)
)

fig2.show()

In [None]:
import plotly.graph_objects as go
import pandas as pd

# Review count vs. price, one category at a time (dropdown), with a rolling-mean trend.

# Number of consecutive products (ordered by price) averaged for the trend line.
ROLLING_WINDOW = 50

# Coerce the review counts to numbers first, then drop incomplete rows, so
# values that fail to parse (turned into NaN by the coercion) are removed too.
df_reviews = (
    product_info_df
    .assign(reviews=lambda d: pd.to_numeric(d['reviews'], errors='coerce'))
    .dropna(subset=['price_usd', 'reviews', 'primary_category'])
)
categories = df_reviews['primary_category'].unique()

# Shared y-axis ceiling (largest review count plus 5% headroom) for all categories.
y_max_global = df_reviews['reviews'].max() * 1.05

# Two traces per category (points, trend) and one dropdown button per category.
traces = []
buttons = []

for i, cat in enumerate(categories):
    df_cat = df_reviews[df_reviews['primary_category'] == cat].sort_values(by='price_usd')
    smooth_reviews = df_cat['reviews'].rolling(window=ROLLING_WINDOW, min_periods=1).mean()

    # Speech-bubble glyphs drawn as text markers; the hover header shows the
    # product name instead of repeating the glyph.
    trace_points = go.Scatter(
        x=df_cat['price_usd'],
        y=df_cat['reviews'],
        mode='text',
        name='Reviews',
        text=['🗨️'] * len(df_cat),
        customdata=df_cat['product_name'],
        textfont=dict(size=10, color='rgba(135, 3, 56, 0.5)'),
        textposition='middle center',
        hovertemplate='<b>%{customdata}</b><br>Price: $%{x}<br>Reviews: %{y}<extra></extra>',
        visible=(i == 0)
    )

    # Rolling-mean trend line over price-ordered products.
    trace_line = go.Scatter(
        x=df_cat['price_usd'],
        y=smooth_reviews,
        mode='lines',
        name='Reviews Trend',
        line=dict(color='rgb(255, 50, 50)', width=2),
        hoverinfo='skip',
        visible=(i == 0)
    )

    traces.extend([trace_points, trace_line])

    # Visibility mask: only this category's two traces are shown.
    vis = [False] * len(categories) * 2
    vis[i * 2] = True
    vis[i * 2 + 1] = True

    # The y-axis keeps the shared global range; the button only swaps the
    # visible traces and the title.
    buttons.append(dict(
        label=cat,
        method='update',
        args=[
            {'visible': vis},
            {'title': f'Reviews vs. Price — {cat}'}
        ]
    ))

# Alternating horizontal background bands on the shared global scale.
band_count = 10
band_height = y_max_global / band_count
band_shapes = [
    dict(
        type='rect',
        xref='paper', yref='y',
        x0=0, x1=1,
        y0=i * band_height,
        y1=(i + 1) * band_height,
        fillcolor='rgba(0, 0, 0, 0.1)',
        layer='below',
        line_width=0
    )
    for i in range(0, band_count, 2)
]

# Build the figure; the first category is the one initially visible.
fig3 = go.Figure(data=traces)
fig3.update_layout(
    updatemenus=[dict(
        type='dropdown',
        buttons=buttons,
        active=0,
        x=1.05,
        xanchor='left',
        y=1.15,
        yanchor='top'
    )],
    title=f'Reviews vs. Price — {categories[0]}',
    xaxis=dict(title='Price (USD)', showgrid=False),
    yaxis=dict(title='Number of Reviews', range=[0, y_max_global], showgrid=False),
    shapes=band_shapes,
    plot_bgcolor='white',
    font=dict(family='Arial', size=14),
    height=600,
    margin=dict(t=80, b=60, l=60, r=60)
)

fig3.show()