In [38]:
import numpy as np
import pandas as pd

import warnings
from pandas.errors import DtypeWarning
pd.set_option('display.max_columns', None)
warnings.filterwarnings('ignore', category=DtypeWarning)

The reviews are split across five CSV files:

- `reviews_0-250.csv`: reviews for products indexed from 0 to 250.
- `reviews_250-500.csv`: reviews for products indexed from 250 to 500.
- `reviews_500-750.csv`: reviews for products indexed from 500 to 750.
- `reviews_750-1250.csv`: reviews for products indexed from 750 to 1250.
- `reviews_1250-end.csv`: reviews for products indexed from 1250 to the last product in the dataset.

The data is likely split this way to keep file sizes manageable and the data easier to handle. Each file contains all reviews for the products in its index range.

In [39]:
# Product table, plus a snapshot of it exactly as loaded.
pro_inf = pd.read_csv('sephora_datasets/product_info.csv')
pro_inf_original = pro_inf.copy()

# The five review files, read in product-index order.
rew_1, rew_2, rew_3, rew_4, rew_5 = (
    pd.read_csv(review_csv)
    for review_csv in (
        'sephora_datasets/reviews_0-250.csv',
        'sephora_datasets/reviews_250-500.csv',
        'sephora_datasets/reviews_500-750.csv',
        'sephora_datasets/reviews_750-1250.csv',
        'sephora_datasets/reviews_1250-end.csv',
    )
)

In [40]:
# Structural overview of the product table: dimensions, column names, first rows.
print("Shape:", pro_inf.shape)
print("Columns:", list(pro_inf.columns))
pro_inf.head(n=3)

Shape: (8494, 27)
Columns: ['product_id', 'product_name', 'brand_id', 'brand_name', 'loves_count', 'rating', 'reviews', 'size', 'variation_type', 'variation_value', 'variation_desc', 'ingredients', 'price_usd', 'value_price_usd', 'sale_price_usd', 'limited_edition', 'new', 'online_only', 'out_of_stock', 'sephora_exclusive', 'highlights', 'primary_category', 'secondary_category', 'tertiary_category', 'child_count', 'child_max_price', 'child_min_price']


Unnamed: 0,product_id,product_name,brand_id,brand_name,loves_count,rating,reviews,size,variation_type,variation_value,variation_desc,ingredients,price_usd,value_price_usd,sale_price_usd,limited_edition,new,online_only,out_of_stock,sephora_exclusive,highlights,primary_category,secondary_category,tertiary_category,child_count,child_max_price,child_min_price
0,P473671,Fragrance Discovery Set,6342,19-69,6320,3.6364,11.0,,,,,"['Capri Eau de Parfum:', 'Alcohol Denat. (SD A...",35.0,,,0,0,1,0,0,"['Unisex/ Genderless Scent', 'Warm &Spicy Scen...",Fragrance,Value & Gift Sets,Perfume Gift Sets,0,,
1,P473668,La Habana Eau de Parfum,6342,19-69,3827,4.1538,13.0,3.4 oz/ 100 mL,Size + Concentration + Formulation,3.4 oz/ 100 mL,,"['Alcohol Denat. (SD Alcohol 39C), Parfum (Fra...",195.0,,,0,0,1,0,0,"['Unisex/ Genderless Scent', 'Layerable Scent'...",Fragrance,Women,Perfume,2,85.0,30.0
2,P473662,Rainbow Bar Eau de Parfum,6342,19-69,3253,4.25,16.0,3.4 oz/ 100 mL,Size + Concentration + Formulation,3.4 oz/ 100 mL,,"['Alcohol Denat. (SD Alcohol 39C), Parfum (Fra...",195.0,,,0,0,1,0,0,"['Unisex/ Genderless Scent', 'Layerable Scent'...",Fragrance,Women,Perfume,2,75.0,30.0


In [65]:
# Number of distinct values per column, then the number of missing values per column.
display(pro_inf.nunique())
print('************************** missing values:')
display(pro_inf.isna().sum())

product_id            8494
product_name          8415
brand_id               304
brand_name             304
loves_count           7436
rating                4394
reviews               1556
size                  2055
variation_type           7
variation_value       2729
variation_desc         935
ingredients           6538
price_usd              298
value_price_usd        174
sale_price_usd          88
limited_edition          2
new                      2
online_only              2
out_of_stock             2
sephora_exclusive        2
highlights            4417
primary_category         9
secondary_category      41
tertiary_category      118
child_count             55
child_max_price        222
child_min_price        208
dtype: int64

************************** missing values:


product_id               0
product_name             0
brand_id                 0
brand_name               0
loves_count              0
rating                 278
reviews                278
size                  1631
variation_type        1444
variation_value       1598
variation_desc        7244
ingredients            945
price_usd                0
value_price_usd       8043
sale_price_usd        8224
limited_edition          0
new                      0
online_only              0
out_of_stock             0
sephora_exclusive        0
highlights            2207
primary_category         0
secondary_category       8
tertiary_category      990
child_count              0
child_max_price       5740
child_min_price       5740
dtype: int64

In [43]:
# Core item-level columns.
core_columns_item = [
    'product_id', 'brand_id', 'price_usd',
    'limited_edition', 'new', 'online_only', 'out_of_stock', 'sephora_exclusive',
    'primary_category', 'child_count', 'variation_type',
]

# Imputation candidates (left disabled in this cell):
# columns_to_impute = ['rating', 'reviews', 'child_max_price', 'child_min_price']

# Columns to remove from the item table.
columns_to_drop_item = [
    'product_name', 'brand_name',
    'value_price_usd', 'sale_price_usd',
    'size', 'variation_value', 'variation_desc',
    'tertiary_category',
]

# To use later: variation_value, variation_desc, size.
# To use later (important): ingredients, highlights.

In [67]:
# Work on an independent (deep) copy so pro_inf itself is left untouched.
df = pro_inf.copy(deep=True)

In [68]:
# Core item-level columns.
core_columns_item = [
    'product_id',
    'brand_id',
    'price_usd',
    'limited_edition',
    'new',
    'online_only',
    'out_of_stock',
    'sephora_exclusive',
    'primary_category',
    'child_count',
    'variation_type',
]

# Numeric columns whose missing values are filled in by the imputation cell.
columns_to_impute = [
    'rating',
    'reviews',
    'child_max_price',
    'child_min_price',
]

# Columns removed from the item table before feature construction.
columns_to_drop_item = [
    'product_name',
    'brand_name',
    'value_price_usd',
    'sale_price_usd',
    'size',
    'variation_value',
    'variation_desc',
    'tertiary_category',
]

In [69]:
# New frame without the columns listed in columns_to_drop_item; df is not modified.
df_cleaned = df.drop(columns_to_drop_item, axis='columns')

In [70]:
# Impute the numeric columns listed in columns_to_impute.
#   * 'reviews'                                      -> missing values become 0
#   * 'rating', 'child_max_price', 'child_min_price' -> missing values become the column median
# Each column is coerced to numeric first (unparseable entries become NaN), and the
# median is taken from the *coerced* values, so a column that arrives as strings or
# contains stray text is imputed consistently rather than raising or using a median
# computed on the raw data.
for col in columns_to_impute:
    if col not in df_cleaned.columns:
        continue

    numeric_col = pd.to_numeric(df_cleaned[col], errors='coerce')

    if col == 'reviews':
        fill_value = 0
    elif col in ('rating', 'child_max_price', 'child_min_price'):
        fill_value = numeric_col.median()
    else:
        # No imputation rule for this column: leave it unchanged.
        continue

    df_cleaned[col] = numeric_col.fillna(fill_value)

In [71]:
# brand_id is intended for later embedding use, so it is stored as an integer;
# any missing brand is replaced by the sentinel -1 before the cast.
df_cleaned['brand_id'] = df_cleaned['brand_id'].fillna(value=-1).astype(int)

In [72]:
# Replace missing values in the categorical columns with the label "Unknown".
for cat_col in ('primary_category', 'variation_type'):
    df_cleaned[cat_col] = df_cleaned[cat_col].fillna("Unknown")

In [73]:
# One-hot encode the two low-cardinality categoricals; the resulting columns are
# prefixed 'var_type_' (variation_type) and 'pc_' (primary_category).
df_encoded = pd.get_dummies(
    df_cleaned,
    columns=['variation_type', 'primary_category'],
    prefix={'variation_type': 'var_type', 'primary_category': 'pc'},
)

In [74]:
# Keep brand_id apart as a single-column 2-D array (one row per product) for embedding.
brand_id_for_embedding = df_encoded['brand_id'].to_numpy().reshape(-1, 1)

# Remove it from the feature frame so it is not duplicated there.
df_encoded = df_encoded.drop(columns='brand_id')

In [75]:
# product_id is kept only as a row-aligned reference; it is not a training feature,
# so it is removed from the feature frame after being saved.
product_ids = df_encoded['product_id'].values
df_encoded = df_encoded.drop(columns='product_id')

In [76]:
# Final item feature matrix.
# Drop the remaining columns that have not been encoded into numeric features.
df_encoded = df_encoded.drop(columns=['ingredients', 'highlights', 'secondary_category'])

# Cast explicitly to float64: on pandas >= 2.0 get_dummies produces bool columns, and
# a frame mixing bool with int/float columns would otherwise convert to an
# object-dtype array. The cast also fails loudly if a non-numeric column slipped through.
item_train_matrix = df_encoded.to_numpy(dtype='float64')

# Shapes of the feature matrix, the brand-id column and the product-id vector.
item_train_matrix.shape, brand_id_for_embedding.shape, product_ids.shape

((8494, 29), (8494, 1), (8494,))

In [79]:
# Show the saved product IDs (the array repr is abbreviated for long arrays).
product_ids

array(['P473671', 'P473668', 'P473662', ..., 'P504428', 'P504448',
       'P505461'], dtype=object)