In [7]:
# ===== Standard Libraries =====
import re
import warnings
from collections import defaultdict

# ===== Scientific Computing =====
import numpy as np
import pandas as pd

# ===== TensorFlow / Keras =====
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.layers import Input, Dense, Concatenate, Flatten, Dot, Lambda
from tqdm.keras import TqdmCallback

# ===== Keras Tuner =====
import keras_tuner as kt

# ===== Scikit-learn =====
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import (
    mean_squared_error,
    mean_absolute_error,
    precision_score,
    recall_score,
    f1_score,
    accuracy_score,
    average_precision_score,
    ndcg_score
)
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

# ===== Pandas Display & Warnings =====
from pandas.errors import DtypeWarning
pd.set_option('display.max_columns', None)
warnings.filterwarnings('ignore', category=DtypeWarning)

In [8]:
# ========================
# SECTION 1: DATA LOADING
# ========================
def load_data():
    """Load and combine datasets with validation checks"""
    # Product data
    products = pd.read_csv('../sephora_dataset/product_info.csv')
    
    # Review data
    review_files = [
        '../sephora_dataset/reviews_0-250.csv',
        '../sephora_dataset/reviews_250-500.csv',
        '../sephora_dataset/reviews_500-750.csv',
        '../sephora_dataset/reviews_750-1250.csv',
        '../sephora_dataset/reviews_1250-end.csv'
    ]
    reviews = pd.concat([pd.read_csv(f) for f in review_files], ignore_index=True)
    
    # Data validation
    assert len(products) == 8494, "Product data mismatch"
    assert len(reviews) == 1094411, "Review data mismatch"
    
    return products, reviews

products, reviews = load_data()


In [11]:
# ========================
# SECTION 2: DATA INSPECTION
# ========================
def inspect_data(df, name):
    """Generate data quality report"""
    print(f"=== {name} Data Quality Report ===")
    print(f"Total rows: {df.shape[0]:,}")
    print(f"Missing values:")
    display(df.isna().sum().sort_values(ascending=False))
    print("\nColumn dtypes:")
    display(df.dtypes)

inspect_data(products, "Product")
inspect_data(reviews, "Review")

=== Product Data Quality Report ===
Total rows: 8,494
Missing values:


sale_price_usd        8224
value_price_usd       8043
variation_desc        7244
child_max_price       5740
child_min_price       5740
highlights            2207
size                  1631
variation_value       1598
variation_type        1444
tertiary_category      990
ingredients            945
rating                 278
reviews                278
secondary_category       8
loves_count              0
product_id               0
brand_name               0
product_name             0
brand_id                 0
out_of_stock             0
online_only              0
new                      0
limited_edition          0
price_usd                0
sephora_exclusive        0
primary_category         0
child_count              0
dtype: int64


Column dtypes:


product_id             object
product_name           object
brand_id                int64
brand_name             object
loves_count             int64
rating                float64
reviews               float64
size                   object
variation_type         object
variation_value        object
variation_desc         object
ingredients            object
price_usd             float64
value_price_usd       float64
sale_price_usd        float64
limited_edition         int64
new                     int64
online_only             int64
out_of_stock            int64
sephora_exclusive       int64
highlights             object
primary_category       object
secondary_category     object
tertiary_category      object
child_count             int64
child_max_price       float64
child_min_price       float64
dtype: object

=== Review Data Quality Report ===
Total rows: 1,094,411
Missing values:


helpfulness                 561592
review_title                310654
hair_color                  226768
eye_color                   209628
skin_tone                   170539
is_recommended              167988
skin_type                   111557
review_text                   1444
rating                           0
Unnamed: 0                       0
author_id                        0
total_pos_feedback_count         0
submission_time                  0
total_feedback_count             0
total_neg_feedback_count         0
product_id                       0
product_name                     0
brand_name                       0
price_usd                        0
dtype: int64


Column dtypes:


Unnamed: 0                    int64
author_id                    object
rating                        int64
is_recommended              float64
helpfulness                 float64
total_feedback_count          int64
total_neg_feedback_count      int64
total_pos_feedback_count      int64
submission_time              object
review_text                  object
review_title                 object
skin_tone                    object
eye_color                    object
skin_type                    object
hair_color                   object
product_id                   object
product_name                 object
brand_name                   object
price_usd                   float64
dtype: object

### Focus on product features for cold-start

In [14]:
# ========================
# SECTION 3: PRODUCT PREPROCESSING
# ========================
def preprocess_products(products):
    """Clean and transform product data"""
    # Feature selection (Lops et al., 2011 - Content-based features)
    keep_cols = [
        'product_id', 'brand_name', 'primary_category',
        'ingredients', 'price_usd', 'highlights'
    ]
    df = products[keep_cols].copy()
    
    # Handle missing values (Schein et al., 2002)
    df['ingredients'] = df['ingredients'].fillna('unknown')
    df['highlights'] = df['highlights'].fillna('[]')
    
    # Clean ingredients text
    def clean_ingredients(text):
        try:
            return re.sub(r'Product variation \d+:', '', str(text))
        except:
            return 'unknown'
    
    df['ingredients'] = df['ingredients'].apply(clean_ingredients)
    
    # Validate
    assert df['ingredients'].isna().sum() == 0, "Missing ingredients remaining"
    print("Product preprocessing complete")
    return df

product_clean = preprocess_products(products)
display(product_clean.head(2))

Product preprocessing complete


Unnamed: 0,product_id,brand_name,primary_category,ingredients,price_usd,highlights
0,P473671,19-69,Fragrance,"['Capri Eau de Parfum:', 'Alcohol Denat. (SD A...",35.0,"['Unisex/ Genderless Scent', 'Warm &Spicy Scen..."
1,P473668,19-69,Fragrance,"['Alcohol Denat. (SD Alcohol 39C), Parfum (Fra...",195.0,"['Unisex/ Genderless Scent', 'Layerable Scent'..."


### Focus on user features for cold-start

In [16]:
# ========================
# SECTION 4: REVIEW PREPROCESSING
# ========================
def preprocess_reviews(reviews):
    """Clean and transform review data"""
    # Feature selection (Park et al., 2013 - Demographic features)
    keep_cols = [
        'author_id', 'product_id', 'rating', 
        'skin_type', 'skin_tone'
    ]
    df = reviews[keep_cols].copy()
    
    # Handle missing demographics (Micci-Barreca, 2001)
    for col in ['skin_type', 'skin_tone']:
        df[col] = df[col].fillna('unknown')
    
    # Validate
    assert df['skin_type'].isna().sum() == 0, "Missing skin types"
    print("Review preprocessing complete")
    return df

review_clean = preprocess_reviews(reviews)
display(review_clean.head(2))

Review preprocessing complete


Unnamed: 0,author_id,product_id,rating,skin_type,skin_tone
0,1741593524,P504322,5,dry,unknown
1,31423088263,P420652,1,unknown,unknown


### Create content-based features

In [26]:
# ========================
# SECTION 5: FEATURE ENGINEERING
# ========================
from sklearn.decomposition import TruncatedSVD

def create_features(product_df):
    """Process ingredients without list parsing"""
    # 1. Clean ingredients directly from strings
    product_df['clean_ingredients'] = (
        product_df['ingredients']
        .fillna('')
        .str.lower()
        .str.replace(r'[^\w\s,]', '', regex=True)
        .str.split(',')
        .apply(lambda x: [i.strip() for i in x if i.strip() != ''])
        .apply(lambda x: ' '.join(x))
    )
    
    # 2. TF-IDF with basic settings
    tfidf = TfidfVectorizer(
        max_features=500,
        stop_words='english',
        ngram_range=(1, 2))
    X_tfidf = tfidf.fit_transform(product_df['clean_ingredients'])
    
    # 3. Dimensionality reduction
    svd = TruncatedSVD(n_components=50)
    X_svd = svd.fit_transform(X_tfidf)
    
    # 4. Create feature DataFrame
    ingredient_df = pd.DataFrame(
        X_svd,
        columns=[f'ing_{i}' for i in range(50)],
        index=product_df.index
    )
    
    final_df = pd.concat([product_df, ingredient_df], axis=1)
    
    # Validation
    print("\nFeature engineering successful!")
    print("Sample processed ingredients:")
    print(product_df['clean_ingredients'].iloc[0][:100] + "...")
    print("\nFinal feature columns:")
    print(final_df.iloc[:2, -5:].to_string())  # Changed to to_string()
    return final_df

product_features = create_features(product_clean)


Feature engineering successful!
Sample processed ingredients:
capri eau de parfum alcohol denat sd alcohol 39c parfum fragrance dlimonene linalool benzyl salicyla...

Final feature columns:
     ing_45    ing_46    ing_47    ing_48    ing_49
0 -0.003200 -0.046183  0.010064 -0.042232 -0.002094
1  0.001122 -0.052677  0.009895 -0.097179 -0.038063


#### Result: Rare but significant ingredients (e.g., "Niacinamide") get higher weights, while common ones (e.g., "Water") are downweighted.

In [28]:
# ========================
# SECTION 6: COLD-START SPLIT
# ========================
def create_coldstart_split(review_df, product_df, cold_threshold=5):
    """Split data into warm/cold-start products (Son, 2016)"""
    # Calculate product interactions
    product_counts = review_df['product_id'].value_counts()
    
    # Identify cold-start products
    cold_products = product_counts[product_counts <= cold_threshold].index
    
    # Create test set
    test_mask = review_df['product_id'].isin(cold_products)
    test_data = review_df[test_mask]
    train_data = review_df[~test_mask]
    
    # Validation
    print(f"\nCold-start split ({cold_threshold} reviews threshold):")
    print(f"- Training products: {len(product_counts[product_counts > cold_threshold]):,}")
    print(f"- Cold-start products: {len(cold_products):,}")
    print(f"- Training reviews: {len(train_data):,}")
    print(f"- Test reviews: {len(test_data):,}")
    
    return train_data, test_data

train_data, test_data = create_coldstart_split(review_clean, product_features)


Cold-start split (5 reviews threshold):
- Training products: 2,207
- Cold-start products: 144
- Training reviews: 1,093,993
- Test reviews: 418


In [32]:
# ========================
# SECTION 7: MODEL PREPARATION
# ========================
def prepare_model_inputs(product_features, review_data):
    """Merge features with review data (Rendle, 2010)"""
    # Merge product features
    merged = review_data.merge(
        product_features[
            ['product_id'] + 
            [f'ing_{i}' for i in range(50)] + 
            ['price_usd']
        ],
        on='product_id',
        how='left'
    )
    
    # Final features
    feature_cols = [f'ing_{i}' for i in range(50)] + ['price_usd']
    X = merged[feature_cols]
    y = merged['rating']
    
    # Validation
    print("\nModel inputs prepared:")
    print(f"- Samples: {len(X):,}")
    print(f"- Features: {len(feature_cols)}")
    print(f"  - Ingredient embeddings: 50")
    print(f"  - Price: 1")
    print(f"- Target distribution:")
    print(y.describe().to_string())
    
    return X, y

X_train, y_train = prepare_model_inputs(product_features, train_data)
X_test, y_test = prepare_model_inputs(product_features, test_data)


Model inputs prepared:
- Samples: 1,093,993
- Features: 51
  - Ingredient embeddings: 50
  - Price: 1
- Target distribution:
count    1.093993e+06
mean     4.299286e+00
std      1.149292e+00
min      1.000000e+00
25%      4.000000e+00
50%      5.000000e+00
75%      5.000000e+00
max      5.000000e+00

Model inputs prepared:
- Samples: 418
- Features: 51
  - Ingredient embeddings: 50
  - Price: 1
- Target distribution:
count    418.000000
mean       3.964115
std        1.458015
min        1.000000
25%        3.000000
50%        5.000000
75%        5.000000
max        5.000000


In [41]:
# ========================
# SECTION 8: BASELINE MODEL
# ========================
def build_baseline_model(input_shape):
    """Simple neural network baseline (He et al., 2017)"""
    model = tf.keras.Sequential([
        tf.keras.layers.Dense(64, activation='relu', input_shape=(input_shape,)),
        tf.keras.layers.Dense(32, activation='relu'),
        tf.keras.layers.Dense(1)
    ])
    
    model.compile(
        optimizer=tf.keras.optimizers.Adam(0.001),
        loss='mse',
        metrics=['mae']
    )
    
    print("\nBaseline model architecture:")
    model.summary()
    return model

model = build_baseline_model(X_train.shape[1])


Baseline model architecture:


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [42]:
# ========================
# SECTION 9: MODEL TRAINING
# ========================
early_stopping = tf.keras.callbacks.EarlyStopping(
    patience=5,
    restore_best_weights=True,
    monitor='val_loss'
)

print("\nTraining progress:")
history = model.fit(
    X_train, y_train,
    validation_split=0.2,
    epochs=30,
    batch_size=256,
    callbacks=[early_stopping],
    verbose=1
)


Training progress:
Epoch 1/30
[1m3419/3419[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 2ms/step - loss: 2.3732 - mae: 1.1450 - val_loss: 1.3520 - val_mae: 0.9099
Epoch 2/30
[1m3419/3419[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 2ms/step - loss: 1.3345 - mae: 0.8907 - val_loss: 1.3493 - val_mae: 0.8872
Epoch 3/30
[1m3419/3419[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 2ms/step - loss: 1.3179 - mae: 0.8854 - val_loss: 1.3856 - val_mae: 0.8529
Epoch 4/30
[1m3419/3419[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 2ms/step - loss: 1.3159 - mae: 0.8835 - val_loss: 1.5551 - val_mae: 1.0446
Epoch 5/30
[1m3419/3419[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 2ms/step - loss: 1.3022 - mae: 0.8793 - val_loss: 1.3662 - val_mae: 0.9288
Epoch 6/30
[1m3419/3419[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 2ms/step - loss: 1.2958 - mae: 0.8765 - val_loss: 1.3546 - val_mae: 0.8875
Epoch 7/30
[1m3419/3419[0m [32m━━━━━━━━━━━━━━━━━━━━[

In [43]:
# ========================
# SECTION 10: COLD-START EVALUATION
# ========================
def evaluate_model(model, X_test, y_test):
    """Comprehensive cold-start evaluation (Cremonesi et al., 2010)"""
    # Predict ratings
    y_pred = model.predict(X_test).flatten()
    
    # Calculate metrics
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    mae = mean_absolute_error(y_test, y_pred)
    
    # Calculate precision/recall for recommendations
    y_pred_binary = (y_pred >= 4).astype(int)  # Threshold for "good" rating
    y_true_binary = (y_test >= 4).astype(int)
    
    precision = precision_score(y_true_binary, y_pred_binary, zero_division=0)
    recall = recall_score(y_true_binary, y_pred_binary, zero_division=0)
    
    print("\nCold-start Evaluation:")
    print(f"- RMSE: {rmse:.3f}")
    print(f"- MAE: {mae:.3f}")
    print(f"- Precision@4: {precision:.3f}")
    print(f"- Recall@4: {recall:.3f}")
    print(f"- Coverage: {len(np.unique(X_test.index))} cold-start products")

evaluate_model(model, X_test, y_test)

[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step 

Cold-start Evaluation:
- RMSE: 1.523
- MAE: 1.132
- Precision@4: 0.706
- Recall@4: 0.983
- Coverage: 418 cold-start products
