In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.layers import Input, Dense, Concatenate, Flatten, Dot, Lambda
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from collections import defaultdict
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, average_precision_score, precision_score, ndcg_score
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD


import warnings
from pandas.errors import DtypeWarning
pd.set_option('display.max_columns', None)
warnings.filterwarnings('ignore', category=DtypeWarning)

	•	reviews_0-250.csv: Contains reviews for products indexed from 0 to 250.  
	•	reviews_250-500.csv: Contains reviews for products indexed from 250 to 500.  
	•	reviews_500-750.csv: Contains reviews for products indexed from 500 to 750.  
	•	reviews_750-1250.csv: Contains reviews for products indexed from 750 to 1250.  
	•	reviews_1250-end.csv: Contains reviews for products indexed from 1250 to the last product in the dataset.  

This segmentation is likely implemented to manage file sizes and facilitate easier data handling. Each file encompasses all reviews associated with the products within the specified index range.

In [2]:
pro_inf = pd.read_csv('../sephora_datasets/product_info.csv')
pro_inf_original = pro_inf.copy()
rew_1 = pd.read_csv('../sephora_datasets/reviews_0-250.csv')
rew_2 = pd.read_csv('../sephora_datasets/reviews_250-500.csv')
rew_3 = pd.read_csv('../sephora_datasets/reviews_500-750.csv')
rew_4 = pd.read_csv('../sephora_datasets/reviews_750-1250.csv')
rew_5 = pd.read_csv('../sephora_datasets/reviews_1250-end.csv')

files = [
    "reviews_0-250.csv",
    "reviews_250-500.csv",
    "reviews_500-750.csv",
    "reviews_750-1250.csv",
    "reviews_1250-end.csv"
]

df_reviews = pd.concat([rew_1, rew_2, rew_3, rew_4, rew_5], ignore_index=True, axis=0)
print("Combined shape:", df_reviews.shape)

Combined shape: (1094411, 19)


In [3]:
df_reviews_final = df_reviews[['product_id', 'author_id', 'rating']].copy()

In [4]:
df_reviews_final[['product_id', 'author_id']].nunique()

product_id      2351
author_id     578653
dtype: int64

In [5]:
display(pro_inf.nunique())
print('************************** missing values:')
display(pro_inf.isnull().sum())

product_id            8494
product_name          8415
brand_id               304
brand_name             304
loves_count           7436
rating                4394
reviews               1556
size                  2055
variation_type           7
variation_value       2729
variation_desc         935
ingredients           6538
price_usd              298
value_price_usd        174
sale_price_usd          88
limited_edition          2
new                      2
online_only              2
out_of_stock             2
sephora_exclusive        2
highlights            4417
primary_category         9
secondary_category      41
tertiary_category      118
child_count             55
child_max_price        222
child_min_price        208
dtype: int64

************************** missing values:


product_id               0
product_name             0
brand_id                 0
brand_name               0
loves_count              0
rating                 278
reviews                278
size                  1631
variation_type        1444
variation_value       1598
variation_desc        7244
ingredients            945
price_usd                0
value_price_usd       8043
sale_price_usd        8224
limited_edition          0
new                      0
online_only              0
out_of_stock             0
sephora_exclusive        0
highlights            2207
primary_category         0
secondary_category       8
tertiary_category      990
child_count              0
child_max_price       5740
child_min_price       5740
dtype: int64

#### use later: variation_value, variation_desc, size.
#### use later (important): ingredients, highlights

## Item Matrix

In [6]:
df = pro_inf.copy()

In [7]:
core_columns_item = [
    'product_id', 'brand_id', 'price_usd', 'limited_edition', 'new',
    'online_only', 'out_of_stock', 'sephora_exclusive', 'primary_category',
    'child_count', 'variation_type'
]
columns_to_impute = ['rating', 'reviews', 'child_max_price', 'child_min_price']
columns_to_drop_item = [
    'product_name', 'brand_name', 'value_price_usd', 'sale_price_usd',
    'variation_value', 'variation_desc', 'tertiary_category', 'secondary_category'
]

In [8]:
df_cleaned = df.drop(columns=columns_to_drop_item)

In [9]:
brand_freq = df_cleaned['brand_id'].value_counts()
df_cleaned['brand_freq'] = df_cleaned['brand_id'].map(brand_freq)

# Log-scaling
# Helps reduce dominance of very high-frequency brands
df_cleaned['brand_freq_log'] = np.log1p(df_cleaned['brand_freq'])  # log(1 + x) to avoid log(0)

# Normalize (MinMax scaling to [0, 1])
scaler = MinMaxScaler()
df_cleaned['brand_freq_scaled'] = scaler.fit_transform(df_cleaned[['brand_freq_log']])

brand_freq_scaled_mapping = (
    df_cleaned[['brand_id', 'brand_freq_scaled']]
    .drop_duplicates(subset='brand_id')
    .set_index('brand_id')
    .squeeze()  # turns DataFrame → Series
    .to_dict()
)

In [10]:
for col in columns_to_impute:
    if col in df_cleaned.columns:
        if col in ['rating', 'child_max_price', 'child_min_price']:
            df_cleaned[col] = pd.to_numeric(df_cleaned[col], errors='coerce').fillna(df_cleaned[col].median())
        elif col == 'reviews':
            df_cleaned[col] = pd.to_numeric(df_cleaned[col], errors='coerce').fillna(0)

# Fill missing categorical with "Unknown"
df_cleaned['primary_category'] = df_cleaned['primary_category'].fillna("Unknown")
#df_cleaned['secondary_category'] = df_cleaned['secondary_category'].fillna("Unknown")
df_cleaned['variation_type'] = df_cleaned['variation_type'].fillna("Unknown")

# One-hot encode variation_type and primary_category (few unique values)
df_encoded = pd.get_dummies(df_cleaned, columns=['variation_type', 'primary_category'], prefix=['var_type', 'pc'])

#### handle size column:

In [11]:
def normalize_size(text):
    """
    Convert the 'size' field to millilitres (mL), with oz converted to mL.
    Solids in g are left as-is. If parsing fails, returns np.nan.
    """
    if pd.isna(text):
        return np.nan
    
    text = str(text).strip().lower()
    
    # Match the first numeric value and a valid unit (ml, mL, g, oz), with optional spaces or separators
    # Handles: "1 oz / 30 ml", "1oz/30ml", ".5 oz / 15 mL", "2 x 0.5oz/15ml"
    m = re.search(r'(?<!\w)([\d.]+)\s*(oz|fl oz|ml|g|mL|grams?)(?!\w)', text)

    if not m:
        return np.nan

    val = m.group(1)
    unit = m.group(2)

    try:
        val = float(val)
    except ValueError:
        return np.nan

    unit = unit.lower()
    if unit in ('ml', 'mL'):
        return val
    elif unit in ('oz', 'fl oz'):
        return val * 29.5735  # Assume oz means fluid oz
    elif unit in ('g', 'gram', 'grams'):
        return val  # Leave solids in g
    else:
        return np.nan

In [12]:
# Apply to your DataFrame
df_encoded['size_ml'] = df_encoded['size'].apply(normalize_size)
df_encoded['size_ml_log'] = np.log1p(df_encoded['size_ml'])

df_encoded['size_missing'] = df_encoded['size'].isna().astype(int)
median_size_log = df_encoded['size_ml_log'].median()
df_encoded['size_ml_log'] = df_encoded['size_ml_log'].fillna(median_size_log).copy()
df_encoded = df_encoded.drop(['size_ml', 'size'], axis=1)

#### Handle Highlights and Ingredients

In [13]:
"""
import ast
from sklearn.feature_extraction.text import CountVectorizer

# Remove the “Product variation n:” labels and brackets
def clean_ing(row):
    if pd.isna(row):
        return ''
    # Turn the outer quotes into a python list then join
    try:
        lst = ast.literal_eval(row)
        # Some rows are nested lists, others are plain strings
        if isinstance(lst, list):
            row = ' '.join(lst)
    except Exception:
        pass
    # drop the "Product variation ..." chunks
    row = re.sub(r'Product variation \d+:', '', row, flags=re.I)
    return row.lower()

pro_inf['ingredients_clean'] = pro_inf['ingredients'].apply(clean_ing)

# Bag‑of‑words, keep top 500 ingredients
ing_vec = CountVectorizer(max_features=500, token_pattern=r'[A-Za-z]+').fit_transform(
    pro_inf['ingredients_clean']
)
ing_df = pd.DataFrame(ing_vec.toarray(),
                      columns=[f'ing_{w}' for w in CountVectorizer(max_features=500,
                                                                   token_pattern=r'[A-Za-z]+')
                               .fit(pro_inf['ingredients_clean'])
                               .get_feature_names_out()])

df_encoded = pd.concat([df_encoded, ing_df], axis=1)
"""
print()




In [14]:
"""
import ast

def parse_highlights(text):
    if pd.isna(text):
        return []
    try:
        return [tag.strip().lower() for tag in ast.literal_eval(text)]
    except Exception:
        return []

pro_inf['highlights_list'] = pro_inf['highlights'].apply(parse_highlights)

# build vocabulary
from itertools import chain
all_tags = sorted({tag for tags in pro_inf['highlights_list'] for tag in tags})
tag2col = {tag: idx for idx, tag in enumerate(all_tags)}

# multi‑hot matrix
high_mat = np.zeros((len(pro_inf), len(all_tags)), dtype=np.uint8)
for i, tags in enumerate(pro_inf['highlights_list']):
    high_mat[i, [tag2col[t] for t in tags]] = 1

high_df = pd.DataFrame(high_mat, columns=[f'high_{t}' for t in all_tags])
df_encoded = pd.concat([df_encoded, high_df], axis=1)
"""
print()




In [15]:
# Final item feature matrix
df_encoded = df_encoded.astype({col: 'int' for col in df_encoded.columns if df_encoded[col].dtype == 'bool'})
df_encoded = df_encoded.drop(['brand_id', 'brand_freq', 'brand_freq_log'], axis=1)

In [16]:
item_features_ordered = df_reviews_final[['product_id']].merge(df_encoded, on="product_id", how="left")
item_features_ordered = item_features_ordered.drop(['highlights', 'ingredients'], axis=1)

## User Matrix

In [17]:
print('************************ nunique_values:')
display(df_reviews.nunique())
print('************************ missing values:')
display(df_reviews.isnull().sum())
print('************************ shape:')
print(df_reviews.shape)

************************ nunique_values:


Unnamed: 0                  602130
author_id                   578653
rating                           5
is_recommended                   2
helpfulness                   3767
total_feedback_count           676
total_neg_feedback_count       259
total_pos_feedback_count       590
submission_time               5317
review_text                 969419
review_title                364105
skin_tone                       14
eye_color                        6
skin_type                        4
hair_color                       7
product_id                    2351
product_name                  2334
brand_name                     142
price_usd                      221
dtype: int64

************************ missing values:


Unnamed: 0                       0
author_id                        0
rating                           0
is_recommended              167988
helpfulness                 561592
total_feedback_count             0
total_neg_feedback_count         0
total_pos_feedback_count         0
submission_time                  0
review_text                   1444
review_title                310654
skin_tone                   170539
eye_color                   209628
skin_type                   111557
hair_color                  226768
product_id                       0
product_name                     0
brand_name                       0
price_usd                        0
dtype: int64

************************ shape:
(1094411, 19)


![image.png](attachment:3f3fde0c-c6d7-4495-843f-3f591e0aaf8a.png)

In [18]:
print('sentiment of the text')
"""
from textblob import TextBlob
# Fill missing texts just in case
df_reviews['review_text'] = df_reviews['review_text'].fillna("")

# Vectorized sentiment calculation
# Avoid lambda + apply and instead use a list comprehension
sentiments = [TextBlob(text).sentiment.polarity for text in df_reviews['review_text']]
df_reviews['sentiment'] = sentiments

# Group by user and take the mean sentiment
user_sentiment = df_reviews.groupby('user_id', sort=False)['sentiment'].mean().reset_index()
"""
print()

sentiment of the text



In [19]:
print('create sentence embeddings using sentence transformer')
"""
from sentence_transformers import SentenceTransformer

# Load the model
model = SentenceTransformer('all-MiniLM-L6-v2')

# Fill missing texts
df_reviews['review_text'] = df_reviews['review_text'].fillna("")

# Encode ALL reviews at once (much faster)
review_embeddings = model.encode(df_reviews['review_text'].tolist(), show_progress_bar=True)

# Add embeddings back to df_reviews
df_reviews['review_embedding'] = list(review_embeddings)

# Group by user_id and take mean of embeddings per user
user_text_embeddings = df_reviews.groupby('user_id')['review_embedding'].apply(
    lambda vecs: np.mean(vecs.tolist(), axis=0)
)

# Convert to dataframe for merging
user_text_embeddings_df = pd.DataFrame(user_text_embeddings.tolist(), index=user_text_embeddings.index)
user_text_embeddings_df.columns = [f'user_text_emb_{i}' for i in range(user_text_embeddings_df.shape[1])]
user_text_embeddings_df = user_text_embeddings_df.reset_index()
"""
print()

create sentence embeddings using sentence transformer



In [20]:
mask_no_feedback = (df_reviews['total_feedback_count'] == 0)
df_reviews.loc[mask_no_feedback, 'helpfulness'] = 0.0

for col in ['skin_tone', 'eye_color', 'skin_type', 'hair_color']:
    df_reviews[col] = df_reviews[col].fillna("Unknown")

user_agg = df_reviews.groupby("author_id").agg({
    "rating": ["mean", "count"],
    "is_recommended": "mean",
    "helpfulness": "mean",
    "skin_tone": "first",
    "skin_type": "first",
    "eye_color": "first",
    "hair_color": "first"
})
user_agg.columns = ['rating_avg', 'rating_count', 'recommend_ratio', 'helpfulness_avg',
                    'skin_tone', 'skin_type', 'eye_color', 'hair_color']
user_agg = user_agg.reset_index()

user_features = pd.get_dummies(user_agg, 
    columns=['skin_tone', 'skin_type', 'eye_color', 'hair_color'],
    prefix=['tone', 'type', 'eye', 'hair'])

global_recommend_avg = df_reviews["is_recommended"].mean()
user_features["recommend_missing"] = user_features["recommend_ratio"].isna().astype(int)
user_features["recommend_ratio"] = user_features["recommend_ratio"].fillna(global_recommend_avg)

user_features = user_features.astype({col: 'int' for col in user_features.columns if user_features[col].dtype == 'bool'})

#### Sentiment Score for review_text

In [21]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

analyzer = SentimentIntensityAnalyzer()

df_reviews['sentiment'] = df_reviews['review_text'].fillna("").apply(
    lambda x: analyzer.polarity_scores(x)['compound']
)

user_sentiment = df_reviews.groupby('author_id', sort=False)['sentiment'].mean().reset_index()

In [22]:
user_features = pd.merge(user_features, user_sentiment, on='author_id', how='left')

#### TF-IDF with Truncated Singular Value Decomposition (SVD-k)

In [23]:
tfidf = TfidfVectorizer(max_features=5000, min_df=5, max_df=0.8)
X_tfidf = tfidf.fit_transform(df_reviews['review_text'].fillna(""))

In [24]:
svd = TruncatedSVD(n_components=128)
X_svd = svd.fit_transform(X_tfidf)

review_embeddings = pd.DataFrame(X_svd)

In [25]:
review_embeddings['author_id'] = df_reviews['author_id'].astype(str).values

user_text_embeddings_df = review_embeddings.groupby('author_id').mean().reset_index()
user_text_embeddings_df.columns = ['author_id'] + [f'user_text_emb_{i}' for i in range(X_svd.shape[1])]

In [26]:
user_features = user_features.merge(user_text_embeddings_df, on='author_id', how='left')

user_text_cols = [col for col in user_features.columns if col.startswith('user_text_emb_')]
user_features['text_emb_missing'] = user_features[user_text_cols[0]].isna().astype(int)
user_features[user_text_cols] = user_features[user_text_cols].fillna(user_features[user_text_cols].mean())

#user_text_embeddings_df['author_id'].nunique()
#user_features['author_id'].nunique() #some users dont have text, so that we have to fill these missing values after merging.

In [27]:
user_features_ordered = df_reviews_final[['author_id']].merge(user_features, on="author_id", how="left")

### Finalized User-Item DataFrames

In [28]:
user_features_ordered.head()

Unnamed: 0,author_id,rating_avg,rating_count,recommend_ratio,helpfulness_avg,tone_Unknown,tone_dark,tone_deep,tone_ebony,tone_fair,tone_fairLight,tone_light,tone_lightMedium,tone_medium,tone_mediumTan,tone_notSureST,tone_olive,tone_porcelain,tone_rich,tone_tan,type_Unknown,type_combination,type_dry,type_normal,type_oily,eye_Grey,eye_Unknown,eye_blue,eye_brown,eye_gray,eye_green,eye_hazel,hair_Unknown,hair_auburn,hair_black,hair_blonde,hair_brown,hair_brunette,hair_gray,hair_red,recommend_missing,sentiment,user_text_emb_0,user_text_emb_1,user_text_emb_2,user_text_emb_3,user_text_emb_4,user_text_emb_5,user_text_emb_6,user_text_emb_7,user_text_emb_8,user_text_emb_9,user_text_emb_10,user_text_emb_11,user_text_emb_12,user_text_emb_13,user_text_emb_14,user_text_emb_15,user_text_emb_16,user_text_emb_17,user_text_emb_18,user_text_emb_19,user_text_emb_20,user_text_emb_21,user_text_emb_22,user_text_emb_23,user_text_emb_24,user_text_emb_25,user_text_emb_26,user_text_emb_27,user_text_emb_28,user_text_emb_29,user_text_emb_30,user_text_emb_31,user_text_emb_32,user_text_emb_33,user_text_emb_34,user_text_emb_35,user_text_emb_36,user_text_emb_37,user_text_emb_38,user_text_emb_39,user_text_emb_40,user_text_emb_41,user_text_emb_42,user_text_emb_43,user_text_emb_44,user_text_emb_45,user_text_emb_46,user_text_emb_47,user_text_emb_48,user_text_emb_49,user_text_emb_50,user_text_emb_51,user_text_emb_52,user_text_emb_53,user_text_emb_54,user_text_emb_55,user_text_emb_56,user_text_emb_57,user_text_emb_58,user_text_emb_59,user_text_emb_60,user_text_emb_61,user_text_emb_62,user_text_emb_63,user_text_emb_64,user_text_emb_65,user_text_emb_66,user_text_emb_67,user_text_emb_68,user_text_emb_69,user_text_emb_70,user_text_emb_71,user_text_emb_72,user_text_emb_73,user_text_emb_74,user_text_emb_75,user_text_emb_76,user_text_emb_77,user_text_emb_78,user_text_emb_79,user_text_emb_80,user_text_emb_81,user_text_emb_82,user_text_emb_83,user_text_emb_84,user_text_emb_85,user_text_emb_86,user_text_emb_87,user_text_emb_88,user_text_emb_89,user_text_emb_90,user_text_emb_91,user_text_emb_92,user_text_emb_93,user_text_emb_94,user_text_emb_95,user_text_emb_96,user_text_emb_97,user_text_emb_98,user_text_emb_99,user_text_emb_100,user_text_emb_101,user_text_emb_102,user_text_emb_103,user_text_emb_104,user_text_emb_105,user_text_emb_106,user_text_emb_107,user_text_emb_108,user_text_emb_109,user_text_emb_110,user_text_emb_111,user_text_emb_112,user_text_emb_113,user_text_emb_114,user_text_emb_115,user_text_emb_116,user_text_emb_117,user_text_emb_118,user_text_emb_119,user_text_emb_120,user_text_emb_121,user_text_emb_122,user_text_emb_123,user_text_emb_124,user_text_emb_125,user_text_emb_126,user_text_emb_127,text_emb_missing
0,1741593524,5.0,1,1.0,1.0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0.948,0.253524,0.001548,-0.000663,-0.000726,-0.002937,0.001417,-0.001304,-0.004229,0.00339,0.00505,3.8e-05,-0.002473,-0.001584,-0.001849,-0.002891,0.003139,0.003223,0.001298,0.000541,0.000218,0.003435,-0.000867,-0.000148,-0.002019,0.000738,0.000895,2.2e-05,-0.001179,-0.002358,-0.000208,-0.002841,0.001232,-0.000255,0.000435,-0.001915,6e-05,0.001556,-0.00146,0.001695,-0.000546,0.000992,6.7e-05,-0.000765,0.000236,0.000168,8.8e-05,0.000573,-0.000503,0.000647,-0.000864,0.001377,0.001875,-0.000354,0.000904,0.000988,0.000625,-0.000721,0.000878,-0.000131,-0.00065,-3.6e-05,-3.7e-05,-0.000599,0.000423,0.000606,-0.000253,0.000116,-0.000752,0.000469,-0.000148,-0.000745,-0.00099,-0.000925,0.000174,-3.7e-05,-0.000223,-0.000196,-0.000695,0.00024,0.000509,-0.000262,0.000929,-0.000868,-0.000757,-0.000167,0.000577,-0.000323,-8e-06,0.000917,0.000415,-0.001132,-0.000101,-0.000846,-0.001071,0.000324,8.9e-05,-0.001411,-6.7e-05,-0.000816,0.000584,0.000474,-0.000602,-4.9e-05,-0.000532,0.000253,-0.000756,0.000286,-0.000726,0.000435,0.000134,-6e-06,-7.4e-05,-9.1e-05,0.000357,-0.00079,-0.000333,0.000224,0.000268,0.000116,-0.0001,9.3e-05,-0.000423,4.2e-05,-0.000205,-0.000651,0.00033,-2.4e-05,-0.000301,1
1,31423088263,1.0,1,0.0,0.0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0.128,0.253524,0.001548,-0.000663,-0.000726,-0.002937,0.001417,-0.001304,-0.004229,0.00339,0.00505,3.8e-05,-0.002473,-0.001584,-0.001849,-0.002891,0.003139,0.003223,0.001298,0.000541,0.000218,0.003435,-0.000867,-0.000148,-0.002019,0.000738,0.000895,2.2e-05,-0.001179,-0.002358,-0.000208,-0.002841,0.001232,-0.000255,0.000435,-0.001915,6e-05,0.001556,-0.00146,0.001695,-0.000546,0.000992,6.7e-05,-0.000765,0.000236,0.000168,8.8e-05,0.000573,-0.000503,0.000647,-0.000864,0.001377,0.001875,-0.000354,0.000904,0.000988,0.000625,-0.000721,0.000878,-0.000131,-0.00065,-3.6e-05,-3.7e-05,-0.000599,0.000423,0.000606,-0.000253,0.000116,-0.000752,0.000469,-0.000148,-0.000745,-0.00099,-0.000925,0.000174,-3.7e-05,-0.000223,-0.000196,-0.000695,0.00024,0.000509,-0.000262,0.000929,-0.000868,-0.000757,-0.000167,0.000577,-0.000323,-8e-06,0.000917,0.000415,-0.001132,-0.000101,-0.000846,-0.001071,0.000324,8.9e-05,-0.001411,-6.7e-05,-0.000816,0.000584,0.000474,-0.000602,-4.9e-05,-0.000532,0.000253,-0.000756,0.000286,-0.000726,0.000435,0.000134,-6e-06,-7.4e-05,-9.1e-05,0.000357,-0.00079,-0.000333,0.000224,0.000268,0.000116,-0.0001,9.3e-05,-0.000423,4.2e-05,-0.000205,-0.000651,0.00033,-2.4e-05,-0.000301,1
2,5061282401,5.0,1,1.0,0.0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,-0.124,0.253524,0.001548,-0.000663,-0.000726,-0.002937,0.001417,-0.001304,-0.004229,0.00339,0.00505,3.8e-05,-0.002473,-0.001584,-0.001849,-0.002891,0.003139,0.003223,0.001298,0.000541,0.000218,0.003435,-0.000867,-0.000148,-0.002019,0.000738,0.000895,2.2e-05,-0.001179,-0.002358,-0.000208,-0.002841,0.001232,-0.000255,0.000435,-0.001915,6e-05,0.001556,-0.00146,0.001695,-0.000546,0.000992,6.7e-05,-0.000765,0.000236,0.000168,8.8e-05,0.000573,-0.000503,0.000647,-0.000864,0.001377,0.001875,-0.000354,0.000904,0.000988,0.000625,-0.000721,0.000878,-0.000131,-0.00065,-3.6e-05,-3.7e-05,-0.000599,0.000423,0.000606,-0.000253,0.000116,-0.000752,0.000469,-0.000148,-0.000745,-0.00099,-0.000925,0.000174,-3.7e-05,-0.000223,-0.000196,-0.000695,0.00024,0.000509,-0.000262,0.000929,-0.000868,-0.000757,-0.000167,0.000577,-0.000323,-8e-06,0.000917,0.000415,-0.001132,-0.000101,-0.000846,-0.001071,0.000324,8.9e-05,-0.001411,-6.7e-05,-0.000816,0.000584,0.000474,-0.000602,-4.9e-05,-0.000532,0.000253,-0.000756,0.000286,-0.000726,0.000435,0.000134,-6e-06,-7.4e-05,-9.1e-05,0.000357,-0.00079,-0.000333,0.000224,0.000268,0.000116,-0.0001,9.3e-05,-0.000423,4.2e-05,-0.000205,-0.000651,0.00033,-2.4e-05,-0.000301,1
3,6083038851,5.0,1,1.0,0.0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0.946,0.253524,0.001548,-0.000663,-0.000726,-0.002937,0.001417,-0.001304,-0.004229,0.00339,0.00505,3.8e-05,-0.002473,-0.001584,-0.001849,-0.002891,0.003139,0.003223,0.001298,0.000541,0.000218,0.003435,-0.000867,-0.000148,-0.002019,0.000738,0.000895,2.2e-05,-0.001179,-0.002358,-0.000208,-0.002841,0.001232,-0.000255,0.000435,-0.001915,6e-05,0.001556,-0.00146,0.001695,-0.000546,0.000992,6.7e-05,-0.000765,0.000236,0.000168,8.8e-05,0.000573,-0.000503,0.000647,-0.000864,0.001377,0.001875,-0.000354,0.000904,0.000988,0.000625,-0.000721,0.000878,-0.000131,-0.00065,-3.6e-05,-3.7e-05,-0.000599,0.000423,0.000606,-0.000253,0.000116,-0.000752,0.000469,-0.000148,-0.000745,-0.00099,-0.000925,0.000174,-3.7e-05,-0.000223,-0.000196,-0.000695,0.00024,0.000509,-0.000262,0.000929,-0.000868,-0.000757,-0.000167,0.000577,-0.000323,-8e-06,0.000917,0.000415,-0.001132,-0.000101,-0.000846,-0.001071,0.000324,8.9e-05,-0.001411,-6.7e-05,-0.000816,0.000584,0.000474,-0.000602,-4.9e-05,-0.000532,0.000253,-0.000756,0.000286,-0.000726,0.000435,0.000134,-6e-06,-7.4e-05,-9.1e-05,0.000357,-0.00079,-0.000333,0.000224,0.000268,0.000116,-0.0001,9.3e-05,-0.000423,4.2e-05,-0.000205,-0.000651,0.00033,-2.4e-05,-0.000301,1
4,47056667835,5.0,1,1.0,0.0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0.3291,0.253524,0.001548,-0.000663,-0.000726,-0.002937,0.001417,-0.001304,-0.004229,0.00339,0.00505,3.8e-05,-0.002473,-0.001584,-0.001849,-0.002891,0.003139,0.003223,0.001298,0.000541,0.000218,0.003435,-0.000867,-0.000148,-0.002019,0.000738,0.000895,2.2e-05,-0.001179,-0.002358,-0.000208,-0.002841,0.001232,-0.000255,0.000435,-0.001915,6e-05,0.001556,-0.00146,0.001695,-0.000546,0.000992,6.7e-05,-0.000765,0.000236,0.000168,8.8e-05,0.000573,-0.000503,0.000647,-0.000864,0.001377,0.001875,-0.000354,0.000904,0.000988,0.000625,-0.000721,0.000878,-0.000131,-0.00065,-3.6e-05,-3.7e-05,-0.000599,0.000423,0.000606,-0.000253,0.000116,-0.000752,0.000469,-0.000148,-0.000745,-0.00099,-0.000925,0.000174,-3.7e-05,-0.000223,-0.000196,-0.000695,0.00024,0.000509,-0.000262,0.000929,-0.000868,-0.000757,-0.000167,0.000577,-0.000323,-8e-06,0.000917,0.000415,-0.001132,-0.000101,-0.000846,-0.001071,0.000324,8.9e-05,-0.001411,-6.7e-05,-0.000816,0.000584,0.000474,-0.000602,-4.9e-05,-0.000532,0.000253,-0.000756,0.000286,-0.000726,0.000435,0.000134,-6e-06,-7.4e-05,-9.1e-05,0.000357,-0.00079,-0.000333,0.000224,0.000268,0.000116,-0.0001,9.3e-05,-0.000423,4.2e-05,-0.000205,-0.000651,0.00033,-2.4e-05,-0.000301,1


In [29]:
item_features_ordered.head()

Unnamed: 0,product_id,loves_count,rating,reviews,price_usd,limited_edition,new,online_only,out_of_stock,sephora_exclusive,child_count,child_max_price,child_min_price,brand_freq_scaled,var_type_Color,var_type_Formulation,var_type_Scent,var_type_Size,var_type_Size + Concentration,var_type_Size + Concentration + Formulation,var_type_Type,var_type_Unknown,pc_Bath & Body,pc_Fragrance,pc_Gifts,pc_Hair,pc_Makeup,pc_Men,pc_Mini Size,pc_Skincare,pc_Tools & Brushes,size_ml_log,size_missing
0,P504322,177,5.0,1.0,19.0,0,0,1,0,0,0,32.0,28.0,0.606089,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,4.276338,0
1,P420652,1081315,4.3508,16118.0,24.0,0,0,0,0,1,3,24.0,24.0,0.510128,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,3.077379,0
2,P420652,1081315,4.3508,16118.0,24.0,0,0,0,0,1,3,24.0,24.0,0.510128,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,3.077379,0
3,P420652,1081315,4.3508,16118.0,24.0,0,0,0,0,1,3,24.0,24.0,0.510128,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,3.077379,0
4,P420652,1081315,4.3508,16118.0,24.0,0,0,0,0,1,3,24.0,24.0,0.510128,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,3.077379,0


In [30]:
item_features_ordered.shape

(1094411, 33)

In [31]:
user_features_ordered.shape

(1094411, 171)

## Final Touches on User & Item Datasets

In [32]:
num_user_features = user_features_ordered.shape[1] - 1
num_item_features = item_features_ordered.shape[1] - 1

u_s = 1  # start of columns to use in training, user
i_s = 1  # start of columns to use in training, items

print(num_user_features)
print(num_item_features)

170
32


In [33]:
item_train_df, item_test_df = train_test_split(item_features_ordered, train_size=0.80, shuffle=True, random_state=1)
user_train_df, user_test_df = train_test_split(user_features_ordered, train_size=0.80, shuffle=True, random_state=1)
y_train, y_test = train_test_split(df_reviews_final.rating.to_numpy(), train_size=0.80, shuffle=True, random_state=1)

user_test_ids = user_test_df.author_id.values

In [34]:
# 2. Detect binary columns and scale only non-binary
binary_user_cols = [col for col in user_train_df.columns[1:] if user_train_df[col].dropna().isin([0, 1]).all()]
user_cols_to_scale = [col for col in user_train_df.columns[1:] if col not in binary_user_cols]

binary_item_cols = [col for col in item_train_df.columns[1:] if item_train_df[col].dropna().isin([0, 1]).all()]
item_cols_to_scale = [col for col in item_train_df.columns[1:] if col not in binary_item_cols]

In [35]:
# 3. Scale only selected columns
scalerUser = StandardScaler()
user_train_df[user_cols_to_scale] = scalerUser.fit_transform(user_train_df[user_cols_to_scale])
user_test_df[user_cols_to_scale] = scalerUser.transform(user_test_df[user_cols_to_scale])

scalerItem = StandardScaler()
item_train_df[item_cols_to_scale] = scalerItem.fit_transform(item_train_df[item_cols_to_scale])
item_test_df[item_cols_to_scale] = scalerItem.transform(item_test_df[item_cols_to_scale])

In [36]:
# 4. Convert to numpy for model input
user_train = user_train_df.to_numpy()
user_test = user_test_df.to_numpy()
item_train = item_train_df.to_numpy()
item_test = item_test_df.to_numpy()

scalerTarget = MinMaxScaler((-1, 1))
scalerTarget.fit(y_train.reshape(-1, 1))
y_train_scaled = scalerTarget.transform(y_train.reshape(-1, 1))
y_test_scaled = scalerTarget.transform(y_test.reshape(-1, 1))

## Starting Building the Model

In [37]:
num_outputs = 32
tf.random.set_seed(1)

user_NN = tf.keras.models.Sequential([
    tf.keras.layers.Dense(256, activation='relu'),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dense(num_outputs)
])

item_NN = tf.keras.models.Sequential([  
      tf.keras.layers.Dense(256, activation='relu'),
      tf.keras.layers.Dense(128, activation='relu'),
      tf.keras.layers.Dense(num_outputs)
])

# create the user input and point to the base network
input_user = tf.keras.layers.Input(shape=(num_user_features,))
vu = user_NN(input_user)
vu = Lambda(lambda x: tf.linalg.l2_normalize(x, axis=1), name="vu_normalized")(vu)


# create the item input and point to the base network
input_item = tf.keras.layers.Input(shape=(num_item_features,))
vm = item_NN(input_item)
vm = Lambda(lambda x: tf.linalg.l2_normalize(x, axis=1), name="vm_normalized")(vm)

# compute the dot product of the two vectors vu and vm
output = tf.keras.layers.Dot(axes=1)([vu, vm])

# specify the inputs and output of the model
model = tf.keras.Model([input_user, input_item], output)

model.summary()

In [38]:
tf.random.set_seed(1)
cost_fn = tf.keras.losses.MeanSquaredError()
opt = keras.optimizers.Adam(learning_rate=0.01)
model.compile(optimizer=opt,
              loss=cost_fn)

tf.random.set_seed(1)

model.fit([tf.convert_to_tensor(user_train[:, u_s:], dtype=tf.float32), tf.convert_to_tensor(item_train[:, i_s:],dtype=tf.float32)], tf.convert_to_tensor(y_train_scaled, dtype=tf.float32), epochs=30)

Epoch 1/30
[1m27361/27361[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 673us/step - loss: 0.0948
Epoch 2/30
[1m27361/27361[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 678us/step - loss: 0.0841
Epoch 3/30
[1m27361/27361[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 680us/step - loss: 0.0830
Epoch 4/30
[1m27361/27361[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 751us/step - loss: 0.0822
Epoch 5/30
[1m27361/27361[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 762us/step - loss: 0.0815
Epoch 6/30
[1m27361/27361[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 735us/step - loss: 0.0807
Epoch 7/30
[1m27361/27361[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 775us/step - loss: 0.0801
Epoch 8/30
[1m27361/27361[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 806us/step - loss: 0.0794
Epoch 9/30
[1m27361/27361[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 753us/step - loss: 0.0787
Epoch 10/30
[1m27361/27361

<keras.src.callbacks.history.History at 0x383593190>

### MAE, RMSE, MSE

In [39]:
# Apply the SAME scalers used on training data
model.evaluate(
    [tf.convert_to_tensor(user_test[:, u_s:], dtype=tf.float32),
     tf.convert_to_tensor(item_test[:, i_s:], dtype=tf.float32)],
    tf.convert_to_tensor(y_test_scaled, dtype=tf.float32)
)

[1m6841/6841[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 317us/step - loss: 0.0923


0.09267489612102509

In [40]:
# Predict on scaled test data
y_pred_scaled = model.predict([
    tf.convert_to_tensor(user_test[:, u_s:], dtype=tf.float32),
    tf.convert_to_tensor(item_test[:, i_s:], dtype=tf.float32)
])

# Inverse transform predictions and true ratings back to original scale
y_pred_orig = scalerTarget.inverse_transform(y_pred_scaled)
y_test_orig = scalerTarget.inverse_transform(y_test_scaled)

rmse = np.sqrt(mean_squared_error(y_test_orig, y_pred_orig))
mse = mean_squared_error(y_test_orig, y_pred_orig)
mae = mean_absolute_error(y_test_orig, y_pred_orig)

print("Test RMSE:", rmse)
print("Test MSE:", mse)
print("Test MAE:", mae)

[1m6841/6841[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 342us/step
Test RMSE: 0.6088509513079056
Test MSE: 0.3706994809085416
Test MAE: 0.3256373892858349


MSE --> Scaled [-1 to 1 scale] --> 0.08037 --> Model loss during training/eval  
RMSE --> Original (1 to 5 scale) --> 0.567 --> Avg error in predicted rating  

![image.png](attachment:3d0da3d8-a331-4de5-8e3e-601844506f84.png)

## avg_precision, avg_recall, avg_f1, avg_accuracy

In [41]:
"""
def precision_recall_f1_nn(user_ids_test, y_test_orig, y_pred_orig, threshold=3.5):
    '''
    Computes average precision, recall, F1, and accuracy across users.
    
    Parameters:
      - user_ids_test: array-like, user IDs corresponding to each test row.
      - y_test_orig: array-like, true ratings in the original scale.
      - y_pred_orig: array-like, predicted ratings in the original scale.
      - threshold: float, threshold to decide if a rating is considered 'positive'
    
    Returns:
      avg_precision, avg_recall, avg_f1, avg_accuracy
    '''
    
    # Map predictions and true ratings to user IDs
    user_est_true = defaultdict(list)
    for uid, true_r, est in zip(user_ids_test, y_test_orig, y_pred_orig):
        user_est_true[uid].append((est, true_r))
    
    precisions = {}
    recalls = {}
    f1s = {}
    accuracies = {}
    
    for uid, ratings in user_est_true.items():
        # Create binary labels: 1 if rating >= threshold, else 0
        y_true = [1 if (true_r >= threshold) else 0 for (_, true_r) in ratings]
        y_pred_labels = [1 if (est >= threshold) else 0 for (est, _) in ratings]
        
        precisions[uid] = precision_score(y_true, y_pred_labels, zero_division=0)
        recalls[uid] = recall_score(y_true, y_pred_labels, zero_division=0)
        f1s[uid] = f1_score(y_true, y_pred_labels, zero_division=0)
        accuracies[uid] = accuracy_score(y_true, y_pred_labels)
    
    # Average the scores over all users
    avg_precision = sum(precisions.values()) / len(precisions) if precisions else 0.
    avg_recall = sum(recalls.values()) / len(recalls) if recalls else 0.
    avg_f1 = sum(f1s.values()) / len(f1s) if f1s else 0.
    avg_accuracy = sum(accuracies.values()) / len(accuracies) if accuracies else 0.
    
    return avg_precision, avg_recall, avg_f1, avg_accuracy
"""
print()




In [42]:
"""
avg_precision, avg_recall, avg_f1, avg_accuracy = precision_recall_f1_nn(
    user_ids_test=user_test_ids, 
    y_test_orig=y_test_orig.reshape(-1),  # flatten if necessary
    y_pred_orig=y_pred_orig.reshape(-1),
    threshold=3.5
)

print("Average Precision:", avg_precision)
print("Average Recall:", avg_recall)
print("Average F1 Score:", avg_f1)
print("Average Accuracy:", avg_accuracy)
"""
print()




In [43]:
def precision_recall_f1_nn_vectorized(user_ids_test, y_test_orig, y_pred_orig, threshold=3.5):
    """
    Computes average precision, recall, F1, and accuracy across users in a vectorized manner.
    
    Parameters:
      - user_ids_test: array-like, user IDs for each test record.
      - y_test_orig: array-like, true ratings in the original scale.
      - y_pred_orig: array-like, predicted ratings in the original scale.
      - threshold: float, threshold above which a rating is considered "positive".
      
    Returns:
      avg_precision, avg_recall, avg_f1, avg_accuracy
    """
    
    y_true_bin = (np.array(y_test_orig).flatten() >= threshold).astype(int)
    y_pred_bin = (np.array(y_pred_orig).flatten() >= threshold).astype(int)
    
    df = pd.DataFrame({
        'user': user_ids_test,
        'true': y_true_bin,
        'pred': y_pred_bin
    })
    
    # Group by user and compute aggregated statistics:
    # - n_tp: number of true positives
    # - n_pred: total predicted positives
    # - n_true: total actual positives
    # - n_total: total number of ratings for the user
    # - n_correct: total correct predictions
    grouped = df.groupby('user').agg(
        n_tp = ('true', lambda x: np.sum(x * df.loc[x.index, 'pred'])),
        n_pred = ('pred', 'sum'),
        n_true = ('true', 'sum'),
        n_total = ('true', 'size'),
        n_correct = ('true', lambda x: np.sum(x == df.loc[x.index, 'pred']))
    )
    
    grouped['precision'] = grouped['n_tp'] / grouped['n_pred'].replace(0, np.nan)
    grouped['recall'] = grouped['n_tp'] / grouped['n_true'].replace(0, np.nan)
    grouped['accuracy'] = grouped['n_correct'] / grouped['n_total']
    grouped['f1'] = 2 * (grouped['precision'] * grouped['recall']) / (grouped['precision'] + grouped['recall'])
    
    grouped = grouped.fillna(0)
    
    avg_precision = grouped['precision'].mean()
    avg_recall = grouped['recall'].mean()
    avg_f1 = grouped['f1'].mean()
    avg_accuracy = grouped['accuracy'].mean()
    
    return avg_precision, avg_recall, avg_f1, avg_accuracy

In [44]:
avg_precision, avg_recall, avg_f1, avg_accuracy = precision_recall_f1_nn_vectorized(
    user_ids_test=user_test_ids, 
    y_test_orig=y_test_orig, 
    y_pred_orig=y_pred_orig, 
    threshold=3.5
)

print("Average Precision:", avg_precision)
print("Average Recall:", avg_recall)
print("Average F1 Score:", avg_f1)
print("Average Accuracy:", avg_accuracy)

Average Precision: 0.7898866474512624
Average Recall: 0.797844910234919
Average F1 Score: 0.7921698092515485
Average Accuracy: 0.9377047070289458


## Average Precision, nDCG@k, Average Precision@k

In [45]:
def measures_at_k_nn(user_ids_test, y_test_orig, y_pred_orig, k=10, threshold=3.5):
    """
    Computes average average_precision, precision@k, and nDCG@k across users
    for predictions from a two-tower model.
    
    Parameters:
      - user_ids_test: array-like of user IDs (one per test record)
      - y_test_orig: array-like of true ratings (original scale)
      - y_pred_orig: array-like of predicted ratings (original scale)
      - k: int, the number of top items to consider for "at k" metrics
      - threshold: float, the rating value at or above which an item is considered relevant
      
    Returns:
      avg_average_precision, avg_precision_at_k, avg_ndcg_at_k
    """
    user_ids_test = np.array(user_ids_test).flatten()
    y_test_orig = np.array(y_test_orig).flatten()
    y_pred_orig = np.array(y_pred_orig).flatten()
    
    df = pd.DataFrame({
        'user': user_ids_test,
        'true': y_test_orig,
        'pred': y_pred_orig
    })
    
    df['true_bin'] = (df['true'] >= threshold).astype(int)
    df['pred_bin'] = (df['pred'] >= threshold).astype(int)
    
    user_est_true = df.groupby('user')
    
    average_precisions = {}
    precisions_at_k = {}
    ndcgs_at_k = {}
    
    for uid, group in user_est_true:
        # Convert each user's group to a list of tuples: (predicted rating, true rating, binary true)
        # Sort the user’s items by predicted rating in descending order.
        sorted_group = group.sort_values(by='pred', ascending=False)
        y_true = sorted_group['true_bin'].tolist()
        y_pred = sorted_group['pred_bin'].tolist()
        
        y_true_at_k = y_true[:k]
        y_pred_at_k = y_pred[:k]
        
        # Compute average precision over the full ranked list if the user has any relevant items.
        if sum(y_true) > 0:
            average_precisions[uid] = average_precision_score(sorted_group['true_bin'], sorted_group['pred_bin'])
        else:
            average_precisions[uid] = 0.0
        
        # Compute precision at k.
        precisions_at_k[uid] = precision_score(y_true_at_k, y_pred_at_k, zero_division=0)
        
        # For nDCG, raw ratings are needed and consider the ranking.
        # nDCG expects input arrays of shape (1, n); convert the sorted true and predicted ratings.
        if len(sorted_group) > 1:
            true_rel = np.asarray(sorted_group['true'].tolist()).reshape(1, -1)
            pred_rel = np.asarray(sorted_group['pred'].tolist()).reshape(1, -1)
            ndcgs_at_k[uid] = ndcg_score(true_rel, pred_rel, k=k)
        else:
            # If there's only one item, then return a simple binary match.
            ndcgs_at_k[uid] = 1.0 if (y_true[0] == y_pred[0] and y_true[0] == 1) else 0.0
    
    # Average metrics across all users.
    avg_average_precision = np.mean(list(average_precisions.values()))
    avg_precision_at_k = np.mean(list(precisions_at_k.values()))
    avg_ndcg_at_k = np.mean(list(ndcgs_at_k.values()))
    
    return avg_average_precision, avg_precision_at_k, avg_ndcg_at_k

In [46]:
avg_ap, avg_prec_k, avg_ndcg_k = measures_at_k_nn(user_test_ids, y_test_orig, y_pred_orig, k=10, threshold=3.5)
print("Avg. Average Precision:", avg_ap)
print("Avg. Precision@10:", avg_prec_k)
print("Avg. nDCG@10:", avg_ndcg_k)

Avg. Average Precision: 0.8144865343155763
Avg. Precision@10: 0.7899020732800889
Avg. nDCG@10: 0.8184835148560334
