In [144]:
import os
import pandas as pd
import numpy as np
from sklearn.preprocessing import MultiLabelBinarizer, MinMaxScaler, LabelEncoder
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import coo_matrix
from implicit.als import AlternatingLeastSquares

os.chdir("/Users/nikhithagollamudi/Desktop/School/5123/final-project/CSCI-5123-Project/")

#load data
user_triplets_df = pd.read_csv("resources/data/dataset/user_activity_triplets.csv", sep=";")
original_orders_df = pd.read_csv("resources/data/dataset/original_orders.csv", sep=";")
outfits_df = pd.read_csv("resources/data/dataset/outfits.csv", sep=";")

#preprocess outfit tags
outfits_df["outfit_tags"] = outfits_df["outfit_tags"].apply(eval)
outfits_df["tag_categories"] = outfits_df["tag_categories"].apply(eval)

from datetime import datetime

def get_current_season():
    month = datetime.now().month
    return (
        "Winter" if month in [12, 1, 2]
        else "Spring" if month in [3, 4, 5]
        else "Summer" if month in [6, 7, 8]
        else "Fall"
    )


#filter for the current season
current_season = get_current_season()
outfits_df = outfits_df.loc[[current_season in tags for tags in outfits_df["outfit_tags"]]].copy()


#combine triplet data
user_triplets_df = pd.concat([user_triplets_df, original_orders_df], ignore_index=True)


In [139]:
#train/test splits
from src.prepare_train_test_splits import (translate_user_triplets_to_orders, remove_consecutive_duplicates, convert_user_orders_to_train_test_splits)

user_triplets_df = remove_consecutive_duplicates(user_triplets_df)
user_orders_df = translate_user_triplets_to_orders(user_triplets_df, outfits_df)
user_splits_df, user_splits_unique_df = convert_user_orders_to_train_test_splits(user_orders_df, percentage_test=0.3)

#filter unique outfits
def flatten_lists(series):
    unique_items = set()

    for item_list in series:
        for item in item_list:
            unique_items.add(item)
    
    return unique_items

all_train_ids = flatten_lists(user_splits_df["train_outfit_ids"])
all_test_ids = flatten_lists(user_splits_df["test_outfit_id"])
all_outfit_ids = all_train_ids.union(all_test_ids)
outfits_df = outfits_df[outfits_df["id"].isin(all_outfit_ids)]

4949
No unique outfit found with groups [nan nan]
No unique outfit found with groups [nan nan]
No unique outfit found with groups [nan nan]
No unique outfit found with groups [nan nan nan]
No unique outfit found with groups [nan nan nan nan nan nan]
No unique outfit found with groups [nan nan]
No unique outfit found with groups [nan nan]
No unique outfit found with groups [nan nan]
No unique outfit found with groups [nan nan nan nan]
No unique outfit found with groups [nan nan nan]
No unique outfit found with groups [nan nan]
No unique outfit found with groups [nan nan nan nan]
No unique outfit found with groups [nan nan]
No unique outfit found with groups [nan nan nan nan nan nan nan nan nan nan]
No unique outfit found with groups [nan nan]
No unique outfit found with groups [nan nan]
No unique outfit found with groups [nan nan]
No unique outfit found with groups [nan nan nan nan nan nan nan nan]
No unique outfit found with groups [nan nan nan nan nan]
No unique outfit found with grou

In [140]:
#Tag-based recommendations (Content-Based)
mlb = MultiLabelBinarizer()
tag_matrix = pd.DataFrame(
    mlb.fit_transform(outfits_df["outfit_tags"]),
    index=outfits_df["id"],
    columns=mlb.classes_
)
        
#store user profiles based on average tag vectors of their training outfits
user_profiles = {}
for user_index, row in user_splits_df.iterrows():
    #filter outfits that exist in the tag matrix
    valid_outfit_ids = []
    for outfit_id in row["train_outfit_ids"]:
        if outfit_id in tag_matrix.index:
            valid_outfit_ids.append(outfit_id)
    
    #calculate the mean tag vector 
    if valid_outfit_ids:
        average_tag_vector = tag_matrix.loc[valid_outfit_ids].mean(axis=0)
        user_profiles[user_index] = average_tag_vector


user_tag_matrix = pd.DataFrame.from_dict(user_profiles, orient="index").fillna(0)
similarity_matrix = cosine_similarity(user_tag_matrix.values, tag_matrix.values)

cb_scores = []
for user_index, user_id in enumerate(user_tag_matrix.index):
    #get similarity scores
    user_scores = similarity_matrix[user_index]

    #get indices of the 300 most similar outfits
    top_outfit_indices = user_scores.argsort()[::-1][:300]

    for outfit_index in top_outfit_indices:
        outfit_id = tag_matrix.index[outfit_index]
        similarity_score = user_scores[outfit_index]
        cb_scores.append((user_id, outfit_id, similarity_score))

cb_df = pd.DataFrame(cb_scores, columns=["user_id", "item_id", "score"])
cb_df.to_csv("tag_embedding_scores.csv", index=False)

#ALS-based recommendations (Collaborative Filtering)
train_df = user_splits_df.explode("train_outfit_ids").dropna()
train_df["user_id"] = train_df.index
train_df["value"] = 1
train_df.rename(columns={"train_outfit_ids": "item_id"}, inplace=True)

#encode user and item IDs to ints
user_enc = LabelEncoder()
item_enc = LabelEncoder()
train_df["user_idx"] = user_enc.fit_transform(train_df["user_id"])
train_df["item_idx"] = item_enc.fit_transform(train_df["item_id"])

user_index_to_user_id = {}
item_index_to_item_id = {}

#user index mapping
for i in range(len(train_df)):
    row = train_df.iloc[i]
    user_index = row["user_idx"]
    user_id = row["user_id"]
    user_index_to_user_id[user_index] = user_id

#item index mapping
for i in range(len(train_df)):
    row = train_df.iloc[i]
    item_index = row["item_idx"]
    item_id = row["item_id"]
    item_index_to_item_id[item_index] = item_id

#create a sparse matrix and convert it to CSR 
coo = coo_matrix((train_df["value"], (train_df["user_idx"], train_df["item_idx"])))
csr = coo.tocsr()

#parameters matched to the ones used by the paper 
als = AlternatingLeastSquares(factors=128, regularization=0.01, iterations=15)
als.fit(csr.T)

als_scores = []
for user_idx in range(csr.shape[0]):
    #get top 300 recommended item indices and their scores 
    item_indices, scores = als.recommend(user_idx, csr, N=300, filter_already_liked_items=False)
    user_id = user_index_to_id[user_idx]

    for i in range(len(item_indices)):
        item_idx = item_indices[i]
        score = scores[i]
        item_id = item_index_to_id[item_idx]
        als_scores.append((user_id, item_id, float(score)))

als_df = pd.DataFrame(als_scores, columns=["user_id", "item_id", "score"])
als_df.to_csv("als_recommendation_scores.csv", index=False)



  0%|          | 0/15 [00:00<?, ?it/s]

In [141]:
#hybrid recommendations
als_df = pd.read_csv("als_recommendation_scores.csv")
cb_df = pd.read_csv("tag_embedding_scores.csv")

#normalize scores
scaler_als = MinMaxScaler()
als_df["score_als"] = scaler_als.fit_transform(als_df[["score"]])

scaler_cb = MinMaxScaler()
cb_df["score_cb"] = scaler_cb.fit_transform(cb_df[["score"]])

merged = pd.merge(als_df[["user_id", "item_id", "score_als"]], cb_df[["user_id", "item_id", "score_cb"]], on=["user_id", "item_id"])

alpha = 0.5
#calculate the hybrid score as a weighted average of ALS and CB scores
merged["score_hybrid"] = alpha * merged["score_als"] + (1 - alpha) * merged["score_cb"]
merged["rank"] = merged.groupby("user_id")["score_hybrid"].rank(ascending=False, method="first")
#save the top 100 recommendations
top_100 = merged[merged["rank"] <= 100]
top_100.to_csv("hybrid_top100.csv", index=False)

In [142]:
#Evaluation
user_splits_df = pd.read_pickle("user_splits_df.pkl")
user_splits_df["user_id"] = user_splits_df.index

#reshape the hybrid recommendations so each user_id maps to a list of their top recommended item_ids
hybrid_df = pd.read_csv("hybrid_top100.csv")
hybrid_grouped = hybrid_df.groupby("user_id")["item_id"].apply(list).reset_index()
hybrid_grouped.columns = ["user_id", "hybrid_recommendations"]

#merge and evaluate
eval_df = pd.merge(user_splits_df, hybrid_grouped, on="user_id")

#format outfit IDs to match the prefixes 
def format_outfit_ids(outfit_ids):
    formatted = []
    for outfit_id in outfit_ids:
        outfit_str = str(outfit_id)
        if not outfit_str.startswith("outfit."):
            outfit_str = "outfit." + outfit_str
        formatted.append(outfit_str)
    return formatted

eval_df["hybrid_recommendations"] = eval_df["hybrid_recommendations"].apply(format_outfit_ids)


def evaluate_hit_rate_at_n(test_ids, predicted_ids, n=100):
    top_n_predictions = predicted_ids[:n]

    #make sure test_ids is a list
    is_list_or_array = isinstance(test_ids, (list, np.ndarray))
    if not is_list_or_array:
        test_ids = [test_ids]

    #check for hits 
    hit_found = False
    for test_id in test_ids:
        if test_id in top_n_predictions:
            hit_found = True
            break

    return int(hit_found)


#ID Hit Rates 
id_hit_rate_at_10 = []
id_hit_rate_at_100 = []

for idx, row in eval_df.iterrows():
    hit_10 = evaluate_hit_rate_at_n(row["test_outfit_id"], row["hybrid_recommendations"], n=10)
    hit_100 = evaluate_hit_rate_at_n(row["test_outfit_id"], row["hybrid_recommendations"], n=100)
    
    id_hit_rate_at_10.append(hit_10)
    id_hit_rate_at_100.append(hit_100)

eval_df["id_hit_rate_at_10"] = id_hit_rate_at_10
eval_df["id_hit_rate_at_100"] = id_hit_rate_at_100


#map outfit IDs to group IDs 
predicted_groups_list = []
outfit_to_group = outfits_df.set_index("id")["group"].to_dict()

for idx, row in eval_df.iterrows():
    group_ids = []
    for outfit_id in row["hybrid_recommendations"]:
        group_ids.append(outfit_to_group.get(outfit_id, ""))
    predicted_groups_list.append(group_ids)

eval_df["predicted_groups"] = predicted_groups_list


#Group Hit Rates 
group_hit_rate_at_10 = []
group_hit_rate_at_100 = []

for _, row in eval_df.iterrows():
    hit_10 = evaluate_hit_rate_at_n(row["test_group"], row["predicted_groups"], n=10)
    hit_100 = evaluate_hit_rate_at_n(row["test_group"], row["predicted_groups"], n=100)
    
    group_hit_rate_at_10.append(hit_10)
    group_hit_rate_at_100.append(hit_100)

eval_df["group_hit_rate_at_10"] = group_hit_rate_at_10
eval_df["group_hit_rate_at_100"] = group_hit_rate_at_100

#results
print(f"ID Hit Rate @10: {eval_df['id_hit_rate_at_10'].mean():.4f}")
print(f"ID Hit Rate @100: {eval_df['id_hit_rate_at_100'].mean():.4f}")
print(f"Group Hit Rate @10: {eval_df['group_hit_rate_at_10'].mean():.4f}")
print(f"Group Hit Rate @100: {eval_df['group_hit_rate_at_100'].mean():.4f}")


ID Hit Rate @10: 0.0263
ID Hit Rate @100: 0.0279
Group Hit Rate @10: 0.0389
Group Hit Rate @100: 0.0399


In [143]:
import pyperclip
#formatting results for overleaf
def format_eval_df_latex(eval_df, precision=4, run_name="Hybrid"):
    id_hr10 = eval_df["id_hit_rate_at_10"].mean()
    id_hr100 = eval_df["id_hit_rate_at_100"].mean()
    group_hr10 = eval_df["group_hit_rate_at_10"].mean()
    group_hr100 = eval_df["group_hit_rate_at_100"].mean()

    first_row = f"{run_name} Ind & {id_hr10:.{precision}f} & {id_hr100:.{precision}f} \\\\"
    second_row = f"{run_name} Groups & {group_hr10:.{precision}f} & {group_hr100:.{precision}f} \\\\\\hline"
    
    full_string = first_row + "\n" + second_row
    print(full_string)
    pyperclip.copy(full_string)

format_eval_df_latex(eval_df, run_name="Hybrid")


Hybrid Ind & 0.0263 & 0.0279 \\
Hybrid Groups & 0.0389 & 0.0399 \\\hline


In [136]:
#Intra-List Similarity 
def compute_ils(tag_matrix, recommendations):
    ils_scores = []
    for recommended_item_ids in recommendations:
        #filter out item IDs not in tag matrix index
        valid_item_ids = []
        for item_id in recommended_item_ids:
            if item_id in tag_matrix.index:
                valid_item_ids.append(item_id)

        if len(valid_item_ids) < 2:
            continue
        
        tag_vectors = tag_matrix.loc[valid_item_ids].values

        #compute cosine similarity matrix 
        similarity_matrix = cosine_similarity(tag_vectors)

        #get the upper triangle of the similarity matrix
        upper_triangle_indices = np.triu_indices_from(similarity_matrix, k=1)

        if len(upper_triangle_indices[0]) == 0:
            continue

        #compute the mean pairwise similarity for the item set
        upper_triangle_values = similarity_matrix[upper_triangle_indices]
        mean_similarity = upper_triangle_values.mean()

        ils_scores.append(mean_similarity)

    if ils_score:
        return np.mean(ils_scores) 
    else:
        return float("nan")


# Rebuild tag_matrix with outfit.id as index 
tag_matrix_filtered = tag_matrix.loc[tag_matrix.index.intersection(outfits_df["id"])]

ils_score = compute_ils(tag_matrix_filtered, eval_df["hybrid_recommendations"])
print(f"Intra-List Similarity (ILS): {ils_score:.4f}")


Intra-List Similarity (ILS): 0.5000


In [137]:
#NDCG
def evaluate_ndcg_at_n(test_ids, predicted_ids, n=100):
    top_n_predictions = predicted_ids[:n]

    #calculate DCG 
    dcg = 0.0
    for rank, predicted_id in enumerate(top_n_predictions):
        position = rank + 2  
        if predicted_id in test_ids:
            gain = 1 / np.log2(position)
            dcg += gain

    #calculate IDCG 
    ideal_ranking_length = min(len(test_ids), n)
    idcg = 0.0
    for rank in range(ideal_ranking_length):
        position = rank + 2
        idcg += 1 / np.log2(position)

    #normalize DCG 
    if idcg > 0:
        ndcg = dcg / idcg
    else:
        ndcg = 0.0

    return ndcg

#ID NDCG
id_ndcg_at_10 = []

for idx, row in eval_df.iterrows():
    ndcg_10 = evaluate_ndcg_at_n(row["test_outfit_id"], row["hybrid_recommendations"], n=10)
    id_ndcg_at_10.append(ndcg_10)

eval_df["id_ndcg_at_10"] = id_ndcg_at_10

#Group NDCG
group_ndcg_at_10 = []

for idx, row in eval_df.iterrows():
    ndcg_10 = evaluate_ndcg_at_n(row["test_group"], row["predicted_groups"], n=10)
    group_ndcg_at_10.append(ndcg_10)

eval_df["group_ndcg_at_10"] = group_ndcg_at_10

#results
print(f"ID NDCG @10: {eval_df['id_ndcg_at_10'].mean():.4f}")
print(f"Group NDCG @10: {eval_df['group_ndcg_at_10'].mean():.4f}")


ID NDCG @10: 0.0049
Group NDCG @10: 0.0063
