In [6]:
import os
import pandas as pd
import numpy as np
from sklearn.preprocessing import MultiLabelBinarizer, MinMaxScaler, LabelEncoder
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import coo_matrix
from implicit.als import AlternatingLeastSquares

# Set working directory
os.chdir("/Users/nikhithagollamudi/Desktop/School/5123/final-project/CSCI-5123-Project/")

# Load data
user_triplets_df = pd.read_csv("resources/data/dataset/user_activity_triplets.csv", sep=";")
original_orders_df = pd.read_csv("resources/data/dataset/original_orders.csv", sep=";")
outfits_df = pd.read_csv("resources/data/dataset/outfits.csv", sep=";")

# Preprocess outfit tags
outfits_df["outfit_tags"] = outfits_df["outfit_tags"].apply(eval)
outfits_df["tag_categories"] = outfits_df["tag_categories"].apply(eval)

# Combine triplet data
user_triplets_df = pd.concat([user_triplets_df, original_orders_df], ignore_index=True)


In [7]:
# Prepare train/test splits
from src.prepare_train_test_splits import (
    translate_user_triplets_to_orders,
    remove_consecutive_duplicates,
    convert_user_orders_to_train_test_splits
)

user_triplets_df = remove_consecutive_duplicates(user_triplets_df)
user_orders_df = translate_user_triplets_to_orders(user_triplets_df, outfits_df)
user_splits_df, user_splits_unique_df = convert_user_orders_to_train_test_splits(user_orders_df, percentage_test=0.3)

# Filter outfits
def flatten_lists(series):
    return set(o for l in series for o in l)

all_train_ids = flatten_lists(user_splits_df["train_outfit_ids"])
all_test_ids = flatten_lists(user_splits_df["test_outfit_id"])
all_outfit_ids = all_train_ids.union(all_test_ids)
outfits_df = outfits_df[outfits_df["id"].isin(all_outfit_ids)]

4949
No unique outfit found with groups ['group.4bd4ee24eac8948e82783b15d9404f6b'
 'group.4bd4ee24eac8948e82783b15d9404f6b']
No unique outfit found with groups ['group.423a23f6717e6d85adac54c051ee9832'
 'group.423a23f6717e6d85adac54c051ee9832']
No unique outfit found with groups ['group.e0cb0f6e113edc4df8a1e304376734f6'
 'group.e0cb0f6e113edc4df8a1e304376734f6']
No unique outfit found with groups ['group.384b8170c6a6ddfd568ff7fab5fb49c4'
 'group.384b8170c6a6ddfd568ff7fab5fb49c4']
No unique outfit found with groups ['group.edb60c2f440a9ac7d0883fb9371c8607'
 'group.edb60c2f440a9ac7d0883fb9371c8607']
No unique outfit found with groups ['group.a3ab26b5d2f7ef2cf102422a3dde3b46'
 'group.a3ab26b5d2f7ef2cf102422a3dde3b46']
No unique outfit found with groups ['group.2c7095c075561fe6278f3a2d7c1d6ac9'
 'group.2c7095c075561fe6278f3a2d7c1d6ac9']
No unique outfit found with groups ['group.ae8da3f0ad6f8ff3f83b2af96e975991'
 'group.ae8da3f0ad6f8ff3f83b2af96e975991']
No unique outfit found with groups 

In [8]:
# Tag-based recommendations (Content-Based)
mlb = MultiLabelBinarizer()
tag_matrix = pd.DataFrame(
    mlb.fit_transform(outfits_df["outfit_tags"]),
    index=outfits_df["id"],
    columns=mlb.classes_
)

user_profiles = {}
for _, row in user_splits_df.iterrows():
    outfit_ids = [oid for oid in row["train_outfit_ids"] if oid in tag_matrix.index]
    if outfit_ids:
        tag_vectors = tag_matrix.loc[outfit_ids].mean(axis=0)
        user_profiles[row.name] = tag_vectors

user_tag_matrix = pd.DataFrame.from_dict(user_profiles, orient="index").fillna(0)
similarity_matrix = cosine_similarity(user_tag_matrix.values, tag_matrix.values)

cb_scores = []
for u_idx, user_id in enumerate(user_tag_matrix.index):
    scores = similarity_matrix[u_idx]
    top_indices = scores.argsort()[::-1][:300]
    for i in top_indices:
        cb_scores.append((user_id, tag_matrix.index[i], scores[i]))

cb_df = pd.DataFrame(cb_scores, columns=["user_id", "item_id", "score"])
cb_df.to_csv("tag_embedding_scores.csv", index=False)

# ALS-based recommendations (Collaborative Filtering)
train_df = user_splits_df.explode("train_outfit_ids").dropna()
train_df["user_id"] = train_df.index
train_df["value"] = 1
train_df.rename(columns={"train_outfit_ids": "item_id"}, inplace=True)

user_enc = LabelEncoder()
item_enc = LabelEncoder()
train_df["user_idx"] = user_enc.fit_transform(train_df["user_id"])
train_df["item_idx"] = item_enc.fit_transform(train_df["item_id"])

user_index_to_id = dict(zip(train_df["user_idx"], train_df["user_id"]))
item_index_to_id = dict(zip(train_df["item_idx"], train_df["item_id"]))

coo = coo_matrix((train_df["value"], (train_df["user_idx"], train_df["item_idx"])))
csr = coo.tocsr()

als = AlternatingLeastSquares(factors=128, regularization=0.01, iterations=15)
als.fit(csr.T)

als_scores = []
for user_idx in range(csr.shape[0]):
    item_indices, scores = als.recommend(user_idx, csr, N=300, filter_already_liked_items=False)
    user_id = user_index_to_id[user_idx]
    for item_idx, score in zip(item_indices, scores):
        item_id = item_index_to_id[item_idx]
        als_scores.append((user_id, item_id, float(score)))

als_df = pd.DataFrame(als_scores, columns=["user_id", "item_id", "score"])
als_df.to_csv("als_recommendation_scores.csv", index=False)



  0%|          | 0/15 [00:00<?, ?it/s]

In [9]:
# Build hybrid recommendations
als_df = pd.read_csv("als_recommendation_scores.csv")
cb_df = pd.read_csv("tag_embedding_scores.csv")

scaler = MinMaxScaler()
als_df["score_als"] = scaler.fit_transform(als_df[["score"]])
cb_df["score_cb"] = scaler.fit_transform(cb_df[["score"]])

merged = pd.merge(
    als_df[["user_id", "item_id", "score_als"]],
    cb_df[["user_id", "item_id", "score_cb"]],
    on=["user_id", "item_id"]
)

alpha = 0.5
merged["score_hybrid"] = alpha * merged["score_als"] + (1 - alpha) * merged["score_cb"]
merged["rank"] = merged.groupby("user_id")["score_hybrid"].rank(ascending=False, method="first")
top_100 = merged[merged["rank"] <= 100]
top_100.to_csv("hybrid_top100.csv", index=False)

In [10]:
# Evaluation
user_splits_df = pd.read_pickle("user_splits_df.pkl")
user_splits_df["user_id"] = user_splits_df.index

hybrid_df = pd.read_csv("hybrid_top100.csv")
hybrid_grouped = hybrid_df.groupby("user_id")["item_id"].apply(list).reset_index()
hybrid_grouped.columns = ["user_id", "hybrid_recommendations"]

# Merge and evaluate
eval_df = pd.merge(user_splits_df, hybrid_grouped, on="user_id")
eval_df["hybrid_recommendations"] = eval_df["hybrid_recommendations"].apply(
    lambda x: ["outfit." + str(i) if not str(i).startswith("outfit.") else str(i) for i in x]
)

def evaluate_hit_rate_at_n(test_ids, predicted_ids, n=100):
    predicted_ids = predicted_ids[:n]
    if not isinstance(test_ids, (list, np.ndarray)):
        test_ids = [test_ids]
    return int(any(tid in predicted_ids for tid in test_ids))

# ID hit rates
eval_df["id_hit_rate_at_10"] = eval_df.apply(
    lambda x: evaluate_hit_rate_at_n(x["test_outfit_id"], x["hybrid_recommendations"], n=10), axis=1)
eval_df["id_hit_rate_at_100"] = eval_df.apply(
    lambda x: evaluate_hit_rate_at_n(x["test_outfit_id"], x["hybrid_recommendations"], n=100), axis=1)

# Group hit rates
outfit_to_group = outfits_df.set_index("id")["group"].to_dict()
eval_df["predicted_groups"] = eval_df["hybrid_recommendations"].apply(
    lambda outfit_ids: [outfit_to_group.get(oid, "") for oid in outfit_ids]
)
eval_df["group_hit_rate_at_10"] = eval_df.apply(
    lambda x: evaluate_hit_rate_at_n(x["test_group"], x["predicted_groups"], n=10), axis=1)
eval_df["group_hit_rate_at_100"] = eval_df.apply(
    lambda x: evaluate_hit_rate_at_n(x["test_group"], x["predicted_groups"], n=100), axis=1)

print(f"Hit Rate @10 (ID): {eval_df['id_hit_rate_at_10'].mean():.4f}")
print(f"Hit Rate @100 (ID): {eval_df['id_hit_rate_at_100'].mean():.4f}")
print(f"Hit Rate @10 (group): {eval_df['group_hit_rate_at_10'].mean():.4f}")
print(f"Hit Rate @100 (group): {eval_df['group_hit_rate_at_100'].mean():.4f}")

Hit Rate @10 (ID): 0.0218
Hit Rate @100 (ID): 0.0233
Hit Rate @10 (group): 0.0305
Hit Rate @100 (group): 0.0331
