In [1]:
! pip install pandas
! pip install numpy
! pip install scikit-learn
! pip install matplotlib
! pip install joblib
! pip install streamlit


Collecting streamlit
  Downloading streamlit-1.50.0-py3-none-any.whl.metadata (9.5 kB)
Collecting pydeck<1,>=0.8.0b4 (from streamlit)
  Downloading pydeck-0.9.1-py2.py3-none-any.whl.metadata (4.1 kB)
Downloading streamlit-1.50.0-py3-none-any.whl (10.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.1/10.1 MB[0m [31m88.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pydeck-0.9.1-py2.py3-none-any.whl (6.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.9/6.9 MB[0m [31m107.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pydeck, streamlit
Successfully installed pydeck-0.9.1 streamlit-1.50.0


In [5]:
# smart_recommender.py
"""
Smart Hybrid Recommender with Weighted Hybrid, Implicit Feedback, and Explainability
Author: Harshit Mittal (adapted)
Date: 2025-10-11
Run: python smart_recommender.py
"""

import os
import math
import random
import pickle
from collections import Counter, defaultdict

import numpy as np
import pandas as pd
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
import joblib

SEED = 42
random.seed(SEED)
np.random.seed(SEED)


# -------------------------
# 1) Utilities & Dataset
# -------------------------
def build_synthetic_dataset(n_users=600, n_items=350, min_ratings_per_user=10, max_ratings_per_user=25):
    """
    Build a synthetic dataset with userId, productId, rating and product metadata.
    Returns: ratings_df (userId, productId, rating), products_df (productId, title, category, description)
    """
    users = [f"U{u}" for u in range(1, n_users + 1)]
    items = [f"P{i}" for i in range(1, n_items + 1)]
    categories = ['Electronics', 'Books', 'Home', 'Toys', 'Beauty', 'Sports', 'Clothing', 'Grocery']

    rows = []
    for u in users:
        k = np.random.randint(min_ratings_per_user, max_ratings_per_user + 1)
        sampled = np.random.choice(items, size=k, replace=False)
        for it in sampled:
            # realistic-ish distribution: normal around 3.6, clipped to [1,5], rounded to 0.5 steps
            r = np.random.normal(loc=3.6, scale=1.0)
            r = min(5.0, max(1.0, r))
            r = round(r * 2) / 2.0
            rows.append((u, it, r))
    ratings_df = pd.DataFrame(rows, columns=['userId', 'productId', 'rating'])

    # product metadata
    product_meta = []
    keywords = {
        'Electronics': ['battery', 'wireless', 'bluetooth', 'USB', 'portable', 'charger'],
        'Books': ['story', 'novel', 'guide', 'history', 'author', 'learn'],
        'Home': ['kitchen', 'durable', 'design', 'compact', 'decor', 'clean'],
        'Toys': ['kids', 'fun', 'safe', 'interactive', 'educational', 'colorful'],
        'Beauty': ['gentle', 'skin', 'organic', 'scent', 'serum', 'moisturizer'],
        'Sports': ['fitness', 'outdoor', 'durable', 'training', 'performance', 'comfort'],
        'Clothing': ['fabric', 'comfortable', 'casual', 'size', 'style', 'soft'],
        'Grocery': ['fresh', 'organic', 'snack', 'ingredients', 'package', 'tasty']
    }
    for i, pid in enumerate(items, start=1):
        cat = random.choice(categories)
        title = f"{cat} Product {i}"
        desc_words = " ".join(np.random.choice(keywords[cat], size=6, replace=True))
        description = f"{title}. {desc_words}. High quality and good value."
        product_meta.append({'productId': pid, 'title': title, 'category': cat, 'description': description})
    products_df = pd.DataFrame(product_meta)
    return ratings_df, products_df


# -------------------------
# 2) Preprocess & EDA (brief)
# -------------------------
def preprocess(ratings_df):
    ratings_df = ratings_df.drop_duplicates().dropna(subset=['userId', 'productId', 'rating']).reset_index(drop=True)
    ratings_df['userId'] = ratings_df['userId'].astype(str)
    ratings_df['productId'] = ratings_df['productId'].astype(str)
    ratings_df['rating'] = pd.to_numeric(ratings_df['rating'], errors='coerce').astype(float)
    return ratings_df


# -------------------------
# 3) Simulate Implicit Feedback
# -------------------------
def add_implicit_scores(ratings_df, click_multiplier=(0.8, 1.4)):
    """
    Create an 'implicit_score' column as a function of explicit rating and a random click/view factor.
    This mimics implicit feedback (clicks, views).
    """
    mults = np.random.uniform(click_multiplier[0], click_multiplier[1], size=len(ratings_df))
    ratings_df = ratings_df.copy()
    ratings_df['implicit_score'] = (ratings_df['rating'] / 5.0) * mults  # normalized
    return ratings_df


# -------------------------
# 4) Collaborative Filtering (matrix factorization) - offline approach
# -------------------------
def train_cf(train_df, n_components=30):
    """
    Train TruncatedSVD on the user-item matrix (train only).
    Returns:
      - svd (TruncatedSVD fitted on filled matrix)
      - user_index (list of user ids), item_index (list of item ids)
      - pred_matrix_df: DataFrame of predicted ratings (users x items)
    """
    # create pivot (users x items) with NaNs for unrated
    user_ids = sorted(train_df['userId'].unique())
    item_ids = sorted(train_df['productId'].unique())
    pivot = train_df.pivot_table(index='userId', columns='productId', values='rating')
    pivot = pivot.reindex(index=user_ids, columns=item_ids)

    # fill missing with user mean (or global mean)
    filled = pivot.copy()
    user_mean = pivot.mean(axis=1)
    global_mean = train_df['rating'].mean()
    for u in user_ids:
        if not np.isnan(user_mean.loc[u]):
            filled.loc[u] = filled.loc[u].fillna(user_mean.loc[u])
        else:
            filled.loc[u] = filled.loc[u].fillna(global_mean)

    svd = TruncatedSVD(n_components=n_components, random_state=SEED)
    user_factors = svd.fit_transform(filled.values)  # users x components
    item_factors = svd.components_.T  # items x components

    reconstructed = np.dot(user_factors, item_factors.T)
    pred_matrix_df = pd.DataFrame(reconstructed, index=user_ids, columns=item_ids)

    return svd, user_ids, item_ids, pred_matrix_df, filled


# -------------------------
# 5) Content-Based Filtering (TF-IDF on product descriptions)
# -------------------------
def train_cbf(products_df, max_features=2000):
    """
    Train TF-IDF on product descriptions and return tfidf vectorizer and matrix (sparse).
    """
    vectorizer = TfidfVectorizer(max_features=max_features, stop_words='english')
    descs = products_df['description'].fillna(products_df['title'].fillna(''))
    tfidf_matrix = vectorizer.fit_transform(descs)
    return vectorizer, tfidf_matrix


# -------------------------
# 6) Scoring & Weighted Hybrid
# -------------------------
def compute_weighted_score(user_id, product_id, pred_matrix_df, tfidf_matrix, products_df,
                           rating_history_count, alpha_cf, alpha_cbf, implicit_weight=0.2,
                           ratings_df=None):
    """
    Compute final score for a user-product combining:
      - collaborative estimate (from pred_matrix_df)
      - content similarity score (if user has liked items)
      - implicit feedback adjustment (if available)
    alpha_cf + alpha_cbf should be 1.0 (weights for CF and CBF).
    implicit_weight is additional multiplier for implicit signals.
    """
    # collaborative estimate (if available)
    try:
        cf_score = pred_matrix_df.loc[user_id, product_id]
    except Exception:
        cf_score = None

    # content-based: average similarity between the product and user's liked items (rating>=4)
    cbf_score = 0.0
    if ratings_df is not None:
        liked = ratings_df[(ratings_df['userId'] == user_id) & (ratings_df['rating'] >= 4.0)]['productId'].tolist()
    else:
        liked = []
    if len(liked) > 0:
        # map product ids to indices in products_df
        prod_to_idx = {pid: idx for idx, pid in enumerate(products_df['productId'])}
        if product_id not in prod_to_idx:
            cbf_score = 0.0
        else:
            p_idx = prod_to_idx[product_id]
            liked_idxs = [prod_to_idx[p] for p in liked if p in prod_to_idx]
            if len(liked_idxs) == 0:
                cbf_score = 0.0
            else:
                # cosine similarity average
                sims = cosine_similarity(tfidf_matrix[p_idx], tfidf_matrix[liked_idxs]).flatten()
                cbf_score = float(np.mean(sims))
    else:
        # cold-start: no liked items -> cbf_score can be similarity to popular items; keep 0 to let global popularity fill
        cbf_score = 0.0

    # Normalize cf_score to 0-1 based on rating range if present (we assume approx 1-5 ratings)
    if cf_score is None:
        cf_norm = 0.0
    else:
        cf_norm = (cf_score - 1.0) / 4.0  # maps 1..5 -> 0..1

    # final weighted score
    score = alpha_cf * cf_norm + alpha_cbf * cbf_score

    # implicit feedback boost: if the user previously had high implicit scores for similar items, boost
    if ratings_df is not None and implicit_weight > 0:
        # compute mean implicit_score for user's liked items (>=4)
        user_implicit_mean = ratings_df[(ratings_df['userId'] == user_id) & (ratings_df['rating'] >= 4.0)]['implicit_score'].mean()
        if not np.isnan(user_implicit_mean):
            score = score * (1.0 + implicit_weight * user_implicit_mean)

    return float(score), float(cf_norm), float(cbf_score)


# -------------------------
# 7) Dynamic weight function
# -------------------------
def dynamic_weights(num_user_ratings, min_ratings=5, max_ratings=40):
    """
    Decide alpha_cf and alpha_cbf dynamically.
    If user has many ratings -> rely more on CF.
    If user has few ratings -> rely more on CBF.
    Returns (alpha_cf, alpha_cbf)
    """
    # map num_user_ratings in [0, max_ratings] to alpha_cf in [0.2, 0.9]
    if num_user_ratings <= min_ratings:
        alpha_cf = 0.2
    else:
        # linear mapping
        alpha_cf = 0.2 + 0.7 * min(num_user_ratings - min_ratings, max_ratings - min_ratings) / (max_ratings - min_ratings)
    alpha_cbf = 1.0 - alpha_cf
    return alpha_cf, alpha_cbf


# -------------------------
# 8) Recommendation function with explanations
# -------------------------
def recommend_products(user_id, n, pred_matrix_df, tfidf_matrix, products_df, ratings_df,
                       method='hybrid', implicit_weight=0.2):
    """
    Return top-n product recommendations for user_id along with short explanations.
    method: 'cf', 'cbf', or 'hybrid'
    Output: list of dicts: {productId, title, score, reason}
    """
    all_products = products_df['productId'].tolist()
    # compute how many ratings the user has in training data
    num_user_ratings = len(ratings_df[ratings_df['userId'] == user_id])
    alpha_cf, alpha_cbf = dynamic_weights(num_user_ratings)

    # fallback popular items if user unknown
    user_known = (user_id in pred_matrix_df.index)
    popular = list(ratings_df['productId'].value_counts().index)

    candidates = [p for p in all_products if p not in ratings_df[ratings_df['userId'] == user_id]['productId'].tolist()]

    scored = []
    for p in candidates:
        if method == 'cf':
            alpha_cf_use, alpha_cbf_use = 1.0, 0.0
        elif method == 'cbf':
            alpha_cf_use, alpha_cbf_use = 0.0, 1.0
        else:  # hybrid
            alpha_cf_use, alpha_cbf_use = alpha_cf, alpha_cbf

        score, cf_norm, cbf_score = compute_weighted_score(
            user_id, p, pred_matrix_df, tfidf_matrix, products_df,
            rating_history_count=num_user_ratings,
            alpha_cf=alpha_cf_use, alpha_cbf=alpha_cbf_use,
            implicit_weight=implicit_weight, ratings_df=ratings_df
        )

        scored.append((p, score, cf_norm, cbf_score))

    # sort by score
    scored_sorted = sorted(scored, key=lambda x: x[1], reverse=True)

    # build explanation for each top item
    prod_to_meta = {row['productId']: row for _, row in products_df.iterrows()}
    recommendations = []
    for pid, score, cf_norm, cbf_score in scored_sorted[:n]:
        meta = prod_to_meta.get(pid, {})
        reasons = []
        # reason templates
        if cf_norm > 0.45:
            # collaborative is strong
            reasons.append("Users with tastes like yours rated this highly.")
        if cbf_score > 0.15:
            # content similarity strong (threshold tunable)
            # find the most similar liked product name to mention
            liked = ratings_df[(ratings_df['userId'] == user_id) & (ratings_df['rating'] >= 4.0)]['productId'].tolist()
            if len(liked) > 0:
                prod_to_idx = {prod: idx for idx, prod in enumerate(products_df['productId'])}
                if pid in prod_to_idx:
                    p_idx = prod_to_idx[pid]
                    liked_idxs = [prod_to_idx[l] for l in liked if l in prod_to_idx]
                    if len(liked_idxs) > 0:
                        sims = cosine_similarity(tfidf_matrix[p_idx], tfidf_matrix[liked_idxs]).flatten()
                        best_idx = liked_idxs[int(np.argmax(sims))]
                        best_prod = products_df.iloc[best_idx]['title']
                        reasons.append(f"Similar to what you liked: '{best_prod}'.")
            else:
                reasons.append("Has features similar to products you might like.")
        # popularity reason
        pop_rank = popular.index(pid) + 1 if pid in popular else None
        if pop_rank and pop_rank <= 20:
            reasons.append(f"Popular (top {min(pop_rank,20)} most-rated items).")
        if len(reasons) == 0:
            reasons.append("Recommended based on hybrid scoring.")

        recommendations.append({
            'productId': pid,
            'title': meta.get('title', ''),
            'category': meta.get('category', ''),
            'score': float(score),
            'reasons': reasons
        })

    # If no candidates or user unknown, fallback to top popular
    if len(recommendations) == 0:
        for pid in popular[:n]:
            meta = prod_to_meta.get(pid, {})
            recommendations.append({
                'productId': pid,
                'title': meta.get('title', ''),
                'category': meta.get('category', ''),
                'score': None,
                'reasons': ["Fallback: popular item."]
            })

    return recommendations


# -------------------------
# 9) Evaluation functions
# -------------------------
def evaluate_rmse(pred_matrix_df, test_df):
    y_true, y_pred = [], []
    missing = 0
    for _, row in test_df.iterrows():
        u, p, r = row['userId'], row['productId'], row['rating']
        if (u in pred_matrix_df.index) and (p in pred_matrix_df.columns):
            y_true.append(r)
            y_pred.append(pred_matrix_df.loc[u, p])
        else:
            missing += 1
    rmse = mean_squared_error(y_true, y_pred) if len(y_true) > 0 else None
    return rmse, missing


def precision_at_k(pred_matrix_df, train_df, test_df, k=5, threshold=4.0):
    users = test_df['userId'].unique()
    precisions = []
    for u in users:
        if u not in pred_matrix_df.index:
            continue
        train_items = set(train_df[train_df['userId'] == u]['productId'].tolist())
        all_items = set(pred_matrix_df.columns)
        candidates = list(all_items - train_items)
        if len(candidates) == 0:
            continue
        scores = [(iid, pred_matrix_df.loc[u, iid]) for iid in candidates]
        scores.sort(key=lambda x: x[1], reverse=True)
        topk = [iid for iid, _ in scores[:k]]
        relevant = set(test_df[(test_df['userId'] == u) & (test_df['rating'] >= threshold)]['productId'].tolist())
        if len(topk) == 0:
            continue
        prec = len([i for i in topk if i in relevant]) / len(topk)
        precisions.append(prec)
    return np.mean(precisions) if len(precisions) > 0 else 0.0


# -------------------------
# 10) Orchestration: train everything and demo
# -------------------------
def train_and_demo(save_artifacts=True):
    print(">> Building dataset...")
    ratings_df, products_df = build_synthetic_dataset()
    ratings_df = preprocess(ratings_df)
    ratings_df = add_implicit_scores(ratings_df)

    print(">> Train/test split...")
    train_df, test_df = train_test_split(ratings_df, test_size=0.2, random_state=SEED)

    print(">> Training CF (TruncatedSVD)...")
    svd, user_ids, item_ids, pred_matrix_df, filled_matrix = train_cf(train_df, n_components=30)
    print("CF trained. Pred matrix shape:", pred_matrix_df.shape)

    print(">> Training CBF (TF-IDF)...")
    tfidf_vectorizer, tfidf_matrix = train_cbf(products_df)

    print(">> Evaluation (RMSE + Precision@5)...")
    rmse, missing = evaluate_rmse(pred_matrix_df, test_df)
    p5 = precision_at_k(pred_matrix_df, train_df, test_df, k=5, threshold=4.0)
    print(f"RMSE (approx): {rmse:.4f}, missing preds: {missing}")
    print(f"Precision@5 (CF approx): {p5:.4f}")

    # Quick demo: show recommendations for three users
    sample_users = train_df['userId'].unique()[:3]
    for u in sample_users:
        recs = recommend_products(u, n=5, pred_matrix_df=pred_matrix_df,
                                  tfidf_matrix=tfidf_matrix, products_df=products_df,
                                  ratings_df=train_df, method='hybrid')
        print(f"\nTop 5 hybrid recs for {u}:")
        for r in recs:
            print(f"- {r['productId']} | {r['title'][:40]} | score={r['score']:.4f}")
            for reason in r['reasons']:
                print("   •", reason)

    # Optionally save artifacts for Streamlit app
    if save_artifacts:
        out_dir = "recommender_app/models"
        os.makedirs(out_dir, exist_ok=True)
        joblib.dump(pred_matrix_df, os.path.join(out_dir, "pred_matrix_df.pkl"))
        joblib.dump(svd, os.path.join(out_dir, "truncated_svd.pkl"))
        joblib.dump(tfidf_vectorizer, os.path.join(out_dir, "tfidf_vectorizer.pkl"))
        joblib.dump(tfidf_matrix, os.path.join(out_dir, "tfidf_matrix.pkl"))
        products_df.to_pickle(os.path.join(out_dir, "products_df.pkl"))
        train_df.to_pickle(os.path.join(out_dir, "train_ratings_df.pkl"))
        print(f"\nSaved artifacts under {out_dir}")

    # return main objects for interactive use
    return {
        'pred_matrix_df': pred_matrix_df,
        'tfidf_matrix': tfidf_matrix,
        'products_df': products_df,
        'train_df': train_df,
        'svd': svd,
        'tfidf_vectorizer': tfidf_vectorizer
    }


if __name__ == "__main__":
    artifacts = train_and_demo(save_artifacts=True)

    # Example: interactive usage
    user_to_try = artifacts['train_df']['userId'].unique()[0]
    print("\nInteractive example for user:", user_to_try)
    recs = recommend_products(user_to_try, n=7,
                              pred_matrix_df=artifacts['pred_matrix_df'],
                              tfidf_matrix=artifacts['tfidf_matrix'],
                              products_df=artifacts['products_df'],
                              ratings_df=artifacts['train_df'],
                              method='hybrid')
    for r in recs:
        print(f"- {r['productId']} | {r['title']} | score={r['score']:.4f} -- reasons: {r['reasons']}")

>> Building dataset...
>> Train/test split...
>> Training CF (TruncatedSVD)...
CF trained. Pred matrix shape: (600, 350)
>> Training CBF (TF-IDF)...
>> Evaluation (RMSE + Precision@5)...
RMSE (approx): 0.9173, missing preds: 0
Precision@5 (CF approx): 0.0045

Top 5 hybrid recs for U378:
- P12 | Books Product 12 | score=0.5282
   • Users with tastes like yours rated this highly.
   • Similar to what you liked: 'Books Product 198'.
- P48 | Books Product 48 | score=0.5261
   • Users with tastes like yours rated this highly.
   • Similar to what you liked: 'Books Product 149'.
- P44 | Books Product 44 | score=0.5235
   • Users with tastes like yours rated this highly.
   • Similar to what you liked: 'Books Product 99'.
- P126 | Books Product 126 | score=0.5212
   • Users with tastes like yours rated this highly.
   • Similar to what you liked: 'Books Product 99'.
- P152 | Books Product 152 | score=0.5150
   • Users with tastes like yours rated this highly.
   • Similar to what you liked: '