In [24]:
# Install the notebook's dependencies in one resolver pass.
# `%pip` (rather than `!pip`) installs into the environment of the running kernel.
%pip install pandas numpy scikit-learn matplotlib joblib streamlit




In [25]:
!pip -q install pandas numpy scikit-learn joblib streamlit cloudflared


In [42]:
%%writefile /content/smart_recommender.py
import os, math, random
import numpy as np, pandas as pd
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
import joblib

# Single seed shared by the synthetic-data generator, the train/test split and the SVD.
SEED=42
# Both RNGs are seeded because this module draws from `random` and from `np.random`.
random.seed(SEED); np.random.seed(SEED)

def build_synth(n_users=600,n_items=350,minr=10,maxr=25):
    """Generate a synthetic ratings table and a matching product catalogue.

    Args:
        n_users: number of users; ids are "U1".."U<n_users>".
        n_items: number of products; ids are "P1".."P<n_items>".
        minr: minimum number of products each user rates (inclusive).
        maxr: maximum number of products each user rates (inclusive).

    Returns:
        (ratings, products) where ``ratings`` has columns
        userId/productId/rating and ``products`` has columns
        productId/title/category/description.

    Draws come from the global ``np.random`` and ``random`` generators, so the
    result is repeatable only if the caller seeds both beforehand.
    """
    user_ids = [f"U{n}" for n in range(1, n_users + 1)]
    item_ids = [f"P{n}" for n in range(1, n_items + 1)]
    categories = ['Electronics','Books','Home','Toys','Beauty','Sports','Clothing','Grocery']
    keywords = {
        'Electronics': ['battery','wireless','bluetooth','USB','portable','charger'],
        'Books': ['story','novel','guide','history','author','learn'],
        'Home': ['kitchen','durable','design','compact','decor','clean'],
        'Toys': ['kids','fun','safe','interactive','educational','colorful'],
        'Beauty': ['gentle','skin','organic','scent','serum','moisturizer'],
        'Sports': ['fitness','outdoor','durable','training','performance','comfort'],
        'Clothing': ['fabric','comfortable','casual','size','style','soft'],
        'Grocery': ['fresh','organic','snack','ingredients','package','tasty'],
    }

    # Ratings: every user rates a random subset of products (no repeats per user).
    rating_rows = []
    for uid in user_ids:
        n_rated = np.random.randint(minr, maxr + 1)
        chosen = np.random.choice(item_ids, size=n_rated, replace=False)
        for pid in chosen:
            raw = np.random.normal(3.6, 1.0)
            clipped = np.clip(raw, 1, 5)
            # Snap to the nearest half star.
            rating_rows.append((uid, pid, round(clipped * 2) / 2.0))
    ratings = pd.DataFrame(rating_rows, columns=['userId','productId','rating'])

    def _describe(position, pid):
        # One catalogue record: random category plus six keywords drawn with replacement.
        cat = random.choice(categories)
        name = f"{cat} Product {position}"
        blurb = " ".join(np.random.choice(keywords[cat], size=6, replace=True))
        return {
            'productId': pid,
            'title': name,
            'category': cat,
            'description': f"{name}. {blurb}. High quality and good value.",
        }

    products = pd.DataFrame([_describe(pos, pid) for pos, pid in enumerate(item_ids, 1)])
    return ratings, products

def preprocess(df):
    """Clean a raw ratings frame.

    Ratings are coerced to float first, so values that cannot be parsed become
    NaN and are removed by the same ``dropna`` that removes rows with a missing
    userId or productId. Ids are then normalised to ``str`` and exact duplicate
    rows are dropped (after normalisation, so ``1`` and ``"1"`` count as the
    same id).

    Args:
        df: DataFrame with at least userId, productId and rating columns.

    Returns:
        A new DataFrame with a fresh 0..n-1 index; the input is not modified.
    """
    numeric_rating = pd.to_numeric(df['rating'], errors='coerce').astype(float)
    cleaned = df.assign(rating=numeric_rating).dropna(subset=['userId','productId','rating'])
    cleaned = cleaned.assign(
        userId=cleaned['userId'].astype(str),
        productId=cleaned['productId'].astype(str),
    )
    return cleaned.drop_duplicates().reset_index(drop=True)

def add_implicit(df):
    """Return a copy of ``df`` with an ``implicit_score`` column added.

    The score is the rating scaled to rating/5 and multiplied by a per-row
    random factor drawn uniformly from [0.8, 1.4) using the global
    ``np.random`` generator. The input frame is left untouched.
    """
    noise = np.random.uniform(0.8, 1.4, size=len(df))
    enriched = df.copy()
    enriched['implicit_score'] = enriched['rating'].div(5.0).mul(noise)
    return enriched

def train_cf(train_df,n_components=30):
    """Fit a truncated-SVD collaborative-filtering model on the training ratings.

    A user x item matrix is built from ``train_df``; missing cells are filled
    with the user's mean rating (or the global mean when a user has no usable
    rating), the matrix is factorised, and the low-rank reconstruction is
    returned as the prediction matrix.

    Args:
        train_df: DataFrame with userId, productId and rating columns.
        n_components: requested number of latent factors. It is capped at the
            number of distinct items, because TruncatedSVD cannot extract more
            components than there are features.

    Returns:
        (svd, pred_df, filled): the fitted TruncatedSVD, the predicted-rating
        DataFrame (users x items) and the mean-filled input matrix.
    """
    users=sorted(train_df['userId'].unique())
    items=sorted(train_df['productId'].unique())
    pivot=(train_df.pivot_table(index='userId',columns='productId',values='rating')
           .reindex(index=users,columns=items))
    gmean=train_df['rating'].mean()
    # Per-user fill value; fall back to the global mean where the user mean is NaN.
    row_fill=pivot.mean(axis=1).fillna(gmean)
    # Column-wise fill: each column is indexed by user, so fillna aligns on userId.
    filled=pivot.apply(lambda col: col.fillna(row_fill))
    # Small catalogues would otherwise make TruncatedSVD raise.
    k=max(1,min(n_components,len(items)))
    svd=TruncatedSVD(n_components=k,random_state=SEED)
    U=svd.fit_transform(filled.values)
    # U @ components_ is the rank-k reconstruction of the filled matrix.
    pred=np.dot(U,svd.components_)
    pred_df=pd.DataFrame(pred,index=users,columns=items)
    return svd,pred_df,filled

def train_cbf(products,max_features=2000):
    """Fit a TF-IDF model over the product text.

    Each product is represented by its description; where the description is
    missing the title is used, and an empty string if both are missing.

    Args:
        products: DataFrame with 'description' and 'title' columns.
        max_features: vocabulary size limit passed to TfidfVectorizer.

    Returns:
        (vec, X): the fitted vectorizer and the TF-IDF matrix, one row per
        product in the order of ``products``.
    """
    fallback_text = products['title'].fillna('')
    corpus = products['description'].fillna(fallback_text)
    vec = TfidfVectorizer(stop_words='english', max_features=max_features)
    X = vec.fit_transform(corpus)
    return vec, X

def evaluate_rmse(pred_df,test_df):
    """Root-mean-squared error of the predicted ratings on the held-out ratings.

    Only test rows whose user and product both appear in ``pred_df`` are
    scored. The RMSE is computed with numpy directly instead of
    ``mean_squared_error(..., squared=False)``, because the ``squared``
    argument is not accepted by newer scikit-learn releases.

    Args:
        pred_df: predicted ratings, indexed by userId with productId columns
            (both expected to be unique).
        test_df: DataFrame with userId, productId and rating columns.

    Returns:
        The RMSE as a float, or None when no test row can be scored.
    """
    scorable = test_df[test_df['userId'].isin(pred_df.index)
                       & test_df['productId'].isin(pred_df.columns)]
    if scorable.empty:
        return None
    # Translate labels to positions once, then read every prediction in one fancy-index.
    row_pos = pred_df.index.get_indexer(scorable['userId'])
    col_pos = pred_df.columns.get_indexer(scorable['productId'])
    y_pred = pred_df.to_numpy(dtype=float)[row_pos, col_pos]
    y_true = scorable['rating'].to_numpy(dtype=float)
    return float(np.sqrt(np.mean((y_true - y_pred) ** 2)))

def precision_at_k(pred_df,train_df,test_df,k=5,thr=4.0):
    """Mean precision@k of the prediction matrix over the test users.

    For each test user present in ``pred_df``, the products the user did not
    rate in ``train_df`` are ranked by predicted rating; the precision is the
    share of the top ``k`` that the user rated at least ``thr`` in ``test_df``.
    Users absent from ``pred_df``, users with no unrated product, and users
    with an empty top list are skipped.

    Returns:
        The average precision over scored users as a float, 0.0 if none.
    """
    per_user = []
    for uid in test_df['userId'].unique():
        if uid not in pred_df.index:
            continue
        already_rated = set(train_df.loc[train_df['userId'] == uid, 'productId'])
        pool = [item for item in pred_df.columns if item not in already_rated]
        if len(pool) == 0:
            continue
        # Stable descending sort: ties keep the column order of pred_df.
        scored = [(item, pred_df.loc[uid, item]) for item in pool]
        scored.sort(key=lambda pair: pair[1], reverse=True)
        picked = [item for item, _ in scored[:k]]
        if len(picked) == 0:
            continue
        liked_mask = (test_df['userId'] == uid) & (test_df['rating'] >= thr)
        liked = set(test_df.loc[liked_mask, 'productId'])
        hits = sum(1 for item in picked if item in liked)
        per_user.append(hits / len(picked))
    if not per_user:
        return 0.0
    return float(np.mean(per_user))

def train_and_save(out="/content/recommender_app/models"):
    """Run the whole pipeline and persist the artifacts the Streamlit app loads.

    Builds the synthetic data, cleans it, splits it 80/20, trains the SVD
    collaborative filter and the TF-IDF content model, prints RMSE and
    Precision@5, and writes the prediction matrix, both models, the TF-IDF
    matrix, the product table and the training ratings into ``out``.

    Args:
        out: directory to write the artifacts to (created if missing). The
            default is the location the notebook's app reads from.
    """
    ratings,products=build_synth()
    ratings=add_implicit(preprocess(ratings))
    train_df,test_df=train_test_split(ratings,test_size=0.2,random_state=SEED)
    svd,pred_df,_=train_cf(train_df,30)
    vec,X=train_cbf(products)
    rmse=evaluate_rmse(pred_df,test_df)
    p5=precision_at_k(pred_df,train_df,test_df,5,4.0)
    # evaluate_rmse returns None when no test row can be scored; formatting None with :.4f would raise.
    rmse_text=f"{rmse:.4f}" if rmse is not None else "n/a"
    print(f"RMSE: {rmse_text} | Precision@5: {p5:.4f}")
    os.makedirs(out,exist_ok=True)
    joblib.dump(pred_df, f"{out}/pred_matrix_df.pkl")
    joblib.dump(svd, f"{out}/truncated_svd.pkl")
    joblib.dump(vec, f"{out}/tfidf_vectorizer.pkl")
    joblib.dump(X, f"{out}/tfidf_matrix.pkl")
    products.to_pickle(f"{out}/products_df.pkl")
    train_df.to_pickle(f"{out}/train_ratings_df.pkl")
    print("Artifacts saved to", out)

# Running this file as a script trains the models and writes the artifacts to disk.
if __name__=="__main__":
    train_and_save()


Overwriting /content/smart_recommender.py


In [43]:
%%bash
python - <<'PY'
# Patch evaluate_rmse in the generated module so it no longer depends on
# sklearn's `squared=` argument. Safe to run more than once.
from pathlib import Path
p = Path("/content/smart_recommender.py")
if not p.exists():
    print("ERROR: /content/smart_recommender.py not found. Make sure file path is correct.")
    raise SystemExit(1)

text = p.read_text()

# Exact text of the function as first written by the %%writefile cell.
old_snip = """
def evaluate_rmse(pred_df,test_df):
    y_true=[]; y_pred=[]
    for _,r in test_df.iterrows():
        u,p,rt=r['userId'],r['productId'],r['rating']
        if (u in pred_df.index) and (p in pred_df.columns):
            y_true.append(rt); y_pred.append(pred_df.loc[u,p])
    return mean_squared_error(y_true,y_pred,squared=False) if y_true else None
"""

new_snip = """
import numpy as np
def evaluate_rmse(pred_df,test_df):
    # compute RMSE without relying on sklearn's 'squared' parameter for compatibility
    y_true = []
    y_pred = []
    for _,r in test_df.iterrows():
        u = r['userId']; p = r['productId']; rt = r['rating']
        if (u in pred_df.index) and (p in pred_df.columns):
            y_true.append(rt)
            y_pred.append(pred_df.loc[u, p])
    if len(y_true) == 0:
        return None
    y_true = np.array(y_true, dtype=float)
    y_pred = np.array(y_pred, dtype=float)
    mse = np.mean((y_true - y_pred) ** 2)
    return float(np.sqrt(mse))
"""

if old_snip in text:
    text = text.replace(old_snip, new_snip)
    p.write_text(text)
    print("Patched evaluate_rmse in /content/smart_recommender.py")
elif new_snip in text:
    # Re-running the cell after a successful patch must not touch the file again.
    print("evaluate_rmse is already patched in /content/smart_recommender.py")
else:
    # Fallback: the function exists but its text differs from old_snip.
    # Replace from the `def` line up to the next top-level statement (or end of file).
    import re
    pattern = re.compile(r"^def evaluate_rmse\(.*?(?=^\S|\Z)", re.MULTILINE | re.DOTALL)
    replacement = new_snip.lstrip("\n") + "\n"
    # A callable replacement keeps `re` from interpreting backslashes in the new code.
    text, n_subs = pattern.subn(lambda _m: replacement, text, count=1)
    if n_subs:
        p.write_text(text)
        print("Replaced evaluate_rmse via regex in /content/smart_recommender.py")
    else:
        print("Could not find evaluate_rmse function. Please open the file and replace the function with the new version manually.")
PY


Patched evaluate_rmse in /content/smart_recommender.py


In [44]:
!python /content/smart_recommender.py


RMSE: 0.9577 | Precision@5: 0.0045
Artifacts saved to /content/recommender_app/models


In [45]:
%%writefile /content/streamlit_recommender_app.py
import streamlit as st
import pandas as pd
import numpy as np
import joblib
from sklearn.metrics.pairwise import cosine_similarity

@st.cache_resource
def load_artifacts():
    """Load the trained artifacts the dashboard needs (cached by Streamlit).

    Returns:
        (pred_matrix_df, tfidf_vectorizer, tfidf_matrix, products_df,
        ratings_df), in that order.

    The fitted SVD model is deliberately not loaded: the app only reads the
    precomputed prediction matrix and never used the SVD object.
    """
    model_dir = "/content/recommender_app/models"
    # NOTE: joblib/pickle files can execute arbitrary code when loaded; only
    # load artifacts written by this project's own training script.
    pred_matrix_df = joblib.load(f"{model_dir}/pred_matrix_df.pkl")
    tfidf_vectorizer = joblib.load(f"{model_dir}/tfidf_vectorizer.pkl")
    tfidf_matrix = joblib.load(f"{model_dir}/tfidf_matrix.pkl")
    products_df = pd.read_pickle(f"{model_dir}/products_df.pkl")
    ratings_df = pd.read_pickle(f"{model_dir}/train_ratings_df.pkl")
    return pred_matrix_df, tfidf_vectorizer, tfidf_matrix, products_df, ratings_df

pred_matrix_df, tfidf_vectorizer, tfidf_matrix, products_df, ratings_df = load_artifacts()

# Placeholder image URL per product category.
CATEGORY_TO_IMAGE = {
    "Electronics": "https://source.unsplash.com/800x520/?electronics,gadgets",
    "Books": "https://source.unsplash.com/800x520/?books,reading",
    "Home": "https://source.unsplash.com/800x520/?home,interior",
    "Toys": "https://source.unsplash.com/800x520/?toys,kids",
    "Beauty": "https://source.unsplash.com/800x520/?beauty,cosmetics",
    "Sports": "https://source.unsplash.com/800x520/?sports,fitness",
    "Clothing": "https://source.unsplash.com/800x520/?clothing,fashion",
    "Grocery": "https://source.unsplash.com/800x520/?grocery,food",
}


def img_for(cat):
    """Return the placeholder image URL for ``cat``, or a generic product image."""
    if cat in CATEGORY_TO_IMAGE:
        return CATEGORY_TO_IMAGE[cat]
    return "https://source.unsplash.com/800x520/?product"

# productId -> row position in products_df; the same position is used to index rows of tfidf_matrix.
PROD_TO_IDX = {pid: idx for idx, pid in enumerate(products_df["productId"])}
# Product ids ordered by how many ratings they have in ratings_df (value_counts sorts most-frequent first).
POPULAR_ORDER = list(ratings_df["productId"].value_counts().index)

def dynamic_weights(num_ratings, min_r=5, max_r=40):
    """Blend weights for the hybrid score, based on how many ratings a user has.

    The collaborative weight is 0.25 up to ``min_r`` ratings and grows
    linearly to 0.90 at ``max_r`` ratings, staying there beyond it. The
    content-based weight is the remainder.

    Returns:
        (a_cf, a_cbf) with a_cf + a_cbf == 1.0.
    """
    a_cf = 0.25
    if num_ratings > min_r:
        width = max_r - min_r
        progress = min(num_ratings - min_r, width)
        a_cf = 0.25 + 0.65 * progress / width
    return a_cf, 1.0 - a_cf

def build_explanations(user_id, pid, cf_raw, cbf_score):
    """Build the human-readable reasons shown under a recommendation.

    Up to three signals are reported, in this order: a high collaborative
    prediction, textual similarity to a product the user rated >= 4.0, and
    being among the 20 most-rated products. A generic reason is used when
    none of them applies.

    Args:
        user_id: id of the user receiving the recommendation.
        pid: id of the recommended product.
        cf_raw: predicted rating for (user, product), or None.
        cbf_score: content-based similarity score of the product.

    Returns:
        A non-empty list of reason strings.
    """
    notes = []

    # Predicted rating rescaled with (r - 1) / 4, i.e. 1 -> 0 and 5 -> 1.
    cf_scaled = 0.0 if cf_raw is None else (cf_raw - 1) / 4
    if cf_scaled > 0.5:
        notes.append("Users with tastes like yours rated this highly.")

    liked_mask = (ratings_df.userId == user_id) & (ratings_df.rating >= 4.0)
    liked_ids = ratings_df[liked_mask]['productId'].tolist()
    if cbf_score > 0.12 and liked_ids:
        liked_rows = [PROD_TO_IDX[item] for item in liked_ids if item in PROD_TO_IDX]
        if pid not in PROD_TO_IDX or not liked_rows:
            notes.append("Shares features with items you liked.")
        else:
            # Name the liked product whose text is closest to this one.
            target_row = PROD_TO_IDX[pid]
            scores = cosine_similarity(tfidf_matrix[target_row], tfidf_matrix[liked_rows]).flatten()
            closest_row = liked_rows[int(np.argmax(scores))]
            closest_title = products_df.iloc[closest_row]['title']
            notes.append(f"Similar to what you liked: '{closest_title}'.")

    if pid in POPULAR_ORDER:
        rank = 1 + POPULAR_ORDER.index(pid)
        if rank <= 20:
            notes.append(f"Popular choice (top {rank} most-rated).")

    if len(notes) == 0:
        notes.append("Recommended by hybrid model signals.")
    return notes

def recommend(user_id, n=5):
    """Rank the products a user has not rated yet with the weighted hybrid score.

    score = a_cf * (cf_raw - 1) / 4 + a_cbf * cbf_sim, where ``cf_raw`` is the
    predicted rating (0.0 when no prediction exists) and ``cbf_sim`` is the
    mean TF-IDF cosine similarity to the products the user rated >= 4.0.

    Args:
        user_id: id of the user to recommend for.
        n: number of recommendations to return.

    Returns:
        (out, a_cf, a_cbf): a DataFrame with the top ``n`` products (columns
        productId, title, category, description, hybrid_score, cf_raw,
        cbf_sim, reasons, image_url) and the two blend weights. The frame is
        empty, with the same columns, when the user has rated every product.
    """
    columns = ["productId", "title", "category", "description",
               "hybrid_score", "cf_raw", "cbf_sim", "reasons", "image_url"]

    # Everything that depends only on the user is computed once, not per candidate.
    user_ratings = ratings_df[ratings_df.userId==user_id]
    seen = set(user_ratings['productId'])
    cand = [p for p in products_df['productId'].tolist() if p not in seen]
    a_cf,a_cbf = dynamic_weights(len(user_ratings))
    if not cand:
        # Without this guard sort_values("hybrid_score") fails on a column-less frame.
        return pd.DataFrame(columns=columns), a_cf, a_cbf

    liked = user_ratings[user_ratings.rating>=4.0]['productId'].tolist()
    liked_idxs = [PROD_TO_IDX[l] for l in liked if l in PROD_TO_IDX]
    liked_matrix = tfidf_matrix[liked_idxs] if liked_idxs else None
    has_user = user_id in pred_matrix_df.index
    known_items = set(pred_matrix_df.columns)
    # First row per productId, matching a `.loc[mask].iloc[0]` lookup.
    meta_by_pid = products_df.drop_duplicates('productId').set_index('productId')

    rows=[]
    for p in cand:
        # NOTE(review): a missing prediction becomes cf_raw=0.0, i.e. cf_norm=-0.25,
        # which penalises the item rather than being neutral; confirm that is intended.
        cf_raw = float(pred_matrix_df.loc[user_id,p]) if (has_user and p in known_items) else 0.0
        cf_norm = (cf_raw-1)/4
        if liked_matrix is not None and p in PROD_TO_IDX:
            sims = cosine_similarity(tfidf_matrix[PROD_TO_IDX[p]], liked_matrix)
            cbf_sim = float(sims.flatten().mean())
        else:
            cbf_sim = 0.0
        meta = meta_by_pid.loc[p]
        rows.append({
            "productId": p,
            "title": meta["title"],
            "category": meta["category"],
            "description": meta["description"],
            "hybrid_score": float(a_cf*cf_norm + a_cbf*cbf_sim),
            "cf_raw": float(cf_raw),
            "cbf_sim": float(cbf_sim),
            "image_url": img_for(meta["category"]),
        })

    top = pd.DataFrame(rows).sort_values("hybrid_score", ascending=False).head(n).reset_index(drop=True)
    # Explanations are only needed for the rows that are actually returned.
    top["reasons"] = pd.Series(
        [build_explanations(user_id, r.productId, r.cf_raw, r.cbf_sim) for r in top.itertuples(index=False)],
        index=top.index, dtype=object,
    )
    return top[columns], a_cf, a_cbf

# ---- Page header ----
st.title("🛒 Smart Product Recommender Dashboard (Colab)")
st.caption("Weighted Hybrid: Collaborative + Content-Based with explanations")

# ---- Inputs: the user list comes from the training ratings loaded above ----
users = sorted(ratings_df['userId'].unique())
user = st.selectbox("Select User ID", users)
# Slider arguments are (label, min, max, default).
n = st.slider("Number of recommendations", 3, 10, 5)

# ---- Results: computed only when the button is pressed ----
if st.button("🔍 Get Recommendations"):
    df, a_cf, a_cbf = recommend(user, n)
    st.subheader(f"🎯 Recommendations for {user}")
    st.caption(f"Weights → CF: {a_cf:.2f} | CBF: {a_cbf:.2f}")
    # One card per recommended product.
    for _, row in df.iterrows():
        st.image(row["image_url"])
        st.markdown(f"### {row['title']}")
        st.markdown(f"*Category:* {row['category']}")
        st.markdown(f"**Hybrid Score:** {row['hybrid_score']:.3f}  |  **CF(raw):** {row['cf_raw']:.3f}  |  **CBF(sim):** {row['cbf_sim']:.3f}")
        st.markdown("**Why this?**")
        for reason in row["reasons"]:
            st.markdown(f"- {reason}")
        # Descriptions longer than 220 characters are truncated with an ellipsis.
        st.caption(row["description"][:220] + ("..." if len(row["description"])>220 else ""))
        st.divider()

# ---- Sidebar: short explanation of the method ----
st.sidebar.header("📈 How it works")
st.sidebar.markdown("- **CF** learns from similar users' ratings.\n- **CBF** compares product text (TF-IDF).\n- **Weights** adapt to how active the user is (cold-start aware).\n- Unsplash placeholders used for images; replace with your own if available.")


Overwriting /content/streamlit_recommender_app.py


In [46]:
# Start Streamlit in the background; stdout and stderr go to /content/streamlit.log
!streamlit run /content/streamlit_recommender_app.py &>/content/streamlit.log &

# Expose port 8501 through a Cloudflare tunnel, wait 3 seconds, then print the first
# public URL found in /content/cf.log. If the URL has not been logged within those
# 3 seconds, grep prints nothing; re-run the grep on the log later in that case.
!nohup cloudflared tunnel --url http://localhost:8501 --no-autoupdate > /content/cf.log 2>&1 & sleep 3; grep -o 'https://.*trycloudflare.com' -m 1 /content/cf.log


In [47]:
# 1) Reinstall/ensure deps
!pip -q install cloudflared streamlit >/dev/null

# 2) Kill any stale processes
!pkill -f streamlit || true
!pkill -f cloudflared || true

# 3) Start Streamlit (headless) in background
!streamlit run /content/streamlit_recommender_app.py --server.port 8501 --server.headless true &>/content/streamlit.log &

# 4) Start Cloudflare tunnel in background
!nohup cloudflared tunnel --url http://localhost:8501 --no-autoupdate > /content/cf.log 2>&1 &

^C
^C


In [48]:
# Is Streamlit actually running?
!ps -ef | grep streamlit | grep -v grep

# Any traceback in Streamlit logs?
!sed -n '1,200p' /content/streamlit.log

# Any errors in cloudflared?
!sed -n '1,200p' /content/cf.log

root        6405       1 27 12:16 ?        00:00:01 /usr/bin/python3 /usr/local/bin/streamlit run /content/streamlit_recommender_app.py --server.port 8501 --server.headless true

Collecting usage statistics. To deactivate, set browser.gatherUsageStats to false.


  You can now view your Streamlit app in your browser.

  Local URL: http://localhost:8501
  Network URL: http://172.28.0.12:8501
  External URL: http://34.80.34.3:8501

2025-10-12T12:16:08Z INF Thank you for trying Cloudflare Tunnel. Doing so, without a Cloudflare account, is a quick way to experiment and try it out. However, be aware that these account-less Tunnels have no uptime guarantee, are subject to the Cloudflare Online Services Terms of Use (https://www.cloudflare.com/website-terms/), and Cloudflare reserves the right to investigate your use of Tunnels for violations of such terms. If you intend to use Tunnels in production you should use a pre-created named tunnel by following: https://developers.cloudflare.com/clou