In [1]:
import os, time, pickle
from typing import List
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from openai import OpenAI
from openai import RateLimitError

In [2]:
# --- OpenAI access ---------------------------------------------------------
# The key is read from the environment; without one, `client` stays None and
# the helpers that need it raise instead of calling the API.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
if OPENAI_API_KEY:
    client = OpenAI(api_key=OPENAI_API_KEY)
else:
    client = None

# --- Tunables --------------------------------------------------------------
EMBED_MODEL = "text-embedding-ada-002"  # model name passed to embeddings.create
EMBEDDINGS_CACHE_FILE = "embeddings_cache.pkl"  # pickle holding catalog vectors
GOOD_MATCH_THRESHOLD = 0.7  # minimum top score that counts as a "good match"

# --- Mutable search state (rebound by the functions below) ------------------
_TFIDF = _TFIDF_MATRIX = None  # fitted vectorizer / catalog matrix, once built
_MODE = "openai"  # active scoring backend: "openai" or "tfidf"

In [3]:
def create_catalog():
    """Build the ten-item demo product catalog.

    Returns:
        pd.DataFrame: one row per product with the columns ``name``,
        ``description``, ``price``, ``category``, ``vibe_tags`` (a list of
        strings) and ``views``, in that order.
    """
    columns = ["name", "description", "price", "category", "vibe_tags", "views"]

    # Each tuple follows the order of `columns` above.
    rows = [
        (
            "Boho Maxi Dress",
            "Flowy maxi dress with earthy tones and floral patterns. Perfect for festival vibes and relaxed summer days.",
            79.99,
            "Dresses",
            ["boho", "relaxed", "earthy", "festival"],
            1250,
        ),
        (
            "Classic Leather Jacket",
            "Timeless black leather jacket with silver zippers. Edgy urban style for city nights, durable and bold.",
            189.99,
            "Outerwear",
            ["urban", "edgy", "bold", "cool"],
            2100,
        ),
        (
            "Cozy Cashmere Sweater",
            "Ultra-soft cashmere sweater in warm cream. Cozy winter evenings by the fireplace; oversized comfort.",
            159.99,
            "Tops",
            ["cozy", "luxurious", "comfortable", "winter"],
            1800,
        ),
        (
            "High-Waist Skinny Jeans",
            "Modern high-waist skinny jeans in dark wash. Sleek city-style essential for professional casual looks.",
            89.99,
            "Bottoms",
            ["modern", "urban", "sleek", "professional"],
            3200,
        ),
        (
            "Linen Beach Shirt",
            "Breathable white linen shirt perfect for beach vacations and tropical getaways; relaxed and breezy.",
            54.99,
            "Tops",
            ["beach", "casual", "breezy", "vacation"],
            980,
        ),
        (
            "Tailored Blazer",
            "Sharp navy blazer with structured shoulders. Professional office wear for business meetings.",
            149.99,
            "Outerwear",
            ["professional", "formal", "sharp", "business"],
            1650,
        ),
        (
            "Floral Sundress",
            "Bright floral print sundress with spaghetti straps. Fun playful summer style for outdoor brunch.",
            64.99,
            "Dresses",
            ["playful", "summer", "beach", "fun"],
            1420,
        ),
        (
            "Chunky Sneaker Boots",
            "Trendy white sneaker boots with platform sole. Street-style essential for energetic urban explorers.",
            119.99,
            "Shoes",
            ["trendy", "streetwear", "energetic", "urban"],
            2800,
        ),
        (
            "Vintage Denim Jacket",
            "Classic light-wash denim jacket with distressed details. Retro 90s vibe for casual everyday looks.",
            74.99,
            "Outerwear",
            ["vintage", "retro", "casual", "90s"],
            1950,
        ),
        (
            "Silk Evening Gown",
            "Elegant floor-length silk gown in deep burgundy. Sophisticated glamour for formal events and galas.",
            299.99,
            "Dresses",
            ["elegant", "glamorous", "formal", "sophisticated"],
            890,
        ),
    ]
    return pd.DataFrame(rows, columns=columns)


# Build the demo catalog once; the cells below pass this `df` to the
# embedding and search helpers.
df = create_catalog()
df.head()  # preview the first rows as the cell output

Unnamed: 0,name,description,price,category,vibe_tags,views
0,Boho Maxi Dress,Flowy maxi dress with earthy tones and floral ...,79.99,Dresses,"[boho, relaxed, earthy, festival]",1250
1,Classic Leather Jacket,Timeless black leather jacket with silver zipp...,189.99,Outerwear,"[urban, edgy, bold, cool]",2100
2,Cozy Cashmere Sweater,Ultra-soft cashmere sweater in warm cream. Coz...,159.99,Tops,"[cozy, luxurious, comfortable, winter]",1800
3,High-Waist Skinny Jeans,Modern high-waist skinny jeans in dark wash. S...,89.99,Bottoms,"[modern, urban, sleek, professional]",3200
4,Linen Beach Shirt,Breathable white linen shirt perfect for beach...,54.99,Tops,"[beach, casual, breezy, vacation]",980


In [4]:
def _sleep_backoff(attempt: int, base: float = 0.5, cap: float = 8.0):
    delay = min(cap, base * (2**attempt))
    time.sleep(delay)


def _is_fatal_embedding_error(error: Exception) -> bool:
    """Return True when the error text shows that retrying cannot succeed.

    Matches exhausted quota (``insufficient_quota``) and API-key problems by
    substring, the same markers this notebook has always used.
    """
    msg = str(error).lower()
    return "insufficient_quota" in msg or "api key" in msg


def _request_embeddings(payload, model: str, max_retries: int):
    """Call ``client.embeddings.create`` with retries and exponential backoff.

    Args:
        payload: a single string or a list of strings, passed as ``input``.
        model: embedding model name.
        max_retries: total number of attempts; must be at least 1.

    Returns:
        The response object returned by ``client.embeddings.create``.

    Raises:
        ValueError: if ``max_retries`` is smaller than 1 (previously the
            caller silently received ``None``).
        Exception: the last API error, or the first non-retryable one.
    """
    if max_retries < 1:
        raise ValueError("max_retries must be at least 1.")

    for attempt in range(max_retries):
        try:
            return client.embeddings.create(model=model, input=payload)
        except Exception as e:
            # Rate-limit errors go through the same fatal check as everything
            # else: the run recorded in this notebook shows quota exhaustion
            # arriving as an HTTP 429, and backing off on it only wastes time.
            if _is_fatal_embedding_error(e) or attempt == max_retries - 1:
                raise
            _sleep_backoff(attempt)


def get_embedding_openai(text: str, model: str = EMBED_MODEL, max_retries: int = 5):
    """Embed one piece of text with the OpenAI embeddings endpoint.

    Args:
        text: text to embed; newlines are replaced by spaces before sending.
        model: embedding model name.
        max_retries: total number of attempts for transient failures.

    Returns:
        The embedding vector of the first (only) item in the response.

    Raises:
        RuntimeError: if no OpenAI client is configured.
        ValueError: if ``max_retries`` is smaller than 1.
        Exception: the API error once retries are exhausted, or immediately
            for quota / API-key errors.
    """
    if client is None:
        raise RuntimeError("OpenAI client not configured (no API key).")

    text = text.replace("\n", " ")
    resp = _request_embeddings(text, model, max_retries)
    return resp.data[0].embedding


def get_embeddings_openai_batch(
    texts: List[str], model: str = EMBED_MODEL, max_retries: int = 5
):
    """Embed several texts with a single OpenAI embeddings request.

    Args:
        texts: texts to embed; newlines are replaced by spaces before sending.
        model: embedding model name.
        max_retries: total number of attempts for transient failures.

    Returns:
        A list with one embedding vector per item of the response data.
        An empty ``texts`` yields ``[]`` without contacting the API.

    Raises:
        RuntimeError: if no OpenAI client is configured.
        ValueError: if ``max_retries`` is smaller than 1.
        Exception: the API error once retries are exhausted, or immediately
            for quota / API-key errors.
    """
    if client is None:
        raise RuntimeError("OpenAI client not configured (no API key).")

    clean_texts = [t.replace("\n", " ") for t in texts]
    if not clean_texts:
        # Nothing to embed; skip the network round trip entirely.
        return []

    resp = _request_embeddings(clean_texts, model, max_retries)
    return [d.embedding for d in resp.data]

In [5]:
def fit_tfidf_on_catalog(df: pd.DataFrame):
    """Fit the TF-IDF fallback on the catalog and make it the active mode.

    Each document is the product description followed by its space-joined
    vibe tags. Rebinds the module-level ``_TFIDF``, ``_TFIDF_MATRIX`` and
    ``_MODE``.

    Args:
        df: catalog with ``description`` and ``vibe_tags`` columns.
    """
    global _TFIDF, _TFIDF_MATRIX, _MODE

    # Missing descriptions become empty strings; anything else is stringified.
    descriptions = df["description"].fillna("").astype(str)
    # Only real lists contribute tag text; any other value counts as "no tags".
    tag_text = [
        " ".join(tags) if isinstance(tags, list) else "" for tags in df["vibe_tags"]
    ]
    corpus = [f"{desc} {tags}" for desc, tags in zip(descriptions, tag_text)]

    _TFIDF = TfidfVectorizer(stop_words="english", max_features=2048)
    _TFIDF_MATRIX = _TFIDF.fit_transform(corpus)  # one sparse row per catalog item
    _MODE = "tfidf"
    print("‚öôÔ∏è  Switched to TF-IDF fallback (no API usage).")


def embed_query_tfidf(query: str):
    """Project a query into the fitted TF-IDF space.

    Args:
        query: free-text search query.

    Returns:
        The result of ``_TFIDF.transform([query])`` (a single-row matrix).

    Raises:
        AssertionError: if the TF-IDF vectorizer or matrix has not been set.
    """
    is_fitted = not (_TFIDF is None or _TFIDF_MATRIX is None)
    assert is_fitted, "TF-IDF not initialized. Call fit_tfidf_on_catalog(df) first."
    query_matrix = _TFIDF.transform([query])
    return query_matrix

In [6]:
def generate_embeddings(df: pd.DataFrame, use_cache: bool = True):
    """Attach embeddings to the catalog, or fall back to TF-IDF.

    Order of attempts:
      1. Load vectors from ``EMBEDDINGS_CACHE_FILE`` (when ``use_cache``).
      2. Request embeddings of the ``description`` column from OpenAI and
         write them to the cache file.
      3. Fit the local TF-IDF model via ``fit_tfidf_on_catalog``.

    Steps 1 and 2 add an ``embedding`` column to ``df`` in place and set
    ``_MODE`` to ``"openai"``.

    Args:
        df: catalog with a ``description`` column.
        use_cache: whether an existing cache file may be loaded.

    Returns:
        The same ``df`` object that was passed in.
    """
    global _MODE

    if use_cache and os.path.exists(EMBEDDINGS_CACHE_FILE):
        try:
            # NOTE(security): pickle.load can execute arbitrary code. Only use
            # a cache file this notebook wrote; never load one from an
            # untrusted source.
            with open(EMBEDDINGS_CACHE_FILE, "rb") as f:
                cached_vectors = pickle.load(f)
            # The cache is not keyed on the catalog contents, so the row count
            # is the only staleness check available here.
            if len(cached_vectors) != len(df):
                raise ValueError(
                    f"cache holds {len(cached_vectors)} vectors "
                    f"but the catalog has {len(df)} rows"
                )
            df["embedding"] = cached_vectors
            _MODE = "openai"
            print("üì¶ Loaded embeddings from cache (OpenAI mode).")
            return df
        except Exception as e:
            # Still best-effort, but say why the cache was rejected instead of
            # silently discarding the reason.
            print(f"Ignoring embeddings cache ({e}); recomputing.")

    if client is not None:
        try:
            descs = df["description"].tolist()
            vectors = get_embeddings_openai_batch(descs, model=EMBED_MODEL)
            df["embedding"] = vectors
        except Exception as e:
            print(f"‚ö†Ô∏è OpenAI unavailable ({e}). Falling back to TF-IDF.")
        else:
            _MODE = "openai"
            # A failed cache write must not throw away good embeddings or be
            # reported as an API outage, so it is handled separately.
            try:
                with open(EMBEDDINGS_CACHE_FILE, "wb") as f:
                    pickle.dump(vectors, f)
                print("‚úÖ OpenAI embeddings computed & cached.")
            except (OSError, pickle.PicklingError) as e:
                print(f"OpenAI embeddings computed, but caching failed ({e}).")
            return df

    fit_tfidf_on_catalog(df)
    return df


# Attach embeddings to the catalog (from cache or the OpenAI API); when neither
# works, generate_embeddings fits the TF-IDF fallback instead.
df = generate_embeddings(df, use_cache=True)
df.head()  # preview the first rows as the cell output

‚ö†Ô∏è OpenAI unavailable (Error code: 429 - {'error': {'message': 'You exceeded your current quota, please check your plan and billing details. For more information on this error, read the docs: https://platform.openai.com/docs/guides/error-codes/api-errors.', 'type': 'insufficient_quota', 'param': None, 'code': 'insufficient_quota'}}). Falling back to TF-IDF.
‚öôÔ∏è  Switched to TF-IDF fallback (no API usage).


Unnamed: 0,name,description,price,category,vibe_tags,views
0,Boho Maxi Dress,Flowy maxi dress with earthy tones and floral ...,79.99,Dresses,"[boho, relaxed, earthy, festival]",1250
1,Classic Leather Jacket,Timeless black leather jacket with silver zipp...,189.99,Outerwear,"[urban, edgy, bold, cool]",2100
2,Cozy Cashmere Sweater,Ultra-soft cashmere sweater in warm cream. Coz...,159.99,Tops,"[cozy, luxurious, comfortable, winter]",1800
3,High-Waist Skinny Jeans,Modern high-waist skinny jeans in dark wash. S...,89.99,Bottoms,"[modern, urban, sleek, professional]",3200
4,Linen Beach Shirt,Breathable white linen shirt perfect for beach...,54.99,Tops,"[beach, casual, breezy, vacation]",980


In [7]:
def fallback_popular(df: pd.DataFrame, n: int = 2):
    """Return the ``n`` most-viewed products as plain records.

    Args:
        df: catalog with ``name``, ``price``, ``vibe_tags`` and ``views``.
        n: number of products to return.

    Returns:
        list[dict]: one dict per product with the keys ``name``, ``price``
        and ``vibe_tags``, ordered from most to least viewed.
    """
    shown_columns = ["name", "price", "vibe_tags"]
    most_viewed = df.nlargest(n, "views")
    return most_viewed.loc[:, shown_columns].to_dict(orient="records")


def cosine_top_k(query: str, df: pd.DataFrame, k: int = 3):
    """Rank catalog rows against a free-text query and print the top ``k``.

    In ``"openai"`` mode the query is embedded with ``get_embedding_openai``
    and compared with the ``embedding`` column; if that fails for any reason
    the TF-IDF fallback is fitted on ``df`` and used instead. In TF-IDF mode
    the query is scored against ``_TFIDF_MATRIX``.

    Args:
        query: free-text search query.
        df: product catalog to search.
        k: number of matches to return; capped at ``len(df)``.

    Returns:
        dict with the keys ``mode``, ``query``, ``latency_ms``,
        ``max_similarity``, ``has_good_match`` and ``matches`` (a list of
        record dicts including a rounded ``score``).
    """
    # perf_counter is monotonic, so the latency cannot be skewed by wall-clock
    # adjustments the way time.time() differences can.
    start = time.perf_counter()
    k = min(k, len(df))

    if _MODE == "openai":
        try:
            q_vec = np.array(get_embedding_openai(query)).reshape(1, -1)
            catalog = np.vstack(df["embedding"].values)
            sims = cosine_similarity(q_vec, catalog)[0]
        except Exception as e:
            print(f"‚ö†Ô∏è Query embedding failed ({e}). Switching to TF-IDF.")
            fit_tfidf_on_catalog(df)
            q_vec = embed_query_tfidf(query)
            sims = cosine_similarity(q_vec, _TFIDF_MATRIX)[0]
    else:
        # The TF-IDF matrix is module-level state. If it is missing or was
        # fitted on a frame with a different row count, the scores could not
        # be attached to `df` below, so refit on the frame being searched.
        if _TFIDF_MATRIX is None or _TFIDF_MATRIX.shape[0] != len(df):
            fit_tfidf_on_catalog(df)
        q_vec = embed_query_tfidf(query)
        sims = cosine_similarity(q_vec, _TFIDF_MATRIX)[0]

    # Score a copy so the caller's frame does not gain a "score" column.
    temp = df.copy()
    temp["score"] = sims
    topk = temp.nlargest(k, "score")

    latency_ms = round((time.perf_counter() - start) * 1000, 2)
    max_sim = float(topk["score"].max()) if len(topk) else 0.0

    matches = topk[
        ["name", "category", "price", "vibe_tags", "description", "score"]
    ].copy()
    matches["score"] = matches["score"].round(4)

    result = {
        "mode": _MODE,  # read after any fallback, so it names the mode used
        "query": query,
        "latency_ms": latency_ms,
        "max_similarity": round(max_sim, 4),
        "has_good_match": bool(max_sim >= GOOD_MATCH_THRESHOLD),
        "matches": matches.to_dict("records"),
    }

    print(f"\nüîé Query: {query}")
    print(
        f"   mode={result['mode']} | latency={latency_ms} ms | max_sim={result['max_similarity']} | good? {result['has_good_match']}"
    )
    # The header reflects the number of rows actually requested, not a fixed 3.
    print(f"   Top-{k}:")
    for m in result["matches"]:
        print(
            f"    ‚Ä¢ {m['name']} (score {m['score']}) ‚Ä¢ {m['category']} ‚Ä¢ ${m['price']} ‚Ä¢ vibes={m['vibe_tags']}"
        )
    if not result["has_good_match"]:
        # No score reached the threshold: suggest the most-viewed products.
        fb = fallback_popular(df, n=2)
        print("   üí° Fallback suggestions:", fb)

    return result

In [8]:
# Sample "vibe" queries used to exercise the search end to end.
test_queries = [
    "professional office wear",
    "beach vacation vibes",
    "energetic urban chic",
]

# Run every query through the top-3 search, in order, and keep each result
# dict for the evaluation cell.
all_results = [cosine_top_k(vibe_query, df, k=3) for vibe_query in test_queries]


üîé Query: professional office wear
   mode=tfidf | latency=3.92 ms | max_sim=0.484 | good? False
   Top-3:
    ‚Ä¢ Tailored Blazer (score 0.484) ‚Ä¢ Outerwear ‚Ä¢ $149.99 ‚Ä¢ vibes=['professional', 'formal', 'sharp', 'business']
    ‚Ä¢ High-Waist Skinny Jeans (score 0.1943) ‚Ä¢ Bottoms ‚Ä¢ $89.99 ‚Ä¢ vibes=['modern', 'urban', 'sleek', 'professional']
    ‚Ä¢ Boho Maxi Dress (score 0.0) ‚Ä¢ Dresses ‚Ä¢ $79.99 ‚Ä¢ vibes=['boho', 'relaxed', 'earthy', 'festival']
   üí° Fallback suggestions: [{'name': 'High-Waist Skinny Jeans', 'price': 89.99, 'vibe_tags': ['modern', 'urban', 'sleek', 'professional']}, {'name': 'Chunky Sneaker Boots', 'price': 119.99, 'vibe_tags': ['trendy', 'streetwear', 'energetic', 'urban']}]

üîé Query: beach vacation vibes
   mode=tfidf | latency=1.58 ms | max_sim=0.3636 | good? False
   Top-3:
    ‚Ä¢ Linen Beach Shirt (score 0.3636) ‚Ä¢ Tops ‚Ä¢ $54.99 ‚Ä¢ vibes=['beach', 'casual', 'breezy', 'vacation']
    ‚Ä¢ Boho Maxi Dress (score 0.1321) ‚Ä¢ Dresses ‚Ä¢ $7

In [9]:
# --- Per-query evaluation table ---------------------------------------------
eval_rows = [
    {
        "query": res["query"],
        "mode": res["mode"],
        "latency_ms": res["latency_ms"],
        "max_similarity": res["max_similarity"],
        "good_match": int(res["has_good_match"]),
    }
    for res in all_results
]
eval_df = pd.DataFrame(eval_rows)

print("\n=== Evaluation Table ===")
try:
    display(eval_df)
except NameError:
    # `display` only exists inside IPython; fall back to plain text elsewhere.
    print(eval_df)

# --- Figure 1: latency per query (written to disk, not shown inline) ---------
latency_fig, latency_ax = plt.subplots(figsize=(6, 4))
latency_ax.bar(eval_df["query"], eval_df["latency_ms"])
latency_ax.set_title("Query Latency (ms)")
latency_ax.set_xlabel("Query")
latency_ax.set_ylabel("Latency (ms)")
plt.setp(latency_ax.get_xticklabels(), rotation=25, ha="right")
latency_fig.tight_layout()
latency_fig.savefig("latency_by_query.png", dpi=200, bbox_inches="tight")
plt.close(latency_fig)

# --- Figure 2: distribution of every returned match score --------------------
top3_scores = [
    float(match["score"]) for res in all_results for match in res["matches"]
]

score_fig, score_ax = plt.subplots(figsize=(6, 4))
score_ax.hist(top3_scores, bins=8)
score_ax.set_title("Distribution of Top-3 Similarity Scores")
score_ax.set_xlabel("Similarity score")
score_ax.set_ylabel("Frequency")
score_fig.tight_layout()
score_fig.savefig("score_distribution.png", dpi=200, bbox_inches="tight")
plt.close(score_fig)

# --- Headline numbers ---------------------------------------------------------
latencies = eval_df["latency_ms"]
summary = {
    "avg_latency_ms": round(latencies.mean(), 2),
    "max_latency_ms": round(latencies.max(), 2),
    "min_latency_ms": round(latencies.min(), 2),
    "good_match_threshold": GOOD_MATCH_THRESHOLD,
    "good_match_count": int(eval_df["good_match"].sum()),
    "total_queries": len(eval_df),
}
print("\n-> Summary <-")
for metric, value in summary.items():
    print(f"{metric}: {value}")


=== Evaluation Table ===


Unnamed: 0,query,mode,latency_ms,max_similarity,good_match
0,professional office wear,tfidf,3.92,0.484,0
1,beach vacation vibes,tfidf,1.58,0.3636,0
2,energetic urban chic,tfidf,0.93,0.5704,0



-> Summary <-
avg_latency_ms: 2.14
max_latency_ms: 3.92
min_latency_ms: 0.93
good_match_threshold: 0.7
good_match_count: 0
total_queries: 3


In [10]:
# The write-up is collected as a list of lines and emitted with one print;
# the text written to stdout is the same as printing each line separately.
reflection_lines = [
    "\n-> Reflection <-",
    "* What went well:",
    "  - Simple, explainable pipeline: vibe text ‚Üí embeddings ‚Üí cosine similarity ‚Üí top-3.",
    "  - Reliability-first: caching, retries, and TF-IDF fallback so the notebook always runs.",
    "  - Consistent scoring: cosine similarity in both OpenAI and TF-IDF modes.",
    "\n* Metrics & observations:",
    f"  - Average latency across queries: {summary['avg_latency_ms']} ms (see bar chart).",
    (
        f"  - 'Good match' rule: score ‚â• {GOOD_MATCH_THRESHOLD}. "
        f"Good matches: {summary['good_match_count']} / {summary['total_queries']}."
    ),
    "\n* Edge cases handled:",
    "  - No/weak matches ‚Üí show popular fallbacks (by views).",
    "  - API limits/unavailable ‚Üí auto-switch to local TF-IDF space.",
    "  - Repeat runs ‚Üí cached vectors avoid redundant API calls and speed up re-runs.",
    "\n* Next improvements:",
    "  - Vector DB (e.g., Pinecone/FAISS/Weaviate) for larger catalogs & metadata filters.",
    "  - Multi-signal ranking: blend similarity with popularity, price, and user feedback.",
    "  - Tiny labeled test set to track precision@k / MRR over time.",
]
print("\n".join(reflection_lines))


-> Reflection <-
* What went well:
  - Simple, explainable pipeline: vibe text ‚Üí embeddings ‚Üí cosine similarity ‚Üí top-3.
  - Reliability-first: caching, retries, and TF-IDF fallback so the notebook always runs.
  - Consistent scoring: cosine similarity in both OpenAI and TF-IDF modes.

* Metrics & observations:
  - Average latency across queries: 2.14 ms (see bar chart).
  - 'Good match' rule: score ‚â• 0.7. Good matches: 0 / 3.

* Edge cases handled:
  - No/weak matches ‚Üí show popular fallbacks (by views).
  - API limits/unavailable ‚Üí auto-switch to local TF-IDF space.
  - Repeat runs ‚Üí cached vectors avoid redundant API calls and speed up re-runs.

* Next improvements:
  - Vector DB (e.g., Pinecone/FAISS/Weaviate) for larger catalogs & metadata filters.
  - Multi-signal ranking: blend similarity with popularity, price, and user feedback.
  - Tiny labeled test set to track precision@k / MRR over time.
