In [1]:
# Build a tiny, working baseline recommender on your dataset (TF-IDF + rule-aware re-rank)
import re
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Load
df = pd.read_csv(r'D:\Python\Projects\Shopping\classifier\processed_data.csv', low_memory=False)

# Drop unnamed junk columns
df = df.loc[:, ~df.columns.str.contains("^Unnamed")]

# Normalize useful fields
for col in ["initial_price", "final_price", "reviews_count"]:
    if col in df.columns:
        df[col] = pd.to_numeric(df[col], errors="coerce")

# Build a text field
def safe_str(x):
    if pd.isna(x):
        return ""
    return str(x)

text_cols = [c for c in ["title", "brand", "description"] if c in df.columns]
df["text"] = df[text_cols].astype(str).agg(" ".join, axis=1).str.replace(r"\s+", " ", regex=True).str.strip()

# Remove rows with no text
df = df[df["text"].str.len() > 0].copy()
df.reset_index(drop=True, inplace=True)

# Vectorize (TF-IDF)
vectorizer = TfidfVectorizer(stop_words="english", max_features=50000)
X = vectorizer.fit_transform(df["text"])

# Simple prompt parser for budget/brands/negatives
brand_vocab = sorted(df["brand"].dropna().astype(str).str.lower().unique().tolist()) if "brand" in df.columns else []

money_pat = re.compile(r"\$?\s*(\d{2,5})(?:\s*(?:usd|dollars)?)", re.I)

def parse_prompt(prompt: str):
    p = prompt.lower()
    # price
    prices = [int(m.group(1)) for m in money_pat.finditer(p)]
    budget_min = None
    budget_max = None
    if "under" in p or "less than" in p or "<" in p:
        if prices:
            budget_max = prices[0]
    elif "between" in p and "and" in p:
        if len(prices) >= 2:
            budget_min, budget_max = prices[:2]
    elif "over" in p or "more than" in p or ">" in p:
        if prices:
            budget_min = prices[0]
    elif prices:
        budget_max = prices[0]
    # brands
    mentioned = [b for b in brand_vocab if re.search(rf"\b{re.escape(b)}\b", p)]
    # negative keywords
    neg = re.findall(r"(?:no|not|avoid)\s+([a-z0-9\- ]+)", p)
    neg_terms = [t.strip() for t in neg if t.strip()]
    return dict(budget_min=budget_min, budget_max=budget_max, brands=mentioned, negative_terms=neg_terms)

def availability_mask():
    if "availability" not in df.columns:
        return np.ones(len(df), dtype=bool)
    avail = df["availability"].astype(str).str.lower()
    # keep "in stock", True/1-like, or empty (assume available)
    mask = avail.isna() | (avail == "") | avail.str.contains("in stock|available|true|yes|1")
    return mask.values

avail_mask = availability_mask()

def score_and_rank(prompt: str, topk=10):
    parsed = parse_prompt(prompt)
    # vector search
    q = vectorizer.transform([prompt])
    sims = cosine_similarity(q, X).ravel()
    
    # Start with similarity
    score = sims.copy()
    
    # Price-aware reweighting
    if "final_price" in df.columns:
        price = df["final_price"].values
        # normalize
        pnorm = (price - np.nanmin(price)) / (np.nanmax(price) - np.nanmin(price) + 1e-9)
        # prefer mid-to-low priced slightly
        score *= (1.05 - 0.1 * pnorm)
        # budget compliance
        if parsed["budget_max"] is not None:
            within = price <= parsed["budget_max"]
            score *= np.where(within, 1.15, 0.75)
        if parsed["budget_min"] is not None:
            above = price >= parsed["budget_min"]
            score *= np.where(above, 1.05, 0.7)
    
    # Popularity boost via reviews_count (if exists)
    if "reviews_count" in df.columns:
        rc = df["reviews_count"].fillna(0).values
        score *= (1.0 + 0.05 * np.log1p(rc))
    
    # Availability filter
    score *= avail_mask.astype(float)
    
    # Brand nudges
    if parsed["brands"] and "brand" in df.columns:
        brand_l = df["brand"].astype(str).str.lower().values
        boost = np.isin(brand_l, parsed["brands"])
        score *= np.where(boost, 1.15, 1.0)
    
    # Negative keyword penalty
    if parsed["negative_terms"]:
        text = df["text"].str.lower().values
        neg_hit = np.array([any(t in text[i] for t in parsed["negative_terms"]) for i in range(len(text))])
        score *= np.where(neg_hit, 0.6, 1.0)
    
    # Rank
    idx = np.argsort(-score)[:max(topk, 50)]  # take more to ensure quality after final checks
    
    # Build result table
    cols = [c for c in ["title", "brand", "final_price", "currency", "availability", "reviews_count", "url"] if c in df.columns]
    res = df.iloc[idx][cols].copy()
    res.insert(0, "score", score[idx])
    res["score"] = res["score"].round(4)
    return res.head(topk), parsed

In [3]:
demo_query = "Gaming laptop under 200$"
results, parsed = score_and_rank(demo_query, topk=8)

print(f'Results: {results}\nParsing: {parsed}')

Results:       score                                              title  \
871  0.2765  Lexar Memory Stick Pro Duo 4 GB Gaming Edition...   
781  0.2242  UFNDC 3PCS Computer Backpack for Men, 17" Wate...   
337  0.2192  SMARTGUARD 3-Year Laptop Protection Plan ($200...   
616  0.1997  Lenovo Legion Go 8.8" 144Hz WQXGA Handheld Tou...   
221  0.1759  Universal Fit MightySkins Skin Compatible with...   
0    0.1326  New 15.6 inches Compatible with ROG Strix GL50...   
59   0.1295  Skytech Gaming Azure Gaming PC Desktop â€“ Intel...   
756  0.1244  OLOy DDR4 RAM 64GB (2x32GB) 2666 MHz CL19 1.2V...   

              brand  final_price currency  \
871           Lexar        34.99      USD   
781           UFNDC        29.99      USD   
337      SMARTGUARD        63.52      USD   
616          Lenovo       659.99      USD   
221    MIGHTY SKINS        15.31      USD   
0           Generic        68.30      USD   
59   Skytech Gaming      1299.99      USD   
756            OLOy        99.99  