In [1]:
import pandas as pd
import numpy as np
import re
from scipy.sparse import csr_matrix
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
df = pd.read_csv(r"C:\Users\ISMAIMZ\OneDrive - Hapag-Lloyd AG\Documents\Books\Projects\Recommendation system\archive\clean_data.csv")

In [3]:
df.drop(columns=["Unnamed: 0", "ImageURL"], inplace=True)

In [4]:
df.head(5)

Unnamed: 0,ID,ProdID,Rating,ReviewCount,Category,Brand,Name,Description,Tags
0,1705736792,2,0.0,0.0,"premium, beauty, premium, makeup, premium, nai...",opi,"OPI Infinite Shine, Nail Lacquer Nail Polish, ...",,"premium, beauty, premium, makeup, premium, nai..."
1,95,76,0.0,0.0,"beauty, hair, care, hair, color, auburn, hair,...",easy,"Nice n Easy Permanent Color, 111 Natural Mediu...","pack, 3, pack, 3, upc, 381519000201, beautiful...","beauty, hair, care, hair, color, auburn, hair,..."
2,8,8,4.5,29221.0,"beauty, hair, care, hair, color, permanent, ha...",clairol,Clairol Nice N Easy Permanent Color 7/106A Nat...,"clairol, nice, n, easy, permanent, color, give...","beauty, hair, care, hair, color, permanent, ha..."
3,4,3,0.0,0.0,"beauty, makeup, lip","kokie, cosmetics","Kokie Professional Matte Lipstick, Hot Berry, ...","calling, matte, lip, lovers, indulge, rich, cr...","beauty, makeup, lip, kokie, cosmetics, calling..."
4,990,3,0.0,131.0,"seasonal, stock, essentials, personal, care, s...",gillette,"Gillette TRAC II Plus Razor Blade Refills, Fit...","1971, gillette, introduced, trac, ii, razor, s...","seasonal, stock, essentials, personal, care, s..."


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4090 entries, 0 to 4089
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   ID           4090 non-null   int64  
 1   ProdID       4090 non-null   int64  
 2   Rating       4090 non-null   float64
 3   ReviewCount  4090 non-null   float64
 4   Category     4081 non-null   object 
 5   Brand        3994 non-null   object 
 6   Name         4090 non-null   object 
 7   Description  3174 non-null   object 
 8   Tags         4090 non-null   object 
dtypes: float64(2), int64(2), object(5)
memory usage: 287.7+ KB


In [6]:
df.columns

Index(['ID', 'ProdID', 'Rating', 'ReviewCount', 'Category', 'Brand', 'Name',
       'Description', 'Tags'],
      dtype='object')

In [7]:
for col in ['Category', 'Brand', 'Name','Description', 'Tags']:
    df[col] = df[col].fillna("")

def clean(s):
    s = s.lower()
    s = re.sub(r"[^a-z0-9\s]", " ",s)
    s = re.sub(r"\s+", " ", s).strip()
    return s

for col in ['Category', 'Brand', 'Name','Description', 'Tags']:
    df[col] = df[col].astype(str).map(clean)

df["doc"] = (df["Name"] + " " + df["Brand"] + " " + df["Category"] + " " +
             df["Description"] + " " + df["Tags"])

df["Rating"] = df["Rating"].fillna(0)
df["ReviewCount"] = df["ReviewCount"].fillna(0)

rating = (df["Rating"] - df["Rating"].min()) / (df["Rating"].max() - df["Rating"].min() + 1e-9)
reviews = np.log1p(df["ReviewCount"])
reviews = (reviews - reviews.min()) / (reviews.max() - reviews.min() + 1e-9)
alpha = 0.6
df["Pop_Score"] = alpha * rating + (1 + alpha) * reviews

In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4090 entries, 0 to 4089
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   ID           4090 non-null   int64  
 1   ProdID       4090 non-null   int64  
 2   Rating       4090 non-null   float64
 3   ReviewCount  4090 non-null   float64
 4   Category     4090 non-null   object 
 5   Brand        4090 non-null   object 
 6   Name         4090 non-null   object 
 7   Description  4090 non-null   object 
 8   Tags         4090 non-null   object 
 9   doc          4090 non-null   object 
 10  Pop_Score    4090 non-null   float64
dtypes: float64(3), int64(2), object(6)
memory usage: 351.6+ KB


In [9]:
df.head(10)

Unnamed: 0,ID,ProdID,Rating,ReviewCount,Category,Brand,Name,Description,Tags,doc,Pop_Score
0,1705736792,2,0.0,0.0,premium beauty premium makeup premium nail pol...,opi,opi infinite shine nail lacquer nail polish bu...,,premium beauty premium makeup premium nail pol...,opi infinite shine nail lacquer nail polish bu...,0.0
1,95,76,0.0,0.0,beauty hair care hair color auburn hair color,easy,nice n easy permanent color 111 natural medium...,pack 3 pack 3 upc 381519000201 beautiful natur...,beauty hair care hair color auburn hair color ...,nice n easy permanent color 111 natural medium...,0.0
2,8,8,4.5,29221.0,beauty hair care hair color permanent hair color,clairol,clairol nice n easy permanent color 7 106a nat...,clairol nice n easy permanent color gives 8 we...,beauty hair care hair color permanent hair col...,clairol nice n easy permanent color 7 106a nat...,2.139888
3,4,3,0.0,0.0,beauty makeup lip,kokie cosmetics,kokie professional matte lipstick hot berry 0 ...,calling matte lip lovers indulge rich creamy m...,beauty makeup lip kokie cosmetics calling matt...,kokie professional matte lipstick hot berry 0 ...,0.0
4,990,3,0.0,131.0,seasonal stock essentials personal care stock ...,gillette,gillette trac ii plus razor blade refills fit ...,1971 gillette introduced trac ii razor system ...,seasonal stock essentials personal care stock ...,gillette trac ii plus razor blade refills fit ...,0.759718
5,262,-2147483648,4.6,52.0,beauty hair care hair care brands old spice ha...,old spice,old spice artisan styling high hold matte fini...,old spice artisan molding clay mens styling pr...,beauty hair care hair care brands old spice ha...,old spice artisan styling high hold matte fini...,1.16974
6,-2147483648,371,4.3,10.0,personal care oral care kids oral care,colgate,colgate my first baby and toddler toothpaste f...,colgate baby toddler toothpaste safe fluoride ...,personal care oral care kids oral care colgate...,colgate my first baby and toddler toothpaste f...,0.88909
7,6,0,0.0,0.0,beauty makeup nails,bmc,bmc bright and loud cream gel lacquer polish s...,lights sick beats epic gel polish present neon...,beauty makeup nails bmc lights sick beats epic...,bmc bright and loud cream gel lacquer polish s...,0.0
8,507,-2147483648,5.0,1.0,beauty hair care hair care brands suave,suave,suave extra hold 7 shaping mousse 9 oz pack of 6,free shipping pack 6 pack 6 upc 079400816603 y...,beauty hair care hair care brands suave suave ...,suave extra hold 7 shaping mousse 9 oz pack of...,0.707847
9,32,-2147483648,0.0,950.0,health lip care vaseline,vaseline,3 pack vaseline lip therapy tinted lip balm mi...,vaseline lip therapy lip balm mini rosy clinic...,health lip care vaseline vaseline vaseline lip...,3 pack vaseline lip therapy tinted lip balm mi...,1.066965


In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

tfidf = TfidfVectorizer(min_df=3, ngram_range=(1,2), stop_words="english", max_features=200000)
X = tfidf.fit_transform(df["doc"])
sim = cosine_similarity(X, X)
id_to_idx = {pid:i for i, pid in enumerate(df["ProdID"])}
idx_to_id = {i:pid for pid, i in id_to_idx.items()}

In [11]:
def similar_items(prod_id, k=10, w_pop=0.2):
    i = id_to_idx[prod_id]
    scores = sim[i].copy()
    scores = (1 - w_pop) * scores + w_pop * df["Pop_Score"].values
    scores[i] = -np.inf
    top_idx = np.argsort(-scores)[:k]
    return df.iloc[top_idx][["ProdID","Name","Brand","Category","Rating","ReviewCount"]]

In [12]:
similar_items(prod_id=156, k=5)

Unnamed: 0,ProdID,Name,Brand,Category,Rating,ReviewCount
3524,8,pine sol multi surface cleaner original 144 oz...,clorox,household essentials cleaning supplies purpose...,4.7,20997.0
1053,43880,pine sol all purpose cleaner lemon 144 oz bottle,pine sol,household essentials cleaning supplies purpose...,4.8,9806.0
1383,62,aquaphor ointment body spray 3 7 oz spray can,aquaphor,personal care men essentials men body lotions,4.0,6669.0
2,8,clairol nice n easy permanent color 7 106a nat...,clairol,beauty hair care hair color permanent hair color,4.5,29221.0
614,53,clairol nice n easy permanent hair color 4 120...,clairol,beauty hair care hair color permanent hair color,4.5,29221.0


In [37]:
def recommend_from_history(prod_ids, k=10, w_pop=0.2):
    if isinstance(prod_ids, int):
        prod_ids = [prod_ids]
    idxs = [id_to_idx[p] for p in prod_ids if p in id_to_idx]
    if not idxs:
        return df.sort_values("Pop_Score", ascending=False).head(k)[["ProdID","Name","Brand","Category","Rating","ReviewCount"]]
    profile = np.asarray(X[idxs].mean(axis=0)).ravel()
    scores = cosine_similarity(profile.reshape(1, -1), X).ravel()
    scores = (1 - w_pop) * scores + w_pop * df["Pop_Score"].values

    for i in idxs: 
        scores[i] = -np.inf
        
    top_idx = np.argsort(-scores)[:k]
    return df.iloc[top_idx][["ProdID","Name","Brand","Category","Rating","ReviewCount"]]

In [41]:
recommend_from_history([156, 101], k=5)

Unnamed: 0,ProdID,Name,Brand,Category,Rating,ReviewCount
2133,8,redken color extend magnetics sulfate free sha...,redken,premium beauty premium hair care hair tools pr...,4.5,326.0
3581,56,kevin murphy balancing wash daily shampoo 8 4 oz,kevin murphy,premium beauty premium hair care hair tools pr...,5.0,2.0
295,3,living proof perfect hair day shampoo 8 oz,living proof,premium beauty premium hair care hair tools pr...,4.3,12.0
1078,955,biolage volumebloom cotton shampoo 33 8 fl oz,matrix,premium beauty premium hair care hair tools pr...,0.0,13.0
3497,3,redken diamond oil glow dry detangling conditi...,redken,premium beauty premium hair care hair tools pr...,4.5,334.0


In [42]:
def diversified_top_picks(k=10):
    cand = df.copy()
    cand["score"] = cand["Pop_Score"]
    top = []
    seen_cat = set()
    for _, row in cand.sort_values("score", ascending=False).iterrows():
        cat = row["Category"].split(",")[0] if row["Category"] else "misc"
        if cat not in seen_cat or len(top) < k//2:
            top.append(row)
            seen_cat.add(cat)
        if len(top) >= k:
            break
    return pd.DataFrame(top)[["ProdID","Name","Brand","Category","Rating","ReviewCount"]]

In [43]:
diversified_top_picks(5)

Unnamed: 0,ProdID,Name,Brand,Category,Rating,ReviewCount
2,8,clairol nice n easy permanent color 7 106a nat...,clairol,beauty hair care hair color permanent hair color,4.5,29221.0
196,252,clairol nice n easy permanent color 6g 116a na...,clairol,beauty hair care hair color permanent hair color,4.5,29221.0
3524,8,pine sol multi surface cleaner original 144 oz...,clorox,household essentials cleaning supplies purpose...,4.7,20997.0
323,31287,clairol age defy expert collection hair color,clairol,beauty hair care hair color clairol hair color,4.3,22002.0
365,970,garnier whole blends repairing shampoo honey t...,garnier,beauty hair care hair care brands garnier hair...,4.7,13809.0


In [47]:
def category_consistency(recs_df, query_cat_tokens):
    tokens = set(query_cat_tokens.split())
    matches = recs_df["Category"].apply(lambda s: len(tokens & set(s.split())) > 0)
    return matches.mean()

In [55]:
query = "beauty skincare"
recs = recommend_from_history([252, 970], k=5)  # your recommender output
score = category_consistency(recs, query)
print(f"Category consistency: {score*100:.2f} %")

Category consistency: 100.00 %


In [53]:
queries = {
    "yoga fitness": [205, 309],
    "beauty skincare": [1443, 2133],
    "electronics audio": [5001, 5002]
}

results = []
for q, prod_ids in queries.items():
    recs = recommend_from_history(prod_ids, k=5)
    score = category_consistency(recs, q)
    results.append({"query": q, "score": score})

eval_df = pd.DataFrame(results)
print(eval_df)
print("Average consistency:", eval_df["score"].mean())

               query  score
0       yoga fitness    0.6
1    beauty skincare    1.0
2  electronics audio    0.0
Average consistency: 0.5333333333333333
