In [2]:
# =========================
# Cell 0 — CÀI ĐẶT (chạy nếu cần)
# =========================
# đang dùng môi trường mới, chạy dòng pip dưới đây 1 lần.
#pip install -q sentence-transformers faiss-cpu pandas tqdm

In [3]:
# =========================
# Cell 0 — IMPORTS & CONFIG
# =========================
import os
from dataclasses import dataclass
from typing import List, Optional, Tuple

import re
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer, CrossEncoder
from tqdm.auto import tqdm
import faiss
import torch


  from .autonotebook import tqdm as notebook_tqdm


In [4]:
import os
print("Current working directory:", os.getcwd())


Current working directory: e:\1KhoaLuan


In [5]:
#cell 1
# Config dễ chỉnh
@dataclass
class Config:
    csv_path: str = "data/foods.csv"                 # đường dẫn file CSV của bạn
    output_csv: str = "data/encr/foods_with_text.csv"     # file lưu kèm cột text
    embedding_path: str = "data/encr/foods_embeddings.npy" # file lưu embeddings
    faiss_index_path: str = "data/encr/foods_faiss.index"  # file lưu index (tuỳ hệ thống)
    sbert_model: str = "VoVanPhuc/sup-SimCSE-VietNamese-phobert-base"  # model SBERT (đa ngôn ngữ)
    cross_encoder_model: str = "cross-encoder/ms-marco-MiniLM-L-6-v2"  # optional re-ranker
    batch_size: int = 64
    use_gpu: bool = torch.cuda.is_available()
    hnsw_m: int = 32
    hnsw_ef_construction: int = 200
    hnsw_ef_search: int = 50

cfg = Config()
print(f"Config: {cfg}")
print("GPU available:", cfg.use_gpu)

Config: Config(csv_path='data/foods.csv', output_csv='data/encr/foods_with_text.csv', embedding_path='data/encr/foods_embeddings.npy', faiss_index_path='data/encr/foods_faiss.index', sbert_model='VoVanPhuc/sup-SimCSE-VietNamese-phobert-base', cross_encoder_model='cross-encoder/ms-marco-MiniLM-L-6-v2', batch_size=64, use_gpu=False, hnsw_m=32, hnsw_ef_construction=200, hnsw_ef_search=50)
GPU available: False


In [6]:
# =========================
# Cell 2 — TIỀN XỬ LÝ & TẠO TEXT
# =========================
def clean_text(s: str) -> str:
    if pd.isna(s):
        return ""
    s = str(s)
    s = re.sub(r"http\S+", "", s)
    s = re.sub(r"<[^>]+>", " ", s)
    s = re.sub(r"\s+", " ", s).strip()
    return s

def make_text(row: pd.Series,
              title_weight: int = 2,
              include_nutrition: bool = False,
              include_meta: bool = False) -> str:
    parts = []
    title = clean_text(row.get("dish_name") or row.get("title") or "")
    if title:
        for _ in range(max(1, title_weight)):
            parts.append(f"title: {title}")
    ing = clean_text(row.get("ingredients", ""))
    if ing:
        parts.append(f"ingredients: {ing}")
    tags = clean_text(row.get("dish_tags", "") or row.get("tags", ""))
    if tags:
        parts.append(f"tags: {tags}")
    method = clean_text(row.get("cooking_method", ""))
    if method:
        parts.append(f"method: {method}")
    desc = clean_text(row.get("description", ""))
    if desc:
        parts.append(f"desc: {desc}")
    if include_meta:
        dtype = clean_text(row.get("dish_type", ""))
        if dtype:
            parts.append(f"type: {dtype}")
        if "serving_size" in row and pd.notna(row["serving_size"]):
            parts.append(f"serving: {clean_text(row['serving_size'])}")
        if "cooking_time" in row and pd.notna(row["cooking_time"]):
            parts.append(f"time: {clean_text(row['cooking_time'])}")
    if include_nutrition:
        nums = []
        for col in ("calories", "fat", "fiber", "sugar", "protein"):
            if col in row and pd.notna(row[col]):
                nums.append(f"{col}:{row[col]}")
        if nums:
            parts.append("nutrition: " + ", ".join(nums))
    return " | ".join([p for p in parts if p])

In [7]:
# =========================
# Cell 3 — LOAD CSV & TẠO CỘT TEXT
# =========================
if not os.path.exists(cfg.csv_path):
    raise FileNotFoundError(f"Không tìm thấy {cfg.csv_path} — đặt file foods.csv vào đường dẫn này hoặc chỉnh cfg.csv_path")

df = pd.read_csv(cfg.csv_path)
print("Dataframe shape:", df.shape)
print("Columns:", df.columns.tolist())

if "text" not in df.columns:
    df["text"] = df.apply(make_text, axis=1)
df.to_csv(cfg.output_csv, index=False)
print("Saved file with text column to:", cfg.output_csv)

print("\nSample text (first 5):")
for t in df["text"].head(5).tolist():
    print("-", t[:300])

Dataframe shape: (4000, 16)
Columns: ['food_id', 'dish_name', 'description', 'dish_type', 'serving_size', 'cooking_time', 'ingredients', 'cooking_method', 'dish_tags', 'calories', 'fat', 'fiber', 'sugar', 'protein', 'image_link', 'nutrient_content']
Saved file with text column to: data/encr/foods_with_text.csv

Sample text (first 5):
- title: Mực ống hấp củ đậu | title: Mực ống hấp củ đậu | ingredients: Mực ống lớn 3 con: 600g làm sạch không bỏ da, Củ sắn: 150g, Cà rốt: 30g, Hành lá: 3 cây, Gốc ngò: 5 cây, Gừng: 15g thái lát, Ớt sừng: 1 trái, Hành tím băm: 1 muỗng, Ớt xiêm xanh: 10 trái nhỏ, đập dập, Gia vị: tiêu, đường, bột năng,
- title: Chả giò ngũ vị | title: Chả giò ngũ vị | ingredients: Da bò bía tươi 20 muỗngiếng, Thịt bò xay 100g, Giò sống 100g, Lòng đỏ trứng vịt muối 2 cái, Trứng gà 2 quả, Củ năng gọt vỏ 50g, Khoai lang 100g, Hành tỏi phi 3 muỗng, Cọng cần cắt nhuyễn 2 muỗng, Hành lá cắt khúc, ớt cắt sợi, Ớt tabasco, t
- title: Canh ba màu | title: Canh ba màu | ingredients: K

In [8]:
# =========================
# Cell 4 — LOAD SBERT & ENCODE
# =========================
device = "cuda" if cfg.use_gpu else "cpu"
print("Loading SBERT model on", device, ":", cfg.sbert_model)
sbert = SentenceTransformer(cfg.sbert_model, device=device)

embeddings = None
if os.path.exists(cfg.embedding_path):
    try:
        tmp = np.load(cfg.embedding_path)
        if tmp.shape[0] == len(df):
            embeddings = tmp
            print("Found existing embeddings at", cfg.embedding_path, "shape:", embeddings.shape)
        else:
            print("Existing embeddings length != n_rows. Will re-encode.")
    except Exception as e:
        print("Could not load embeddings file (will re-encode). Error:", e)

if embeddings is None:
    texts = df["text"].astype(str).tolist()
    batch_size = cfg.batch_size
    all_embs = []
    print("Start encoding texts: n=", len(texts), ", batch_size=", batch_size)
    for i in tqdm(range(0, len(texts), batch_size), desc="Encoding"):
        batch = texts[i:i+batch_size]
        emb = sbert.encode(batch, convert_to_numpy=True, show_progress_bar=False)
        all_embs.append(emb)
    embeddings = np.vstack(all_embs)
    np.save(cfg.embedding_path, embeddings)
    print("Saved embeddings to", cfg.embedding_path)

print("Embeddings shape:", embeddings.shape)


Loading SBERT model on cpu : VoVanPhuc/sup-SimCSE-VietNamese-phobert-base


No sentence-transformers model found with name VoVanPhuc/sup-SimCSE-VietNamese-phobert-base. Creating a new one with mean pooling.


Found existing embeddings at data/encr/foods_embeddings.npy shape: (4000, 768)
Embeddings shape: (4000, 768)


In [9]:
# xem thử dât embedding ròi
embs = np.load("data/encr/foods_embeddings.npy")
print("Shape:", embs.shape)   # (3000, 384)
print("First vector (10 dims):", embs[0][:20])   # in 10 số đầu
print("Second vector (10 dims):", embs[1][:20])


Shape: (4000, 768)
First vector (10 dims): [ 0.04742062  0.31501174 -0.13158861  0.1844102  -0.10268987  0.55493146
 -0.18508497 -0.25136575  0.25044173  0.11955011  0.02844164  0.00164126
  0.21102202 -0.05933519 -0.18255445 -0.6203158   0.4196148   0.0999219
 -0.04565507 -0.09049148]
Second vector (10 dims): [ 0.05434306  0.5268317  -0.1334398   0.14432225  0.08203457  0.47928613
 -0.03867362 -0.21033369  0.37205866  0.12104223 -0.4850851  -0.18961293
  0.1887839  -0.0619368  -0.18721479 -0.50146925  0.1427987   0.10388633
  0.21189083  0.03822605]


In [10]:


embs = np.load("data/encr/foods_embeddings.npy")
df_embs = pd.DataFrame(embs)
df_embs.to_csv("data/encr/foods_embeddings.csv", index=False)
print("Đã xuất embeddings ra data/encr/foods_embeddings.csv")

Đã xuất embeddings ra data/encr/foods_embeddings.csv


In [11]:
# =========================
# Cell 5 — NORMALIZE & BUILD FAISS INDEX
# =========================
faiss.normalize_L2(embeddings)
d = embeddings.shape[1]
print("Embedding dimension d =", d)

try:
    index = faiss.IndexHNSWFlat(d, cfg.hnsw_m)
    index.hnsw.efConstruction = cfg.hnsw_ef_construction
    index.hnsw.efSearch = cfg.hnsw_ef_search
    print("Using HNSW index")
except Exception as e:
    print("HNSW unavailable, fallback to IndexFlatIP. Error:", e)
    index = faiss.IndexFlatIP(d)

index.add(embeddings)
print("Index ntotal:", index.ntotal)

try:
    faiss.write_index(index, cfg.faiss_index_path)
    print("Saved FAISS index to", cfg.faiss_index_path)
except Exception as e:
    print("Warning: could not save FAISS index:", e)

Embedding dimension d = 768
Using HNSW index
Index ntotal: 4000
Saved FAISS index to data/encr/foods_faiss.index


In [12]:
# =========================
# Cell 6 — LOAD Cross-Encoder (optional)
# =========================
cross_encoder = None
try:
    print("Trying to load Cross-Encoder (optional):", cfg.cross_encoder_model)
    cross_encoder = CrossEncoder(cfg.cross_encoder_model, device=device)
    print("Cross-Encoder loaded")
except Exception as e:
    print("Cross-Encoder not loaded (this is OK). Error:", e)
    cross_encoder = None

Trying to load Cross-Encoder (optional): cross-encoder/ms-marco-MiniLM-L-6-v2
Cross-Encoder loaded


In [13]:
# =========================
# Cell 7 — RECOMMEND FUNCTION
# =========================
def recommend(query: str,
              model: SentenceTransformer,
              index: faiss.Index,
              df: pd.DataFrame,
              top_k: int = 10,
              rerank_top_n: int = 50,
              cross_encoder: Optional[CrossEncoder] = None) -> pd.DataFrame:
    qv = model.encode([query], convert_to_numpy=True)
    faiss.normalize_L2(qv)
    search_k = min(top_k if cross_encoder is None else max(top_k, rerank_top_n), index.ntotal)
    D, I = index.search(qv, search_k)
    ids = I[0]
    scores = D[0]
    results = df.reset_index(drop=True).loc[ids].copy()
    results["vector_score"] = scores
    if cross_encoder is not None:
        pairs = [[query, t] for t in results["text"].astype(str).tolist()]
        rerank_scores = cross_encoder.predict(pairs)
        results["rerank_score"] = rerank_scores
        results = results.sort_values("rerank_score", ascending=False).head(top_k)
    else:
        results = results.sort_values("vector_score", ascending=False).head(top_k)
    return results

In [23]:

sample_queries = [
    "tôi muôn ăn trái cây dĩa đá bào",
]

for q in sample_queries:
    print("\n=== Query:", q)
    res = recommend(q, sbert, index, df, top_k=20, rerank_top_n=30, cross_encoder=cross_encoder)
    display_cols = ["dish_name"] if "dish_name" in res.columns else ["text"]
    display_cols += ["vector_score"]
    if "rerank_score" in res.columns:
        display_cols += ["rerank_score"]
    print(res[display_cols].to_string(index=False))


=== Query: tôi muôn ăn trái cây dĩa đá bào
                         dish_name  vector_score  rerank_score
              Đà điểu xốt trái cây      0.943030      5.778545
            Trái Cây Trộn Sữa Chua      0.974560      5.293700
                Đu đủ trộn gà khìa      0.998812      3.741148
                      Trà Trái Cây      1.004556      3.067932
                         Bánh Xoài      1.016626      2.848240
                      Sữa Chua Mít      0.966799      2.736762
                    Sữa Chua Đu Đủ      1.005901      1.957336
                        Sinh Tố Bơ      0.979329      1.652818
                      Bia Úp Ngược      1.017239      1.310284
             Dâu Tươi Dầm Sữa Chua      0.943992      1.113546
                         Chuối Sấy      1.010029      0.557612
              Sinh Tố Táo Sữa Tươi      0.997739      0.350567
             Smoothies trà sữa dâu      0.972962      0.342537
  Smoothie Thanh Long Đỏ Nhiệt Đới      0.933523      0.113293
           

In [15]:
# =========================
# Cell 9 — SANITY CHECK / SAVE RESULTS
# =========================
all_rows = []
for q in sample_queries:
    r = recommend(q, sbert, index, df, top_k=5, rerank_top_n=30, cross_encoder=cross_encoder)
    for rank, row in enumerate(r.to_dict("records"), start=1):
        all_rows.append({
            "query": q,
            "rank": rank,
            "dish_name": row.get("dish_name") or row.get("title") or row.get("text")[:80],
            "vector_score": row.get("vector_score"),
            "rerank_score": row.get("rerank_score") if "rerank_score" in row else None
        })
pd.DataFrame(all_rows).to_csv("data/encr/recommendation_demo.csv", index=False)
print("Saved demo recommendations to /mnt/data/recommendation_demo.csv")

Saved demo recommendations to /mnt/data/recommendation_demo.csv


In [18]:
#loc cong tác
# 8. LỌC CỘNG TÁC (COLLABORATIVE FILTERING)
# =========================
def collaborative_candidates(user_row, users_df, foods_df, top_n=30):
    """
    Lọc cộng tác: lấy các món mà những user giống user hiện tại đã thích.
    """
    user_rated = user_row.get('danh_gia_mon', None)
    if pd.isna(user_rated) or not user_rated:
        return foods_df
    liked_dishes = [x.strip().lower() for x in str(user_rated).split(',') if x.strip()]
    similar_users = users_df[users_df['danh_gia_mon'].apply(
        lambda x: any(dish in str(x).lower() for dish in liked_dishes) if pd.notna(x) else False
    )]
    candidate_dishes = set()
    for dishes in similar_users['danh_gia_mon']:
        if pd.notna(dishes):
            for dish in str(dishes).split(','):
                candidate_dishes.add(dish.strip().lower())
    foods_candidates = foods_df[foods_df['dish_name'].str.lower().isin(candidate_dishes)]
    return foods_candidates if not foods_candidates.empty else foods_df

In [None]:
# =========================
# 9. TEST LỌC CỘNG TÁC
# =========================
# Đọc dữ liệu user
users_df = pd.read_csv("data/users_vietnamese.csv")
user = users_df.iloc[0]  # chọn user bất kỳ

# Lọc cộng tác lấy danh sách món ăn
candidates = collaborative_candidates(user, users_df, df, top_n=30)

# Recommend trên danh sách này
query = user['mon_yeu_thich'] if pd.notna(user['mon_yeu_thich']) else "món ngon"
res = recommend(query, sbert, index, candidates, top_k=10, rerank_top_n=20, cross_encoder=cross_encoder)
print(res[['dish_name', 'vector_score']].head(10))

                       dish_name  vector_score
1896     Mì Cay 7 Cấp Độ Cực Hot      0.666314
39              Mì xào giòn chay      0.841268
642   Gỏi hoa chuối thịt gà chay      0.817716
2813            Mì Quảng Ăn Chay      0.793400
1535                      Mì Bay      0.815184
1932      Mì Căn Xào Cà Ri Sả Ớt      0.828340
2104      Mì Gói Xào Rau Củ Chay      0.818670
2094        Mì Ý Xốt Cà Chua Nấm      0.845230
2042        Mì Ý Sốt Cà Chua Nấm      0.837598
1036            Mì tàu hũ cá hồi      0.843520
