In [1]:
import os

# Show which datasets are mounted under the Kaggle input directory.
mounted_inputs = os.listdir("/kaggle/input")
print(mounted_inputs)

['instacart-online-grocery-basket-analysis-dataset']


In [2]:
# ============================================================================
# PRODUCT2VEC - COMPLETE KAGGLE NOTEBOOK
# ============================================================================
# This runs everything: data processing → training → export
# Takes well over 40 minutes total (Word2Vec training alone took ~38 min in the recorded run)
# ============================================================================

import pandas as pd
import numpy as np
import pickle
import json
import time
from gensim.models import Word2Vec

# Opening banner for the run log.
banner_rule = "=" * 60
print(banner_rule)
print("PRODUCT2VEC TRAINING PIPELINE")
print(banner_rule)

# ============================================================================
# STEP 1: LOAD AND PROCESS DATA
# ============================================================================
print("\n[STEP 1] Loading CSV files...")

# Root folder of the Instacart dataset inside the Kaggle input mount.
INPUT_PATH = "/kaggle/input/instacart-online-grocery-basket-analysis-dataset"

# Order lines: only the two columns needed to build baskets are read.
order_products_csv = INPUT_PATH + "/order_products__prior.csv"
order_products = pd.read_csv(order_products_csv, usecols=["order_id", "product_id"])
order_row_count = len(order_products)
print(f"  ✓ Loaded {order_row_count:,} order-product rows")

# Product catalogue: maps each product_id to its display name.
products_csv = INPUT_PATH + "/products.csv"
products = pd.read_csv(products_csv, usecols=["product_id", "product_name"])
product_count = len(products)
print(f"  ✓ Loaded {product_count:,} products")

# ============================================================================
# STEP 2: JOIN TABLES
# ============================================================================
print("\n[STEP 2] Joining tables...")

# Attach the product name to every order line. A left join keeps each order
# line even if its product_id has no match in the catalogue.
joined = order_products.merge(products, how="left", on="product_id")
joined_row_count = len(joined)
print(f"  ✓ Joined table: {joined_row_count:,} rows")

# ============================================================================
# STEP 3: CREATE BASKETS
# ============================================================================
print("\n[STEP 3] Creating baskets...")

start_time = time.time()
# Collapse the joined table into one list of product names per order_id;
# each list is later fed to Word2Vec as a "sentence".
baskets = joined.groupby("order_id")["product_name"].apply(list).tolist()
print(f"  ✓ Created {len(baskets):,} baskets in {time.time()-start_time:.1f}s")

# Filter baskets (keep size 2-100). A one-item basket has no neighbouring
# product to learn from; the upper bound of 100 is the author's cutoff.
baskets = [b for b in baskets if 2 <= len(b) <= 100]
print(f"  ✓ After filtering: {len(baskets):,} baskets")

# Show samples
print("\nSample baskets:")
for i, b in enumerate(baskets[:3]):
    # Deliberately NOT named `display`: in a notebook that name is IPython's
    # rich-output function, and rebinding it at cell level would shadow it
    # for every later cell in the kernel session.
    preview = b[:4] + ["..."] if len(b) > 4 else b
    print(f"  {i+1}: {preview}")

# ============================================================================
# STEP 4: TRAIN WORD2VEC
# ============================================================================
print("\n[STEP 4] Training Word2Vec...")

# Single source of truth for the hyperparameters: the log line below is
# rendered from this dict, so it cannot drift from what Word2Vec receives.
w2v_params = {
    "vector_size": 100,
    "window": 10,
    "min_count": 5,   # products seen fewer than 5 times are dropped from the vocabulary
    "negative": 10,   # number of negative samples per positive pair
    "sg": 1,          # Skip-gram
    "epochs": 10,
    "workers": 4,
    # NOTE: per the gensim docs a fixed seed only gives fully reproducible
    # vectors with workers=1 (and a fixed PYTHONHASHSEED); with 4 workers
    # results can vary slightly between runs.
    "seed": 42,
}
print(
    "  Config: vector_size={vector_size}, window={window}, "
    "min_count={min_count}, epochs={epochs}".format(**w2v_params)
)
# The recorded run of this notebook took 38.5 minutes for this step.
print("  This takes ~40 minutes with 4 workers...\n")

start_time = time.time()

model = Word2Vec(sentences=baskets, **w2v_params)

elapsed = time.time() - start_time
print(f"  ✓ Training complete in {elapsed:.1f}s ({elapsed/60:.1f} min)")
print(f"  ✓ Vocabulary: {len(model.wv):,} products")

# ============================================================================
# STEP 5: TEST THE MODEL
# ============================================================================
print("\n[STEP 5] Testing model...")

test_products = ["Banana", "Organic Whole Milk", "Bag of Organic Bananas"]

# Probe with the first candidate the model knows; if none of them made it
# into the vocabulary, fall back to the first vocabulary entry.
probe = next((name for name in test_products if name in model.wv), None)
if probe is None:
    probe = list(model.wv.index_to_key)[0]

print(f"\nSimilar to '{probe}':")
for neighbour, similarity in model.wv.most_similar(probe, topn=5):
    print(f"  • {neighbour}: {similarity:.3f}")

# ============================================================================
# STEP 6: EXPORT FOR WEB
# ============================================================================
print("\n[STEP 6] Exporting for web...")

# Export vectors: plain-list copy of every embedding, keyed by product name.
# It is not referenced again in the visible part of this notebook; kept in
# case a later cell consumes it.
vectors_dict = {product: model.wv[product].tolist() for product in model.wv.index_to_key}

# Export product list with categories
products_full = pd.read_csv(f"{INPUT_PATH}/products.csv")
aisles = pd.read_csv(f"{INPUT_PATH}/aisles.csv")
departments = pd.read_csv(f"{INPUT_PATH}/departments.csv")

products_full = products_full.merge(aisles, on="aisle_id", how="left")
products_full = products_full.merge(departments, on="department_id", how="left")

# Build a name -> (aisle, department) lookup once, instead of scanning the
# whole catalogue for every vocabulary entry.
#   * drop_duplicates(keep="first") takes the first catalogue row per name.
#   * reindex(columns=...) yields NaN columns if "aisle"/"department" are
#     absent, so such products fall through to "unknown" below.
catalog = (
    products_full
    .dropna(subset=["product_name"])
    .drop_duplicates(subset="product_name", keep="first")
    .set_index("product_name")
    .reindex(columns=["aisle", "department"])
)
category_lookup = {
    name: (aisle, department)
    for name, aisle, department in zip(catalog.index, catalog["aisle"], catalog["department"])
}

products_list = []
for product_name in model.wv.index_to_key:
    aisle, department = category_lookup.get(product_name, (None, None))
    products_list.append({
        "name": product_name,
        # Missing values become "unknown": json.dump would otherwise emit the
        # non-standard token NaN, which strict JSON parsers reject.
        "aisle": "unknown" if pd.isna(aisle) else aisle,
        "department": "unknown" if pd.isna(department) else department,
    })

print(f"  ✓ Prepared {len(products_list):,} products")

# Compute similarities (top 20 for each product)
print("  Computing similarities (this takes a few minutes)...")

product_names = list(model.wv.index_to_key)
total_products = len(product_names)
similarities = {}

for position, name in enumerate(product_names):
    # Progress line every 5,000 products.
    if position % 5000 == 0:
        print(f"    Processing {position:,}/{total_products:,}...")
    neighbours = model.wv.most_similar(name, topn=20)
    # Stored as [name, score] pairs with scores rounded to 4 decimals.
    similarities[name] = [[other, round(sim, 4)] for other, sim in neighbours]

print(f"  ✓ Computed similarities for {len(similarities):,} products")

# ============================================================================
# STEP 7: SAVE FILES
# ============================================================================
print("\n[STEP 7] Saving files...")

# Save to Kaggle output. The encoding is given explicitly so the files do not
# depend on the platform default.
with open("/kaggle/working/products.json", "w", encoding="utf-8") as f:
    json.dump(products_list, f)
print("  ✓ Saved products.json")

with open("/kaggle/working/similarities.json", "w", encoding="utf-8") as f:
    json.dump(similarities, f)
print("  ✓ Saved similarities.json")

# Metadata describing the exported model.
config = {
    "model_name": "Product2Vec",
    "num_products": len(model.wv),
    # Read from the trained model rather than hard-coded, so the exported
    # metadata stays correct if the training vector size is changed.
    "vector_dimensions": model.wv.vector_size,
    "training_baskets": len(baskets),
    "algorithm": "Skip-gram with Negative Sampling"
}
with open("/kaggle/working/config.json", "w", encoding="utf-8") as f:
    json.dump(config, f, indent=2)
print("  ✓ Saved config.json")

# ============================================================================
# DONE!
# ============================================================================
closing_rule = "=" * 60
print(f"\n{closing_rule}")
print("COMPLETE!")
print(closing_rule)

# Hand-off instructions; the leading and trailing empty entries reproduce the
# blank lines around the message.
next_steps = "\n".join([
    "",
    "Download these files from the 'Output' tab on the right:",
    "  • products.json",
    "  • similarities.json",
    "  • config.json",
    "",
    "Then put them in your website's web_data/ folder!",
    "",
])
print(next_steps)







  from google.cloud.aiplatform.utils import gcs_utils


PRODUCT2VEC TRAINING PIPELINE

[STEP 1] Loading CSV files...
  ✓ Loaded 32,434,489 order-product rows
  ✓ Loaded 49,688 products

[STEP 2] Joining tables...
  ✓ Joined table: 32,434,489 rows

[STEP 3] Creating baskets...
  ✓ Created 3,214,874 baskets in 61.2s
  ✓ After filtering: 3,058,106 baskets

Sample baskets:
  1: ['Organic Egg Whites', 'Michigan Organic Kale', 'Garlic Powder', 'Coconut Butter', '...']
  2: ['Total 2% with Strawberry Lowfat Greek Strained Yogurt', 'Unsweetened Almondmilk', 'Lemons', 'Organic Baby Spinach', '...']
  3: ['Plain Pre-Sliced Bagels', 'Honey/Lemon Cough Drops', 'Chewy 25% Low Sugar Chocolate Chip Granola', 'Oats & Chocolate Chewy Bars', '...']

[STEP 4] Training Word2Vec...
  Config: vector_size=100, window=10, min_count=5, epochs=10
  This takes ~10-15 minutes...

  ✓ Training complete in 2312.9s (38.5 min)
  ✓ Vocabulary: 47,575 products

[STEP 5] Testing model...

Similar to 'Banana':
  • Organic Fuji Apple: 0.755
  • Unsweetened Original Almond Bree