## Utils

In [1]:
#################################################################################
import numpy as np
import re
import pandas as pd

# constants
OZ_TO_ML = 29.5735
OZ_TO_G  = 28.3495
L_TO_ML  = 1000.0
KG_TO_G  = 1000.0
LB_TO_G  = 453.59237


UNIT_MAP = {
    "oz": "oz", "ounce": "oz", "ounces": "oz",
    "fl oz": "fl_oz", "floz": "fl_oz", "fl.oz": "fl_oz",
    "ml": "ml", "l": "l", "litre": "l", "liter": "l",
    "g": "g", "kg": "kg",
    "pack": "pack", "count": "count", "ct": "count", "piece": "count"
}

def extract_value_and_unit(text):
    """
    Extract 'value' and 'unit' (explicit or implicit) from product text.
    Requires 'Value:' and 'Unit:' to appear on their own lines.
    """
    t = str(text).lower()
    out = {"value": np.nan, "unit": None, "has_value": 0, "has_unit": 0}

    # 1Ô∏è‚É£ Explicit "Value:" label at start of line
    m = re.search(r'(?m)^[ \t]*value:\s*(\d+(?:\.\d+)?)', t)
    if m:
        out["value"] = float(m.group(1))
        out["has_value"] = 1

    # 2Ô∏è‚É£ Explicit "Unit:" label at start of line
    m = re.search(r'(?m)^[ \t]*unit:\s*([^\n\r]*)', t)
    if m:
        unit_raw = m.group(1).strip()
        unit_clean = UNIT_MAP.get(unit_raw.replace('.', '').replace(' ', ''), unit_raw)
        out["unit"] = unit_clean
        out["has_unit"] = 1

    # 3Ô∏è‚É£ Implicit numeric + unit pattern (fallback)
    if not out["has_unit"]:
        m = re.search(r'(\d+(?:\.\d+)?)\s*(fl\.?\s?oz|ounce|ounces|oz|ml|g|kg|l)\b', t)
        if m:
            val = float(m.group(1))
            unit_raw = m.group(2).replace('.', '').replace(' ', '')
            out["value"] = val if np.isnan(out["value"]) else out["value"]
            out["unit"] = UNIT_MAP.get(unit_raw, unit_raw)
            out["has_unit"] = 1
            out["has_value"] = 1

    return out


def add_value_unit_features(df, text_col="catalog_content"):
    """
    Adds 'value_only', 'unit_only', 'has_value', and 'has_unit' columns.
    """
    extracted = df[text_col].fillna("").apply(extract_value_and_unit)
    extracted_df = pd.DataFrame(list(extracted))

    df["value"] = extracted_df["value"]
    df["unit"] = extracted_df["unit"]
    df["has_value"] = extracted_df["has_value"]
    df["has_unit"] = extracted_df["has_unit"]

    return df

UNIT_MAP_CLEAN = {
    # --- volume ---
    "ml": "ml", "millilitre": "ml", "milliliter": "ml", "mililitro": "ml", "ltr": "l", "l": "l", "liters": "l", "2.5 gal.": "gal",
    "fl_oz": "fl_oz", "fl ounce": "fl_oz", "fl oz": "fl_oz", "fluid ounce": "fl_oz", "fluid ounces": "fl_oz", "fluid ounce(s)": "fl_oz", "20 oz.": "oz",

    # --- weight ---
    "g": "g", "gram": "g", "grams": "g", "gramm": "g", "gr": "g", "grams(gm)": "g",
    "kg": "kg", "pound": "lb", "pounds": "lb", "lb": "lb",

    # --- count / packaging ---
    "pack": "pack", "packs": "pack", "per package": "pack", "per box": "pack",
    "count": "count", "ct": "count", "each": "count", "each / pack: 1": "count",
    "bag": "pack", "box": "pack", "box/12": "pack", "carton": "pack", "case": "pack",
    "bottle": "count", "bottles": "count", "jar": "count", "can": "count", "capsule": "count",
    "pouch": "count", "bucket": "count", "k-cups": "count", "ziplock bags": "count", "paper cupcake liners": "count", "tea bags": "count",

    # --- others (dimensional / irrelevant) ---
    "in": "in", "sq ft": "sq_ft", "foot": "ft", "cm/inch)": "cm_inch",

    # --- noise / invalid ---
    "none": None, "": None, "-": None, "---": None, "1": None, "24": None,
    "product_weight": None, "units": None,
    "1 pk. color(s): -black. product type: -permanent. pack quantity: -1. tip type: -chisel. dimensions: overall product weight: -0.06 lbs.": None,
    "comes as a single 0.1 oz stick for on-the-go use": None,
    "unit√†": None,
    "7,2 oz": "oz"
}

def normalize_unit(unit):
    if pd.isna(unit) or not isinstance(unit, str):
        return None
    u = unit.strip().lower()
    u = u.replace('.', '').replace('(', '').replace(')', '').replace(':', '').strip()
    return UNIT_MAP_CLEAN.get(u, u)  # fallback: return itself if not found

UNIT_FINAL_MAP = {
    "oz": "oz",
    "fl_oz": "fl_oz",
    "count": "count",
    "lb": "lb",
    "g": "g",
    "ml": "ml",
    "l": "l",
    "kg": "kg",
    "pack": "pack",
    "per carton": "pack",
    "sq_ft": "sq_ft",
    "ft": "ft",
    "in": "in",
    "8": None, # Map '8' to None
    "gramsgm": "g",
    None: None
}

UNIT_CATEGORY_MAP = {
    "oz": "weight",
    "lb": "weight",
    "g": "weight",
    "kg": "weight",
    "ml": "volume",
    "l": "volume",
    "fl_oz": "volume",
    "count": "count",
    "pack": "count",
    "sq_ft": "dimension",
    "ft": "dimension",
    "in": "dimension",
    None: "unknown"
}


def clean_catalog_text(text):
    """
    Removes lines starting with 'Value:' or 'Unit:' (case-insensitive) from catalog text.
    Only removes if they are at the beginning of a line.
    """
    pattern = r'(?im)^[ \t]*(value:.*|unit:.*)$'
    cleaned = re.sub(pattern, '', str(text))
    return cleaned.strip()

def qty_to_base(qty, unit_final, unit_category):
    """Return (qty_in_base, base_type) where base_type is 'ml', 'g', or 'count' or None"""
    if pd.isna(qty) or qty <= 0 or unit_final is None:
        return (np.nan, None)
    u = str(unit_final).lower()
    if unit_category == 'volume':
        if u in ('ml',):
            return (qty, 'ml')
        if u in ('l', 'ltr'):
            return (qty * L_TO_ML, 'ml')
        if u in ('fl_oz','floz','fl ounce','fluid ounce','fluid ounces'):
            return (qty * OZ_TO_ML, 'ml')
        if u == 'oz':  # ambiguous: treat by category; here category=volume so ml
            return (qty * OZ_TO_ML, 'ml')
    if unit_category == 'weight':
        if u in ('g','gram','grams','gr','gramsgm'):
            return (qty, 'g')
        if u in ('kg',):
            return (qty * KG_TO_G, 'g')
        if u in ('lb','pound','pounds'):
            return (qty * LB_TO_G, 'g')
        if u == 'oz':  # treat ounce as weight here
            return (qty * OZ_TO_G, 'g')
    if unit_category == 'count':
        return (qty, 'count')
    return (np.nan, None)

def feat_eng(data):
    """Applies feature engineering steps to the input DataFrame."""

    # 1) price_per_unit (raw)

    # 2) standardized base price (price per ml or per g)
    # Ensure 'unit_final' and 'unit_category' columns exist
    if 'unit_final' not in data.columns or 'unit_category' not in data.columns:
         raise ValueError("DataFrame must contain 'unit_final' and 'unit_category' columns before calling feat_eng.")

    qty_base = data.apply(lambda r: qty_to_base(r['value'], r['unit_final'], r['unit_category']), axis=1)
    data['qty_base'] = [q[0] for q in qty_base]
    data['base_type'] = [q[1] for q in qty_base]

    # 4) simple text features (use catalog_content_clean or catalog_content which you replaced)
    def text_feats(t):
        t = str(t)
        words = re.findall(r'\w+', t)
        word_count = len(words)
        char_count = len(t)
        avg_word_len = np.mean([len(w) for w in words]) if words else 0
        bullet_count = len(re.findall(r'bullet', t, flags=re.I))  # simple bullet marker
        digits = len(re.findall(r'\d', t))
        return pd.Series([word_count, char_count, avg_word_len, bullet_count, digits])

    # Ensure 'catalog_content' column exists
    if 'catalog_content' not in data.columns:
         raise ValueError("DataFrame must contain 'catalog_content' column before calling feat_eng.")

    data[['word_count','char_count','avg_word_len','bullet_count','num_digits']] = data['catalog_content'].apply(text_feats)

    # 5) keyword flags
    keywords = {
        'organic': 'is_organic',
        'gluten-free': 'is_gluten_free',
        'gluten free': 'is_gluten_free',
        'sugar-free': 'is_sugar_free',
        'sugar free': 'is_sugar_free',
        'vegan': 'is_vegan',
        'new': 'is_new',
        'pack': 'has_pack_word',
        'bundle': 'has_bundle'
    }
    for kw, col in keywords.items():
        data[col] = data['catalog_content'].str.contains(re.escape(kw), case=False, na=False).astype(int)

    return data # Return the modified DataFrame
def process_FE(data):
  data = add_value_unit_features(data)

  data["unit_normalized"] = data["unit"].apply(normalize_unit)
  data["unit_final"] = data["unit_normalized"].map(UNIT_FINAL_MAP) # Applied to data DataFrame
  data["unit_category"] = data["unit_final"].map(UNIT_CATEGORY_MAP).fillna("unknown") # Applied to data DataFrame
  data['catalog_content'] = data['catalog_content'].apply(clean_catalog_text)

  # Drop unneeded columns to save space
  cols_to_drop = ["unit", "unit_normalized"]
  data = data.drop(columns=cols_to_drop)
  data = feat_eng(data)
  return data


## Clip

In [1]:
!pip install git+https://github.com/openai/CLIP.git ftfy

Collecting git+https://github.com/openai/CLIP.git
  Cloning https://github.com/openai/CLIP.git to /tmp/pip-req-build-pm2poqgf
  Running command git clone --filter=blob:none --quiet https://github.com/openai/CLIP.git /tmp/pip-req-build-pm2poqgf
  Resolved https://github.com/openai/CLIP.git to commit dcba3cb2e2827b402d2701e7e1c7d9fed8a20ef1
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting ftfy
  Downloading ftfy-6.3.1-py3-none-any.whl.metadata (7.3 kB)
Downloading ftfy-6.3.1-py3-none-any.whl (44 kB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m44.8/44.8 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: clip
  Building wheel for clip (setup.py) ... [?25l[?25hdone
  Created wheel for clip: filename=clip-1.0-py3-none-any.whl size=1369490 sha256=93e63097752f75c2465afd39c6af40427c110ec452b9cf19d2b5bd0ea7978a06
  Stored in directory: /tmp/p

In [2]:
 #-------------------------
# IMPROVED PIPELINE: Add CLIP Text + Alignment
# -------------------------
import torch
import clip
import numpy as np
from tqdm import tqdm
import os
import pandas as pd
from google.colab import drive

# Mount Google Drive
drive.mount('/content/drive')

# Define base directory
base_dir = '/content/drive/MyDrive/amazon_ml_challenge'


Mounted at /content/drive


In [3]:
# #
# # -------------------------
# # Load and process data
# # -------------------------
# train_csv_path = os.path.join(base_dir, 'train.csv')
# data = pd.read_csv(train_csv_path)
# data = process_FE(data)

# # Impute missing values
# data['value'] = data['value'].fillna(-1)
# median_qty_base = data['qty_base'].median()
# data['qty_base'] = data['qty_base'].fillna(median_qty_base)
# data['base_type'] = data['base_type'].fillna('missing')
# data['unit_final'] = data['unit_final'].fillna('unknown')

# print("‚úÖ Data loaded and processed")

‚úÖ Data loaded and processed


In [6]:
# save_path_drive = os.path.join(base_dir, "processed_fe+textpre.csv")
# data.to_csv(save_path_drive, index=False)
# print(f"‚úÖ Processed data saved to: {save_path_drive}")

‚úÖ Processed data saved to: /content/drive/MyDrive/amazon_ml_challenge/processed_fe+textpre.csv


In [3]:
# Load processed data directly from Drive
processed_csv_path = os.path.join(base_dir, "processed_fe+textpre.csv")
data = pd.read_csv(processed_csv_path)
print(f"‚úÖ Processed data loaded from: {processed_csv_path}")
display(data.head())

‚úÖ Processed data loaded from: /content/drive/MyDrive/amazon_ml_challenge/processed_fe+textpre.csv


Unnamed: 0,sample_id,catalog_content,image_link,price,value,has_value,has_unit,unit_final,unit_category,qty_base,...,avg_word_len,bullet_count,num_digits,is_organic,is_gluten_free,is_sugar_free,is_vegan,is_new,has_pack_word,has_bundle
0,33127,"Item Name: La Victoria Green Taco Sauce Mild, ...",https://m.media-amazon.com/images/I/51mo8htwTH...,4.89,72.0,1,1,fl_oz,volume,2129.292,...,3.846154,0.0,3.0,0,0,0,0,0,1,0
1,198967,"Item Name: Salerno Cookies, The Original Butte...",https://m.media-amazon.com/images/I/71YtriIHAA...,13.12,32.0,1,1,oz,weight,907.184,...,5.184211,5.0,14.0,0,0,0,0,0,1,0
2,261251,"Item Name: Bear Creek Hearty Soup Bowl, Creamy...",https://m.media-amazon.com/images/I/51+PFEe-w-...,1.97,11.4,1,1,oz,weight,323.1843,...,4.25,5.0,9.0,0,0,0,0,0,1,0
3,55858,Item Name: Judee‚Äôs Blue Cheese Powder 11.25 oz...,https://m.media-amazon.com/images/I/41mu0HAToD...,30.34,11.25,1,1,oz,weight,318.931875,...,5.009615,5.0,13.0,0,0,0,0,0,0,0
4,292686,"Item Name: kedem Sherry Cooking Wine, 12.7 Oun...",https://m.media-amazon.com/images/I/41sA037+Qv...,66.49,12.0,1,1,count,count,12.0,...,4.041667,1.0,10.0,0,0,0,0,0,0,0


In [4]:
# -------------------------
# 1Ô∏è‚É£ Load image embeddings (already aligned to train.csv!)
# -------------------------
img_data = os.path.join(base_dir, "full_image_embeddings.npy")
img_embeddings = np.load(img_data)
img_tensor = torch.tensor(np.nan_to_num(img_embeddings, nan=0.0), dtype=torch.float)
print(f"‚úÖ Image embeddings loaded: {img_tensor.shape}")

# -------------------------
# 2Ô∏è‚É£ Load text+structured embeddings
# -------------------------
train_input_final_path = os.path.join(base_dir, 'train_input_final.pt')
train_text_embd_data = torch.load(train_input_final_path)

train_text_struct = train_text_embd_data['train_input']  # MiniLM + structured
sample_ids = train_text_embd_data['sample_ids']
targets = train_text_embd_data['targets']

print(f"‚úÖ Text+Structured loaded: {train_text_struct.shape}")

# -------------------------
# 2.5Ô∏è‚É£ VERIFY ORDER (important sanity check!)
# -------------------------
print("\nüîç Verifying alignment...")
print(f"Train CSV has {len(data)} rows")
print(f"Image embeddings has {len(img_tensor)} rows")
print(f"Sample IDs has {len(sample_ids)} entries")

# Check if sample_ids match train.csv order
if (data['sample_id'].values[:len(sample_ids)] == np.array(sample_ids)).all():
    print("‚úÖ Sample IDs are in train.csv order - perfect!")
    data_final = data.copy()
else:
    print("‚ö†Ô∏è  Sample IDs are NOT in train.csv order - reordering...")
    data_final = data.set_index('sample_id').loc[sample_ids].reset_index()


‚úÖ Image embeddings loaded: torch.Size([75000, 512])
‚úÖ Text+Structured loaded: torch.Size([75000, 403])

üîç Verifying alignment...
Train CSV has 75000 rows
Image embeddings has 75000 rows
Sample IDs has 75000 entries
‚úÖ Sample IDs are in train.csv order - perfect!


In [5]:

# -------------------------
# 3Ô∏è‚É£ Load CLIP model for text embeddings
# -------------------------
device = 'cuda' if torch.cuda.is_available() else 'cpu'
clip_model, _ = clip.load("ViT-B/32", device=device)
clip_model.eval()
print(f"‚úÖ CLIP model loaded on {device}")

# -------------------------
# 4Ô∏è‚É£ Extract CLIP text embeddings
# -------------------------
def compute_clip_text_embeddings(texts, clip_model, batch_size=256, device='cuda'):
    """Extract CLIP text embeddings"""
    all_embeds = []

    for i in tqdm(range(0, len(texts), batch_size), desc="CLIP Text Embeddings"):
        batch = texts[i:i+batch_size]
        batch_truncated = [str(t)[:300] for t in batch]
        text_tokens = clip.tokenize(batch_truncated, truncate=True).to(device)

        with torch.no_grad():
            features = clip_model.encode_text(text_tokens)
            features = features / features.norm(dim=-1, keepdim=True)

        all_embeds.append(features.cpu())

    return torch.cat(all_embeds, dim=0)

train_texts = data_final['catalog_content'].fillna("").tolist()

print("\nüöÄ Extracting CLIP text embeddings...")
clip_text_embeddings = compute_clip_text_embeddings(
    train_texts,
    clip_model,
    batch_size=256,
    device=device
)
print(f"‚úÖ CLIP text embeddings: {clip_text_embeddings.shape}")


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 338M/338M [00:01<00:00, 215MiB/s]


‚úÖ CLIP model loaded on cuda

üöÄ Extracting CLIP text embeddings...


CLIP Text Embeddings: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 293/293 [01:09<00:00,  4.22it/s]

‚úÖ CLIP text embeddings: torch.Size([75000, 512])





In [6]:

# -------------------------
# 5Ô∏è‚É£ Compute Image-Text Alignment (THE SECRET WEAPON!)
# -------------------------
def compute_alignment_features(image_embeds, text_embeds):
    """Image-text alignment catches listing quality issues"""
    min_len = min(image_embeds.shape[0], text_embeds.shape[0])
    image_embeds = image_embeds[:min_len]
    text_embeds = text_embeds[:min_len]

    # Cosine similarity (already L2 normalized)
    similarity = (image_embeds * text_embeds).sum(dim=1, keepdim=True)

    # Binary flags
    high_match = (similarity > 0.8).float()
    low_match = (similarity < 0.5).float()

    return torch.cat([similarity, high_match, low_match], dim=1)

print("\nüîó Computing image-text alignment...")
alignment_features = compute_alignment_features(img_tensor, clip_text_embeddings)
print(f"‚úÖ Alignment features: {alignment_features.shape}")

# Statistics
print(f"\nüìä Alignment Statistics:")
print(f"   Mean similarity: {alignment_features[:, 0].mean():.3f}")
print(f"   High matches (>0.8): {alignment_features[:, 1].sum().item():.0f} ({alignment_features[:, 1].mean()*100:.1f}%)")
print(f"   Low matches (<0.5): {alignment_features[:, 2].sum().item():.0f} ({alignment_features[:, 2].mean()*100:.1f}%)")

# -------------------------
# 6Ô∏è‚É£ Extract structured features (last columns from MiniLM+structured)
# -------------------------
structured_cols = [
    'value', 'has_value', 'has_unit',
    'unit_final', 'unit_category',
    'qty_base', 'base_type',
    'word_count', 'char_count', 'avg_word_len',
    'bullet_count', 'num_digits',
    'is_organic', 'is_gluten_free', 'is_sugar_free',
    'is_vegan', 'is_new', 'has_pack_word', 'has_bundle'
]
num_structured = len(structured_cols)  # ~19 after encoding

structured_features = train_text_struct[:, -num_structured:]
print(f"\nüì¶ Extracted structured features: {structured_features.shape}")

# -------------------------
# 7Ô∏è‚É£ Combine everything
# -------------------------
min_samples = min(
    img_tensor.shape[0],
    clip_text_embeddings.shape[0],
    alignment_features.shape[0],
    structured_features.shape[0]
)

full_input = torch.cat([
    img_tensor[:min_samples],              # 512-dim
    clip_text_embeddings[:min_samples],    # 512-dim
    alignment_features[:min_samples],      # 3-dim
    structured_features[:min_samples]      # ~19-dim
], dim=1)

targets = targets[:min_samples]

print(f"\n‚úÖ FINAL Combined input: {full_input.shape}")
print(f"   - CLIP Image: 512 dims")
print(f"   - CLIP Text: 512 dims")
print(f"   - Alignment: 3 dims")
print(f"   - Structured: {structured_features.shape[1]} dims")
print(f"   - TOTAL: {full_input.shape[1]} dims")



üîó Computing image-text alignment...
‚úÖ Alignment features: torch.Size([75000, 3])

üìä Alignment Statistics:
   Mean similarity: 0.207
   High matches (>0.8): 0 (0.0%)
   Low matches (<0.5): 75000 (100.0%)

üì¶ Extracted structured features: torch.Size([75000, 19])

‚úÖ FINAL Combined input: torch.Size([75000, 1046])
   - CLIP Image: 512 dims
   - CLIP Text: 512 dims
   - Alignment: 3 dims
   - Structured: 19 dims
   - TOTAL: 1046 dims


In [7]:

# -------------------------
# 8Ô∏è‚É£ Train/Val Split
# -------------------------
from sklearn.model_selection import train_test_split

train_input_np, val_input_np, train_targets_np, val_targets_np = train_test_split(
    full_input.numpy(), targets.numpy(), test_size=0.2, random_state=42
)

train_input = torch.tensor(train_input_np, dtype=torch.float)
val_input = torch.tensor(val_input_np, dtype=torch.float)
train_targets = torch.tensor(train_targets_np, dtype=torch.float)
val_targets = torch.tensor(val_targets_np, dtype=torch.float)

print(f"\n‚úÖ Train: {train_input.shape}, Val: {val_input.shape}")

# -------------------------
# 9Ô∏è‚É£ Save
# -------------------------
save_dir = os.path.join(base_dir, "combined_CLIP_final")
os.makedirs(save_dir, exist_ok=True)
save_path = os.path.join(save_dir, "clip_full_with_alignment.pt")

torch.save({
    "train_input": train_input,
    "val_input": val_input,
    "train_targets": train_targets,
    "val_targets": val_targets
}, save_path)

print(f"\n‚úÖ Saved to: {save_path}")


‚úÖ Train: torch.Size([60000, 1046]), Val: torch.Size([15000, 1046])

‚úÖ Saved to: /content/drive/MyDrive/amazon_ml_challenge/combined_CLIP_final/clip_full_with_alignment.pt


In [9]:
# =========================================================================
# Load the CLIP-enhanced data
# =========================================================================
import torch
import torch.nn as nn
import numpy as np
from torch.utils.data import DataLoader, TensorDataset
from sklearn.metrics import mean_absolute_error, mean_squared_error
from tqdm import tqdm

device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Load your saved CLIP data
# save_path = '/content/drive/MyDrive/amazon_ml_challenge/combined_CLIP_final/clip_full_with_alignment.pt'
# data = torch.load(save_path)

# train_input = data['train_input'].to(device)
# val_input = data['val_input'].to(device)
# train_targets = data['train_targets'].to(device)
# val_targets = data['val_targets'].to(device)

print(f"‚úÖ Data loaded")
print(f"Train: {train_input.shape}, Val: {val_input.shape}")
print(f"Input dimensions: {train_input.shape[1]}")

# =========================================================================
# Model Definition (Updated for 1046 dims)
# =========================================================================
class RegressionMLP(nn.Module):
    def __init__(self, input_dim):
        super().__init__()
        self.model = nn.Sequential(
            nn.Linear(input_dim, 1024),
            nn.ReLU(),
            nn.BatchNorm1d(1024),
            nn.Dropout(0.3),

            nn.Linear(1024, 512),
            nn.ReLU(),
            nn.BatchNorm1d(512),
            nn.Dropout(0.3),

            nn.Linear(512, 256),
            nn.ReLU(),
            nn.BatchNorm1d(256),
            nn.Dropout(0.2),

            nn.Linear(256, 1)
        )

    def forward(self, x):
        return self.model(x)

# =========================================================================
# Ensemble Prediction Function
# =========================================================================
def ensemble_predict(models, x):
    """Average predictions from multiple models"""
    predictions = []
    for model in models:
        model.eval()
        with torch.no_grad():
            pred = model(x)
            predictions.append(pred)
    return torch.stack(predictions).mean(dim=0)

# =========================================================================
# Training: Large Ensemble (10 models)
# =========================================================================
print("\n" + "="*70)
print("üöÄ Training Large Ensemble (10 models with CLIP features)")
print("="*70)

models_large = []
results = {}

# For comparison with original
val_true_orig = np.expm1(val_targets.cpu().numpy().squeeze())

for seed in range(10):
    torch.manual_seed(seed)
    np.random.seed(seed)

    print(f"\nüîÑ Training model {seed+1}/10 (seed={seed})...")

    model = RegressionMLP(train_input.shape[1]).to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3, weight_decay=1e-5)
    criterion = nn.SmoothL1Loss()

    train_loader = DataLoader(
        TensorDataset(train_input, train_targets),
        batch_size=256,
        shuffle=True
    )

    # Training loop with progress bar
    for epoch in range(15):
        model.train()
        epoch_loss = 0

        for xb, yb in train_loader:
            xb, yb = xb.to(device), yb.to(device)
            optimizer.zero_grad()
            preds = model(xb)
            loss = criterion(preds, yb)
            loss.backward()
            optimizer.step()
            epoch_loss += loss.item()

        # Print progress every 5 epochs
        if (epoch + 1) % 5 == 0:
            avg_loss = epoch_loss / len(train_loader)
            print(f"   Epoch {epoch+1}/15: Loss = {avg_loss:.4f}")

    models_large.append(model)
    print(f"‚úÖ Model {seed+1}/10 complete")


‚úÖ Data loaded
Train: torch.Size([60000, 1046]), Val: torch.Size([15000, 1046])
Input dimensions: 1046

üöÄ Training Large Ensemble (10 models with CLIP features)

üîÑ Training model 1/10 (seed=0)...
   Epoch 5/15: Loss = 0.2696
   Epoch 10/15: Loss = 0.2202
   Epoch 15/15: Loss = 0.1853
‚úÖ Model 1/10 complete

üîÑ Training model 2/10 (seed=1)...
   Epoch 5/15: Loss = 0.2694
   Epoch 10/15: Loss = 0.2211
   Epoch 15/15: Loss = 0.1848
‚úÖ Model 2/10 complete

üîÑ Training model 3/10 (seed=2)...
   Epoch 5/15: Loss = 0.2695
   Epoch 10/15: Loss = 0.2197
   Epoch 15/15: Loss = 0.1875
‚úÖ Model 3/10 complete

üîÑ Training model 4/10 (seed=3)...
   Epoch 5/15: Loss = 0.2655
   Epoch 10/15: Loss = 0.2186
   Epoch 15/15: Loss = 0.1822
‚úÖ Model 4/10 complete

üîÑ Training model 5/10 (seed=4)...
   Epoch 5/15: Loss = 0.2698
   Epoch 10/15: Loss = 0.2188
   Epoch 15/15: Loss = 0.1828
‚úÖ Model 5/10 complete

üîÑ Training model 6/10 (seed=5)...
   Epoch 5/15: Loss = 0.2691
   Epoch 10/1

RuntimeError: Expected all tensors to be on the same device, but got mat1 is on cpu, different from other tensors on cuda:0 (when checking argument in method wrapper_CUDA_addmm)

take 2

In [16]:
import torch
import torch.nn as nn
import numpy as np
from torch.utils.data import DataLoader, TensorDataset
from sklearn.metrics import mean_absolute_error, mean_squared_error

device = 'cuda' if torch.cuda.is_available() else 'cpu'
EPOCHS = 30
BATCH = 256
MAX_GRAD_NORM = 1.0
max_price = 4000
max_log_value = np.log1p(max_price)

models_large = []
results = {}

# data loaders (use pin_memory and workers for GPU)
train_loader = DataLoader(
    TensorDataset(train_input, train_targets),
    batch_size=BATCH,
    shuffle=True,
    pin_memory=False,
    num_workers=0
)
val_loader = DataLoader(
    TensorDataset(val_input, val_targets),
    batch_size=BATCH,
    shuffle=False,
    pin_memory=False,
    num_workers= 0
)

for seed in range(10):
    torch.manual_seed(seed)
    np.random.seed(seed)
    if device == 'cuda':
        torch.cuda.manual_seed_all(seed)

    print(f"\nüîÑ Training model {seed+1}/10 (seed={seed})...")

    model = RegressionMLP(train_input.shape[1]).to(device)
    optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3, weight_decay=1e-5)
    criterion = nn.SmoothL1Loss()
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=3)

    best_val_loss = float('inf')
    best_state = None

    scaler = torch.cuda.amp.GradScaler(enabled=(device == 'cuda'))

    for epoch in range(EPOCHS):
        # ---------- train ----------
        model.train()
        train_loss = 0.0
        for xb, yb in train_loader:
            xb = xb.to(device, non_blocking=True)
            yb = yb.to(device, non_blocking=True)

            optimizer.zero_grad()
            with torch.cuda.amp.autocast(enabled=(device == 'cuda')):
                preds = model(xb)
                loss = criterion(preds, yb)

            scaler.scale(loss).backward()
            # gradient clipping (unscale first)
            scaler.unscale_(optimizer)
            torch.nn.utils.clip_grad_norm_(model.parameters(), MAX_GRAD_NORM)

            scaler.step(optimizer)
            scaler.update()

            train_loss += loss.item() * xb.size(0)

        train_loss = train_loss / len(train_loader.dataset)

        # ---------- validate ----------
        model.eval()
        val_loss = 0.0
        all_preds = []
        all_targets = []
        with torch.no_grad():
            for xb, yb in val_loader:
                xb = xb.to(device, non_blocking=True)
                yb = yb.to(device, non_blocking=True)

                with torch.cuda.amp.autocast(enabled=(device == 'cuda')):
                    preds = model(xb)
                    loss = criterion(preds, yb)

                val_loss += loss.item() * xb.size(0)

                # keep predictions (for potential debug)
                all_preds.append(preds.cpu())
                all_targets.append(yb.cpu())

        val_loss = val_loss / len(val_loader.dataset)
        scheduler.step(val_loss)

        # save best model (based on val loss)
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            best_state = model.state_dict()

        if (epoch + 1) % 5 == 0 or epoch == 0:
            print(f"   Epoch {epoch+1}/{EPOCHS}: train_loss={train_loss:.4f}  val_loss={val_loss:.4f}")

    # After epochs: load best_state if available
    if best_state is not None:
        model.load_state_dict(best_state)

    models_large.append(model)
    print(f"‚úÖ Model {seed+1}/10 complete (best val_loss={best_val_loss:.4f})")



üîÑ Training model 1/10 (seed=0)...


  scaler = torch.cuda.amp.GradScaler(enabled=(device == 'cuda'))
  with torch.cuda.amp.autocast(enabled=(device == 'cuda')):
  with torch.cuda.amp.autocast(enabled=(device == 'cuda')):


   Epoch 1/30: train_loss=0.9009  val_loss=0.3071
   Epoch 5/30: train_loss=0.2590  val_loss=0.2572
   Epoch 10/30: train_loss=0.2095  val_loss=0.2422
   Epoch 15/30: train_loss=0.1607  val_loss=0.2503
   Epoch 20/30: train_loss=0.1205  val_loss=0.2469
   Epoch 25/30: train_loss=0.1040  val_loss=0.2470
   Epoch 30/30: train_loss=0.0961  val_loss=0.2451
‚úÖ Model 1/10 complete (best val_loss=0.2422)

üîÑ Training model 2/10 (seed=1)...
   Epoch 1/30: train_loss=0.8922  val_loss=0.3153
   Epoch 5/30: train_loss=0.2595  val_loss=0.2508
   Epoch 10/30: train_loss=0.2106  val_loss=0.2459
   Epoch 15/30: train_loss=0.1676  val_loss=0.2461
   Epoch 20/30: train_loss=0.1177  val_loss=0.2425
   Epoch 25/30: train_loss=0.0999  val_loss=0.2444
   Epoch 30/30: train_loss=0.0903  val_loss=0.2516
‚úÖ Model 2/10 complete (best val_loss=0.2413)

üîÑ Training model 3/10 (seed=2)...
   Epoch 1/30: train_loss=0.9048  val_loss=0.3201
   Epoch 5/30: train_loss=0.2606  val_loss=0.2647
   Epoch 10/30: trai

In [18]:

print("\n" + "="*70)
print("üéØ Evaluating Ensemble Performance")
print("="*70)
val_input = val_input.to(device)
val_targets = val_targets.to(device)

max_price = 4000
max_log_value = np.log1p(max_price)

# Ensemble predictions
val_preds_raw = ensemble_predict(models_large, val_input)

# Clip in log-space
val_preds_clipped = torch.clamp(val_preds_raw, min=0, max=max_log_value)
val_preds_orig = np.expm1(val_preds_clipped.cpu().numpy().squeeze())

# Calculate metrics
mae_ensemble = mean_absolute_error(val_true_orig, val_preds_orig)
rmse_ensemble = np.sqrt(mean_squared_error(val_true_orig, val_preds_orig))

# Individual model predictions for comparison
print("\nüìä Individual Model Performance:")
individual_maes = []
for i, model in enumerate(models_large):
    model.eval()
    with torch.no_grad():
        preds = model(val_input)
        preds_clipped = torch.clamp(preds, min=0, max=max_log_value)
        preds_orig = np.expm1(preds_clipped.cpu().numpy().squeeze())
        mae = mean_absolute_error(val_true_orig, preds_orig)
        individual_maes.append(mae)
        if i < 3:  # Print first 3
            print(f"   Model {i+1}: ${mae:.2f}")

print(f"   ...")
print(f"   Model 10: ${individual_maes[-1]:.2f}")
print(f"   Individual MAE range: ${min(individual_maes):.2f} - ${max(individual_maes):.2f}")
print(f"   Individual MAE mean: ${np.mean(individual_maes):.2f}")

# Final results
print("\n" + "="*70)
print("üèÜ FINAL RESULTS")
print("="*70)
print(f"Previous (MiniLM + Image):     MAE = $12.16")
print(f"New (CLIP + Alignment):        MAE = ${mae_ensemble:.2f}")
print(f"                               RMSE = ${rmse_ensemble:.2f}")

improvement = ((12.16 - mae_ensemble) / 12.16) * 100
print(f"\n‚ú® Improvement: {improvement:+.1f}%")

if mae_ensemble < 12.16:
    print("üéâ SUCCESS! The CLIP features improved performance!")
else:
    print("‚ö†Ô∏è  Performance didn't improve as expected. Possible reasons:")
    print("   - Need more training epochs")
    print("   - Try different learning rate")
    print("   - Check if alignment features are working")

# =========================================================================
# Error Analysis by Price Range
# =========================================================================
print("\n" + "="*70)
print("üìä Error Analysis by Price Range")
print("="*70)

price_ranges = [
    (0, 10, "$0-$10"),
    (10, 20, "$10-$20"),
    (20, 50, "$20-$50"),
    (50, 100, "$50-$100"),
    (100, float('inf'), "$100+")
]

for low, high, label in price_ranges:
    mask = (val_true_orig >= low) & (val_true_orig < high)
    if mask.sum() > 0:
        range_mae = np.abs(val_true_orig[mask] - val_preds_orig[mask]).mean()
        count = mask.sum()
        pct = (count / len(val_true_orig)) * 100
        print(f"{label:12} | Count: {count:5} ({pct:5.1f}%) | MAE: ${range_mae:6.2f}")




üéØ Evaluating Ensemble Performance

üìä Individual Model Performance:
   Model 1: $12.50
   Model 2: $13.10
   Model 3: $12.32
   ...
   Model 10: $12.91
   Individual MAE range: $12.32 - $13.10
   Individual MAE mean: $12.67

üèÜ FINAL RESULTS
Previous (MiniLM + Image):     MAE = $12.16
New (CLIP + Alignment):        MAE = $12.18
                               RMSE = $46.24

‚ú® Improvement: -0.2%
‚ö†Ô∏è  Performance didn't improve as expected. Possible reasons:
   - Need more training epochs
   - Try different learning rate
   - Check if alignment features are working

üìä Error Analysis by Price Range
$0-$10       | Count:  5746 ( 38.3%) | MAE: $  5.24
$10-$20      | Count:  3837 ( 25.6%) | MAE: $  5.27
$20-$50      | Count:  3790 ( 25.3%) | MAE: $ 12.62
$50-$100     | Count:  1235 (  8.2%) | MAE: $ 35.41
$100+        | Count:   392 (  2.6%) | MAE: $104.31


In [None]:
# # =========================================================================
# # Save the Ensemble Models
# # =========================================================================
# print("\n" + "="*70)
# print("üíæ Saving Models")
# print("="*70)

# save_dir = '/content/drive/MyDrive/amazon_ml_challenge/ensemble_CLIP_models'
# import os
# os.makedirs(save_dir, exist_ok=True)

# for i, model in enumerate(models_large):
#     model_path = os.path.join(save_dir, f'clip_ensemble_model_{i}.pt')
#     torch.save(model.state_dict(), model_path)

# print(f"‚úÖ Saved 10 models to: {save_dir}")

# # Save metadata
# metadata = {
#     'mae': mae_ensemble,
#     'rmse': rmse_ensemble,
#     'individual_maes': individual_maes,
#     'improvement_vs_baseline': improvement,
#     'input_dim': train_input.shape[1],
#     'features': 'CLIP Image (512) + CLIP Text (512) + Alignment (3) + Structured (19)'
# }

# metadata_path = os.path.join(save_dir, 'ensemble_metadata.pt')
# torch.save(metadata, metadata_path)
# print(f"‚úÖ Saved metadata to: {metadata_path}")

# print("\n" + "="*70)
# print("‚úÖ ALL DONE! Ready for test predictions")
# print("="*70)

## Stacking/Meta-Learning on Top of Ensemble

In [5]:
# =========================================================================
# Stacking/Meta-Learning on Top of Ensemble
# Runs entirely on CPU - no GPU needed!
# =========================================================================
import torch
import torch.nn as nn # Import nn for RegressionMLP definition
import numpy as np
from sklearn.linear_model import Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
import pickle

device = 'cuda' if torch.cuda.is_available() else 'cpu'

# =========================================================================
# Model Definition (Same as training) - COPIED FROM ABOVE
# =========================================================================
class RegressionMLP(nn.Module):
    def __init__(self, input_dim):
        super().__init__()
        self.model = nn.Sequential(
            nn.Linear(input_dim, 1024),
            nn.ReLU(),
            nn.BatchNorm1d(1024),
            nn.Dropout(0.3),

            nn.Linear(1024, 512),
            nn.ReLU(),
            nn.BatchNorm1d(512),
            nn.Dropout(0.3),

            nn.Linear(512, 256),
            nn.ReLU(),
            nn.BatchNorm1d(256),
            nn.Dropout(0.2),

            nn.Linear(256, 1)
        )

    def forward(self, x):
        return self.model(x)


print("\n" + "="*70)
print("üß† STACKING META-MODEL TRAINING")
print("="*70)

# =========================================================================
# STEP 0: Load models from Drive (if not already in memory)
# =========================================================================
# Uncomment this section if starting fresh session:
print("\nüìÇ Loading base models from Drive...")
from google.colab import drive

# Mount Google Drive
# drive.mount('/content/drive') # Already mounted in previous cells

save_dir = '/content/drive/MyDrive/amazon_ml_challenge/ensemble_CLIP_models'
# Fix: Add weights_only=False to load metadata
metadata = torch.load(f'{save_dir}/ensemble_metadata.pt', weights_only=False)
input_dim = metadata['input_dim']

models_large = []
for i in range(10):
    # Fix: Add map_location=device for consistency, though models are loaded to GPU in original notebook
    model = RegressionMLP(input_dim).to(device)
    model.load_state_dict(torch.load(
        f'{save_dir}/clip_ensemble_model_{i}.pt',
        map_location=device,
        weights_only=False # Also add weights_only=False for model state dicts
    ))
    model.eval()
    models_large.append(model)
    print(f"  ‚úì Loaded model {i+1}/10")
print("‚úÖ All models loaded!")




üß† STACKING META-MODEL TRAINING

üìÇ Loading base models from Drive...
  ‚úì Loaded model 1/10
  ‚úì Loaded model 2/10
  ‚úì Loaded model 3/10
  ‚úì Loaded model 4/10
  ‚úì Loaded model 5/10
  ‚úì Loaded model 6/10
  ‚úì Loaded model 7/10
  ‚úì Loaded model 8/10
  ‚úì Loaded model 9/10
  ‚úì Loaded model 10/10
‚úÖ All models loaded!


In [7]:
import os
base_dir = '/content/drive/MyDrive/amazon_ml_challenge'
data_save_path = os.path.join(base_dir, "combined_CLIP_final", "clip_full_with_alignment.pt")

# Load data (train/val split)
try:
    data_loaded = torch.load(data_save_path, map_location=device, weights_only=False)
    # train_input = data_loaded['train_input'].to(device) # Assuming train_input is not needed here
    val_input = data_loaded['val_input'].to(device)
    # train_targets = data_loaded['train_targets'].to(device) # Assuming train_targets is not needed here
    val_targets = data_loaded['val_targets'].to(device)
    print(f"‚úÖ Validation data loaded from: {data_save_path}")
    print(f"   Val: {val_input.shape}")

except FileNotFoundError:
    print(f"‚ùå Error: Data file not found at {data_save_path}. Please ensure the file exists.")
    # You might want to exit or handle this error appropriately
    raise

‚úÖ Validation data loaded from: /content/drive/MyDrive/amazon_ml_challenge/combined_CLIP_final/clip_full_with_alignment.pt
   Val: torch.Size([15000, 1046])


In [9]:



# =========================================================================
# STEP 1: Generate predictions from all base models (on validation set)
# =========================================================================
print("\nüìä Generating base model predictions...")

# Data is already on the correct device after loading

# Collect individual predictions (in log-space first)
base_predictions_log = []
max_price = 4000
max_log_value = np.log1p(max_price)

for i, model in enumerate(models_large):
    model.eval()
    with torch.no_grad():
        pred = model(val_input)
        pred_clipped = torch.clamp(pred, min=0, max=max_log_value)
        base_predictions_log.append(pred_clipped.cpu().numpy().squeeze())
    print(f"  ‚úì Model {i+1}/10 predictions collected")

# Convert to numpy array: shape (10, num_samples)
base_predictions_log = np.array(base_predictions_log)
print(f"\n‚úÖ Base predictions shape: {base_predictions_log.shape}")

# =========================================================================
# STEP 2: Create meta-features
# =========================================================================
print("\nüîß Engineering meta-features...")

# Basic features: individual model predictions (in original scale)
base_preds_orig = np.expm1(base_predictions_log)  # (10, num_samples)

# Statistical features across models
meta_features = []

# 1. Individual predictions
for i in range(10):
    meta_features.append(base_preds_orig[i])

# 2. Ensemble statistics
meta_features.append(base_preds_orig.mean(axis=0))      # Mean
meta_features.append(base_preds_orig.std(axis=0))       # Std (confidence)
meta_features.append(base_preds_orig.min(axis=0))       # Min
meta_features.append(base_preds_orig.max(axis=0))       # Max
meta_features.append(base_preds_orig.max(axis=0) - base_preds_orig.min(axis=0))  # Range

# 3. Agreement metrics
# High variance = models disagree = less confident
coefficient_of_variation = base_preds_orig.std(axis=0) / (base_preds_orig.mean(axis=0) + 1e-6)
meta_features.append(coefficient_of_variation)

# 4. Percentiles
meta_features.append(np.percentile(base_preds_orig, 25, axis=0))  # Q1
meta_features.append(np.percentile(base_preds_orig, 75, axis=0))  # Q3

# Stack all features: (num_samples, num_features)
X_meta = np.column_stack(meta_features)
y_meta = np.expm1(val_targets.cpu().numpy().squeeze())

print(f"‚úÖ Meta-features shape: {X_meta.shape}")
print(f"   Features: 10 individual preds + {X_meta.shape[1]-10} statistical features")

# =========================================================================
# STEP 3: Train multiple meta-learners and select best
# =========================================================================
print("\n" + "="*70)
print("üèãÔ∏è Training Meta-Learners")
print("="*70)

meta_models = {}

# 1. Ridge Regression (L2 regularization)
print("\n1Ô∏è‚É£ Ridge Regression...")
ridge = Ridge(alpha=1.0)
ridge.fit(X_meta, y_meta)
ridge_pred = ridge.predict(X_meta)
ridge_mae = mean_absolute_error(y_meta, ridge_pred)
meta_models['ridge'] = ridge
print(f"   MAE: ${ridge_mae:.2f}")

# 2. Lasso Regression (L1 regularization, feature selection)
print("\n2Ô∏è‚É£ Lasso Regression...")
lasso = Lasso(alpha=0.1, max_iter=5000)
lasso.fit(X_meta, y_meta)
lasso_pred = lasso.predict(X_meta)
lasso_mae = mean_absolute_error(y_meta, lasso_pred)
meta_models['lasso'] = lasso
print(f"   MAE: ${lasso_mae:.2f}")
print(f"   Features selected: {np.sum(np.abs(lasso.coef_) > 1e-5)} / {len(lasso.coef_)}")

# 3. Random Forest (non-linear meta-learner)
print("\n3Ô∏è‚É£ Random Forest...")
rf = RandomForestRegressor(
    n_estimators=100,
    max_depth=5,
    min_samples_split=20,
    random_state=42,
    n_jobs=-1  # Use all CPU cores
)
rf.fit(X_meta, y_meta)
rf_pred = rf.predict(X_meta)
rf_mae = mean_absolute_error(y_meta, rf_pred)
meta_models['rf'] = rf
print(f"   MAE: ${rf_mae:.2f}")

# 4. Simple weighted average (baseline)
simple_avg = base_preds_orig.mean(axis=0)
simple_mae = mean_absolute_error(y_meta, simple_avg)
print(f"\nüìä Simple Average (baseline): ${simple_mae:.2f}")

# =========================================================================
# STEP 4: Select best meta-model
# =========================================================================
print("\n" + "="*70)
print("üèÜ META-MODEL COMPARISON")
print("="*70)

results = {
    'Simple Average': simple_mae,
    'Ridge': ridge_mae,
    'Lasso': lasso_mae,
    'Random Forest': rf_mae
}

for name, mae in sorted(results.items(), key=lambda x: x[1]):
    improvement = ((simple_mae - mae) / simple_mae) * 100
    symbol = "üéâ" if mae < simple_mae else "‚ö†Ô∏è"
    print(f"{symbol} {name:20} | MAE: ${mae:.2f} | Improvement: {improvement:+.2f}%")

best_name = min(results.items(), key=lambda x: x[1])[0]
best_model_key = best_name.lower().replace(' ', '_')
if best_model_key == 'simple_average':
    best_meta_model = None
else:
    # Handle case where key might not exactly match after replace (e.g., 'RandomForest' vs 'random_forest')
    # Find the actual key in meta_models
    actual_best_key = None
    for k in meta_models.keys():
        if k.replace('_', '') == best_model_key.replace('_', ''):
            actual_best_key = k
            break

    if actual_best_key:
        best_meta_model = meta_models[actual_best_key]
    else:
        best_meta_model = None # Should not happen if logic is correct


print(f"\n‚ú® Best meta-model: {best_name}")

# =========================================================================
# STEP 5: Analyze feature importance (if using tree-based model)
# =========================================================================
if best_name == 'Random Forest':
    print("\n" + "="*70)
    print("üîç FEATURE IMPORTANCE")
    print("="*70)

    feature_names = (
        [f"Model_{i+1}" for i in range(10)] +
        ['Mean', 'Std', 'Median', 'Min', 'Max', 'Range', 'CoefVar', 'Q1', 'Q3']
    )

    importances = rf.feature_importances_
    indices = np.argsort(importances)[::-1][:10]  # Top 10

    print("\nTop 10 Most Important Features:")
    for i, idx in enumerate(indices):
        print(f"  {i+1}. {feature_names[idx]:15} | Importance: {importances[idx]:.4f}")

# =========================================================================
# STEP 6: Error analysis by price range
# =========================================================================
print("\n" + "="*70)
print("üìä ERROR ANALYSIS BY PRICE RANGE")
print("="*70)

if best_meta_model is not None:
    best_pred = best_meta_model.predict(X_meta)
else:
    best_pred = simple_avg

price_ranges = [
    (0, 10, "$0-$10"),
    (10, 20, "$10-$20"),
    (20, 50, "$20-$50"),
    (50, 100, "$50-$100"),
    (100, float('inf'), "$100+")
]

print(f"\n{'Range':12} | {'Count':>5} | {'Simple Avg':>12} | {best_name:>12}")
print("-" * 70)

for low, high, label in price_ranges:
    mask = (y_meta >= low) & (y_meta < high)
    if mask.sum() > 0:
        simple_range_mae = np.abs(y_meta[mask] - simple_avg[mask]).mean()
        best_range_mae = np.abs(y_meta[mask] - best_pred[mask]).mean()
        count = mask.sum()

        improvement = simple_range_mae - best_range_mae
        symbol = "‚Üì" if improvement > 0 else "‚Üë"
        print(f"{label:12} | {count:5} | ${simple_range_mae:11.2f} | ${best_range_mae:11.2f} {symbol}")


üìä Generating base model predictions...
  ‚úì Model 1/10 predictions collected
  ‚úì Model 2/10 predictions collected
  ‚úì Model 3/10 predictions collected
  ‚úì Model 4/10 predictions collected
  ‚úì Model 5/10 predictions collected
  ‚úì Model 6/10 predictions collected
  ‚úì Model 7/10 predictions collected
  ‚úì Model 8/10 predictions collected
  ‚úì Model 9/10 predictions collected
  ‚úì Model 10/10 predictions collected

‚úÖ Base predictions shape: (10, 15000)

üîß Engineering meta-features...
‚úÖ Meta-features shape: (15000, 18)
   Features: 10 individual preds + 8 statistical features

üèãÔ∏è Training Meta-Learners

1Ô∏è‚É£ Ridge Regression...
   MAE: $13.11

2Ô∏è‚É£ Lasso Regression...


  model = cd_fast.enet_coordinate_descent(


   MAE: $13.08
   Features selected: 18 / 18

3Ô∏è‚É£ Random Forest...
   MAE: $12.59

üìä Simple Average (baseline): $12.28

üèÜ META-MODEL COMPARISON
‚ö†Ô∏è Simple Average       | MAE: $12.28 | Improvement: +0.00%
‚ö†Ô∏è Random Forest        | MAE: $12.59 | Improvement: -2.58%
‚ö†Ô∏è Lasso                | MAE: $13.08 | Improvement: -6.52%
‚ö†Ô∏è Ridge                | MAE: $13.11 | Improvement: -6.78%

‚ú® Best meta-model: Simple Average

üìä ERROR ANALYSIS BY PRICE RANGE

Range        | Count |   Simple Avg | Simple Average
----------------------------------------------------------------------
$0-$10       |  5746 | $       5.42 | $       5.42 ‚Üë
$10-$20      |  3837 | $       4.89 | $       4.89 ‚Üë
$20-$50      |  3790 | $      12.76 | $      12.76 ‚Üë
$50-$100     |  1235 | $      34.76 | $      34.76 ‚Üë
$100+        |   392 | $     109.57 | $     109.57 ‚Üë


## Unsupervised Category Discovery via Clustering

In [14]:
# =========================================================================
# Unsupervised Category Discovery via Clustering
# Use CLIP text embeddings to discover product categories
# =========================================================================
import torch
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
from collections import Counter
import seaborn as sns

print("\n" + "="*70)
print("üîç UNSUPERVISED CATEGORY DISCOVERY")
print("="*70)

# =========================================================================
# Load your CLIP text embeddings
# =========================================================================
# Assuming you have:
# clip_text_embeddings: (num_samples, 512) - CLIP text features
# train_texts: list of product descriptions
# targets: prices in log scale
base_dir = '/content/drive/MyDrive/amazon_ml_challenge'
clip_full_embeddings = torch.load("/content/drive/MyDrive/amazon_ml_challenge/combined_CLIP_final/clip_full_with_alignment.pt")
processed_csv_path = os.path.join(base_dir, "processed_fe+textpre.csv")
data = pd.read_csv(processed_csv_path)



üîç UNSUPERVISED CATEGORY DISCOVERY


In [15]:
train_texts=data.catalog_content
targets=data.price

In [None]:

print("\nüìä Data Info:")
print(f"Text embeddings shape: {clip_text_embeddings.shape}")
print(f"Number of samples: {len(train_texts)}")

# Convert to numpy for sklearn
X_text = clip_text_embeddings.cpu().numpy() if torch.is_tensor(clip_text_embeddings) else clip_text_embeddings
prices_orig = np.expm1(targets.cpu().numpy() if torch.is_tensor(targets) else targets)

# =========================================================================
# STEP 1: Determine optimal number of clusters
# =========================================================================
print("\n" + "="*70)
print("üìà Finding Optimal Number of Clusters")
print("="*70)

# Try different K values
K_range = range(5, 31, 5)  # Test 5, 10, 15, 20, 25, 30 clusters
inertias = []
silhouette_scores = []

from sklearn.metrics import silhouette_score

print("\nTesting different K values (this may take a few minutes)...")
for k in K_range:
    print(f"  Testing K={k}...", end=" ")
    kmeans = KMeans(n_clusters=k, random_state=42, n_init=10, max_iter=100)
    labels = kmeans.fit_predict(X_text)
    inertias.append(kmeans.inertia_)

    # Silhouette score (sample for speed if dataset is large)
    if len(X_text) > 10000:
        sample_idx = np.random.choice(len(X_text), 10000, replace=False)
        score = silhouette_score(X_text[sample_idx], labels[sample_idx])
    else:
        score = silhouette_score(X_text, labels)
    silhouette_scores.append(score)
    print(f"Silhouette: {score:.3f}")

# Plot elbow curve
print("\nüìä Elbow Method Analysis:")
for k, inertia, sil in zip(K_range, inertias, silhouette_scores):
    print(f"  K={k:2d} | Inertia: {inertia:12.0f} | Silhouette: {sil:.3f}")

# Suggest optimal K (highest silhouette score)
optimal_k = K_range[np.argmax(silhouette_scores)]
print(f"\n‚ú® Suggested K: {optimal_k} (highest silhouette score)")

# =========================================================================
# STEP 2: Perform clustering with optimal K
# =========================================================================
print("\n" + "="*70)
print(f"üéØ Clustering with K={optimal_k}")
print("="*70)

# KMeans clustering
kmeans = KMeans(n_clusters=optimal_k, random_state=42, n_init=20, max_iter=300)
cluster_labels = kmeans.fit_predict(X_text)

print(f"‚úÖ Clustering complete!")
print(f"\nCluster distribution:")
cluster_counts = Counter(cluster_labels)
for cluster_id in sorted(cluster_counts.keys()):
    count = cluster_counts[cluster_id]
    pct = (count / len(cluster_labels)) * 100
    print(f"  Cluster {cluster_id:2d}: {count:5d} samples ({pct:5.1f}%)")

# =========================================================================
# STEP 3: Analyze clusters - discover what they represent
# =========================================================================
print("\n" + "="*70)
print("üîç CLUSTER ANALYSIS")
print("="*70)

def analyze_cluster(cluster_id, texts, prices, labels, top_n=10):
    """Analyze what a cluster represents"""
    mask = labels == cluster_id
    cluster_texts = [texts[i] for i in range(len(texts)) if mask[i]]
    cluster_prices = prices[mask]

    print(f"\n{'='*70}")
    print(f"CLUSTER {cluster_id} - {mask.sum()} samples")
    print(f"{'='*70}")

    # Price statistics
    print(f"\nüí∞ Price Statistics:")
    print(f"  Mean: ${cluster_prices.mean():.2f}")
    print(f"  Median: ${np.median(cluster_prices):.2f}")
    print(f"  Min: ${cluster_prices.min():.2f}")
    print(f"  Max: ${cluster_prices.max():.2f}")
    print(f"  Std: ${cluster_prices.std():.2f}")

    # Sample texts (to manually identify category)
    print(f"\nüìù Sample Products (first {top_n}):")
    for i, idx in enumerate(np.where(mask)[0][:top_n]):
        text_preview = texts[idx][:100] + "..." if len(texts[idx]) > 100 else texts[idx]
        print(f"  {i+1}. [{prices[idx]:6.2f}] {text_preview}")

    # Common words (simple word frequency)
    from collections import Counter
    import re
    all_words = []
    for text in cluster_texts[:1000]:  # Sample for speed
        words = re.findall(r'\b[a-z]{3,}\b', str(text).lower())
        all_words.extend(words)

    # Filter stopwords
    stopwords = {'the', 'and', 'for', 'with', 'this', 'that', 'from', 'are', 'has'}
    word_counts = Counter([w for w in all_words if w not in stopwords])

    print(f"\nüî§ Top Keywords:")
    for word, count in word_counts.most_common(15):
        print(f"  {word:15} ({count})")

# Analyze first few clusters
print("\nüî¨ Analyzing clusters to discover categories...")
for cluster_id in range(min(5, optimal_k)):  # Analyze first 5 clusters
    analyze_cluster(cluster_id, train_texts, prices_orig, cluster_labels)

# =========================================================================
# STEP 4: Visualize clusters (2D projection)
# =========================================================================
print("\n" + "="*70)
print("üé® Visualizing Clusters")
print("="*70)

# Use PCA for quick 2D visualization
print("\nReducing to 2D with PCA...")
pca = PCA(n_components=2, random_state=42)
X_2d = pca.fit_transform(X_text)

# Plot
plt.figure(figsize=(14, 10))
scatter = plt.scatter(X_2d[:, 0], X_2d[:, 1],
                     c=cluster_labels,
                     cmap='tab20',
                     alpha=0.6,
                     s=10)
plt.colorbar(scatter, label='Cluster ID')
plt.title(f'Product Clusters (K={optimal_k}) - PCA Projection')
plt.xlabel('PC1')
plt.ylabel('PC2')
plt.tight_layout()
plt.savefig('/content/clusters_pca.png', dpi=150, bbox_inches='tight')
print("‚úÖ Visualization saved: /content/clusters_pca.png")

# =========================================================================
# STEP 5: Analyze price patterns by cluster
# =========================================================================
print("\n" + "="*70)
print("üí∞ PRICE ANALYSIS BY CLUSTER")
print("="*70)

cluster_price_stats = []
for cluster_id in range(optimal_k):
    mask = cluster_labels == cluster_id
    cluster_prices = prices_orig[mask]

    stats = {
        'cluster': cluster_id,
        'count': mask.sum(),
        'mean': cluster_prices.mean(),
        'median': np.median(cluster_prices),
        'std': cluster_prices.std(),
        'min': cluster_prices.min(),
        'max': cluster_prices.max()
    }
    cluster_price_stats.append(stats)

# Sort by mean price
cluster_price_stats.sort(key=lambda x: x['mean'])

print("\nClusters sorted by average price:")
print(f"{'Cluster':>8} | {'Count':>6} | {'Mean':>8} | {'Median':>8} | {'Std':>8}")
print("-" * 70)
for stats in cluster_price_stats:
    print(f"{stats['cluster']:8d} | {stats['count']:6d} | "
          f"${stats['mean']:7.2f} | ${stats['median']:7.2f} | ${stats['std']:7.2f}")

# =========================================================================
# STEP 6: Create category features for modeling
# =========================================================================
print("\n" + "="*70)
print("üèóÔ∏è CREATING CATEGORY FEATURES")
print("="*70)

# One-hot encode clusters
from sklearn.preprocessing import OneHotEncoder

cluster_onehot = np.zeros((len(cluster_labels), optimal_k))
for i, label in enumerate(cluster_labels):
    cluster_onehot[i, label] = 1

print(f"‚úÖ One-hot encoded clusters: {cluster_onehot.shape}")

# Alternative: Use cluster center distances as features
cluster_distances = kmeans.transform(X_text)  # Distance to each cluster center
print(f"‚úÖ Cluster distances: {cluster_distances.shape}")


In [None]:

# =========================================================================
# STEP 7: Save results
# =========================================================================
print("\n" + "="*70)
print("üíæ SAVING RESULTS")
print("="*70)

# Save cluster assignments
results = {
    'cluster_labels': cluster_labels,
    'cluster_onehot': cluster_onehot,
    'cluster_distances': cluster_distances,
    'kmeans_model': kmeans,
    'optimal_k': optimal_k,
    'cluster_price_stats': cluster_price_stats
}

save_path = '/content/drive/MyDrive/amazon_ml_challenge/cluster_results.pt'
torch.save(results, save_path)
print(f"‚úÖ Results saved to: {save_path}")

# =========================================================================
# STEP 8: How to use these features
# =========================================================================
print("\n" + "="*70)
print("üìñ HOW TO USE CLUSTER FEATURES")
print("="*70)

usage_guide = """
Three ways to use discovered categories:

1Ô∏è‚É£ ADD AS FEATURES (Simple):
   # Concatenate one-hot clusters to your existing features
   enhanced_input = torch.cat([
       img_tensor,              # 512
       clip_text_embeddings,    # 512
       alignment_features,      # 3
       structured_features,     # 19
       torch.FloatTensor(cluster_onehot)  # K clusters
   ], dim=1)

   # Now train with 1046 + K features

2Ô∏è‚É£ TRAIN SEPARATE MODELS PER CLUSTER (Advanced):
   for cluster_id in range(K):
       mask = cluster_labels == cluster_id
       X_cluster = train_input[mask]
       y_cluster = train_targets[mask]

       # Train specialist model for this category
       model_cluster = RegressionMLP(input_dim).to(device)
       train(model_cluster, X_cluster, y_cluster)

   # At inference: predict cluster, then use that model

3Ô∏è‚É£ CLUSTER-WEIGHTED LOSS (Hybrid):
   # Weight loss by cluster difficulty
   cluster_stds = [stats['std'] for stats in cluster_price_stats]
   weights = torch.FloatTensor([cluster_stds[label] for label in cluster_labels])

   loss = (weights * criterion(pred, target)).mean()
"""

print(usage_guide)

print("\n" + "="*70)
print("‚úÖ CLUSTERING COMPLETE!")
print("="*70)
print(f"\nüéØ Discovered {optimal_k} product categories")
print(f"üìä Ready to enhance your model with category features!")
print(f"üí° Next: Try approach #1 (add as features) or #2 (separate models)")