## Embedding creation for test dataset

In [1]:
#################################################################################
import numpy as np
import re
import pandas as pd

# Unit-conversion factors used by qty_to_base to express quantities in ml or g.
OZ_TO_ML = 29.5735    # millilitres per (US) fluid ounce
OZ_TO_G  = 28.3495    # grams per ounce (weight)
L_TO_ML  = 1000.0     # millilitres per litre
KG_TO_G  = 1000.0     # grams per kilogram
LB_TO_G  = 453.59237  # grams per pound


# Raw unit spellings -> canonical unit tokens. Lookups are done after dots and
# spaces have been removed from the raw unit (e.g. "fl. oz" -> "floz").
UNIT_MAP = {
    "oz": "oz", "ounce": "oz", "ounces": "oz",
    "fl oz": "fl_oz", "floz": "fl_oz", "fl.oz": "fl_oz",
    "ml": "ml", "l": "l", "litre": "l", "liter": "l",
    "g": "g", "kg": "kg",
    "pack": "pack", "count": "count", "ct": "count", "piece": "count"
}


def extract_value_and_unit(text):
    """
    Extract a quantity value and its unit from product text.

    Three sources are tried, in this order:
      1. an explicit ``Value: <number>`` label at the start of a line,
      2. an explicit ``Unit: <text>`` label at the start of a line,
      3. if no explicit unit was found, the first ``<number> <unit>`` pattern
         anywhere in the text (e.g. "12 oz"); an explicit value, if present,
         is kept in preference to the number found here.

    Matching is case-insensitive (the text is lower-cased first).

    Args:
        text: Product text; converted with ``str()`` before matching.

    Returns:
        dict with keys ``value`` (float or NaN), ``unit`` (str or None),
        ``has_value`` (0/1) and ``has_unit`` (0/1).
    """
    t = str(text).lower()
    out = {"value": np.nan, "unit": None, "has_value": 0, "has_unit": 0}

    # 1. Explicit "Value:" label at start of line.
    # [^\S\r\n]* = horizontal whitespace only: the number must sit on the same
    # line as the label, so an empty "Value:" cannot swallow the next line.
    m = re.search(r'(?m)^[ \t]*value:[^\S\r\n]*(\d+(?:\.\d+)?)', t)
    if m:
        out["value"] = float(m.group(1))
        out["has_value"] = 1

    # 2. Explicit "Unit:" label at start of line (rest of that line only).
    m = re.search(r'(?m)^[ \t]*unit:[^\S\r\n]*([^\n\r]*)', t)
    if m:
        unit_raw = m.group(1).strip()
        # Unknown units are kept verbatim so later normalisation can handle them.
        unit_clean = UNIT_MAP.get(unit_raw.replace('.', '').replace(' ', ''), unit_raw)
        out["unit"] = unit_clean
        out["has_unit"] = 1

    # 3. Implicit numeric + unit pattern (fallback when no "Unit:" label exists).
    if not out["has_unit"]:
        m = re.search(r'(\d+(?:\.\d+)?)\s*(fl\.?\s?oz|ounce|ounces|oz|ml|g|kg|l)\b', t)
        if m:
            val = float(m.group(1))
            unit_raw = m.group(2).replace('.', '').replace(' ', '')
            out["value"] = val if np.isnan(out["value"]) else out["value"]
            out["unit"] = UNIT_MAP.get(unit_raw, unit_raw)
            out["has_unit"] = 1
            out["has_value"] = 1

    return out


def add_value_unit_features(df, text_col="catalog_content"):
    """
    Add 'value', 'unit', 'has_value' and 'has_unit' columns to ``df``.

    Each row of ``df[text_col]`` (missing text treated as "") is parsed with
    ``extract_value_and_unit``. The frame is modified in place and returned.

    Args:
        df: DataFrame containing the text column.
        text_col: Name of the column holding the product text.

    Returns:
        The same DataFrame object with the four feature columns added.
    """
    feature_cols = ["value", "unit", "has_value", "has_unit"]
    extracted = df[text_col].fillna("").apply(extract_value_and_unit)

    # Reuse df's index so the column assignment aligns row-for-row even when
    # df does not have a default 0..n-1 index (e.g. after filtering).
    # Explicit columns keep an empty input from raising KeyError.
    extracted_df = pd.DataFrame(list(extracted), index=df.index, columns=feature_cols)

    for col in feature_cols:
        df[col] = extracted_df[col]

    return df

# Hand-curated map from raw unit strings seen in the data to canonical tokens.
# A value of None marks the raw string as noise / not a usable unit.
UNIT_MAP_CLEAN = {
    # --- volume ---
    "ml": "ml", "millilitre": "ml", "milliliter": "ml", "mililitro": "ml", "ltr": "l", "l": "l", "liters": "l", "2.5 gal.": "gal",
    "fl_oz": "fl_oz", "fl ounce": "fl_oz", "fl oz": "fl_oz", "fluid ounce": "fl_oz", "fluid ounces": "fl_oz", "fluid ounce(s)": "fl_oz", "20 oz.": "oz",

    # --- weight ---
    "g": "g", "gram": "g", "grams": "g", "gramm": "g", "gr": "g", "grams(gm)": "g",
    "kg": "kg", "pound": "lb", "pounds": "lb", "lb": "lb",

    # --- count / packaging ---
    "pack": "pack", "packs": "pack", "per package": "pack", "per box": "pack",
    "count": "count", "ct": "count", "each": "count", "each / pack: 1": "count",
    "bag": "pack", "box": "pack", "box/12": "pack", "carton": "pack", "case": "pack",
    "bottle": "count", "bottles": "count", "jar": "count", "can": "count", "capsule": "count",
    "pouch": "count", "bucket": "count", "k-cups": "count", "ziplock bags": "count", "paper cupcake liners": "count", "tea bags": "count",

    # --- others (dimensional / irrelevant) ---
    "in": "in", "sq ft": "sq_ft", "foot": "ft", "cm/inch)": "cm_inch",

    # --- noise / invalid ---
    "none": None, "": None, "-": None, "---": None, "1": None, "24": None,
    "product_weight": None, "units": None,
    "1 pk. color(s): -black. product type: -permanent. pack quantity: -1. tip type: -chisel. dimensions: overall product weight: -0.06 lbs.": None,
    "comes as a single 0.1 oz stick for on-the-go use": None,
    "unit√†": None,
    "7,2 oz": "oz"
}

# Deletes the characters '.', '(', ')' and ':' in one pass.
_UNIT_PUNCT_TABLE = str.maketrans("", "", ".():")


def normalize_unit(unit):
    """
    Map a raw unit string to its canonical token using UNIT_MAP_CLEAN.

    The lower-cased, trimmed string is looked up verbatim first, so that map
    keys which themselves contain punctuation (e.g. "20 oz.", "grams(gm)",
    "each / pack: 1") can match. Previously the punctuation was always
    stripped before the lookup, which made those keys unreachable. If the
    verbatim lookup misses, '.', '(', ')' and ':' are removed and the lookup
    is retried; an unknown unit is returned in that stripped form.

    Args:
        unit: Raw unit value; anything that is not a str (None, NaN, numbers)
            is treated as missing.

    Returns:
        Canonical unit string, the stripped input if it is not in the map,
        or None for missing / noise units.
    """
    if not isinstance(unit, str):
        return None

    raw = unit.strip().lower()
    if raw in UNIT_MAP_CLEAN:
        return UNIT_MAP_CLEAN[raw]

    stripped = raw.translate(_UNIT_PUNCT_TABLE).strip()
    return UNIT_MAP_CLEAN.get(stripped, stripped)  # fallback: return itself if not found

# Whitelist applied after normalize_unit: accepted units map to themselves,
# a few leftovers are folded into an accepted unit or discarded (None).
# Any normalized unit that is not a key here has no entry in this map.
UNIT_FINAL_MAP = {
    **{unit: unit for unit in ("oz", "fl_oz", "count", "lb", "g", "ml", "l", "kg", "pack")},
    "per carton": "pack",
    **{unit: unit for unit in ("sq_ft", "ft", "in")},
    "8": None,  # a bare "8" is not a unit
    "gramsgm": "g",  # "grams(gm)" once its parentheses have been stripped
    None: None,
}

# Final unit token -> measurement category; a missing unit (None) is "unknown".
UNIT_CATEGORY_MAP = {
    **dict.fromkeys(("oz", "lb", "g", "kg"), "weight"),
    **dict.fromkeys(("ml", "l", "fl_oz"), "volume"),
    **dict.fromkeys(("count", "pack"), "count"),
    **dict.fromkeys(("sq_ft", "ft", "in"), "dimension"),
    None: "unknown",
}


# Matches a whole line that starts (after optional spaces/tabs) with
# "value:" or "unit:", case-insensitively.
_VALUE_UNIT_LINE_RE = re.compile(r'(?im)^[ \t]*(value:.*|unit:.*)$')


def clean_catalog_text(text):
    """
    Blank out lines starting with 'Value:' or 'Unit:' (case-insensitive).

    Only labels at the beginning of a line are affected; the same words in the
    middle of a line are left alone. The matched line's content is removed
    (its line break stays) and the result is stripped of leading/trailing
    whitespace.

    Missing input (None / NaN / pd.NA) yields "" instead of the literal words
    "None" / "nan", so missing descriptions do not turn into fake text.

    Args:
        text: Catalog text; non-string, non-missing values are converted
            with ``str()``.

    Returns:
        The cleaned text.
    """
    if not isinstance(text, str):
        if text is None or (pd.api.types.is_scalar(text) and pd.isna(text)):
            return ""
        text = str(text)
    return _VALUE_UNIT_LINE_RE.sub('', text).strip()

def qty_to_base(qty, unit_final, unit_category):
    """
    Convert a quantity to its base unit.

    Volumes are expressed in ml, weights in g, and counts are passed through.

    Args:
        qty: Numeric quantity.
        unit_final: Canonical unit token (compared case-insensitively).
        unit_category: 'volume', 'weight' or 'count'; anything else is
            not convertible.

    Returns:
        (qty_in_base, base_type) with base_type 'ml', 'g' or 'count', or
        (nan, None) when qty is missing or non-positive, the unit is None, or
        the unit/category pair is not recognised.
    """
    not_convertible = (np.nan, None)
    if pd.isna(qty) or qty <= 0 or unit_final is None:
        return not_convertible

    unit = str(unit_final).lower()

    if unit_category == 'count':
        return (qty, 'count')

    if unit_category == 'volume':
        if unit == 'ml':
            return (qty, 'ml')
        if unit in {'l', 'ltr'}:
            return (qty * L_TO_ML, 'ml')
        # A bare 'oz' is ambiguous; inside the volume category it is read as
        # a fluid ounce.
        if unit in {'fl_oz', 'floz', 'fl ounce', 'fluid ounce', 'fluid ounces', 'oz'}:
            return (qty * OZ_TO_ML, 'ml')
        return not_convertible

    if unit_category == 'weight':
        if unit in {'g', 'gram', 'grams', 'gr', 'gramsgm'}:
            return (qty, 'g')
        if unit == 'kg':
            return (qty * KG_TO_G, 'g')
        if unit in {'lb', 'pound', 'pounds'}:
            return (qty * LB_TO_G, 'g')
        if unit == 'oz':  # inside the weight category 'oz' is a weight ounce
            return (qty * OZ_TO_G, 'g')

    return not_convertible

def feat_eng(data):
    """
    Add engineered features to ``data`` in place and return it.

    Columns added:
      * ``qty_base`` / ``base_type``: ``value`` converted by ``qty_to_base``.
      * ``word_count``, ``char_count``, ``avg_word_len``, ``bullet_count``
        (occurrences of the word "bullet"), ``num_digits``: simple statistics
        of ``catalog_content``, stored as floats.
      * ``is_organic``, ``is_gluten_free``, ``is_sugar_free``, ``is_vegan``,
        ``is_new``, ``has_pack_word``, ``has_bundle``: 0/1 flags for
        case-insensitive substring matches in ``catalog_content``.

    Args:
        data: DataFrame with 'value', 'unit_final', 'unit_category' and
            'catalog_content' columns.

    Returns:
        The same DataFrame with the feature columns added.

    Raises:
        ValueError: If 'unit_final', 'unit_category' or 'catalog_content'
            is missing.
    """
    # Validate everything up front so a failure cannot leave `data` half-updated.
    if 'unit_final' not in data.columns or 'unit_category' not in data.columns:
        raise ValueError("DataFrame must contain 'unit_final' and 'unit_category' columns before calling feat_eng.")
    if 'catalog_content' not in data.columns:
        raise ValueError("DataFrame must contain 'catalog_content' column before calling feat_eng.")

    # 1) standardized base quantity (ml / g / count)
    qty_pairs = [
        qty_to_base(value, unit, category)
        for value, unit, category in zip(data['value'], data['unit_final'], data['unit_category'])
    ]
    data['qty_base'] = [qty for qty, _ in qty_pairs]
    data['base_type'] = [base for _, base in qty_pairs]

    # 2) simple text features
    def text_feats(t):
        t = str(t)
        words = re.findall(r'\w+', t)
        avg_word_len = np.mean([len(w) for w in words]) if words else 0
        bullet_count = len(re.findall(r'bullet', t, flags=re.I))  # simple bullet marker
        digits = len(re.findall(r'\d', t))
        return (len(words), len(t), avg_word_len, bullet_count, digits)

    text_cols = ['word_count', 'char_count', 'avg_word_len', 'bullet_count', 'num_digits']
    # One frame built from plain tuples avoids creating a Series per row;
    # dtype=float keeps the float columns the per-row Series approach produced.
    data[text_cols] = pd.DataFrame(
        [text_feats(t) for t in data['catalog_content']],
        index=data.index,
        columns=text_cols,
        dtype=float,
    )

    # 3) keyword flags. Every spelling of a keyword is OR-ed into one pattern;
    # assigning the column once per spelling would let the last spelling
    # overwrite the matches of the earlier ones.
    keyword_flags = {
        'is_organic': ('organic',),
        'is_gluten_free': ('gluten-free', 'gluten free'),
        'is_sugar_free': ('sugar-free', 'sugar free'),
        'is_vegan': ('vegan',),
        'is_new': ('new',),
        'has_pack_word': ('pack',),
        'has_bundle': ('bundle',),
    }
    for col, variants in keyword_flags.items():
        pattern = '|'.join(re.escape(variant) for variant in variants)
        data[col] = data['catalog_content'].str.contains(pattern, case=False, na=False).astype(int)

    return data  # Return the modified DataFrame
def process_FE(data):
    """
    Run the full feature-engineering pipeline on a raw catalog DataFrame.

    Steps: extract value/unit from 'catalog_content', normalise the unit and
    derive 'unit_final' / 'unit_category', blank out the Value:/Unit: lines in
    the text, drop the intermediate unit columns, then apply ``feat_eng``.

    Args:
        data: DataFrame with a 'catalog_content' column.

    Returns:
        DataFrame with the engineered feature columns.
    """
    data = add_value_unit_features(data)

    normalized_units = data["unit"].apply(normalize_unit)
    data["unit_normalized"] = normalized_units
    data["unit_final"] = data["unit_normalized"].map(UNIT_FINAL_MAP)

    unit_categories = data["unit_final"].map(UNIT_CATEGORY_MAP)
    data["unit_category"] = unit_categories.fillna("unknown")

    data['catalog_content'] = data['catalog_content'].apply(clean_catalog_text)

    # The raw and normalised unit columns are intermediates only.
    trimmed = data.drop(columns=["unit", "unit_normalized"])
    return feat_eng(trimmed)


In [1]:
!pip install git+https://github.com/openai/CLIP.git ftfy
import clip


Collecting git+https://github.com/openai/CLIP.git
  Cloning https://github.com/openai/CLIP.git to /tmp/pip-req-build-1h1doear
  Running command git clone --filter=blob:none --quiet https://github.com/openai/CLIP.git /tmp/pip-req-build-1h1doear
  Resolved https://github.com/openai/CLIP.git to commit dcba3cb2e2827b402d2701e7e1c7d9fed8a20ef1
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting ftfy
  Downloading ftfy-6.3.1-py3-none-any.whl.metadata (7.3 kB)
Downloading ftfy-6.3.1-py3-none-any.whl (44 kB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m44.8/44.8 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: clip
  Building wheel for clip (setup.py) ... [?25l[?25hdone
  Created wheel for clip: filename=clip-1.0-py3-none-any.whl size=1369490 sha256=167858d6b1aedcf7fec93d62fb38298f923aaba4faefaf066817e301abd72f8d
  Stored in directory: /tmp/p

In [2]:
 #-------------------------
# IMPROVED PIPELINE: Add CLIP Text + Alignment
# -------------------------
import torch
import numpy as np
from tqdm import tqdm
import os
import pandas as pd
from google.colab import drive

# Mount Google Drive so the project files are reachable under /content/drive
drive.mount('/content/drive')

# Base directory: all project inputs and outputs are resolved relative to it
base_dir = '/content/drive/MyDrive/amazon_ml_challenge'

# -------------------------
# Load and process data (disabled; a later cell loads the already processed CSV)
# NOTE: the variable is called train_csv_path but it points at test.csv.
# -------------------------
# train_csv_path = os.path.join(base_dir, 'test.csv')
# data = pd.read_csv(train_csv_path)


Mounted at /content/drive


In [3]:
# data = process_FE(data)

# # Impute missing values
# data['value'] = data['value'].fillna(-1)
# median_qty_base = data['qty_base'].median()
# data['qty_base'] = data['qty_base'].fillna(median_qty_base)
# data['base_type'] = data['base_type'].fillna('missing')
# data['unit_final'] = data['unit_final'].fillna('unknown')

# print("‚úÖ Data loaded and processed")

‚úÖ Data loaded and processed


In [4]:
# save_path_drive = os.path.join(base_dir, "processed_fe+textpre_test_dataset.csv")
# data.to_csv(save_path_drive, index=False)
# print(f"‚úÖ Processed data saved to: {save_path_drive}")

‚úÖ Processed data saved to: /content/drive/MyDrive/amazon_ml_challenge/processed_fe+textpre_test_dataset.csv


In [3]:
# Load the feature-engineered test set written by the (now commented-out) save cell.
processed_csv_path = os.path.join(base_dir, "processed_fe+textpre_test_dataset.csv")
data = pd.read_csv(processed_csv_path)
print(f"‚úÖ Processed data loaded from: {processed_csv_path}")
display(data.head())

‚úÖ Processed data loaded from: /content/drive/MyDrive/amazon_ml_challenge/processed_fe+textpre_test_dataset.csv


Unnamed: 0,sample_id,catalog_content,image_link,value,has_value,has_unit,unit_final,unit_category,qty_base,base_type,...,avg_word_len,bullet_count,num_digits,is_organic,is_gluten_free,is_sugar_free,is_vegan,is_new,has_pack_word,has_bundle
0,100179,Item Name: Rani 14-Spice Eshamaya's Mango Chut...,https://m.media-amazon.com/images/I/71hoAn78AW...,10.5,1,1,oz,weight,297.66975,g,...,4.687204,5.0,20.0,0,1,0,1,0,1,0
1,245611,Item Name: Natural MILK TEA Flavoring extract ...,https://m.media-amazon.com/images/I/61ex8NHCIj...,2.0,1,1,fl_oz,volume,59.147,ml,...,4.970696,6.0,9.0,0,1,0,0,1,1,0
2,146263,Item Name: Honey Filled Hard Candy - Bulk Pack...,https://m.media-amazon.com/images/I/61KCM61J8e...,32.0,1,1,oz,weight,907.184,g,...,4.685039,5.0,11.0,0,0,0,0,0,1,0
3,95658,Item Name: Vlasic Snack'mm's Kosher Dill 16 Oz...,https://m.media-amazon.com/images/I/51Ex6uOH7y...,2.0,1,1,count,count,2.0,count,...,3.307692,0.0,3.0,0,0,0,0,0,1,0
4,36806,"Item Name: McCormick Culinary Vanilla Extract,...",https://m.media-amazon.com/images/I/71QYlrOMoS...,32.0,1,1,fl_oz,volume,946.352,ml,...,5.101695,5.0,13.0,0,1,0,0,0,0,0


In [5]:
# -------------------------
# 1. Load precomputed image embeddings
# Presumably one row per CSV row, in CSV order (nothing here checks that) - TODO confirm.
# NOTE(review): `img_data` (a Drive path) is built but never used; np.load reads
# a different, hard-coded local file - confirm which file is intended.
# -------------------------
img_data = os.path.join(base_dir, "full_image_embeddings_testset.npy")
img_embeddings = np.load("/content/full_image_embeddings_test.npy")
# NaN entries are replaced with 0.0 before conversion to a float32 tensor.
img_tensor = torch.tensor(np.nan_to_num(img_embeddings, nan=0.0), dtype=torch.float)
print(f"‚úÖ Image embeddings loaded: {img_tensor.shape}")


‚úÖ Image embeddings loaded: torch.Size([75000, 512])


In [7]:

# -------------------------
# 2. Load text+structured embeddings
# The variables are named train_* but the file loaded is test_input_final.pt.
# -------------------------
train_input_final_path = os.path.join(base_dir, 'test_input_final.pt')
train_text_embd_data = torch.load(train_input_final_path)

# The feature tensor is stored under the key 'train_input' in this file.
train_text_struct = train_text_embd_data['train_input']  # MiniLM + structured
sample_ids = train_text_embd_data['sample_ids']



In [8]:


print(f"‚úÖ Text+Structured loaded: {train_text_struct.shape}")

# -------------------------
# 2.5 VERIFY ORDER (important sanity check!)
# The messages say "train.csv", but `data` is the processed test CSV loaded above.
# -------------------------
print("\nüîç Verifying alignment...")
print(f"Train CSV has {len(data)} rows")
print(f"Image embeddings has {len(img_tensor)} rows")
print(f"Sample IDs has {len(sample_ids)} entries")

# Check if sample_ids match the CSV order.
# Only the first len(sample_ids) CSV rows are compared; when they match,
# data_final keeps every row of `data`, including any beyond len(sample_ids).
if (data['sample_id'].values[:len(sample_ids)] == np.array(sample_ids)).all():
    print("‚úÖ Sample IDs are in train.csv order - perfect!")
    data_final = data.copy() # only used for getting text
else:
    print("‚ö†Ô∏è  Sample IDs are NOT in train.csv order - reordering...")
    # Reorder the CSV rows to follow sample_ids.
    data_final = data.set_index('sample_id').loc[sample_ids].reset_index()

‚úÖ Text+Structured loaded: torch.Size([75000, 403])

üîç Verifying alignment...
Train CSV has 75000 rows
Image embeddings has 75000 rows
Sample IDs has 75000 entries
‚úÖ Sample IDs are in train.csv order - perfect!


In [9]:

# -------------------------
# 3. Load CLIP model for text embeddings
# -------------------------
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# The second value returned by clip.load is discarded; only the model is needed here.
clip_model, _ = clip.load("ViT-B/32", device=device)
clip_model.eval()
print(f"‚úÖ CLIP model loaded on {device}")

# -------------------------
# 4Ô∏è‚É£ Extract CLIP text embeddings
# -------------------------
def compute_clip_text_embeddings(texts, clip_model, batch_size=256, device='cuda'):
    """
    Encode texts with the CLIP text encoder, batch by batch.

    Each text is cut to its first 300 characters and tokenized with
    truncation enabled. Each embedding is divided by its own norm and moved
    to the CPU.

    Args:
        texts: Sequence of texts (each is converted with ``str()``).
        clip_model: Model exposing ``encode_text``.
        batch_size: Number of texts per forward pass.
        device: Device the tokens are moved to before encoding.

    Returns:
        Tensor with one embedding row per input text.
    """
    chunks = []
    batch_starts = range(0, len(texts), batch_size)

    for start in tqdm(batch_starts, desc="CLIP Text Embeddings"):
        clipped = [str(text)[:300] for text in texts[start:start + batch_size]]
        tokens = clip.tokenize(clipped, truncate=True).to(device)

        with torch.no_grad():
            encoded = clip_model.encode_text(tokens)
            encoded = encoded / encoded.norm(dim=-1, keepdim=True)

        chunks.append(encoded.cpu())

    return torch.cat(chunks, dim=0)

# Texts to embed; missing catalog text becomes "". (Named train_* but taken from the test set.)
train_texts = data_final['catalog_content'].fillna("").tolist()

print("\nüöÄ Extracting CLIP text embeddings...")
clip_text_embeddings = compute_clip_text_embeddings(
    train_texts,
    clip_model,
    batch_size=256,
    device=device
)
print(f"‚úÖ CLIP text embeddings: {clip_text_embeddings.shape}")


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 338M/338M [00:02<00:00, 162MiB/s]


‚úÖ CLIP model loaded on cuda

üöÄ Extracting CLIP text embeddings...


CLIP Text Embeddings: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 293/293 [01:09<00:00,  4.21it/s]

‚úÖ CLIP text embeddings: torch.Size([75000, 512])





In [10]:

# -------------------------
# 5Ô∏è‚É£ Compute Image-Text Alignment (THE SECRET WEAPON!)
# -------------------------
def compute_alignment_features(image_embeds, text_embeds):
    """
    Build per-row image-text alignment features.

    Both inputs are cut to the shorter of the two along dim 0. The first
    output column is the row-wise dot product, which is the cosine similarity
    only if both inputs are L2-normalised (assumed, not checked here). The
    second and third columns are 0/1 flags for similarity > 0.8 and
    similarity < 0.5.

    Args:
        image_embeds: 2-D tensor of image embeddings.
        text_embeds: 2-D tensor of text embeddings with the same width.

    Returns:
        Tensor of shape (rows, 3): [similarity, high_match, low_match].
    """
    n_rows = min(image_embeds.shape[0], text_embeds.shape[0])
    img, txt = image_embeds[:n_rows], text_embeds[:n_rows]

    similarity = torch.sum(img * txt, dim=1, keepdim=True)
    flags = [similarity.gt(0.8).float(), similarity.lt(0.5).float()]

    return torch.cat([similarity, *flags], dim=1)

print("\nüîó Computing image-text alignment...")
alignment_features = compute_alignment_features(img_tensor, clip_text_embeddings)
print(f"‚úÖ Alignment features: {alignment_features.shape}")

# Statistics
print(f"\nüìä Alignment Statistics:")
print(f"   Mean similarity: {alignment_features[:, 0].mean():.3f}")
print(f"   High matches (>0.8): {alignment_features[:, 1].sum().item():.0f} ({alignment_features[:, 1].mean()*100:.1f}%)")
print(f"   Low matches (<0.5): {alignment_features[:, 2].sum().item():.0f} ({alignment_features[:, 2].mean()*100:.1f}%)")

# -------------------------
# 6. Extract structured features (last columns from MiniLM+structured)
# -------------------------
# Names of the structured columns; only the length of this list is used below.
structured_cols = [
    'value', 'has_value', 'has_unit',
    'unit_final', 'unit_category',
    'qty_base', 'base_type',
    'word_count', 'char_count', 'avg_word_len',
    'bullet_count', 'num_digits',
    'is_organic', 'is_gluten_free', 'is_sugar_free',
    'is_vegan', 'is_new', 'has_pack_word', 'has_bundle'
]
# NOTE(review): assumes each listed column occupies exactly one trailing column
# of train_text_struct (i.e. the categorical columns were not expanded into
# several columns) - confirm against the code that built that tensor.
num_structured = len(structured_cols)  # ~19 after encoding

# Take the last num_structured columns of the text+structured tensor.
structured_features = train_text_struct[:, -num_structured:]
print(f"\nüì¶ Extracted structured features: {structured_features.shape}")

# -------------------------
# 7. Combine everything
# -------------------------
# Truncate every block to the smallest row count so the column-wise concat is valid.
min_samples = min(
    img_tensor.shape[0],
    clip_text_embeddings.shape[0],
    alignment_features.shape[0],
    structured_features.shape[0]
)

# Column order of the final matrix: image | text | alignment | structured.
full_input = torch.cat([
    img_tensor[:min_samples],              # 512-dim
    clip_text_embeddings[:min_samples],    # 512-dim
    alignment_features[:min_samples],      # 3-dim
    structured_features[:min_samples]      # ~19-dim
], dim=1)

# Keep the ids for the same leading rows that were kept in full_input.
sample_ids_tmp = sample_ids[:min_samples]


print(f"\n‚úÖ FINAL Combined input: {full_input.shape}")
print(f"   - CLIP Image: 512 dims")
print(f"   - CLIP Text: 512 dims")
print(f"   - Alignment: 3 dims")
print(f"   - Structured: {structured_features.shape[1]} dims")
print(f"   - TOTAL: {full_input.shape[1]} dims")



üîó Computing image-text alignment...
‚úÖ Alignment features: torch.Size([75000, 3])

üìä Alignment Statistics:
   Mean similarity: 0.340
   High matches (>0.8): 0 (0.0%)
   Low matches (<0.5): 75000 (100.0%)

üì¶ Extracted structured features: torch.Size([75000, 19])

‚úÖ FINAL Combined input: torch.Size([75000, 1046])
   - CLIP Image: 512 dims
   - CLIP Text: 512 dims
   - Alignment: 3 dims
   - Structured: 19 dims
   - TOTAL: 1046 dims


AttributeError: 'list' object has no attribute 'shape'

In [11]:
print(f"{len(sample_ids_tmp)}")


75000


In [13]:

# Final float32 feature matrix and the matching sample ids for the test set.
test_input = torch.tensor(full_input.numpy(), dtype=torch.float)
# test_targets = torch.tensor(targets.numpy(), dtype=torch.float)
test_ids = torch.tensor(sample_ids_tmp)


# This tensor holds the test set, so the label says "Test" (it used to say "Train").
print(f"\n‚úÖ Test: {test_input.shape}")

# -------------------------
# 9. Save
# -------------------------
save_dir = os.path.join(base_dir, "combined_CLIP_final")
os.makedirs(save_dir, exist_ok=True)
save_path = os.path.join(save_dir, "clip_test_with_alignment.pt")

# The keys 'test_input' and 'test_ids' are read back by the evaluation cell.
torch.save({
    "test_input": test_input,
    "test_ids": test_ids,
    # "test_targets": test_targets
}, save_path)

print(f"\n‚úÖ Saved to: {save_path}")


‚úÖ Train: torch.Size([75000, 1046])

‚úÖ Saved to: /content/drive/MyDrive/amazon_ml_challenge/combined_CLIP_final/clip_test_with_alignment.pt


## Test-set evaluation and submission file creation

In [15]:
# =========================================================================
# Test Set Evaluation with Ensemble Models
# =========================================================================
import torch
import torch.nn as nn
import numpy as np
import pandas as pd
from torch.utils.data import DataLoader, TensorDataset
from tqdm import tqdm
import os

device = 'cuda' if torch.cuda.is_available() else 'cpu'

# =========================================================================
# Model Definition (Same as training)
# =========================================================================
class RegressionMLP(nn.Module):
    """
    Feed-forward regressor: three hidden stages followed by a single output.

    Each hidden stage is Linear -> ReLU -> BatchNorm1d -> Dropout, with widths
    1024, 512 and 256 and dropout rates 0.3, 0.3 and 0.2. The layers live in
    ``self.model`` (an ``nn.Sequential``) in exactly that order, so saved
    state-dict keys such as ``model.0.weight`` keep their positions.
    """

    def __init__(self, input_dim):
        super().__init__()
        # (in_features, out_features, dropout probability) for each hidden stage
        stages = ((input_dim, 1024, 0.3), (1024, 512, 0.3), (512, 256, 0.2))

        layers = []
        for n_in, n_out, drop_p in stages:
            layers.extend((
                nn.Linear(n_in, n_out),
                nn.ReLU(),
                nn.BatchNorm1d(n_out),
                nn.Dropout(drop_p),
            ))
        layers.append(nn.Linear(256, 1))

        self.model = nn.Sequential(*layers)

    def forward(self, x):
        """Return the network output for a batch ``x``; shape (batch, 1)."""
        return self.model(x)

# =========================================================================
# Load Ensemble Models
# =========================================================================
print("\n" + "="*70)
print("üìÇ Loading Ensemble Models")
print("="*70)

save_dir = '/content/drive/MyDrive/amazon_ml_challenge/ensemble_CLIP_models'
metadata_path = os.path.join(save_dir, 'ensemble_metadata.pt')

# Load metadata (keys read here: 'input_dim', 'mae', 'rmse').
# NOTE: weights_only=False lets torch.load unpickle arbitrary objects - only
# use it on checkpoint files you trust.
metadata = torch.load(metadata_path, weights_only=False)
input_dim = metadata['input_dim']
print(f"Input dimension: {input_dim}")
print(f"Training MAE: {metadata['mae']:.4f}")
print(f"Training RMSE: {metadata['rmse']:.4f}")

# Load all 10 models (the ensemble size is hard-coded here and in the messages)
models = []
for i in range(10):
    model_path = os.path.join(save_dir, f'clip_ensemble_model_{i}.pt')
    model = RegressionMLP(input_dim).to(device)
    model.load_state_dict(torch.load(model_path, map_location=device, weights_only=False))
    model.eval()  # inference mode: dropout off, batch norm uses running statistics
    models.append(model)
    print(f"‚úÖ Loaded model {i+1}/10")

print(f"\n‚úÖ All models loaded successfully!")

# =========================================================================
# Load Test Data
# =========================================================================
print("\n" + "="*70)
print("üìä Loading Test Data")
print("="*70)

# Load the CLIP-processed test data written by the save cell above
test_data_path = '/content/drive/MyDrive/amazon_ml_challenge/combined_CLIP_final/clip_test_with_alignment.pt'
test_data = torch.load(test_data_path)

# The whole feature matrix is moved to `device` up front.
test_input = test_data['test_input'].to(device)
test_ids = test_data['test_ids']  # sample ids saved alongside the features

print(f"Test data shape: {test_input.shape}")
print(f"Number of test samples: {len(test_ids)}")

# =========================================================================
# Ensemble Prediction
# =========================================================================
print("\n" + "="*70)
print("üîÆ Making Ensemble Predictions")
print("="*70)

# Batch prediction for efficiency; shuffle=False keeps predictions in the
# same row order as test_ids.
batch_size = 512
test_loader = DataLoader(
    TensorDataset(test_input),
    batch_size=batch_size,
    shuffle=False
)

all_predictions = []

# Get predictions from each model
for model_idx, model in enumerate(models):
    print(f"Predicting with model {model_idx+1}/10...")
    model_preds = []

    with torch.no_grad():
        for (batch_x,) in tqdm(test_loader, desc=f"Model {model_idx+1}"):
            batch_x = batch_x.to(device)
            pred = model(batch_x)
            model_preds.append(pred.cpu())

    # One tensor per model, rows in test order.
    model_preds = torch.cat(model_preds, dim=0)
    all_predictions.append(model_preds)

# Average predictions across all models (stack on a new leading model axis,
# mean over it, then drop the size-1 dimensions).
print("\nüìä Averaging predictions from all models...")
ensemble_predictions = torch.stack(all_predictions).mean(dim=0).squeeze()

# Clamp predictions in log-space (same as validation): [0, log1p(max_price)]
max_price = 4000
max_log_value = np.log1p(max_price)
ensemble_predictions_clipped = torch.clamp(ensemble_predictions, min=0, max=max_log_value)

# Convert from log space to original scale (expm1 is the inverse of log1p)
final_predictions = np.expm1(ensemble_predictions_clipped.numpy())

print(f"\n‚úÖ Predictions complete!")
print(f"Prediction range: [{final_predictions.min():.2f}, {final_predictions.max():.2f}]")
print(f"Prediction mean: {final_predictions.mean():.2f}")
print(f"Prediction median: {np.median(final_predictions):.2f}")


üìÇ Loading Ensemble Models
Input dimension: 1046
Training MAE: 12.1486
Training RMSE: 34.1616
‚úÖ Loaded model 1/10
‚úÖ Loaded model 2/10
‚úÖ Loaded model 3/10
‚úÖ Loaded model 4/10
‚úÖ Loaded model 5/10
‚úÖ Loaded model 6/10
‚úÖ Loaded model 7/10
‚úÖ Loaded model 8/10
‚úÖ Loaded model 9/10
‚úÖ Loaded model 10/10

‚úÖ All models loaded successfully!

üìä Loading Test Data
Test data shape: torch.Size([75000, 1046])
Number of test samples: 75000

üîÆ Making Ensemble Predictions
Predicting with model 1/10...


Model 1: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 147/147 [00:00<00:00, 242.03it/s]


Predicting with model 2/10...


Model 2: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 147/147 [00:00<00:00, 227.32it/s]


Predicting with model 3/10...


Model 3: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 147/147 [00:00<00:00, 335.28it/s]


Predicting with model 4/10...


Model 4: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 147/147 [00:00<00:00, 347.82it/s]


Predicting with model 5/10...


Model 5: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 147/147 [00:00<00:00, 339.06it/s]


Predicting with model 6/10...


Model 6: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 147/147 [00:00<00:00, 350.39it/s]


Predicting with model 7/10...


Model 7: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 147/147 [00:00<00:00, 338.89it/s]


Predicting with model 8/10...


Model 8: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 147/147 [00:00<00:00, 226.40it/s]


Predicting with model 9/10...


Model 9: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 147/147 [00:00<00:00, 343.69it/s]


Predicting with model 10/10...


Model 10: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 147/147 [00:00<00:00, 279.67it/s]



üìä Averaging predictions from all models...

‚úÖ Predictions complete!
Prediction range: [1.07, 4000.00]
Prediction mean: 19.99
Prediction median: 15.74


In [16]:

# =========================================================================
# Create Submission File
# =========================================================================
print("\n" + "="*70)
print("üíæ Creating Submission File")
print("="*70)

# NOTE(review): this file uses the column names 'id' / 'entity_value', while a
# later cell rebuilds the frame with 'sample_id' / 'price' - confirm which
# names the submission format requires.
submission_df = pd.DataFrame({
    'id': test_ids,
    'entity_value': final_predictions
})

# Save submission
submission_path = '/content/drive/MyDrive/amazon_ml_challenge/test_predictions_ensemble.csv'
submission_df.to_csv(submission_path, index=False)

print(f"‚úÖ Submission saved to: {submission_path}")
print(f"\nFirst 10 predictions:")
print(submission_df.head(10))

# =========================================================================
# Additional Analysis
# =========================================================================
print("\n" + "="*70)
print("üìà Prediction Statistics")
print("="*70)

# Summary of the final (price-scale) ensemble predictions.
print(f"\nDescriptive Statistics:")
print(f"  Count: {len(final_predictions)}")
print(f"  Mean: {final_predictions.mean():.4f}")
print(f"  Std: {final_predictions.std():.4f}")
print(f"  Min: {final_predictions.min():.4f}")
print(f"  25%: {np.percentile(final_predictions, 25):.4f}")
print(f"  50%: {np.median(final_predictions):.4f}")
print(f"  75%: {np.percentile(final_predictions, 75):.4f}")
print(f"  Max: {final_predictions.max():.4f}")

# Check prediction variance across models. The std is taken over the raw model
# outputs (before clamping and expm1), so it is in model-output units, not prices.
print(f"\nüîç Ensemble Variance Analysis:")
pred_std = torch.stack(all_predictions).std(dim=0).squeeze().numpy()
print(f"  Mean std across models: {pred_std.mean():.4f}")
print(f"  Max std across models: {pred_std.max():.4f}")
print(f"  Min std across models: {pred_std.min():.4f}")

print("\n" + "="*70)
print("‚úÖ EVALUATION COMPLETE!")
print("="*70)

# =========================================================================
# Optional: Save individual model predictions for analysis
# =========================================================================
save_individual = False  # Set to True if you want to save individual predictions

if save_individual:
    print("\nüíæ Saving individual model predictions...")
    # Each model's output has a trailing size-1 dimension (the network ends in
    # a single output unit); pandas only accepts 1-D arrays as dict columns,
    # so that dimension is dropped first.
    # Unlike the ensemble mean, these per-model values are not clamped before expm1.
    individual_preds = {
        f'model_{i}': np.expm1(preds.squeeze(-1).numpy())
        for i, preds in enumerate(all_predictions)
    }

    individual_df = pd.DataFrame(individual_preds)
    individual_df['id'] = test_ids
    individual_df['ensemble_mean'] = final_predictions

    individual_path = '/content/drive/MyDrive/amazon_ml_challenge/individual_predictions.csv'
    individual_df.to_csv(individual_path, index=False)
    print(f"‚úÖ Individual predictions saved to: {individual_path}")


üíæ Creating Submission File
‚úÖ Submission saved to: /content/drive/MyDrive/amazon_ml_challenge/test_predictions_ensemble.csv

First 10 predictions:
       id  entity_value
0  100179     15.691438
1  245611     15.215573
2  146263     23.166376
3   95658     21.184490
4   36806     20.689602
5  148239      7.142827
6   92659      5.977146
7    3780     17.961704
8  196940     15.539577
9   20472      8.523639

üìà Prediction Statistics

Descriptive Statistics:
  Count: 75000
  Mean: 19.9899
  Std: 22.2912
  Min: 1.0699
  25%: 10.0801
  50%: 15.7399
  75%: 23.6990
  Max: 4000.0020

üîç Ensemble Variance Analysis:
  Mean std across models: 0.2175
  Max std across models: 114.8519
  Min std across models: 0.0220

‚úÖ EVALUATION COMPLETE!


In [19]:
# Rebuild the submission frame with the column names 'sample_id' and 'price'
# (the frame saved to Drive earlier used 'id' and 'entity_value').
submission_df = pd.DataFrame({
    'sample_id': test_ids,
    'price': final_predictions
})

In [20]:
submission_df

Unnamed: 0,sample_id,price
0,100179,15.691438
1,245611,15.215573
2,146263,23.166376
3,95658,21.184490
4,36806,20.689602
...,...,...
74995,93616,21.545609
74996,249434,12.976159
74997,162217,4.529245
74998,230487,17.538809


In [21]:
# Save the DataFrame as a CSV file to the local Colab environment
# (written without the index column).
submission_csv_path = "/content/submission_predictions.csv"
submission_df.to_csv(submission_csv_path, index=False)

print(f"‚úÖ DataFrame saved as CSV to: {submission_csv_path}")

‚úÖ DataFrame saved as CSV to: /content/submission_predictions.csv
