In [1]:
import pandas as pd
import numpy as np
import pickle
import os

print("Loading original CSV files...")
train_df = pd.read_csv('../data/train.csv')
test_df = pd.read_csv('../data/test.csv')

# --- Load the saved image embeddings ---
print("Loading pre-processed image embeddings (this may take a minute)...")
with open('../data/train_image_embeddings_FULL.pkl', 'rb') as f:
    train_embeddings = pickle.load(f)
with open('../data/test_image_embeddings_FULL.pkl', 'rb') as f:
    test_embeddings = pickle.load(f)
print("✅ Embeddings loaded.")

# --- Convert embeddings to DataFrame format ---
IMAGE_FEATURE_COUNT = 2048 # ResNet50 gives 2048 features
img_cols = [f'img_{i}' for i in range(IMAGE_FEATURE_COUNT)]

train_img_df = pd.DataFrame.from_dict(train_embeddings, orient='index', columns=img_cols)
train_img_df.index.name = 'sample_id'

test_img_df = pd.DataFrame.from_dict(test_embeddings, orient='index', columns=img_cols)
test_img_df.index.name = 'sample_id'

# --- Merge image features with the main data ---
# We use a 'left' merge to keep all original rows
print("Merging image features into main dataframes...")
train_df_full = train_df.merge(train_img_df, on='sample_id', how='left')
test_df_full = test_df.merge(test_img_df, on='sample_id', how='left')

# Fill missing image features with 0 (for any images that failed to process)
train_df_full[img_cols] = train_df_full[img_cols].fillna(0)
test_df_full[img_cols] = test_df_full[img_cols].fillna(0)

print("✅ Data loading and merging complete.")
display(train_df_full.head())

Loading original CSV files...
Loading pre-processed image embeddings (this may take a minute)...
✅ Embeddings loaded.
Merging image features into main dataframes...
✅ Data loading and merging complete.


Unnamed: 0,sample_id,catalog_content,image_link,price,img_0,img_1,img_2,img_3,img_4,img_5,...,img_2038,img_2039,img_2040,img_2041,img_2042,img_2043,img_2044,img_2045,img_2046,img_2047
0,33127,"Item Name: La Victoria Green Taco Sauce Mild, ...",https://m.media-amazon.com/images/I/51mo8htwTH...,4.89,0.065593,0.011724,0.670622,0.025974,0.590448,0.741287,...,0.205975,0.018378,0.067109,0.0,0.014577,0.999065,0.064973,0.027026,0.710594,0.010265
1,198967,"Item Name: Salerno Cookies, The Original Butte...",https://m.media-amazon.com/images/I/71YtriIHAA...,13.12,0.193857,0.358459,0.001256,0.045958,0.005887,0.555398,...,0.14251,0.017008,0.120086,0.0,0.017559,0.0,0.493542,0.0,0.282566,0.187292
2,261251,"Item Name: Bear Creek Hearty Soup Bowl, Creamy...",https://m.media-amazon.com/images/I/51+PFEe-w-...,1.97,0.071409,0.745915,1.709077,0.008416,0.005543,0.154619,...,0.201331,0.496778,0.0,0.0,0.145555,0.918642,0.553077,0.122053,0.384103,0.470413
3,55858,Item Name: Judee’s Blue Cheese Powder 11.25 oz...,https://m.media-amazon.com/images/I/41mu0HAToD...,30.34,0.212073,0.238302,0.310819,0.335903,0.004677,0.026607,...,0.001492,0.0,0.134894,0.021352,0.019757,0.119571,0.995192,0.697332,0.142514,0.49212
4,292686,"Item Name: kedem Sherry Cooking Wine, 12.7 Oun...",https://m.media-amazon.com/images/I/41sA037+Qv...,66.49,0.071024,0.115265,0.602926,0.309518,0.022051,0.131149,...,0.0,0.0,0.003945,0.0,0.192766,0.937161,0.020276,0.0,0.095623,0.040726


In [2]:
import re

# --- Extract Item Pack Quantity (IPQ) ---
def extract_ipq(text):
    if not isinstance(text, str): return 1.0
    match = re.search(r'(\d+\.?\d*)\s*(?:oz|ounce|count|pk|pack|fl oz|ct)', text, re.IGNORECASE)
    if match: return float(match.group(1))
    return 1.0

print("Extracting IPQ features...")
train_df_full['ipq'] = train_df_full['catalog_content'].apply(extract_ipq)
test_df_full['ipq'] = test_df_full['catalog_content'].apply(extract_ipq)

# --- Extract Brand ---
def extract_brand(text):
    if not isinstance(text, str) or len(text) == 0: return "unknown"
    return text.split()[0].lower()

print("Extracting Brand features...")
train_df_full['brand'] = train_df_full['catalog_content'].apply(extract_brand)
test_df_full['brand'] = test_df_full['catalog_content'].apply(extract_brand)
train_df_full['brand'] = train_df_full['brand'].astype('category')
test_df_full['brand'] = test_df_full['brand'].astype('category')

print("✅ Feature engineering complete.")

Extracting IPQ features...


  train_df_full['ipq'] = train_df_full['catalog_content'].apply(extract_ipq)
  test_df_full['ipq'] = test_df_full['catalog_content'].apply(extract_ipq)


Extracting Brand features...


  train_df_full['brand'] = train_df_full['catalog_content'].apply(extract_brand)


✅ Feature engineering complete.


  test_df_full['brand'] = test_df_full['catalog_content'].apply(extract_brand)


In [3]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
import lightgbm as lgb

# --- 1. Define All Feature Types ---
text_feature = 'catalog_content'
numeric_features = ['ipq']
categorical_features = ['brand']
image_features = [f'img_{i}' for i in range(IMAGE_FEATURE_COUNT)]

# --- 2. Create the Final Preprocessor ---
# We use PCA to reduce the 2048 image features to 128
# This is faster and often prevents overfitting
preprocessor_v4 = ColumnTransformer(
    transformers=[
        ('text', TfidfVectorizer(stop_words='english', max_features=20000, ngram_range=(1,2)), text_feature),
        ('num', StandardScaler(), numeric_features),
        ('cat', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1), categorical_features),
        ('img', PCA(n_components=128, random_state=42), image_features)
    ],
    remainder='drop' # Drop any columns we didn't specify
)

# --- 3. Create the Final Model Pipeline ---
model_final = Pipeline(steps=[
    ('preprocessor', preprocessor_v4),
    ('lgbm', lgb.LGBMRegressor(random_state=42, n_estimators=500, learning_rate=0.05, num_leaves=40))
])

# --- 4. Define SMAPE (for validation) ---
def smape(y_true, y_pred):
    numerator = np.abs(y_pred - y_true)
    denominator = (np.abs(y_true) + np.abs(y_pred)) / 2
    denominator[denominator == 0] = 1e-6 
    return np.mean(numerator / denominator) * 100

# --- 5. Train and Evaluate on the FULL Dataset ---
print("Preparing full dataset for training...")
X = train_df_full
y_log = np.log1p(X['price'])

# Split data for validation
X_train, X_val, y_train_log, y_val_log = train_test_split(X, y_log, test_size=0.15, random_state=42)

print("🚀 Training the final model (Text + IPQ + Brand + Images)...")
model_final.fit(X_train, y_train_log)

print("📈 Evaluating final model...")
preds_log = model_final.predict(X_val)
preds = np.expm1(preds_log)
y_val_true = np.expm1(y_val_log)
preds[preds < 0] = 0

validation_smape = smape(y_val_true, preds)
print("\n" + "="*40)
print(f"✅ FINAL MODEL VALIDATION SMAPE: {validation_smape:.4f}%")
print("="*40)

Preparing full dataset for training...
🚀 Training the final model (Text + IPQ + Brand + Images)...
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 9.067526 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1173461
[LightGBM] [Info] Number of data points in the train set: 63750, number of used features: 19773
[LightGBM] [Info] Start training from score 2.740886
📈 Evaluating final model...





✅ FINAL MODEL VALIDATION SMAPE: 53.0882%


In [4]:
print("🚀 Training final model on ALL 75,000 samples...")
# Train the model on the full 75k sample set
model_final.fit(X, y_log)

print("📝 Generating final predictions on the test set...")
# Use the trained model to predict on the full test set
final_predictions = np.expm1(model_final.predict(test_df_full))
final_predictions[final_predictions < 0] = 0

# --- Create submission file ---
submission_final = pd.DataFrame({
    'sample_id': test_df_full['sample_id'],
    'price': final_predictions
})

output_path_final = '../outputs/submission_final_images.csv'
submission_final.to_csv(output_path_final, index=False)

print(f"\n✅ Final submission file created at: {output_path_final}")
display(submission_final.head())

🚀 Training final model on ALL 75,000 samples...
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 6.748103 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1305550
[LightGBM] [Info] Number of data points in the train set: 75000, number of used features: 19886
[LightGBM] [Info] Start training from score 2.739217
📝 Generating final predictions on the test set...





✅ Final submission file created at: ../outputs/submission_final_images.csv


Unnamed: 0,sample_id,price
0,100179,15.662521
1,245611,15.844833
2,146263,19.810901
3,95658,12.144444
4,36806,21.691201
