In [1]:
import os
print(os.getcwd())


/Users/manuhalapeth/Downloads/Shell_ML_Challenge_2025/notebooks


In [8]:
# ==================================
# Shell.ai 2025 - 01_preprocessing.py (Max Accuracy Version)
# ==================================

import pandas as pd
import numpy as np
import joblib
import os
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import VarianceThreshold
from sklearn.decomposition import PCA

# =============================
#  Load Raw Data
# =============================
data_path = "../data/raw"
train = pd.read_csv(os.path.join(data_path, "train.csv"))
test = pd.read_csv(os.path.join(data_path, "test.csv"))

# Add a flag to distinguish train/test
train["is_train"] = 1
test["is_train"] = 0

# Concatenate for unified preprocessing
full = pd.concat([train, test], axis=0).reset_index(drop=True)
print(f" Combined shape: {full.shape}")

# =============================
#  Split Column Groups
# =============================
ID_COL = "ID"
TARGET_COLS = [col for col in train.columns if col.startswith("BlendProperty")]

# Feature groups
blend_comp_cols = full.columns[0:5]      # Volume fraction columns
component_cols = full.columns[5:55]      # Component-specific features (50 columns)

# =============================
#  Impute Missing Values
# =============================
imp = SimpleImputer(strategy="mean")
full[component_cols] = imp.fit_transform(full[component_cols])

# =============================
#  Normalize Blend Composition (fractions sum to 1)
# =============================
full[blend_comp_cols] = full[blend_comp_cols].div(
    full[blend_comp_cols].sum(axis=1), axis=0
)

# =============================
#  Generate Interaction Features
# =============================
# Create multiplicative interaction features between blend fractions and component properties
for comp in blend_comp_cols:
    for prop in component_cols:
        full[f"{comp}_x_{prop}"] = full[comp] * full[prop]

# =============================
#  Feature Selection
# =============================
# Remove features with near-zero variance
selector = VarianceThreshold(threshold=1e-5)
full_numeric = full.drop(columns=[ID_COL, "is_train"] + TARGET_COLS)
full_selected = selector.fit_transform(full_numeric)
selected_cols = full_numeric.columns[selector.get_support()]

# =============================
#  Dimensionality Reduction (Optional but helpful)
# =============================
# Reduce dimensionality using PCA while preserving 99% of variance
pca = PCA(n_components=0.99, svd_solver='full')
X_pca = pca.fit_transform(full_selected)
print(f" PCA reduced to: {X_pca.shape[1]} dimensions")

# =============================
#  Final Feature Matrix
# =============================
X_final = pd.DataFrame(X_pca)
y_final = full[TARGET_COLS] if "BlendProperty1" in full.columns else None

# =============================
#  Split Back into Train/Test
# =============================
train_mask = full["is_train"] == 1
test_mask = ~train_mask

X_train = X_final[train_mask.values].reset_index(drop=True)
y_train = y_final[train_mask.values].reset_index(drop=True)
X_test = X_final[test_mask.values].reset_index(drop=True)

print(" Preprocessing complete:")
print("Train X:", X_train.shape)
print("Train y:", y_train.shape)
print("Test X :", X_test.shape)

# =============================
#  Save Processed Data
# =============================
os.makedirs("../data/processed", exist_ok=True)
joblib.dump((X_train, y_train), "../data/processed/train_processed.pkl")
joblib.dump(X_test, "../data/processed/test_processed.pkl")


ðŸ“Š Combined shape: (2500, 67)


  full[f"{comp}_x_{prop}"] = full[comp] * full[prop]
  full[f"{comp}_x_{prop}"] = full[comp] * full[prop]
  full[f"{comp}_x_{prop}"] = full[comp] * full[prop]
  full[f"{comp}_x_{prop}"] = full[comp] * full[prop]
  full[f"{comp}_x_{prop}"] = full[comp] * full[prop]
  full[f"{comp}_x_{prop}"] = full[comp] * full[prop]
  full[f"{comp}_x_{prop}"] = full[comp] * full[prop]
  full[f"{comp}_x_{prop}"] = full[comp] * full[prop]
  full[f"{comp}_x_{prop}"] = full[comp] * full[prop]
  full[f"{comp}_x_{prop}"] = full[comp] * full[prop]
  full[f"{comp}_x_{prop}"] = full[comp] * full[prop]
  full[f"{comp}_x_{prop}"] = full[comp] * full[prop]
  full[f"{comp}_x_{prop}"] = full[comp] * full[prop]
  full[f"{comp}_x_{prop}"] = full[comp] * full[prop]
  full[f"{comp}_x_{prop}"] = full[comp] * full[prop]
  full[f"{comp}_x_{prop}"] = full[comp] * full[prop]
  full[f"{comp}_x_{prop}"] = full[comp] * full[prop]
  full[f"{comp}_x_{prop}"] = full[comp] * full[prop]
  full[f"{comp}_x_{prop}"] = full[comp] * full

ðŸ”» PCA reduced to: 176 dimensions
âœ… Preprocessing complete:
Train X: (2000, 176)
Train y: (2000, 10)
Test X : (500, 176)


['../data/processed/test_processed.pkl']