In [6]:
import os
import joblib
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import precision_score, accuracy_score
from catboost import CatBoostClassifier

# ==========================================
# 1. Updated Wrapper for Ensembles
# ==========================================
class LoneWolfPredictor:
    """
    Unified wrapper that ensembles two groups of fold models (A and B).

    Each group's positive-class probabilities are averaged across its models,
    the two averages are blended with fixed weights, and the blend is
    thresholded into 0/1 labels.

    Parameters
    ----------
    models_a, models_b : sequence
        Fitted models exposing ``predict_proba(X)`` whose column 1 is the
        positive-class probability.
    threshold : float
        Blended probabilities ``>= threshold`` are labelled 1, otherwise 0.
    weight_a, weight_b : float, optional
        Blend weights for group A and group B. Defaults (0.6 / 0.4) reproduce
        the blend that was previously hard-coded in ``predict``.
    """

    # Class-level defaults: an instance unpickled from a file written before
    # these attributes existed has no per-instance weights (unpickling restores
    # the saved attributes without running __init__), so it falls back to these.
    weight_a = 0.6
    weight_b = 0.4

    def __init__(self, models_a, models_b, threshold, weight_a=0.6, weight_b=0.4):
        self.models_a = models_a  # List of fold models for Model A
        self.models_b = models_b  # List of fold models for Model B
        self.threshold = threshold
        self.weight_a = weight_a
        self.weight_b = weight_b

    def _get_avg_proba(self, models, X):
        """Return the mean positive-class probability over ``models``.

        Raises
        ------
        ValueError
            If ``models`` is empty. Averaging would otherwise divide by zero
            and yield NaN, which the threshold comparison turns into all-zero
            predictions without any error.
        """
        if len(models) == 0:
            raise ValueError("Cannot average predictions of an empty model list.")
        preds = np.zeros(len(X))
        for model in models:
            preds += model.predict_proba(X)[:, 1]
        return preds / len(models)

    def predict_proba(self, X):
        """Return the blended positive-class probability for each row of ``X``."""
        # 1. Averaged probabilities for each group
        prob_a = self._get_avg_proba(self.models_a, X)
        prob_b = self._get_avg_proba(self.models_b, X)

        # 2. Weighted blend
        return self.weight_a * prob_a + self.weight_b * prob_b

    def predict(self, X):
        """Return 0/1 labels: 1 where the blended probability reaches the threshold."""
        return (self.predict_proba(X) >= self.threshold).astype(int)

# ==========================================
# 2. Setup Data
# ==========================================
TEAM_NAME = "LoneWolf"
os.makedirs(TEAM_NAME, exist_ok=True)  # output folder for the joblib and the submission

train_df = pd.read_csv("/kaggle/input/neural-net-nexus-2-0/train.csv")
test_df = pd.read_csv("/kaggle/input/neural-net-nexus-2-0/test.csv")
y = train_df["Revenue"]

# Feature Prep: stack train (without the target) on top of test (without ID)
# so both share one categorical treatment.
train_part = train_df.drop(columns=["Revenue"])
test_part = test_df.drop(columns=["ID"])
df = pd.concat([train_part, test_part], axis=0, ignore_index=True)

# Object-typed columns are treated as categoricals; missing values become the
# literal category "Missing" and everything is cast to str.
cat_features = df.select_dtypes(include="object").columns.tolist()
df = df.assign(**{name: df[name].fillna("Missing").astype(str) for name in cat_features})

# Split the stacked frame back into its train / test rows.
n_train = len(train_df)
X, X_test = df.iloc[:n_train], df.iloc[n_train:]

folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# ==========================================
# 3. Train & Collect Ensembles
# ==========================================
def _train_cv_ensemble(params, X_train, y_train, X_holdout, splitter):
    """Fit one CatBoost model per fold and collect its predictions.

    Parameters
    ----------
    params : dict
        Keyword arguments passed to ``CatBoostClassifier``.
    X_train, y_train : pandas objects
        Training features and labels; rows are selected per fold with ``.iloc``.
    X_holdout : pandas.DataFrame
        Rows scored by every fold model (the test set).
    splitter : cross-validator
        Object providing ``split(X, y)`` and ``n_splits``.

    Returns
    -------
    (models, oof, holdout_avg)
        ``models`` is the list of fitted fold models in fold order, ``oof``
        holds each training row's positive-class probability from the model
        whose validation fold contained it, and ``holdout_avg`` is the
        fold-averaged positive-class probability on ``X_holdout``.
    """
    fitted_models = []
    oof = np.zeros(len(X_train))
    holdout_avg = np.zeros(len(X_holdout))

    for tr, val in splitter.split(X_train, y_train):
        model = CatBoostClassifier(**params)
        # The validation fold doubles as the early-stopping eval set.
        model.fit(
            X_train.iloc[tr], y_train.iloc[tr],
            eval_set=(X_train.iloc[val], y_train.iloc[val]),
            use_best_model=True,
        )

        # Save for Wrapper
        fitted_models.append(model)

        # Save for Inline Calc
        oof[val] = model.predict_proba(X_train.iloc[val])[:, 1]
        holdout_avg += model.predict_proba(X_holdout)[:, 1] / splitter.n_splits

    return fitted_models, oof, holdout_avg


# Settings shared by both ensembles.
_COMMON_PARAMS = dict(
    loss_function="Logloss", eval_metric="AUC",
    cat_features=cat_features,
    early_stopping_rounds=100, verbose=0,
)

# Model A: shallower trees, heavier positive-class weight.
PARAMS_A = dict(
    _COMMON_PARAMS,
    iterations=1000, depth=6, learning_rate=0.03,  # Reduced iters for speed in demo
    class_weights={0: 1.0, 1: 3.0}, random_seed=42,
)

# Model B: deeper trees, lighter positive-class weight, different seed.
PARAMS_B = dict(
    _COMMON_PARAMS,
    iterations=800, depth=7, learning_rate=0.04,
    class_weights={0: 1.0, 1: 1.8}, random_seed=24,
)

# The model lists feed the joblib wrapper; the OOF / test arrays feed the
# threshold search and the inline verification.
print(">>> Training Model A (Ensemble)...")
models_a_list, oof_A, test_A_inline = _train_cv_ensemble(PARAMS_A, X, y, X_test, folds)

print(">>> Training Model B (Ensemble)...")
models_b_list, oof_B, test_B_inline = _train_cv_ensemble(PARAMS_B, X, y, X_test, folds)

# ==========================================
# 4. Optimization & Inline Prediction
# ==========================================
# Blend the out-of-fold probabilities with the same 60/40 weights used at inference.
oof_blend = 0.6 * oof_A + 0.4 * oof_B
best_t, best_score = 0.5, 0

# Grid-search the decision threshold on the OOF blend, scoring each candidate
# by the mean of precision and accuracy.
for t in np.arange(0.3, 0.7, 0.01):
    oof_labels = (oof_blend >= t).astype(int)
    score = 0.5 * precision_score(y, oof_labels) + \
            0.5 * accuracy_score(y, oof_labels)
    if score > best_score:
        # Track the running best score as well as its threshold. Without this
        # update every threshold with score > 0 overwrote best_t, so the search
        # always returned the last grid value.
        best_t, best_score = t, score

print(f"Optimal Threshold: {best_t:.4f}")

# INLINE RESULT (Manual Way)
test_blend_inline = 0.6 * test_A_inline + 0.4 * test_B_inline
final_preds_inline = (test_blend_inline >= best_t).astype(int)

# ==========================================
# 5. Create & Verify Joblib Wrapper
# ==========================================
# A. Bundle both fold ensembles and the tuned threshold into one object
unified_model = LoneWolfPredictor(models_a_list, models_b_list, best_t)

# B. Round-trip through disk so the predictions below come from the saved artefact
ensemble_path = f"{TEAM_NAME}/full_ensemble.joblib"
joblib.dump(unified_model, ensemble_path)
loaded_model = joblib.load(ensemble_path)

# C. Score the test rows with the reloaded wrapper
final_preds_wrapper = loaded_model.predict(X_test)

# ==========================================
# 6. Final Verification
# ==========================================
# Element-wise agreement between the manual blend and the reloaded wrapper.
comparison = final_preds_inline == final_preds_wrapper
match_count = comparison.sum()
total_count = len(comparison)

rule = "-" * 30
print(rule)
print("Inline vs Wrapper Comparison:")
print(f"Matching Predictions: {match_count} / {total_count}")
print(f"Match Percentage:     {100 * match_count / total_count:.2f}%")
print(rule)

# Any mismatch at all is reported as a warning.
all_match = match_count == total_count
print(
    "✅ SUCCESS: The Joblib produces EXACTLY the same output."
    if all_match
    else "❌ WARNING: Outputs differ. Check floating point precision or list order."
)

>>> Training Model A (Ensemble)...
>>> Training Model B (Ensemble)...
Optimal Threshold: 0.6900
------------------------------
Inline vs Wrapper Comparison:
Matching Predictions: 2470 / 2470
Match Percentage:     100.00%
------------------------------
✅ SUCCESS: The Joblib produces EXACTLY the same output.


In [7]:
# ==========================================
# 7. Generate Submission CSV
# ==========================================
# Pair each test ID with the wrapper's prediction. The IDs are read straight
# from test_df (loaded in the setup cell) instead of a `test_ids` variable that
# no visible cell defines, so this cell works on a fresh kernel.
submission = pd.DataFrame({
    "ID": test_df["ID"].to_numpy(),
    "Revenue": final_preds_wrapper
})

submission_path = f"{TEAM_NAME}/submission_blend_safe.csv"
submission.to_csv(submission_path, index=False)

print(f"✅ Success! Submission saved to: {submission_path}")

✅ Success! Submission saved to: LoneWolf/submission_blend_safe.csv
