In [1]:
# Advanced Stacking Implementation
import numpy as np, pandas as pd
from pathlib import Path
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import OrdinalEncoder
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.ensemble import (RandomForestClassifier, ExtraTreesClassifier,
                              HistGradientBoostingClassifier, GradientBoostingClassifier)
from sklearn.linear_model import LogisticRegression
from sklearn.base import clone
from sklearn.ensemble import RandomForestClassifier as RF
import warnings; warnings.filterwarnings('ignore')

# Config
RANDOM_STATE = 42
N_FOLDS = 7
DATA_DIR = Path('.')
TRAIN_PATH = DATA_DIR / 'train.csv'
TEST_PATH = DATA_DIR / 'test.csv'
SUB_PATH = DATA_DIR / 'sample_submission.csv'
ENABLE_INTERACTIONS = True

train = pd.read_csv(TRAIN_PATH)
X_test = pd.read_csv(TEST_PATH)
submission_adv = pd.read_csv(SUB_PATH)

# Detect target
target_col = 'target' if 'target' in train.columns else train.columns[-1]
y = train[target_col].values
X = train.drop(columns=[target_col]).copy()

# 1. Type detection
cat_cols, num_cols, high_card_cols = [], [], []
for c in X.columns:
    if X[c].dtype == 'object' or X[c].dtype.name.startswith('category'):
        nun = X[c].nunique()
        if nun > 25: high_card_cols.append(c)
        cat_cols.append(c)
    else:
        nun = X[c].nunique()
        if 2 <= nun < 15: cat_cols.append(c)
        else: num_cols.append(c)
print(f"(ADV) Cat={len(cat_cols)} | High-card={len(high_card_cols)} | Num={len(num_cols)}")

# 2. Interactions
inter_cols = []
if ENABLE_INTERACTIONS and len(num_cols) >= 2:
    base_nums = num_cols[:5]
    pairs = list(zip(base_nums, base_nums[1:]))[:3]
    for a,b in pairs:
        colr = f"{a}_div_{b}"
        X[colr] = X[a] / (X[b].replace(0,np.nan).fillna(X[b].median()) + 1)
        X_test[colr] = X_test[a] / (X_test[b].replace(0,np.nan).fillna(X_test[b].median()) + 1)
        inter_cols.append(colr)
        cold = f"{a}_minus_{b}"
        X[cold] = X[a] - X[b]; X_test[cold] = X_test[a] - X_test[b]
        inter_cols.append(cold)
print('Interações adicionadas:', inter_cols)

# 3. Encoding
encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
X_enc = X.copy(); X_test_enc = X_test.copy()
ord_cols = [c for c in cat_cols if c not in high_card_cols]
if ord_cols:
    X_enc[ord_cols] = encoder.fit_transform(X_enc[ord_cols])
    X_test_enc[ord_cols] = encoder.transform(X_test_enc[ord_cols])

from typing import Tuple
from sklearn.model_selection import StratifiedKFold as SKF

def target_encode_oof(series: pd.Series, y_vec: np.ndarray, n_splits=7, smoothing=12, random_state=42):
    skf_te = SKF(n_splits=n_splits, shuffle=True, random_state=random_state)
    global_mean = y_vec.mean()
    oof_vals = pd.Series(index=series.index, dtype=float)
    for tr_idx, val_idx in skf_te.split(series, y_vec):
        tr_s = series.iloc[tr_idx]
        tr_y = y_vec[tr_idx]
        stats = pd.DataFrame({'cat': tr_s, 'y': tr_y}).groupby('cat')['y'].agg(['mean','count'])
        counts = stats['count']; means = stats['mean']
        smooth = (counts*means + smoothing*global_mean)/(counts+smoothing)
        mapping = smooth.to_dict()
        oof_vals.iloc[val_idx] = series.iloc[val_idx].map(mapping).fillna(global_mean)
    return oof_vals.values, global_mean

for c in high_card_cols:
    te_oof, gm = target_encode_oof(X[c], y, n_splits=7, smoothing=12, random_state=RANDOM_STATE)
    stats_full = pd.DataFrame({'cat': X[c], 'y': y}).groupby('cat')['y'].agg(['mean','count'])
    counts_f = stats_full['count']; means_f = stats_full['mean']
    smooth_full = (counts_f*means_f + 12*gm)/(counts_f+12)
    mapping_f = smooth_full.to_dict()
    X_enc[c+'_te'] = te_oof
    X_test_enc[c+'_te'] = X_test[c].map(mapping_f).fillna(gm)
    enc_local = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
    X_enc[c] = enc_local.fit_transform(X[[c]])
    X_test_enc[c] = enc_local.transform(X_test[[c]])

# 4. Low variance removal
low_var_cols = [c for c in X_enc.columns if X_enc[c].nunique() <= 1]
if low_var_cols:
    X_enc.drop(columns=low_var_cols, inplace=True)
    X_test_enc.drop(columns=[c for c in low_var_cols if c in X_test_enc.columns], inplace=True)
print('Low variance removidas:', low_var_cols)

# 5. Feature selection multi keep_pct
from sklearn.ensemble import ExtraTreesClassifier

def select_features(Xm, yv, keep_pct, rs):
    et = ExtraTreesClassifier(n_estimators=600, random_state=rs, n_jobs=-1)
    et.fit(Xm, yv)
    imp = pd.Series(et.feature_importances_, index=Xm.columns).sort_values(ascending=False)
    k = max(10, int(len(imp)*keep_pct))
    return imp.index[:k].tolist(), imp

keep_candidates = [0.85, 0.90, 0.95]
selections = []
skf_tmp = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)
for kp in keep_candidates:
    cols, imp = select_features(X_enc, y, kp, RANDOM_STATE)
    rf_tmp = RandomForestClassifier(n_estimators=400, random_state=RANDOM_STATE, n_jobs=-1)
    accs=[]
    for tri,vai in skf_tmp.split(X_enc[cols], y):
        rf_tmp.fit(X_enc[cols].iloc[tri], y[tri])
        pr = rf_tmp.predict(X_enc[cols].iloc[vai])
        accs.append(accuracy_score(y[vai], pr))
    selections.append((kp, np.mean(accs), cols))
selections.sort(key=lambda x: x[1], reverse=True)
best_keep, best_acc_sel, best_cols = selections[0]
print(f'Seleção multi keep -> {best_keep} acc={best_acc_sel:.4f}')
X_sel = X_enc[best_cols].copy(); X_test_sel = X_test_enc[best_cols].copy()

# 6. Imputação de NaNs e Infs
X_sel.replace([np.inf,-np.inf], np.nan, inplace=True)
X_test_sel.replace([np.inf,-np.inf], np.nan, inplace=True)
for c in X_sel.columns:
    if X_sel[c].isna().any():
        X_sel[c].fillna(X_sel[c].median(), inplace=True)
    if X_test_sel[c].isna().any():
        X_test_sel[c].fillna(X_sel[c].median(), inplace=True)

# 7. Base models
rf_deep = RandomForestClassifier(n_estimators=1200, max_depth=None, min_samples_leaf=1, max_features='sqrt', random_state=RANDOM_STATE, n_jobs=-1)
rf_shallow = RandomForestClassifier(n_estimators=800, max_depth=10, min_samples_leaf=2, max_features=0.5, random_state=RANDOM_STATE, n_jobs=-1)
et = ExtraTreesClassifier(n_estimators=1300, max_depth=None, min_samples_leaf=1, random_state=RANDOM_STATE, n_jobs=-1)
gb = GradientBoostingClassifier(n_estimators=250, learning_rate=0.05, max_depth=3, random_state=RANDOM_STATE)
hgb = HistGradientBoostingClassifier(learning_rate=0.06, max_leaf_nodes=31, random_state=RANDOM_STATE)
models = {'rf_deep': rf_deep, 'rf_shallow': rf_shallow, 'et': et, 'gb': gb, 'hgb': hgb}

# 8. OOF
skf = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=RANDOM_STATE)
probas = {n: np.zeros(len(X_sel)) for n in models}
probas_test_folds = {n: [] for n in models}
fold_accs=[]
for fold,(tr_idx,val_idx) in enumerate(skf.split(X_sel,y),1):
    X_tr, X_val = X_sel.iloc[tr_idx], X_sel.iloc[val_idx]
    y_tr, y_val = y[tr_idx], y[val_idx]
    for n,m in models.items():
        mm = clone(m)
        mm.fit(X_tr, y_tr)
        pv = mm.predict_proba(X_val)[:,1]
        probas[n][val_idx] = pv
        probas_test_folds[n].append(mm.predict_proba(X_test_sel)[:,1])
    avg_val = np.mean([probas[k][val_idx] for k in models], axis=0)
    fold_accs.append(accuracy_score(y_val, (avg_val>=0.5).astype(int)))
print('Fold mean acc (equal weights):', round(np.mean(fold_accs),4))

# 9. Pesos penalizando correlação
M = np.vstack([probas[n] for n in models]).T
corr_matrix = np.corrcoef(M, rowvar=False)
lam = 0.02

def score_weights(w):
    blend = M.dot(w)
    acc = accuracy_score(y, (blend>=0.5).astype(int))
    corr_penalty = 0.0
    for i in range(len(w)):
        for j in range(i+1,len(w)):
            corr_penalty += w[i]*w[j]*corr_matrix[i,j]
    return acc - lam*corr_penalty

rng = np.random.default_rng(RANDOM_STATE)
model_names = list(models.keys())
best_w = np.ones(len(model_names))/len(model_names)
best_score = score_weights(best_w)
for _ in range(600):
    w = rng.dirichlet([1.8]*len(model_names))
    sc = score_weights(w)
    if sc > best_score:
        best_score = sc; best_w = w
# local refine
for i in range(len(best_w)):
    for delta in [0.03,-0.03,0.06,-0.06]:
        w_try = best_w.copy(); w_try[i] = max(0.0, w_try[i]+delta)
        if w_try.sum()==0: continue
        w_try/=w_try.sum(); sc=score_weights(w_try)
        if sc>best_score: best_score=sc; best_w=w_try
print('Pesos finais:', dict(zip(model_names, np.round(best_w,4))), 'score=', round(best_score,4))

blend_oof = M.dot(best_w)
acc_blend_05 = accuracy_score(y, (blend_oof>=0.5).astype(int))
print('Blend acc thr=0.5:', round(acc_blend_05,4))

# 10. Stacking
stack_features = np.hstack([M, blend_oof.reshape(-1,1)])
meta_lr = LogisticRegression(max_iter=1000, random_state=RANDOM_STATE)
meta_rf = RF(n_estimators=400, max_depth=5, random_state=RANDOM_STATE, n_jobs=-1)
meta_lr.fit(stack_features, y)
meta_rf.fit(stack_features, y)
proba_lr = meta_lr.predict_proba(stack_features)[:,1]
proba_rf = meta_rf.predict_proba(stack_features)[:,1]
print('Stack LR thr=0.5 acc:', round(accuracy_score(y, (proba_lr>=0.5).astype(int)),4),
      '| Stack RF thr=0.5 acc:', round(accuracy_score(y, (proba_rf>=0.5).astype(int)),4))

# 11. Threshold optimization
def optimize_threshold(proba, y_true):
    coarse = np.linspace(0.3,0.7,81); best_t=0.5; best_a=-1
    for t in coarse:
        a = accuracy_score(y_true, (proba>=t).astype(int))
        if a>best_a: best_a=a; best_t=t
    fine = np.linspace(max(0,best_t-0.03), min(1,best_t+0.03), 121)
    for t in fine:
        a = accuracy_score(y_true, (proba>=t).astype(int))
        if a>best_a: best_a=a; best_t=t
    return best_t, best_a

thr_blend, acc_blend = optimize_threshold(blend_oof, y)
thr_lr, acc_lr = optimize_threshold(proba_lr, y)
thr_rf, acc_rf = optimize_threshold(proba_rf, y)
print(f'Blend opt thr={thr_blend:.4f} acc={acc_blend:.4f}')
print(f'Stack LR opt thr={thr_lr:.4f} acc={acc_lr:.4f}')
print(f'Stack RF opt thr={thr_rf:.4f} acc={acc_rf:.4f}')

options = [('blend', acc_blend, thr_blend, blend_oof), ('stack_lr', acc_lr, thr_lr, proba_lr), ('stack_rf', acc_rf, thr_rf, proba_rf)]
options.sort(key=lambda x: x[1], reverse=True)
choice, choice_acc, choice_thr, choice_proba = options[0]
print('Melhor opção:', choice, 'acc=', round(choice_acc,4), 'thr=', round(choice_thr,4))

# 12. Test predictions
test_model_probas = {n: np.mean(probas_test_folds[n], axis=0) for n in model_names}
blend_test = np.zeros(len(X_test_sel))
for w,name in zip(best_w, model_names):
    blend_test += w*test_model_probas[name]
stack_test_features = np.hstack([
    np.vstack([test_model_probas[n] for n in model_names]).T,
    blend_test.reshape(-1,1)
])
stack_lr_test = meta_lr.predict_proba(stack_test_features)[:,1]
stack_rf_test = meta_rf.predict_proba(stack_test_features)[:,1]
final_proba = blend_test if choice=='blend' else (stack_lr_test if choice=='stack_lr' else stack_rf_test)
final_label = (final_proba >= choice_thr).astype(int)
submission_adv[target_col] = final_label
out_file = f'submission_advanced_{choice}_thr{choice_thr:.4f}.csv'
submission_adv.to_csv(out_file, index=False)

# 13. Report
pred_choice = (choice_proba >= choice_thr).astype(int)
print({'final_acc': choice_acc,
       'precision': precision_score(y,pred_choice),
       'recall': recall_score(y,pred_choice),
       'f1': f1_score(y,pred_choice),
       'threshold': choice_thr,
       'mode': choice,
       'weights': dict(zip(model_names, np.round(best_w,4))),
       'features_used': X_sel.shape[1],
       'submission_file': out_file})

(ADV) Cat=24 | High-card=1 | Num=8
Interações adicionadas: ['id_div_age_first_funding_year', 'id_minus_age_first_funding_year', 'age_first_funding_year_div_age_last_funding_year', 'age_first_funding_year_minus_age_last_funding_year', 'age_last_funding_year_div_age_first_milestone_year', 'age_last_funding_year_minus_age_first_milestone_year']
Low variance removidas: []
Seleção multi keep -> 0.95 acc=0.7801
Fold mean acc (equal weights): 0.7786
Pesos finais: {'rf_deep': np.float64(0.0856), 'rf_shallow': np.float64(0.3404), 'et': np.float64(0.1623), 'gb': np.float64(0.4118), 'hgb': np.float64(0.0)} score= 0.7849
Blend acc thr=0.5: 0.791
Stack LR thr=0.5 acc: 0.7864 | Stack RF thr=0.5 acc: 0.8406
Blend opt thr=0.5000 acc=0.7910
Stack LR opt thr=0.5600 acc=0.7926
Stack RF opt thr=0.4380 acc=0.8483
Melhor opção: stack_rf acc= 0.8483 thr= 0.438
{'final_acc': 0.848297213622291, 'precision': 0.82, 'recall': 0.9808612440191388, 'f1': 0.8932461873638344, 'threshold': np.float64(0.4379999999999999