# Notebook Final Compacto
Este arquivo contém somente as etapas essenciais para: (1) Treinar o ensemble final (RandomForest + ExtraTrees + HistGradientBoosting, combinados por média ponderada das probabilidades; a LogisticRegression está apenas declarada como meta/blender opcional e não é usada no blend atual), (2) Gerar métricas de validação cruzada (accuracy como métrica principal) e (3) Produzir o arquivo de submissão.

Regras atendidas: Uso exclusivo de Numpy / Pandas / Scikit-Learn (+ matplotlib opcional).

Para detalhes de EDA, hipóteses e justificativas, ver notebook completo original (`teste.ipynb`).

In [1]:
# Imports essenciais
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.model_selection import StratifiedKFold, RandomizedSearchCV
from sklearn.preprocessing import OrdinalEncoder
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, HistGradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
import warnings; warnings.filterwarnings('ignore')

# File locations (relative to the working directory) and run-wide settings.
DATA_DIR = Path('.')
TRAIN_PATH = DATA_DIR.joinpath('train.csv')
TEST_PATH = DATA_DIR.joinpath('test.csv')
SUB_PATH = DATA_DIR.joinpath('sample_submission.csv')  # submission template read later
RANDOM_STATE, N_FOLDS = 42, 7  # seed shared by models/splitters; number of CV folds

In [2]:
# Load the train / test tables and the submission template.
train, test, submission = (pd.read_csv(path) for path in (TRAIN_PATH, TEST_PATH, SUB_PATH))

# Use the 'target' column when it exists; otherwise fall back to the last
# column of the training table (adjust if needed).
if 'target' in train.columns:
    target_col = 'target'
else:
    target_col = train.columns[-1]

# Split features from labels; work on copies so the raw frames stay untouched.
X = train.drop(columns=[target_col]).copy()
y = train[target_col].values
X_test = test.copy()
print('Shape train:', X.shape, 'Shape test:', X_test.shape)

Shape train: (646, 32) Shape test: (277, 32)


In [3]:
# Quick type heuristic: a column is treated as categorical when it has object
# dtype or fewer than 20 distinct values; every other column is numeric.
is_cat = {c: (X[c].dtype == 'object' or X[c].nunique() < 20) for c in X.columns}
cat_cols = [c for c, flag in is_cat.items() if flag]
num_cols = [c for c, flag in is_cat.items() if not flag]
print(f'Cols categóricas: {len(cat_cols)} | numéricas: {len(num_cols)}')

Cols categóricas: 24 | numéricas: 8


In [4]:
# Utility: out-of-fold (OOF) target encoding for high-cardinality columns
# (optional; apply only when needed).
def target_encode_oof(series, y, n_splits=5, smoothing=5, noise_std=0.0, random_state=42):
    """Return smoothed out-of-fold target-mean encodings for a categorical Series.

    For each stratified fold, category statistics are computed on the training
    part only and used to encode the held-out part, so no row is encoded with
    its own label.

    Parameters
    ----------
    series : pandas.Series
        Categorical values to encode.
    y : array-like
        Numeric target, positionally aligned with `series`.
    n_splits : int
        Number of stratified folds.
    smoothing : float
        Pseudo-count pulling each category mean towards the fold prior.
    noise_std : float
        Standard deviation of optional Gaussian noise added to the encodings
        (0 disables it).
    random_state : int, numpy.random.RandomState or None
        Seed for both the fold shuffling and the optional noise.

    Returns
    -------
    tuple (numpy.ndarray, float)
        The OOF encodings in the row order of `series`, and the mean of `y`
        over all rows.
    """
    # Positional indexing below requires a plain array (a pandas Series would
    # be indexed by label).
    y = np.asarray(y)
    global_mean = y.mean()
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    oof = pd.Series(index=series.index, dtype=float)
    for tr_idx, val_idx in skf.split(series, y):
        tr_target = y[tr_idx]
        # Prior from the training part only: using the mean of all of `y`
        # would leak held-out labels into the encoding.
        fold_prior = tr_target.mean()
        # Build the stats from raw values so a non-unique index cannot
        # duplicate rows through index alignment.
        stats = (
            pd.DataFrame({'cat': series.iloc[tr_idx].to_numpy(), 'y': tr_target})
            .groupby('cat')['y']
            .agg(['mean', 'count'])
        )
        counts = stats['count']
        smooth = (counts * stats['mean'] + smoothing * fold_prior) / (counts + smoothing)
        # Categories unseen in the training part (and missing values) get the prior.
        encoded = series.iloc[val_idx].map(smooth.to_dict()).fillna(fold_prior)
        oof.iloc[val_idx] = encoded.to_numpy()
    if noise_std > 0:
        # Dedicated generator so the noise honours `random_state` instead of
        # depending on the global NumPy RNG state.
        if isinstance(random_state, np.random.RandomState):
            rng = random_state
        else:
            rng = np.random.RandomState(random_state)
        oof += rng.normal(0, noise_std, size=len(oof))
    return oof.values, global_mean

In [5]:
# Simple encoding: every column flagged as categorical goes through an
# OrdinalEncoder fitted on the training data; categories unseen at transform
# time are encoded as -1.
# (High-cardinality columns could use target encoding instead; kept simple here.)
encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
X_enc, X_test_enc = X.copy(), X_test.copy()
if len(cat_cols) > 0:
    encoder.fit(X_enc[cat_cols])
    X_enc[cat_cols] = encoder.transform(X_enc[cat_cols])
    X_test_enc[cat_cols] = encoder.transform(X_test_enc[cat_cols])
print('Encoding categórico concluído.')

Encoding categórico concluído.


In [6]:
# (Optional) Drop columns with at most one distinct value in the encoded
# training data, from both the train and the test frames.
low_var_cols = X_enc.columns[X_enc.nunique() <= 1].tolist()
if len(low_var_cols) > 0:
    X_enc = X_enc.drop(columns=low_var_cols)
    present_in_test = [c for c in low_var_cols if c in X_test_enc.columns]
    X_test_enc = X_test_enc.drop(columns=present_in_test)
print('Cols baixa variância removidas:', low_var_cols)

Cols baixa variância removidas: []


In [7]:
# Simple importance-based selection: keep the top fraction of features ranked
# by an ExtraTrees model.
def select_features_via_importance(X_mat, y, keep_pct=0.9, random_state=42):
    """Rank features by ExtraTrees importance and keep the highest-ranked ones.

    Parameters
    ----------
    X_mat : pandas.DataFrame
        Feature matrix; its column labels identify the features.
    y : array-like
        Target used to fit the ranking model.
    keep_pct : float
        Fraction of features to keep (truncated to an integer, minimum of 5).
    random_state : int
        Seed for the ExtraTrees model.

    Returns
    -------
    tuple (list, pandas.Series)
        The selected feature labels, and all importances sorted descending.
    """
    ranker = ExtraTreesClassifier(n_estimators=400, random_state=random_state, n_jobs=-1)
    ranker.fit(X_mat, y)
    importances = pd.Series(ranker.feature_importances_, index=X_mat.columns)
    importances = importances.sort_values(ascending=False)
    # Keep at least 5 features; slicing simply returns everything when fewer exist.
    n_keep = max(5, int(len(importances) * keep_pct))
    return importances.head(n_keep).index.tolist(), importances

# NOTE(review): the ranking is fitted on the full training set, so any later
# cross-validation on X_sel has already seen every label during selection —
# CV scores may be optimistic.
selected_features, importances = select_features_via_importance(
    X_enc,
    y,
    keep_pct=0.9,
    random_state=RANDOM_STATE,
)
print('Features selecionadas:', len(selected_features))
# Restrict both matrices to the same selected columns, in importance order.
X_sel, X_test_sel = X_enc[selected_features], X_test_enc[selected_features]

Features selecionadas: 28


In [8]:
# Base models for the ensemble.
rf = RandomForestClassifier(
    n_estimators=800,
    max_depth=None,
    min_samples_split=2,
    random_state=RANDOM_STATE,
    n_jobs=-1,
)
et = ExtraTreesClassifier(
    n_estimators=1000,
    max_depth=None,
    random_state=RANDOM_STATE,
    n_jobs=-1,
)
hgb = HistGradientBoostingClassifier(
    max_depth=None,
    learning_rate=0.06,
    max_leaf_nodes=31,
    random_state=RANDOM_STATE,
)
# Placeholder meta-model. The previous `... if 'LogisticRegression' else None`
# tested a non-empty string literal, which is always true, so the conditional
# was dead code and has been removed.
log_meta = LogisticRegression(max_iter=1000, n_jobs=None)

In [9]:
# Stratified K-fold cross-validation with a simple weighted blend of the
# positive-class probabilities of RF / ET / HGB.
#
# The blend weights are chosen ONCE, on the complete set of out-of-fold (OOF)
# predictions, and the same weights are applied to the OOF and test
# predictions. Picking the best weights separately inside each fold, using that
# fold's own validation labels, would bias the reported CV metrics upwards and
# mix differently-weighted blends in the test predictions.
skf = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=RANDOM_STATE)
base_models = (rf, et, hgb)
# Candidate (rf, et, hgb) weight triples for the short discrete search.
weights_grid = [(0.33,0.33,0.34),(0.4,0.3,0.3),(0.5,0.25,0.25),(0.34,0.4,0.26),(0.37,0.33,0.30)]

# Per-model probabilities: one column per base model (rf, et, hgb).
oof_model_proba = np.zeros((len(X_sel), len(base_models)))
test_model_proba = []   # one (n_test, n_models) array per fold
fold_val_indices = []   # validation row positions of each fold
for tr_idx, val_idx in skf.split(X_sel, y):
    X_tr, X_val = X_sel.iloc[tr_idx], X_sel.iloc[val_idx]
    y_tr = y[tr_idx]
    fold_test_proba = np.zeros((len(X_test_sel), len(base_models)))
    for j, model in enumerate(base_models):
        model.fit(X_tr, y_tr)
        # Column 1 of predict_proba is taken as the positive-class probability.
        oof_model_proba[val_idx, j] = model.predict_proba(X_val)[:, 1]
        fold_test_proba[:, j] = model.predict_proba(X_test_sel)[:, 1]
    test_model_proba.append(fold_test_proba)
    fold_val_indices.append(val_idx)

# Single global weight choice: maximise accuracy over all OOF rows
# (first candidate wins ties).
best_w = None; best_acc = -1
for w in weights_grid:
    blend = oof_model_proba @ np.asarray(w)
    acc = accuracy_score(y, (blend >= 0.5).astype(int))
    if acc > best_acc:
        best_acc = acc; best_w = w

w_vec = np.asarray(best_w)
oof_preds = oof_model_proba @ w_vec
# Test predictions of each fold's models, blended with the same global weights.
test_preds_collect = [fold_proba @ w_vec for fold_proba in test_model_proba]

oof_labels = (oof_preds >= 0.5).astype(int)
fold_metrics = [accuracy_score(y[idx], oof_labels[idx]) for idx in fold_val_indices]
print('Melhor peso global encontrado:', best_w)
print('Acurácias por fold:', np.round(fold_metrics,4))
cv_accuracy = accuracy_score(y, oof_labels)
cv_precision = precision_score(y, oof_labels)
cv_recall = recall_score(y, oof_labels)
cv_f1 = f1_score(y, oof_labels)
print(f'CV Accuracy: {cv_accuracy:.4f} | Precision: {cv_precision:.4f} | Recall: {cv_recall:.4f} | F1: {cv_f1:.4f}')

Melhor peso global encontrado: (0.33, 0.33, 0.34)
Acurácias por fold: [0.8172 0.7742 0.7935 0.7826 0.7391 0.8152 0.8261]
CV Accuracy: 0.7926 | Precision: 0.7934 | Recall: 0.9187 | F1: 0.8514


In [10]:
# Quick threshold tuning: scan 21 evenly spaced thresholds in [0.3, 0.7] and
# keep the one with the highest OOF accuracy (the lowest threshold wins ties).
thr_grid = np.linspace(0.3, 0.7, 21)
thr_acc = np.array([accuracy_score(y, (oof_preds >= t).astype(int)) for t in thr_grid])
best_pos = int(np.argmax(thr_acc))  # argmax returns the first maximum
best_thr, best_thr_acc = thr_grid[best_pos], thr_acc[best_pos]
print(f'Melhor threshold: {best_thr:.3f} | Accuracy OOF: {best_thr_acc:.4f}')

Melhor threshold: 0.500 | Accuracy OOF: 0.7926


In [11]:
# Final prediction: average the already-blended test probabilities across
# folds, apply the tuned threshold and write the submission file.
final_proba = np.asarray(test_preds_collect).mean(axis=0)
final_label = (final_proba >= best_thr).astype(int)
submission[target_col] = final_label
# The chosen threshold is embedded in the output file name.
out_name = f'submission_compacto_thr{best_thr:.3f}.csv'
submission.to_csv(out_name, index=False)
print('Arquivo de submissão salvo em:', out_name)

Arquivo de submissão salvo em: submission_compacto_thr0.500.csv


## Notas finais
- Este notebook compacto não inclui EDA detalhada.
- Pesos do ensemble selecionados por busca discreta em uma grade curta de combinações, usando a acurácia nas predições de validação da CV; a melhor combinação encontrada é registrada.
- Ajuste simples de threshold para maximizar accuracy.
- Para explicações completas (hipóteses, gráficos, seleção de features), consultar notebook expandido original.