In [None]:
# --------------------------------------------------
# 0. Pacotes
# --------------------------------------------------
from sklearn.datasets import load_digits
from sklearn.model_selection import (train_test_split,
                                     StratifiedKFold,
                                     cross_validate)
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import (accuracy_score,
                             f1_score,
                             make_scorer)
from sklearn.neural_network import MLPClassifier
import numpy as np
import pandas as pd

# --------------------------------------------------
# 1. Wrapper: MLP com escolha de inicialização
# --------------------------------------------------
class InitMLP(MLPClassifier):
    """MLPClassifier with a selectable weight-initialization strategy.

    Parameters
    ----------
    weight_init : {"glorot", "normal", "he_uniform"}, default="glorot"
        How the weight matrices are initialized:

        - ``"glorot"``: keep whatever the parent ``_initialize`` produced.
        - ``"normal"``: draw from N(0, 1 / sqrt(fan_in)).
        - ``"he_uniform"``: draw from U(-limit, limit) with
          ``limit = sqrt(6 / fan_in)``.
    **kwargs
        Forwarded unchanged to ``MLPClassifier.__init__``.

    Notes
    -----
    scikit-learn discovers an estimator's hyperparameters from the explicit
    keyword arguments of ``__init__`` and ignores ``**kwargs``.  Without the
    ``_get_param_names`` override below, ``get_params()`` would report only
    ``weight_init`` and ``clone()`` (used by ``cross_validate``,
    ``GridSearchCV``, ...) would rebuild the estimator with every other
    hyperparameter reset to its default.
    """

    # Accepted values for ``weight_init``.
    _WEIGHT_INITS = ("glorot", "normal", "he_uniform")

    def __init__(self, *,              # keyword-only arguments
                 weight_init="glorot",  # extra hyperparameter
                 **kwargs):             # everything else goes to the parent
        super().__init__(**kwargs)
        self.weight_init = weight_init

    @classmethod
    def _get_param_names(cls):
        """Return the parent's parameter names plus ``weight_init``.

        Keeps ``get_params``/``set_params``/``clone`` aware of the
        hyperparameters that reach the parent through ``**kwargs``.
        """
        return sorted(set(MLPClassifier._get_param_names()) | {"weight_init"})

    def _initialize(self, y, layer_units, dtype):
        """Run the parent initialization, then overwrite the weights.

        Only ``coefs_`` is replaced; the intercepts created by the parent
        are left untouched.

        Raises
        ------
        ValueError
            If ``weight_init`` is not one of the supported strategies.
        """
        # Validate at fit time (not in __init__), so set_params() can still
        # change the value afterwards.
        if self.weight_init not in self._WEIGHT_INITS:
            raise ValueError(
                f"weight_init must be one of {self._WEIGHT_INITS}, "
                f"got {self.weight_init!r}"
            )

        super()._initialize(y, layer_units, dtype)

        # 'glorot' was already produced by the parent method.
        if self.weight_init == "glorot":
            return

        rng = self._random_state
        for i, (fan_in, fan_out) in enumerate(zip(layer_units[:-1],
                                                  layer_units[1:])):
            shape = (fan_in, fan_out)
            if self.weight_init == "normal":
                scale = 1. / np.sqrt(fan_in)
                weights = rng.normal(0.0, scale, size=shape)
            else:  # "he_uniform"
                limit = np.sqrt(6. / fan_in)
                weights = rng.uniform(-limit, limit, size=shape)
            # Honour the dtype requested by the caller instead of always
            # storing float64 weights.
            self.coefs_[i] = weights.astype(dtype, copy=False)

# --------------------------------------------------
# 2. Data
# --------------------------------------------------
# Load the digits dataset as a (features, labels) pair.
X, y = load_digits(return_X_y=True)

# Stratified 80/20 hold-out split with a fixed seed.
_holdout = train_test_split(X, y,
                            test_size=0.20,
                            random_state=42,
                            stratify=y)
X_train, X_test, y_train, y_test = _holdout

# --------------------------------------------------
# 3. Configurations (architecture, L2, initialization)
# --------------------------------------------------

# Search space considered for the architectures:
#   - 1 to 4 hidden layers
#   - 16 to 256 units per layer
#   - alpha: 1, 1e-1, 1e-2, 1e-3, 1e-4, 1e-5
#   - initialization: 'glorot', 'normal', 'he_uniform'
#   - activation (optional): 'relu', 'logistic', 'tanh'
def _cfg(layers, alpha, weight_init, activation=None):
    """Build one configuration entry.

    The 'activation' key is only present when an activation is given, so
    entries that omit it carry exactly three keys.
    """
    entry = {"layers": layers, "alpha": alpha, "weight_init": weight_init}
    if activation is not None:
        entry["activation"] = activation
    return entry


# Template for further entries (n = units per layer):
#   "reluWide_l2-2": _cfg((n, n), 1e-2, "he_uniform", "relu")
configs = {
    # name                 layers             alpha  init          activation
    "glo64_l2-4":         _cfg((64,),             1e-4, "glorot"),
    "glo64_l2-3":         _cfg((64,),             1e-3, "glorot"),
    "norm64_l2-4":        _cfg((64,),             1e-4, "normal"),
    "heDeep_l2-4":        _cfg((128, 64),         1e-4, "he_uniform"),
    "heDeep_l2-3":        _cfg((128, 64),         1e-3, "he_uniform"),
    "norm128_l2-3":       _cfg((128,),            1e-3, "normal",     "relu"),
    "he64_l2-2":          _cfg((64,),             1e-2, "he_uniform", "logistic"),
    "glo256_128_64_l2-1": _cfg((256, 128, 64),    1e-1, "glorot",     "tanh"),
    "he256_16_l2-3":      _cfg((256, 16),         1e-3, "he_uniform", "relu"),
    "glo32_128_64_l2-5":  _cfg((32, 128, 64),     1e-5, "glorot",     "logistic"),
    "norm128_64_32_l2-1": _cfg((128, 64, 32),     1e-1, "normal",     "tanh"),
    "he128_64_l2-2":      _cfg((128, 64),         1e-2, "he_uniform", "relu"),
    "norm256_l2-4":       _cfg((256,),            1e-4, "normal",     "logistic"),
    "glo32x4_l2-0":       _cfg((32, 32, 32, 32),  1,    "glorot",     "relu"),
}

# --------------------------------------------------
# 4. Cross-validation on the training split
# --------------------------------------------------
# 5-fold stratified CV, scored with accuracy and macro-averaged F1.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scoring = {"acc": "accuracy",
           "f1":  make_scorer(f1_score, average="macro")}

rows = []
for name, p in configs.items():
    # Some configs omit "activation"; fall back to 'relu' for those so the
    # value declared in the config is actually applied to the model.
    activation = p.get("activation", "relu")

    clf = InitMLP(
        hidden_layer_sizes=p["layers"],
        activation=activation,
        alpha=p["alpha"],
        weight_init=p["weight_init"],
        max_iter=200,
        early_stopping=True,
        n_iter_no_change=5,
        learning_rate_init=1e-3,
        solver="adam",
        random_state=42,
    )

    # Scaling lives inside the pipeline so it is fitted on each training
    # fold only.
    pipe = Pipeline([("scale", StandardScaler()),
                     ("clf",   clf)])

    res = cross_validate(pipe, X_train, y_train,
                         cv=cv, scoring=scoring,
                         return_train_score=False)

    rows.append({
        "config":     name,
        "layers":     p["layers"],
        "alpha":      p["alpha"],
        "init":       p["weight_init"],
        "activation": activation,
        "f1_mean":    res["test_f1"].mean(),
        "f1_std":     res["test_f1"].std(),
        "acc_mean":   res["test_acc"].mean(),
        "acc_std":    res["test_acc"].std(),
    })

    print(f"{name:12s} | CV macro-F1 = "
          f"{res['test_f1'].mean():.4f} ± {res['test_f1'].std():.4f}")

# Rank configurations by mean macro-F1, best first.
summary = (pd.DataFrame(rows)
              .sort_values("f1_mean", ascending=False))

best_conf  = summary.iloc[0]
best_name  = best_conf["config"]
best_param = configs[best_name]
print("\n>> Selecionado:", best_name, dict(best_param))

# --------------------------------------------------
# 5. Refit on the full training split + final test
# --------------------------------------------------
# Rebuild the selected configuration with the same fixed settings used
# during cross-validation, including its activation ('relu' when the
# config does not specify one).
best_clf = InitMLP(
    hidden_layer_sizes=best_param["layers"],
    activation=best_param.get("activation", "relu"),
    alpha=best_param["alpha"],
    weight_init=best_param["weight_init"],
    max_iter=200,
    early_stopping=True,
    n_iter_no_change=5,
    learning_rate_init=1e-3,
    solver="adam",
    random_state=42,
)

best_pipe = Pipeline([("scale", StandardScaler()),
                      ("clf",   best_clf)])
best_pipe.fit(X_train, y_train)

# Evaluate once on the held-out test split.
y_pred   = best_pipe.predict(X_test)
test_acc = accuracy_score(y_test, y_pred)
test_f1  = f1_score(y_test, y_pred, average="macro")

print(f"\n>> TESTE | acc = {test_acc:.5f} | macro-F1 = {test_f1:.5f}")

# --------------------------------------------------
# 6. Summary table (for the report)
# --------------------------------------------------
print("\nResumo completo:")
# NOTE: `display` is provided by the notebook environment.
display(summary[["config", "layers", "alpha", "init",
                 "acc_mean", "acc_std", "f1_mean", "f1_std"]])


glo64_l2-4   | CV macro-F1 = 0.9749 ± 0.0087
glo64_l2-3   | CV macro-F1 = 0.9756 ± 0.0058
norm64_l2-4  | CV macro-F1 = 0.9714 ± 0.0068
heDeep_l2-4  | CV macro-F1 = 0.9722 ± 0.0100
heDeep_l2-3  | CV macro-F1 = 0.9736 ± 0.0095
norm128_l2-3 | CV macro-F1 = 0.9771 ± 0.0084
he64_l2-2    | CV macro-F1 = 0.9728 ± 0.0098
glo256_128_64_l2-1 | CV macro-F1 = 0.9736 ± 0.0117
he256_16_l2-3 | CV macro-F1 = 0.9777 ± 0.0053
glo32_128_64_l2-5 | CV macro-F1 = 0.9784 ± 0.0067
norm128_64_32_l2-1 | CV macro-F1 = 0.9791 ± 0.0049
he128_64_l2-2 | CV macro-F1 = 0.9791 ± 0.0072
norm256_l2-4 | CV macro-F1 = 0.9798 ± 0.0086
glo32x4_l2-0 | CV macro-F1 = 0.9771 ± 0.0074

>> Selecionado: norm256_l2-4 {'layers': (256,), 'alpha': 0.0001, 'weight_init': 'normal', 'activation': 'logistic'}

>> TESTE | acc = 0.95556 | macro-F1 = 0.95473

Resumo completo:


Unnamed: 0,config,layers,alpha,init,acc_mean,acc_std,f1_mean,f1_std
12,norm256_l2-4,"(256,)",0.0001,normal,0.979815,0.008648,0.979797,0.008625
11,he128_64_l2-2,"(128, 64)",0.01,he_uniform,0.979123,0.0073,0.979127,0.007226
10,norm128_64_32_l2-1,"(128, 64, 32)",0.1,normal,0.979123,0.004924,0.979077,0.00493
9,glo32_128_64_l2-5,"(32, 128, 64)",1e-05,glorot,0.978426,0.006759,0.978419,0.006735
8,he256_16_l2-3,"(256, 16)",0.001,he_uniform,0.977734,0.005198,0.977652,0.005265
13,glo32x4_l2-0,"(32, 32, 32, 32)",1.0,glorot,0.977037,0.007497,0.977095,0.007447
5,norm128_l2-3,"(128,)",0.001,normal,0.97704,0.008394,0.977091,0.008362
1,glo64_l2-3,"(64,)",0.001,glorot,0.975644,0.005831,0.975572,0.005797
0,glo64_l2-4,"(64,)",0.0001,glorot,0.97494,0.008666,0.974929,0.008686
7,glo256_128_64_l2-1,"(256, 128, 64)",0.1,glorot,0.973553,0.01179,0.973609,0.011749
