In [None]:
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split

# Load the Breast Cancer dataset (binary target: malignant / benign).
data = load_breast_cancer(as_frame=True)

# Feature frame (30 columns) and target vector.
X, y = data.data, data.target

# Stratified 70/30 train/test split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.3,
    random_state=0,
)

print("Tamanho treino:", X_train.shape)
print("Tamanho teste:", X_test.shape)

Tamanho treino: (398, 30)
Tamanho teste: (171, 30)


In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Baseline pipeline (A): standardize the features, then 5-NN with uniform
# weights and Euclidean distance.
# NOTE(review): `clf` is not referenced by any later cell shown here; the same
# configuration is rebuilt as `pipe_A` further down.
clf = Pipeline(steps=[
    ("scaler", StandardScaler()),
    ("knn", KNeighborsClassifier(n_neighbors=5, weights="uniform", metric="euclidean")),
])


In [None]:
# k-NN no Breast Cancer com scikit-learn: baseline + tuning + avaliação
import numpy as np
import matplotlib.pyplot as plt
import joblib
import os

from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    ConfusionMatrixDisplay, roc_curve, auc
)

# Output directory for saved artifacts (figures, model, table).
OUTDIR = "outputs"
os.makedirs(OUTDIR, exist_ok=True)

# Data: 30-feature frame and target vector.
data = load_breast_cancer(as_frame=True)
X = data.data
y = data.target
# Display labels; indexed below by the integer class a model predicts.
class_names = ["malignant", "benign"]

# Stratified 70/30 train/test split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=0.3, random_state=42
)

# Configuration A: baseline (k=5, uniform weights, Euclidean distance).
pipe_A = Pipeline(
    steps=[
        ("scaler", StandardScaler()),
        ("knn", KNeighborsClassifier(n_neighbors=5, weights="uniform", metric="euclidean")),
    ]
)
pipe_A.fit(X_train, y_train)

# Configuration B: grid search over k, weighting and metric, scored by
# accuracy with shuffled 5-fold stratified CV on the training split.
pipe = Pipeline(steps=[("scaler", StandardScaler()), ("knn", KNeighborsClassifier())])
param_grid = {
    "knn__n_neighbors": [1, 3, 5, 7, 9, 11, 15, 21],
    "knn__weights": ["uniform", "distance"],
    "knn__metric": ["euclidean", "manhattan"],
}
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
grid = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    scoring="accuracy",
    cv=cv,
    n_jobs=-1,
)
grid.fit(X_train, y_train)
best_params = grid.best_params_
best_model = grid.best_estimator_
print("Melhores hiperparâmetros (B):", best_params)

# Configuration C: hand-picked variant (k=7, distance weights, Manhattan).
pipe_C = Pipeline(
    steps=[
        ("scaler", StandardScaler()),
        ("knn", KNeighborsClassifier(n_neighbors=7, weights="distance", metric="manhattan")),
    ]
)
pipe_C.fit(X_train, y_train)

#  Avaliação função
def evaluate_model(name, model):
    """Score ``model`` on the notebook's test split and save its confusion matrix.

    Args:
        name: Label used in the plot title and in the saved file name
            (``cm_<name>.png`` inside ``OUTDIR``).
        model: Fitted estimator exposing ``predict`` and, optionally,
            ``predict_proba``.

    Returns:
        dict with keys ``"accuracy"``, ``"precision"``, ``"recall"``, ``"f1"``
        and ``"y_proba"``. ``"y_proba"`` is column 1 of ``predict_proba`` on
        ``X_test``, or ``None`` when the model has no ``predict_proba``.

    Note:
        Reads the module-level ``X_test``, ``y_test``, ``class_names`` and
        ``OUTDIR``. Precision, recall and F1 are computed with the metric
        functions' default arguments.
    """
    predictions = model.predict(X_test)

    if hasattr(model, "predict_proba"):
        y_proba = model.predict_proba(X_test)[:, 1]
    else:
        y_proba = None

    metrics = {
        "accuracy": accuracy_score(y_test, predictions),
        "precision": precision_score(y_test, predictions),
        "recall": recall_score(y_test, predictions),
        "f1": f1_score(y_test, predictions),
        "y_proba": y_proba,
    }

    # Draw the confusion matrix, write it to disk, then release the figure.
    cm_display = ConfusionMatrixDisplay.from_predictions(
        y_test,
        predictions,
        display_labels=class_names,
        cmap="Blues",
        values_format="d",
    )
    cm_display.ax_.set_title(f"Matriz de confusão — {name}")
    figure = cm_display.figure_
    figure.tight_layout()
    figure.savefig(os.path.join(OUTDIR, f"cm_{name}.png"))
    plt.close(figure)

    return metrics

# Evaluate the three configurations on the test split.
res_A = evaluate_model("A_baseline", pipe_A)
res_B = evaluate_model("B_bestgrid", best_model)
res_C = evaluate_model("C_variant", pipe_C)

# ROC curves of all configurations on a single figure.
plt.figure(figsize=(6, 5))
for label, res in (("A", res_A), ("B", res_B), ("C", res_C)):
    if res["y_proba"] is None:
        continue  # no probability scores available: nothing to draw
    fpr, tpr, _ = roc_curve(y_test, res["y_proba"])
    roc_auc = auc(fpr, tpr)
    plt.plot(fpr, tpr, label=f"{label} (AUC={roc_auc:.3f})")
plt.plot([0, 1], [0, 1], "--", color="gray")  # diagonal reference line
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC Curve — kNN Breast Cancer")
plt.legend()
plt.tight_layout()
plt.savefig(os.path.join(OUTDIR, "roc_knn.png"))
plt.close()

# Persist the grid-search winner (B).
joblib.dump(best_model, os.path.join(OUTDIR, "knn_pipeline_best.joblib"))
print(f"Modelo B salvo em {OUTDIR}/knn_pipeline_best.joblib")

# Example prediction for a "new" sample: the first row of the test split.
# iloc[[0]] (list indexer) returns a one-row DataFrame, so the sample keeps the
# column names the pipeline was fitted with. Converting the row to a bare
# NumPy array drops them, which makes scikit-learn (>= 1.0) warn that X has no
# valid feature names.
nova_amostra = X_test.iloc[[0]]
pred = best_model.predict(nova_amostra)[0]
proba = best_model.predict_proba(nova_amostra)[0]
print(f"Nova amostra → classe: {class_names[pred]} | probabilidades: {proba.round(3)}")

Melhores hiperparâmetros (B): {'knn__metric': 'manhattan', 'knn__n_neighbors': 7, 'knn__weights': 'uniform'}
Modelo B salvo em outputs/knn_pipeline_best.joblib
Nova amostra → classe: malignant | probabilidades: [1. 0.]




In [None]:

# 7️ Gerar tabela A/B/C e exportar CSV
import pandas as pd

def _summary_row(config, model, metrics):
    """Build one row of the A/B/C table.

    The k-NN hyperparameters are read from the pipeline's ``"knn"`` step, so
    the table always reflects the model that was actually evaluated instead of
    a hand-copied value.

    Args:
        config: Text shown in the ``Config`` column.
        model: Fitted pipeline that has a ``"knn"`` step.
        metrics: dict returned by ``evaluate_model`` for that pipeline.

    Returns:
        dict with the hyperparameter and metric columns of the table.
    """
    knn = model.named_steps["knn"]
    return {
        "Config": config,
        "k": knn.n_neighbors,
        "weights": knn.weights,
        "metric": knn.metric,
        "Accuracy": metrics["accuracy"],
        "Precision": metrics["precision"],
        "Recall": metrics["recall"],
        "F1": metrics["f1"],
    }


# Summary table: one row per configuration.
table = pd.DataFrame([
    _summary_row("A (baseline)", pipe_A, res_A),
    _summary_row("B (best_grid)", best_model, res_B),
    _summary_row("C (variant)", pipe_C, res_C),
])

# Show the table in the notebook.
print("Tabela A/B/C com métricas:")
display(table)

# Save the table as CSV next to the other artifacts.
csv_path = os.path.join(OUTDIR, "table_ABC.csv")
table.to_csv(csv_path, index=False)
print(f"Tabela A/B/C salva em: {csv_path}")


Tabela A/B/C com métricas:


Unnamed: 0,Config,k,weights,metric,Accuracy,Precision,Recall,F1
0,A (baseline),5,uniform,euclidean,0.959064,0.938596,1.0,0.968326
1,B (best_grid),7,uniform,manhattan,0.959064,0.938596,1.0,0.968326
2,C (variant),7,distance,manhattan,0.959064,0.938596,1.0,0.968326


Tabela A/B/C salva em: outputs/table_ABC.csv


In [None]:
# Example prediction with the best model on the first test row.
# iloc[[0]] keeps a one-row DataFrame with the fitted column names; a bare
# NumPy array would drop them and trigger scikit-learn's feature-name warning.
nova_amostra = X_test.iloc[[0]]
pred = best_model.predict(nova_amostra)[0]
proba = best_model.predict_proba(nova_amostra)[0]
# Use the shared class_names list rather than a second inline 0/1 mapping.
print(f"Nova amostra → classe: {class_names[pred]} | probabilidades: {proba.round(3)}")


Nova amostra → classe: malignant | probabilidades: [1. 0.]




In [21]:
# ---------------------------------------------------------------
# Generate the inference scripts: single-sample CLI and CSV batch.
# Both templates are f-strings: {OUTDIR} is filled in here, while the
# doubled braces ({{ }}) become literal braces in the written scripts.
# ---------------------------------------------------------------
predict_one_code = f"""
import joblib
import numpy as np
import sys

# Carregar modelo
model = joblib.load("{OUTDIR}/knn_pipeline_best.joblib")
class_names = ["malignant", "benign"]

# Receber 30 features como argumentos
if len(sys.argv) != 31:
    print("Uso: python predict_one.py f1 f2 ... f30")
    sys.exit(1)

features = np.array([float(x) for x in sys.argv[1:]]).reshape(1, -1)
pred = model.predict(features)[0]
proba = model.predict_proba(features)[0]
print(f"Classe prevista: {{class_names[pred]}}")
print(f"Probabilidades: {{proba.round(3)}}")
"""

predict_csv_code = f"""
import joblib
import pandas as pd
import sys
import numpy as np

# Carregar modelo
model = joblib.load("{OUTDIR}/knn_pipeline_best.joblib")
class_names = ["malignant", "benign"]

# Receber CSV como argumento
if len(sys.argv) != 2:
    print("Uso: python predict_csv.py arquivo.csv")
    sys.exit(1)

csv_file = sys.argv[1]
df = pd.read_csv(csv_file)

if df.shape[1] != 30:
    print("O CSV deve ter 30 colunas (features).")
    sys.exit(1)

preds = model.predict(df.values)
probs = model.predict_proba(df.values)

for i, (p, prob) in enumerate(zip(preds, probs)):
    print(f"Amostra {{i}}: Classe prevista: {{class_names[p]}}, Probabilidades: {{prob.round(3)}}")
"""

# Write both scripts into the current working directory.
for script_name, script_code in (
    ("predict_one.py", predict_one_code),
    ("predict_csv.py", predict_csv_code),
):
    with open(script_name, "w") as f:
        f.write(script_code)

print("Scripts predict_one.py e predict_csv.py gerados com sucesso!")


Scripts predict_one.py e predict_csv.py gerados com sucesso!


In [11]:
# Predict a single sample: the 30 feature values are passed to
# predict_one.py as command-line arguments.
!python predict_one.py 17.99 10.38 122.8 1001 0.1184 0.2776 0.3001 0.1471 0.2419 0.07871 1.095 0.9053 8.589 153.4 0.006399 0.04904 0.05373 0.01587 0.03003 0.006193 25.38 17.33 184.6 2019 0.1622 0.6656 0.7119 0.2654 0.4601 0.1189


Classe prevista: malignant
Probabilidades: [1. 0.]


In [17]:
# =========================================
# Criar CSV de teste com 30 colunas
# =========================================
import pandas as pd
import numpy as np

# Number of synthetic rows to generate (change as needed).
num_amostras = 3

# Seeded generator, so the generated CSV is identical on every run.
rng = np.random.default_rng(42)

# Uniform random values in [0, 1) for the 30 features. Kept under its own name
# so the `data` variable holding the loaded dataset is not overwritten.
amostras_aleatorias = rng.random((num_amostras, 30))

# DataFrame with columns f1, f2, ..., f30.
df = pd.DataFrame(amostras_aleatorias, columns=[f"f{i+1}" for i in range(30)])

# Save the CSV consumed by predict_csv.py.
csv_test_file = "novas_amostras.csv"
df.to_csv(csv_test_file, index=False)
print(f"CSV de teste criado: {csv_test_file}")
df.head()


CSV de teste criado: novas_amostras.csv


Unnamed: 0,f1,f2,f3,f4,f5,f6,f7,f8,f9,f10,...,f21,f22,f23,f24,f25,f26,f27,f28,f29,f30
0,0.975511,0.258846,0.46659,0.24872,0.951589,0.152461,0.563084,0.131391,0.12733,0.298307,...,0.798994,0.380513,0.441298,0.073959,0.909336,0.669996,0.735784,0.196618,0.971911,0.429418
1,0.581542,0.319754,0.284664,0.16577,0.988016,0.630843,0.516598,0.792485,0.056111,0.527679,...,0.919489,0.005894,0.269664,0.722096,0.687614,0.836714,0.180162,0.182522,0.879604,0.771256
2,0.846215,0.099168,0.828524,0.91022,0.477992,0.996887,0.009922,0.121434,0.37284,0.001471,...,0.465479,0.589887,0.874439,0.442747,0.576638,0.820559,0.417855,0.061493,0.0626,0.047855


In [18]:
# Predict every row of the CSV file with predict_csv.py.
!python predict_csv.py novas_amostras.csv


Amostra 0: Classe prevista: benign, Probabilidades: [0.143 0.857]
Amostra 1: Classe prevista: benign, Probabilidades: [0.286 0.714]
Amostra 2: Classe prevista: benign, Probabilidades: [0.143 0.857]
