In [1]:
import pandas as pd

df = pd.read_csv('https://archive.ics.uci.edu/ml/'
                 'machine-learning-databases'
                 '/breast-cancer-wisconsin/wdbc.data', header=None)

# if the Breast Cancer dataset is temporarily unavailable from the
# UCI machine learning repository, un-comment the following line
# of code to load the dataset from a local path:

# df = pd.read_csv('wdbc.data', header=None)

df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,22,23,24,25,26,27,28,29,30,31
0,842302,M,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,842517,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,84300903,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,84348301,M,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,84358402,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


In [2]:
from sklearn.preprocessing import LabelEncoder

X = df.loc[:, 2:].values
y = df.loc[:, 1].values
le = LabelEncoder()
y = le.fit_transform(y)
le.classes_

array(['B', 'M'], dtype=object)

In [3]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = \
    train_test_split(X, y,
                     test_size=0.20,
                     stratify=y,
                     random_state=1)

In [4]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline

pipe_lr = make_pipeline(StandardScaler(),
                        PCA(n_components=2),
                        LogisticRegression())



In [5]:
pipe_lr.fit(X_train, y_train)
y_pred = pipe_lr.predict(X_test)
test_acc = pipe_lr.score(X_test, y_test)
print(f'Test accuracy: {test_acc:.3f}')

Test accuracy: 0.956


In [6]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split

# Crear pipelines
pipe_lr = Pipeline([
    ('scaler', StandardScaler()),  # Escalado
    ('pca', PCA()),                # PCA
    ('classifier', LogisticRegression())  # Regresión logística
])

pipe_svc = Pipeline([
    ('scaler', MinMaxScaler()),  # Otro tipo de escalado
    ('pca', PCA()),              # PCA
    ('classifier', SVC())        # SVM
])

# Definir hiperparámetros a buscar para cada pipeline
param_grid = [
    {
        'scaler': [StandardScaler(), MinMaxScaler()],  # Probar diferentes tipos de escalado
        'pca__n_components': [2, 3],                  # Número de componentes PCA
        'classifier__C': [0.1, 1, 10],                # Regularización para Logistic Regression
        'classifier': [LogisticRegression()]
    },
    {
        'scaler': [StandardScaler(), MinMaxScaler()],
        'pca__n_components': [2, 3],
        'classifier__C': [0.1, 1, 10],                # Regularización para SVM
        'classifier__kernel': ['linear', 'rbf'],       # Tipos de kernel para SVM
        'classifier': [SVC()]
    }
]

# Configurar GridSearchCV para buscar los mejores parámetros en las pipelines
grid_search = GridSearchCV(Pipeline([('scaler', StandardScaler()), ('pca', PCA()), ('classifier', LogisticRegression())]),
                           param_grid, cv=5, scoring='accuracy', n_jobs=-1)

# Ajustar el modelo (entrenar) con GridSearchCV
grid_search.fit(X_train, y_train)

# Imprimir los mejores hiperparámetros y el mejor modelo
print("Mejores parámetros encontrados:")
print(grid_search.best_params_)

print("\nMejor puntuación (accuracy):")
print(grid_search.best_score_)

# Predecir con el mejor modelo en los datos de prueba
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

# Evaluar el modelo final en el conjunto de prueba
from sklearn.metrics import accuracy_score
accuracy = accuracy_score(y_test, y_pred)
print(f"\nAccuracy en el conjunto de prueba: {accuracy}")

Mejores parámetros encontrados:
{'classifier': LogisticRegression(), 'classifier__C': 10, 'pca__n_components': 3, 'scaler': MinMaxScaler()}

Mejor puntuación (accuracy):
0.9516483516483516

Accuracy en el conjunto de prueba: 0.9649122807017544


In [7]:
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score
import numpy as np

# Crear el objeto KFold
kf = KFold(n_splits=5, shuffle=True, random_state=42)  # shuffle=True para mezclar los datos

# Crear el modelo con más iteraciones
model = LogisticRegression(max_iter=10000)

# Lista para almacenar las precisiones y los modelos entrenados en cada fold
accuracies = []
models = []

# Aplicar K-Fold Cross Validation
for train_index, test_index in kf.split(X, y):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    # Escalar los datos (fit en los datos de entrenamiento y transform en el test)
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Entrenar el modelo
    model.fit(X_train_scaled, y_train)

    # Guardar el modelo entrenado
    models.append(model)

    # Predecir y calcular la precisión
    y_pred = model.predict(X_test_scaled)
    accuracy = accuracy_score(y_test, y_pred)
    accuracies.append(accuracy)
    print(f"Accuracy: {accuracy}")

# Encontrar el modelo con la mejor precisión
best_fold_index = np.argmax(accuracies)
best_accuracy = accuracies[best_fold_index]
best_model = models[best_fold_index]

print(f"\nBest Fold Accuracy: {best_accuracy}")
print(f"Best Model Coefficients (weights): {best_model.coef_}")



Accuracy: 0.9736842105263158
Accuracy: 0.9824561403508771
Accuracy: 0.9649122807017544
Accuracy: 0.9912280701754386
Accuracy: 0.9734513274336283

Best Fold Accuracy: 0.9912280701754386
Best Model Coefficients (weights): [[ 0.40156378  0.38658596  0.38863478  0.51110221  0.51285053 -0.26107407
   0.74908943  0.75347212 -0.17000218 -0.26852376  1.18207024 -0.15739296
   0.63153046  1.04213482  0.11121105 -1.02821908 -0.11372338  0.42809495
  -0.35651824 -0.41297071  0.96562907  1.20953785  0.84060571  0.96117449
   0.44605483  0.0273856   0.82957485  0.80472483  0.80281117  0.54589541]]


In [8]:
from sklearn.model_selection import LeaveOneOut
from sklearn.metrics import accuracy_score
import numpy as np

# Crear el objeto LeaveOneOut
loo = LeaveOneOut()

# Listas para almacenar las predicciones y las etiquetas verdaderas
y_true_all = []
y_pred_all = []

# Aplicar Leave-One-Out Cross Validation
for train_index, test_index in loo.split(X):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    # Entrenar el modelo
    model.fit(X_train, y_train)

    # Predecir la etiqueta de prueba
    y_pred = model.predict(X_test)

    # Almacenar las predicciones y las etiquetas verdaderas
    y_pred_all.append(y_pred[0])
    y_true_all.append(y_test[0])

# Calcular la precisión total
accuracy = accuracy_score(y_true_all, y_pred_all)
print(f"Final Accuracy: {accuracy}")



Final Accuracy: 0.9543057996485061


In [9]:
from sklearn.model_selection import StratifiedKFold

# Crear el objeto StratifiedKFold
skf = StratifiedKFold(n_splits=5)  # 5 folds

# Aplicar Stratified K-Fold Cross Validation
for train_index, test_index in skf.split(X, y):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    # Entrenar el modelo
    model.fit(X_train, y_train)

    # Predecir y calcular la precisión
    y_pred = model.predict(X_test)
    print(f"Accuracy: {accuracy_score(y_test, y_pred)}")


Accuracy: 0.9385964912280702
Accuracy: 0.9473684210526315
Accuracy: 0.9824561403508771
Accuracy: 0.9298245614035088
Accuracy: 0.9557522123893806
