In [28]:
# Imports
import pandas as pd
import numpy as np
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.neural_network import MLPClassifier
import pickle

In [29]:
# Load the Breast Cancer dataset from sklearn
# The dataset contains features related to the characteristics of cell nuclei present in breast cancer biopsies
data = load_breast_cancer()

In [30]:
# Convert the data into a pandas DataFrame for easier manipulation
# 'data' contains the features and 'target' contains the labels (0 = malignant, 1 = benign)
X = pd.DataFrame(data.data, columns=data.feature_names)
y = pd.Series(data.target)

In [31]:
# Feature selection: Select the top 10 features using SelectKBest
# SelectKBest selects features that have the highest correlation with the target variable (y) using the ANOVA F-value test
selector = SelectKBest(f_classif, k=10)  # Select the 10 most relevant features
X_selected = selector.fit_transform(X, y)  # Fit the selector and apply it to the data

In [32]:
# Get the names of the selected features
selected_features = X.columns[selector.get_support()]  # Get the column names for selected features
print(f"Selected features: {selected_features}")

Selected features: Index(['mean radius', 'mean perimeter', 'mean area', 'mean concavity',
       'mean concave points', 'worst radius', 'worst perimeter', 'worst area',
       'worst concavity', 'worst concave points'],
      dtype='object')


In [33]:
# Save the selected feature names to a file for later use 
with open('selected_features.pkl', 'wb') as f:
    pickle.dump(selected_features, f)


In [34]:
# Split the dataset into training and test sets (using only the selected features)
# This helps evaluate the model's performance on unseen data
X_train, X_test, y_train, y_test = train_test_split(X_selected, y, test_size=0.2, random_state=42)

In [35]:
# Scale the features using StandardScaler
# StandardScaler standardizes the data by removing the mean and scaling to unit variance
# This ensures that all features are on the same scale, which helps many machine learning algorithms perform better
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)  # Fit and transform the training data
X_test_scaled = scaler.transform(X_test)  # Transform the test data (without fitting to prevent data leakage)

In [36]:
# Save the scaler to a file for later use in the Streamlit app
# Save the scaler to a pickle file
with open('scaler.pkl', 'wb') as f:
    pickle.dump(scaler, f)

In [37]:
# Define the model: MLPClassifier (Multi-layer Perceptron)
# MLPClassifier is a neural network model that can be used for classification tasks
mlp = MLPClassifier(max_iter=500, random_state=42)  # Specify a maximum of 500 iterations for training

In [38]:
# Define the hyperparameter grid for GridSearchCV
# GridSearchCV will try all combinations of parameters to find the best model
param_grid = {
    'hidden_layer_sizes': [(50,), (100,), (200,)],  # Different sizes for the hidden layers
    'activation': ['tanh', 'relu'],  # Activation functions (non-linearities)
    'solver': ['adam', 'sgd'],  # Solvers for optimization (adam is faster and often more reliable)
    'alpha': [0.0001, 0.001, 0.01],  # Regularization parameter to avoid overfitting
    'learning_rate': ['constant', 'adaptive']  # How the learning rate changes during training
}

# Perform GridSearchCV to find the best hyperparameters for the model
# We use 5-fold cross-validation to evaluate each combination of parameters
grid_search = GridSearchCV(estimator=mlp, param_grid=param_grid, cv=5, n_jobs=-1, verbose=2)

# Fit the GridSearchCV on the training data to find the best model
grid_search.fit(X_train_scaled, y_train)

# Output the best hyperparameters and the corresponding score (accuracy) from GridSearchCV
print(f"Best parameters found: {grid_search.best_params_}")
print(f"Best cross-validation score: {grid_search.best_score_:.4f}")

Fitting 5 folds for each of 72 candidates, totalling 360 fits
Best parameters found: {'activation': 'relu', 'alpha': 0.0001, 'hidden_layer_sizes': (200,), 'learning_rate': 'constant', 'solver': 'adam'}
Best cross-validation score: 0.9648


In [39]:
# Get the best model from GridSearchCV (the one with the best hyperparameters)
best_model = grid_search.best_estimator_

In [40]:
# Evaluate the model on the test set using classification metrics
# We predict the labels for the test set and compare them to the true labels
from sklearn.metrics import classification_report
y_pred = best_model.predict(X_test_scaled)

In [41]:
# Save the best model to a file for use in a Streamlit app or for future use
with open('model.pkl', 'wb') as f:
    pickle.dump(best_model, f)

# Print a confirmation message to indicate the model, scaler, and selected features have been saved successfully
print("Model, scaler, and selected features saved successfully!")

Model, scaler, and selected features saved successfully!
