In [None]:
# Import Libraries

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold 
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from imblearn.over_sampling import SMOTE
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.ensemble import GradientBoostingClassifier

In [None]:


# Load the dataset from the CSV file (path is relative to the working directory)
rawDF = pd.read_csv(filepath_or_buffer="Cancer_Data.csv")

# Preview the first five rows as a quick sanity check of the load
rawDF.head(n=5)


In [None]:
# Drop the 'id' and 'Unnamed: 32' columns as they are useless to us.
# errors="ignore" keeps this cell safe to re-run once the columns are gone
# (a second run would otherwise raise a KeyError).
rawDF = rawDF.drop(columns=["Unnamed: 32", "id"], errors="ignore")

# Change diagnosis into a numeric binary variable: "M" -> 1, anything else -> 0.
# The guard makes the encoding idempotent: re-running the cell on the already
# numeric column would compare integers with "M" and silently set every label to 0.
if not pd.api.types.is_numeric_dtype(rawDF["diagnosis"]):
    rawDF["diagnosis"] = (rawDF["diagnosis"] == "M").astype("int64")

# Take a deeper look at the data with summary statistics
rawDF.describe()

In [None]:
# Count the missing values of every column in one pass, then report them
for col, naCount in rawDF.isna().sum().items():
    print(f"The number of NA values in the {col} col is {naCount}")

In [None]:
# Visualization Functions

def generateHistograms(rawDF, column_names):
    """Draw one histogram per column to show the distribution of the data.

    The plots are laid out on a grid that is five subplots wide; grid cells
    that are not needed are hidden.

    Parameters
    ----------
    rawDF : pandas.DataFrame
        Frame that holds the columns to plot.
    column_names : sequence of str
        Names of the columns of ``rawDF`` to plot, one subplot each.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``.
    """
    n_plots = len(column_names)
    if n_plots == 0:
        # Nothing to draw, and a zero-row grid is not a valid subplot layout.
        return

    n_cols = 5
    n_rows = (n_plots + n_cols - 1) // n_cols  # ceiling division

    # plt.subplots returns a (figure, axes) pair, so it has to be unpacked.
    # squeeze=False always yields a 2-D axes array, which flatten() can handle.
    _, axs = plt.subplots(n_rows, n_cols, figsize=(20, n_rows * 5), squeeze=False)
    axs = axs.flatten()

    for ax, col in zip(axs, column_names):
        # NaNs are dropped before binning
        ax.hist(rawDF[col].dropna(), bins=15)
        ax.set_xlabel(col, labelpad=14)
        ax.set_ylabel("Frequency", labelpad=14)
        ax.set_title(f"Distribution of {col}")

    # Hide the grid cells left over after the last plotted column
    for ax in axs[n_plots:]:
        ax.axis('off')

    plt.tight_layout()
    plt.show()

# Outliers are visualised here; that said, we may not need to worry about
# outliers in the context of this data
def generateBoxplots(rawDF, column_names):
    """Draw one boxplot per column to show outliers.

    The plots are laid out on a grid that is five subplots wide; grid cells
    that are not needed are hidden.

    Parameters
    ----------
    rawDF : pandas.DataFrame
        Frame that holds the columns to plot.
    column_names : sequence of str
        Names of the columns of ``rawDF`` to plot, one subplot each.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``.
    """
    n_plots = len(column_names)
    if n_plots == 0:
        # Nothing to draw, and a zero-row grid is not a valid subplot layout.
        return

    n_cols = 5
    n_rows = (n_plots + n_cols - 1) // n_cols  # ceiling division

    # plt.subplots returns a (figure, axes) pair, so it has to be unpacked.
    # squeeze=False always yields a 2-D axes array, which flatten() can handle.
    _, axs = plt.subplots(n_rows, n_cols, figsize=(20, n_rows * 5), squeeze=False)
    axs = axs.flatten()

    for ax, col in zip(axs, column_names):
        sns.boxplot(x=rawDF[col], ax=ax)
        ax.set_title(f"Boxplot of {col}")

    # Hide the grid cells left over after the last plotted column
    for ax in axs[n_plots:]:
        ax.axis('off')

    plt.tight_layout()
    plt.show()

# Plot every column except the target 'diagnosis'
columns_to_plot = rawDF.columns.drop(['diagnosis'])
generateHistograms(rawDF=rawDF, column_names=columns_to_plot)
generateBoxplots(rawDF=rawDF, column_names=columns_to_plot)

In [None]:
# Triangle Correlation Heatmap

# Pairwise correlations of all columns, computed once and reused below
corrMatrix = rawDF.corr()

# Boolean mask that is True on and above the diagonal; masked cells are not
# drawn, which leaves only the lower triangle visible
mask = np.triu(np.ones(corrMatrix.shape, dtype=bool))

plt.figure(figsize=(20, 8))
heatmap = sns.heatmap(
    corrMatrix,
    mask=mask,
    vmin=-1,
    vmax=1,
    annot=True,
    cmap='BrBG',
)
heatmap.set_title('Triangle Correlation Heatmap', fontdict={'fontsize':18}, pad=16)

# Save before show() so the written file contains the rendered figure
plt.savefig('heatmap.png', dpi=300, bbox_inches='tight')
plt.show()

In [None]:
# Splitting Data

# Target vector and feature matrix
yDF = rawDF["diagnosis"]
xDF = rawDF.drop(columns=["diagnosis"])

# Hold out 5% of the rows for testing. With such a small hold-out set and
# imbalanced labels, stratify=yDF keeps the class ratio of the test split
# equal to that of the full data instead of leaving it to chance.
xTrain, xTest, yTrain, yTest = train_test_split(
    xDF, yDF,
    test_size=0.05,
    shuffle=True,
    stratify=yDF,
    random_state=42)

In [None]:
# Oversample the training split with SMOTE to handle class imbalance
# (only the training data is resampled; the test split is left as is)
smote = SMOTE(random_state=42)
xTrainBalanced, yTrainBalanced = smote.fit_resample(X=xTrain, y=yTrain)

In [None]:
# Feature Selection

# Keep the 25 features that score highest under f_classif. The selector is
# fitted on the balanced training data only and then applied to both splits.
select = SelectKBest(score_func=f_classif, k=25)
select.fit(xTrainBalanced, yTrainBalanced)
xTrainSelect = select.transform(xTrainBalanced)
xTestSelect = select.transform(xTest)

In [None]:
# Standardization

# Fit the scaler on the *selected* training features so that it is fitted on
# exactly the columns xTestSelect contains. Fitting on xTrainBalanced (all
# features) would make transform(xTestSelect) fail on the feature-count
# mismatch and would bypass the feature selection for the models.
xScaler = StandardScaler()
xTrainScaled = xScaler.fit_transform(xTrainSelect)
xTestScaled = xScaler.transform(xTestSelect)

In [None]:
# Logistic Regression Hyperparameter Tuning

# Grid: both penalties at three values of C, always with the liblinear solver
logParams = dict(
    penalty=['l1', 'l2'],
    C=[0.01, 0.1, 1],
    solver=['liblinear'],
)

# Exhaustive 5-fold search that ranks candidates by recall
logSearch = GridSearchCV(
    LogisticRegression(),
    logParams,
    scoring='recall',
    cv=5,
    verbose=0,
    n_jobs=-1,
)
logSearch.fit(xTrainScaled, yTrainBalanced)

# Best parameter combination found; displayed as the cell output
logBestParams = logSearch.best_params_
logBestParams

In [None]:
# SVM Hyperparameter Tuning

# Grid over the regularisation strength, the kernel coefficient and the kernel
svmParams = {'C':[1, 10, 100, 1000],
            'gamma':[1, 0.1, 0.001, 0.0001], 
            'kernel':['linear','rbf']}

# Exhaustive 5-fold search that ranks candidates by recall
svmSearch = GridSearchCV(estimator = SVC(),  
                        param_grid = svmParams,
                        scoring = 'recall',
                        cv = 5,
                        verbose=0,
                        n_jobs=-1)


# xTrainScaled is derived from the SMOTE-resampled data, so the matching
# labels are yTrainBalanced; yTrain has a different number of rows.
svmSearch.fit(xTrainScaled, yTrainBalanced) 

# Best parameter combination found; displayed as the cell output
svmBestParams = svmSearch.best_params_

svmBestParams

In [None]:
# MLP Hyperparameter Tuning

# Grid over network shape, L2 penalty (alpha) and learning-rate schedule.
# (100,) is written as a real one-element tuple: (100) is just the int 100.
mlpParams = {'max_iter': [1000],
            'hidden_layer_sizes': [(50,50), (50,50,50), (100,)],
            'activation': ['relu'],
            'solver': ['adam'],
            'alpha': [0.0001, 0.05],
            'learning_rate': ['constant','adaptive']}

# Exhaustive 5-fold search that ranks candidates by recall
mlpSearch = GridSearchCV(estimator = MLPClassifier(),  
                           param_grid = mlpParams,
                           scoring = 'recall',
                           cv = 5,
                           verbose=0,
                           n_jobs=-1)


# xTrainScaled is derived from the SMOTE-resampled data, so the matching
# labels are yTrainBalanced; yTrain has a different number of rows.
mlpSearch.fit(xTrainScaled, yTrainBalanced) 

# Best parameter combination found; displayed as the cell output
mlpBestParams = mlpSearch.best_params_

mlpBestParams

In [None]:
# Decision Tree Hyperparameter Tuning

# Grid over split criterion/strategy and the tree-size constraints
treeParams = {'criterion': ['gini', 'entropy'],
              'splitter': ['best', 'random'],
              'max_depth': [None, 10, 20, 30, 40, 50],
              'min_samples_split': [2, 5, 10],
              'min_samples_leaf': [1, 2, 4],
              'max_features': [None, 'sqrt', 'log2']}

# Exhaustive 5-fold search that ranks candidates by recall
treeSearch = GridSearchCV(estimator = DecisionTreeClassifier(),  
                           param_grid = treeParams,
                           scoring = 'recall',
                           cv = 5,
                           verbose=0,
                           n_jobs=-1)


# xTrainScaled is derived from the SMOTE-resampled data, so the matching
# labels are yTrainBalanced; yTrain has a different number of rows.
treeSearch.fit(xTrainScaled, yTrainBalanced) 

# Best parameter combination found; displayed as the cell output
treeBestParams = treeSearch.best_params_

treeBestParams

In [None]:
# Random Forest Hyperparameter Tuning

# Grid over forest size and the per-tree size constraints
rfParams = {'n_estimators': [100, 200, 300],
            'max_depth': [None, 10, 20, 30],
            'min_samples_split': [2, 5, 10],
            'min_samples_leaf': [1, 2, 4],
            'max_features': ['sqrt', 'log2', None]}

# Exhaustive 5-fold search that ranks candidates by recall
rfSearch = GridSearchCV(estimator=RandomForestClassifier(),
                             param_grid=rfParams,
                             scoring='recall',
                             cv=5,
                             verbose=0,
                             n_jobs=-1)  # Use all available CPUs

# xTrainScaled is derived from the SMOTE-resampled data, so the matching
# labels are yTrainBalanced; yTrain has a different number of rows.
rfSearch.fit(xTrainScaled, yTrainBalanced)

# Best parameter combination found; displayed as the cell output
rfBestParams = rfSearch.best_params_

rfBestParams

In [None]:
# KNN Hyperparameter Tuning

# Grid over neighbourhood size, vote weighting, search algorithm and the
# Minkowski power parameter p
knnParams = {'n_neighbors': [3, 5, 7, 9, 11],
            'weights': ['uniform', 'distance'],
            'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
            'p': [1, 2]}

# Exhaustive 5-fold search that ranks candidates by recall
knnSearch = GridSearchCV(estimator=KNeighborsClassifier(),
                        param_grid=knnParams,
                        scoring='recall',
                        cv=5,
                        verbose=0,
                        n_jobs=-1)

# xTrainScaled is derived from the SMOTE-resampled data, so the matching
# labels are yTrainBalanced; yTrain has a different number of rows.
knnSearch.fit(xTrainScaled, yTrainBalanced)

# Best parameter combination found; displayed as the cell output
knnBestParams = knnSearch.best_params_

knnBestParams


In [None]:
# Gradient Boosting Hyperparameter Tuning

# Grid over ensemble size, shrinkage and the depth of the individual trees
gbParams = dict(
    n_estimators=[100, 200, 300],
    learning_rate=[0.01, 0.1, 0.5, 1],
    max_depth=[3, 4, 5, 6],
)

# Exhaustive 5-fold search that ranks candidates by recall
gbSearch = GridSearchCV(
    GradientBoostingClassifier(),
    gbParams,
    scoring='recall',
    cv=5,
    verbose=0,
    n_jobs=-1,
)
gbSearch.fit(xTrainScaled, yTrainBalanced)

# Best parameter combination found; displayed as the cell output
gbBestParams = gbSearch.best_params_
gbBestParams

In [None]:
# Cross-validation function
def crossValidate(clf, xTrain, yTrain, params, cv=5):
    """Evaluate a classifier with shuffled, stratified k-fold cross-validation.

    For every fold a fresh ``clf(**params)`` is fitted on the training part
    and scored on the validation part. Per-fold accuracy, recall and F1 are
    printed, followed by the mean accuracy, F1, precision and recall.

    Parameters
    ----------
    clf : callable
        Estimator class (not an instance); called as ``clf(**params)``.
    xTrain : array-like
        Feature matrix; NumPy arrays and pandas objects are both accepted.
    yTrain : array-like
        Binary target labels aligned row-for-row with ``xTrain``.
    params : dict
        Keyword arguments used to construct the estimator.
    cv : int, default 5
        Number of folds.

    Returns
    -------
    None
        Results are printed only.
    """

    def _takeRows(data, rowIndex):
        # The splitter yields *positional* indices. data[idx] is label-based
        # for a pandas Series and column-based for a DataFrame, so pandas
        # objects must go through .iloc; other containers keep plain indexing.
        if hasattr(data, "iloc"):
            return data.iloc[rowIndex]
        return data[rowIndex]

    # Separate name so the integer `cv` argument is not shadowed
    splitter = StratifiedKFold(n_splits=cv, shuffle=True, random_state=42)
    accuracyScores, f1Scores, precisionScores, recallScores = [], [], [], []

    for i, (trainIndex, valIndex) in enumerate(splitter.split(xTrain, yTrain)):
        xLocalTrain, xVal = _takeRows(xTrain, trainIndex), _takeRows(xTrain, valIndex)
        yLocalTrain, yVal = _takeRows(yTrain, trainIndex), _takeRows(yTrain, valIndex)

        # New, unfitted model for every fold
        model = clf(**params)
        model.fit(xLocalTrain, yLocalTrain)

        yPred = model.predict(xVal)

        accuracyScores.append(accuracy_score(yVal, yPred))
        f1Scores.append(f1_score(yVal, yPred))
        precisionScores.append(precision_score(yVal, yPred))
        recallScores.append(recall_score(yVal, yPred))

        print(f"Completed Fold {i}")
        print(f"    Accuracy={accuracyScores[i]}    Recall={recallScores[i]}    f1Score={f1Scores[i]}")

    print("Mean accuracy score:", np.mean(accuracyScores))
    print("Mean f1Score:", np.mean(f1Scores))
    print("Mean precision score:", np.mean(precisionScores))
    print("Mean recall score:", np.mean(recallScores))

In [None]:
# Cross-validate models: each estimator class is paired with the best
# hyperparameters found by its grid search and evaluated in the same order
tunedModels = [
    (LogisticRegression, logBestParams),
    (SVC, svmBestParams),
    (MLPClassifier, mlpBestParams),
    (DecisionTreeClassifier, treeBestParams),
    (RandomForestClassifier, rfBestParams),
    (KNeighborsClassifier, knnBestParams),
    (GradientBoostingClassifier, gbBestParams),
]
for modelClass, bestParams in tunedModels:
    crossValidate(modelClass, xTrainScaled, yTrainBalanced, bestParams)