In [None]:
# Import Libraries

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold 
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from imblearn.over_sampling import SMOTE
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.ensemble import GradientBoostingClassifier

In [None]:


# Load the raw dataset from the CSV file in the working directory
DATA_FILE = "Cancer_Data.csv"
rawDF = pd.read_csv(DATA_FILE)

# Preview the first five rows
rawDF.head(n=5)


In [None]:
# Drop the 'id' and "Unnamed: 32" columns, which are not used as features.
# errors="ignore" keeps this cell re-runnable: on a second run the columns are
# already gone and a plain drop() would raise KeyError.
rawDF = rawDF.drop(columns=["Unnamed: 32", "id"], errors="ignore")

# Turn diagnosis into a numeric binary variable: 1 where the label is "M",
# 0 otherwise. The guard matters on re-run: once the column is numeric,
# comparing against "M" again would silently turn every label into 0.
if not pd.api.types.is_numeric_dtype(rawDF["diagnosis"]):
    rawDF["diagnosis"] = (rawDF["diagnosis"] == "M").astype(int)

# Summary statistics for the remaining columns
rawDF.describe()

In [None]:
# Count the missing values of every column in one pass, then report them
naCounts = rawDF.isna().sum()
for col, naCount in naCounts.items():
    print(f"The number of NA values in the {col} col is {naCount}")

In [None]:
# Visualization Functions

# This function generates histograms for each feature to show the distribution of the data
def generateHistograms(rawDF, column_names):
    """Draw one 15-bin histogram per column on a grid that is 5 plots wide.

    Parameters
    ----------
    rawDF : DataFrame-like
        Source of the data; indexed as ``rawDF[col]`` for each column name.
    column_names : iterable of str
        Columns to plot. Nothing is drawn when it is empty.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``.
    """
    # Materialise first: a pandas Index cannot be truth-tested directly.
    column_names = list(column_names)
    if not column_names:
        return

    n_cols = 5
    n_rows = (len(column_names) + n_cols - 1) // n_cols  # ceiling division
    # plt.subplots returns (figure, axes); squeeze=False keeps the axes as a
    # 2-D array even when there is only one row, so flatten() always works.
    _, axs = plt.subplots(n_rows, n_cols, figsize=(20, n_rows * 5), squeeze=False)
    axs = axs.flatten()

    for ax, col in zip(axs, column_names):
        ax.hist(rawDF[col].dropna(), bins=15)
        ax.set_xlabel(col, labelpad=14)
        ax.set_ylabel("Frequency", labelpad=14)
        ax.set_title(f"Distribution of {col}")

    # Hide the unused cells of the grid
    for ax in axs[len(column_names):]:
        ax.axis('off')

    plt.tight_layout()
    plt.show()

# This function generates boxplots for each feature to show outliers
# That said, we may not need to worry about outliers in the context of this data
def generateBoxplots(rawDF, column_names):
    """Draw one horizontal seaborn boxplot per column on a grid 5 plots wide.

    Parameters
    ----------
    rawDF : DataFrame-like
        Source of the data; indexed as ``rawDF[col]`` for each column name.
    column_names : iterable of str
        Columns to plot. Nothing is drawn when it is empty.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``.
    """
    # Materialise first: a pandas Index cannot be truth-tested directly.
    column_names = list(column_names)
    if not column_names:
        return

    n_cols = 5
    n_rows = (len(column_names) + n_cols - 1) // n_cols  # ceiling division
    # plt.subplots returns (figure, axes); squeeze=False keeps the axes as a
    # 2-D array even when there is only one row, so flatten() always works.
    _, axs = plt.subplots(n_rows, n_cols, figsize=(20, n_rows * 5), squeeze=False)
    axs = axs.flatten()

    for ax, col in zip(axs, column_names):
        sns.boxplot(x=rawDF[col], ax=ax)
        ax.set_title(f"Boxplot of {col}")

    # Hide the unused cells of the grid
    for ax in axs[len(column_names):]:
        ax.axis('off')

    plt.tight_layout()
    plt.show()

# Plot every column except the target: histograms first, then boxplots
columns_to_plot = rawDF.columns.drop('diagnosis')
for plotFeatures in (generateHistograms, generateBoxplots):
    plotFeatures(rawDF, columns_to_plot)

In [None]:
# Triangle Correlation Heatmap
# Compute the correlation matrix once and reuse it for both the mask and the plot
corrMatrix = rawDF.corr()

# True on and above the diagonal, so only the lower triangle is drawn
mask = np.triu(np.ones_like(corrMatrix, dtype=bool))

fig, ax = plt.subplots(figsize=(20, 8))
heatmap = sns.heatmap(
    corrMatrix,
    mask=mask,
    vmin=-1,
    vmax=1,
    annot=True,
    cmap='BrBG',
    ax=ax,
)
heatmap.set_title('Triangle Correlation Heatmap', fontdict={'fontsize':18}, pad=16)

# Save before show(): the figure is written to disk at 300 dpi with tight bounds
plt.savefig('heatmap.png', dpi=300, bbox_inches='tight')
plt.show()

In [None]:
# Splitting Data

# Target vector and feature matrix
yDF = rawDF["diagnosis"]
xDF = rawDF.drop(columns=["diagnosis"])

# Hold out 5% of the rows as the test set (shuffled, fixed seed)
xTrain, xTest, yTrain, yTest = train_test_split(
    xDF, yDF, test_size=0.05, shuffle=True, random_state=42
)

In [None]:
# Handle class imbalance with SMOTE, resampling the training split only
# (the test split is left untouched)
smote = SMOTE(random_state=42)
balancedParts = smote.fit_resample(xTrain, yTrain)
xTrainBalanced, yTrainBalanced = balancedParts

In [None]:
# Feature Selection
# Keep the 25 best-scoring features according to f_classif. The selector is
# fitted on the balanced training data and then applied unchanged to both splits.
select = SelectKBest(score_func=f_classif, k=25)
select.fit(xTrainBalanced, yTrainBalanced)
xTrainSelect = select.transform(xTrainBalanced)
xTestSelect = select.transform(xTest)

In [None]:
# Standardization
# Fit the scaler on the *selected* training features so that it sees the same
# 25 columns it is later asked to transform for the test split. Fitting on
# xTrainBalanced (all columns) made transform(xTestSelect) fail with a
# feature-count mismatch and discarded the feature selection for training.
xScaler = StandardScaler()
xTrainScaled = xScaler.fit_transform(xTrainSelect)
xTestScaled = xScaler.transform(xTestSelect)

In [None]:
# Logistic Regression Hyperparameter Tuning
# Candidate settings: both penalties over three regularisation strengths,
# always with the liblinear solver.
logParams = {
    'penalty': ['l1', 'l2'],
    'C': [0.01, 0.1, 1],
    'solver': ['liblinear'],
}

# 5-fold grid search that ranks candidates by recall, using every CPU core
logSearch = GridSearchCV(
    LogisticRegression(),
    logParams,
    scoring='recall',
    cv=5,
    verbose=0,
    n_jobs=-1,
)
logSearch.fit(xTrainScaled, yTrainBalanced)

# Best combination found; shown as the cell output
logBestParams = logSearch.best_params_
logBestParams

In [None]:
# SVM Hyperparameter Tuning
# Candidate settings for the 5-fold, recall-scored grid search
svmParams = {'C':[1, 10, 100, 1000],
            'gamma':[1, 0.1, 0.001, 0.0001], 
            'kernel':['linear','rbf']}

svmSearch = GridSearchCV(estimator = SVC(),  
                        param_grid = svmParams,
                        scoring = 'recall',
                        cv = 5,
                        verbose=0,
                        n_jobs=-1)

# xTrainScaled is derived from the SMOTE-resampled training data, so it must
# be paired with yTrainBalanced. Using yTrain raises an inconsistent-sample-count
# error because SMOTE changed the number of rows.
svmSearch.fit(xTrainScaled, yTrainBalanced) 

# Best combination found; shown as the cell output
svmBestParams = svmSearch.best_params_

svmBestParams

In [None]:
# MLP Hyperparameter Tuning
# Candidate settings for the 5-fold, recall-scored grid search.
# (100,) is written as a real one-element tuple; the previous (100) was just
# the integer 100 in parentheses.
mlpParams = {'max_iter': [1000],
            'hidden_layer_sizes': [(50,50), (50,50,50), (100,)],
            'activation': ['relu'],
            'solver': ['adam'],
            'alpha': [0.0001, 0.05],
            'learning_rate': ['constant','adaptive']}

# random_state fixes the weight initialisation so the search is reproducible
mlpSearch = GridSearchCV(estimator = MLPClassifier(random_state=42),  
                           param_grid = mlpParams,
                           scoring = 'recall',
                           cv = 5,
                           verbose=0,
                           n_jobs=-1)

# xTrainScaled is derived from the SMOTE-resampled training data, so it must
# be paired with yTrainBalanced. Using yTrain raises an inconsistent-sample-count
# error because SMOTE changed the number of rows.
mlpSearch.fit(xTrainScaled, yTrainBalanced) 

# Best combination found; shown as the cell output
mlpBestParams = mlpSearch.best_params_

mlpBestParams

In [None]:
# Decision Tree Hyperparameter Tuning
# Candidate settings for the 5-fold, recall-scored grid search
treeParams = {'criterion': ['gini', 'entropy'],
              'splitter': ['best', 'random'],
              'max_depth': [None, 10, 20, 30, 40, 50],
              'min_samples_split': [2, 5, 10],
              'min_samples_leaf': [1, 2, 4],
              'max_features': [None, 'sqrt', 'log2']}

# random_state makes the 'random' splitter and feature sub-sampling reproducible
treeSearch = GridSearchCV(estimator = DecisionTreeClassifier(random_state=42),  
                           param_grid = treeParams,
                           scoring = 'recall',
                           cv = 5,
                           verbose=0,
                           n_jobs=-1)

# xTrainScaled is derived from the SMOTE-resampled training data, so it must
# be paired with yTrainBalanced. Using yTrain raises an inconsistent-sample-count
# error because SMOTE changed the number of rows.
treeSearch.fit(xTrainScaled, yTrainBalanced) 

# Best combination found; shown as the cell output
treeBestParams = treeSearch.best_params_

treeBestParams

In [None]:
# Random Forest Hyperparameter Tuning
# Candidate settings for the 5-fold, recall-scored grid search
rfParams = {'n_estimators': [100, 200, 300],
            'max_depth': [None, 10, 20, 30],
            'min_samples_split': [2, 5, 10],
            'min_samples_leaf': [1, 2, 4],
            'max_features': ['sqrt', 'log2', None]}

# random_state makes the bootstrap and feature sub-sampling reproducible
rfSearch = GridSearchCV(estimator=RandomForestClassifier(random_state=42),
                             param_grid=rfParams,
                             scoring='recall',
                             cv=5,
                             verbose=0,
                             n_jobs=-1)  # Use all available CPUs

# xTrainScaled is derived from the SMOTE-resampled training data, so it must
# be paired with yTrainBalanced. Using yTrain raises an inconsistent-sample-count
# error because SMOTE changed the number of rows.
rfSearch.fit(xTrainScaled, yTrainBalanced)

# Best combination found; shown as the cell output
rfBestParams = rfSearch.best_params_

rfBestParams

In [None]:
# K-NN Hyperparameter Tuning
# Candidate settings for the 5-fold, recall-scored grid search
knnParams = {'n_neighbors': [3, 5, 7, 9, 11],
            'weights': ['uniform', 'distance'],
            'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
            'p': [1, 2]}

knnSearch = GridSearchCV(estimator=KNeighborsClassifier(),
                        param_grid=knnParams,
                        scoring='recall',
                        cv=5,
                        verbose=0,
                        n_jobs=-1)

# xTrainScaled is derived from the SMOTE-resampled training data, so it must
# be paired with yTrainBalanced. Using yTrain raises an inconsistent-sample-count
# error because SMOTE changed the number of rows.
knnSearch.fit(xTrainScaled, yTrainBalanced)

# Best combination found; shown as the cell output
knnBestParams = knnSearch.best_params_

knnBestParams


In [None]:
# Gradient Boosting Hyperparameter Tuning

# Candidate settings: ensemble size, shrinkage and tree depth
gbParams = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.1, 0.5, 1],
    'max_depth': [3, 4, 5, 6],
}

# 5-fold grid search that ranks candidates by recall, using every CPU core
gbSearch = GridSearchCV(
    GradientBoostingClassifier(),
    gbParams,
    scoring='recall',
    cv=5,
    verbose=0,
    n_jobs=-1,
)
gbSearch.fit(xTrainScaled, yTrainBalanced)

# Best combination found; shown as the cell output
gbBestParams = gbSearch.best_params_
gbBestParams

In [None]:
# 5-fold stratified cross-validation of Logistic Regression with the tuned
# hyperparameters (logBestParams from the grid search).
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

accuracyScores = []
f1Scores = []
precisionScores = []
recallScores = []
aucScores = []  # not populated in this cell

for i, (trainIndex, valIndex) in enumerate(cv.split(xTrain, yTrain)):

    ### making training and validation sets
    # cv.split yields positions within xTrain/yTrain, so the validation rows
    # must come from xTrain/yTrain too. Indexing xDF/yDF with these positions
    # selects unrelated rows.
    xLocalTrain, xVal = xTrain.iloc[trainIndex], xTrain.iloc[valIndex]
    yLocalTrain, yVal = yTrain.iloc[trainIndex], yTrain.iloc[valIndex]

    ### feature scaling
    # Fit on this fold's training rows only, then apply to the validation rows.
    # Fold-local names keep the notebook-level xScaler/xTrainScaled intact.
    foldScaler = StandardScaler()
    xLocalTrainScaled = foldScaler.fit_transform(xLocalTrain)
    xValScaled = foldScaler.transform(xVal)

    ### model training
    # logisticParams was never defined; the tuned settings live in logBestParams
    clf = LogisticRegression(**logBestParams)
    clf.fit(xLocalTrainScaled, yLocalTrain)

    ### model prediction and evaluation
    yPred = clf.predict(xValScaled)

    # Calculate metrics and store them
    accuracyScores.append(accuracy_score(yVal, yPred))
    f1Scores.append(f1_score(yVal, yPred))
    precisionScores.append(precision_score(yVal, yPred))
    recallScores.append(recall_score(yVal, yPred))

    print(f"Completed Fold {i}")
    print(f"    Accuracy={accuracyScores[i]}    Recall={recallScores[i]}    FScore={f1Scores[i]}")

## Calculate the mean scores across all folds
mean_score = sum(accuracyScores) / len(accuracyScores)
print("Mean accuracy score:", mean_score)

mean_score = sum(f1Scores) / len(f1Scores)
print("Mean f1 score:", mean_score)

mean_score = sum(precisionScores) / len(precisionScores)
print("Mean precision score:", mean_score)

mean_score = sum(recallScores) / len(recallScores)
print("Mean recall score:", mean_score)

In [None]:
# 5-fold stratified cross-validation of the Decision Tree with the tuned
# hyperparameters (treeBestParams from the grid search).
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

accuracyScores = []
f1Scores = []
precisionScores = []
recallScores = []
aucScores = []  # not populated in this cell

for i, (trainIndex, valIndex) in enumerate(cv.split(xTrain, yTrain)):

    ### making training and validation sets
    # cv.split yields positions within xTrain/yTrain, so the validation rows
    # must come from xTrain/yTrain too. Indexing xDF/yDF with these positions
    # selects unrelated rows.
    xLocalTrain, xVal = xTrain.iloc[trainIndex], xTrain.iloc[valIndex]
    yLocalTrain, yVal = yTrain.iloc[trainIndex], yTrain.iloc[valIndex]

    ### feature scaling
    # Fit on this fold's training rows only, then apply to the validation rows.
    # Fold-local names keep the notebook-level xScaler/xTrainScaled intact.
    foldScaler = StandardScaler()
    xLocalTrainScaled = foldScaler.fit_transform(xLocalTrain)
    xValScaled = foldScaler.transform(xVal)

    ### model training
    # treeParams is the search grid (lists of candidates); the estimator needs
    # the single chosen combination in treeBestParams
    clf = DecisionTreeClassifier(**treeBestParams)
    clf.fit(xLocalTrainScaled, yLocalTrain)

    ### model prediction and evaluation
    yPred = clf.predict(xValScaled)

    # Calculate metrics and store them
    accuracyScores.append(accuracy_score(yVal, yPred))
    f1Scores.append(f1_score(yVal, yPred))
    precisionScores.append(precision_score(yVal, yPred))
    recallScores.append(recall_score(yVal, yPred))

    print(f"Completed Fold {i}")
    print(f"    Accuracy={accuracyScores[i]}    Recall={recallScores[i]}    FScore={f1Scores[i]}")

## Calculate the mean scores across all folds
mean_score = sum(accuracyScores) / len(accuracyScores)
print("Mean accuracy score:", mean_score)

mean_score = sum(f1Scores) / len(f1Scores)
print("Mean f1 score:", mean_score)

mean_score = sum(precisionScores) / len(precisionScores)
print("Mean precision score:", mean_score)

mean_score = sum(recallScores) / len(recallScores)
print("Mean recall score:", mean_score)

In [None]:
# 5-fold stratified cross-validation of the SVM with the tuned
# hyperparameters (svmBestParams from the grid search).
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

accuracyScores = []
f1Scores = []
precisionScores = []
recallScores = []
aucScores = []  # not populated in this cell

for i, (trainIndex, valIndex) in enumerate(cv.split(xTrain, yTrain)):

    ### making training and validation sets
    # cv.split yields positions within xTrain/yTrain, so the validation rows
    # must come from xTrain/yTrain too. Indexing xDF/yDF with these positions
    # selects unrelated rows.
    xLocalTrain, xVal = xTrain.iloc[trainIndex], xTrain.iloc[valIndex]
    yLocalTrain, yVal = yTrain.iloc[trainIndex], yTrain.iloc[valIndex]

    ### feature scaling
    # Fit on this fold's training rows only, then apply to the validation rows.
    # Fold-local names keep the notebook-level xScaler/xTrainScaled intact.
    foldScaler = StandardScaler()
    xLocalTrainScaled = foldScaler.fit_transform(xLocalTrain)
    xValScaled = foldScaler.transform(xVal)

    ### model training
    # svmParams is the search grid (lists of candidates); the estimator needs
    # the single chosen combination in svmBestParams
    clf = SVC(**svmBestParams)
    clf.fit(xLocalTrainScaled, yLocalTrain)

    ### model prediction and evaluation
    yPred = clf.predict(xValScaled)

    # Calculate metrics and store them
    accuracyScores.append(accuracy_score(yVal, yPred))
    f1Scores.append(f1_score(yVal, yPred))
    precisionScores.append(precision_score(yVal, yPred))
    recallScores.append(recall_score(yVal, yPred))

    print(f"Completed Fold {i}")
    print(f"    Accuracy={accuracyScores[i]}    Recall={recallScores[i]}    FScore={f1Scores[i]}")

## Calculate the mean scores across all folds
mean_score = sum(accuracyScores) / len(accuracyScores)
print("Mean accuracy score:", mean_score)

mean_score = sum(f1Scores) / len(f1Scores)
print("Mean f1 score:", mean_score)

mean_score = sum(precisionScores) / len(precisionScores)
print("Mean precision score:", mean_score)

mean_score = sum(recallScores) / len(recallScores)
print("Mean recall score:", mean_score)



In [None]:
# 5-fold stratified cross-validation of the MLP with the tuned
# hyperparameters (mlpBestParams from the grid search).
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

accuracyScores = []
f1Scores = []
precisionScores = []
recallScores = []
aucScores = []  # not populated in this cell

for i, (trainIndex, valIndex) in enumerate(cv.split(xTrain, yTrain)):

    ### making training and validation sets
    # cv.split yields positions within xTrain/yTrain, so the validation rows
    # must come from xTrain/yTrain too. Indexing xDF/yDF with these positions
    # selects unrelated rows.
    xLocalTrain, xVal = xTrain.iloc[trainIndex], xTrain.iloc[valIndex]
    yLocalTrain, yVal = yTrain.iloc[trainIndex], yTrain.iloc[valIndex]

    ### feature scaling
    # Fit on this fold's training rows only, then apply to the validation rows.
    # Fold-local names keep the notebook-level xScaler/xTrainScaled intact.
    foldScaler = StandardScaler()
    xLocalTrainScaled = foldScaler.fit_transform(xLocalTrain)
    xValScaled = foldScaler.transform(xVal)

    ### model training
    # mlpParams is the search grid (lists of candidates); the estimator needs
    # the single chosen combination in mlpBestParams
    clf = MLPClassifier(**mlpBestParams)
    clf.fit(xLocalTrainScaled, yLocalTrain)

    ### model prediction and evaluation
    yPred = clf.predict(xValScaled)

    # Calculate metrics and store them
    accuracyScores.append(accuracy_score(yVal, yPred))
    f1Scores.append(f1_score(yVal, yPred))
    precisionScores.append(precision_score(yVal, yPred))
    recallScores.append(recall_score(yVal, yPred))

    print(f"Completed Fold {i}")
    print(f"    Accuracy={accuracyScores[i]}    Recall={recallScores[i]}    FScore={f1Scores[i]}")

## Calculate the mean scores across all folds
mean_score = sum(accuracyScores) / len(accuracyScores)
print("Mean accuracy score:", mean_score)

mean_score = sum(f1Scores) / len(f1Scores)
print("Mean f1 score:", mean_score)

mean_score = sum(precisionScores) / len(precisionScores)
print("Mean precision score:", mean_score)

mean_score = sum(recallScores) / len(recallScores)
print("Mean recall score:", mean_score)

In [None]:
# 5-fold stratified cross-validation of K-NN with the tuned hyperparameters
# (knnBestParams from the grid search).
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

knn_accuracyScores = []
knn_f1Scores = []
knn_precisionScores = []
knn_recallScores = []

for i, (trainIndex, valIndex) in enumerate(cv.split(xTrain, yTrain)):
    # Split the data (positions from cv.split refer to xTrain/yTrain)
    xLocalTrain, xVal = xTrain.iloc[trainIndex], xTrain.iloc[valIndex]
    yLocalTrain, yVal = yTrain.iloc[trainIndex], yTrain.iloc[valIndex]

    # Feature scaling: fit on this fold's training rows only.
    # Fold-local names keep the notebook-level xScaler/xTrainScaled intact.
    foldScaler = StandardScaler()
    xLocalTrainScaled = foldScaler.fit_transform(xLocalTrain)
    xValScaled = foldScaler.transform(xVal)

    # Instantiate and train the KNN classifier.
    # knnParams is the search grid (lists of candidates); the estimator needs
    # the single chosen combination in knnBestParams.
    knn = KNeighborsClassifier(**knnBestParams)
    knn.fit(xLocalTrainScaled, yLocalTrain)

    # Predict and evaluate
    yPred = knn.predict(xValScaled)
    knn_accuracyScores.append(accuracy_score(yVal, yPred))
    knn_f1Scores.append(f1_score(yVal, yPred))
    knn_precisionScores.append(precision_score(yVal, yPred))
    knn_recallScores.append(recall_score(yVal, yPred))

    print(f"Completed Fold {i}")
    print(f"    Accuracy={knn_accuracyScores[i]}    Recall={knn_recallScores[i]}    FScore={knn_f1Scores[i]}")

# Calculate the mean scores across all folds for KNN
print("KNN Mean accuracy score:", np.mean(knn_accuracyScores))
print("KNN Mean f1 score:", np.mean(knn_f1Scores))
print("KNN Mean precision score:", np.mean(knn_precisionScores))
print("KNN Mean recall score:", np.mean(knn_recallScores))


In [None]:
# 5-fold stratified cross-validation of the Random Forest with the tuned
# hyperparameters (rfBestParams from the grid search).
# The splitter is created here so the cell does not depend on a `cv` object
# left behind by an earlier cell.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

rf_accuracyScores = []
rf_f1Scores = []
rf_precisionScores = []
rf_recallScores = []

for i, (trainIndex, valIndex) in enumerate(cv.split(xTrain, yTrain)):
    # Split the data (positions from cv.split refer to xTrain/yTrain)
    xLocalTrain, xVal = xTrain.iloc[trainIndex], xTrain.iloc[valIndex]
    yLocalTrain, yVal = yTrain.iloc[trainIndex], yTrain.iloc[valIndex]

    # Feature scaling (optional for RF but maintaining consistency).
    # Fold-local names keep the notebook-level xScaler/xTrainScaled intact.
    foldScaler = StandardScaler()
    xLocalTrainScaled = foldScaler.fit_transform(xLocalTrain)
    xValScaled = foldScaler.transform(xVal)

    # Instantiate and train the RF classifier.
    # rfParams is the search grid (lists of candidates); the estimator needs
    # the single chosen combination in rfBestParams.
    rf = RandomForestClassifier(**rfBestParams)
    rf.fit(xLocalTrainScaled, yLocalTrain)

    # Predict and evaluate
    yPred = rf.predict(xValScaled)
    rf_accuracyScores.append(accuracy_score(yVal, yPred))
    rf_f1Scores.append(f1_score(yVal, yPred))
    rf_precisionScores.append(precision_score(yVal, yPred))
    rf_recallScores.append(recall_score(yVal, yPred))

    print(f"Completed Fold {i}")
    print(f"    Accuracy={rf_accuracyScores[i]}    Recall={rf_recallScores[i]}    FScore={rf_f1Scores[i]}")

# Calculate the mean scores across all folds for RF
print("RF Mean accuracy score:", np.mean(rf_accuracyScores))
print("RF Mean f1 score:", np.mean(rf_f1Scores))
print("RF Mean precision score:", np.mean(rf_precisionScores))
print("RF Mean recall score:", np.mean(rf_recallScores))


In [None]:
### feature scaling
# Final scaler for the hold-out evaluation: fitted on the whole training split
# and applied unchanged to the test split.
xScaler = StandardScaler()
# Take the column list from xTrain itself rather than from xLocalTrain, a
# variable that only exists if one of the cross-validation loops has run.
xColNames = xTrain.columns.values.tolist()
# train the scaler and apply it to the training set
xTrainScaled = xScaler.fit_transform(xTrain[xColNames])
# apply the scaling to the testing set
xTestScaled = xScaler.transform(xTest[xColNames])

In [None]:
# Final evaluation: fit each model on the scaled training split and score it
# on the held-out test split.
#
# Each estimator is built from its *BestParams dict (the single combination
# chosen by the grid search). The *Params dicts are search grids whose values
# are lists, and logisticParams was never defined, so the previous constructor
# calls could not run.
#
# NOTE(review): the hyperparameters were tuned on SMOTE-resampled,
# SelectKBest-reduced features, while xTrainScaled here comes from the full,
# un-resampled xTrain. Confirm this mismatch is intended.

def _fitAndScore(model):
    """Fit ``model`` on (xTrainScaled, yTrain) and score it on the test split.

    Returns a tuple ``(accuracy, f1, precision, recall)`` computed from yTest
    and the model's predictions for xTestScaled.
    """
    model.fit(xTrainScaled, yTrain)
    preds = model.predict(xTestScaled)
    return (
        accuracy_score(yTest, preds),
        f1_score(yTest, preds),
        precision_score(yTest, preds),
        recall_score(yTest, preds),
    )

################ 
# Logistic Regression
################
lrAccuracy, lrFScore, lrPrecision, lrRecall = _fitAndScore(LogisticRegression(**logBestParams))

################ 
# Neural Net
################
nnAccuracy, nnFScore, nnPrecision, nnRecall = _fitAndScore(MLPClassifier(**mlpBestParams))

################ 
# Decision Tree
################
treeAccuracy, treeFScore, treePrecision, treeRecall = _fitAndScore(DecisionTreeClassifier(**treeBestParams))

################ 
# Random Forest
################
rfAccuracy, rfFScore, rfPrecision, rfRecall = _fitAndScore(RandomForestClassifier(**rfBestParams))

################ 
# K-NN
################
knnAccuracy, knnFScore, knnPrecision, knnRecall = _fitAndScore(KNeighborsClassifier(**knnBestParams))

################ 
# SVM
################
svmAccuracy, svmFScore, svmPrecision, svmRecall = _fitAndScore(SVC(**svmBestParams))

# One row per model, one column per metric
scoreDict = {"model" : ["Neural Network", "SVM", "K-NN", "Decision Tree", "RF", "LR"],
            "accuracy" : [nnAccuracy, svmAccuracy, knnAccuracy, treeAccuracy, rfAccuracy, lrAccuracy],
            "fScore" : [nnFScore, svmFScore, knnFScore, treeFScore, rfFScore, lrFScore],
            "precision" : [nnPrecision, svmPrecision, knnPrecision, treePrecision, rfPrecision, lrPrecision],
            "recall" : [nnRecall, svmRecall, knnRecall, treeRecall, rfRecall, lrRecall]}

resultsDF = pd.DataFrame.from_dict(scoreDict)

resultsDF

In [None]:
# Presentation of the results table: CSS rules, two-decimal metrics, no index

# CSS applied on top of any existing table styles (overwrite=False)
tableStyles = [
    {'selector': 'th', 'props': [('font-size', '14pt'), ('text-align', 'center'), ('color', 'black')]},
    {'selector': 'td', 'props': [('text-align', 'center'), ('color', 'black')]},
    {'selector': 'tr:nth-of-type(odd)', 'props': [('background', '#f5f5f5')]},
    {'selector': 'tr:nth-of-type(even)', 'props': [('background', 'white')]},
    {'selector': 'tr:hover', 'props': [('background-color', '#ffff99')]},
]

# Same formatter for every metric column
metricFormats = {
    metric: '{:,.2f}'.format
    for metric in ('accuracy', 'fScore', 'precision', 'recall')
}

styled_results = (
    resultsDF.style
    .set_table_styles(tableStyles, overwrite=False)
    .format(metricFormats)
    .hide()
)

styled_results