In [111]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import sys
import warnings
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.inspection import permutation_importance
from itertools import combinations
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

import random
random.seed(100)
warnings.filterwarnings('ignore')


In [112]:
df = pd.read_csv('Breast_Cancer.csv')
df.head(3)

Unnamed: 0,Age,Race,Marital Status,T Stage,N Stage,6th Stage,differentiate,Grade,A Stage,Tumor Size,Estrogen Status,Progesterone Status,Regional Node Examined,Reginol Node Positive,Survival Months,Status
0,68,White,Married,T1,N1,IIA,Poorly differentiated,3,Regional,4,Positive,Positive,24,1,60,Alive
1,50,White,Married,T2,N2,IIIA,Moderately differentiated,2,Regional,35,Positive,Positive,14,5,62,Alive
2,58,White,Divorced,T3,N3,IIIC,Moderately differentiated,2,Regional,63,Positive,Positive,14,7,75,Alive


In [113]:
df.rename(columns={'T Stage ': 'T Stage'}, inplace=True)
df.isnull().sum()
df["Grade"].value_counts()
df["Grade"] = df["Grade"].apply(lambda x: int(x.replace(" anaplastic; Grade IV", "4")))

categorical_cols = ['Race', 'Marital Status', 'A Stage', 'T Stage', 'N Stage',
                     '6th Stage', 'differentiate', 'Estrogen Status', 'Progesterone Status']
numerical_cols = df[['Age', 'Tumor Size', 'Regional Node Examined', 'Reginol Node Positive', 'Survival Months', 'Grade']]


In [114]:
def identify_outliers(df, column):
    Q1 = df[column].quantile(0.1)
    Q3 = df[column].quantile(0.9)
    IQR = Q3 - Q1
    lower_limit = Q1 - 1.5 * IQR
    upper_limit = Q3 + 1.5 * IQR
    return df[(df[column] < lower_limit) | (df[column] > upper_limit)]

# Define a function to remove outliers using IQR method
def remove_outliers(df, column):
    Q1 = df[column].quantile(0.1)
    Q3 = df[column].quantile(0.9)
    IQR = Q3 - Q1
    lower_limit = Q1 - 1.5 * IQR
    upper_limit = Q3 + 1.5 * IQR
    return df[(df[column] >= lower_limit) & (df[column] <= upper_limit)]

# Print the outliers for each numerical column
for column in numerical_cols.columns:
    outliers = identify_outliers(numerical_cols, column)
    print(f"\nOutliers for {column}:")
    print(outliers[column])
    print(f"Number of outliers in {column}: {len(outliers)}")

# Remove the outliers
for column in numerical_cols.columns:
    numerical_cols = remove_outliers(numerical_cols, column)

# Encode the categorical columns
onehot_encoder = OneHotEncoder(sparse=False)
encoded_cols = pd.DataFrame(onehot_encoder.fit_transform(df[categorical_cols]))
encoded_cols.columns = onehot_encoder.get_feature_names_out()

# Concatenate the dataframes
df_encoded = pd.concat([numerical_cols.reset_index(drop=True), 
                        encoded_cols.reset_index(drop=True), 
                        df["Status"].reset_index(drop=True)], 
                        axis=1)


Outliers for Age:
Series([], Name: Age, dtype: int64)
Number of outliers in Age: 0

Outliers for Tumor Size:
289     140
740     140
894     133
1007    140
1512    140
3965    140
Name: Tumor Size, dtype: int64
Number of outliers in Tumor Size: 6

Outliers for Regional Node Examined:
941     61
2462    57
3950    60
Name: Regional Node Examined, dtype: int64
Number of outliers in Regional Node Examined: 3

Outliers for Reginol Node Positive:
100     24
219     27
238     26
288     28
482     24
522     28
530     28
535     24
544     29
550     31
574     26
633     46
662     27
838     33
909     29
922     27
989     37
1039    28
1116    24
1120    30
1128    37
1199    27
1246    26
1267    35
1382    25
1411    24
1567    29
1667    29
1711    25
1872    26
2028    29
2031    27
2142    32
2182    41
2287    26
2293    28
2425    28
2568    26
2638    28
2755    26
2928    30
2934    34
3017    34
3265    33
3401    24
3601    24
3646    26
3677    26
3822    26
3840    24
38

In [115]:
# Use StandardScaler to normalize numerical features
scaler = StandardScaler()
df_encoded[['Age', 'Tumor Size', 'Regional Node Examined', 'Reginol Node Positive', 'Survival Months', 'Grade']] = scaler.fit_transform(
    df_encoded[['Age', 'Tumor Size', 'Regional Node Examined', 'Reginol Node Positive', 'Survival Months', 'Grade']])

In [116]:
# Count number of rows before dropping NA
num_rows_before = df_encoded.shape[0]
df_encoded = df_encoded.dropna()
num_rows_after = df_encoded.shape[0]

print(f"Number of rows before dropping NA: {num_rows_before}")
print(f"Number of rows after dropping NA: {num_rows_after}")
print(f"Number of rows dropped: {num_rows_before - num_rows_after}")


Number of rows before dropping NA: 4024
Number of rows after dropping NA: 3961
Number of rows dropped: 63


In [117]:
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier

# remove Survival Months
X = df_encoded.drop(['Status', 'Survival Months'], axis=1)
y = df_encoded['Status']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=123)

In [118]:
y.value_counts()

Status
Alive    3354
Dead      607
Name: count, dtype: int64

# SVC

In [119]:
from imblearn.over_sampling import SMOTE
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import classification_report, confusion_matrix

# Assume you have X and y from your dataset

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Apply SMOTE to the training set
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# Create an SVC classifier
clf = SVC()

# Define the parameter grid for the grid search
param_grid = {
    'C': [0.1, 1, 10],
    'kernel': ['linear', 'rbf'],
    'gamma': ['scale', 'auto']
}

# Perform grid search with cross-validation
grid_search = GridSearchCV(clf, param_grid, scoring='f1_macro')
grid_search.fit(X_train_resampled, y_train_resampled)

# Get the best classifier from the grid search
best_clf = grid_search.best_estimator_

# Make predictions on the test set
y_pred = best_clf.predict(X_test)

# Generate the classification report
report = classification_report(y_test, y_pred)

# Generate the confusion matrix
confusion_mat = confusion_matrix(y_test, y_pred)

# Print the classification report and confusion matrix
print("Classification Report:")
print(report)
print("\nConfusion Matrix:")
print(confusion_mat)

# Print the best hyperparameters
print("\nBest Hyperparameters:")
print(grid_search.best_params_)


Classification Report:
              precision    recall  f1-score   support

       Alive       0.85      0.85      0.85       672
        Dead       0.17      0.17      0.17       121

    accuracy                           0.75       793
   macro avg       0.51      0.51      0.51       793
weighted avg       0.75      0.75      0.75       793


Confusion Matrix:
[[572 100]
 [100  21]]

Best Hyperparameters:
{'C': 10, 'gamma': 'scale', 'kernel': 'rbf'}


# KNN

In [120]:
from sklearn.neighbors import KNeighborsClassifier

clf = KNeighborsClassifier()

# Define the parameter grid for the grid search
param_grid = {
    'n_neighbors': [3, 5, 7],
    'weights': ['uniform', 'distance'],
    'p': [1, 2]
}

# Perform grid search with cross-validation
grid_search = GridSearchCV(clf, param_grid, scoring='f1_macro')
grid_search.fit(X_train_resampled, y_train_resampled)

# Get the best classifier from the grid search
best_clf = grid_search.best_estimator_

# Make predictions on the test set
y_pred = best_clf.predict(X_test)

# Generate the classification report
report = classification_report(y_test, y_pred)

# Generate the confusion matrix
confusion_mat = confusion_matrix(y_test, y_pred)

# Print the classification report and confusion matrix
print("Classification Report:")
print(report)
print("\nConfusion Matrix:")
print(confusion_mat)

# Print the best hyperparameters
print("\nBest Hyperparameters:")
print(grid_search.best_params_)

Classification Report:
              precision    recall  f1-score   support

       Alive       0.86      0.81      0.83       672
        Dead       0.20      0.26      0.23       121

    accuracy                           0.73       793
   macro avg       0.53      0.54      0.53       793
weighted avg       0.76      0.73      0.74       793


Confusion Matrix:
[[545 127]
 [ 89  32]]

Best Hyperparameters:
{'n_neighbors': 3, 'p': 1, 'weights': 'distance'}


# RF

In [121]:
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier()

# Define the parameter grid for the grid search
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 5, 10],
    'min_samples_split': [2, 4, 8],
    'min_samples_leaf': [1, 2, 4]
}

# Perform grid search with cross-validation
grid_search = GridSearchCV(clf, param_grid, scoring='f1_macro')
grid_search.fit(X_train_resampled, y_train_resampled)

# Get the best classifier from the grid search
best_clf = grid_search.best_estimator_

# Make predictions on the test set
y_pred = best_clf.predict(X_test)

# Generate the classification report
report = classification_report(y_test, y_pred)

# Generate the confusion matrix
confusion_mat = confusion_matrix(y_test, y_pred)

# Print the classification report and confusion matrix
print("Classification Report:")
print(report)
print("\nConfusion Matrix:")
print(confusion_mat)

# Print the best hyperparameters
print("\nBest Hyperparameters:")
print(grid_search.best_params_)

Classification Report:
              precision    recall  f1-score   support

       Alive       0.86      0.93      0.89       672
        Dead       0.30      0.17      0.22       121

    accuracy                           0.81       793
   macro avg       0.58      0.55      0.56       793
weighted avg       0.78      0.81      0.79       793


Confusion Matrix:
[[624  48]
 [100  21]]

Best Hyperparameters:
{'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 300}


# Naive Bayes + Random Over Sampler

In [126]:
from imblearn.over_sampling import RandomOverSampler
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split, GridSearchCV

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Apply Random OverSampler to the training set
ros = RandomOverSampler(random_state=42)
X_train_resampled, y_train_resampled = ros.fit_resample(X_train, y_train)

# Create a Naive Bayes classifier
clf = GaussianNB()

# Define the parameter grid for the grid search
param_grid = {
    'var_smoothing': [1e-9, 1e-8, 1e-7]
}

# Perform grid search with cross-validation
grid_search = GridSearchCV(clf, param_grid, cv=5, scoring='f1_macro')
grid_search.fit(X_train_resampled, y_train_resampled)

# Get the best classifier from the grid search
best_clf = grid_search.best_estimator_

# Make predictions on the test set
y_pred = best_clf.predict(X_test)

# Generate the classification report
report = classification_report(y_test, y_pred)

# Generate the confusion matrix
confusion_mat = confusion_matrix(y_test, y_pred)

# Print the classification report and confusion matrix
print("Classification Report:")
print(report)
print("\nConfusion Matrix:")
print(confusion_mat)

# Print the best hyperparameters
print("\nBest Hyperparameters:")
print(grid_search.best_params_)


Classification Report:
              precision    recall  f1-score   support

       Alive       0.89      0.84      0.87       672
        Dead       0.33      0.45      0.38       121

    accuracy                           0.78       793
   macro avg       0.61      0.64      0.62       793
weighted avg       0.81      0.78      0.79       793


Confusion Matrix:
[[564 108]
 [ 67  54]]

Best Hyperparameters:
{'var_smoothing': 1e-09}


 # Wrap everything with Random Over Sampler

In [127]:
from imblearn.over_sampling import RandomOverSampler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix

def perform_classification(X, y, classifier, param_grid, scoring='f1_macro'):
    # Split the data into training and test sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    # Apply Random OverSampler to the training set
    ros = RandomOverSampler(random_state=42)
    X_train_resampled, y_train_resampled = ros.fit_resample(X_train, y_train)

    # Perform grid search with cross-validation
    grid_search = GridSearchCV(classifier, param_grid, cv=5, scoring=scoring)
    grid_search.fit(X_train_resampled, y_train_resampled)

    # Get the best classifier from the grid search
    best_clf = grid_search.best_estimator_

    # Make predictions on the test set
    y_pred = best_clf.predict(X_test)

    # Generate the classification report
    report = classification_report(y_test, y_pred)

    # Generate the confusion matrix
    confusion_mat = confusion_matrix(y_test, y_pred)

    # Print the classification report and confusion matrix
    print("Classification Report:")
    print(report)
    print("\nConfusion Matrix:")
    print(confusion_mat)

    # Print the best hyperparameters
    print("\nBest Hyperparameters:")
    print(grid_search.best_params_)


In [128]:
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier


# Define the parameter grids for each classifier
svc_param_grid = {
    'C': [0.1, 1, 10],
    'kernel': ['linear', 'rbf'],
    'gamma': ['scale', 'auto']
}

dt_param_grid = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [None, 5, 10],
    'min_samples_split': [2, 4, 8],
    'min_samples_leaf': [1, 2, 4]
}

rf_param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 5, 10],
    'min_samples_split': [2, 4, 8],
    'min_samples_leaf': [1, 2, 4]
}


svc_classifier = SVC()
perform_classification(X, y, svc_classifier, svc_param_grid)

dt_classifier = DecisionTreeClassifier()
perform_classification(X, y, dt_classifier, dt_param_grid)

rf_classifier = RandomForestClassifier()
perform_classification(X, y, rf_classifier, rf_param_grid)


Classification Report:
              precision    recall  f1-score   support

       Alive       0.85      0.81      0.83       672
        Dead       0.17      0.21      0.19       121

    accuracy                           0.72       793
   macro avg       0.51      0.51      0.51       793
weighted avg       0.75      0.72      0.73       793


Confusion Matrix:
[[546 126]
 [ 95  26]]

Best Hyperparameters:
{'C': 10, 'gamma': 'scale', 'kernel': 'rbf'}
Classification Report:
              precision    recall  f1-score   support

       Alive       0.87      0.86      0.87       672
        Dead       0.25      0.26      0.26       121

    accuracy                           0.77       793
   macro avg       0.56      0.56      0.56       793
weighted avg       0.77      0.77      0.77       793


Confusion Matrix:
[[581  91]
 [ 90  31]]

Best Hyperparameters:
{'criterion': 'gini', 'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2}
Classification Report:
              