In [2]:
import numpy as np 
import pandas as pd 

df = pd.read_csv('./Breast_Cancer_dataset.csv')
df

Unnamed: 0,Age,Race,Marital Status,T Stage,N Stage,6th Stage,differentiate,Grade,A Stage,Tumor Size,Estrogen Status,Progesterone Status,Regional Node Examined,Reginol Node Positive,Survival Months,Status
0,68.0,White,Married,T1,N1,IIA,Poorly differentiated,3,Regional,4.0,Positive,Positive,24.0,1,60,Alive
1,50.0,White,,T2,N2,IIIA,Moderately differentiated,2,Regional,35.0,Positive,Positive,14.0,5,62,Alive
2,58.0,White,Divorced,T3,N3,IIIC,Moderately differentiated,2,Regional,63.0,Positive,Positive,14.0,7,75,Alive
3,58.0,White,Married,T1,N1,IIA,Poorly differentiated,3,Regional,,Positive,Positive,2.0,1,84,Alive
4,47.0,White,Married,T2,N1,IIB,Poorly differentiated,3,Regional,41.0,,Positive,3.0,1,50,Alive
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4019,62.0,,Married,T1,N1,IIA,Moderately differentiated,2,Regional,9.0,Positive,Positive,1.0,1,49,Alive
4020,56.0,White,,T2,N2,IIIA,Moderately differentiated,2,Regional,46.0,Positive,Positive,14.0,8,69,Alive
4021,68.0,White,Married,T2,N1,IIB,Moderately differentiated,2,Regional,22.0,Positive,Negative,11.0,3,69,Alive
4022,58.0,Black,Divorced,T2,N1,IIB,Moderately differentiated,2,Regional,44.0,Positive,Positive,11.0,1,72,Alive


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4024 entries, 0 to 4023
Data columns (total 16 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Age                     3823 non-null   float64
 1   Race                    3622 non-null   object 
 2   Marital Status          3703 non-null   object 
 3   T Stage                 4024 non-null   object 
 4   N Stage                 4024 non-null   object 
 5   6th Stage               4024 non-null   object 
 6   differentiate           4024 non-null   object 
 7   Grade                   4024 non-null   object 
 8   A Stage                 4024 non-null   object 
 9   Tumor Size              3622 non-null   float64
 10  Estrogen Status         3823 non-null   object 
 11  Progesterone Status     4024 non-null   object 
 12  Regional Node Examined  3421 non-null   float64
 13  Reginol Node Positive   4024 non-null   int64  
 14  Survival Months         4024 non-null   

In [4]:
numerical_cols = ['Age', 'Tumor Size', 'Regional Node Examined', 
'Regional Node Examined', 'Reginol Node Positive', 'Survival Months']

categorical_cols = ['Race', 'Marital Status', 'T Stage ', 'N Stage', '6th Stage',
'differentiate', 'Grade', 'A Stage', 'Estrogen Status', 'Progesterone Status']

We can see here, in total we have 4024 rows, yet, some of the columns have a lot missing values. For example,Regional Node Examined has only 3421 out of 4024. 

And Let's split the columns into 2 types: numerical and categorical.

### Handle Missing Values

In [5]:
# Check for missing values in each column
missing_values = df.isnull().sum()
print("Number of missing values for each column:")
print(missing_values[missing_values > 0])

Number of missing values for each column:
Age                       201
Race                      402
Marital Status            321
Tumor Size                402
Estrogen Status           201
Regional Node Examined    603
dtype: int64


In [6]:
# Fill numerical missing values with the median (not mean since we haven't handled the outliers)
for col in numerical_cols:
        df[col].fillna(df[col].median(), inplace=True)

# Fill categorical columns with "Unknown"
for col in categorical_cols:
        df[col].fillna('Unknown', inplace=True)

In [7]:
# Recheck for missing values in each column
missing_values2 = df.isnull().sum()
print(missing_values2[missing_values2 > 0])

Series([], dtype: int64)


Now we've filled the missing values with median (for numericals) / "Unknown" (for categoricals).

We can proceede with handling outliers.

### Handle Outliers

In [8]:
# Remove outliers using the interquartile range (IQR)
def remove_outliers(df, column):
    Q1 = df[column].quantile(0.25)
    Q3 = df[column].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    return df[(df[column] >= lower_bound) & (df[column] <= upper_bound)]

# Apply outlier removal to numerical columns
for col in numerical_cols:
    df = remove_outliers(df, col)

### Normalize Numerical Values

In [9]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
df[numerical_cols] = scaler.fit_transform(df[numerical_cols])
df.head()

Unnamed: 0,Age,Race,Marital Status,T Stage,N Stage,6th Stage,differentiate,Grade,A Stage,Tumor Size,Estrogen Status,Progesterone Status,Regional Node Examined,Reginol Node Positive,Survival Months,Status
0,0.974359,White,Married,T1,N1,IIA,Poorly differentiated,3,Regional,0.04918,Positive,Positive,0.821429,0.0,0.520408,Alive
1,0.512821,White,Unknown,T2,N2,IIIA,Moderately differentiated,2,Regional,0.557377,Positive,Positive,0.464286,0.571429,0.540816,Alive
3,0.717949,White,Married,T1,N1,IIA,Poorly differentiated,3,Regional,0.393443,Positive,Positive,0.035714,0.0,0.765306,Alive
4,0.435897,White,Married,T2,N1,IIB,Poorly differentiated,3,Regional,0.655738,Unknown,Positive,0.071429,0.0,0.418367,Alive
5,0.538462,White,Single,T1,N1,IIA,Moderately differentiated,2,Regional,0.311475,Positive,Positive,0.607143,0.142857,0.816327,Alive


In [10]:
df

Unnamed: 0,Age,Race,Marital Status,T Stage,N Stage,6th Stage,differentiate,Grade,A Stage,Tumor Size,Estrogen Status,Progesterone Status,Regional Node Examined,Reginol Node Positive,Survival Months,Status
0,0.974359,White,Married,T1,N1,IIA,Poorly differentiated,3,Regional,0.049180,Positive,Positive,0.821429,0.000000,0.520408,Alive
1,0.512821,White,Unknown,T2,N2,IIIA,Moderately differentiated,2,Regional,0.557377,Positive,Positive,0.464286,0.571429,0.540816,Alive
3,0.717949,White,Married,T1,N1,IIA,Poorly differentiated,3,Regional,0.393443,Positive,Positive,0.035714,0.000000,0.765306,Alive
4,0.435897,White,Married,T2,N1,IIB,Poorly differentiated,3,Regional,0.655738,Unknown,Positive,0.071429,0.000000,0.418367,Alive
5,0.538462,White,Single,T1,N1,IIA,Moderately differentiated,2,Regional,0.311475,Positive,Positive,0.607143,0.142857,0.816327,Alive
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4019,0.820513,Unknown,Married,T1,N1,IIA,Moderately differentiated,2,Regional,0.131148,Positive,Positive,0.000000,0.000000,0.408163,Alive
4020,0.666667,White,Unknown,T2,N2,IIIA,Moderately differentiated,2,Regional,0.737705,Positive,Positive,0.464286,1.000000,0.612245,Alive
4021,0.974359,White,Married,T2,N1,IIB,Moderately differentiated,2,Regional,0.344262,Positive,Negative,0.357143,0.285714,0.612245,Alive
4022,0.717949,Black,Divorced,T2,N1,IIB,Moderately differentiated,2,Regional,0.704918,Positive,Positive,0.357143,0.000000,0.642857,Alive


### Feature Selection/ Ranking (Filters)
Now we've done preprocessing with the data, let's select the best features using mutual information.

In [11]:
from sklearn.feature_selection import mutual_info_classif
from sklearn.preprocessing import LabelEncoder

X = df.drop(columns=['Status'])  
y = df['Status'] # Our target
y = LabelEncoder().fit_transform(y) # Encode label to our target column
for col in categorical_cols:
    X[col] = LabelEncoder().fit_transform(X[col]) # Encode label to each of the categorical feature column

In [12]:
mutual_info_scores = mutual_info_classif(X, y) # Apply mutual information to rank each feature
mutual_info = pd.Series(mutual_info_scores, index=X.columns)
print("Features ranked by mutual information:")
print(mutual_info.sort_values(ascending=False))

Features ranked by mutual information:
Survival Months           0.093882
Grade                     0.015642
A Stage                   0.011100
differentiate             0.010270
Reginol Node Positive     0.008181
T Stage                   0.005899
N Stage                   0.003256
Race                      0.003102
6th Stage                 0.002067
Tumor Size                0.000825
Estrogen Status           0.000220
Age                       0.000000
Marital Status            0.000000
Progesterone Status       0.000000
Regional Node Examined    0.000000
dtype: float64


As we can see from above, the survoval months is the best ranking feature, while other features are less relevant in predicting the status of a patient, e.g. Race, Marital Status. And we selet the top 5 features for model testing.

In [13]:
top_features_mutual_info = mutual_info.sort_values(ascending=False).head(5).index
print("Top features based on mutual information:", top_features_mutual_info)

Top features based on mutual information: Index(['Survival Months', 'Grade', 'A Stage', 'differentiate',
       'Reginol Node Positive'],
      dtype='object')


In [14]:
X_selected = X[list(top_features_mutual_info)]
X_selected

Unnamed: 0,Survival Months,Grade,A Stage,differentiate,Reginol Node Positive
0,0.520408,3,1,1,0.000000
1,0.540816,2,1,0,0.571429
3,0.765306,3,1,1,0.000000
4,0.418367,3,1,1,0.000000
5,0.816327,2,1,0,0.142857
...,...,...,...,...,...
4019,0.408163,2,1,0,0.000000
4020,0.612245,2,1,0,1.000000
4021,0.612245,2,1,0,0.285714
4022,0.642857,2,1,0,0.000000


In [15]:
from sklearn.model_selection import train_test_split
# Split data into 20%-80% train-test sets
X_train, X_test, y_train, y_test = train_test_split(X_selected, y, test_size=0.2, random_state=42)

### Modeling

#### 1. KNN

In [16]:
def knn(X_train, y_train, X_test, k):
    predictions = []
    for _, test_point in X_test.iterrows():
        test_point = test_point.values
        distances = np.sqrt(np.sum((X_train - test_point) ** 2, axis=1))
        # Get the indices of datapoints in X_train which are the k-nearest neighbours of test_point
        k_indices = distances.argsort().loc[:k] 
        k_nearest_labels = y_train[k_indices] # Get neighbours' labels
        
        label_counts = {} # Count number of Alive v.s. Dead
        for label in k_nearest_labels:
            if label in label_counts:
                label_counts[label] += 1
            else:
                label_counts[label] = 1
        
        majority_label = max(label_counts, key=label_counts.get)
        predictions.append(majority_label)
    return np.array(predictions)

predictions = knn(X_train, y_train, X_test, k=5)
accuracy = np.mean(predictions == y_test)
print("The accuracy for using KNN (5NN) is: ",accuracy)

The accuracy for using KNN (5NN) is:  0.9003164556962026


#### 2. Naive Bayes

"Implements the Gaussian Naive Bayes algorithm for classification. The likelihood of the features is assumed to be Gaussian" --- Scikit-Learn

In [17]:
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score

model = GaussianNB()
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("The accuracy for using Naive Bayes is: ",accuracy)

The accuracy for using Naive Bayes is:  0.9066455696202531


#### 3. C4.5 Decision Tree
We split the tree using the entropy criterion (feature that gives us the most information gain).

In [18]:
from sklearn.tree import DecisionTreeClassifier

model = DecisionTreeClassifier(criterion='entropy')
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("The accuracy for using C4.5 Decision Tree is: ",accuracy)

The accuracy for using C4.5 Decision Tree is:  0.9066455696202531


#### 4. Random Forest

In [19]:
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("The accuracy for using Random Forest (100 trees) is: ",accuracy)

The accuracy for using Random Forest (100 trees) is:  0.9034810126582279


#### 5. Gradient Boosting

In [20]:
from sklearn.ensemble import GradientBoostingClassifier

model = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, random_state=42)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("The accuracy for using Gradient Boosting is: ",accuracy)

The accuracy for using Gradient Boosting is:  0.9256329113924051


#### 6. Neural Networks

In [21]:
from sklearn.neural_network import MLPClassifier

model = MLPClassifier(hidden_layer_sizes=(10, 10), max_iter=1000, random_state=42)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("The accuracy for using Neural Network is: ",accuracy)

The accuracy for using Neural Network is:  0.9256329113924051


### Hyperparameter Tuning

#### 1. Neural Network

In [24]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
# Define the param grid and model for grid searching
param_grid = {
    'hidden_layer_sizes': [(10, 10), (20, 20), (30, 30)],
    'batch_size': [16, 32, 64],
    'learning_rate_init': [0.0001, 0.001, 0.01, 0.1]
}
mlp = MLPClassifier(max_iter=1000, random_state=42)

grid_search = GridSearchCV(
    estimator=mlp,
    param_grid=param_grid,
    scoring='accuracy', # accuracy as metrics
    cv=3, # cross validation (3-folds)
    n_jobs=-1 # speed up search
)

# Find the NN that gives the best cross validation accuracy
grid_search.fit(X_train, y_train)
best_model = grid_search.best_estimator_
best_params = grid_search.best_params_
best_score = grid_search.best_score_
print("Best Parameters: ", best_params)
print("Best Cross-Validation Accuracy: ", best_score) 
# Choose the model with the best cross validation accuracy for testing
y_pred = best_model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred)
print("Test Accuracy with Best Parameters: ", test_accuracy)
print("Classification Report on Test Set:\n", classification_report(y_test, y_pred))

Best Parameters:  {'batch_size': 32, 'hidden_layer_sizes': (30, 30), 'learning_rate_init': 0.01}
Best Cross-Validation Accuracy:  0.9057412894011673
Test Accuracy with Best Parameters:  0.9319620253164557
Classification Report on Test Set:
               precision    recall  f1-score   support

           0       0.94      0.99      0.96       569
           1       0.79      0.43      0.56        63

    accuracy                           0.93       632
   macro avg       0.87      0.71      0.76       632
weighted avg       0.93      0.93      0.92       632



#### 2. Gradient Boosting

In [26]:
# Define the parameter grid and model for grid searching
param_grid = {
    'n_estimators': [50, 100, 500, 1000], 
    'learning_rate': [0.0001, 0.001, 0.01, 0.1] 
}
gb = GradientBoostingClassifier(random_state=42)

grid_search_gb = GridSearchCV(
    estimator=gb,
    param_grid=param_grid,
    scoring='accuracy',
    cv=3,
    n_jobs=-1
)

# Find the GB that gives the best cross validation accuracy
grid_search_gb.fit(X_train, y_train)
best_model_gb = grid_search_gb.best_estimator_
best_params_gb = grid_search_gb.best_params_
best_score_gb = grid_search_gb.best_score_
print("Best Parameters:", best_params_gb)
print("Best Cross-Validation Accuracy:", best_score_gb)
# Choose the model with the best cross validation accuracy for testing
y_pred_gb = best_model_gb.predict(X_test)
test_accuracy_gb = accuracy_score(y_test, y_pred_gb)
print("Test Accuracy with Best Parameters:", test_accuracy_gb)
print("Classification Report on Test Set:\n", classification_report(y_test, y_pred_gb))

Best Parameters: {'learning_rate': 0.01, 'n_estimators': 500}
Best Cross-Validation Accuracy: 0.9093051762266954
Test Accuracy with Best Parameters: 0.9272151898734177
Classification Report on Test Set:
               precision    recall  f1-score   support

           0       0.94      0.98      0.96       569
           1       0.72      0.44      0.55        63

    accuracy                           0.93       632
   macro avg       0.83      0.71      0.75       632
weighted avg       0.92      0.93      0.92       632

