In [142]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import sys
import warnings
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.inspection import permutation_importance


# Silence every warning category for the rest of the kernel session.
# NOTE(review): this also hides library deprecation notices — consider
# narrowing the filter to specific categories.
warnings.filterwarnings('ignore')


In [143]:
# Read the dataset from a CSV path relative to the working directory and
# preview the first rows.
df = pd.read_csv('Breast_Cancer.csv')
df.head()

Unnamed: 0,Age,Race,Marital Status,T Stage,N Stage,6th Stage,differentiate,Grade,A Stage,Tumor Size,Estrogen Status,Progesterone Status,Regional Node Examined,Reginol Node Positive,Survival Months,Status
0,68,White,Married,T1,N1,IIA,Poorly differentiated,3,Regional,4,Positive,Positive,24,1,60,Alive
1,50,White,Married,T2,N2,IIIA,Moderately differentiated,2,Regional,35,Positive,Positive,14,5,62,Alive
2,58,White,Divorced,T3,N3,IIIC,Moderately differentiated,2,Regional,63,Positive,Positive,14,7,75,Alive
3,58,White,Married,T1,N1,IIA,Poorly differentiated,3,Regional,18,Positive,Positive,2,1,84,Alive
4,47,White,Married,T2,N1,IIB,Poorly differentiated,3,Regional,41,Positive,Positive,3,1,50,Alive


In [144]:
# One CSV header carries a trailing space ('T Stage '); normalise it so later
# cells can use the clean name, then count missing values per column.
df = df.rename(columns={'T Stage ': 'T Stage'})
df.isna().sum()

Age                       0
Race                      0
Marital Status            0
T Stage                   0
N Stage                   0
6th Stage                 0
differentiate             0
Grade                     0
A Stage                   0
Tumor Size                0
Estrogen Status           0
Progesterone Status       0
Regional Node Examined    0
Reginol Node Positive     0
Survival Months           0
Status                    0
dtype: int64

In [145]:
# Tabulate the distinct Grade values to spot labels that are not plain numbers.
df["Grade"].value_counts()

2                        2351
3                        1111
1                         543
 anaplastic; Grade IV      19
Name: Grade, dtype: int64

In [146]:
# Recode the textual grade label to "4" and cast the column to integers.
# Going through `astype(str)` keeps the cell idempotent: re-running it on the
# already-converted integer column is a no-op, whereas the previous per-row
# lambda called `.replace` on ints and raised AttributeError on a second run.
grade_as_text = df["Grade"].astype(str).str.replace(" anaplastic; Grade IV", "4", regex=False)
df["Grade"] = grade_as_text.str.strip().astype("int64")

In [147]:
# Nominal columns to expand into one indicator column per category.
categorical_cols = ['Race', 'Marital Status', 'A Stage', 'T Stage', 'N Stage',
                     '6th Stage', 'differentiate', 'Estrogen Status', 'Progesterone Status']

# The dense-output switch was renamed from `sparse` to `sparse_output` in
# scikit-learn 1.2 and the old keyword was later removed, so neither spelling
# works on every version. Keep the default (sparse) result and densify it here.
onehot_encoder = OneHotEncoder()
encoded_matrix = onehot_encoder.fit_transform(df[categorical_cols])
if hasattr(encoded_matrix, "toarray"):
    encoded_matrix = encoded_matrix.toarray()

# Reuse df's index so a later column-wise concat with df aligns row-for-row
# even if df does not have a default RangeIndex.
encoded_cols = pd.DataFrame(encoded_matrix,
                            columns=onehot_encoder.get_feature_names_out(),
                            index=df.index)


In [148]:
# Gather the numeric predictors, the one-hot indicator columns and the target
# side by side in a single modelling frame.
numerical_cols = df.loc[:, ['Age', 'Tumor Size', 'Regional Node Examined',
                            'Reginol Node Positive', 'Survival Months', 'Grade']]

df_encoded = pd.concat(objs=[numerical_cols, encoded_cols, df["Status"]], axis="columns")

In [149]:
# Standardise the numeric features (zero mean, unit variance) with
# StandardScaler — not min-max normalisation, as the old comment claimed.
# NOTE(review): the scaler is fitted on the full dataset before the train/test
# split, so test-set statistics leak into the features — consider fitting on
# the training split only.
scaled_columns = ['Age', 'Tumor Size', 'Regional Node Examined', 'Reginol Node Positive',
                  'Survival Months', 'Grade']
scaler = StandardScaler()
df_encoded[scaled_columns] = scaler.fit_transform(df_encoded[scaled_columns])

In [150]:
# Display the assembled frame: scaled numeric columns, one-hot indicators and
# the Status target.
df_encoded

Unnamed: 0,Age,Tumor Size,Regional Node Examined,Reginol Node Positive,Survival Months,Grade,Race_Black,Race_Other,Race_White,Marital Status_Divorced,...,6th Stage_IIIC,differentiate_Moderately differentiated,differentiate_Poorly differentiated,differentiate_Undifferentiated,differentiate_Well differentiated,Estrogen Status_Negative,Estrogen Status_Positive,Progesterone Status_Negative,Progesterone Status_Positive,Status
0,1.565253,-1.253661,1.190676,-0.618172,-0.492961,1.331031,0.0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,Alive
1,-0.443222,0.214345,-0.044095,0.164807,-0.405695,-0.235987,0.0,0.0,1.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,Alive
2,0.449434,1.540287,-0.044095,0.556296,0.161530,-0.235987,0.0,0.0,1.0,1.0,...,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,Alive
3,0.449434,-0.590691,-1.525820,-0.618172,0.554224,1.331031,0.0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,Alive
4,-0.777968,0.498475,-1.402343,-0.618172,-0.929288,1.331031,0.0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,Alive
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4019,0.895761,-1.016886,-1.649297,-0.618172,-0.972921,-0.235987,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,Alive
4020,0.226270,0.735251,-0.044095,0.752041,-0.100266,-0.235987,0.0,0.0,1.0,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,Alive
4021,1.565253,-0.401271,-0.414526,-0.226682,-0.100266,-0.235987,0.0,0.0,1.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,Alive
4022,0.449434,0.640541,-0.414526,-0.618172,0.030632,-0.235987,1.0,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,Alive


## KNN model

In [151]:
# Tune k for a k-nearest-neighbours classifier with 5-fold cross-validated
# grid search, then score the refit model on the held-out split.

from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier

# 'Survival Months' is dropped from the features along with the target itself.
X = df_encoded.drop(['Status', 'Survival Months'], axis=1)
y = df_encoded['Status']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=123)


knn = KNeighborsClassifier()
param_grid = {'n_neighbors': np.arange(1, 25)}
knn_cv = GridSearchCV(knn, param_grid, cv=5)
knn_cv.fit(X_train, y_train)

# Only the last expression of a cell is rendered, so the search results must
# be printed explicitly; as bare mid-cell expressions they were never shown.
print("Best parameters:", knn_cv.best_params_)
print("Best cross-validation accuracy:", knn_cv.best_score_)

# test set accuracy
knn_cv.score(X_test, y_test)

0.8397515527950311

In [152]:
# Label the held-out rows with the tuned KNN model and print per-class metrics.
y_pred = knn_cv.predict(X_test)

print(classification_report(y_true=y_test.to_numpy(), y_pred=y_pred))

              precision    recall  f1-score   support

       Alive       0.85      0.98      0.91       678
        Dead       0.45      0.08      0.13       127

    accuracy                           0.84       805
   macro avg       0.65      0.53      0.52       805
weighted avg       0.79      0.84      0.79       805



In [154]:
# confusion matrix for the current test-set predictions
# (rows: true class, columns: predicted class)
confusion_matrix(y_test.to_numpy(), y_pred)

array([[666,  12],
       [117,  10]])

In [156]:
# check importance of each variable
r = permutation_importance(knn_cv, X_test, y_test,
                            n_repeats=30,
                            random_state=0)

# Pad names to the longest feature label: the previous fixed width of 8 is
# shorter than the one-hot column names, which left the table unaligned.
feature_names = X_test.columns
name_width = max((len(str(name)) for name in feature_names), default=0)

# Report, most important first, only the features whose mean importance stays
# above zero after subtracting two standard deviations.
for i in r.importances_mean.argsort()[::-1]:
    if r.importances_mean[i] - 2 * r.importances_std[i] > 0:
        print(f"{str(feature_names[i]):<{name_width}} "
              f"{r.importances_mean[i]:.3f}"
              f" +/- {r.importances_std[i]:.3f}")

Progesterone Status_Positive0.003 +/- 0.001
Progesterone Status_Negative0.003 +/- 0.001
Estrogen Status_Positive0.003 +/- 0.001
Estrogen Status_Negative0.003 +/- 0.001
Marital Status_Widowed0.001 +/- 0.000


## SVM with custom kernel

In [132]:
from numba import njit
import numba as nb
from sklearn import svm
from sklearn.model_selection import GridSearchCV

# Unused gamma grid, kept for reference; the histogram kernel defined in this
# cell takes no gamma parameter.
# gammaList = [0.0001, 0.001, 0.01, 0.1, 1, 10, 100]
# Candidate values for the SVC `C` parameter, searched by GridSearchCV.
CList = [0.001, 0.01, 0.1, 1, 10, 100, 1000]

@njit
def histogramKernel(xs, ys):
    """Return the sum of the element-wise minima of `xs` and `ys`."""
    elementwise_min = np.minimum(xs, ys)
    return elementwise_min.sum()

class GramBuilder:
    """Adapt a pairwise kernel function into a Gram-matrix callable.

    `Kernel` is invoked as ``Kernel(x1, x2)`` with one row from each input and
    must return a scalar that can be stored in a float array.
    """

    def __init__(self, Kernel):
        self._Kernel = Kernel

    def generateMatrixBuilder(self, X1, X2):
        """Return the matrix of kernel values between the rows of X1 and X2.

        Args:
            X1: 2-D array-like, one sample per row.
            X2: 2-D array-like, one sample per row.

        Returns:
            A float ndarray of shape ``(len(X1), len(X2))`` whose ``[i, j]``
            entry is ``Kernel(X1[i], X2[j])``.
        """
        # Coerce to ndarrays so nested lists work and DataFrames are iterated
        # row by row (iterating a DataFrame directly yields column labels).
        X1 = np.asarray(X1)
        X2 = np.asarray(X2)

        gram_matrix = np.zeros((X1.shape[0], X2.shape[0]))
        for i, x1 in enumerate(X1):
            for j, x2 in enumerate(X2):
                gram_matrix[i, j] = self._Kernel(x1, x2)

        return gram_matrix
    

# Plug the Gram-matrix builder for the histogram kernel into an SVC and run a
# cross-validated grid search over C on the training split.
histo_svc = svm.SVC(
    kernel=GramBuilder(histogramKernel).generateMatrixBuilder,
)
histo_clf = GridSearchCV(
    estimator=histo_svc,
    param_grid={'C': CList},
    n_jobs=-1,
    verbose=1,
)
histo_clf.fit(X_train.to_numpy(), y_train.to_numpy())

Fitting 5 folds for each of 7 candidates, totalling 35 fits


In [133]:
# Summarise the grid search: the winning C, then mean (+/- 2*std) of the
# cross-validated score for every candidate.
print("Best parameters set found on development set:")
print(histo_clf.best_params_)
print()
# `mean_test_score` is measured on the held-out cross-validation folds, not on
# the training data, so label it as the development-set score.
print("Grid scores on development set:")
means = histo_clf.cv_results_['mean_test_score']
stds = histo_clf.cv_results_['std_test_score']
for mean, std, params in zip(means, stds, histo_clf.cv_results_['params']):
    print(f"{mean:.3f} (+/-{std * 2:.3f}) for {params!r}")

Best parameters set found on development set:

{'C': 0.001}

Grid scores on development set:

0.848 (+/-0.001) for {'C': 0.001}
0.848 (+/-0.001) for {'C': 0.01}
0.848 (+/-0.001) for {'C': 0.1}
0.843 (+/-0.008) for {'C': 1}
0.843 (+/-0.011) for {'C': 10}
0.840 (+/-0.019) for {'C': 100}
0.841 (+/-0.018) for {'C': 1000}


In [157]:
# Evaluate the tuned histogram-kernel SVM on the held-out split.
y_pred = histo_clf.predict(X_test.to_numpy())
print(
    "Accuracy score on test set: ",
    accuracy_score(y_true=y_test.to_numpy(), y_pred=y_pred),
)

Accuracy score on test set:  0.8422360248447205


In [158]:
# Confusion matrix for the current test-set predictions
# (rows: true class, columns: predicted class).
confusion_matrix(y_test.to_numpy(), y_pred)

array([[678,   0],
       [127,   0]])

In [159]:
# Per-class precision / recall / F1 for the current test-set predictions.
print(classification_report(y_test.to_numpy(), y_pred))

              precision    recall  f1-score   support

       Alive       0.84      1.00      0.91       678
        Dead       0.00      0.00      0.00       127

    accuracy                           0.84       805
   macro avg       0.42      0.50      0.46       805
weighted avg       0.71      0.84      0.77       805



## Random forest based classifiers

In [162]:
# random forest classifier: 5-fold grid search over forest size, tree depth
# and the minimum split size

from sklearn.ensemble import RandomForestClassifier

# Seed the forest so the search selects the same model on every run; an
# unseeded forest makes best_params_ / best_score_ vary between executions.
rf = RandomForestClassifier(random_state=123)
param_grid = {'n_estimators': np.arange(1, 100, 10), 'max_depth': np.arange(3, 25, 3),
               'min_samples_split': np.arange(2, 5)}
rf_cv = GridSearchCV(rf, param_grid, cv=5)
rf_cv.fit(X_train, y_train)
rf_cv.best_params_, rf_cv.best_score_

({'max_depth': 6, 'min_samples_split': 3, 'n_estimators': 41},
 0.8574104305323456)

In [164]:
# Score the tuned random forest on the held-out split.
y_pred = rf_cv.predict(X_test)

print(classification_report(y_true=y_test.to_numpy(), y_pred=y_pred))

# confusion matrix (rendered as the cell result)
confusion_matrix(y_true=y_test.to_numpy(), y_pred=y_pred)


              precision    recall  f1-score   support

       Alive       0.85      0.99      0.92       678
        Dead       0.65      0.10      0.18       127

    accuracy                           0.85       805
   macro avg       0.75      0.55      0.55       805
weighted avg       0.82      0.85      0.80       805



array([[671,   7],
       [114,  13]])