In [23]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import sys
import warnings
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.inspection import permutation_importance
from itertools import combinations
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

import random
random.seed(100)
warnings.filterwarnings('ignore')


In [24]:
df = pd.read_csv('Breast_Cancer.csv')
df.head()

Unnamed: 0,Age,Race,Marital Status,T Stage,N Stage,6th Stage,differentiate,Grade,A Stage,Tumor Size,Estrogen Status,Progesterone Status,Regional Node Examined,Reginol Node Positive,Survival Months,Status
0,68,White,Married,T1,N1,IIA,Poorly differentiated,3,Regional,4,Positive,Positive,24,1,60,Alive
1,50,White,Married,T2,N2,IIIA,Moderately differentiated,2,Regional,35,Positive,Positive,14,5,62,Alive
2,58,White,Divorced,T3,N3,IIIC,Moderately differentiated,2,Regional,63,Positive,Positive,14,7,75,Alive
3,58,White,Married,T1,N1,IIA,Poorly differentiated,3,Regional,18,Positive,Positive,2,1,84,Alive
4,47,White,Married,T2,N1,IIB,Poorly differentiated,3,Regional,41,Positive,Positive,3,1,50,Alive


In [25]:
df.rename(columns={'T Stage ': 'T Stage'}, inplace=True)
df.isnull().sum()
df["Grade"].value_counts()
df["Grade"] = df["Grade"].apply(lambda x: int(x.replace(" anaplastic; Grade IV", "4")))

categorical_cols = ['Race', 'Marital Status', 'A Stage', 'T Stage', 'N Stage',
                     '6th Stage', 'differentiate', 'Estrogen Status', 'Progesterone Status']
numerical_cols = df[['Age', 'Tumor Size', 'Regional Node Examined', 'Reginol Node Positive', 'Survival Months', 'Grade']]


In [26]:
def identify_outliers(df, column):
    Q1 = df[column].quantile(0.1)
    Q3 = df[column].quantile(0.9)
    IQR = Q3 - Q1
    lower_limit = Q1 - 1.5 * IQR
    upper_limit = Q3 + 1.5 * IQR
    return df[(df[column] < lower_limit) | (df[column] > upper_limit)]

def remove_outliers(df, column):
    Q1 = df[column].quantile(0.1)
    Q3 = df[column].quantile(0.9)
    IQR = Q3 - Q1
    lower_limit = Q1 - 1.5 * IQR
    upper_limit = Q3 + 1.5 * IQR
    return df[(df[column] >= lower_limit) & (df[column] <= upper_limit)]

for column in numerical_cols.columns:
    outliers = identify_outliers(numerical_cols, column)
    print(f"\nOutliers for {column}:")
    print(outliers[column])
    print(f"Number of outliers in {column}: {len(outliers)}")

for column in numerical_cols.columns:
    numerical_cols = remove_outliers(numerical_cols, column)

onehot_encoder = OneHotEncoder(sparse=False)
encoded_cols = pd.DataFrame(onehot_encoder.fit_transform(df[categorical_cols]))
encoded_cols.columns = onehot_encoder.get_feature_names_out()

df_encoded = pd.concat([numerical_cols.reset_index(drop=True), 
                        encoded_cols.reset_index(drop=True), 
                        df["Status"].reset_index(drop=True)], 
                        axis=1)


Outliers for Age:
Series([], Name: Age, dtype: int64)
Number of outliers in Age: 0

Outliers for Tumor Size:
289     140
740     140
894     133
1007    140
1512    140
3965    140
Name: Tumor Size, dtype: int64
Number of outliers in Tumor Size: 6

Outliers for Regional Node Examined:
941     61
2462    57
3950    60
Name: Regional Node Examined, dtype: int64
Number of outliers in Regional Node Examined: 3

Outliers for Reginol Node Positive:
100     24
219     27
238     26
288     28
482     24
522     28
530     28
535     24
544     29
550     31
574     26
633     46
662     27
838     33
909     29
922     27
989     37
1039    28
1116    24
1120    30
1128    37
1199    27
1246    26
1267    35
1382    25
1411    24
1567    29
1667    29
1711    25
1872    26
2028    29
2031    27
2142    32
2182    41
2287    26
2293    28
2425    28
2568    26
2638    28
2755    26
2928    30
2934    34
3017    34
3265    33
3401    24
3601    24
3646    26
3677    26
3822    26
3840    24
38

In [27]:
scaler = StandardScaler()
df_encoded[['Age', 'Tumor Size', 'Regional Node Examined', 'Reginol Node Positive', 'Survival Months', 'Grade']] = scaler.fit_transform(
    df_encoded[['Age', 'Tumor Size', 'Regional Node Examined', 'Reginol Node Positive', 'Survival Months', 'Grade']])

num_rows_before = df_encoded.shape[0]
df_encoded = df_encoded.dropna()
num_rows_after = df_encoded.shape[0]

print(f"Number of rows before dropping NA: {num_rows_before}")
print(f"Number of rows after dropping NA: {num_rows_after}")
print(f"Number of rows dropped: {num_rows_before - num_rows_after}")


Number of rows before dropping NA: 4024
Number of rows after dropping NA: 3961
Number of rows dropped: 63


In [28]:
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier

# remove Survival Months
X = df_encoded.drop(['Status', 'Survival Months'], axis=1)
y = df_encoded['Status']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=123)

# Random forest with ray

In [29]:
import json
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, confusion_matrix

# Read the JSON file
with open('results/random_forest_ray.json') as f:
    config = json.load(f)

In [30]:
clf = RandomForestClassifier(n_estimators=config['best_config']['n_estimators'],
                             max_depth=config['best_config']['max_depth'],
                             min_samples_split=config['best_config']['min_samples_split'],
                             min_samples_leaf=config['best_config']['min_samples_leaf'])

clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)

# Calculate the F1 score using the 'f1_macro' metric
f1_macro = f1_score(y_test, y_pred, average='macro')
print("F1 Score (Macro):", f1_macro)

confusion_mat = confusion_matrix(y_test, y_pred)
confusion_df = pd.DataFrame(confusion_mat)

print("\nConfusion Matrix:")
print(confusion_df)

F1 Score (Macro): 0.523899344310344

Confusion Matrix:
     0   1
0  656  16
1  111  10


# SVC with ray

In [31]:
from sklearn.svm import SVC

with open('results/svc_ray.json') as f:
    config = json.load(f)

clf = SVC(C=config['best_config']['C'],
          kernel=config['best_config']['kernel'],
          gamma=config['best_config']['gamma'])

clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)

# Calculate the F1 score using the 'f1_macro' metric
f1_macro = f1_score(y_test, y_pred, average='macro')
print("F1 Score (Macro):", f1_macro)

confusion_mat = confusion_matrix(y_test, y_pred)

confusion_df = pd.DataFrame(confusion_mat)

print("\nConfusion Matrix:")
print(confusion_df)

F1 Score (Macro): 0.4716245638886703

Confusion Matrix:
     0  1
0  664  8
1  119  2
