<a href="https://colab.research.google.com/github/mikakia/Project-in-HealthCare/blob/FirstStep/Breast_Cancer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#Imports

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import  missingno as msno

import seaborn as sns

from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.datasets import make_classification, make_regression
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC, SVC, NuSVC
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.decomposition import PCA

from xgboost import XGBClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import KFold, cross_val_score,cross_val_predict

In [4]:
!pip3 install -U ucimlrepo

Collecting ucimlrepo
  Downloading ucimlrepo-0.0.7-py3-none-any.whl.metadata (5.5 kB)
Downloading ucimlrepo-0.0.7-py3-none-any.whl (8.0 kB)
Installing collected packages: ucimlrepo
Successfully installed ucimlrepo-0.0.7


In [5]:
from ucimlrepo import fetch_ucirepo

# Download dataset id 17 through the ucimlrepo client (needs network access).
breast_cancer_wisconsin_diagnostic = fetch_ucirepo(id=17)

# Feature and target tables exposed by the fetched dataset object.
_wdbc_tables = breast_cancer_wisconsin_diagnostic.data
X, y = _wdbc_tables.features, _wdbc_tables.targets

# Show the dataset-level metadata record.
print(breast_cancer_wisconsin_diagnostic.metadata)

# Per-variable information is also available; uncomment to inspect it.
#print(breast_cancer_wisconsin_diagnostic.variables)


{'uci_id': 17, 'name': 'Breast Cancer Wisconsin (Diagnostic)', 'repository_url': 'https://archive.ics.uci.edu/dataset/17/breast+cancer+wisconsin+diagnostic', 'data_url': 'https://archive.ics.uci.edu/static/public/17/data.csv', 'abstract': 'Diagnostic Wisconsin Breast Cancer Database.', 'area': 'Health and Medicine', 'tasks': ['Classification'], 'characteristics': ['Multivariate'], 'num_instances': 569, 'num_features': 30, 'feature_types': ['Real'], 'demographics': [], 'target_col': ['Diagnosis'], 'index_col': ['ID'], 'has_missing_values': 'no', 'missing_values_symbol': None, 'year_of_dataset_creation': 1993, 'last_updated': 'Fri Nov 03 2023', 'dataset_doi': '10.24432/C5DW2B', 'creators': ['William Wolberg', 'Olvi Mangasarian', 'Nick Street', 'W. Street'], 'intro_paper': {'ID': 230, 'type': 'NATIVE', 'title': 'Nuclear feature extraction for breast tumor diagnosis', 'authors': 'W. Street, W. Wolberg, O. Mangasarian', 'venue': 'Electronic imaging', 'year': 1993, 'journal': None, 'DOI': '1

In [6]:
# Column names for the raw data file: two leading non-feature columns
# followed by the feature names taken from the fetched feature table.
feature_names = ['id', 'diagnosis', *X.columns]
print(feature_names)

['id', 'diagnosis', 'radius1', 'texture1', 'perimeter1', 'area1', 'smoothness1', 'compactness1', 'concavity1', 'concave_points1', 'symmetry1', 'fractal_dimension1', 'radius2', 'texture2', 'perimeter2', 'area2', 'smoothness2', 'compactness2', 'concavity2', 'concave_points2', 'symmetry2', 'fractal_dimension2', 'radius3', 'texture3', 'perimeter3', 'area3', 'smoothness3', 'compactness3', 'concavity3', 'concave_points3', 'symmetry3', 'fractal_dimension3']


#Exploring Dataset

In [9]:
# The raw file is read without a header row, so the column names built
# earlier are supplied explicitly.
url = "https://raw.githubusercontent.com/mikakia/Project-in-HealthCare/main/wdbc.data"
df = pd.read_csv(url, header=None, names=feature_names)  # comma is read_csv's default separator
df.head()

Unnamed: 0,id,diagnosis,radius1,texture1,perimeter1,area1,smoothness1,compactness1,concavity1,concave_points1,...,radius3,texture3,perimeter3,area3,smoothness3,compactness3,concavity3,concave_points3,symmetry3,fractal_dimension3
0,842302,M,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,842517,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,84300903,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,84348301,M,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,84358402,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


In [None]:
# Data type of each column in the loaded frame.
df.dtypes


In [None]:
# Print a concise summary of the loaded frame.
df.info()

In [None]:
# Count of missing values in each column.
df.isna().sum()

#Preprocessing

In [None]:
# Descriptive statistics for the columns of the loaded frame.
df.describe()

###Plots

In [None]:
# Measurement columns only: the id and diagnosis columns are removed.
df_no_id_dg = df.drop(columns=['id', 'diagnosis'])

# One box per feature on a shared axis; tick labels are rotated so they stay readable.
fig, ax = plt.subplots(figsize=(15, 6))
sns.boxplot(data=df_no_id_dg, ax=ax)
plt.xticks(rotation=90)
ax.set_title("Boxplot of WDBC features")
plt.show()


In [None]:
# Scatter of the radius1 column against the area1 column.
ax = sns.scatterplot(data=df, x='radius1', y='area1')
ax.set_title("Radius vs Area")
plt.show()

##Outlier check

In [None]:
# Smallest and largest value observed in the area1 column.
area1_values = df['area1']
print("Min value:", area1_values.min())
print("Max value:", area1_values.max())

In [None]:
# Outlier check on area: average the three area columns within each row
# (one row per patient, per the original note) and report the largest average.
area_columns = ['area1', 'area2', 'area3']
mean_area_per_patient = df[area_columns].mean(axis='columns')  # row-wise mean
print(mean_area_per_patient.max())

In [None]:
# Outlier check on radius: average the three radius columns within each row
# (one row per patient, per the original note) and report the smallest average.
# Names now say "radius"; the copy-pasted "area" names were misleading.
radius_columns = ['radius1', 'radius2', 'radius3']
mean_radius_per_patient = df[radius_columns].mean(axis=1)  # axis=1 means row-wise
print(mean_radius_per_patient.min())


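The cell below checks the largest per-patient mean smoothness against an expected maximum of about 0.2. (The original note read "0.12 > 0.2", which is not a valid inequality. TODO: confirm the observed value.) A mean above the expected maximum could be a measurement error.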

In [None]:
# Outlier check on smoothness: average the three smoothness columns within each
# row (one row per patient, per the original note) and report the largest average.
# Names now say "smoothness"; the copy-pasted "area" names were misleading.
smoothness_columns = ['smoothness1', 'smoothness2', 'smoothness3']
mean_smoothness_per_patient = df[smoothness_columns].mean(axis=1)  # axis=1 means row-wise
print(mean_smoothness_per_patient.max())


### Check the outlier ranges for smoothness1, smoothness2 and smoothness3
Extreme values here are not necessarily errors: they can be normal measurements that indicate a suspicious/malignant cell.

In [None]:
# For each smoothness column, build 1.5 * IQR fences around the quartiles and
# print the rows whose value falls outside those fences.
cols = ['smoothness1', 'smoothness2', 'smoothness3']

for col in cols:
    Q1, Q3 = df[col].quantile(0.25), df[col].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound, upper_bound = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR

    # Rows strictly below the lower fence or strictly above the upper fence.
    below_fence = df[col] < lower_bound
    above_fence = df[col] > upper_bound
    outliers = df[below_fence | above_fence]
    print(outliers[[col]])


## Standardization

In [None]:


# Standardize every feature column and rebuild a frame that keeps the
# id/diagnosis columns in front of the scaled features.
# NOTE(review): the scaler is fitted on all rows here, before the train/test
# split performed later in the notebook, so held-out rows contribute to the
# scaling statistics. Consider fitting on the training split only.
features = df.drop(columns=['id', 'diagnosis'])
scaler = StandardScaler()
scaled_features = scaler.fit_transform(features)

scaled_df_sel_feat = pd.DataFrame(scaled_features, columns=features.columns)
df_scaled = pd.concat([df[['id', 'diagnosis']], scaled_df_sel_feat], axis='columns')

df_scaled.head()



#Training and Test

In [None]:
# Model inputs from the scaled frame: every column except id/diagnosis is a
# feature, and the diagnosis column is the label.
X = df_scaled.drop(columns=['id', 'diagnosis'])
y = df_scaled.loc[:, 'diagnosis']

In [None]:
# Same feature/label selection, taken from the unscaled frame.
X_notscaled = df.drop(columns=['id', 'diagnosis'])
y_notscaled = df.loc[:, 'diagnosis']

In [None]:
# Stratified hold-out split of the scaled data: 20% test, fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [None]:
# Stratified hold-out split of the unscaled data, with the same test size and
# seed as the scaled split.
X_train_ns, X_test_ns, y_train_ns, y_test_ns = train_test_split(
    X_notscaled,
    y_notscaled,
    test_size=0.2,
    stratify=y_notscaled,
    random_state=42,
)

## KNN with k-fold cross-validation
Accuracy: 97%

In [None]:
# Shuffled 14-fold splitter (fixed seed) and the 5-neighbour classifier that
# will be cross-validated with it.
kf = KFold(n_splits=14, shuffle=True, random_state=42)
knn = KNeighborsClassifier(n_neighbors=5)

In [None]:
# Accuracy of the KNN classifier on each cross-validation fold, then the average.
scores = cross_val_score(estimator=knn, X=X, y=y, scoring='accuracy', cv=kf)

print("Accuracy for each fold:", scores)
print("Mean accuracy:", scores.mean())

In [None]:
# Cross-validated predictions for every row, shown next to the true labels.
y_pred = cross_val_predict(knn, X, y, cv=kf)
results = pd.DataFrame({'True': y}).assign(Predicted=y_pred)
print(results.head(20))

### KNN without k-fold cross-validation
Accuracy: 96%

In [None]:
# Fit a 5-neighbour classifier on the scaled training split.
knn_model = KNeighborsClassifier(n_neighbors=5)
knn_model.fit(X=X_train, y=y_train)

In [None]:
# Evaluate the fitted KNN model on the held-out scaled test split.
y_pred_knn = knn_model.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred_knn))
print("Classification Report:\n", classification_report(y_test, y_pred_knn))
# Label typo fixed ("Matric" -> "Matrix").
print("Confusion Matrix\n", confusion_matrix(y_test, y_pred_knn))

##Random Forest
Accuracy: 97%

In [None]:
# Random forest of 30 trees with no depth limit, fitted on the unscaled training split.
rf_model = RandomForestClassifier(
    n_estimators=30,
    max_depth=None,
    random_state=42,
)
rf_model.fit(X=X_train_ns, y=y_train_ns)

In [None]:
# Evaluate the random forest on the held-out unscaled test split.
y_pred_rf = rf_model.predict(X_test_ns)

rf_accuracy = accuracy_score(y_test_ns, y_pred_rf)
rf_report = classification_report(y_test_ns, y_pred_rf)
print("Accuracy:", rf_accuracy)
print("\nClassification Report:\n", rf_report)

## Neural Network
Accuracy: 97%

In [None]:

# Multi-layer perceptron with two hidden layers (30 and 15 units), ReLU
# activation and a fixed seed, fitted on the scaled training split.
nn_model = MLPClassifier(
    hidden_layer_sizes=(30, 15),
    activation='relu',
    max_iter=500,
    random_state=42,
)
nn_model.fit(X=X_train, y=y_train)

In [None]:
# Evaluate the fitted neural network on the held-out scaled test split.
y_pred_nn = nn_model.predict(X_test)

nn_accuracy = accuracy_score(y_test, y_pred_nn)
nn_report = classification_report(y_test, y_pred_nn)
print("Accuracy:", nn_accuracy)
print("Classification Report:\n", nn_report)

##SVM with rbf
Accuracy: 97%

In [None]:
# RBF-kernel support vector classifier with probability estimates enabled,
# fitted on the scaled training split.
svm_model = SVC(
    kernel='rbf',
    C=1.0,
    probability=True,
    random_state=42,
)
svm_model.fit(X=X_train, y=y_train)

In [None]:
# Evaluate the fitted SVM on the held-out scaled test split.
y_pred = svm_model.predict(X_test)
# Accuracy is scored against the SVM's own predictions; the original passed
# y_pred_nn and therefore reported the neural network's accuracy here.
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:")
print(classification_report(y_test, y_pred))