# Supervised Learning on Breast Cancer Dataset using Python

### The objective of this assignment is to evaluate the understanding and application of supervised learning techniques on a real-world dataset.

### 2 = load data

In [7]:
# Import libraries
from sklearn.datasets import load_breast_cancer
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Fetch the bundled breast-cancer dataset and wrap it in pandas objects:
# X holds one column per feature, y holds the integer class label per row.
data = load_breast_cancer()
X = pd.DataFrame(data=data["data"], columns=data["feature_names"])
y = pd.Series(data=data["target"])

# Preview the first five rows of the feature table
X.head(n=5)


Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


### 3 = Explore

In [8]:
# Check size of data: (rows, columns) of the feature table and (rows,) of the label series.
# Both must report the same number of rows.
X.shape, y.shape


((569, 30), (569,))

In [9]:
# See feature names (the column labels that were taken from data.feature_names)
X.columns

Index(['mean radius', 'mean texture', 'mean perimeter', 'mean area',
       'mean smoothness', 'mean compactness', 'mean concavity',
       'mean concave points', 'mean symmetry', 'mean fractal dimension',
       'radius error', 'texture error', 'perimeter error', 'area error',
       'smoothness error', 'compactness error', 'concavity error',
       'concave points error', 'symmetry error', 'fractal dimension error',
       'worst radius', 'worst texture', 'worst perimeter', 'worst area',
       'worst smoothness', 'worst compactness', 'worst concavity',
       'worst concave points', 'worst symmetry', 'worst fractal dimension'],
      dtype='object')

In [10]:
# Understand target labels: human-readable class names shipped with the dataset.
# NOTE(review): presumably position i in this array names class id i in y
# (so 0 = first name, 1 = second name) -- confirm against the scikit-learn docs.
data.target_names

array(['malignant', 'benign'], dtype='<U9')

In [11]:
# Check class balance: number of samples for each class id in y
y.value_counts()

1    357
0    212
Name: count, dtype: int64

In [12]:
# Statistical summary per feature column (count, mean, std, min, quartiles, max);
# useful for spotting how different the feature scales are before scaling.
X.describe()


Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension
count,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,...,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0
mean,14.127292,19.289649,91.969033,654.889104,0.09636,0.104341,0.088799,0.048919,0.181162,0.062798,...,16.26919,25.677223,107.261213,880.583128,0.132369,0.254265,0.272188,0.114606,0.290076,0.083946
std,3.524049,4.301036,24.298981,351.914129,0.014064,0.052813,0.07972,0.038803,0.027414,0.00706,...,4.833242,6.146258,33.602542,569.356993,0.022832,0.157336,0.208624,0.065732,0.061867,0.018061
min,6.981,9.71,43.79,143.5,0.05263,0.01938,0.0,0.0,0.106,0.04996,...,7.93,12.02,50.41,185.2,0.07117,0.02729,0.0,0.0,0.1565,0.05504
25%,11.7,16.17,75.17,420.3,0.08637,0.06492,0.02956,0.02031,0.1619,0.0577,...,13.01,21.08,84.11,515.3,0.1166,0.1472,0.1145,0.06493,0.2504,0.07146
50%,13.37,18.84,86.24,551.1,0.09587,0.09263,0.06154,0.0335,0.1792,0.06154,...,14.97,25.41,97.66,686.5,0.1313,0.2119,0.2267,0.09993,0.2822,0.08004
75%,15.78,21.8,104.1,782.7,0.1053,0.1304,0.1307,0.074,0.1957,0.06612,...,18.79,29.72,125.4,1084.0,0.146,0.3391,0.3829,0.1614,0.3179,0.09208
max,28.11,39.28,188.5,2501.0,0.1634,0.3454,0.4268,0.2012,0.304,0.09744,...,36.04,49.54,251.2,4254.0,0.2226,1.058,1.252,0.291,0.6638,0.2075


### 4 = Preprocessing

In [13]:
# Count missing values in each feature column (all zeros means nothing to impute)
X.isnull().sum()


mean radius                0
mean texture               0
mean perimeter             0
mean area                  0
mean smoothness            0
mean compactness           0
mean concavity             0
mean concave points        0
mean symmetry              0
mean fractal dimension     0
radius error               0
texture error              0
perimeter error            0
area error                 0
smoothness error           0
compactness error          0
concavity error            0
concave points error       0
symmetry error             0
fractal dimension error    0
worst radius               0
worst texture              0
worst perimeter            0
worst area                 0
worst smoothness           0
worst compactness          0
worst concavity            0
worst concave points       0
worst symmetry             0
worst fractal dimension    0
dtype: int64

In [14]:
## To make all columns comparable.
## Feature scaling was applied using StandardScaler to normalize feature values, ensuring fair
## contribution of all features to distance-based and gradient-based classifiers.
## NOTE(review): the scaler is fitted on every row of X, i.e. before the train/test
## split below, so the scaling statistics also include the rows later used for testing.

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)


In [15]:
## The dataset was split into training and testing sets to train the model and evaluate its ability to generalize to new, unseen data.
## The split is performed on the unscaled features so that the scaler can be fitted
## on the training rows only. Fitting it on all rows would let test-set statistics
## (mean / standard deviation) leak into training and make the scores optimistic.

X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Learn the scaling parameters from the training split, then apply that same
# transformation (without refitting) to the held-out test split.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)


## Train model

### 1 = Logistic Regression Classification

In [16]:
##Logistic Regression was imported to perform binary classification of tumors as malignant or benign. Accuracy score, confusion matrix, and classification report were used to evaluate the performance of the model.

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report


In [17]:
# Create model: logistic regression with scikit-learn's default hyperparameters
# (no arguments are overridden here)
logreg = LogisticRegression()

In [18]:
# Train model on the training split only; X_test / y_test stay held out for evaluation.
# The Logistic Regression model was trained using the training dataset, allowing it to learn patterns between tumor features and their corresponding diagnoses.
logreg.fit(X_train, y_train)

In [19]:
# Predict class labels for the held-out test features
y_pred_logreg = logreg.predict(X_test)

In [None]:
# Score the held-out predictions against the true test labels in three ways.

# Fraction of test samples whose predicted label matches the true label
accuracy_logreg = accuracy_score(y_true=y_test, y_pred=y_pred_logreg)

# Counts for every (true label, predicted label) combination
conf_matrix_logreg = confusion_matrix(y_true=y_test, y_pred=y_pred_logreg)

# Text table with per-class precision, recall, f1-score and support
report_logreg = classification_report(y_true=y_test, y_pred=y_pred_logreg)

In [None]:
# Show the test-set accuracy of the Logistic Regression model
print("Logistic Regression Accuracy:", accuracy_logreg)

Logistic Regression Accuracy: 0.9736842105263158


In [25]:
# Show the confusion matrix (scikit-learn convention: rows = true class, columns = predicted class)
print("Confusion Matrix:\n", conf_matrix_logreg)

Confusion Matrix:
 [[41  2]
 [ 1 70]]


In [None]:
# Show the per-class precision / recall / f1-score report
print("Classification Report:\n", report_logreg)

Classification Report:
               precision    recall  f1-score   support

           0       0.98      0.95      0.96        43
           1       0.97      0.99      0.98        71

    accuracy                           0.97       114
   macro avg       0.97      0.97      0.97       114
weighted avg       0.97      0.97      0.97       114



#Logistic Regression was implemented and evaluated using accuracy, confusion matrix, and classification report, achieving an accuracy of 97.37%.


### 2 = Decision Tree

from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Build the seeded Decision Tree and fit it to the training split in one step
# (fit() returns the fitted estimator itself)
dtree = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)

# Label the held-out test samples
y_pred_dtree = dtree.predict(X_test)

# Score the predictions against the true test labels
accuracy_dtree = accuracy_score(y_true=y_test, y_pred=y_pred_dtree)
conf_matrix_dtree = confusion_matrix(y_true=y_test, y_pred=y_pred_dtree)
report_dtree = classification_report(y_true=y_test, y_pred=y_pred_dtree)

# Report the results
print("Decision Tree Accuracy:", accuracy_dtree)
print("Confusion Matrix:\n", conf_matrix_dtree)
print("Classification Report:\n", report_dtree)


In [26]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [27]:
# Create Decision Tree model; random_state=42 pins the seed so re-runs build the same tree
dtree = DecisionTreeClassifier(random_state=42)

In [28]:
# Train the model on the training split only (the test split stays held out)
dtree.fit(X_train, y_train)


In [29]:
# Predict class labels for the held-out test features
y_pred_dtree = dtree.predict(X_test)

In [30]:
# Evaluate the model: apply each metric to the same (true labels, predictions) pair,
# in the order accuracy -> confusion matrix -> classification report.
accuracy_dtree, conf_matrix_dtree, report_dtree = (
    metric(y_test, y_pred_dtree)
    for metric in (accuracy_score, confusion_matrix, classification_report)
)

In [31]:
# Show the test-set accuracy of the Decision Tree model
print("Decision Tree Accuracy:", accuracy_dtree)

Decision Tree Accuracy: 0.9473684210526315


In [32]:
# Show the Decision Tree confusion matrix computed on the test split
print("Confusion Matrix:\n", conf_matrix_dtree)

Confusion Matrix:
 [[40  3]
 [ 3 68]]


In [33]:
# Show the per-class precision / recall / f1-score report for the Decision Tree
print("Classification Report:\n", report_dtree)

Classification Report:
               precision    recall  f1-score   support

           0       0.93      0.93      0.93        43
           1       0.96      0.96      0.96        71

    accuracy                           0.95       114
   macro avg       0.94      0.94      0.94       114
weighted avg       0.95      0.95      0.95       114



In [34]:
## Decision Tree Classifier predicted cancer with an accuracy of about 95% (0.947), and the confusion matrix and classification report show detailed performance metrics for each class.

### 3 = Random Forest Classifier

In [35]:
# Import Random Forest and evaluation tools
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [36]:
# Create Random Forest model; random_state=42 pins the seed so re-runs are repeatable.
# All other hyperparameters are left at their scikit-learn defaults.
rf = RandomForestClassifier(random_state=42)

In [37]:
# Train the model on the training split only (the test split stays held out)
rf.fit(X_train, y_train)


In [38]:
# Predict class labels for the held-out test features
y_pred_rf = rf.predict(X_test)

In [39]:
# Evaluate the model against the true test labels.
# Per-class precision / recall / f1-score as printable text
report_rf = classification_report(y_true=y_test, y_pred=y_pred_rf)
# Counts for every (true label, predicted label) combination
conf_matrix_rf = confusion_matrix(y_true=y_test, y_pred=y_pred_rf)
# Fraction of test samples predicted correctly
accuracy_rf = accuracy_score(y_true=y_test, y_pred=y_pred_rf)

In [40]:
# Show the test-set accuracy of the Random Forest model
print("Random Forest Accuracy:", accuracy_rf)

Random Forest Accuracy: 0.9649122807017544


In [41]:
# Show the Random Forest confusion matrix computed on the test split
print("Confusion Matrix:\n", conf_matrix_rf)

Confusion Matrix:
 [[40  3]
 [ 1 70]]


In [42]:
# Show the per-class precision / recall / f1-score report for the Random Forest
print("Classification Report:\n", report_rf)

Classification Report:
               precision    recall  f1-score   support

           0       0.98      0.93      0.95        43
           1       0.96      0.99      0.97        71

    accuracy                           0.96       114
   macro avg       0.97      0.96      0.96       114
weighted avg       0.97      0.96      0.96       114



In [43]:
#The Random Forest model performs very well, correctly identifying most cancer cases while rarely misclassifying benign cases, achieving an overall accuracy of 96%.

### 4 = Support Vector Machine (SVM)

# SVC is imported here because this block runs before the notebook's
# "Import SVM" cell; without it a fresh kernel raises NameError on SVC.
from sklearn.svm import SVC

# Create SVM model with a linear kernel
svm_model = SVC(kernel='linear', random_state=42)

# Train the model on the training split only
svm_model.fit(X_train, y_train)

# Predict on test data
y_pred_svm = svm_model.predict(X_test)

# Evaluate the model against the true test labels
accuracy_svm = accuracy_score(y_test, y_pred_svm)
conf_matrix_svm = confusion_matrix(y_test, y_pred_svm)
report_svm = classification_report(y_test, y_pred_svm)

# Print results
print("SVM Accuracy:", accuracy_svm)
print("Confusion Matrix:\n", conf_matrix_svm)
print("Classification Report:\n", report_svm)

In [44]:
# Import SVM and evaluation tools
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [45]:
# Create SVM model with a linear kernel (a straight-line / hyperplane decision boundary)
svm_model = SVC(kernel='linear', random_state=42)

In [46]:
# Train the model on the training split only (the test split stays held out)
svm_model.fit(X_train, y_train)

In [47]:
# Predict class labels for the held-out test features
y_pred_svm = svm_model.predict(X_test)

In [48]:
# Evaluate the model: run the three metrics over the same (true labels, predictions)
# pair and unpack them in order: accuracy, confusion matrix, classification report.
accuracy_svm, conf_matrix_svm, report_svm = (
    score_fn(y_test, y_pred_svm)
    for score_fn in (accuracy_score, confusion_matrix, classification_report)
)

In [49]:
# Show the test-set accuracy of the SVM model
print("SVM Accuracy:", accuracy_svm)

SVM Accuracy: 0.956140350877193


In [50]:
# Show the SVM confusion matrix computed on the test split
print("Confusion Matrix:\n", conf_matrix_svm)

Confusion Matrix:
 [[41  2]
 [ 3 68]]


In [53]:
# Show the per-class precision / recall / f1-score report for the SVM
print("Classification Report:\n", report_svm)

Classification Report:
               precision    recall  f1-score   support

           0       0.93      0.95      0.94        43
           1       0.97      0.96      0.96        71

    accuracy                           0.96       114
   macro avg       0.95      0.96      0.95       114
weighted avg       0.96      0.96      0.96       114





**SVM classified breast tumors with 96% accuracy, effectively distinguishing malignant and benign cases.**


In [55]:
## SVM classified breast tumors with 96% accuracy, effectively distinguishing malignant and benign cases.


### 5 = k-Nearest Neighbors (k-NN) Classifier

In [56]:
# Import k-NN and evaluation tools
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [57]:
# Create k-NN model; n_neighbors=5 means each prediction is based on the 5 closest training samples
knn = KNeighborsClassifier(n_neighbors=5)

In [58]:
# Train the model on the training split only (the test split stays held out)
knn.fit(X_train, y_train)

In [59]:
# Predict class labels for the held-out test features
y_pred_knn = knn.predict(X_test)


In [60]:
# Evaluate the model against the true test labels.
# Fraction of test samples predicted correctly
accuracy_knn = accuracy_score(y_true=y_test, y_pred=y_pred_knn)
# Counts for every (true label, predicted label) combination
conf_matrix_knn = confusion_matrix(y_true=y_test, y_pred=y_pred_knn)
# Per-class precision / recall / f1-score as printable text
report_knn = classification_report(y_true=y_test, y_pred=y_pred_knn)

In [61]:
# Show the test-set accuracy of the k-NN model
print("k-NN Accuracy:", accuracy_knn)

k-NN Accuracy: 0.9473684210526315


In [62]:
# Show the k-NN confusion matrix computed on the test split
print("Confusion Matrix:\n", conf_matrix_knn)

Confusion Matrix:
 [[40  3]
 [ 3 68]]


In [63]:
# Show the per-class precision / recall / f1-score report for k-NN
print("Classification Report:\n", report_knn)

Classification Report:
               precision    recall  f1-score   support

           0       0.93      0.93      0.93        43
           1       0.96      0.96      0.96        71

    accuracy                           0.95       114
   macro avg       0.94      0.94      0.94       114
weighted avg       0.95      0.95      0.95       114



In [64]:
## The k-Nearest Neighbors (k-NN) classifier was used to classify breast tumors.
## The model achieved an accuracy of 95%, showing reliable performance in distinguishing malignant and benign cases.

In [None]:
# Algorithm             	Accuracy
# Logistic Regression	    97%
# Decision Tree	            95%
# Random Forest	            96%
# Support Vector Machine	96%
# k-Nearest Neighbors	    95%

## Model Comparison

The performance of five supervised learning algorithms was compared using accuracy on the held-out test set as the evaluation metric. Logistic Regression achieved the highest accuracy of 97%, followed by Random Forest and Support Vector Machine at about 96% each (96.5% and 95.6%, respectively). k-Nearest Neighbors and Decision Tree tied for the lowest accuracy at about 95% (94.7% each).

## Conclusion



In this assessment, supervised learning techniques were applied to the breast cancer dataset to classify tumors as malignant or benign. Among the models tested, Logistic Regression performed the best and was selected as the final model due to its high accuracy and consistent performance.