In [1]:
#Loading the Dataset:
from sklearn.datasets import load_breast_cancer
from sklearn.preprocessing import StandardScaler
import pandas as pd

# Load scikit-learn's bundled breast-cancer dataset.
# X holds the feature matrix with named columns; y holds the class labels.
data = load_breast_cancer()
X = pd.DataFrame(data=data.data, columns=data.feature_names)
y = pd.Series(data=data.target)


#Handling Missing Values:
# Count null entries per feature column and print the counts.
missing_per_feature = X.isnull().sum()
print(missing_per_feature)


#Feature Scaling:
# Learn each column's mean/std, then standardize every column with them.
# NOTE(review): the scaler is fit on ALL rows of X. If X_scaled is split into
# train/test afterwards, the test rows have already influenced the scaling
# statistics (data leakage); fit on the training rows only for evaluation.
scaler = StandardScaler()
scaler.fit(X)
X_scaled = scaler.transform(X)

#Justification for Preprocessing:

'''Scaling: This ensures that all features contribute equally to the distance metrics used by certain algorithms (like k-NN and SVM).
Without scaling, features with larger ranges can disproportionately influence the model performance.'''


mean radius                0
mean texture               0
mean perimeter             0
mean area                  0
mean smoothness            0
mean compactness           0
mean concavity             0
mean concave points        0
mean symmetry              0
mean fractal dimension     0
radius error               0
texture error              0
perimeter error            0
area error                 0
smoothness error           0
compactness error          0
concavity error            0
concave points error       0
symmetry error             0
fractal dimension error    0
worst radius               0
worst texture              0
worst perimeter            0
worst area                 0
worst smoothness           0
worst compactness          0
worst concavity            0
worst concave points       0
worst symmetry             0
worst fractal dimension    0
dtype: int64


In [None]:

#1. Logistic Regression:
'''Logistic Regression is a linear model used for binary classification.
It predicts the probability of a class by fitting a logistic function to the data.'''

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler

# Split the dataset
# Split the RAW features first. Scaling statistics must come from the training
# rows only; standardizing the full dataset before splitting lets the test rows
# influence the mean/std and leaks information into the evaluation.
# random_state=42 keeps the 80/20 partition reproducible.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit the scaler on the training rows, then apply that same transform to both
# splits. A separate name is used so the all-data `scaler` is left untouched.
train_scaler = StandardScaler().fit(X_train_raw)
X_train = train_scaler.transform(X_train_raw)
X_test = train_scaler.transform(X_test_raw)

# Logistic Regression
# max_iter is raised to 1000 to give the solver more iterations to converge.
log_reg = LogisticRegression(max_iter=1000)
log_reg.fit(X_train, y_train)
y_pred_log_reg = log_reg.predict(X_test)

# Fraction of held-out samples classified correctly.
log_reg_accuracy = accuracy_score(y_test, y_pred_log_reg)



In [None]:
#2. Decision Tree Classifier:
'''Decision Trees split the data into subsets based on feature values. It’s interpretable and can model complex relationships.'''
from sklearn.tree import DecisionTreeClassifier

# Tree construction is randomized (features are permuted at each split, which
# decides ties between equally good splits), so an unseeded tree can give a
# different accuracy on every run. Seeding it makes the result reproducible
# under Restart & Run All; 42 matches the seed used for the train/test split.
dt_classifier = DecisionTreeClassifier(random_state=42)
dt_classifier.fit(X_train, y_train)
y_pred_dt = dt_classifier.predict(X_test)

# Fraction of held-out samples classified correctly.
dt_accuracy = accuracy_score(y_test, y_pred_dt)


In [None]:
#3. Random Forest Classifier:
'''Random Forest builds multiple decision trees and merges their results for improved accuracy and robustness against overfitting.'''

from sklearn.ensemble import RandomForestClassifier

# A random forest bootstraps rows and samples candidate features at random, so
# an unseeded forest reports a different accuracy on every run. Seeding it
# makes the result reproducible under Restart & Run All; 42 matches the seed
# used for the train/test split.
rf_classifier = RandomForestClassifier(random_state=42)
rf_classifier.fit(X_train, y_train)
y_pred_rf = rf_classifier.predict(X_test)

# Fraction of held-out samples classified correctly.
rf_accuracy = accuracy_score(y_test, y_pred_rf)


In [None]:
#4. Support Vector Machine (SVM):
'''SVM finds the hyperplane that best separates classes in high-dimensional space. It’s effective for datasets with clear margins of separation.'''
from sklearn.svm import SVC

# Train an SVC with library-default settings on the training split
# (fit() returns the fitted estimator itself), then label the held-out rows.
svm_classifier = SVC().fit(X_train, y_train)
y_pred_svm = svm_classifier.predict(X_test)

# Share of held-out samples whose predicted label matches the true label.
svm_accuracy = accuracy_score(y_true=y_test, y_pred=y_pred_svm)


In [None]:
#5. k-Nearest Neighbors (k-NN):
'''k-NN classifies based on the majority class among its k nearest neighbors in the feature space. It’s simple and works well for small datasets.'''
from sklearn.neighbors import KNeighborsClassifier

# Train a k-NN classifier with library-default settings on the training split
# (fit() returns the fitted estimator itself), then label the held-out rows.
knn_classifier = KNeighborsClassifier().fit(X_train, y_train)
y_pred_knn = knn_classifier.predict(X_test)

# Share of held-out samples whose predicted label matches the true label.
knn_accuracy = accuracy_score(y_true=y_test, y_pred=y_pred_knn)


In [None]:
# Supervised-Learning

#3. Model Comparison (2 marks)
#1. Compare the performance of the five classification algorithms.----
# General expectations (rules of thumb, NOT results from this notebook):
# Random Forest tends to perform well due to its ensemble nature.
# Logistic Regression and SVM often perform well when their underlying
# assumptions suit the data. Decision Trees may struggle with overfitting, and
# k-NN can be sensitive to the local structure of the data.
# The measured test accuracies below are what actually answer the question.

# Gather the accuracy computed for each model and rank them, highest first.
model_accuracies = pd.Series(
    {
        "Logistic Regression": log_reg_accuracy,
        "Decision Tree": dt_accuracy,
        "Random Forest": rf_accuracy,
        "SVM": svm_accuracy,
        "k-NN": knn_accuracy,
    },
    name="test_accuracy",
).sort_values(ascending=False)
print(model_accuracies.to_string())

#2. Which algorithm performed the best and which one performed the worst? ----
# Derive the answer from the measured values instead of asserting it up front.
# Several models can tie on a small test set, so report every model at the
# top and bottom score rather than a single arbitrary name.
best_accuracy = model_accuracies.max()
worst_accuracy = model_accuracies.min()
best_models = model_accuracies.index[model_accuracies == best_accuracy].tolist()
worst_models = model_accuracies.index[model_accuracies == worst_accuracy].tolist()

print()
print(f"Best:  {', '.join(best_models)} (accuracy = {best_accuracy:.4f})")
print(f"Worst: {', '.join(worst_models)} (accuracy = {worst_accuracy:.4f})")