<a href="https://colab.research.google.com/github/marissahalim/ICS635/blob/main/Assignment1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Data Preprocessing

In [None]:
# Load the Breast Cancer dataset using load_breast_cancer from sklearn
import sklearn
import pandas as pd
import sklearn.tree
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn import neighbors
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

# Load the Breast Cancer dataset bundled with scikit-learn.
bc = load_breast_cancer()

# Partition the data into an 80% training set and a 20% test set.
X, y = bc.data, bc.target

# Seed the shuffle so that every cell below trains and scores on the same split
# after a kernel restart; without it each run draws a different test set and
# the metrics reported for the different models are not comparable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=42
)

# Scale the features using StandardScaler for KNN.
# The scaler is fitted on the training set only and then applied to the test
# set, so no test-set statistics are used during fitting.
scaler = StandardScaler()

X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Row labels for the model comparison table; the order must match the order of
# the values stored in each metrics_* list.
metric_scores = ['Accuracy', 'Precision', 'Recall', 'F1']


# K-Nearest Neighbors

In [None]:
# ----- Train KNN -----

# Baseline classifier: 5 nearest neighbours, fitted on the standardized
# training features (fit() returns the estimator itself).
model_KNN = neighbors.KNeighborsClassifier(n_neighbors=5).fit(X_train_scaled, y_train)


# ----- Evaluate KNN -----

# Predict on the standardized test features.
pred_KNN = model_KNN.predict(X_test_scaled)

# Compute all four scores first, then report them.
accuracy_KNN = accuracy_score(y_test, pred_KNN)
precision_KNN = precision_score(y_test, pred_KNN)
recall_KNN = recall_score(y_test, pred_KNN)
f1_KNN = f1_score(y_test, pred_KNN)

# Same order as metric_scores: accuracy, precision, recall, F1.
metrics_KNN = [accuracy_KNN, precision_KNN, recall_KNN, f1_KNN]

print(f"Accuracy of KNN (n=5): {accuracy_KNN:.4f}")
print(f"Precision of KNN (n=5): {precision_KNN:.4f}")
print(f"Recall of KNN (n=5): {recall_KNN:.4f}")
print(f"F1 score of KNN (n=5): {f1_KNN:.4f}")

print("\n")

# The confusion matrix is the cell's last expression, so the notebook
# displays it as the cell output.
print("KNN Confusion matrix:")
confusion_matrix(y_test, pred_KNN)

Accuracy of KNN (n=5): 0.9825
Precision of KNN (n=5): 1.0000
Recall of KNN (n=5): 0.9730
F1 score of KNN (n=5): 0.9863


KNN Confusion matrix:


array([[40,  0],
       [ 2, 72]])

# Decision Tree

In [None]:
# ----- Train Decision Tree -----

# Default hyperparameters. The seed makes the fitted tree identical on every
# run; an unseeded tree may choose differently between equally good splits.
model_DT_def = sklearn.tree.DecisionTreeClassifier(random_state=42)
model_DT_def.fit(X_train, y_train)

# Report the depth the unconstrained tree reached. A bare expression in the
# middle of a cell is evaluated and discarded, so it has to be printed.
print(f"Depth of Decision Tree (default): {model_DT_def.get_depth()}")

# visualize model
# sklearn.tree.plot_tree(model_DT_def, proportion=True)

# ----- Evaluate Decision Tree -----

# Test model (trees are fitted and scored on the unscaled features)
pred_DT_def = model_DT_def.predict(X_test)

# Accuracy
accuracy_DT_def = accuracy_score(y_test, pred_DT_def)
print(f"Accuracy of Decision Tree (default): {accuracy_DT_def:.4f}")

# Precision
precision_DT_def = precision_score(y_test, pred_DT_def)
print(f"Precision of Decision Tree (default): {precision_DT_def:.4f}")

# Recall
recall_DT_def = recall_score(y_test, pred_DT_def)
print(f"Recall of Decision Tree (default): {recall_DT_def:.4f}")

# F1-score
f1_DT_def = f1_score(y_test, pred_DT_def)
print(f"F1 score of Decision Tree (default): {f1_DT_def:.4f}")

# Same order as metric_scores: accuracy, precision, recall, F1.
metrics_DT = [accuracy_DT_def, precision_DT_def, recall_DT_def, f1_DT_def]

print("\n")

# Confusion matrix
cm_DT_def = confusion_matrix(y_test, pred_DT_def)
print("Confusion matrix (default):")
print(cm_DT_def)




Accuracy of Decision Tree (default): 0.9211
Precision of Decision Tree (default): 0.9577
Recall of Decision Tree (default): 0.9189
F1 score of Decision Tree (default): 0.9379


Confusion matrix (default):
[[37  3]
 [ 6 68]]


# Random Forest

In [None]:
# ----- Train Random Forest -----

# Default hyperparameters (100 trees). The forest bootstraps rows and samples
# features at random, so it is seeded to make the reported scores repeatable.
model_RF = RandomForestClassifier(random_state=42)
model_RF.fit(X_train, y_train)


# ----- Evaluate Random Forest -----

# Test model (fitted and scored on the unscaled features)
pred_RF = model_RF.predict(X_test)

# Accuracy
accuracy_RF = accuracy_score(y_test, pred_RF)
print(f"Accuracy of Random Forest (default): {accuracy_RF:.4f}")

# Precision
precision_RF = precision_score(y_test, pred_RF)
print(f"Precision of Random Forest (default): {precision_RF:.4f}")

# Recall
recall_RF = recall_score(y_test, pred_RF)
print(f"Recall of Random Forest (default): {recall_RF:.4f}")

# F1-score
f1_RF = f1_score(y_test, pred_RF)
print(f"F1 score of Random Forest (default): {f1_RF:.4f}")

# Same order as metric_scores: accuracy, precision, recall, F1.
metrics_RF = [accuracy_RF, precision_RF, recall_RF, f1_RF]

print("\n")

# Confusion matrix, labelled like the ones printed for the other models
cm_RF = confusion_matrix(y_test, pred_RF)
print("Confusion matrix (default):")
print(cm_RF)



Accuracy of Random Forest (default): 0.9474
Precision of Random Forest (default): 0.9452
Recall of Random Forest (default): 0.9718
F1 score of Random Forest (default): 0.9583


[[39  4]
 [ 2 69]]


# Evaluation

In [None]:
# Side-by-side comparison of the three classifiers: one row per metric
# (labelled by metric_scores) and one column per model.
comparison_columns = {
    'Metric': metric_scores,
    'KNN': metrics_KNN,
    'Decision Tree': metrics_DT,
    'Random Forest': metrics_RF,
}
models_df = pd.DataFrame(comparison_columns)

print(models_df)

      Metric       KNN  Decision Tree  Random Forest
0   Accuracy  0.973684       0.921053       0.947368
1  Precision  0.986301       0.957746       0.945205
2     Recall  0.972973       0.918919       0.971831
3         F1  0.979592       0.937931       0.958333


# Ablation Study

In [None]:
# ----- KNN -----

# ----- Train KNN -----

# Ablation: n_neighbors = 3.
# The results are stored under *_k3 names so that this cell does not overwrite
# the n_neighbors=5 baseline (model_KNN, pred_KNN, metrics_KNN, ...) that the
# model comparison table is built from; re-running that table after this cell
# would otherwise silently report the ablation scores as the baseline.
model_KNN_k3 = neighbors.KNeighborsClassifier(n_neighbors=3)

model_KNN_k3.fit(X_train_scaled, y_train)


# ----- Evaluate KNN -----

# Test model
pred_KNN_k3 = model_KNN_k3.predict(X_test_scaled)

# Accuracy
accuracy_KNN_k3 = accuracy_score(y_test, pred_KNN_k3)
print(f"Accuracy of KNN (n=3): {accuracy_KNN_k3:.4f}")

# Precision
precision_KNN_k3 = precision_score(y_test, pred_KNN_k3)
print(f"Precision of KNN (n=3): {precision_KNN_k3:.4f}")

# Recall
recall_KNN_k3 = recall_score(y_test, pred_KNN_k3)
print(f"Recall of KNN (n=3): {recall_KNN_k3:.4f}")

# F1-score
f1_KNN_k3 = f1_score(y_test, pred_KNN_k3)
print(f"F1 score of KNN (n=3): {f1_KNN_k3:.4f}")

# Accuracy, precision, recall, F1 -- in that order.
metrics_KNN_k3 = [accuracy_KNN_k3, precision_KNN_k3, recall_KNN_k3, f1_KNN_k3]

print("\n")

# Confusion matrix (last expression, so the notebook displays it)
print("KNN Confusion matrix:")
confusion_matrix(y_test, pred_KNN_k3)

Accuracy of KNN (n=3): 0.9737
Precision of KNN (n=3): 0.9863
Recall of KNN (n=3): 0.9730
F1 score of KNN (n=3): 0.9796


KNN Confusion matrix:


array([[39,  1],
       [ 2, 72]])

In [None]:
# ----- KNN -----

# ----- Train KNN -----

# Ablation: n_neighbors = 1.
# The results are stored under *_k1 names so that this cell does not overwrite
# the n_neighbors=5 baseline (model_KNN, pred_KNN, metrics_KNN, ...) that the
# model comparison table is built from; re-running that table after this cell
# would otherwise silently report the ablation scores as the baseline.
model_KNN_k1 = neighbors.KNeighborsClassifier(n_neighbors=1)

model_KNN_k1.fit(X_train_scaled, y_train)


# ----- Evaluate KNN -----

# Test model
pred_KNN_k1 = model_KNN_k1.predict(X_test_scaled)

# Accuracy
accuracy_KNN_k1 = accuracy_score(y_test, pred_KNN_k1)
print(f"Accuracy of KNN (n=1): {accuracy_KNN_k1:.4f}")

# Precision
precision_KNN_k1 = precision_score(y_test, pred_KNN_k1)
print(f"Precision of KNN (n=1): {precision_KNN_k1:.4f}")

# Recall
recall_KNN_k1 = recall_score(y_test, pred_KNN_k1)
print(f"Recall of KNN (n=1): {recall_KNN_k1:.4f}")

# F1-score
f1_KNN_k1 = f1_score(y_test, pred_KNN_k1)
print(f"F1 score of KNN (n=1): {f1_KNN_k1:.4f}")

# Accuracy, precision, recall, F1 -- in that order.
metrics_KNN_k1 = [accuracy_KNN_k1, precision_KNN_k1, recall_KNN_k1, f1_KNN_k1]

print("\n")

# Confusion matrix (last expression, so the notebook displays it)
print("KNN Confusion matrix:")
confusion_matrix(y_test, pred_KNN_k1)

Accuracy of KNN (n=1): 0.9474
Precision of KNN (n=1): 0.9722
Recall of KNN (n=1): 0.9459
F1 score of KNN (n=1): 0.9589


KNN Confusion matrix:


array([[38,  2],
       [ 4, 70]])

In [None]:
# ----- Decision Trees -----

# Ablation: cap the tree at max_depth=5. The tree is seeded so that repeated
# runs produce the same fitted tree and the same scores.
model_DT_maxD = sklearn.tree.DecisionTreeClassifier(max_depth=5, random_state=42)
model_DT_maxD.fit(X_train, y_train)

# visualize model
# sklearn.tree.plot_tree(model_DT_maxD, proportion=True)

# Test model (fitted and scored on the unscaled features)
pred_DT_maxD = model_DT_maxD.predict(X_test)

# Accuracy
accuracy_DT_maxD = accuracy_score(y_test, pred_DT_maxD)
print(f"Accuracy of Decision Tree (max depth): {accuracy_DT_maxD:.4f}")

# Precision
precision_DT_maxD = precision_score(y_test, pred_DT_maxD)
print(f"Precision of Decision Tree (max depth): {precision_DT_maxD:.4f}")

# Recall -- must use recall_score; calling precision_score here reported the
# precision value a second time under the "Recall" label.
recall_DT_maxD = recall_score(y_test, pred_DT_maxD)
print(f"Recall of Decision Tree (max depth): {recall_DT_maxD:.4f}")

# F1-score
f1_DT_maxD = f1_score(y_test, pred_DT_maxD)
print(f"F1 score of Decision Tree (max depth): {f1_DT_maxD:.4f}")

# Confusion matrix
cm_DT_maxD = confusion_matrix(y_test, pred_DT_maxD)
print("Confusion matrix (max depth):")
print(cm_DT_maxD)

Accuracy of Decision Tree (max depth): 0.9211
Precision of Decision Tree (max depth): 0.9189
Recall of Decision Tree (max depth): 0.9189
F1 score of Decision Tree (max depth): 0.9379
Confusion matrix (max depth):
[[37  6]
 [ 3 68]]


In [None]:
# ----- Random Forests -----

# Ablation model for the random forest, seeded so that the fit is repeatable.
# NOTE(review): n_estimators=100, max_depth=None and min_samples_split=2 appear
# to be the library defaults, so this configuration matches the default forest;
# change at least one of them for the ablation to vary anything -- TODO confirm
# the intended values.
model_RF_maxD = RandomForestClassifier(n_estimators=100,
                                       max_depth=None,
                                       min_samples_split=2,
                                       random_state=42)
model_RF_maxD.fit(X_train, y_train)

