<a href="https://colab.research.google.com/github/marissahalim/ICS635/blob/main/Assignment1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Data Preprocessing

In [207]:
# Load the Breast Cancer dataset using load_breast_cancer from sklearn
import sklearn
import pandas as pd
import sklearn.tree
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn import neighbors
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

bc = load_breast_cancer()

# Partition the data into an 80% training set and a 20% test set.
X, y = bc.data, bc.target

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20)

# Scale the features using StandardScaler for KNN
scaler = StandardScaler()

X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# array for metrics
metric_scores = ['Accuracy', 'Precision', 'Recall', 'F1']


# K-Nearest Neighbors

In [208]:
# ----- Train KNN -----

# start with n_neighbors = 5
model_KNN = neighbors.KNeighborsClassifier(n_neighbors=5)

model_KNN.fit(X_train_scaled, y_train)


# ----- Evaluate KNN -----

# Test model
pred_KNN = model_KNN.predict(X_test_scaled)

# Accuracy
accuracy_KNN = accuracy_score(y_test, pred_KNN)
print(f"Accuracy of KNN (n=5): {accuracy_KNN:.4f}")

# Precision
precision_KNN = precision_score(y_test, pred_KNN)
print(f"Precision of KNN (n=5): {precision_KNN:.4f}")

# Recall
recall_KNN = recall_score(y_test, pred_KNN)
print(f"Recall of KNN (n=5): {recall_KNN:.4f}")

# F1-score
f1_KNN = f1_score(y_test, pred_KNN)
print(f"F1 score of KNN (n=5): {f1_KNN:.4f}")

metrics_KNN = [accuracy_KNN, precision_KNN, recall_KNN, f1_KNN]

print("\n")

# Confusion matrix
print("KNN Confusion matrix:")
confusion_matrix(y_test, pred_KNN)

Accuracy of KNN (n=5): 0.9386
Precision of KNN (n=5): 0.9067
Recall of KNN (n=5): 1.0000
F1 score of KNN (n=5): 0.9510


KNN Confusion matrix:


array([[39,  7],
       [ 0, 68]])

# Decision Tree

In [209]:
# ----- Train Decision Tree -----

# default settings
model_DT_def = sklearn.tree.DecisionTreeClassifier()
model_DT_def.fit(X_train, y_train)

model_DT_def.get_depth()

# visualize model
# sklearn.tree.plot_tree(model_DT_def, proportion=True)

# ----- Evaluate Decision Tree -----

# Test model
pred_DT_def = model_DT_def.predict(X_test)

# Accuracy
accuracy_DT_def = accuracy_score(y_test, pred_DT_def)
print(f"Accuracy of Decision Tree (default): {accuracy_DT_def:.4f}")

# Precision
precision_DT_def = precision_score(y_test, pred_DT_def)
print(f"Precision of Decision Tree (default): {precision_DT_def:.4f}")

# Recall
recall_DT_def = recall_score(y_test, pred_DT_def)
print(f"Recall of Decision Tree (default): {recall_DT_def:.4f}")

# F1-score
f1_DT_def = f1_score(y_test, pred_DT_def)
print(f"F1 score of Decision Tree (default): {f1_DT_def:.4f}")

metrics_DT = [accuracy_DT_def, precision_DT_def, recall_DT_def, f1_DT_def]

print("\n")

# Confusion matrix
cm_DT_def = confusion_matrix(y_test, pred_DT_def)
print("Decision Tree Confusion matrix:")
print(cm_DT_def)




Accuracy of Decision Tree (default): 0.9298
Precision of Decision Tree (default): 0.9286
Recall of Decision Tree (default): 0.9559
F1 score of Decision Tree (default): 0.9420


Decision Tree Confusion matrix:
[[41  5]
 [ 3 65]]


# Random Forest

In [210]:
# ----- Train Random Forest -----

# 100 trees
model_RF = RandomForestClassifier()
model_RF.fit(X_train, y_train)


# ----- Evaluate Random Forest -----

# Test model
pred_RF = model_RF.predict(X_test)

# Accuracy
accuracy_RF = accuracy_score(y_test, pred_RF)
print(f"Accuracy of Random Forest (default): {accuracy_RF:.4f}")

# Precision
precision_RF = precision_score(y_test, pred_RF)
print(f"Precision of Random Forest (default): {precision_RF:.4f}")


# Recall
recall_RF = recall_score(y_test, pred_RF)
print(f"Recall of Random Forest (default): {recall_RF:.4f}")


# F1-score
f1_RF = f1_score(y_test, pred_RF)
print(f"F1 score of Random Forest (default): {f1_RF:.4f}")

metrics_RF = [accuracy_RF, precision_RF, recall_RF, f1_RF]

print("\n")

# Confusion matrix
print("Random Forest Confusion matrix:")
cm_RF = confusion_matrix(y_test, pred_RF)
print(cm_RF)



Accuracy of Random Forest (default): 0.9649
Precision of Random Forest (default): 0.9444
Recall of Random Forest (default): 1.0000
F1 score of Random Forest (default): 0.9714


Random Forest Confusion matrix:
[[42  4]
 [ 0 68]]


# Evaluation

In [211]:
models_df = pd.DataFrame({
    'Metric': metric_scores,
    'KNN': metrics_KNN,
    'Decision Tree': metrics_DT,
    'Random Forest': metrics_RF
})

print(models_df)

      Metric       KNN  Decision Tree  Random Forest
0   Accuracy  0.938596       0.929825       0.964912
1  Precision  0.906667       0.928571       0.944444
2     Recall  1.000000       0.955882       1.000000
3         F1  0.951049       0.942029       0.971429


# Ablation Study

In [191]:
# ----- KNN -----

# ----- Train KNN -----

# start with n_neighbors = 3
model_KNN = neighbors.KNeighborsClassifier(n_neighbors=3)

model_KNN.fit(X_train_scaled, y_train)


# ----- Evaluate KNN -----

# Test model
pred_KNN = model_KNN.predict(X_test_scaled)

# Accuracy
accuracy_KNN = accuracy_score(y_test, pred_KNN)
print(f"Accuracy of KNN (n=3): {accuracy_KNN:.4f}")

# Precision
precision_KNN = precision_score(y_test, pred_KNN)
print(f"Precision of KNN (n=3): {precision_KNN:.4f}")

# Recall
recall_KNN = recall_score(y_test, pred_KNN)
print(f"Recall of KNN (n=3): {recall_KNN:.4f}")

# F1-score
f1_KNN = f1_score(y_test, pred_KNN)
print(f"F1 score of KNN (n=3): {f1_KNN:.4f}")

metrics_KNN_3 = [accuracy_KNN, precision_KNN, recall_KNN, f1_KNN]

print("\n")

# Confusion matrix
print("KNN Confusion matrix:")
confusion_matrix(y_test, pred_KNN)

Accuracy of KNN (n=3): 0.9825
Precision of KNN (n=3): 0.9861
Recall of KNN (n=3): 0.9861
F1 score of KNN (n=3): 0.9861


KNN Confusion matrix:


array([[41,  1],
       [ 1, 71]])

In [192]:
# ----- KNN -----

# ----- Train KNN -----

# start with n_neighbors = 1
model_KNN = neighbors.KNeighborsClassifier(n_neighbors=1)

model_KNN.fit(X_train_scaled, y_train)


# ----- Evaluate KNN -----

# Test model
pred_KNN = model_KNN.predict(X_test_scaled)

# Accuracy
accuracy_KNN = accuracy_score(y_test, pred_KNN)
print(f"Accuracy of KNN (n=1): {accuracy_KNN:.4f}")

# Precision
precision_KNN = precision_score(y_test, pred_KNN)
print(f"Precision of KNN (n=1): {precision_KNN:.4f}")

# Recall
recall_KNN = recall_score(y_test, pred_KNN)
print(f"Recall of KNN (n=1): {recall_KNN:.4f}")

# F1-score
f1_KNN = f1_score(y_test, pred_KNN)
print(f"F1 score of KNN (n=1): {f1_KNN:.4f}")

metrics_KNN_1 = [accuracy_KNN, precision_KNN, recall_KNN, f1_KNN]

print("\n")

# Confusion matrix
print("KNN Confusion matrix:")
confusion_matrix(y_test, pred_KNN)

Accuracy of KNN (n=1): 0.9298
Precision of KNN (n=1): 0.9706
Recall of KNN (n=1): 0.9167
F1 score of KNN (n=1): 0.9429


KNN Confusion matrix:


array([[40,  2],
       [ 6, 66]])

In [193]:
models_KNN = pd.DataFrame({
    'Metric': metric_scores,
    'KNN (n=5)': metrics_KNN,
    'KNN (n=3)': metrics_KNN_3,
    'KNN (n=1)': metrics_KNN_1
})

print(models_KNN)

      Metric  KNN (n=5)  KNN (n=3)  KNN (n=1)
0   Accuracy   0.982456   0.982456   0.929825
1  Precision   0.986111   0.986111   0.970588
2     Recall   0.986111   0.986111   0.916667
3         F1   0.986111   0.986111   0.942857


In [194]:
# ----- Decision Trees -----

# max_depth
model_DT_maxD = sklearn.tree.DecisionTreeClassifier(max_depth=5)
model_DT_maxD.fit(X_train, y_train)

# visualize model
# sklearn.tree.plot_tree(model_DT_maxD, proportion=True)

pred_DT_maxD = model_DT_maxD.predict(X_test)

accuracy_DT_maxD = accuracy_score(y_test, pred_DT_maxD)
print(f"Accuracy of Decision Tree (max depth): {accuracy_DT_maxD:.4f}")

precision_DT_maxD = precision_score(y_test, pred_DT_maxD)
print(f"Precision of Decision Tree (max depth): {precision_DT_maxD:.4f}")

recall_DT_maxD = precision_score(y_test, pred_DT_maxD)
print(f"Recall of Decision Tree (max depth): {recall_DT_maxD:.4f}")

f1_DT_maxD = f1_score(y_test, pred_DT_maxD)
print(f"F1 score of Decision Tree (max depth): {f1_DT_maxD:.4f}")

cm_DT_maxD = confusion_matrix(y_test, pred_DT_maxD)
print("Confusion matrix (max depth):")
print(cm_DT_maxD)

Accuracy of Decision Tree (max depth): 0.9649
Precision of Decision Tree (max depth): 0.9722
Recall of Decision Tree (max depth): 0.9722
F1 score of Decision Tree (max depth): 0.9722
Confusion matrix (max depth):
[[40  2]
 [ 2 70]]


In [195]:
# ----- Random Forests -----

# test an increase in the min_sample_split
model_RF_mss = RandomForestClassifier(min_samples_split=4)
model_RF_mss.fit(X_train, y_train)
pred_RF_mss = model_RF_mss.predict(X_test)

accuracy_RF_mss = accuracy_score(y_test, pred_RF_mss)
print(f"Accuracy of Decision Tree (mss = 4): {accuracy_RF_mss:.4f}")

precision_RF_mss = precision_score(y_test, pred_RF_mss)
print(f"Precision of Decision Tree (max depth): {precision_RF_mss:.4f}")

recall_RF_mss = precision_score(y_test, pred_RF_mss)
print(f"Recall of Decision Tree (max depth): {recall_RF_mss:.4f}")

f1_RF_mss = f1_score(y_test, pred_RF_mss)
print(f"F1 score of Decision Tree (max depth): {f1_RF_mss:.4f}")


model_RF_increase_mss = [accuracy_RF_mss, precision_RF_mss, recall_RF_mss, f1_RF_mss]


# test an increase in the tree estimators
model_RF_est = RandomForestClassifier(n_estimators=200)
model_RF_est.fit(X_train, y_train)
pred_RF_est = model_RF_est.predict(X_test)

accuracy_RF_est = accuracy_score(y_test, pred_RF_est)
print(f"Accuracy of Decision Tree (mss = 4): {accuracy_RF_est:.4f}")

precision_RF_est = precision_score(y_test, pred_RF_est)
print(f"Precision of Decision Tree (max depth): {precision_RF_est:.4f}")

recall_RF_est = precision_score(y_test, pred_RF_est)
print(f"Recall of Decision Tree (max depth): {recall_RF_est:.4f}")

f1_RF_est = f1_score(y_test, pred_RF_est)
print(f"F1 score of Decision Tree (max depth): {f1_RF_est:.4f}")

model_RF_increase_est = [accuracy_RF_est, precision_RF_est, recall_RF_est, f1_RF_est]


Accuracy of Decision Tree (mss = 4): 0.9649
Precision of Decision Tree (max depth): 0.9595
Recall of Decision Tree (max depth): 0.9595
F1 score of Decision Tree (max depth): 0.9726
Accuracy of Decision Tree (mss = 4): 0.9737
Precision of Decision Tree (max depth): 0.9726
Recall of Decision Tree (max depth): 0.9726
F1 score of Decision Tree (max depth): 0.9793


In [196]:
models_RF = pd.DataFrame({
    'Metric': metric_scores,
    'Random Forest (default)': metrics_RF,
    'Random Forest (min_samples_split = 4)': model_RF_increase_mss,
    'Random Forest (n_estimators = 200)': model_RF_increase_est
})

print(models_RF)

      Metric  Random Forest (default)  Random Forest (min_samples_split = 4)  \
0   Accuracy                 0.982456                               0.964912   
1  Precision                 0.986111                               0.959459   
2     Recall                 0.986111                               0.959459   
3         F1                 0.986111                               0.972603   

   Random Forest (n_estimators = 200)  
0                            0.973684  
1                            0.972603  
2                            0.972603  
3                            0.979310  
