In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import f1_score, precision_score, recall_score, confusion_matrix

# Fetch the raw table; the file has no header row, so columns are named by hand
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/wdbc.data"
df = pd.read_csv(url, header=None)

# Column layout: record id, diagnosis label, then 30 numbered feature columns
columns = ['id', 'diagnosis', *(f'feature_{i}' for i in range(1, 31))]
df.columns = columns

# Target vector ('M' -> 1, 'B' -> 0) and feature matrix (everything except id/label)
y = df['diagnosis'].map({'M': 1, 'B': 0})
x = df.drop(columns=['id', 'diagnosis'])

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42
)

# Learn the scaling statistics from the training rows only, then apply them to both splits
scaler = StandardScaler().fit(x_train)
x_train_scaled = scaler.transform(x_train)
x_test_scaled = scaler.transform(x_test)

# Define the evaluation function
def evaluate_model(model, x_train, x_test, y_train, y_test, labels=(0, 1)):
    """Fit ``model`` on the training split and score it on the test split.

    Parameters
    ----------
    model : object
        Estimator exposing ``fit(X, y)`` and ``predict(X)``. It is fitted
        in place.
    x_train, x_test : array-like
        Feature matrices for the training and test splits.
    y_train, y_test : array-like
        Binary target vectors for the training and test splits.
    labels : tuple, optional
        ``(negative_label, positive_label)``. Defaults to ``(0, 1)``, the
        encoding this notebook uses for the diagnosis column.

    Returns
    -------
    tuple
        ``(f1, precision, recall, fp, tp, fpr, tpr)`` where ``fp``/``tp`` are
        the false/true positive counts and ``fpr``/``tpr`` the matching rates.
        A rate is reported as ``0.0`` when its denominator is zero (no
        negatives, respectively no positives, in ``y_test``).
    """
    neg_label, pos_label = labels

    model.fit(x_train, y_train)
    y_pred = model.predict(x_test)

    f1 = f1_score(y_test, y_pred, pos_label=pos_label)
    precision = precision_score(y_test, y_pred, pos_label=pos_label)
    recall = recall_score(y_test, y_pred, pos_label=pos_label)

    # Pin the label order: without `labels`, a split in which only one class
    # occurs yields a 1x1 matrix and the four-way unpacking below fails.
    cm = confusion_matrix(y_test, y_pred, labels=[neg_label, pos_label])
    tn, fp, fn, tp = cm.ravel()

    # Guard the rates against an empty negative/positive class in y_test.
    negatives = fp + tn
    positives = tp + fn
    fpr = fp / negatives if negatives else 0.0
    tpr = tp / positives if positives else 0.0
    return f1, precision, recall, fp, tp, fpr, tpr

# Baseline: decision tree trained on all standardized features
dt_original = DecisionTreeClassifier(random_state=42)
f1_orig, prec_orig, rec_orig, fp_orig, tp_orig, fpr_orig, tpr_orig = evaluate_model(
    model=dt_original,
    x_train=x_train_scaled,
    x_test=x_test_scaled,
    y_train=y_train,
    y_test=y_test,
)

# PCA1: same tree, but fed only the first principal component.
# The projection is fitted on the training split and reused for the test split.
dt_pca1 = DecisionTreeClassifier(random_state=42)
pca1 = PCA(n_components=1)
x_train_pca1 = pca1.fit_transform(x_train_scaled)
x_test_pca1 = pca1.transform(x_test_scaled)
f1_pca1, prec_pca1, rec_pca1, fp_pca1, tp_pca1, fpr_pca1, tpr_pca1 = evaluate_model(
    model=dt_pca1,
    x_train=x_train_pca1,
    x_test=x_test_pca1,
    y_train=y_train,
    y_test=y_test,
)

# PCA2: same tree, fed the first two principal components
dt_pca2 = DecisionTreeClassifier(random_state=42)
pca2 = PCA(n_components=2)
x_train_pca2 = pca2.fit_transform(x_train_scaled)
x_test_pca2 = pca2.transform(x_test_scaled)
f1_pca2, prec_pca2, rec_pca2, fp_pca2, tp_pca2, fpr_pca2, tpr_pca2 = evaluate_model(
    model=dt_pca2,
    x_train=x_train_pca2,
    x_test=x_test_pca2,
    y_train=y_train,
    y_test=y_test,
)

# Report: one three-line summary per model, each followed by a blank line
print(
    "Original data model:",
    f"F1 Score: {f1_orig:.4f}, Precision: {prec_orig:.4f}, Recall: {rec_orig:.4f}",
    f"FP: {fp_orig}, TP: {tp_orig}, FPR: {fpr_orig:.4f}, TPR: {tpr_orig:.4f}\n",
    sep="\n",
)

print(
    "PCA1 model (only the first principal component):",
    f"F1 Score: {f1_pca1:.4f}, Precision: {prec_pca1:.4f}, Recall: {rec_pca1:.4f}",
    f"FP: {fp_pca1}, TP: {tp_pca1}, FPR: {fpr_pca1:.4f}, TPR: {tpr_pca1:.4f}\n",
    sep="\n",
)

print(
    "PCA2 model (the first two principal components):",
    f"F1 Score: {f1_pca2:.4f}, Precision: {prec_pca2:.4f}, Recall: {rec_pca2:.4f}",
    f"FP: {fp_pca2}, TP: {tp_pca2}, FPR: {fpr_pca2:.4f}, TPR: {tpr_pca2:.4f}\n",
    sep="\n",
)

Original data model:
F1 Score: 0.9302, Precision: 0.9302, Recall: 0.9302
FP: 3, TP: 40, FPR: 0.0423, TPR: 0.9302

PCA1 model (only the first principal component):
F1 Score: 0.8889, Precision: 0.9474, Recall: 0.8372
FP: 2, TP: 36, FPR: 0.0282, TPR: 0.8372

PCA2 model (the first two principal components):
F1 Score: 0.9425, Precision: 0.9318, Recall: 0.9535
FP: 3, TP: 41, FPR: 0.0423, TPR: 0.9535

