In [27]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score

# Load the train/test CSV files for experiment A and experiment B.
a_train_data, a_test_data = (
    pd.read_csv(csv_path)
    for csv_path in ('實驗A/train_data.csv', '實驗A/test_data.csv')
)
b_train_data, b_test_data = (
    pd.read_csv(csv_path)
    for csv_path in ('實驗B/train_data.csv', '實驗B/test_data.csv')
)

# Sanity check: show any training rows of experiment A whose BMI is exactly 0.
zero_bmi_mask = a_train_data["BMI"] == 0
print(a_train_data[zero_bmi_mask])

# Split every frame into features (all columns but the last) and labels
# (the last column). Assumes the last column holds the label.
X_train_a, y_train_a = a_train_data.iloc[:, :-1], a_train_data.iloc[:, -1]
X_test_a, y_test_a = a_test_data.iloc[:, :-1], a_test_data.iloc[:, -1]
X_train_b, y_train_b = b_train_data.iloc[:, :-1], b_train_data.iloc[:, -1]
X_test_b, y_test_b = b_test_data.iloc[:, :-1], b_test_data.iloc[:, -1]

# Standardize the features. Each scaler is fitted on its own training split
# only and then reused to transform the matching test split.
scaler_a = StandardScaler().fit(X_train_a)
X_train_a_scaled = scaler_a.transform(X_train_a)
X_test_a_scaled = scaler_a.transform(X_test_a)

scaler_b = StandardScaler().fit(X_train_b)
X_train_b_scaled = scaler_b.transform(X_train_b)
X_test_b_scaled = scaler_b.transform(X_test_b)

# Apply PCA, keeping as many components as needed to explain 95% of the
# variance. Each PCA is fitted on the scaled training split and reused for
# the matching test split.
variance_to_keep = 0.95

# Experiment A
pca_a = PCA(n_components=variance_to_keep)
X_train_a_pca = pca_a.fit_transform(X_train_a_scaled)
X_test_a_pca = pca_a.transform(X_test_a_scaled)

# Experiment B
pca_b = PCA(n_components=variance_to_keep)
X_train_b_pca = pca_b.fit_transform(X_train_b_scaled)
X_test_b_pca = pca_b.transform(X_test_b_scaled)

# K-means clustering (2 clusters) on the PCA-reduced training features.
kmeans_a = KMeans(n_clusters=2, random_state=42)
kmeans_a.fit(X_train_a_pca)

kmeans_b = KMeans(n_clusters=2, random_state=42)
kmeans_b.fit(X_train_b_pca)


def _align_clusters_to_labels(test_clusters, train_clusters, train_labels):
    """Orient 2-cluster K-means ids so they line up with the class labels.

    K-means numbers its clusters arbitrarily, so cluster id 1 is not
    guaranteed to correspond to class 1. Instead of hard-coding an inversion
    for one experiment, the orientation is decided from the training split:
    if the training cluster ids agree with the training labels on fewer than
    half of the samples, the ids are inverted.

    Assumes binary labels encoded as 0/1 and cluster ids in {0, 1}
    (n_clusters=2) -- TODO confirm for the label column in use.

    Args:
        test_clusters: cluster ids predicted for the test samples.
        train_clusters: cluster ids assigned to the training samples.
        train_labels: ground-truth labels of the training samples.

    Returns:
        A NumPy array of 0/1 predictions for the test samples.
    """
    test_clusters = np.asarray(test_clusters)
    agreement = np.mean(np.asarray(train_clusters) == np.asarray(train_labels))
    if agreement < 0.5:
        # Inverted ids match the training labels better than the raw ids.
        return 1 - test_clusters
    return test_clusters


# Predict test clusters and convert them to class labels. The orientation is
# chosen from training labels only, so no test-set information is used, and
# both experiments are handled the same way.
predictions_a = _align_clusters_to_labels(
    kmeans_a.predict(X_test_a_pca), kmeans_a.labels_, y_train_a
)
predictions_b = _align_clusters_to_labels(
    kmeans_b.predict(X_test_b_pca), kmeans_b.labels_, y_train_b
)


# Compute the evaluation metrics for both experiments. The table maps the
# reported metric name to the scoring function that produces it.
metric_fns = {
    "Accuracy": accuracy_score,
    "Recall": recall_score,
    "Precision": precision_score,
    "F1 Score": f1_score,
}

results_a = {
    metric_name: score_fn(y_test_a, predictions_a)
    for metric_name, score_fn in metric_fns.items()
}
results_b = {
    metric_name: score_fn(y_test_b, predictions_b)
    for metric_name, score_fn in metric_fns.items()
}

# Last expression of the cell: displayed as the notebook output.
results_a, results_b



Empty DataFrame
Columns: [Pregnancies, Glucose, BloodPressure, SkinThickness, Insulin, BMI, DiabetesPedigreeFunction, Age, Outcome]
Index: []


({'Accuracy': 0.7164179104477612,
  'Recall': 0.7887323943661971,
  'Precision': 0.5714285714285714,
  'F1 Score': 0.6627218934911242},
 {'Accuracy': 0.73,
  'Recall': 0.7297297297297297,
  'Precision': 0.6136363636363636,
  'F1 Score': 0.6666666666666666})