In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader
import numpy as np
import os
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from datasets import load_dataset
from PIL import Image
import matplotlib.pyplot as plt
import re
from torchvision import transforms
import json
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis


# 要用git clone 手動下載這個dataset，再移到相對位置

# === 1. Load Dataset ===
dataset = load_dataset("imagefolder", data_dir="./Dataset/train")

# Train and validation datasets
class TransformedDataset(torch.utils.data.Dataset):
    def __init__(self, dataset, transform):
        self.dataset = dataset
        self.transform = transform

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        example = self.dataset[idx]
        image = self.transform(example["image"])
        label = example["label"]
        return image, label

transform = transforms.Compose([
    transforms.Grayscale(num_output_channels=1),
    transforms.Resize((75, 50)),
    transforms.ToTensor()
])

split = dataset["train"].train_test_split(test_size=0.2, seed=42)
train_dataset = TransformedDataset(split["train"], transform)
val_dataset = TransformedDataset(split["test"], transform)

train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=64)

# Test dataset
class TestDataset(torch.utils.data.Dataset):
    def __init__(self, img_dir, transform, label_map):
        self.img_paths = []
        self.labels = []
        self.transform = transform
        self.label_map = label_map

        for file in os.listdir(img_dir):
            if not file.endswith(".png"):
                continue
            label_str = file.split("_")[-1].split(".")[0]
            if label_str.upper() == "UNKNOWN":
                continue
            if label_str not in label_map:
                continue  # 若 label_map 沒有這個 label，可以跳過或報錯
            self.img_paths.append(os.path.join(img_dir, file))
            self.labels.append(label_map[label_str])

    def __len__(self):
        return len(self.img_paths)

    def __getitem__(self, idx):
        image = Image.open(self.img_paths[idx]).convert("L")
        image = self.transform(image)
        label = self.labels[idx]
        return image, label
    
# e.g., {"G1": 0, "M17": 1, ...}
with open("output/label_map.json", "r") as f:
    label_map = json.load(f)
test_dataset = TestDataset(img_dir="./Dataset/test", transform=transform, label_map=label_map)
test_loader = DataLoader(test_dataset, batch_size=64)

class EgyptianCNN(nn.Module):
    def __init__(self, num_classes):
        super(EgyptianCNN, self).__init__()
        self.conv1 = nn.Conv2d(1, 32, kernel_size=3)
        self.pool1 = nn.MaxPool2d(2)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=3)
        self.pool2 = nn.MaxPool2d(2)
        self.fc1 = nn.Linear(64 * 17 * 11, 128)
        self.fc2 = nn.Linear(128, num_classes)

    def forward(self, x, return_features=False):
        x = self.pool1(F.relu(self.conv1(x)))
        x = self.pool2(F.relu(self.conv2(x)))
        x = x.view(x.size(0), -1)
        features = F.relu(self.fc1(x))
        if return_features:
            return features
        return self.fc2(features)

# 1. 載入訓練好的模型
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
num_classes = len(label_map)
model = EgyptianCNN(num_classes=num_classes).to(device)
model.load_state_dict(torch.load("output/egyptian_cnn_weights.pth", map_location=torch.device("cpu")))
model.eval()

def extract_features(model, dataloader):
    model.eval()
    features = []
    labels = []
    with torch.no_grad():
        for images, targets in dataloader:
            images = images.to(device)
            feats = model(images, return_features=True)
            features.append(feats.cpu().numpy())
            labels.append(targets.numpy())
    return np.vstack(features), np.hstack(labels)

X_train_feats, y_train = extract_features(model, train_loader)
X_test_feats, y_test = extract_features(model, test_loader)

# LDA 可降至 n_classes - 1 維（這裡是 9）
lda_components = 9

# 執行 LDA
lda = LinearDiscriminantAnalysis(n_components=lda_components)
x_train_lda = lda.fit_transform(X_train_feats, y_train)
x_test_lda = lda.transform(X_test_feats)

  model.load_state_dict(torch.load("output/egyptian_cnn_weights.pth", map_location=torch.device("cpu")))


In [3]:
from sklearn.cluster import KMeans
from sklearn.metrics import adjusted_rand_score

# LDA + K-means clustering

# 設定群數（Oracle MNIST 是 10 類）
n_clusters = 10

# 建立 K-means 模型並訓練
kmeans = KMeans(n_clusters=n_clusters, random_state=42)
cluster_labels = kmeans.fit_predict(x_test_lda)

# 評估 clustering 效果
ari = adjusted_rand_score(y_test, cluster_labels)
print(f"Adjusted Rand Index (ARI): {ari:.4f}")

Adjusted Rand Index (ARI): 0.4363




In [4]:
from sklearn.mixture import GaussianMixture
from sklearn.metrics import adjusted_rand_score

# LDA + EM algorithm

# 設定群數（仍為 10）
n_components = 10

# 建立 GMM 模型
gmm = GaussianMixture(n_components=n_components, covariance_type='full', random_state=42)
gmm.fit(x_test_lda)  # 只用 test 資料進行 unsupervised clustering

# 分群預測（返回最可能的群編號）
cluster_labels = gmm.predict(x_test_lda)

# 計算 Adjusted Rand Index (ARI)
ari = adjusted_rand_score(y_test, cluster_labels)
print(f"EM Clustering ARI: {ari:.4f}")

EM Clustering ARI: 0.4380


