In [None]:
from datasets import load_dataset
import pandas as pd
import os
import urllib.request

# Load the complete "train" split of the WikiArt dataset from the Hugging Face Hub.
# NOTE: this is the whole split, not a size-limited portion of it.
dataset = load_dataset("huggan/wikiart", split="train")

# Style categories to keep.
target_styles = {
    "Impressionism", "Realism", "Romanticism", "Symbolism", "Art_Nouveau"
}

# "style" is a ClassLabel column: rows hold integer ids and the readable
# names live in the feature metadata. Comparing the raw value with the
# string names above would never match, so ids are translated first.
style_names = getattr(dataset.features["style"], "names", None)


def _style_name(value):
    """Return the style as text, whether it is stored as a class id or a string."""
    return style_names[value] if style_names is not None else str(value)


# Keep only the rows whose style is one of the targets. Only the "style"
# column is handed to the predicate, so the filter never touches the images.
filtered_dataset = dataset.filter(
    lambda styles: [_style_name(s) in target_styles for s in styles],
    input_columns=["style"],
    batched=True,
)

# Create the output folder for the exported images.
save_dir = "wikiart_selected_styles"
os.makedirs(save_dir, exist_ok=True)

# Export the images. The "image" column is already a decoded PIL image
# (there is no URL to download from), so it is written to disk directly.
for i, item in enumerate(filtered_dataset):
    style = _style_name(item["style"]).replace(" ", "_")  # avoid spaces in file names
    filename = f"{save_dir}/{style}_{i}.jpg"
    try:
        # JPEG cannot store alpha or palette modes, so normalise to RGB first.
        item["image"].convert("RGB").save(filename)
        print(f"Saved: {filename}")
    except Exception as e:
        # Best effort: report the failure and continue with the next image.
        print(f"Failed to save {filename}: {e}")


  from .autonotebook import tqdm as notebook_tqdm
Downloading readme: 100%|██████████| 2.37k/2.37k [00:00<00:00, 9.61kB/s]
Downloading metadata: 100%|██████████| 5.91k/5.91k [00:00<00:00, 18.0kB/s]
Downloading data:  32%|███▏      | 23/72 [1:08:53<2:28:12, 181.48s/files]

In [None]:
import os
import torch
import clip
from PIL import Image
from tqdm import tqdm
import numpy as np

# Folder containing the exported images. This has to be the directory the
# export step writes to ("wikiart_selected_styles").
image_dir = "wikiart_selected_styles"

# Use the GPU when one is available, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Load the CLIP model together with its image preprocessing transform.
model, preprocess = clip.load("ViT-B/32", device=device)

# Collected results; the two lists are kept index-aligned.
features = []
filenames = []

# Read and encode every image. The listing is sorted because os.listdir
# returns entries in arbitrary order, which would make the row order of the
# saved feature matrix differ between runs.
for filename in tqdm(sorted(os.listdir(image_dir))):
    if filename.lower().endswith((".jpg", ".png", ".jpeg")):
        image_path = os.path.join(image_dir, filename)
        try:
            # The context manager closes the file handle as soon as the
            # image has been turned into a tensor.
            with Image.open(image_path) as pil_image:
                image = preprocess(pil_image).unsqueeze(0).to(device)
            with torch.no_grad():
                feature = model.encode_image(image)
                feature = feature / feature.norm(dim=-1, keepdim=True)  # L2-normalize
                features.append(feature.cpu().numpy())
                filenames.append(filename)
        except Exception as e:
            # Best effort: skip unreadable images but report them.
            print(f"無法處理圖片 {filename}:{e}")

# np.concatenate rejects an empty list with an unhelpful message; fail with
# a clear explanation instead.
if not features:
    raise ValueError(f"No images could be encoded from {image_dir!r}")

# Stack the per-image vectors into one array (one row per image) and save it.
features = np.concatenate(features, axis=0)
np.save("clip_features.npy", features)

# Save the matching file names, one per line, in the same order as the rows.
with open("clip_filenames.txt", "w") as f:
    for name in filenames:
        f.write(f"{name}\n")

print("✅ 特徵向量與檔名儲存完成！")


In [None]:
import numpy as np
import pandas as pd

# Read back the saved feature matrix and the file names that go with its rows.
features = np.load("clip_features.npy")
with open("clip_filenames.txt", "r") as f:
    filenames = list(map(str.strip, f))

# One table: the feature columns followed by a "filename" column.
df_full = pd.DataFrame(features).assign(filename=filenames)

# Number of rows drawn for each subset.
subset_size = 100

# Draw three random subsets (seeds 42, 43, 44) and write each one to its
# own CSV file; the frames are also kept in memory in `subsets`.
subsets = []
for i, seed in enumerate((42, 43, 44)):
    sampled = df_full.sample(n=subset_size, random_state=seed)
    subset_df = sampled.reset_index(drop=True)
    subsets.append(subset_df)
    subset_df.to_csv(f"clip_subset_{i+1}.csv", index=False)

print("三個隨機子集已建立並儲存為 clip_subset_1.csv ~ clip_subset_3.csv")
