<a href="https://colab.research.google.com/github/ll3i/FashionData/blob/main/%EC%96%B4%EC%85%88%EB%B8%94_%EC%B5%9C%EC%A2%85_%EA%B3%BC%EC%A0%9C.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import os
import re
import json
import pandas as pd
from collections import defaultdict
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from torchvision import transforms, models
from PIL import Image
import numpy as np
from torchvision import models, transforms
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import precision_recall_fscore_support, accuracy_score
from tqdm import tqdm

from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


[1-1]

In [None]:
# 데이터 폴더 경로 설정
directories = [
    '/content/drive/MyDrive/dataset/training_image',
    '/content/drive/MyDrive/dataset/validation_image'  # 추가한 경로
]

# 성별 및 스타일별 이미지 개수를 저장할 구조
image_count = defaultdict(lambda: defaultdict(int)) # 중첩된 딕셔너리의 구조: {성별: {스타일: 이미지 수}} 형태
total_images = 0  # 전체 이미지 수를 저장할 변수

# 파일 이름에서 성별과 스타일 추출하여 통계 내기
for directory in directories:  # 각 디렉토리에 대해 반복
    for filename in os.listdir(directory):
        if filename.endswith('.jpg'):  # JPG 파일만 처리
            parts = filename.split('_')  # 파일명을 '_'로 분할

            # 성별과 스타일 정보를 추출
            gender_identifier = parts[-1][0]  # 마지막 부분 첫 글자 (W or T)
            style = parts[3]  # 네 번째 부분은 스타일 정보

            # 성별을 '여성' / '남성'으로 변환
            gender = '여성' if gender_identifier == 'W' else '남성'

            # 성별과 스타일의 이미지 수를 집계
            image_count[gender][style] += 1
            total_images += 1  # 전체 이미지 수 증가

# DataFrame으로 변환
data = []
for gender, styles in image_count.items():
    for style, count in styles.items():
        data.append([gender, style, count])

df = pd.DataFrame(data, columns=['성별', '스타일', '이미지 수'])

# 예쁘게 출력
df_sorted = df.sort_values(by=['성별', '스타일']).reset_index(drop=True)

# 총 이미지 수 출력 및 DataFrame 표시
print(f"총 이미지 수: {total_images}")
display(df_sorted)


총 이미지 수: 5021


Unnamed: 0,성별,스타일,이미지 수
0,남성,bold,325
1,남성,hiphop,340
2,남성,hippie,342
3,남성,ivy,316
4,남성,metrosexual,336
5,남성,mods,349
6,남성,normcore,415
7,남성,sportivecasual,350
8,여성,athleisure,81
9,여성,bodyconscious,118


[1-2]

In [None]:
# CUDA 사용 가능 여부 확인
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# 클래스 목록 (성별 + 스타일 조합으로 구성)
fashion_classes = [
    "남성_bold", "남성_hiphop", "남성_hippie", "남성_ivy", "남성_metrosexual",
    "남성_mods", "남성_normcore", "남성_sportivecasual", "여성_athleisure",
    "여성_bodyconscious", "여성_cityglam", "여성_classic", "여성_disco",
    "여성_ecology", "여성_feminine", "여성_genderless", "여성_grunge",
    "여성_hiphop", "여성_hippie", "여성_kitsch", "여성_lingerie",
    "여성_lounge", "여성_military", "여성_minimal", "여성_normcore",
    "여성_oriental", "여성_popart", "여성_powersuit", "여성_punk",
    "여성_space", "여성_sportivecasual"
]

# 스타일 매핑 (각 클래스 조합을 숫자로 매핑)
style_mapping = {style: idx for idx, style in enumerate(fashion_classes)}

# 데이터셋 클래스 정의
class CustomImageDataset(Dataset):
    def __init__(self, image_folder, transform=None):
        self.image_folder = image_folder
        self.image_paths = [os.path.join(image_folder, fname) for fname in os.listdir(image_folder)]
        self.transform = transform
        self.labels = []

        # 파일명에서 성별 및 스타일 레이블 추출
        for img in self.image_paths:
            fname = os.path.basename(img)
            parts = fname.split('_')
            gender = parts[-1].split('.')[0]  # 성별 추출 ('M' 또는 'W')
            style = parts[-2]  # 스타일 추출

            gender_label = "남성" if gender == 'M' else "여성"
            label = f"{gender_label}_{style}"  # 성별 + 스타일 조합
            style_label = style_mapping.get(label, -1)  # 스타일 매핑

            if style_label != -1:
                self.labels.append(style_label)

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        img_path = self.image_paths[idx]
        img = Image.open(img_path).convert("RGB")

        if self.transform:
            img = self.transform(img)

        label = self.labels[idx]
        return img, label

# 트레이닝 데이터 증강 강화
train_transform = transforms.Compose([
    transforms.RandomHorizontalFlip(),  # 좌우 뒤집기
    transforms.RandomRotation(degrees=30),  # 회전 범위 증가
    transforms.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2, hue=0.1),  # 색상 조정
    transforms.RandomResizedCrop(224, scale=(0.8, 1.0)),  # 무작위 자르기 및 리사이즈
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

# 검증 데이터에 약한 데이터 증강 적용 (좌우 반전, 미세 회전 등)
val_transform = transforms.Compose([
    transforms.RandomHorizontalFlip(),  # 좌우 뒤집기
    transforms.RandomRotation(degrees=10),  # 미세한 회전 적용
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

# 트레이닝 데이터셋 로드
train_dataset = CustomImageDataset('/content/drive/MyDrive/dataset/processed_segmentation_cleaned', transform=train_transform)

# 검증 데이터셋 로드 (별도의 검증 이미지 폴더 사용)
val_dataset = CustomImageDataset('/content/drive/MyDrive/dataset/processed_segmentation_cleaned_for_val', transform=val_transform)

# 데이터 로더 설정
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=64, shuffle=False)

# ResNet-18 기반 패션 스타일 및 성별 예측 모델
class FashionGenderModel(nn.Module):
    def __init__(self, num_classes=31):  # 총 31개의 클래스
        super(FashionGenderModel, self).__init__()
        self.resnet = models.resnet18(weights=None)  # Pretrained weights 사용 안 함
        self.resnet.fc = nn.Identity()  # ResNet의 기본 fully connected 레이어를 제거

        # 31개 클래스(성별 + 스타일 조합)를 위한 출력 레이어 추가
        self.fc = nn.Linear(512, num_classes)
        self.dropout = nn.Dropout(p=0.4)  # Dropout 비율 조정 (0.4)

    def forward(self, x):
        x = self.resnet(x)
        x = self.dropout(x)
        out = self.fc(x)  # 31개 클래스 예측
        return out

# 모델, 손실 함수, 옵티마이저, 스케줄러 설정
model = FashionGenderModel(num_classes=31).to(device)  # 모델을 CUDA로 이동
criterion = nn.CrossEntropyLoss()  # 하나의 CrossEntropyLoss 사용
optimizer = optim.AdamW(model.parameters(), lr=0.005, weight_decay=0.02)  # 학습률 조정
scheduler = optim.lr_scheduler.CosineAnnealingWarmRestarts(optimizer, T_0=10, T_mult=1)

# 모델 학습 함수
def train_model(train_loader, val_loader, model, optimizer, criterion, scheduler, epochs=200, save_path='/content/drive/MyDrive/dataset/model_final2.pth', start_epoch=0):
    best_val_accuracy = 0.0  # 최상의 검증 정확도를 저장하기 위한 변수

    for epoch in range(start_epoch, epochs):
        model.train()
        total_train_loss = 0
        correct_train = 0
        total_train = 0

        # 학습 단계
        for images, labels in train_loader:
            images = images.to(device)  # 데이터를 CUDA로 이동
            labels = labels.to(device)

            optimizer.zero_grad()

            # 모델 예측
            outputs = model(images)

            # 손실 계산
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            total_train_loss += loss.item()

            # 정확도 계산
            _, predicted = torch.max(outputs, 1)
            correct_train += (predicted == labels).sum().item()
            total_train += labels.size(0)

        train_accuracy = correct_train / total_train

        # 검증 단계
        model.eval()
        total_val_loss = 0
        correct_val = 0
        total_val = 0
        with torch.no_grad():
            for val_images, val_labels in val_loader:
                val_images = val_images.to(device)  # 검증 데이터도 CUDA로 이동
                val_labels = val_labels.to(device)

                val_outputs = model(val_images)

                val_loss = criterion(val_outputs, val_labels)
                total_val_loss += val_loss.item()

                _, val_predicted = torch.max(val_outputs, 1)
                correct_val += (val_predicted == val_labels).sum().item()
                total_val += val_labels.size(0)

        val_accuracy = correct_val / total_val

        # 학습률을 퍼센트로 표시
        lr_percent = optimizer.param_groups[0]['lr'] * 100
        scheduler.step()

        # 에포크 마다 출력
        print(f'Epoch {epoch+1}, Train Loss: {total_train_loss / len(train_loader):.4f}, '
              f'Train Accuracy: {train_accuracy * 100:.2f}%, '
              f'Validation Loss: {total_val_loss / len(val_loader):.4f}, '
              f'Validation Accuracy: {val_accuracy * 100:.2f}%, '
              f'Learning Rate: {lr_percent:.2f}%')

        # 최상의 검증 정확도를 가진 모델 저장
        if val_accuracy > best_val_accuracy:
            best_val_accuracy = val_accuracy
            torch.save(model.state_dict(), save_path)
            print(f'Model saved with Validation Accuracy: {val_accuracy * 100:.2f}%')

 # 모델 가중치 파일 경로
save_path = '/content/drive/MyDrive/dataset/model_final2.pth'

# 모델을 생성하고 가중치가 저장된 파일이 존재하면 로드
model = FashionGenderModel(num_classes=31).to(device)
if os.path.exists(save_path):
    model.load_state_dict(torch.load(save_path, map_location=device))
    print(f"Saved model weights loaded from {save_path}")
else:
    print("No saved weights found. Starting with a new model.")


# 모델 학습 시작
train_model(train_loader, val_loader, model, optimizer, criterion, scheduler, epochs=3, save_path='/content/drive/MyDrive/dataset/model_final2.pth')


  model.load_state_dict(torch.load(save_path, map_location=device))


Saved model weights loaded from /content/drive/MyDrive/dataset/model_final2.pth
Epoch 1, Train Loss: 0.3092, Train Accuracy: 91.45%, Validation Loss: 1.9767, Validation Accuracy: 63.83%, Learning Rate: 0.50%
Model saved with Validation Accuracy: 63.83%
Epoch 2, Train Loss: 0.2939, Train Accuracy: 92.29%, Validation Loss: 2.0089, Validation Accuracy: 62.99%, Learning Rate: 0.49%
Epoch 3, Train Loss: 0.2988, Train Accuracy: 91.74%, Validation Loss: 2.0055, Validation Accuracy: 62.88%, Learning Rate: 0.45%


[2-1]

In [None]:
!pip install ujson

Collecting ujson
  Downloading ujson-5.10.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (9.3 kB)
Downloading ujson-5.10.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (53 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/53.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m53.6/53.6 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: ujson
Successfully installed ujson-5.10.0


In [None]:
# 모든 행과 열을 출력하도록 설정
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)

# 이미지와 JSON 파일의 경로
image_dir = '/content/drive/MyDrive/dataset/'
training_image_dir = os.path.join(image_dir, 'training_image')
validation_image_dir = os.path.join(image_dir, 'validation_image')
training_label_dir = os.path.join(image_dir, 'training_label')
validation_label_dir = os.path.join(image_dir, 'validation_label')

# 이미지 파일과 JSON 파일의 패턴 정의
image_pattern = re.compile(r"^(W|T)_(\d+)_(\d+)_(\w+)_(\w)\.jpg$")
label_pattern = re.compile(r"^(W|T)_(\d+)_(\d+)_(\w+)_(\w)_(\d+)\.json$")

# 유효한 이미지 ID를 저장할 집합
def get_valid_image_ids(image_dir):
    valid_image_ids = set()
    for filename in os.listdir(image_dir):
        if image_pattern.match(filename):
            img_id = image_pattern.match(filename).group(2)
            valid_image_ids.add(img_id)
    return valid_image_ids

# 통계 계산 함수 (Q5 포함)
def calculate_statistics_with_q5(label_dir, valid_image_ids):
    stats = []
    valid_labels = []

    for filename in os.listdir(label_dir):
        match = label_pattern.match(filename)
        if match:
            img_id = match.group(2)
            style = match.group(4)
            gender = "여성" if match.group(1) == "W" else "남성"
            if img_id in valid_image_ids:
                with open(os.path.join(label_dir, filename), 'r') as f:
                    data = json.load(f)
                    # 통계에 사용할 정보 추가
                    stats.append({"성별": gender, "스타일": style, "이미지 수": 1})
                    # Q5 포함하여 유효한 라벨 저장
                    valid_labels.append({
                        "respondent_id": data["user"]["R_id"],
                        "gender": gender,
                        "style": style,
                        "image_id": img_id,
                        "Q5": data["item"]["survey"]["Q5"],
                        "img_name": data["item"]["imgName"]
                    })

    # 통계 데이터프레임 생성 및 집계
    stats_df = pd.DataFrame(stats)
    stats_df = stats_df.groupby(["성별", "스타일"]).sum().reset_index()
    return stats_df, valid_labels

# 유효한 이미지 ID 확인
valid_training_ids = get_valid_image_ids(training_image_dir)
valid_validation_ids = get_valid_image_ids(validation_image_dir)

# 통계 테이블 및 유효 라벨 데이터 생성
training_stats, valid_training_labels = calculate_statistics_with_q5(training_label_dir, valid_training_ids)
validation_stats, valid_validation_labels = calculate_statistics_with_q5(validation_label_dir, valid_validation_ids)

# 출력
print("Training 통계표:")
display(training_stats)
print("\nValidation 통계표:")
display(validation_stats)

KeyboardInterrupt: 

2-1 수정

In [None]:
import os
import re
import json
import pandas as pd
from concurrent.futures import ThreadPoolExecutor, as_completed

# 모든 행과 열을 출력하도록 설정
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)

# 경로 설정
image_dir = '/content/drive/MyDrive/dataset/'
training_image_dir = os.path.join(image_dir, 'training_image')
validation_image_dir = os.path.join(image_dir, 'validation_image')
training_label_dir = os.path.join(image_dir, 'training_label')
validation_label_dir = os.path.join(image_dir, 'validation_label')

# 이미지와 JSON 파일의 패턴 정의
image_pattern = re.compile(r"^(W|T)_(\d+)_(\d+)_(\w+)_(\w)\.jpg$")
label_pattern = re.compile(r"^(W|T)_(\d+)_(\d+)_(\w+)_(\w)_(\d+)\.json$")

# 유효한 이미지 ID를 저장할 집합
def get_valid_image_ids(image_dir):
    valid_image_ids = set()
    print(f"Scanning directory {image_dir} for valid images...")
    for filename in os.listdir(image_dir):
        match = image_pattern.match(filename)
        if match:
            img_id = match.group(2)
            valid_image_ids.add(img_id)
    print(f"Found {len(valid_image_ids)} valid image IDs in {image_dir}")
    return valid_image_ids

# JSON 파일을 비동기적으로 처리하는 함수
def process_json_file(filename, label_dir, valid_image_ids):
    try:
        match = label_pattern.match(filename)
        if match:
            img_id = match.group(2)
            style = match.group(4)
            gender = "여성" if match.group(1) == "W" else "남성"
            survey_id = match.group(6)  # 설문 ID 추출

            # 유효한 이미지 ID에 해당하는 JSON 파일만 처리
            if img_id in valid_image_ids:
                with open(os.path.join(label_dir, filename), 'r', encoding='utf-8') as f:
                    data = json.load(f)
                    return {
                        "성별": gender,
                        "스타일": style,
                        "설문 ID 수": 1,
                        "respondent_id": data["user"]["R_id"],
                        "gender": gender,
                        "style": style,
                        "image_id": img_id,
                        "survey_id": survey_id,
                        "Q5": data["item"]["survey"]["Q5"],
                        "img_name": data["item"]["imgName"]
                    }
    except OSError as e:
        print(f"Could not process file {filename}: {e}")
    return None

# 통계 계산 함수 (멀티 스레드 방식)
def calculate_statistics_with_q5(label_dir, valid_image_ids):
    stats = []
    valid_labels = []
    print(f"Processing JSON files in directory {label_dir} with multithreading...")

    # ThreadPoolExecutor를 사용하여 JSON 파일을 병렬로 처리
    with ThreadPoolExecutor() as executor:
        future_to_file = {executor.submit(process_json_file, filename, label_dir, valid_image_ids): filename for filename in os.listdir(label_dir)}

        for future in as_completed(future_to_file):
            result = future.result()
            if result is not None:
                # 통계에 사용할 정보 추가 (설문 ID 수를 기준으로)
                stats.append({"성별": result["성별"], "스타일": result["스타일"], "설문 ID 수": 1})

                # Q5 포함하여 유효한 라벨 저장
                valid_labels.append({
                    "respondent_id": result["respondent_id"],
                    "gender": result["gender"],
                    "style": result["style"],
                    "image_id": result["image_id"],
                    "survey_id": result["survey_id"],
                    "Q5": result["Q5"],
                    "img_name": result["img_name"]
                })

    # 통계 데이터프레임 생성 및 집계
    print(f"Aggregating statistics for {label_dir}...")
    stats_df = pd.DataFrame(stats)
    if not stats_df.empty:
        stats_df = stats_df.groupby(["성별", "스타일"]).sum().reset_index()
    else:
        print("No valid data found for aggregation.")

    print(f"Finished processing {label_dir}. Number of entries in stats: {len(stats_df)}")
    return stats_df, valid_labels

# 유효한 이미지 ID 확인
valid_training_ids = get_valid_image_ids(training_image_dir)
valid_validation_ids = get_valid_image_ids(validation_image_dir)

# 통계 테이블 및 유효 라벨 데이터 생성
training_stats, valid_training_labels = calculate_statistics_with_q5(training_label_dir, valid_training_ids)
validation_stats, valid_validation_labels = calculate_statistics_with_q5(validation_label_dir, valid_validation_ids)

# 출력
print("Training 통계표:")
display(training_stats)
print("\nValidation 통계표:")
display(validation_stats)


Scanning directory /content/drive/MyDrive/dataset/training_image for valid images...
Found 4066 valid image IDs in /content/drive/MyDrive/dataset/training_image
Scanning directory /content/drive/MyDrive/dataset/validation_image for valid images...
Found 951 valid image IDs in /content/drive/MyDrive/dataset/validation_image
Processing JSON files in directory /content/drive/MyDrive/dataset/training_label with multithreading...
Aggregating statistics for /content/drive/MyDrive/dataset/training_label...
Finished processing /content/drive/MyDrive/dataset/training_label. Number of entries in stats: 53
Processing JSON files in directory /content/drive/MyDrive/dataset/validation_label with multithreading...
Aggregating statistics for /content/drive/MyDrive/dataset/validation_label...
Finished processing /content/drive/MyDrive/dataset/validation_label. Number of entries in stats: 40
Training 통계표:


Unnamed: 0,성별,스타일,설문 ID 수
0,남성,athleisure,2
1,남성,bodyconscious,10
2,남성,bold,18
3,남성,cityglam,11
4,남성,classic,21
5,남성,disco,1
6,남성,ecology,9
7,남성,genderless,18
8,남성,grunge,3
9,남성,hiphop,64



Validation 통계표:


Unnamed: 0,성별,스타일,설문 ID 수
0,남성,bodyconscious,1
1,남성,bold,3
2,남성,cityglam,2
3,남성,classic,5
4,남성,ecology,1
5,남성,hiphop,3
6,남성,hippie,1
7,남성,ivy,3
8,남성,mods,1
9,남성,normcore,4


2-2 수정

In [None]:
import os
import pandas as pd

# 모든 행과 열을 출력하도록 설정
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)

# 상위 100명 응답자를 추출하는 함수
def get_top_respondents(valid_labels, top_n=100):
    # 응답자 ID별로 라벨링 데이터 개수를 집계하여 상위 N명을 추출
    respondent_counts = pd.DataFrame(valid_labels).groupby("respondent_id").size()
    top_respondents = respondent_counts.nlargest(top_n).index.tolist()
    print(f"상위 {top_n}명 응답자 ID 추출 완료.")
    return top_respondents

# 응답자별 스타일 선호/비선호 정보를 생성하는 함수
def create_preference_df(valid_labels, top_respondents, dataset_name):
    rows = []

    for respondent in top_respondents:
        # 해당 응답자의 모든 데이터 추출
        respondent_data = [entry for entry in valid_labels if entry['respondent_id'] == respondent]
        preferred = [entry['img_name'] for entry in respondent_data if entry['Q5'] == 2]
        non_preferred = [entry['img_name'] for entry in respondent_data if entry['Q5'] == 1]

        rows.append({
            '응답자 ID': respondent,
            f'{dataset_name} 스타일 선호': preferred,
            f'{dataset_name} 스타일 비선호': non_preferred
        })

    print(f"{dataset_name} 스타일 선호 정보 데이터프레임 생성 완료.")
    return pd.DataFrame(rows)

# 유효한 라벨 데이터에서 상위 100명의 응답자를 추출
top_training_respondents = get_top_respondents(valid_training_labels, top_n=100)
top_validation_respondents = get_top_respondents(valid_validation_labels, top_n=100)

# 각 응답자의 선호/비선호 스타일 정리
training_df = create_preference_df(valid_training_labels, top_training_respondents, 'Training')
validation_df = create_preference_df(valid_validation_labels, top_validation_respondents, 'Validation')

# 데이터프레임 출력
print("Training 응답자 스타일 선호 정보:")
display(training_df)
print("\nValidation 응답자 스타일 선호 정보:")
display(validation_df)

# 두 데이터를 하나의 CSV 파일로 합쳐서 저장
top_preference_df = pd.merge(training_df, validation_df, on="응답자 ID", how="outer")
output_csv_path = '/content/drive/MyDrive/데이터 크리에이터 캠프 최종 정리/김진/top_100_respondents_preferences.csv'
top_preference_df.to_csv(output_csv_path, index=False, encoding='utf-8-sig')
print(f"\n상위 100명의 응답자 스타일 선호도가 {output_csv_path}에 저장되었습니다.")


상위 100명 응답자 ID 추출 완료.
상위 100명 응답자 ID 추출 완료.
Training 스타일 선호 정보 데이터프레임 생성 완료.
Validation 스타일 선호 정보 데이터프레임 생성 완료.
Training 응답자 스타일 선호 정보:


Unnamed: 0,응답자 ID,Training 스타일 선호,Training 스타일 비선호
0,21432,"[W_28523_90_hiphop_M.jpg, W_09891_90_hiphop_M.jpg, W_29023_00_metrosexual_M.jpg, W_15294_50_ivy_M.jpg, W_32448_50_ivy_M.jpg]","[W_29224_10_sportivecasual_M.jpg, W_26017_10_sportivecasual_M.jpg, W_26151_80_bold_M.jpg, W_26296_70_hippie_M.jpg, W_12383_80_bold_M.jpg, W_24439_00_metrosexual_M.jpg, W_26397_70_hippie_M.jpg, W_25107_70_hippie_M.jpg]"
1,60234,"[W_01693_19_normcore_M.jpg, W_17337_50_ivy_M.jpg, W_16539_50_ivy_M.jpg, W_07102_50_ivy_M.jpg, W_12748_50_ivy_M.jpg]","[W_17508_80_bold_M.jpg, W_02844_90_hiphop_M.jpg, W_16755_00_metrosexual_M.jpg, W_18424_80_bold_M.jpg, W_16673_70_hippie_M.jpg, W_15423_80_bold_M.jpg, W_06546_60_mods_M.jpg, W_15259_60_mods_M.jpg]"
2,62264,"[W_07260_10_sportivecasual_M.jpg, W_16449_10_sportivecasual_M.jpg, W_16403_10_sportivecasual_M.jpg, W_07098_19_normcore_M.jpg, W_07307_19_normcore_M.jpg, W_05869_60_mods_M.jpg]","[W_16136_80_bold_M.jpg, W_15477_00_metrosexual_M.jpg, W_15159_80_bold_M.jpg, W_16428_90_hiphop_M.jpg, W_16189_50_ivy_M.jpg, W_16189_50_ivy_M.jpg, W_04233_60_mods_M.jpg]"
3,64345,"[W_28449_10_sportivecasual_M.jpg, W_17467_19_normcore_M.jpg, W_00843_10_sportivecasual_M.jpg, W_29596_10_sportivecasual_M.jpg, W_27156_90_hiphop_M.jpg, W_32150_00_metrosexual_M.jpg, W_15856_60_mods_M.jpg, W_24155_60_mods_M.jpg]","[W_24825_80_bold_M.jpg, W_25884_90_hiphop_M.jpg, W_11105_00_metrosexual_M.jpg, W_24352_70_hippie_M.jpg, W_24325_60_mods_M.jpg]"
4,63740,"[W_04201_19_lounge_W.jpg, W_01179_10_sportivecasual_W.jpg, W_11824_70_military_W.jpg, W_14975_60_minimal_W.jpg, W_18894_50_feminine_W.jpg]","[W_05226_10_sportivecasual_W.jpg, W_08607_90_kitsch_W.jpg, W_19264_90_kitsch_W.jpg, W_03656_90_hiphop_W.jpg, W_05353_80_bodyconscious_W.jpg, W_06083_80_bodyconscious_W.jpg, W_07532_70_hippie_W.jpg]"
5,64221,"[W_28698_10_sportivecasual_M.jpg, W_25039_90_hiphop_M.jpg, W_32524_00_metrosexual_M.jpg, W_28728_50_ivy_M.jpg, W_24013_60_mods_M.jpg]","[W_26397_70_hippie_M.jpg, W_25471_70_hippie_M.jpg, W_12130_80_bold_M.jpg, W_28207_90_hiphop_M.jpg, W_17747_80_bold_M.jpg, W_15129_50_ivy_M.jpg, W_07333_70_hippie_M.jpg]"
6,64460,"[W_24294_80_bold_M.jpg, W_24934_90_hiphop_M.jpg, W_26530_00_metrosexual_M.jpg, W_06514_80_bold_M.jpg, W_10792_50_ivy_M.jpg, W_12817_50_ivy_M.jpg, W_15653_60_mods_M.jpg]","[W_25069_90_hiphop_M.jpg, W_31356_80_bold_M.jpg, W_25082_70_hippie_M.jpg, W_30522_00_metrosexual_M.jpg, W_25526_60_mods_M.jpg]"
7,28912,"[W_01754_10_sportivecasual_M.jpg, W_04680_10_sportivecasual_M.jpg, W_02669_50_ivy_M.jpg]","[W_17260_19_normcore_M.jpg, W_16725_70_hippie_M.jpg, W_15745_80_bold_M.jpg, W_04723_90_hiphop_M.jpg, W_15923_80_bold_M.jpg, W_04242_60_mods_M.jpg, W_15246_50_ivy_M.jpg, W_03007_70_hippie_M.jpg]"
8,60184,"[W_01687_19_normcore_M.jpg, W_17427_00_metrosexual_M.jpg, W_17348_50_ivy_M.jpg]","[W_12453_10_sportivecasual_M.jpg, W_12095_80_bold_M.jpg, W_28377_80_bold_M.jpg, W_27913_00_metrosexual_M.jpg, W_25030_70_hippie_M.jpg, W_27819_70_hippie_M.jpg, W_16016_70_hippie_M.jpg, W_04245_70_hippie_M.jpg]"
9,62525,"[W_00760_19_normcore_W.jpg, W_05751_10_sportivecasual_W.jpg, W_19165_00_cityglam_W.jpg, W_03477_80_powersuit_W.jpg, W_01207_60_minimal_W.jpg, W_11223_60_popart_W.jpg, W_13104_50_classic_W.jpg, W_18495_50_feminine_W.jpg]","[W_13667_00_oriental_W.jpg, W_08167_60_minimal_W.jpg, W_07447_50_feminine_W.jpg]"



Validation 응답자 스타일 선호 정보:


Unnamed: 0,응답자 ID,Validation 스타일 선호,Validation 스타일 비선호
0,63405,"[W_02677_60_mods_M.jpg, W_01853_60_mods_M.jpg, W_04684_90_hiphop_M.jpg]","[W_12904_50_ivy_M.jpg, W_15140_80_bold_M.jpg, W_12304_80_bold_M.jpg, W_07187_70_hippie_M.jpg]"
1,59642,"[W_05716_19_normcore_W.jpg, W_14706_19_normcore_W.jpg, W_02095_60_popart_W.jpg]","[W_08246_80_bodyconscious_W.jpg, W_00359_90_grunge_W.jpg, W_11444_80_bodyconscious_W.jpg]"
2,63748,"[W_17867_50_ivy_M.jpg, W_00829_10_sportivecasual_M.jpg]","[W_00539_10_sportivecasual_M.jpg, W_10079_60_mods_M.jpg, W_11144_00_metrosexual_M.jpg, W_06955_10_sportivecasual_M.jpg]"
3,63913,[W_06883_60_mods_M.jpg],"[W_10066_50_ivy_M.jpg, W_15843_00_metrosexual_M.jpg, W_15947_80_bold_M.jpg, W_16444_10_sportivecasual_M.jpg, W_05876_70_hippie_M.jpg]"
4,64221,"[W_28925_90_hiphop_M.jpg, W_25086_10_sportivecasual_M.jpg]","[W_17747_80_bold_M.jpg, W_07333_70_hippie_M.jpg, W_26397_70_hippie_M.jpg, W_02936_00_metrosexual_M.jpg]"
5,62155,[W_27854_50_ivy_M.jpg],"[W_17353_50_ivy_M.jpg, W_32383_00_metrosexual_M.jpg, W_06186_60_mods_M.jpg, W_26120_19_normcore_M.jpg]"
6,63479,"[W_04994_60_popart_W.jpg, W_06438_00_ecology_W.jpg, W_18202_60_minimal_W.jpg]","[W_18878_19_genderless_W.jpg, W_11936_00_oriental_W.jpg]"
7,64216,"[W_02816_60_mods_M.jpg, W_15662_19_normcore_M.jpg]","[W_24931_50_ivy_M.jpg, W_24486_70_hippie_M.jpg, W_26288_70_hippie_M.jpg]"
8,7905,[],"[W_17603_50_ivy_M.jpg, W_28909_19_normcore_M.jpg, W_26179_60_mods_M.jpg, W_07025_10_sportivecasual_M.jpg]"
9,21432,"[W_06522_50_ivy_M.jpg, W_15294_50_ivy_M.jpg, W_29023_00_metrosexual_M.jpg]",[W_26397_70_hippie_M.jpg]



상위 100명의 응답자 스타일 선호도가 /content/drive/MyDrive/데이터 크리에이터 캠프 최종 정리/김진/top_100_respondents_preferences.csv에 저장되었습니다.


[2-2]

In [None]:
# 응답자별로 상위 100명을 추출하는 함수
def get_top_respondents(valid_labels, top_n=100):
    # 응답자 ID별로 라벨링 데이터 개수를 집계하여 상위 N명을 추출
    respondent_counts = pd.DataFrame(valid_labels).groupby("respondent_id").size()
    top_respondents = respondent_counts.nlargest(top_n).index.tolist()
    return top_respondents

# 유효한 라벨 데이터에서 상위 100명의 응답자를 추출
top_training_respondents = get_top_respondents(valid_training_labels, top_n=100)
top_validation_respondents = get_top_respondents(valid_validation_labels, top_n=100)

# 응답자 스타일 선호도 데이터프레임 생성
def create_preference_df(valid_labels, top_respondents, dataset_name):
    rows = []

    for respondent in top_respondents:
        respondent_data = [entry for entry in valid_labels if entry['respondent_id'] == respondent]
        preferred = [entry['img_name'] for entry in respondent_data if entry['Q5'] == 2]
        non_preferred = [entry['img_name'] for entry in respondent_data if entry['Q5'] == 1]

        rows.append({
            '응답자 ID': respondent,
            f'{dataset_name} 스타일 선호': preferred,
            f'{dataset_name} 스타일 비선호': non_preferred
        })

    return pd.DataFrame(rows)

# DataFrame으로 생성 및 출력
training_df = create_preference_df(valid_training_labels, top_training_respondents, 'Training')
validation_df = create_preference_df(valid_validation_labels, top_validation_respondents, 'Validation')

print("Training 응답자 스타일 선호 정보:")
display(training_df)
print("\nValidation 응답자 스타일 선호 정보:")
display(validation_df)

# DataFrame을 CSV 파일로 저장
output_csv_path = 'top_100_respondents_preferences.csv'
top_preference_df.to_csv(output_csv_path, index=False)
print(f"\n상위 100명의 응답자 선호도가 {output_csv_path}에 저장되었습니다.")

Training 응답자 스타일 선호 정보:


Unnamed: 0,응답자 ID,Training 스타일 선호,Training 스타일 비선호
0,21432,"[W_28523_90_hiphop_M.jpg, W_09891_90_hiphop_M.jpg, W_29023_00_metrosexual_M.jpg, W_15294_50_ivy_M.jpg, W_32448_50_ivy_M.jpg]","[W_29224_10_sportivecasual_M.jpg, W_26017_10_sportivecasual_M.jpg, W_26151_80_bold_M.jpg, W_26296_70_hippie_M.jpg, W_12383_80_bold_M.jpg, W_24439_00_metrosexual_M.jpg, W_26397_70_hippie_M.jpg, W_25107_70_hippie_M.jpg]"
1,60234,"[W_01693_19_normcore_M.jpg, W_17337_50_ivy_M.jpg, W_16539_50_ivy_M.jpg, W_07102_50_ivy_M.jpg, W_12748_50_ivy_M.jpg]","[W_17508_80_bold_M.jpg, W_02844_90_hiphop_M.jpg, W_16755_00_metrosexual_M.jpg, W_18424_80_bold_M.jpg, W_16673_70_hippie_M.jpg, W_15423_80_bold_M.jpg, W_06546_60_mods_M.jpg, W_15259_60_mods_M.jpg]"
2,62264,"[W_07260_10_sportivecasual_M.jpg, W_16449_10_sportivecasual_M.jpg, W_16403_10_sportivecasual_M.jpg, W_07098_19_normcore_M.jpg, W_07307_19_normcore_M.jpg, W_05869_60_mods_M.jpg]","[W_16136_80_bold_M.jpg, W_15477_00_metrosexual_M.jpg, W_15159_80_bold_M.jpg, W_16428_90_hiphop_M.jpg, W_16189_50_ivy_M.jpg, W_16189_50_ivy_M.jpg, W_04233_60_mods_M.jpg]"
3,64345,"[W_28449_10_sportivecasual_M.jpg, W_17467_19_normcore_M.jpg, W_00843_10_sportivecasual_M.jpg, W_29596_10_sportivecasual_M.jpg, W_27156_90_hiphop_M.jpg, W_32150_00_metrosexual_M.jpg, W_15856_60_mods_M.jpg, W_24155_60_mods_M.jpg]","[W_24825_80_bold_M.jpg, W_25884_90_hiphop_M.jpg, W_11105_00_metrosexual_M.jpg, W_24352_70_hippie_M.jpg, W_24325_60_mods_M.jpg]"
4,63740,"[W_04201_19_lounge_W.jpg, W_01179_10_sportivecasual_W.jpg, W_11824_70_military_W.jpg, W_14975_60_minimal_W.jpg, W_18894_50_feminine_W.jpg]","[W_05226_10_sportivecasual_W.jpg, W_08607_90_kitsch_W.jpg, W_19264_90_kitsch_W.jpg, W_03656_90_hiphop_W.jpg, W_05353_80_bodyconscious_W.jpg, W_06083_80_bodyconscious_W.jpg, W_07532_70_hippie_W.jpg]"
5,64221,"[W_28698_10_sportivecasual_M.jpg, W_25039_90_hiphop_M.jpg, W_32524_00_metrosexual_M.jpg, W_28728_50_ivy_M.jpg, W_24013_60_mods_M.jpg]","[W_26397_70_hippie_M.jpg, W_25471_70_hippie_M.jpg, W_12130_80_bold_M.jpg, W_28207_90_hiphop_M.jpg, W_17747_80_bold_M.jpg, W_15129_50_ivy_M.jpg, W_07333_70_hippie_M.jpg]"
6,64460,"[W_24294_80_bold_M.jpg, W_24934_90_hiphop_M.jpg, W_26530_00_metrosexual_M.jpg, W_06514_80_bold_M.jpg, W_10792_50_ivy_M.jpg, W_12817_50_ivy_M.jpg, W_15653_60_mods_M.jpg]","[W_25069_90_hiphop_M.jpg, W_31356_80_bold_M.jpg, W_25082_70_hippie_M.jpg, W_30522_00_metrosexual_M.jpg, W_25526_60_mods_M.jpg]"
7,28912,"[W_01754_10_sportivecasual_M.jpg, W_04680_10_sportivecasual_M.jpg, W_02669_50_ivy_M.jpg]","[W_17260_19_normcore_M.jpg, W_16725_70_hippie_M.jpg, W_15745_80_bold_M.jpg, W_04723_90_hiphop_M.jpg, W_15923_80_bold_M.jpg, W_04242_60_mods_M.jpg, W_15246_50_ivy_M.jpg, W_03007_70_hippie_M.jpg]"
8,60184,"[W_01687_19_normcore_M.jpg, W_17427_00_metrosexual_M.jpg, W_17348_50_ivy_M.jpg]","[W_12453_10_sportivecasual_M.jpg, W_12095_80_bold_M.jpg, W_28377_80_bold_M.jpg, W_27913_00_metrosexual_M.jpg, W_25030_70_hippie_M.jpg, W_27819_70_hippie_M.jpg, W_16016_70_hippie_M.jpg, W_04245_70_hippie_M.jpg]"
9,62525,"[W_00760_19_normcore_W.jpg, W_05751_10_sportivecasual_W.jpg, W_19165_00_cityglam_W.jpg, W_03477_80_powersuit_W.jpg, W_01207_60_minimal_W.jpg, W_11223_60_popart_W.jpg, W_13104_50_classic_W.jpg, W_18495_50_feminine_W.jpg]","[W_13667_00_oriental_W.jpg, W_08167_60_minimal_W.jpg, W_07447_50_feminine_W.jpg]"


Unnamed: 0,응답자 ID,Validation 스타일 선호,Validation 스타일 비선호
0,63405,"[W_02677_60_mods_M.jpg, W_01853_60_mods_M.jpg, W_04684_90_hiphop_M.jpg]","[W_12904_50_ivy_M.jpg, W_15140_80_bold_M.jpg, W_12304_80_bold_M.jpg, W_07187_70_hippie_M.jpg]"
1,59642,"[W_05716_19_normcore_W.jpg, W_14706_19_normcore_W.jpg, W_02095_60_popart_W.jpg]","[W_08246_80_bodyconscious_W.jpg, W_00359_90_grunge_W.jpg, W_11444_80_bodyconscious_W.jpg]"
2,63748,"[W_17867_50_ivy_M.jpg, W_00829_10_sportivecasual_M.jpg]","[W_00539_10_sportivecasual_M.jpg, W_10079_60_mods_M.jpg, W_11144_00_metrosexual_M.jpg, W_06955_10_sportivecasual_M.jpg]"
3,63913,[W_06883_60_mods_M.jpg],"[W_10066_50_ivy_M.jpg, W_15843_00_metrosexual_M.jpg, W_15947_80_bold_M.jpg, W_16444_10_sportivecasual_M.jpg, W_05876_70_hippie_M.jpg]"
4,64221,"[W_28925_90_hiphop_M.jpg, W_25086_10_sportivecasual_M.jpg]","[W_17747_80_bold_M.jpg, W_07333_70_hippie_M.jpg, W_26397_70_hippie_M.jpg, W_02936_00_metrosexual_M.jpg]"
5,62155,[W_27854_50_ivy_M.jpg],"[W_17353_50_ivy_M.jpg, W_32383_00_metrosexual_M.jpg, W_06186_60_mods_M.jpg, W_26120_19_normcore_M.jpg]"
6,63479,"[W_04994_60_popart_W.jpg, W_06438_00_ecology_W.jpg, W_18202_60_minimal_W.jpg]","[W_18878_19_genderless_W.jpg, W_11936_00_oriental_W.jpg]"
7,64216,"[W_02816_60_mods_M.jpg, W_15662_19_normcore_M.jpg]","[W_24931_50_ivy_M.jpg, W_24486_70_hippie_M.jpg, W_26288_70_hippie_M.jpg]"
8,7905,[],"[W_17603_50_ivy_M.jpg, W_28909_19_normcore_M.jpg, W_26179_60_mods_M.jpg, W_07025_10_sportivecasual_M.jpg]"
9,21432,"[W_06522_50_ivy_M.jpg, W_15294_50_ivy_M.jpg, W_29023_00_metrosexual_M.jpg]",[W_26397_70_hippie_M.jpg]


NameError: name 'top_preference_df' is not defined


Validation 응답자 스타일 선호 정보:


[3-1]

### 정의
 - 아이템 기반 필터링 (Item-based Filtering): 사용자들이 선호하는 항목 간의 유사성을 분석하여, 사용자가 좋아하는 항목과 유사한 다른 항목을 추천하는 방법이다. 이 방식은 사용자들 간의 유사성보다는 아이템 간의 관계에 초점을 맞춘다.

 - 사용자 기반 필터링 (User-based Filtering): 비슷한 취향을 가진 사용자들 간의 관계를 분석하여, 유사한 사용자가 선호하는 항목을 추천하는 방법이다. 이 방식은 사용자의과거 선호도와 다른 사용자의 행동을 기반으로 추천을 생성한다.  

 #### 1.아이템 기반 필터링
 ##### 적용방법
  - 유사성 측정: 사용자가 선호한 스타일(2)을 바탕으로 스타일 간의 유사성을 계산한다.예를 들어, 코사인 유사도(cosine similarity) 또는 자카드 유사도(Jaccardsimilarity) 등을 사용하여 스타일 간의 관계를 분석한다.
  
  - 추천 생성: 사용자가 선호하는 스타일과 유사한 다른 스타일을 추천한다. 사용자가 A 스타일을 선호한다면, A 스타일과 유사한 B, C, D 스타일을 추천할 수 있다.

 ##### 장점
  - 사용자의 자신의 평가 데이터를 활용하여 추천을 수행하기 때문에 더 관련성 높은 추천제공한다.
   
  - 다양한 항목을 추천할 가능성이 높아 추천 목록의 다양성 더 커진다.

  - 비선호 스타일에 대한 정보가 부족하더라도, 이미 선호한 스타일을 기반으로 추천이 이루어지므로 추천의 질이 비교적 안정적이다.

  - 스타일의 특성을 분석하여 유사한 스타일을 추천함으로써, 사용자의 취향을 잘 반영할 수 있다.

  - 기존 스타일과의 유사성을 활용하여 새로운 스타일을 추천할 수있는 유연성을 가진다.

   ##### 단점
   - 이웃 사용자의 정보가 익명으로 처리되기 때문에 설명하기 어렵다.

   - 스타일 간의 유사성을 제대로 분석하지 못하면 추천의 품질이떨어질 수 있다.

   #### 2.사용자 기반 필터링
   ##### 적용방법
   - 유사 사용자 측정: 사용자의 선호(2)와 비선호(1) 데이터를 바탕으로 유사한 사용자를 찾는다. 이때, 사용자의 비선호 정보도 포함하여 유사성을 분석한다.

   - 추천 생성: 유사한 사용자가 선호하는 스타일을 추천한다. 예를 들어, 사용자가 B 스타일을 비선호하는 경우, 그와 비슷한 사용자가 선호하는 다른 스타일을 추천할 수 있다.

   ##### 장점
   - 사용자의 취향을 반영한 추천이 가능하여, 개인화된 경험을 제공할 수 있다. → 추천의 이유를 명확하게 설명할 수 있다.

   - 비슷한 취향을 가진 사용자들 간의 추천을 통해 다양한 스타일을추천할 수 있다.

   ##### 단점
   - 비선호(1) 스타일에 대한 정보가 많지 않으면, 유사한 사용자를 찾기 어려워질 수 있으며, 이는 추천의 질 저하로 이어질 수 있다.

   - 사용자 수가 적거나 비선호 데이터가 부족할 경우, 유사한 사용자를 찾기가 힘들어질 수 있다. 이는 추천의 신뢰성을 떨어뜨릴 수 있다.

   - 다른 사용자의 평가를 바탕으로 추천이 이루어지므로 정확도가 떨어질 수 있다.

3-2 수정

In [None]:
import os
import re
import numpy as np
import pandas as pd
from PIL import Image
import torch
import torch.nn as nn
from torchvision import models, transforms
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from tqdm import tqdm

# ResNet 특징 추출기 클래스 정의 (fc 레이어 제외)
class ResNetFeatureExtractor(nn.Module):
    def __init__(self, model_path):
        super(ResNetFeatureExtractor, self).__init__()
        resnet = models.resnet18()  # ResNet-18 모델 생성
        state_dict = torch.load(model_path)  # 가중치 로드

        # 모델의 state_dict에서 fc 레이어 관련 키 제거
        state_dict = {k: v for k, v in state_dict.items() if not k.startswith("fc.")}

        resnet.load_state_dict(state_dict, strict=False)  # 가중치를 strict=False로 로드
        self.features = nn.Sequential(*list(resnet.children())[:-2])  # 마지막 fc 레이어 제외

    def forward(self, x):
        return self.features(x)

# 이미지 전처리 파이프라인 정의
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

# 이미지 로드 및 전처리 함수
def load_and_preprocess_image(image_path, device):
    image = Image.open(image_path).convert('RGB')
    image = transform(image).unsqueeze(0).to(device)  # 배치 차원 추가 및 디바이스로 전송
    return image

# 데이터셋에서 이미지들의 특징 벡터를 추출하는 함수
def extract_features_for_dataset(image_paths, feature_extractor, device):
    features = []
    feature_extractor.eval()
    with torch.no_grad():
        for image_path in tqdm(image_paths, desc="Extracting Features"):
            if os.path.exists(image_path):
                image_tensor = load_and_preprocess_image(image_path, device)
                feature_vector = feature_extractor(image_tensor).cpu().numpy().flatten()
                features.append(feature_vector)
    return np.array(features)

# 기타 필요한 함수들 정의 (get_labels, calculate_cosine_similarity, predict_style_preference, calculate_metrics 등)

# ResNet 특징 추출기 초기화 및 가중치 로드
model_path = '/content/drive/MyDrive/데이터 크리에이터 캠프 최종 정리/김진/model_final5.pth'  # 학습된 가중치 경로 설정
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
feature_extractor = ResNetFeatureExtractor(model_path).to(device)
# 파일명에서 스타일(라벨)을 추출하는 함수
image_pattern = re.compile(r'^(W|T)_(\d+)_(.*?)_(.*?)_(W|M)\.jpg$')
def get_labels(image_paths):
    labels = []
    for filename in image_paths:
        match = image_pattern.match(os.path.basename(filename))
        if match:
            _, _, _, style, _ = match.groups()
            labels.append(style)
    return labels

# 코사인 유사도 계산 함수
def calculate_cosine_similarity(val_features, train_features):
    return cosine_similarity(val_features, train_features)

# 유사도를 기반으로 스타일 선호도 예측
def predict_style_preference(similarity_scores, train_preferences, threshold=0.8):
    predicted_preferences = []
    for score_row in similarity_scores:
        max_similarity = np.max(score_row)
        if max_similarity > threshold:
            most_similar_index = np.argmax(score_row)
            predicted_label = train_preferences[most_similar_index]
            predicted_preferences.append(predicted_label)
        else:
            predicted_preferences.append('Unknown')  # 임계값보다 낮으면 Unknown
    return predicted_preferences

# 정확도, 정밀도, 재현율, F1 점수 계산 함수
def calculate_metrics(predicted, actual):
    accuracy = accuracy_score(actual, predicted)
    precision, recall, f1, _ = precision_recall_fscore_support(actual, predicted, average='weighted')
    return accuracy, precision, recall, f1

# 2-2에서 생성한 응답자 선호도 CSV 파일 로드
top_100_file_path = 'top_100_respondents_preferences.csv'
preferences_df = pd.read_csv(top_100_file_path)

# 학습 및 검증 이미지 경로 추출 함수
def extract_image_paths_from_preferences(preference_column, image_dir):
    image_paths = []
    for img_list in preference_column.dropna():
        images = img_list.strip("[]").replace("'", "").split(", ")
        # 이미지 파일 경로로 변환 후 리스트에 추가
        image_paths.extend([os.path.join(image_dir, img.strip()) for img in images if img.strip()])
    return [img_path for img_path in image_paths if os.path.isfile(img_path)]  # 실제 파일만 반환


# 경로 설정
train_image_dir = '/content/drive/MyDrive/dataset/training_image'
val_image_dir = '/content/drive/MyDrive/dataset/validation_image'
model_path = '/content/drive/MyDrive/데이터 크리에이터 캠프 최종 정리/김진/model_final5.pth'  # 학습된 가중치 경로 설정

# 학습 및 검증 이미지 경로 추출
train_image_paths = extract_image_paths_from_preferences(preferences_df['Training 스타일 선호'], train_image_dir)
val_image_paths = extract_image_paths_from_preferences(preferences_df['Validation 스타일 선호'], val_image_dir)

# ResNet 특징 추출기 초기화 및 가중치 로드
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
feature_extractor = ResNetFeatureExtractor(model_path).to(device)

# 학습 및 검증 이미지의 특징 벡터 추출
print("Extracting features for training images...")
train_features = extract_features_for_dataset(train_image_paths, feature_extractor, device)

print("Extracting features for validation images...")
val_features = extract_features_for_dataset(val_image_paths, feature_extractor, device)

# 학습 이미지와 검증 이미지의 라벨 추출
train_labels = get_labels(train_image_paths)
val_labels = get_labels(val_image_paths)

# 학습 이미지의 선호도 (Training 스타일 선호/비선호에 따라 레이블 생성)
train_preferences = [1 if label in preferences_df['Training 스타일 선호'].values else 0 for label in train_labels]

# 코사인 유사도 계산 및 스타일 선호도 예측
print("Calculating similarity...")
similarity_scores = calculate_cosine_similarity(val_features, train_features)

print("Predicting preferences...")
predicted_preferences = predict_style_preference(similarity_scores, train_preferences)

# 정확도, 정밀도, 재현율, F1 점수 계산
accuracy, precision, recall, f1 = calculate_metrics(predicted_preferences, val_labels)

# 결과 출력
print(f"Prediction Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-Score: {f1:.4f}")


  state_dict = torch.load(model_path)  # 가중치 로드
  state_dict = torch.load(model_path)  # 가중치 로드


Extracting features for training images...


Extracting Features:  12%|█▏        | 40/325 [00:30<03:23,  1.40it/s]

## 속도 개선

In [None]:
import os
import re
import numpy as np
import pandas as pd
from PIL import Image
import torch
import torch.nn as nn
from torchvision import models, transforms
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from tqdm import tqdm

# ResNet 특징 추출기 클래스 정의 (fc 레이어 제외)
class ResNetFeatureExtractor(nn.Module):
    def __init__(self, model_path):
        super(ResNetFeatureExtractor, self).__init__()
        resnet = models.resnet18()  # ResNet-18 모델 생성
        state_dict = torch.load(model_path)  # 가중치 로드

        # 모델의 state_dict에서 fc 레이어 관련 키 제거
        state_dict = {k: v for k, v in state_dict.items() if not k.startswith("fc.")}

        resnet.load_state_dict(state_dict, strict=False)  # 가중치를 strict=False로 로드
        self.features = nn.Sequential(*list(resnet.children())[:-2])  # 마지막 fc 레이어 제외

    def forward(self, x):
        return self.features(x)

# 이미지 전처리 파이프라인 정의
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

# 이미지 로드 및 전처리 함수
def load_and_preprocess_image(image_path, device):
    image = Image.open(image_path).convert('RGB')
    image = transform(image).unsqueeze(0).to(device)  # 배치 차원 추가 및 디바이스로 전송
    return image

# 데이터셋에서 이미지들의 특징 벡터를 추출하는 함수
def extract_features_for_dataset(image_paths, feature_extractor, device):
    features = []
    feature_extractor.eval()
    with torch.no_grad():
        for image_path in tqdm(image_paths, desc="Extracting Features"):
            if os.path.exists(image_path):
                image_tensor = load_and_preprocess_image(image_path, device)
                feature_vector = feature_extractor(image_tensor).squeeze().cpu().numpy()
                features.append(feature_vector)
    return np.array(features)

# 파일명에서 스타일(라벨)을 추출하는 함수
image_pattern = re.compile(r'^(W|T)_(\d+)_(.*?)_(.*?)_(W|M)\.jpg$')
def get_labels(image_paths):
    labels = []
    for filename in image_paths:
        match = image_pattern.match(os.path.basename(filename))
        if match:
            _, _, _, style, _ = match.groups()
            labels.append(style)
    return labels

# GPU에서 코사인 유사도 계산 함수
def calculate_cosine_similarity_gpu(val_features, train_features):
    # numpy 배열을 torch 텐서로 변환
    val_features = torch.tensor(val_features).to(device)
    train_features = torch.tensor(train_features).to(device)

    # 2차원으로 변환 (만약 3차원 이상의 텐서인 경우)
    if val_features.dim() > 2:
        val_features = val_features.view(val_features.size(0), -1)
    if train_features.dim() > 2:
        train_features = train_features.view(train_features.size(0), -1)

    # 정규화하여 벡터를 단위 벡터로 변환 (L2 노멀라이즈)
    val_norm = val_features / val_features.norm(dim=1, keepdim=True)
    train_norm = train_features / train_features.norm(dim=1, keepdim=True)

    # 코사인 유사도 계산
    similarity_scores = torch.mm(val_norm, train_norm.T).cpu().numpy()
    return similarity_scores


# 유사도를 기반으로 스타일 선호도 예측
def predict_style_preference(similarity_scores, train_preferences, threshold=0.8):
    predicted_preferences = []
    for score_row in similarity_scores:
        max_similarity = np.max(score_row)
        if max_similarity > threshold:
            most_similar_index = np.argmax(score_row)
            predicted_label = train_preferences[most_similar_index]
            predicted_preferences.append(predicted_label)
        else:
            predicted_preferences.append('Unknown')  # 임계값보다 낮으면 Unknown
    return predicted_preferences

# 정확도, 정밀도, 재현율, F1 점수 계산 함수
def calculate_metrics(predicted, actual):
    accuracy = accuracy_score(actual, predicted)
    precision, recall, f1, _ = precision_recall_fscore_support(actual, predicted, average='weighted')
    return accuracy, precision, recall, f1

# 2-2에서 생성한 응답자 선호도 CSV 파일 로드
top_100_file_path = '/content/drive/MyDrive/데이터 크리에이터 캠프 최종 정리/김진/top_100_respondents_preferences.csv'
preferences_df = pd.read_csv(top_100_file_path)

# 학습 및 검증 이미지 경로 추출 함수
def extract_image_paths_from_preferences(preference_column, image_dir):
    image_paths = []
    for img_list in preference_column.dropna():
        images = img_list.strip("[]").replace("'", "").split(", ")
        # 이미지 파일 경로로 변환 후 리스트에 추가
        image_paths.extend([os.path.join(image_dir, img.strip()) for img in images if img.strip()])
    return [img_path for img_path in image_paths if os.path.isfile(img_path)]  # 실제 파일만 반환

# 경로 설정
train_image_dir = '/content/drive/MyDrive/dataset/training_image'
val_image_dir = '/content/drive/MyDrive/dataset/validation_image'
model_path = '/content/drive/MyDrive/데이터 크리에이터 캠프 최종 정리/김진/model_final5.pth'  # 학습된 가중치 경로 설정

# 학습 및 검증 이미지 경로 추출
train_image_paths = extract_image_paths_from_preferences(preferences_df['Training 스타일 선호'], train_image_dir)
val_image_paths = extract_image_paths_from_preferences(preferences_df['Validation 스타일 선호'], val_image_dir)

# ResNet 특징 추출기 초기화 및 가중치 로드
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
feature_extractor = ResNetFeatureExtractor(model_path).to(device)

# 학습 및 검증 이미지의 특징 벡터 추출
print("Extracting features for training images...")
train_features = extract_features_for_dataset(train_image_paths, feature_extractor, device)

print("Extracting features for validation images...")
val_features = extract_features_for_dataset(val_image_paths, feature_extractor, device)

# 학습 이미지와 검증 이미지의 라벨 추출
train_labels = get_labels(train_image_paths)
val_labels = get_labels(val_image_paths)

# 학습 이미지의 선호도 (Training 스타일 선호/비선호에 따라 레이블 생성)
train_preferences = [1 if label in preferences_df['Training 스타일 선호'].values else 0 for label in train_labels]

# GPU에서 코사인 유사도 계산 및 스타일 선호도 예측
print("Calculating similarity...")
similarity_scores = calculate_cosine_similarity_gpu(val_features, train_features)

print("Predicting preferences...")
predicted_preferences = predict_style_preference(similarity_scores, train_preferences)

# 정확도, 정밀도, 재현율, F1 점수 계산
accuracy, precision, recall, f1 = calculate_metrics(predicted_preferences, val_labels)

# 결과 출력
print(f"Prediction Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-Score: {f1:.4f}")


  state_dict = torch.load(model_path)  # 가중치 로드


Extracting features for training images...


Extracting Features: 100%|██████████| 325/325 [01:38<00:00,  3.31it/s]


Extracting features for validation images...


Extracting Features: 100%|██████████| 114/114 [00:35<00:00,  3.23it/s]

Calculating similarity...
Predicting preferences...





ValueError: Mix of label input types (string and number)

In [None]:
import os
import re
import numpy as np
import pandas as pd
from PIL import Image
import torch
import torch.nn as nn
from torchvision import models, transforms
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from tqdm import tqdm

# ResNet 특징 추출기 클래스 정의 (fc 레이어 제외)
class ResNetFeatureExtractor(nn.Module):
    def __init__(self, model_path):
        super(ResNetFeatureExtractor, self).__init__()
        resnet = models.resnet18()  # ResNet-18 모델 생성
        state_dict = torch.load(model_path)  # 가중치 로드

        # 모델의 state_dict에서 fc 레이어 관련 키 제거
        state_dict = {k: v for k, v in state_dict.items() if not k.startswith("fc.")}

        resnet.load_state_dict(state_dict, strict=False)  # 가중치를 strict=False로 로드
        self.features = nn.Sequential(*list(resnet.children())[:-2])  # 마지막 fc 레이어 제외

    def forward(self, x):
        return self.features(x)

# 이미지 전처리 파이프라인 정의
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

# 이미지 로드 및 전처리 함수
def load_and_preprocess_image(image_path, device):
    image = Image.open(image_path).convert('RGB')
    image = transform(image).unsqueeze(0).to(device)  # 배치 차원 추가 및 디바이스로 전송
    return image

# 데이터셋에서 이미지들의 특징 벡터를 추출하는 함수
def extract_features_for_dataset(image_paths, feature_extractor, device):
    features = []
    feature_extractor.eval()
    with torch.no_grad():
        for image_path in tqdm(image_paths, desc="Extracting Features"):
            if os.path.exists(image_path):
                image_tensor = load_and_preprocess_image(image_path, device)
                feature_vector = feature_extractor(image_tensor).squeeze().cpu().numpy()
                features.append(feature_vector)
    return np.array(features)

# 파일명에서 스타일(라벨)을 추출하는 함수
image_pattern = re.compile(r'^(W|T)_(\d+)_(.*?)_(.*?)_(W|M)\.jpg$')
def get_labels(image_paths):
    labels = []
    for filename in image_paths:
        match = image_pattern.match(os.path.basename(filename))
        if match:
            _, _, _, style, _ = match.groups()
            labels.append(style)
    return labels

# GPU에서 코사인 유사도 계산 함수
def calculate_cosine_similarity_gpu(val_features, train_features):
    # numpy 배열을 torch 텐서로 변환
    val_features = torch.tensor(val_features).to(device)
    train_features = torch.tensor(train_features).to(device)

    # 2차원으로 변환 (3차원 이상의 텐서인 경우)
    if val_features.dim() > 2:
        val_features = val_features.view(val_features.size(0), -1)
    if train_features.dim() > 2:
        train_features = train_features.view(train_features.size(0), -1)

    # 정규화하여 벡터를 단위 벡터로 변환 (L2 노멀라이즈)
    val_norm = val_features / val_features.norm(dim=1, keepdim=True)
    train_norm = train_features / train_features.norm(dim=1, keepdim=True)

    # 코사인 유사도 계산
    similarity_scores = torch.mm(val_norm, train_norm.T).cpu().numpy()
    return similarity_scores

# 유사도를 기반으로 스타일 선호도 예측
def predict_style_preference(similarity_scores, train_preferences, threshold=0.8):
    predicted_preferences = []
    for score_row in similarity_scores:
        max_similarity = np.max(score_row)
        if max_similarity > threshold:
            most_similar_index = np.argmax(score_row)
            predicted_label = train_preferences[most_similar_index]
            predicted_preferences.append(predicted_label)
        else:
            predicted_preferences.append(0)  # 임계값보다 낮으면 비선호로 처리
    return predicted_preferences

# 정확도, 정밀도, 재현율, F1 점수 계산 함수
def calculate_metrics(predicted, actual):
    accuracy = accuracy_score(actual, predicted)
    precision, recall, f1, _ = precision_recall_fscore_support(actual, predicted, average='weighted')
    return accuracy, precision, recall, f1

# 2-2에서 생성한 응답자 선호도 CSV 파일 로드
top_100_file_path = '/content/drive/MyDrive/데이터 크리에이터 캠프 최종 정리/김진/top_100_respondents_preferences.csv'
preferences_df = pd.read_csv(top_100_file_path)

# 학습 및 검증 이미지 경로 추출 함수
def extract_image_paths_from_preferences(preference_column, image_dir):
    image_paths = []
    for img_list in preference_column.dropna():
        images = img_list.strip("[]").replace("'", "").split(", ")
        # 이미지 파일 경로로 변환 후 리스트에 추가
        image_paths.extend([os.path.join(image_dir, img.strip()) for img in images if img.strip()])
    return [img_path for img_path in image_paths if os.path.isfile(img_path)]  # 실제 파일만 반환

# 경로 설정
train_image_dir = '/content/drive/MyDrive/dataset/training_image'
val_image_dir = '/content/drive/MyDrive/dataset/validation_image'
model_path = '/content/drive/MyDrive/데이터 크리에이터 캠프 최종 정리/김진/model_final5.pth'  # 학습된 가중치 경로 설정

# 학습 및 검증 이미지 경로 추출
train_image_paths = extract_image_paths_from_preferences(preferences_df['Training 스타일 선호'], train_image_dir)
val_image_paths = extract_image_paths_from_preferences(preferences_df['Validation 스타일 선호'], val_image_dir)

# ResNet 특징 추출기 초기화 및 가중치 로드
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
feature_extractor = ResNetFeatureExtractor(model_path).to(device)

# 학습 및 검증 이미지의 특징 벡터 추출
print("Extracting features for training images...")
train_features = extract_features_for_dataset(train_image_paths, feature_extractor, device)

print("Extracting features for validation images...")
val_features = extract_features_for_dataset(val_image_paths, feature_extractor, device)

# 학습 이미지와 검증 이미지의 라벨 추출
train_labels = get_labels(train_image_paths)
val_labels = get_labels(val_image_paths)

# 학습 이미지의 선호도 레이블 (숫자 형식으로 일관되게 변환)
train_preferences = [1 if label in preferences_df['Training 스타일 선호'].values else 0 for label in train_labels]

# 검증 이미지의 라벨도 숫자 형식으로 변환
val_preferences = [1 if label in preferences_df['Validation 스타일 선호'].values else 0 for label in val_labels]

# GPU에서 코사인 유사도 계산 및 스타일 선호도 예측
print("Calculating similarity...")
similarity_scores = calculate_cosine_similarity_gpu(val_features, train_features)

print("Predicting preferences...")
predicted_preferences = predict_style_preference(similarity_scores, train_preferences)

# 정확도, 정밀도, 재현율, F1 점수 계산
accuracy, precision, recall, f1 = calculate_metrics(predicted_preferences, val_preferences)

# 결과 출력
print(f"Prediction Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-Score: {f1:.4f}")


  state_dict = torch.load(model_path)  # 가중치 로드


Extracting features for training images...


Extracting Features: 100%|██████████| 325/325 [01:36<00:00,  3.37it/s]


Extracting features for validation images...


Extracting Features: 100%|██████████| 114/114 [00:33<00:00,  3.37it/s]

Calculating similarity...
Predicting preferences...
Prediction Accuracy: 1.0000
Precision: 1.0000
Recall: 1.0000
F1-Score: 1.0000





3-2 중복 확인

In [None]:
import os
import re
import numpy as np
import pandas as pd
from PIL import Image
import torch
import torch.nn as nn
from torchvision import models, transforms
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from tqdm import tqdm

# ResNet 특징 추출기 클래스 정의 (fc 레이어 제외)
class ResNetFeatureExtractor(nn.Module):
    def __init__(self, model_path):
        super(ResNetFeatureExtractor, self).__init__()
        resnet = models.resnet18()  # ResNet-18 모델 생성
        state_dict = torch.load(model_path)  # 가중치 로드

        # 모델의 state_dict에서 fc 레이어 관련 키 제거
        state_dict = {k: v for k, v in state_dict.items() if not k.startswith("fc.")}

        resnet.load_state_dict(state_dict, strict=False)  # 가중치를 strict=False로 로드
        self.features = nn.Sequential(*list(resnet.children())[:-2])  # 마지막 fc 레이어 제외

    def forward(self, x):
        return self.features(x)

# 이미지 전처리 파이프라인 정의
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

# 이미지 로드 및 전처리 함수
def load_and_preprocess_image(image_path, device):
    image = Image.open(image_path).convert('RGB')
    image = transform(image).unsqueeze(0).to(device)  # 배치 차원 추가 및 디바이스로 전송
    return image

# 데이터셋에서 이미지들의 특징 벡터를 추출하는 함수
def extract_features_for_dataset(image_paths, feature_extractor, device):
    features = []
    feature_extractor.eval()
    with torch.no_grad():
        for image_path in tqdm(image_paths, desc="Extracting Features"):
            if os.path.exists(image_path):
                image_tensor = load_and_preprocess_image(image_path, device)
                feature_vector = feature_extractor(image_tensor).squeeze().cpu().numpy()
                features.append(feature_vector)
    return np.array(features)

# 파일명에서 스타일(라벨)을 추출하는 함수
image_pattern = re.compile(r'^(W|T)_(\d+)_(.*?)_(.*?)_(W|M)\.jpg$')
def get_labels(image_paths):
    labels = []
    for filename in image_paths:
        match = image_pattern.match(os.path.basename(filename))
        if match:
            _, _, _, style, _ = match.groups()
            labels.append(style)
    return labels

# GPU에서 코사인 유사도 계산 함수
def calculate_cosine_similarity_gpu(val_features, train_features):
    # numpy 배열을 torch 텐서로 변환
    val_features = torch.tensor(val_features).to(device)
    train_features = torch.tensor(train_features).to(device)

    # 2차원으로 변환 (3차원 이상의 텐서인 경우)
    if val_features.dim() > 2:
        val_features = val_features.view(val_features.size(0), -1)
    if train_features.dim() > 2:
        train_features = train_features.view(train_features.size(0), -1)

    # 정규화하여 벡터를 단위 벡터로 변환 (L2 노멀라이즈)
    val_norm = val_features / val_features.norm(dim=1, keepdim=True)
    train_norm = train_features / train_features.norm(dim=1, keepdim=True)

    # 코사인 유사도 계산
    similarity_scores = torch.mm(val_norm, train_norm.T).cpu().numpy()
    return similarity_scores


# 유사도를 기반으로 스타일 선호도 예측
def predict_style_preference(similarity_scores, train_preferences, threshold=0.8):
    predicted_preferences = []
    for score_row in similarity_scores:
        max_similarity = np.max(score_row)
        if max_similarity > threshold:
            most_similar_index = np.argmax(score_row)
            predicted_label = train_preferences[most_similar_index]
            predicted_preferences.append(predicted_label)
        else:
            predicted_preferences.append(0)  # 임계값보다 낮으면 비선호로 처리
    return predicted_preferences

# 정확도, 정밀도, 재현율, F1 점수 계산 함수
def calculate_metrics(predicted, actual):
    accuracy = accuracy_score(actual, predicted)
    precision, recall, f1, _ = precision_recall_fscore_support(actual, predicted, average='weighted')
    return accuracy, precision, recall, f1

# 2-2에서 생성한 응답자 선호도 CSV 파일 로드
top_100_file_path = '/content/drive/MyDrive/데이터 크리에이터 캠프 최종 정리/김진/top_100_respondents_preferences.csv'
preferences_df = pd.read_csv(top_100_file_path)

# 학습 및 검증 이미지 경로 추출 함수
def extract_image_paths_from_preferences(preference_column, image_dir):
    image_paths = []
    for img_list in preference_column.dropna():
        images = img_list.strip("[]").replace("'", "").split(", ")
        # 이미지 파일 경로로 변환 후 리스트에 추가
        image_paths.extend([os.path.join(image_dir, img.strip()) for img in images if img.strip()])
    return [img_path for img_path in image_paths if os.path.isfile(img_path)]  # 실제 파일만 반환

# 경로 설정
train_image_dir = '/content/drive/MyDrive/dataset/training_image'
val_image_dir = '/content/drive/MyDrive/dataset/validation_image'
model_path = '/content/drive/MyDrive/데이터 크리에이터 캠프 최종 정리/김진/model_final5.pth'  # 학습된 가중치 경로 설정

# 학습 및 검증 이미지 경로 추출
train_image_paths = extract_image_paths_from_preferences(preferences_df['Training 스타일 선호'], train_image_dir)
val_image_paths = extract_image_paths_from_preferences(preferences_df['Validation 스타일 선호'], val_image_dir)

# 중복 데이터 확인
common_images = set(train_image_paths) & set(val_image_paths)
print(f"Train과 Validation 데이터셋 간 중복 이미지 수: {len(common_images)}")

if common_images:
    print("공통 이미지 파일:")
    print(common_images)

# ResNet 특징 추출기 초기화 및 가중치 로드
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
feature_extractor = ResNetFeatureExtractor(model_path).to(device)

# 학습 및 검증 이미지의 특징 벡터 추출
print("Extracting features for training images...")
train_features = extract_features_for_dataset(train_image_paths, feature_extractor, device)

print("Extracting features for validation images...")
val_features = extract_features_for_dataset(val_image_paths, feature_extractor, device)

# 학습 이미지와 검증 이미지의 라벨 추출
train_labels = get_labels(train_image_paths)
val_labels = get_labels(val_image_paths)

# 학습 이미지의 선호도 레이블 (숫자 형식으로 일관되게 변환)
train_preferences = [1 if label in preferences_df['Training 스타일 선호'].values else 0 for label in train_labels]

# 검증 이미지의 라벨도 숫자 형식으로 변환
val_preferences = [1 if label in preferences_df['Validation 스타일 선호'].values else 0 for label in val_labels]

# GPU에서 코사인 유사도 계산 및 스타일 선호도 예측
print("Calculating similarity...")
similarity_scores = calculate_cosine_similarity_gpu(val_features, train_features)

print("Predicting preferences...")
predicted_preferences = predict_style_preference(similarity_scores, train_preferences)

# 정확도, 정밀도, 재현율, F1 점수 계산
accuracy, precision, recall, f1 = calculate_metrics(predicted_preferences, val_preferences)

# 결과 출력
print(f"Prediction Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-Score: {f1:.4f}")


Train과 Validation 데이터셋 간 중복 이미지 수: 0


  state_dict = torch.load(model_path)  # 가중치 로드


Extracting features for training images...


Extracting Features: 100%|██████████| 325/325 [01:36<00:00,  3.36it/s]


Extracting features for validation images...


Extracting Features: 100%|██████████| 114/114 [00:33<00:00,  3.42it/s]

Calculating similarity...
Predicting preferences...
Prediction Accuracy: 1.0000
Precision: 1.0000
Recall: 1.0000
F1-Score: 1.0000





3-2 레이블 확인

In [None]:
import os
import re
import numpy as np
import pandas as pd
from PIL import Image
import torch
import torch.nn as nn
from torchvision import models, transforms
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from tqdm import tqdm

# ResNet 특징 추출기 클래스 정의 (fc 레이어 제외)
class ResNetFeatureExtractor(nn.Module):
    def __init__(self, model_path):
        super(ResNetFeatureExtractor, self).__init__()
        resnet = models.resnet18()  # ResNet-18 모델 생성
        state_dict = torch.load(model_path)  # 가중치 로드

        # 모델의 state_dict에서 fc 레이어 관련 키 제거
        state_dict = {k: v for k, v in state_dict.items() if not k.startswith("fc.")}

        resnet.load_state_dict(state_dict, strict=False)  # 가중치를 strict=False로 로드
        self.features = nn.Sequential(*list(resnet.children())[:-2])  # 마지막 fc 레이어 제외

    def forward(self, x):
        return self.features(x)

# 이미지 전처리 파이프라인 정의
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

# 이미지 로드 및 전처리 함수
def load_and_preprocess_image(image_path, device):
    image = Image.open(image_path).convert('RGB')
    image = transform(image).unsqueeze(0).to(device)  # 배치 차원 추가 및 디바이스로 전송
    return image

# 데이터셋에서 이미지들의 특징 벡터를 추출하는 함수
def extract_features_for_dataset(image_paths, feature_extractor, device):
    features = []
    feature_extractor.eval()
    with torch.no_grad():
        for image_path in tqdm(image_paths, desc="Extracting Features"):
            if os.path.exists(image_path):
                image_tensor = load_and_preprocess_image(image_path, device)
                feature_vector = feature_extractor(image_tensor).squeeze().cpu().numpy()
                features.append(feature_vector)
    return np.array(features)

# 파일명에서 스타일(라벨)을 추출하는 함수
image_pattern = re.compile(r'^(W|T)_(\d+)_(.*?)_(.*?)_(W|M)\.jpg$')
def get_labels(image_paths):
    labels = []
    for filename in image_paths:
        match = image_pattern.match(os.path.basename(filename))
        if match:
            _, _, _, style, _ = match.groups()
            labels.append(style)
    return labels

# GPU에서 코사인 유사도 계산 함수
def calculate_cosine_similarity_gpu(val_features, train_features):
    # numpy 배열을 torch 텐서로 변환
    val_features = torch.tensor(val_features).to(device)
    train_features = torch.tensor(train_features).to(device)

    # 2차원으로 변환 (3차원 이상의 텐서인 경우)
    if val_features.dim() > 2:
        val_features = val_features.view(val_features.size(0), -1)
    if train_features.dim() > 2:
        train_features = train_features.view(train_features.size(0), -1)

    # 정규화하여 벡터를 단위 벡터로 변환 (L2 노멀라이즈)
    val_norm = val_features / val_features.norm(dim=1, keepdim=True)
    train_norm = train_features / train_features.norm(dim=1, keepdim=True)

    # 코사인 유사도 계산
    similarity_scores = torch.mm(val_norm, train_norm.T).cpu().numpy()
    return similarity_scores


# 유사도를 기반으로 스타일 선호도 예측
def predict_style_preference(similarity_scores, train_preferences, threshold=0.6):
    predicted_preferences = []
    for score_row in similarity_scores:
        max_similarity = np.max(score_row)
        if max_similarity > threshold:
            most_similar_index = np.argmax(score_row)
            predicted_label = train_preferences[most_similar_index]
            predicted_preferences.append(predicted_label)
        else:
            predicted_preferences.append(0)  # 임계값보다 낮으면 비선호로 처리
    return predicted_preferences

# 정확도, 정밀도, 재현율, F1 점수 계산 함수
def calculate_metrics(predicted, actual):
    accuracy = accuracy_score(actual, predicted)
    precision, recall, f1, _ = precision_recall_fscore_support(actual, predicted, average='weighted')
    return accuracy, precision, recall, f1

# 2-2에서 생성한 응답자 선호도 CSV 파일 로드
top_100_file_path = '/content/drive/MyDrive/데이터 크리에이터 캠프 최종 정리/김진/top_100_respondents_preferences.csv'
preferences_df = pd.read_csv(top_100_file_path)

# 학습 및 검증 이미지 경로 추출 함수
def extract_image_paths_from_preferences(preference_column, image_dir):
    image_paths = []
    for img_list in preference_column.dropna():
        images = img_list.strip("[]").replace("'", "").split(", ")
        # 이미지 파일 경로로 변환 후 리스트에 추가
        image_paths.extend([os.path.join(image_dir, img.strip()) for img in images if img.strip()])
    return [img_path for img_path in image_paths if os.path.isfile(img_path)]  # 실제 파일만 반환

# 경로 설정
train_image_dir = '/content/drive/MyDrive/dataset/training_image'
val_image_dir = '/content/drive/MyDrive/dataset/validation_image'
model_path = '/content/drive/MyDrive/데이터 크리에이터 캠프 최종 정리/김진/model_final5.pth'  # 학습된 가중치 경로 설정

# 학습 및 검증 이미지 경로 추출
train_image_paths = extract_image_paths_from_preferences(preferences_df['Training 스타일 선호'], train_image_dir)
val_image_paths = extract_image_paths_from_preferences(preferences_df['Validation 스타일 선호'], val_image_dir)

# 중복 데이터 확인
common_images = set(train_image_paths) & set(val_image_paths)
print(f"Train과 Validation 데이터셋 간 중복 이미지 수: {len(common_images)}")

if common_images:
    print("공통 이미지 파일:")
    print(common_images)

# ResNet 특징 추출기 초기화 및 가중치 로드
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
feature_extractor = ResNetFeatureExtractor(model_path).to(device)

# 학습 및 검증 이미지의 특징 벡터 추출
print("Extracting features for training images...")
train_features = extract_features_for_dataset(train_image_paths, feature_extractor, device)

print("Extracting features for validation images...")
val_features = extract_features_for_dataset(val_image_paths, feature_extractor, device)

# 학습 이미지와 검증 이미지의 라벨 추출
train_labels = get_labels(train_image_paths)
val_labels = get_labels(val_image_paths)

# 학습 이미지의 선호도 레이블 (숫자 형식으로 일관되게 변환)
train_preferences = [1 if label in preferences_df['Training 스타일 선호'].values else 0 for label in train_labels]

# 검증 이미지의 라벨도 숫자 형식으로 변환
val_preferences = [1 if label in preferences_df['Validation 스타일 선호'].values else 0 for label in val_labels]

# GPU에서 코사인 유사도 계산 및 스타일 선호도 예측
print("Calculating similarity...")
similarity_scores = calculate_cosine_similarity_gpu(val_features, train_features)

print("Predicting preferences...")
predicted_preferences = predict_style_preference(similarity_scores, train_preferences)

# 디버깅을 위한 유사도, 예측 값, 실제 값 출력
print("Similarity scores sample:")
print(similarity_scores[:5])  # 유사도 값 일부 샘플 출력

print("Predicted Preferences Sample:", predicted_preferences[:10])
print("Validation Preferences Sample:", val_preferences[:10])

# 정확도, 정밀도, 재현율, F1 점수 계산
accuracy, precision, recall, f1 = calculate_metrics(predicted_preferences, val_preferences)

# 결과 출력
print(f"Prediction Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-Score: {f1:.4f}")

# 레이블 비교를 위한 차이점 계산
diff_count = sum([1 for pred, actual in zip(predicted_preferences, val_preferences) if pred != actual])
print(f"Prediction and validation labels difference count: {diff_count}")


Train과 Validation 데이터셋 간 중복 이미지 수: 0


  state_dict = torch.load(model_path)  # 가중치 로드


Extracting features for training images...


Extracting Features: 100%|██████████| 325/325 [01:38<00:00,  3.28it/s]


Extracting features for validation images...


Extracting Features: 100%|██████████| 114/114 [00:32<00:00,  3.47it/s]

Calculating similarity...
Predicting preferences...
Similarity scores sample:
[[0.8853961  0.81390333 1.0000001  ... 0.9012476  0.8740969  0.85757166]
 [0.87628806 0.87581486 0.8823871  ... 0.88525504 0.88264567 0.8860845 ]
 [0.907057   0.8814254  0.8661924  ... 0.91906625 0.9227008  0.88517034]
 [0.9006591  0.8929478  0.8804139  ... 0.92253864 0.9202245  0.87378615]
 [0.8648086  0.89711004 0.85186684 ... 0.88210243 0.9062601  0.8720805 ]]
Predicted Preferences Sample: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
Validation Preferences Sample: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
Prediction Accuracy: 1.0000
Precision: 1.0000
Recall: 1.0000
F1-Score: 1.0000
Prediction and validation labels difference count: 0





train_preferences와 val_preferences의 값을 점검하는 코드

In [None]:
import os
import re
import numpy as np
import pandas as pd
from PIL import Image
import torch
import torch.nn as nn
from torchvision import models, transforms
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from tqdm import tqdm

# ResNet 특징 추출기 클래스 정의 (fc 레이어 제외)
class ResNetFeatureExtractor(nn.Module):
    def __init__(self, model_path):
        super(ResNetFeatureExtractor, self).__init__()
        resnet = models.resnet18()  # ResNet-18 모델 생성
        state_dict = torch.load(model_path)  # 가중치 로드

        # 모델의 state_dict에서 fc 레이어 관련 키 제거
        state_dict = {k: v for k, v in state_dict.items() if not k.startswith("fc.")}

        resnet.load_state_dict(state_dict, strict=False)  # 가중치를 strict=False로 로드
        self.features = nn.Sequential(*list(resnet.children())[:-2])  # 마지막 fc 레이어 제외

    def forward(self, x):
        return self.features(x)

# 이미지 전처리 파이프라인 정의
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

# 이미지 로드 및 전처리 함수
def load_and_preprocess_image(image_path, device):
    image = Image.open(image_path).convert('RGB')
    image = transform(image).unsqueeze(0).to(device)  # 배치 차원 추가 및 디바이스로 전송
    return image

# 데이터셋에서 이미지들의 특징 벡터를 추출하는 함수
def extract_features_for_dataset(image_paths, feature_extractor, device):
    features = []
    feature_extractor.eval()
    with torch.no_grad():
        for image_path in tqdm(image_paths, desc="Extracting Features"):
            if os.path.exists(image_path):
                image_tensor = load_and_preprocess_image(image_path, device)
                feature_vector = feature_extractor(image_tensor).squeeze().cpu().numpy()
                features.append(feature_vector)
    return np.array(features)

# 파일명에서 스타일(라벨)을 추출하는 함수
image_pattern = re.compile(r'^(W|T)_(\d+)_(.*?)_(.*?)_(W|M)\.jpg$')
def get_labels(image_paths):
    labels = []
    for filename in image_paths:
        match = image_pattern.match(os.path.basename(filename))
        if match:
            _, _, _, style, _ = match.groups()
            labels.append(style)
    return labels

# GPU에서 코사인 유사도 계산 함수
def calculate_cosine_similarity_gpu(val_features, train_features):
    # numpy 배열을 torch 텐서로 변환
    val_features = torch.tensor(val_features).to(device)
    train_features = torch.tensor(train_features).to(device)

    # 2차원으로 변환 (3차원 이상의 텐서인 경우)
    if val_features.dim() > 2:
        val_features = val_features.view(val_features.size(0), -1)
    if train_features.dim() > 2:
        train_features = train_features.view(train_features.size(0), -1)

    # 정규화하여 벡터를 단위 벡터로 변환 (L2 노멀라이즈)
    val_norm = val_features / val_features.norm(dim=1, keepdim=True)
    train_norm = train_features / train_features.norm(dim=1, keepdim=True)

    # 코사인 유사도 계산
    similarity_scores = torch.mm(val_norm, train_norm.T).cpu().numpy()
    return similarity_scores

# 유사도를 기반으로 스타일 선호도 예측
def predict_style_preference(similarity_scores, train_preferences, threshold=0.6):
    predicted_preferences = []
    for score_row in similarity_scores:
        max_similarity = np.max(score_row)
        if max_similarity > threshold:
            most_similar_index = np.argmax(score_row)
            predicted_label = train_preferences[most_similar_index]
            predicted_preferences.append(predicted_label)
        else:
            predicted_preferences.append(0)  # 임계값보다 낮으면 비선호로 처리
    return predicted_preferences

# 정확도, 정밀도, 재현율, F1 점수 계산 함수
def calculate_metrics(predicted, actual):
    accuracy = accuracy_score(actual, predicted)
    precision, recall, f1, _ = precision_recall_fscore_support(actual, predicted, average='weighted')
    return accuracy, precision, recall, f1

# 2-2에서 생성한 응답자 선호도 CSV 파일 로드
top_100_file_path = '/content/drive/MyDrive/데이터 크리에이터 캠프 최종 정리/김진/top_100_respondents_preferences.csv'
preferences_df = pd.read_csv(top_100_file_path)

# 학습 및 검증 이미지 경로 추출 함수
def extract_image_paths_from_preferences(preference_column, image_dir):
    image_paths = []
    for img_list in preference_column.dropna():
        images = img_list.strip("[]").replace("'", "").split(", ")
        # 이미지 파일 경로로 변환 후 리스트에 추가
        image_paths.extend([os.path.join(image_dir, img.strip()) for img in images if img.strip()])
    return [img_path for img_path in image_paths if os.path.isfile(img_path)]  # 실제 파일만 반환

# 경로 설정
train_image_dir = '/content/drive/MyDrive/dataset/training_image'
val_image_dir = '/content/drive/MyDrive/dataset/validation_image'
model_path = '/content/drive/MyDrive/데이터 크리에이터 캠프 최종 정리/김진/model_final5.pth'  # 학습된 가중치 경로 설정

# 학습 및 검증 이미지 경로 추출
train_image_paths = extract_image_paths_from_preferences(preferences_df['Training 스타일 선호'], train_image_dir)
val_image_paths = extract_image_paths_from_preferences(preferences_df['Validation 스타일 선호'], val_image_dir)

# 중복 데이터 확인
common_images = set(train_image_paths) & set(val_image_paths)
print(f"Train과 Validation 데이터셋 간 중복 이미지 수: {len(common_images)}")

# ResNet 특징 추출기 초기화 및 가중치 로드
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
feature_extractor = ResNetFeatureExtractor(model_path).to(device)

# 학습 및 검증 이미지의 특징 벡터 추출
print("Extracting features for training images...")
train_features = extract_features_for_dataset(train_image_paths, feature_extractor, device)

print("Extracting features for validation images...")
val_features = extract_features_for_dataset(val_image_paths, feature_extractor, device)

# 학습 이미지와 검증 이미지의 라벨 추출
train_labels = get_labels(train_image_paths)
val_labels = get_labels(val_image_paths)

# 학습 이미지의 선호도 레이블 (숫자 형식으로 일관되게 변환)
train_preferences = [1 if label in preferences_df['Training 스타일 선호'].values else 0 for label in train_labels]

# 검증 이미지의 라벨도 숫자 형식으로 변환
val_preferences = [1 if label in preferences_df['Validation 스타일 선호'].values else 0 for label in val_labels]

# 디버깅을 위한 train_preferences와 val_preferences의 분포 확인
print("Train Preferences 분포:", pd.Series(train_preferences).value_counts())
print("Validation Preferences 분포:", pd.Series(val_preferences).value_counts())

# GPU에서 코사인 유사도 계산 및 스타일 선호도 예측
print("Calculating similarity...")
similarity_scores = calculate_cosine_similarity_gpu(val_features, train_features)

print("Predicting preferences...")
predicted_preferences = predict_style_preference(similarity_scores, train_preferences, threshold=0.6)

# 디버깅을 위한 유사도, 예측 값, 실제 값 출력
print("Similarity scores sample:")
print(similarity_scores[:5])  # 유사도 값 일부 샘플 출력

print("Predicted Preferences Sample:", predicted_preferences[:10])
print("Validation Preferences Sample:", val_preferences[:10])

# 정확도, 정밀도, 재현율, F1 점수 계산
accuracy, precision, recall, f1 = calculate_metrics(predicted_preferences, val_preferences)

# 결과 출력
print(f"Prediction Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-Score: {f1:.4f}")

# 레이블 비교를 위한 차이점 계산
diff_count = sum([1 for pred, actual in zip(predicted_preferences, val_preferences) if pred != actual])
print(f"Prediction and validation labels difference count: {diff_count}")


Train과 Validation 데이터셋 간 중복 이미지 수: 0


  state_dict = torch.load(model_path)  # 가중치 로드


Extracting features for training images...


Extracting Features: 100%|██████████| 325/325 [01:39<00:00,  3.27it/s]


Extracting features for validation images...


Extracting Features: 100%|██████████| 114/114 [00:37<00:00,  3.05it/s]

Train Preferences 분포: 0    325
Name: count, dtype: int64
Validation Preferences 분포: 0    114
Name: count, dtype: int64
Calculating similarity...
Predicting preferences...
Similarity scores sample:
[[0.91163343 0.88510025 1.0000002  ... 0.9313352  0.9092157  0.9164553 ]
 [0.9194087  0.8975162  0.9257333  ... 0.9293745  0.9197666  0.9180166 ]
 [0.9294648  0.91828763 0.9135726  ... 0.93681353 0.9356378  0.9288393 ]
 [0.9313044  0.9213519  0.9338911  ... 0.9501404  0.9390071  0.9293614 ]
 [0.91531354 0.9061108  0.90248096 ... 0.9264375  0.92872643 0.9156902 ]]
Predicted Preferences Sample: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
Validation Preferences Sample: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
Prediction Accuracy: 1.0000
Precision: 1.0000
Recall: 1.0000
F1-Score: 1.0000
Prediction and validation labels difference count: 0





In [None]:
import pandas as pd

# CSV 파일 경로 설정
csv_file_path = '/content/drive/MyDrive/데이터 크리에이터 캠프 최종 정리/김진/top_100_respondents_preferences.csv'

# 여러 구분자와 인코딩을 시도하며 파일 읽기
preferences_df = None
separators = [',', ';', '\t']
encodings = ['utf-8', 'ISO-8859-1', 'utf-16']

for sep in separators:
    for encoding in encodings:
        try:
            print(f"Trying separator='{sep}' and encoding='{encoding}'")
            preferences_df = pd.read_csv(csv_file_path, sep=sep, encoding=encoding)

            # 파일이 성공적으로 로드되면 반복 중단
            print("File loaded successfully")
            break
        except Exception as e:
            print(f"Failed with separator='{sep}' and encoding='{encoding}': {e}")
    if preferences_df is not None:
        break

# 열 데이터가 정상적으로 로드되었는지 점검
if preferences_df is not None:
    # 각 응답자의 'Training 스타일 선호'와 'Validation 스타일 선호' 열 확인
    print("Training 스타일 선호 샘플:\n", preferences_df['Training 스타일 선호'].head())
    print("Validation 스타일 선호 샘플:\n", preferences_df['Validation 스타일 선호'].head())

    # 빈 배열이거나 데이터가 없는 경우 알림
    if preferences_df['Training 스타일 선호'].isnull().all() or preferences_df['Validation 스타일 선호'].isnull().all():
        print("Warning: 'Training 스타일 선호' 및 'Validation 스타일 선호' 열에 데이터가 없습니다.")
    else:
        print("파일이 정상적으로 준비되었습니다.")
else:
    print("preferences_df가 준비되지 않았습니다.")


Trying separator=',' and encoding='utf-8'
File loaded successfully
Training 스타일 선호 샘플:
 0                                                                                                                                          ['W_02804_19_normcore_M.jpg', 'W_06843_60_mods_M.jpg']
1    ['W_00829_10_sportivecasual_M.jpg', 'W_28209_10_sportivecasual_M.jpg', 'W_28211_19_normcore_M.jpg', 'W_28722_10_sportivecasual_M.jpg', 'W_17305_70_hippie_M.jpg', 'W_25073_90_hiphop_M.jpg']
2                                                                                                                                                                                             NaN
3                                                                                                                                                                        ['W_15186_50_ivy_M.jpg']
4                                                                                                                                       

In [None]:
import pandas as pd
import ast

# CSV 파일 경로 설정
csv_file_path = '/content/drive/MyDrive/데이터 크리에이터 캠프 최종 정리/김진/top_100_respondents_preferences.csv'

# CSV 파일 로드 시도
try:
    # 여러 구분자와 인코딩 설정을 시도
    print("Trying separator=',' and encoding='utf-8'")
    preferences_df = pd.read_csv(csv_file_path, sep=',', encoding='utf-8')
    print("File loaded successfully")

except UnicodeDecodeError:
    print("Trying separator=',' and encoding='ISO-8859-1'")
    preferences_df = pd.read_csv(csv_file_path, sep=',', encoding='ISO-8859-1')
    print("File loaded successfully with ISO-8859-1 encoding")

except pd.errors.ParserError:
    print("Trying separator=';'")
    preferences_df = pd.read_csv(csv_file_path, sep=';', encoding='utf-8')
    print("File loaded successfully with separator ';'")

# CSV 파일 내용 점검
print("Training 스타일 선호 샘플:\n", preferences_df['Training 스타일 선호'].head())
print("Validation 스타일 선호 샘플:\n", preferences_df['Validation 스타일 선호'].head())

# 'Training 스타일 선호' 및 'Validation 스타일 선호' 열을 리스트 형식으로 변환
preferences_df['Training 스타일 선호'] = preferences_df['Training 스타일 선호'].apply(lambda x: ast.literal_eval(x) if pd.notnull(x) else [])
preferences_df['Validation 스타일 선호'] = preferences_df['Validation 스타일 선호'].apply(lambda x: ast.literal_eval(x) if pd.notnull(x) else [])

# 변환 후 첫 5개 출력
print("Converted 'Training 스타일 선호' Sample:\n", preferences_df['Training 스타일 선호'].head())
print("Converted 'Validation 스타일 선호' Sample:\n", preferences_df['Validation 스타일 선호'].head())


Trying separator=',' and encoding='utf-8'
File loaded successfully
Training 스타일 선호 샘플:
 0                                                                                                                                          ['W_02804_19_normcore_M.jpg', 'W_06843_60_mods_M.jpg']
1    ['W_00829_10_sportivecasual_M.jpg', 'W_28209_10_sportivecasual_M.jpg', 'W_28211_19_normcore_M.jpg', 'W_28722_10_sportivecasual_M.jpg', 'W_17305_70_hippie_M.jpg', 'W_25073_90_hiphop_M.jpg']
2                                                                                                                                                                                             NaN
3                                                                                                                                                                        ['W_15186_50_ivy_M.jpg']
4                                                                                                                                       

수정 수정 제발 돼라. csv 파일에서 선호 비선호가 1 0으로 안바뀌고 다 0 으로 비선호로 된거 였음

In [None]:
import os
import re
import numpy as np
import pandas as pd
from PIL import Image
import torch
import torch.nn as nn
from torchvision import models, transforms
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from tqdm import tqdm

# ResNet 특징 추출기 클래스 정의 (fc 레이어 제외)
class ResNetFeatureExtractor(nn.Module):
    def __init__(self, model_path):
        super(ResNetFeatureExtractor, self).__init__()
        resnet = models.resnet18()  # ResNet-18 모델 생성
        state_dict = torch.load(model_path)  # 가중치 로드

        # 모델의 state_dict에서 fc 레이어 관련 키 제거
        state_dict = {k: v for k, v in state_dict.items() if not k.startswith("fc.")}

        resnet.load_state_dict(state_dict, strict=False)  # 가중치를 strict=False로 로드
        self.features = nn.Sequential(*list(resnet.children())[:-2])  # 마지막 fc 레이어 제외

    def forward(self, x):
        return self.features(x)

# 이미지 전처리 파이프라인 정의
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

# 이미지 로드 및 전처리 함수
def load_and_preprocess_image(image_path, device):
    image = Image.open(image_path).convert('RGB')
    image = transform(image).unsqueeze(0).to(device)  # 배치 차원 추가 및 디바이스로 전송
    return image

# 데이터셋에서 이미지들의 특징 벡터를 추출하는 함수
def extract_features_for_dataset(image_paths, feature_extractor, device):
    features = []
    feature_extractor.eval()
    with torch.no_grad():
        for image_path in tqdm(image_paths, desc="Extracting Features"):
            if os.path.exists(image_path):
                image_tensor = load_and_preprocess_image(image_path, device)
                feature_vector = feature_extractor(image_tensor).squeeze().cpu().numpy()
                features.append(feature_vector)
    return np.array(features)

# 파일명에서 스타일(라벨)을 추출하는 함수
image_pattern = re.compile(r'^(W|T)_(\d+)_(.*?)_(.*?)_(W|M)\.jpg$')
def get_labels(image_paths):
    labels = []
    for filename in image_paths:
        match = image_pattern.match(os.path.basename(filename))
        if match:
            _, _, _, style, _ = match.groups()
            labels.append(style)
    return labels

# GPU에서 코사인 유사도 계산 함수
def calculate_cosine_similarity_gpu(val_features, train_features):
    # numpy 배열을 torch 텐서로 변환
    val_features = torch.tensor(val_features).to(device)
    train_features = torch.tensor(train_features).to(device)

    # 2차원으로 변환 (3차원 이상의 텐서인 경우)
    if val_features.dim() > 2:
        val_features = val_features.view(val_features.size(0), -1)
    if train_features.dim() > 2:
        train_features = train_features.view(train_features.size(0), -1)

    # 정규화하여 벡터를 단위 벡터로 변환 (L2 노멀라이즈)
    val_norm = val_features / val_features.norm(dim=1, keepdim=True)
    train_norm = train_features / train_features.norm(dim=1, keepdim=True)

    # 코사인 유사도 계산
    similarity_scores = torch.mm(val_norm, train_norm.T).cpu().numpy()
    return similarity_scores

# 유사도를 기반으로 스타일 선호도 예측
def predict_style_preference(similarity_scores, train_preferences, threshold=0.6):
    predicted_preferences = []
    for score_row in similarity_scores:
        max_similarity = np.max(score_row)
        if max_similarity > threshold:
            most_similar_index = np.argmax(score_row)
            predicted_label = train_preferences[most_similar_index]
            predicted_preferences.append(predicted_label)
        else:
            predicted_preferences.append(0)  # 임계값보다 낮으면 비선호로 처리
    return predicted_preferences

# 정확도, 정밀도, 재현율, F1 점수 계산 함수
def calculate_metrics(predicted, actual):
    accuracy = accuracy_score(actual, predicted)
    precision, recall, f1, _ = precision_recall_fscore_support(actual, predicted, average='weighted')
    return accuracy, precision, recall, f1

# 2-2에서 생성한 응답자 선호도 CSV 파일 로드
top_100_file_path = '/content/drive/MyDrive/데이터 크리에이터 캠프 최종 정리/김진/top_100_respondents_preferences.csv'
preferences_df = pd.read_csv(top_100_file_path)

# 학습 및 검증 이미지 경로 추출 함수
def extract_image_paths_from_preferences(preference_column, image_dir):
    image_paths = []
    for img_list in preference_column.dropna():
        images = img_list.strip("[]").replace("'", "").split(", ")
        # 이미지 파일 경로로 변환 후 리스트에 추가
        image_paths.extend([os.path.join(image_dir, img.strip()) for img in images if img.strip()])
    return [img_path for img_path in image_paths if os.path.isfile(img_path)]  # 실제 파일만 반환

# 경로 설정
train_image_dir = '/content/drive/MyDrive/dataset/training_image'
val_image_dir = '/content/drive/MyDrive/dataset/validation_image'
model_path = '/content/drive/MyDrive/데이터 크리에이터 캠프 최종 정리/김진/model_final5.pth'  # 학습된 가중치 경로 설정

# 학습 및 검증 이미지 경로 추출
train_image_paths = extract_image_paths_from_preferences(preferences_df['Training 스타일 선호'], train_image_dir)
val_image_paths = extract_image_paths_from_preferences(preferences_df['Validation 스타일 선호'], val_image_dir)

# 중복 데이터 확인
common_images = set(train_image_paths) & set(val_image_paths)
print(f"Train과 Validation 데이터셋 간 중복 이미지 수: {len(common_images)}")

# ResNet 특징 추출기 초기화 및 가중치 로드
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
feature_extractor = ResNetFeatureExtractor(model_path).to(device)

# 학습 및 검증 이미지의 특징 벡터 추출
print("Extracting features for training images...")
train_features = extract_features_for_dataset(train_image_paths, feature_extractor, device)

print("Extracting features for validation images...")
val_features = extract_features_for_dataset(val_image_paths, feature_extractor, device)

# 학습 이미지와 검증 이미지의 라벨 추출
train_labels = get_labels(train_image_paths)
val_labels = get_labels(val_image_paths)

# 학습 이미지의 선호도 레이블 (숫자 형식으로 일관되게 변환)
train_preferences = [
    1 if any(img in preferences_df['Training 스타일 선호'].dropna().values for img in [label]) else 0
    for label in train_labels
]

# 검증 이미지의 라벨도 숫자 형식으로 변환
val_preferences = [
    1 if any(img in preferences_df['Validation 스타일 선호'].dropna().values for img in [label]) else 0
    for label in val_labels
]

# 결과 분포 출력
print("Train Preferences 분포:", pd.Series(train_preferences).value_counts())
print("Validation Preferences 분포:", pd.Series(val_preferences).value_counts())

# GPU에서 코사인 유사도 계산 및 스타일 선호도 예측
print("Calculating similarity...")
similarity_scores = calculate_cosine_similarity_gpu(val_features, train_features)

print("Predicting preferences...")
predicted_preferences = predict_style_preference(similarity_scores, train_preferences, threshold=0.6)

# 디버깅을 위한 유사도, 예측 값, 실제 값 출력
print("Similarity scores sample:")
print(similarity_scores[:5])  # 유사도 값 일부 샘플 출력

print("Predicted Preferences Sample:", predicted_preferences[:10])
print("Validation Preferences Sample:", val_preferences[:10])

# 정확도, 정밀도, 재현율, F1 점수 계산
accuracy, precision, recall, f1 = calculate_metrics(predicted_preferences, val_preferences)

# 결과 출력
print(f"Prediction Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-Score: {f1:.4f}")

# 레이블 비교를 위한 차이점 계산
diff_count = sum([1 for pred, actual in zip(predicted_preferences, val_preferences) if pred != actual])
print(f"Prediction and validation labels difference count: {diff_count}")


UnicodeDecodeError: 'utf-8' codec can't decode bytes in position 15-16: invalid continuation byte

## 선권씨 xlsx 쓴 버전

In [None]:
import os
import re
import numpy as np
import pandas as pd
from PIL import Image
import torch
import torch.nn as nn
from torchvision import models, transforms
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from tqdm import tqdm

# ResNet 특징 추출기 클래스 정의 (fc 레이어 제외)
class ResNetFeatureExtractor(nn.Module):
    def __init__(self, model_path):
        super(ResNetFeatureExtractor, self).__init__()
        resnet = models.resnet18()  # ResNet-18 모델 생성
        state_dict = torch.load(model_path)  # 가중치 로드

        # 모델의 state_dict에서 fc 레이어 관련 키 제거
        state_dict = {k: v for k, v in state_dict.items() if not k.startswith("fc.")}

        resnet.load_state_dict(state_dict, strict=False)  # 가중치를 strict=False로 로드
        self.features = nn.Sequential(*list(resnet.children())[:-2])  # 마지막 fc 레이어 제외

    def forward(self, x):
        return self.features(x)

# 이미지 전처리 파이프라인 정의
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

# 이미지 로드 및 전처리 함수
def load_and_preprocess_image(image_path, device):
    image = Image.open(image_path).convert('RGB')
    image = transform(image).unsqueeze(0).to(device)  # 배치 차원 추가 및 디바이스로 전송
    return image

# 데이터셋에서 이미지들의 특징 벡터를 추출하는 함수
def extract_features_for_dataset(image_paths, feature_extractor, device):
    features = []
    feature_extractor.eval()
    with torch.no_grad():
        for image_path in tqdm(image_paths, desc="Extracting Features"):
            if os.path.exists(image_path):
                image_tensor = load_and_preprocess_image(image_path, device)
                feature_vector = feature_extractor(image_tensor).squeeze().cpu().numpy()
                features.append(feature_vector)
    return np.array(features)

# 파일명에서 스타일(라벨)을 추출하는 함수
image_pattern = re.compile(r'^(W|T)_(\d+)_(.*?)_(.*?)_(W|M)\.jpg$')
def get_labels(image_paths):
    labels = []
    for filename in image_paths:
        match = image_pattern.match(os.path.basename(filename))
        if match:
            _, _, _, style, _ = match.groups()
            labels.append(style)
    return labels

# GPU에서 코사인 유사도 계산 함수
def calculate_cosine_similarity_gpu(val_features, train_features):
    # numpy 배열을 torch 텐서로 변환
    val_features = torch.tensor(val_features).to(device)
    train_features = torch.tensor(train_features).to(device)

    # 빈 배열 또는 비정상 차원일 경우 예외 처리
    if val_features.numel() == 0 or train_features.numel() == 0:
        raise ValueError("Error: One or both of the feature arrays are empty or invalid.")

    # 2차원으로 변환 (3차원 이상의 텐서인 경우)
    if val_features.dim() > 2:
        val_features = val_features.view(val_features.size(0), -1)
    if train_features.dim() > 2:
        train_features = train_features.view(train_features.size(0), -1)

    # 정규화하여 벡터를 단위 벡터로 변환 (L2 노멀라이즈)
    val_norm = val_features / val_features.norm(dim=1, keepdim=True)
    train_norm = train_features / train_features.norm(dim=1, keepdim=True)

    # 코사인 유사도 계산
    similarity_scores = torch.mm(val_norm, train_norm.T).cpu().numpy()
    return similarity_scores


# 유사도를 기반으로 스타일 선호도 예측
def predict_style_preference(similarity_scores, train_preferences, threshold=0.6):
    predicted_preferences = []
    for score_row in similarity_scores:
        max_similarity = np.max(score_row)
        if max_similarity > threshold:
            most_similar_index = np.argmax(score_row)
            predicted_label = train_preferences[most_similar_index]
            predicted_preferences.append(predicted_label)
        else:
            predicted_preferences.append(0)  # 임계값보다 낮으면 비선호로 처리
    return predicted_preferences

# 정확도, 정밀도, 재현율, F1 점수 계산 함수
def calculate_metrics(predicted, actual):
    accuracy = accuracy_score(actual, predicted)
    precision, recall, f1, _ = precision_recall_fscore_support(actual, predicted, average='weighted')
    return accuracy, precision, recall, f1

# 2-2에서 생성한 응답자 선호도 Excel 파일 로드
top_100_file_path = '/content/drive/MyDrive/데이터 크리에이터 캠프 최종 정리/김진/preferences0.xlsx'
preferences_df = pd.read_excel(top_100_file_path)  # Excel 파일 읽기

# 학습 및 검증 이미지 경로 추출 함수
def extract_image_paths_from_preferences(preference_column, image_dir):
    image_paths = []
    for img_list in preference_column.dropna():
        images = img_list.strip("[]").replace("'", "").split(", ")
        # 이미지 파일 경로로 변환 후 리스트에 추가
        image_paths.extend([os.path.join(image_dir, img.strip()) for img in images if img.strip()])
    return [img_path for img_path in image_paths if os.path.isfile(img_path)]  # 실제 파일만 반환

# 경로 설정
train_image_dir = '/content/drive/MyDrive/dataset/processed_segmentation_cleaned'
val_image_dir = '/content/drive/MyDrive/dataset/processed_segmentation_cleaned_for_val'
model_path = '/content/drive/MyDrive/데이터 크리에이터 캠프 최종 정리/김진/model_final5.pth'  # 학습된 가중치 경로 설정

# 학습 및 검증 이미지 경로 추출
train_image_paths = extract_image_paths_from_preferences(preferences_df['Training Style Preferred'], train_image_dir)
val_image_paths = extract_image_paths_from_preferences(preferences_df['Validation Style Preferred'], val_image_dir)

# 중복 데이터 확인
common_images = set(train_image_paths) & set(val_image_paths)
print(f"Train과 Validation 데이터셋 간 중복 이미지 수: {len(common_images)}")

# ResNet 특징 추출기 초기화 및 가중치 로드
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
feature_extractor = ResNetFeatureExtractor(model_path).to(device)

# 학습 및 검증 이미지의 특징 벡터 추출
print("Extracting features for training images...")
train_features = extract_features_for_dataset(train_image_paths, feature_extractor, device)

print("Extracting features for validation images...")
val_features = extract_features_for_dataset(val_image_paths, feature_extractor, device)

# 학습 이미지와 검증 이미지의 라벨 추출
train_labels = get_labels(train_image_paths)
val_labels = get_labels(val_image_paths)

# 학습 이미지의 선호도 레이블 (숫자 형식으로 일관되게 변환)
train_preferences = [
    1 if any(img in preferences_df['Training Style Preferred'].dropna().values for img in [label]) else 0
    for label in train_labels
]

# 검증 이미지의 라벨도 숫자 형식으로 변환
val_preferences = [
    1 if any(img in preferences_df['Validation Style Preferred'].dropna().values for img in [label]) else 0
    for label in val_labels
]

# 결과 분포 출력
print("Train Preferences 분포:", pd.Series(train_preferences).value_counts())
print("Validation Preferences 분포:", pd.Series(val_preferences).value_counts())

# GPU에서 코사인 유사도 계산 및 스타일 선호도 예측
print("Calculating similarity...")
similarity_scores = calculate_cosine_similarity_gpu(val_features, train_features)

print("Predicting preferences...")
predicted_preferences = predict_style_preference(similarity_scores, train_preferences, threshold=0.5)

# 디버깅을 위한 유사도, 예측 값, 실제 값 출력
print("Similarity scores sample:")
print(similarity_scores[:5])  # 유사도 값 일부 샘플 출력

print("Predicted Preferences Sample:", predicted_preferences[:10])
print("Validation Preferences Sample:", val_preferences[:10])

# 정확도, 정밀도, 재현율, F1 점수 계산
accuracy, precision, recall, f1 = calculate_metrics(predicted_preferences, val_preferences)

# 결과 출력
print(f"Prediction Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-Score: {f1:.4f}")

# 레이블 비교를 위한 차이점 계산
diff_count = sum([1 for pred, actual in zip(predicted_preferences, val_preferences) if pred != actual])
print(f"Prediction and validation labels difference count: {diff_count}")


Train과 Validation 데이터셋 간 중복 이미지 수: 0


  state_dict = torch.load(model_path)  # 가중치 로드


Extracting features for training images...


Extracting Features: 0it [00:00, ?it/s]


Extracting features for validation images...


Extracting Features: 0it [00:00, ?it/s]


Train Preferences 분포: Series([], Name: count, dtype: int64)
Validation Preferences 분포: Series([], Name: count, dtype: int64)
Calculating similarity...


ValueError: Error: One or both of the feature arrays are empty or invalid.

In [None]:
import os
import re
import numpy as np
import pandas as pd
from PIL import Image
import torch
import torch.nn as nn
from torchvision import models, transforms
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from tqdm import tqdm

# ResNet 특징 추출기 클래스 정의 (fc 레이어 제외)
class ResNetFeatureExtractor(nn.Module):
    def __init__(self, model_path):
        super(ResNetFeatureExtractor, self).__init__()
        resnet = models.resnet18()  # ResNet-18 모델 생성
        state_dict = torch.load(model_path)  # 가중치 로드

        # 모델의 state_dict에서 fc 레이어 관련 키 제거
        state_dict = {k: v for k, v in state_dict.items() if not k.startswith("fc.")}
        resnet.load_state_dict(state_dict, strict=False)  # 가중치를 strict=False로 로드
        self.features = nn.Sequential(*list(resnet.children())[:-2])  # 마지막 fc 레이어 제외

    def forward(self, x):
        return self.features(x)

# 이미지 전처리 파이프라인 정의
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

# 이미지 로드 및 전처리 함수
def load_and_preprocess_image(image_path, device):
    image = Image.open(image_path).convert('RGB')
    image = transform(image).unsqueeze(0).to(device)  # 배치 차원 추가 및 디바이스로 전송
    return image

# 데이터셋에서 이미지들의 특징 벡터를 추출하는 함수
def extract_features_for_dataset(image_paths, feature_extractor, device):
    features = []
    feature_extractor.eval()
    with torch.no_grad():
        for image_path in tqdm(image_paths, desc="Extracting Features"):
            if os.path.exists(image_path):
                image_tensor = load_and_preprocess_image(image_path, device)
                feature_vector = feature_extractor(image_tensor).squeeze().cpu().numpy()
                features.append(feature_vector)
            else:
                print(f"File not found: {image_path}")
    return np.array(features)

# 파일명에서 스타일(라벨)을 추출하는 함수
image_pattern = re.compile(r'^(W|T)_(\d+)_(.*?)_(.*?)_(W|M)\.jpg$')
def get_labels(image_paths):
    labels = []
    for filename in image_paths:
        match = image_pattern.match(os.path.basename(filename))
        if match:
            _, _, _, style, _ = match.groups()
            labels.append(style)
    return labels

# GPU에서 코사인 유사도 계산 함수
def calculate_cosine_similarity_gpu(val_features, train_features):
    # numpy 배열을 torch 텐서로 변환
    val_features = torch.tensor(val_features).to(device)
    train_features = torch.tensor(train_features).to(device)

    # 2차원으로 변환 (3차원 이상의 텐서인 경우)
    if val_features.dim() > 2:
        val_features = val_features.view(val_features.size(0), -1)
    if train_features.dim() > 2:
        train_features = train_features.view(train_features.size(0), -1)

    # 정규화하여 벡터를 단위 벡터로 변환 (L2 노멀라이즈)
    val_norm = val_features / val_features.norm(dim=1, keepdim=True)
    train_norm = train_features / train_features.norm(dim=1, keepdim=True)

    # 코사인 유사도 계산
    similarity_scores = torch.mm(val_norm, train_norm.T).cpu().numpy()
    return similarity_scores

# 유사도를 기반으로 스타일 선호도 예측
def predict_style_preference(similarity_scores, train_preferences, threshold=0.6):
    predicted_preferences = []
    for score_row in similarity_scores:
        max_similarity = np.max(score_row)
        if max_similarity > threshold:
            most_similar_index = np.argmax(score_row)
            predicted_label = train_preferences[most_similar_index]
            predicted_preferences.append(predicted_label)
        else:
            predicted_preferences.append(0)  # 임계값보다 낮으면 비선호로 처리
    return predicted_preferences

# 정확도, 정밀도, 재현율, F1 점수 계산 함수
def calculate_metrics(predicted, actual):
    accuracy = accuracy_score(actual, predicted)
    precision, recall, f1, _ = precision_recall_fscore_support(actual, predicted, average='weighted')
    return accuracy, precision, recall, f1

# 2-2에서 생성한 응답자 선호도 Excel 파일 로드
top_100_file_path = '/content/drive/MyDrive/데이터 크리에이터 캠프 최종 정리/김진/preferences0.xlsx'
preferences_df = pd.read_excel(top_100_file_path)  # Excel 파일 읽기

# 학습 및 검증 이미지 경로 추출 함수
def extract_image_paths_from_preferences(preference_column, image_dir):
    image_paths = []
    for img_list in preference_column.dropna():
        images = img_list.strip("[]").replace("'", "").split(", ")
        # 이미지 파일 경로로 변환 후 리스트에 추가
        image_paths.extend([os.path.join(image_dir, img.strip()) for img in images if img.strip()])
    return [img_path for img_path in image_paths if os.path.isfile(img_path)]  # 실제 파일만 반환

# 경로 설정
train_image_dir = '/content/drive/MyDrive/dataset/processed_segmentation_cleaned'
val_image_dir = '/content/drive/MyDrive/dataset/processed_segmentation_cleaned_for_val'
model_path = '/content/drive/MyDrive/데이터 크리에이터 캠프 최종 정리/김진/model_final5.pth'  # 학습된 가중치 경로 설정

# 학습 및 검증 이미지 경로 추출
print("Extracting train_image_paths and val_image_paths...")
train_image_paths = extract_image_paths_from_preferences(preferences_df['Training Style Preferred'], train_image_dir)
val_image_paths = extract_image_paths_from_preferences(preferences_df['Validation Style Preferred'], val_image_dir)

# 디버깅: 추출된 경로 수 출력
print(f"Number of train_image_paths: {len(train_image_paths)}")
print(f"Number of val_image_paths: {len(val_image_paths)}")

# 중복 데이터 확인
common_images = set(train_image_paths) & set(val_image_paths)
print(f"Train과 Validation 데이터셋 간 중복 이미지 수: {len(common_images)}")

# ResNet 특징 추출기 초기화 및 가중치 로드
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
feature_extractor = ResNetFeatureExtractor(model_path).to(device)

# 학습 및 검증 이미지의 특징 벡터 추출
print("Extracting features for training images...")
train_features = extract_features_for_dataset(train_image_paths, feature_extractor, device)

print("Extracting features for validation images...")
val_features = extract_features_for_dataset(val_image_paths, feature_extractor, device)

# 학습 이미지와 검증 이미지의 라벨 추출
train_labels = get_labels(train_image_paths)
val_labels = get_labels(val_image_paths)

# 학습 이미지의 선호도 레이블 (숫자 형식으로 일관되게 변환)
train_preferences = [
    1 if any(img in preferences_df['Training Style Preferred'].dropna().values for img in [label]) else 0
    for label in train_labels
]

# 검증 이미지의 라벨도 숫자 형식으로 변환
val_preferences = [
    1 if any(img in preferences_df['Validation Style Preferred'].dropna().values for img in [label]) else 0
    for label in val_labels
]

# 결과 분포 출력
print("Train Preferences 분포:", pd.Series(train_preferences).value_counts())
print("Validation Preferences 분포:", pd.Series(val_preferences).value_counts())

# GPU에서 코사인 유사도 계산 및 스타일 선호도 예측
print("Calculating similarity...")
similarity_scores = calculate_cosine_similarity_gpu(val_features, train_features)

print("Predicting preferences...")
predicted_preferences = predict_style_preference(similarity_scores, train_preferences, threshold=0.5)

# 디버깅을 위한 유사도, 예측 값, 실제 값 출력
print("Similarity scores sample:")
print(similarity_scores[:5])  # 유사도 값 일부 샘플 출력

print("Predicted Preferences Sample:", predicted_preferences[:10])
print("Validation Preferences Sample:", val_preferences[:10])

# 정확도, 정밀도, 재현율, F1 점수 계산
accuracy, precision, recall, f1 = calculate_metrics(predicted_preferences, val_preferences)

# 결과 출력
print(f"Prediction Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-Score: {f1:.4f}")

# 레이블 비교를 위한 차이점 계산
diff_count = sum([1 for pred, actual in zip(predicted_preferences, val_preferences) if pred != actual])
print(f"Prediction and validation labels difference count: {diff_count}")



Extracting train_image_paths and val_image_paths...
Number of train_image_paths: 0
Number of val_image_paths: 0
Train과 Validation 데이터셋 간 중복 이미지 수: 0


  state_dict = torch.load(model_path)  # 가중치 로드


Extracting features for training images...


Extracting Features: 0it [00:00, ?it/s]


Extracting features for validation images...


Extracting Features: 0it [00:00, ?it/s]

Train Preferences 분포: Series([], Name: count, dtype: int64)
Validation Preferences 분포: Series([], Name: count, dtype: int64)
Calculating similarity...





IndexError: Dimension out of range (expected to be in range of [-1, 0], but got 1)

In [None]:
import pandas as pd

# 엑셀 파일 로드 및 열 이름 수정
file_path = '/content/drive/MyDrive/데이터 크리에이터 캠프 최종 정리/김진/preferences0.xlsx'
preferences_df = pd.read_excel(file_path)

# 열 이름 표준화
preferences_df.columns = ["Respondent ID", "Training Style Preferred", "Training Style Not Preferred",
                          "Validation Style Preferred", "Validation Style Not Preferred"]

# Training과 Validation의 선호 스타일 데이터를 리스트로 변환
def parse_style_list(style_string):
    if pd.isna(style_string):
        return []
    return [item.strip() for item in style_string.strip("[]").replace("'", "").split(",")]

preferences_df["Training Style Preferred"] = preferences_df["Training Style Preferred"].apply(parse_style_list)
preferences_df["Validation Style Preferred"] = preferences_df["Validation Style Preferred"].apply(parse_style_list)

# 변환된 데이터 확인
print("Training 스타일 선호 샘플:")
print(preferences_df["Training Style Preferred"].head())
print("Validation 스타일 선호 샘플:")
print(preferences_df["Validation Style Preferred"].head())


Training 스타일 선호 샘플:
0    [W_07098_19_normcore_M.jpg, W_16449_10_sportivecasual_M.jpg, W_16403_10_sportivecasual_M.jpg, W_07307_19_normcore_M.jpg, W_07260_10_sportivecasual_M.jpg, W_01394_10_sportivecasual_M.jpg, W_17702_90_hiphop_M.jpg, W_04687_00_metrosexual_M.jpg, W_04324_90_hiphop_M.jpg, W_02693_70_hippie_M.jpg, W_16445_50_ivy_M.jpg, W_05869_60_mods_M.jpg]
1                                                                                                                                                                             [W_02978_10_sportivecasual_M.jpg, W_02771_10_sportivecasual_M.jpg, W_00831_19_normcore_M.jpg, W_17273_19_normcore_M.jpg, W_06993_90_hiphop_M.jpg, W_01737_50_ivy_M.jpg]
2                                                                                                                           [W_01989_19_normcore_W.jpg, W_09698_19_genderless_W.jpg, W_19801_19_genderless_W.jpg, W_05786_10_sportivecasual_W.jpg, W_13163_90_kitsch_W.jpg, W_07554_70_hippie_W.jpg, W_0

In [None]:
import os
import re
import numpy as np
import pandas as pd
from PIL import Image
import torch
import torch.nn as nn
from torchvision import models, transforms
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from tqdm import tqdm

# ResNet 특징 추출기 클래스 정의 (fc 레이어 제외)
class ResNetFeatureExtractor(nn.Module):
    def __init__(self, model_path):
        super(ResNetFeatureExtractor, self).__init__()
        resnet = models.resnet18(pretrained=False)  # ResNet-18 모델 생성
        state_dict = torch.load(model_path)  # 가중치 로드
        state_dict = {k: v for k, v in state_dict.items() if not k.startswith("fc.")}
        resnet.load_state_dict(state_dict, strict=False)
        self.features = nn.Sequential(*list(resnet.children())[:-2])  # 마지막 fc 레이어 제외

    def forward(self, x):
        return self.features(x)

# 이미지 전처리 파이프라인 정의
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

# 이미지 로드 및 전처리 함수
def load_and_preprocess_image(image_path, device):
    image = Image.open(image_path).convert('RGB')
    image = transform(image).unsqueeze(0).to(device)
    return image

def extract_features_for_dataset(image_paths, feature_extractor, device):
    features = []
    feature_extractor.eval()
    with torch.no_grad():
        for image_path in tqdm(image_paths, desc="Extracting Features"):
            if os.path.exists(image_path):
                image_tensor = load_and_preprocess_image(image_path, device)
                feature_vector = feature_extractor(image_tensor).squeeze().cpu().numpy()
                features.append(feature_vector)
            else:
                print(f"File not found: {image_path}")
    return np.array(features)


# 파일명에서 스타일(라벨)을 추출하는 함수
image_pattern = re.compile(r'^(W|T)_(\d+)_(.*?)_(.*?)_(W|M)\.jpg$')
def get_labels(image_paths):
    labels = []
    for filename in image_paths:
        match = image_pattern.match(os.path.basename(filename))
        if match:
            _, _, _, style, _ = match.groups()
            labels.append(style)
    return labels

# 경로 설정
train_image_dir = '/content/drive/MyDrive/dataset/processed_segmentation_cleaned'
val_image_dir = '/content/drive/MyDrive/dataset/processed_segmentation_cleaned_for_val'
model_path = '/content/drive/MyDrive/데이터 크리에이터 캠프 최종 정리/김진/model_final5.pth'  # 학습된 가중치 경로 설정

# ResNet 특징 추출기 초기화 및 가중치 로드
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
feature_extractor = ResNetFeatureExtractor(model_path).to(device)

# 2-2에서 생성한 응답자 선호도 Excel 파일 로드
top_100_file_path = '/content/drive/MyDrive/데이터 크리에이터 캠프 최종 정리/김진/preferences0.xlsx'
preferences_df = pd.read_excel(top_100_file_path)

# 학습 및 검증 이미지 경로 추출 함수
def extract_image_paths_from_preferences(preference_column, image_dir):
    image_paths = []
    for img_list in preference_column.dropna():
        images = img_list.strip("[]").replace("'", "").split(", ")
        image_paths.extend([os.path.join(image_dir, img.strip()) for img in images if img.strip()])
    return [img_path for img_path in image_paths if os.path.isfile(img_path)]

# 학습 및 검증 이미지 경로 추출
train_image_paths = extract_image_paths_from_preferences(preferences_df['Training Style Preferred'], train_image_dir)
val_image_paths = extract_image_paths_from_preferences(preferences_df['Validation Style Preferred'], val_image_dir)

# 중복 데이터 확인
common_images = set(train_image_paths) & set(val_image_paths)
print(f"Train과 Validation 데이터셋 간 중복 이미지 수: {len(common_images)}")

# 학습 및 검증 이미지의 특징 벡터 추출 및 저장
train_features = extract_features_for_dataset(train_image_paths, feature_extractor, device)
val_features = extract_features_for_dataset(val_image_paths, feature_extractor, device)

# 학습 이미지와 검증 이미지의 라벨 추출
train_labels = get_labels(train_image_paths)
val_labels = get_labels(val_image_paths)

# 학습 이미지의 선호도 레이블 (숫자 형식으로 일관되게 변환)
train_preferences = [
    1 if any(img in preferences_df['Training Style Preferred'].dropna().values for img in [label]) else 0
    for label in train_labels
]

# 검증 이미지의 라벨도 숫자 형식으로 변환
val_preferences = [
    1 if any(img in preferences_df['Validation Style Preferred'].dropna().values for img in [label]) else 0
    for label in val_labels
]

# 결과 분포 출력
print("Train Preferences 분포:", pd.Series(train_preferences).value_counts())
print("Validation Preferences 분포:", pd.Series(val_preferences).value_counts())

# 이후의 유사도 계산 및 선호도 예측 코드는 동일하게 진행합니다.

# GPU에서 코사인 유사도 계산 함수
def calculate_cosine_similarity_gpu(val_features, train_features):
    # numpy 배열을 torch 텐서로 변환
    val_features = torch.tensor(val_features).to(device)
    train_features = torch.tensor(train_features).to(device)

    # 빈 배열 또는 비정상 차원일 경우 예외 처리
    if val_features.numel() == 0 or train_features.numel() == 0:
        raise ValueError("Error: One or both of the feature arrays are empty or invalid.")

    # 2차원으로 변환 (3차원 이상의 텐서인 경우)
    if val_features.dim() > 2:
        val_features = val_features.view(val_features.size(0), -1)
    if train_features.dim() > 2:
        train_features = train_features.view(train_features.size(0), -1)

    # 정규화하여 벡터를 단위 벡터로 변환 (L2 노멀라이즈)
    val_norm = val_features / val_features.norm(dim=1, keepdim=True)
    train_norm = train_features / train_features.norm(dim=1, keepdim=True)

    # 코사인 유사도 계산
    similarity_scores = torch.mm(val_norm, train_norm.T).cpu().numpy()
    return similarity_scores


# 유사도를 기반으로 스타일 선호도 예측
def predict_style_preference(similarity_scores, train_preferences, threshold=0.6):
    predicted_preferences = []
    for score_row in similarity_scores:
        max_similarity = np.max(score_row)
        if max_similarity > threshold:
            most_similar_index = np.argmax(score_row)
            predicted_label = train_preferences[most_similar_index]
            predicted_preferences.append(predicted_label)
        else:
            predicted_preferences.append(0)  # 임계값보다 낮으면 비선호로 처리
    return predicted_preferences

# 정확도, 정밀도, 재현율, F1 점수 계산 함수
def calculate_metrics(predicted, actual):
    accuracy = accuracy_score(actual, predicted)
    precision, recall, f1, _ = precision_recall_fscore_support(actual, predicted, average='weighted')
    return accuracy, precision, recall, f1

# GPU에서 코사인 유사도 계산 및 스타일 선호도 예측
print("Calculating similarity...")
similarity_scores = calculate_cosine_similarity_gpu(val_features, train_features)

print("Predicting preferences...")
predicted_preferences = predict_style_preference(similarity_scores, train_preferences, threshold=0.5)

# 디버깅을 위한 유사도, 예측 값, 실제 값 출력
print("Similarity scores sample:")
print(similarity_scores[:5])  # 유사도 값 일부 샘플 출력

print("Predicted Preferences Sample:", predicted_preferences[:10])
print("Validation Preferences Sample:", val_preferences[:10])

# 정확도, 정밀도, 재현율, F1 점수 계산
accuracy, precision, recall, f1 = calculate_metrics(predicted_preferences, val_preferences)

# 결과 출력
print(f"Prediction Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-Score: {f1:.4f}")

# 레이블 비교를 위한 차이점 계산
diff_count = sum([1 for pred, actual in zip(predicted_preferences, val_preferences) if pred != actual])
print(f"Prediction and validation labels difference count: {diff_count}")



  state_dict = torch.load(model_path)  # 가중치 로드


Train과 Validation 데이터셋 간 중복 이미지 수: 0


Extracting Features: 0it [00:00, ?it/s]
Extracting Features: 0it [00:00, ?it/s]

Train Preferences 분포: Series([], Name: count, dtype: int64)
Validation Preferences 분포: Series([], Name: count, dtype: int64)
Calculating similarity...





ValueError: Error: One or both of the feature arrays are empty or invalid.

In [None]:
import os
import re
import numpy as np
import pandas as pd
from PIL import Image
import torch
import torch.nn as nn
from torchvision import models, transforms
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from tqdm import tqdm

# ResNet 특징 추출기 클래스 정의 (fc 레이어 제외)
class ResNetFeatureExtractor(nn.Module):
    def __init__(self, model_path):
        super(ResNetFeatureExtractor, self).__init__()
        resnet = models.resnet18(pretrained=False)
        state_dict = torch.load(model_path)
        state_dict = {k: v for k, v in state_dict.items() if not k.startswith("fc.")}
        resnet.load_state_dict(state_dict, strict=False)
        self.features = nn.Sequential(*list(resnet.children())[:-2])

    def forward(self, x):
        return self.features(x)

# 이미지 전처리 파이프라인 정의
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

# 이미지 로드 및 전처리 함수
def load_and_preprocess_image(image_path, device):
    image = Image.open(image_path).convert('RGB')
    image = transform(image).unsqueeze(0).to(device)
    return image

# 데이터셋에서 이미지들의 특징 벡터를 추출하는 함수 및 저장
def extract_features_for_dataset(image_paths, feature_extractor, device, save_path):
    features = []
    feature_extractor.eval()
    with torch.no_grad():
        for image_path in tqdm(image_paths, desc="Extracting Features"):
            if os.path.exists(image_path):
                image_tensor = load_and_preprocess_image(image_path, device)
                feature_vector = feature_extractor(image_tensor).squeeze().cpu().numpy()
                features.append(feature_vector)
            else:
                print(f"File not found: {image_path}")
    features = np.array(features)
    np.save(save_path, features)  # 특징 벡터 저장
    return features

# 파일명에서 스타일(라벨)을 추출하는 함수
image_pattern = re.compile(r'^segmented_(W|T)_(\d+)_(.*?)_(.*?)_(W|M)\.jpg$')
def get_labels(image_paths):
    labels = []
    for filename in image_paths:
        match = image_pattern.match(os.path.basename(filename))
        if match:
            _, _, _, style, _ = match.groups()
            labels.append(style)
    return labels

# 수정된 함수: 파일명에 접두사 추가하여 경로 생성
def extract_image_paths_from_preferences(preference_column, image_dir):
    image_paths = []
    for img_list in preference_column.dropna():
        images = img_list.strip("[]").replace("'", "").split(", ")
        paths = [os.path.join(image_dir, f"segmented_{img.strip()}") for img in images if img.strip()]
        image_paths.extend(paths)
    return [img_path for img_path in image_paths if os.path.isfile(img_path)]

# 경로 설정
train_image_dir = '/content/drive/MyDrive/dataset/processed_segmentation_cleaned'
val_image_dir = '/content/drive/MyDrive/dataset/processed_segmentation_cleaned_for_val'
model_path = '/content/drive/MyDrive/데이터 크리에이터 캠프 최종 정리/김진/model_final5.pth'
top_100_file_path = '/content/drive/MyDrive/데이터 크리에이터 캠프 최종 정리/김진/preferences0.xlsx'
preferences_df = pd.read_excel(top_100_file_path)

# 학습 및 검증 이미지 경로 추출
train_image_paths = extract_image_paths_from_preferences(preferences_df['Training Style Preferred'], train_image_dir)
val_image_paths = extract_image_paths_from_preferences(preferences_df['Validation Style Preferred'], val_image_dir)

# 경로 수 확인
print(f"Number of train_image_paths: {len(train_image_paths)}")
print(f"Number of val_image_paths: {len(val_image_paths)}")

# 중복 데이터 확인
common_images = set(train_image_paths) & set(val_image_paths)
print(f"Train과 Validation 데이터셋 간 중복 이미지 수: {len(common_images)}")

# 학습 및 검증 이미지의 특징 벡터 추출 및 저장
train_features = extract_features_for_dataset(train_image_paths, feature_extractor, device, '/content/train_features.npy')
val_features = extract_features_for_dataset(val_image_paths, feature_extractor, device, '/content/val_features.npy')

# 특징 벡터 배열 크기 확인
print(f"Train features shape: {train_features.shape}")
print(f"Validation features shape: {val_features.shape}")

# 학습 이미지와 검증 이미지의 라벨 추출
train_labels = get_labels(train_image_paths)
val_labels = get_labels(val_image_paths)

# 학습 이미지의 선호도 레이블 (숫자 형식으로 변환)
train_preferences = [
    1 if any(img in preferences_df['Training Style Preferred'].dropna().values for img in [label]) else 0
    for label in train_labels
]

# 검증 이미지의 라벨도 숫자 형식으로 변환
val_preferences = [
    1 if any(img in preferences_df['Validation Style Preferred'].dropna().values for img in [label]) else 0
    for label in val_labels
]

# 결과 분포 출력
print("Train Preferences 분포:", pd.Series(train_preferences).value_counts())
print("Validation Preferences 분포:", pd.Series(val_preferences).value_counts())

# 이후의 유사도 계산 및 선호도 예측 코드는 그대로 진행됩니다.


# GPU에서 코사인 유사도 계산 함수
def calculate_cosine_similarity_gpu(val_features, train_features):
    # numpy 배열을 torch 텐서로 변환
    val_features = torch.tensor(val_features).to(device)
    train_features = torch.tensor(train_features).to(device)

    # 빈 배열 또는 비정상 차원일 경우 예외 처리
    if val_features.numel() == 0 or train_features.numel() == 0:
        raise ValueError("Error: One or both of the feature arrays are empty or invalid.")

    # 2차원으로 변환 (3차원 이상의 텐서인 경우)
    if val_features.dim() > 2:
        val_features = val_features.view(val_features.size(0), -1)
    if train_features.dim() > 2:
        train_features = train_features.view(train_features.size(0), -1)

    # 정규화하여 벡터를 단위 벡터로 변환 (L2 노멀라이즈)
    val_norm = val_features / val_features.norm(dim=1, keepdim=True)
    train_norm = train_features / train_features.norm(dim=1, keepdim=True)

    # 코사인 유사도 계산
    similarity_scores = torch.mm(val_norm, train_norm.T).cpu().numpy()
    return similarity_scores

# 유사도를 기반으로 스타일 선호도 예측
def predict_style_preference(similarity_scores, train_preferences, threshold=0.6):
    predicted_preferences = []
    for score_row in similarity_scores:
        max_similarity = np.max(score_row)
        if max_similarity > threshold:
            most_similar_index = np.argmax(score_row)
            predicted_label = train_preferences[most_similar_index]
            predicted_preferences.append(predicted_label)
        else:
            predicted_preferences.append(0)  # 임계값보다 낮으면 비선호로 처리
    return predicted_preferences

# 정확도, 정밀도, 재현율, F1 점수 계산 함수
def calculate_metrics(predicted, actual):
    accuracy = accuracy_score(actual, predicted)
    precision, recall, f1, _ = precision_recall_fscore_support(actual, predicted, average='weighted')
    return accuracy, precision, recall, f1

# GPU에서 코사인 유사도 계산 및 스타일 선호도 예측
print("Calculating similarity...")
similarity_scores = calculate_cosine_similarity_gpu(val_features, train_features)

print("Predicting preferences...")
predicted_preferences = predict_style_preference(similarity_scores, train_preferences, threshold=0.5)

# 유사도, 예측 값, 실제 값 출력
print("Similarity scores sample:")
print(similarity_scores[:5])

print("Predicted Preferences Sample:", predicted_preferences[:10])
print("Validation Preferences Sample:", val_preferences[:10])

# 정확도, 정밀도, 재현율, F1 점수 계산
accuracy, precision, recall, f1 = calculate_metrics(predicted_preferences, val_preferences)

# 결과 출력
print(f"Prediction Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-Score: {f1:.4f}")

# 레이블 비교를 위한 차이점 계산
diff_count = sum([1 for pred, actual in zip(predicted_preferences, val_preferences) if pred != actual])
print(f"Prediction and validation labels difference count: {diff_count}")


Number of train_image_paths: 674
Number of val_image_paths: 442
Train과 Validation 데이터셋 간 중복 이미지 수: 0


Extracting Features: 100%|██████████| 674/674 [00:09<00:00, 73.49it/s] 
Extracting Features: 100%|██████████| 442/442 [00:03<00:00, 116.39it/s]


Train features shape: (674, 512, 7, 7)
Validation features shape: (442, 512, 7, 7)
Train Preferences 분포: 0    674
Name: count, dtype: int64
Validation Preferences 분포: 0    442
Name: count, dtype: int64
Calculating similarity...
Predicting preferences...
Similarity scores sample:
[[0.96352625 0.95435995 0.962906   ... 0.96968806 0.9568928  0.9613649 ]
 [0.96914303 0.95496446 0.96750534 ... 0.970698   0.9681988  0.96925116]
 [0.9632888  0.9543785  0.9693126  ... 0.97012174 0.96319926 0.96754843]
 [0.9752952  0.9611111  0.9694315  ... 0.9701168  0.96731836 0.96817917]
 [0.970234   0.96511286 0.97612244 ... 0.9713933  0.9606411  0.9806955 ]]
Predicted Preferences Sample: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
Validation Preferences Sample: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
Prediction Accuracy: 1.0000
Precision: 1.0000
Recall: 1.0000
F1-Score: 1.0000
Prediction and validation labels difference count: 0


In [None]:
import os
import re
import numpy as np
import pandas as pd
from PIL import Image
import torch
import torch.nn as nn
from torchvision import models, transforms
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from tqdm import tqdm

# ResNet 특징 추출기 클래스 정의 (fc 레이어 제외)
class ResNetFeatureExtractor(nn.Module):
    def __init__(self, model_path):
        super(ResNetFeatureExtractor, self).__init__()
        resnet = models.resnet18(pretrained=False)
        state_dict = torch.load(model_path)
        state_dict = {k: v for k, v in state_dict.items() if not k.startswith("fc.")}
        resnet.load_state_dict(state_dict, strict=False)
        self.features = nn.Sequential(*list(resnet.children())[:-2])

    def forward(self, x):
        return self.features(x)

# 이미지 전처리 파이프라인 정의
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

# 이미지 로드 및 전처리 함수
def load_and_preprocess_image(image_path, device):
    image = Image.open(image_path).convert('RGB')
    image = transform(image).unsqueeze(0).to(device)
    return image

# 데이터셋에서 이미지들의 특징 벡터를 추출하는 함수 및 저장
def extract_features_for_dataset(image_paths, feature_extractor, device, save_path):
    features = []
    feature_extractor.eval()
    with torch.no_grad():
        for image_path in tqdm(image_paths, desc="Extracting Features"):
            if os.path.exists(image_path):
                image_tensor = load_and_preprocess_image(image_path, device)
                feature_vector = feature_extractor(image_tensor).squeeze().cpu().numpy()
                features.append(feature_vector)
            else:
                print(f"File not found: {image_path}")
    features = np.array(features)
    np.save(save_path, features)  # 특징 벡터 저장
    return features

# 파일명에서 스타일(라벨)을 추출하는 함수
image_pattern = re.compile(r'^segmented_(W|T)_(\d+)_(.*?)_(.*?)_(W|M)\.jpg$')
def get_labels(image_paths):
    labels = []
    for filename in image_paths:
        match = image_pattern.match(os.path.basename(filename))
        if match:
            _, _, _, style, _ = match.groups()
            labels.append(style)
    return labels

# 수정된 함수: 파일명에 접두사 추가하여 경로 생성
def extract_image_paths_from_preferences(preference_column, image_dir):
    image_paths = []
    for img_list in preference_column.dropna():
        images = img_list.strip("[]").replace("'", "").split(", ")
        paths = [os.path.join(image_dir, f"segmented_{img.strip()}") for img in images if img.strip()]
        image_paths.extend(paths)
    return [img_path for img_path in image_paths if os.path.isfile(img_path)]

# 경로 설정
train_image_dir = '/content/drive/MyDrive/dataset/processed_segmentation_cleaned'
val_image_dir = '/content/drive/MyDrive/dataset/processed_segmentation_cleaned_for_val'
model_path = '/content/drive/MyDrive/데이터 크리에이터 캠프 최종 정리/김진/model_final5.pth'
top_100_file_path = '/content/drive/MyDrive/데이터 크리에이터 캠프 최종 정리/김진/preferences0.xlsx'
preferences_df = pd.read_excel(top_100_file_path)

# 엑셀 파일의 선호도 컬럼 샘플 출력
print("Sample of Training Style Preferred from Excel:")
print(preferences_df['Training Style Preferred'].head())
print("Sample of Validation Style Preferred from Excel:")
print(preferences_df['Validation Style Preferred'].head())

# 학습 및 검증 이미지 경로 추출
train_image_paths = extract_image_paths_from_preferences(preferences_df['Training Style Preferred'], train_image_dir)
val_image_paths = extract_image_paths_from_preferences(preferences_df['Validation Style Preferred'], val_image_dir)

# 경로 수 확인
print(f"Number of train_image_paths: {len(train_image_paths)}")
print(f"Number of val_image_paths: {len(val_image_paths)}")

# 중복 데이터 확인
common_images = set(train_image_paths) & set(val_image_paths)
print(f"Train과 Validation 데이터셋 간 중복 이미지 수: {len(common_images)}")

# 학습 및 검증 이미지의 특징 벡터 추출 및 저장
train_features = extract_features_for_dataset(train_image_paths, feature_extractor, device, '/content/train_features.npy')
val_features = extract_features_for_dataset(val_image_paths, feature_extractor, device, '/content/val_features.npy')

# 저장된 특징 벡터 파일 로드 및 크기 확인
loaded_train_features = np.load('/content/train_features.npy')
loaded_val_features = np.load('/content/val_features.npy')
print(f"Loaded Train features shape: {loaded_train_features.shape}")
print(f"Loaded Validation features shape: {loaded_val_features.shape}")

# 학습 이미지와 검증 이미지의 라벨 추출
train_labels = get_labels(train_image_paths)
val_labels = get_labels(val_image_paths)

# 파일명에서 접두어를 제거하는 함수 정의
def normalize_filename(filename):
    return filename.replace("segmented_", "")

# 학습 이미지의 선호도 레이블 (숫자 형식으로 변환)
train_preferences = []
for label in train_labels:
    normalized_label = normalize_filename(label)  # 접두어 제거
    preferred_list = preferences_df['Training Style Preferred'].dropna().tolist()
    is_preferred = any(normalized_label == img.strip() for item in preferred_list for img in item.split(", "))
    train_preferences.append(1 if is_preferred else 0)
print("Train Preferences 샘플:", train_preferences[:10])

# 검증 이미지의 라벨도 숫자 형식으로 변환
val_preferences = []
for label in val_labels:
    normalized_label = normalize_filename(label)  # 접두어 제거
    preferred_list = preferences_df['Validation Style Preferred'].dropna().tolist()
    is_preferred = any(normalized_label == img.strip() for item in preferred_list for img in item.split(", "))
    val_preferences.append(1 if is_preferred else 0)
print("Validation Preferences 샘플:", val_preferences[:10])

# 레이블 분포 출력
print("Train Preferences 분포:", pd.Series(train_preferences).value_counts())
print("Validation Preferences 분포:", pd.Series(val_preferences).value_counts())


# 이후의 유사도 계산 및 선호도 예측 코드는 그대로 진행됩니다.

# GPU에서 코사인 유사도 계산 함수
def calculate_cosine_similarity_gpu(val_features, train_features):
    # numpy 배열을 torch 텐서로 변환
    val_features = torch.tensor(val_features).to(device)
    train_features = torch.tensor(train_features).to(device)

    # 빈 배열 또는 비정상 차원일 경우 예외 처리
    if val_features.numel() == 0 or train_features.numel() == 0:
        raise ValueError("Error: One or both of the feature arrays are empty or invalid.")

    # 2차원으로 변환 (3차원 이상의 텐서인 경우)
    if val_features.dim() > 2:
        val_features = val_features.view(val_features.size(0), -1)
    if train_features.dim() > 2:
        train_features = train_features.view(train_features.size(0), -1)

    # 정규화하여 벡터를 단위 벡터로 변환 (L2 노멀라이즈)
    val_norm = val_features / val_features.norm(dim=1, keepdim=True)
    train_norm = train_features / train_features.norm(dim=1, keepdim=True)

    # 코사인 유사도 계산
    similarity_scores = torch.mm(val_norm, train_norm.T).cpu().numpy()
    return similarity_scores

# 유사도를 기반으로 스타일 선호도 예측
def predict_style_preference(similarity_scores, train_preferences, threshold=0.6):
    predicted_preferences = []
    for score_row in similarity_scores:
        max_similarity = np.max(score_row)
        if max_similarity > threshold:
            most_similar_index = np.argmax(score_row)
            predicted_label = train_preferences[most_similar_index]
            predicted_preferences.append(predicted_label)
        else:
            predicted_preferences.append(0)  # 임계값보다 낮으면 비선호로 처리
    return predicted_preferences

# 정확도, 정밀도, 재현율, F1 점수 계산 함수
def calculate_metrics(predicted, actual):
    accuracy = accuracy_score(actual, predicted)
    precision, recall, f1, _ = precision_recall_fscore_support(actual, predicted, average='weighted')
    return accuracy, precision, recall, f1

# GPU에서 코사인 유사도 계산 및 스타일 선호도 예측
print("Calculating similarity...")
similarity_scores = calculate_cosine_similarity_gpu(val_features, train_features)

print("Predicting preferences...")
predicted_preferences = predict_style_preference(similarity_scores, train_preferences, threshold=0.5)

# 유사도, 예측 값, 실제 값 출력
print("Similarity scores sample:")
print(similarity_scores[:5])

print("Predicted Preferences Sample:", predicted_preferences[:10])
print("Validation Preferences Sample:", val_preferences[:10])

# 정확도, 정밀도, 재현율, F1 점수 계산
accuracy, precision, recall, f1 = calculate_metrics(predicted_preferences, val_preferences)

# 결과 출력
print(f"Prediction Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-Score: {f1:.4f}")

# 레이블 비교를 위한 차이점 계산
diff_count = sum([1 for pred, actual in zip(predicted_preferences, val_preferences) if pred != actual])
print(f"Prediction and validation labels difference count: {diff_count}")


Sample of Training Style Preferred from Excel:
0    W_07098_19_normcore_M.jpg, W_16449_10_sportivecasual_M.jpg, W_16403_10_sportivecasual_M.jpg, W_07307_19_normcore_M.jpg, W_07260_10_sportivecasual_M.jpg, W_01394_10_sportivecasual_M.jpg, W_17702_90_hiphop_M.jpg, W_04687_00_metrosexual_M.jpg, W_04324_90_hiphop_M.jpg, W_02693_70_hippie_M.jpg, W_16445_50_ivy_M.jpg, W_05869_60_mods_M.jpg
1                                                                                                                                                                             W_02978_10_sportivecasual_M.jpg, W_02771_10_sportivecasual_M.jpg, W_00831_19_normcore_M.jpg, W_17273_19_normcore_M.jpg, W_06993_90_hiphop_M.jpg, W_01737_50_ivy_M.jpg
2                                                                                                                           W_01989_19_normcore_W.jpg, W_09698_19_genderless_W.jpg, W_19801_19_genderless_W.jpg, W_05786_10_sportivecasual_W.jpg, W_13163_90_kitsch_W.jpg, W_0755

Extracting Features: 100%|██████████| 674/674 [00:08<00:00, 83.92it/s] 
Extracting Features: 100%|██████████| 442/442 [00:03<00:00, 114.71it/s]


Loaded Train features shape: (674, 512, 7, 7)
Loaded Validation features shape: (442, 512, 7, 7)
Train Preferences 샘플: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
Validation Preferences 샘플: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
Train Preferences 분포: 0    674
Name: count, dtype: int64
Validation Preferences 분포: 0    442
Name: count, dtype: int64
Calculating similarity...
Predicting preferences...
Similarity scores sample:
[[0.96352625 0.95435995 0.962906   ... 0.96968806 0.9568928  0.9613649 ]
 [0.96914303 0.95496446 0.96750534 ... 0.970698   0.9681988  0.96925116]
 [0.9632888  0.9543785  0.9693126  ... 0.97012174 0.96319926 0.96754843]
 [0.9752952  0.9611111  0.9694315  ... 0.9701168  0.96731836 0.96817917]
 [0.970234   0.96511286 0.97612244 ... 0.9713933  0.9606411  0.9806955 ]]
Predicted Preferences Sample: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
Validation Preferences Sample: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
Prediction Accuracy: 1.0000
Precision: 1.0000
Recall: 1.0000
F1-Score: 1.0000
Prediction and validation la

In [None]:
import os
import re
import numpy as np
import pandas as pd
from PIL import Image
import torch
import torch.nn as nn
from torchvision import models, transforms
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from tqdm import tqdm

# ResNet 특징 추출기 클래스 정의 (fc 레이어 제외)
class ResNetFeatureExtractor(nn.Module):
    def __init__(self, model_path):
        super(ResNetFeatureExtractor, self).__init__()
        resnet = models.resnet18(pretrained=False)  # ResNet-18 모델 생성
        state_dict = torch.load(model_path)
        state_dict = {k: v for k, v in state_dict.items() if not k.startswith("fc.")}
        resnet.load_state_dict(state_dict, strict=False)
        self.features = nn.Sequential(*list(resnet.children())[:-2])

    def forward(self, x):
        return self.features(x)

# 이미지 전처리 파이프라인 정의
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

# 이미지 로드 및 전처리 함수
def load_and_preprocess_image(image_path, device):
    image = Image.open(image_path).convert('RGB')
    image = transform(image).unsqueeze(0).to(device)
    return image

# 데이터셋에서 이미지들의 특징 벡터를 추출하는 함수
def extract_features_for_dataset(image_paths, feature_extractor, device):
    features = []
    feature_extractor.eval()
    with torch.no_grad():
        for image_path in tqdm(image_paths, desc="Extracting Features"):
            if os.path.exists(image_path):
                image_tensor = load_and_preprocess_image(image_path, device)
                feature_vector = feature_extractor(image_tensor).squeeze().cpu().numpy()
                features.append(feature_vector)
            else:
                print(f"File not found: {image_path}")
    return np.array(features)

# 파일명에서 스타일(라벨)을 추출하는 함수
image_pattern = re.compile(r'^segmented_(W|T)_(\d+)_(.*?)_(.*?)_(W|M)\.jpg$')
def get_labels(image_paths):
    labels = []
    for filename in image_paths:
        match = image_pattern.match(os.path.basename(filename))
        if match:
            _, _, _, style, _ = match.groups()
            labels.append(style)
    return labels

# 경로 설정
train_image_dir = '/content/drive/MyDrive/dataset/processed_segmentation_cleaned'
val_image_dir = '/content/drive/MyDrive/dataset/processed_segmentation_cleaned_for_val'
model_path = '/content/drive/MyDrive/데이터 크리에이터 캠프 최종 정리/김진/model_final5.pth'  # 학습된 가중치 경로 설정

# ResNet 특징 추출기 초기화 및 가중치 로드
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
feature_extractor = ResNetFeatureExtractor(model_path).to(device)

# 엑셀 파일에서 데이터를 읽고 분리
top_100_file_path = '/content/drive/MyDrive/데이터 크리에이터 캠프 최종 정리/김진/preferences0.xlsx'
preferences_df = pd.read_excel(top_100_file_path)

# 선호도를 구분할 기준 설정 (예: 특정 스타일에 대해 선호도 구분)
def assign_preferences(styles):
    preferences = []
    for style in styles:
        if "specific_style" in style:  # 예시 기준: 특정 스타일 이름 포함 여부
            preferences.append(1)  # 선호
        else:
            preferences.append(0)  # 비선호
    return preferences

# Training Style Preferred와 Validation Style Preferred에서 파일명 추출 및 선호도 부여
train_style = []
val_style = []
for items in preferences_df['Training Style Preferred'].dropna():
    item_list = items.strip("[]").replace("'", "").split(", ")
    train_style.extend(item_list)

for items in preferences_df['Validation Style Preferred'].dropna():
    item_list = items.strip("[]").replace("'", "").split(", ")
    val_style.extend(item_list)

# 이미지 경로로 변환
train_image_paths = [os.path.join(train_image_dir, f"segmented_{img.strip()}") for img in train_style]
val_image_paths = [os.path.join(val_image_dir, f"segmented_{img.strip()}") for img in val_style]

# 학습 및 검증 데이터 선호도 할당
train_preferences = assign_preferences(train_style)
val_preferences = assign_preferences(val_style)
# 경로 및 중복 확인
print(f"Number of train_image_paths: {len(train_image_paths)}")
print(f"Number of val_image_paths: {len(val_image_paths)}")
common_images = set(train_image_paths) & set(val_image_paths)
print(f"Train과 Validation 데이터셋 간 중복 이미지 수: {len(common_images)}")

# 학습 및 검증 이미지의 특징 벡터 추출
train_features = extract_features_for_dataset(train_image_paths, feature_extractor, device)
val_features = extract_features_for_dataset(val_image_paths, feature_extractor, device)
print(f"Train features shape: {train_features.shape}")
print(f"Validation features shape: {val_features.shape}")

# 학습 및 검증 이미지의 라벨 추출
train_labels = get_labels(train_image_paths)
val_labels = get_labels(val_image_paths)

# 학습 및 검증 이미지의 선호도 레이블 (숫자 형식으로 변환)
train_preferences = [1] * len(train_labels)  # 선호 스타일이므로 1
val_preferences = [1] * len(val_labels)  # 선호 스타일이므로 1

# 결과 분포 출력
print("Train Preferences 분포:", pd.Series(train_preferences).value_counts())
print("Validation Preferences 분포:", pd.Series(val_preferences).value_counts())

# GPU에서 코사인 유사도 계산 함수
def calculate_cosine_similarity_gpu(val_features, train_features):
    # numpy 배열을 torch 텐서로 변환 및 2차원 형태로 변환
    val_features = torch.tensor(val_features).to(device)
    train_features = torch.tensor(train_features).to(device)

    # 차원 축소: (batch_size, 512 * 7 * 7)
    val_features = val_features.view(val_features.size(0), -1)
    train_features = train_features.view(train_features.size(0), -1)

    # 정규화하여 벡터를 단위 벡터로 변환 (L2 노멀라이즈)
    val_norm = val_features / val_features.norm(dim=1, keepdim=True)
    train_norm = train_features / train_features.norm(dim=1, keepdim=True)

    # 코사인 유사도 계산
    similarity_scores = torch.mm(val_norm, train_norm.T).cpu().numpy()
    return similarity_scores


# 유사도를 기반으로 스타일 선호도 예측
def predict_style_preference(similarity_scores, train_preferences, threshold=0.6):
    predicted_preferences = []
    for score_row in similarity_scores:
        max_similarity = np.max(score_row)
        if max_similarity > threshold:
            most_similar_index = np.argmax(score_row)
            predicted_label = train_preferences[most_similar_index]
            predicted_preferences.append(predicted_label)
        else:
            predicted_preferences.append(0)
    return predicted_preferences

# 정확도, 정밀도, 재현율, F1 점수 계산 함수
def calculate_metrics(predicted, actual):
    accuracy = accuracy_score(actual, predicted)
    precision, recall, f1, _ = precision_recall_fscore_support(actual, predicted, average='weighted')
    return accuracy, precision, recall, f1

# GPU에서 코사인 유사도 계산 및 스타일 선호도 예측
print("Calculating similarity...")
similarity_scores = calculate_cosine_similarity_gpu(val_features, train_features)

print("Predicting preferences...")
predicted_preferences = predict_style_preference(similarity_scores, train_preferences, threshold=0.5)

# 예측 값, 실제 값 출력
print("Similarity scores sample:")
print(similarity_scores[:5])
print("Predicted Preferences Sample:", predicted_preferences[:10])
print("Validation Preferences Sample:", val_preferences[:10])

# 정확도, 정밀도, 재현율, F1 점수 계산
accuracy, precision, recall, f1 = calculate_metrics(predicted_preferences, val_preferences)
print(f"Prediction Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-Score: {f1:.4f}")
diff_count = sum([1 for pred, actual in zip(predicted_preferences, val_preferences) if pred != actual])
print(f"Prediction and validation labels difference count: {diff_count}")


  state_dict = torch.load(model_path)


Number of train_image_paths: 674
Number of val_image_paths: 442
Train과 Validation 데이터셋 간 중복 이미지 수: 0


Extracting Features: 100%|██████████| 674/674 [00:13<00:00, 51.16it/s]
Extracting Features: 100%|██████████| 442/442 [00:04<00:00, 105.12it/s]


Train features shape: (674, 512, 7, 7)
Validation features shape: (442, 512, 7, 7)
Train Preferences 분포: 1    674
Name: count, dtype: int64
Validation Preferences 분포: 1    442
Name: count, dtype: int64
Calculating similarity...
Predicting preferences...
Similarity scores sample:
[[0.9804314  0.977074   0.98327285 ... 0.99003124 0.9762687  0.98570466]
 [0.9857222  0.97977865 0.9822368  ... 0.98882514 0.98261434 0.9863326 ]
 [0.9801007  0.9769698  0.983831   ... 0.98702765 0.97866565 0.9871478 ]
 [0.98965466 0.98082674 0.9777789  ... 0.9851641  0.98269784 0.98216987]
 [0.9763631  0.9734934  0.98768944 ... 0.98569    0.9712979  0.991022  ]]
Predicted Preferences Sample: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
Validation Preferences Sample: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
Prediction Accuracy: 1.0000
Precision: 1.0000
Recall: 1.0000
F1-Score: 1.0000
Prediction and validation labels difference count: 0


In [None]:
import os
import re
import numpy as np
import pandas as pd
from PIL import Image
import torch
import torch.nn as nn
from torchvision import models, transforms
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from tqdm import tqdm

# ResNet 특징 추출기 클래스 정의 (fc 레이어 제외)
class ResNetFeatureExtractor(nn.Module):
    def __init__(self, model_path):
        super(ResNetFeatureExtractor, self).__init__()
        resnet = models.resnet18(pretrained=False)
        state_dict = torch.load(model_path)
        state_dict = {k: v for k, v in state_dict.items() if not k.startswith("fc.")}
        resnet.load_state_dict(state_dict, strict=False)
        self.features = nn.Sequential(*list(resnet.children())[:-2])

    def forward(self, x):
        return self.features(x)

# 이미지 전처리 파이프라인 정의
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

# 이미지 로드 및 전처리 함수
def load_and_preprocess_image(image_path, device):
    image = Image.open(image_path).convert('RGB')
    image = transform(image).unsqueeze(0).to(device)
    return image

# 데이터셋에서 이미지들의 특징 벡터를 추출하는 함수
def extract_features_for_dataset(image_paths, feature_extractor, device):
    features = []
    feature_extractor.eval()
    with torch.no_grad():
        for image_path in tqdm(image_paths, desc="Extracting Features"):
            if os.path.exists(image_path):
                image_tensor = load_and_preprocess_image(image_path, device)
                feature_vector = feature_extractor(image_tensor).squeeze().cpu().numpy()
                features.append(feature_vector)
            else:
                print(f"File not found: {image_path}")
    return np.array(features)

# 파일명에서 스타일(라벨)을 추출하는 함수
image_pattern = re.compile(r'^segmented_(W|T)_(\d+)_(.*?)_(.*?)_(W|M)\.jpg$')
def get_labels(image_paths):
    labels = []
    for filename in image_paths:
        match = image_pattern.match(os.path.basename(filename))
        if match:
            _, _, _, style, _ = match.groups()
            labels.append(style)
    return labels

# 클래스 목록 (성별 + 스타일 조합으로 구성)
fashion_classes = [
    "남성_bold", "남성_hiphop", "남성_hippie", "남성_ivy", "남성_metrosexual",
    "남성_mods", "남성_normcore", "남성_sportivecasual", "여성_athleisure",
    "여성_bodyconscious", "여성_cityglam", "여성_classic", "여성_disco",
    "여성_ecology", "여성_feminine", "여성_genderless", "여성_grunge",
    "여성_hiphop", "여성_hippie", "여성_kitsch", "여성_lingerie",
    "여성_lounge", "여성_military", "여성_minimal", "여성_normcore",
    "여성_oriental", "여성_popart", "여성_powersuit", "여성_punk",
    "여성_space", "여성_sportivecasual"
]

# 선호도를 구분할 기준 설정
def assign_preferences(styles):
    preferences = []
    for style in styles:
        gender, style_name = style.split("_")[-2:]  # 파일명으로부터 성별 및 스타일 정보 추출
        label = f"{'남성' if gender == 'M' else '여성'}_{style_name}"
        if label in fashion_classes:
            preferences.append(1)  # 선호
        else:
            preferences.append(0)  # 비선호
    return preferences

# 경로 설정
train_image_dir = '/content/drive/MyDrive/dataset/processed_segmentation_cleaned'
val_image_dir = '/content/drive/MyDrive/dataset/processed_segmentation_cleaned_for_val'
model_path = '/content/drive/MyDrive/데이터 크리에이터 캠프 최종 정리/김진/model_final5.pth'  # 학습된 가중치 경로 설정

# ResNet 특징 추출기 초기화 및 가중치 로드
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
feature_extractor = ResNetFeatureExtractor(model_path).to(device)

# 엑셀 파일에서 데이터를 읽고 분리
top_100_file_path = '/content/drive/MyDrive/데이터 크리에이터 캠프 최종 정리/김진/preferences0.xlsx'
preferences_df = pd.read_excel(top_100_file_path)

# Training Style Preferred와 Validation Style Preferred에서 파일명 추출 및 선호도 부여
train_style = []
val_style = []
for items in preferences_df['Training Style Preferred'].dropna():
    item_list = items.strip("[]").replace("'", "").split(", ")
    train_style.extend(item_list)

for items in preferences_df['Validation Style Preferred'].dropna():
    item_list = items.strip("[]").replace("'", "").split(", ")
    val_style.extend(item_list)

# 이미지 경로로 변환
train_image_paths = [os.path.join(train_image_dir, f"segmented_{img.strip()}") for img in train_style]
val_image_paths = [os.path.join(val_image_dir, f"segmented_{img.strip()}") for img in val_style]

# 학습 및 검증 데이터 선호도 할당
train_preferences = assign_preferences(train_style)
val_preferences = assign_preferences(val_style)

# 경로 및 중복 확인
print(f"Number of train_image_paths: {len(train_image_paths)}")
print(f"Number of val_image_paths: {len(val_image_paths)}")
common_images = set(train_image_paths) & set(val_image_paths)
print(f"Train과 Validation 데이터셋 간 중복 이미지 수: {len(common_images)}")

# 학습 및 검증 이미지의 특징 벡터 추출
train_features = extract_features_for_dataset(train_image_paths, feature_extractor, device)
val_features = extract_features_for_dataset(val_image_paths, feature_extractor, device)
print(f"Train features shape: {train_features.shape}")
print(f"Validation features shape: {val_features.shape}")

# GPU에서 코사인 유사도 계산 함수
def calculate_cosine_similarity_gpu(val_features, train_features):
    # numpy 배열을 torch 텐서로 변환 및 2차원 형태로 변환
    val_features = torch.tensor(val_features).to(device)
    train_features = torch.tensor(train_features).to(device)

    # 차원 축소: (batch_size, 512 * 7 * 7)
    val_features = val_features.view(val_features.size(0), -1)
    train_features = train_features.view(train_features.size(0), -1)

    # 정규화하여 벡터를 단위 벡터로 변환 (L2 노멀라이즈)
    val_norm = val_features / val_features.norm(dim=1, keepdim=True)
    train_norm = train_features / train_features.norm(dim=1, keepdim=True)

    # 코사인 유사도 계산
    similarity_scores = torch.mm(val_norm, train_norm.T).cpu().numpy()
    return similarity_scores

# 유사도를 기반으로 스타일 선호도 예측
def predict_style_preference(similarity_scores, train_preferences, threshold=0.6):
    predicted_preferences = []
    for score_row in similarity_scores:
        max_similarity = np.max(score_row)
        if max_similarity > threshold:
            most_similar_index = np.argmax(score_row)
            predicted_label = train_preferences[most_similar_index]
            predicted_preferences.append(predicted_label)
        else:
            predicted_preferences.append(0)
    return predicted_preferences

# 정확도, 정밀도, 재현율, F1 점수 계산 함수
def calculate_metrics(predicted, actual):
    accuracy = accuracy_score(actual, predicted)
    precision, recall, f1, _ = precision_recall_fscore_support(actual, predicted, average='weighted')
    return accuracy, precision, recall, f1

# GPU에서 코사인 유사도 계산 및 스타일 선호도 예측
print("Calculating similarity...")
similarity_scores = calculate_cosine_similarity_gpu(val_features, train_features)

print("Predicting preferences...")
predicted_preferences = predict_style_preference(similarity_scores, train_preferences, threshold=0.5)

# 예측 값, 실제 값 출력
print("Similarity scores sample:")
print(similarity_scores[:5])
print("Predicted Preferences Sample:", predicted_preferences[:10])
print("Validation Preferences Sample:", val_preferences[:10])

# 정확도, 정밀도, 재현율, F1 점수 계산
accuracy, precision, recall, f1 = calculate_metrics(predicted_preferences, val_preferences)
print(f"Prediction Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-Score: {f1:.4f}")

# 예측 및 검증 레이블 간 차이점 계산
diff_count = sum([1 for pred, actual in zip(predicted_preferences, val_preferences) if pred != actual])
print(f"Prediction and validation labels difference count: {diff_count}")


  state_dict = torch.load(model_path)


Number of train_image_paths: 674
Number of val_image_paths: 442
Train과 Validation 데이터셋 간 중복 이미지 수: 0


Extracting Features: 100%|██████████| 674/674 [00:06<00:00, 105.11it/s]
Extracting Features: 100%|██████████| 442/442 [00:04<00:00, 104.23it/s]


Train features shape: (674, 512, 7, 7)
Validation features shape: (442, 512, 7, 7)
Calculating similarity...
Predicting preferences...
Similarity scores sample:
[[0.94504374 0.92273325 0.9347446  ... 0.94943714 0.93698066 0.9287985 ]
 [0.956121   0.9233682  0.95403135 ... 0.958176   0.95452243 0.95237833]
 [0.94997996 0.9239968  0.9522519  ... 0.95290667 0.9466649  0.94507134]
 [0.96342987 0.93958867 0.9505069  ... 0.95801425 0.95531505 0.9392491 ]
 [0.9486501  0.9259146  0.9610124  ... 0.94613206 0.9402099  0.9654377 ]]
Predicted Preferences Sample: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
Validation Preferences Sample: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
Prediction Accuracy: 1.0000
Precision: 1.0000
Recall: 1.0000
F1-Score: 1.0000
Prediction and validation labels difference count: 0


In [None]:
import os
import re
import numpy as np
import pandas as pd
from PIL import Image
import torch
import torch.nn as nn
from torchvision import models, transforms
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from tqdm import tqdm

# ResNet 특징 추출기 클래스 정의
class ResNetFeatureExtractor(nn.Module):
    def __init__(self, model_path):
        super(ResNetFeatureExtractor, self).__init__()
        resnet = models.resnet18(pretrained=False)
        state_dict = torch.load(model_path)
        state_dict = {k: v for k, v in state_dict.items() if not k.startswith("fc.")}
        resnet.load_state_dict(state_dict, strict=False)
        self.features = nn.Sequential(*list(resnet.children())[:-2])

    def forward(self, x):
        return self.features(x)

# 이미지 전처리 파이프라인
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

def load_and_preprocess_image(image_path, device):
    image = Image.open(image_path).convert('RGB')
    return transform(image).unsqueeze(0).to(device)

def extract_features_for_dataset(image_paths, feature_extractor, device):
    features = []
    feature_extractor.eval()
    with torch.no_grad():
        for image_path in tqdm(image_paths, desc="Extracting Features"):
            if os.path.exists(image_path):
                image_tensor = load_and_preprocess_image(image_path, device)
                feature_vector = feature_extractor(image_tensor).squeeze().cpu().numpy()
                features.append(feature_vector)
    return np.array(features)

# 엑셀 파일에서 스타일 선호/비선호 추출 함수 (eval 사용하지 않음)
def load_styles_from_excel(file_path):
    df = pd.read_excel(file_path)
    train_style = []
    val_style = []
    for items in df['Training Style Preferred'].dropna():
        item_list = items.strip("[]").replace("'", "").split(", ")
        train_style.extend(item_list)

    for items in df['Validation Style Preferred'].dropna():
        item_list = items.strip("[]").replace("'", "").split(", ")
        val_style.extend(item_list)
    return train_style, val_style

# 이미지 경로 설정
train_image_dir = '/content/drive/MyDrive/dataset/processed_segmentation_cleaned'
val_image_dir = '/content/drive/MyDrive/dataset/processed_segmentation_cleaned_for_val'
model_path = '/content/drive/MyDrive/데이터 크리에이터 캠프 최종 정리/김진/model_final5.pth'
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
feature_extractor = ResNetFeatureExtractor(model_path).to(device)

# 엑셀 파일 로드 및 이미지 경로 생성
top_100_file_path = '/content/drive/MyDrive/데이터 크리에이터 캠프 최종 정리/김진/preferences0.xlsx'
train_style, val_style = load_styles_from_excel(top_100_file_path)
train_image_paths = [os.path.join(train_image_dir, f"segmented_{img.strip()}") for img in train_style]
val_image_paths = [os.path.join(val_image_dir, f"segmented_{img.strip()}") for img in val_style]

# 피처 추출
train_features = extract_features_for_dataset(train_image_paths, feature_extractor, device)
val_features = extract_features_for_dataset(val_image_paths, feature_extractor, device)

# 선호도 레이블 설정 (선호: 1, 비선호: 0)
train_preferences = [1] * len(train_style)
val_preferences = [1] * len(val_style)  # 초기값을 1로 설정

def calculate_cosine_similarity_gpu(val_features, train_features):
    val_features = torch.tensor(val_features).to(device)
    train_features = torch.tensor(train_features).to(device)
    val_features = val_features.view(val_features.size(0), -1)
    train_features = train_features.view(train_features.size(0), -1)
    val_norm = val_features / val_features.norm(dim=1, keepdim=True)
    train_norm = train_features / train_features.norm(dim=1, keepdim=True)
    return torch.mm(val_norm, train_norm.T).cpu().numpy()

def predict_style_preference(similarity_scores, train_preferences, threshold=0.8):
    return [1 if np.max(score_row) > threshold else 0 for score_row in similarity_scores]

# 유사도 계산
similarity_scores = calculate_cosine_similarity_gpu(val_features, train_features)
predicted_preferences = predict_style_preference(similarity_scores, train_preferences)

# 성능 평가
accuracy, precision, recall, f1 = calculate_metrics(predicted_preferences, val_preferences)
print(f"Accuracy: {accuracy:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, F1-Score: {f1:.4f}")


  state_dict = torch.load(model_path)
Extracting Features: 100%|██████████| 674/674 [00:07<00:00, 86.11it/s]
Extracting Features: 100%|██████████| 442/442 [00:04<00:00, 101.71it/s]


Accuracy: 1.0000, Precision: 1.0000, Recall: 1.0000, F1-Score: 1.0000


In [None]:
import os
import re
import numpy as np
import pandas as pd
from PIL import Image
import torch
import torch.nn as nn
from torchvision import models, transforms
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from tqdm import tqdm

# ResNet 특징 추출기 클래스 정의 (fc 레이어 제외)
class ResNetFeatureExtractor(nn.Module):
    def __init__(self, model_path):
        super(ResNetFeatureExtractor, self).__init__()
        resnet = models.resnet18(pretrained=False)
        state_dict = torch.load(model_path)
        state_dict = {k: v for k, v in state_dict.items() if not k.startswith("fc.")}
        resnet.load_state_dict(state_dict, strict=False)
        self.features = nn.Sequential(*list(resnet.children())[:-2])

    def forward(self, x):
        return self.features(x)

# 이미지 전처리 파이프라인 정의
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

# 이미지 로드 및 전처리 함수
def load_and_preprocess_image(image_path, device):
    image = Image.open(image_path).convert('RGB')
    image = transform(image).unsqueeze(0).to(device)
    return image

# 데이터셋에서 이미지들의 특징 벡터를 추출하는 함수
def extract_features_for_dataset(image_paths, feature_extractor, device):
    features = []
    feature_extractor.eval()
    with torch.no_grad():
        for image_path in tqdm(image_paths, desc="Extracting Features"):
            if os.path.exists(image_path):
                image_tensor = load_and_preprocess_image(image_path, device)
                feature_vector = feature_extractor(image_tensor).squeeze().cpu().numpy()
                features.append(feature_vector)
            else:
                print(f"File not found: {image_path}")
    return np.array(features)

# 클래스 목록 (성별 + 스타일 조합으로 구성)
fashion_classes = [
    "남성_bold", "남성_hiphop", "남성_hippie", "남성_ivy", "남성_metrosexual",
    "남성_mods", "남성_normcore", "남성_sportivecasual", "여성_athleisure",
    "여성_bodyconscious", "여성_cityglam", "여성_classic", "여성_disco",
    "여성_ecology", "여성_feminine", "여성_genderless", "여성_grunge",
    "여성_hiphop", "여성_hippie", "여성_kitsch", "여성_lingerie",
    "여성_lounge", "여성_military", "여성_minimal", "여성_normcore",
    "여성_oriental", "여성_popart", "여성_powersuit", "여성_punk",
    "여성_space", "여성_sportivecasual"
]

# 선호도를 구분할 기준 설정
def assign_preferences(styles):
    preferences = []
    for style in styles:
        gender, style_name = style.split("_")[-2:]  # 파일명으로부터 성별 및 스타일 정보 추출
        label = f"{'남성' if gender == 'M' else '여성'}_{style_name}"
        if label in fashion_classes:
            preferences.append(1)  # 선호
        else:
            preferences.append(0)  # 비선호
    return preferences

# 경로 설정
train_image_dir = '/content/drive/MyDrive/dataset/processed_segmentation_cleaned'
val_image_dir = '/content/drive/MyDrive/dataset/processed_segmentation_cleaned_for_val'
model_path = '/content/drive/MyDrive/데이터 크리에이터 캠프 최종 정리/김진/model_final5.pth'  # 학습된 가중치 경로 설정

# ResNet 특징 추출기 초기화 및 가중치 로드
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
feature_extractor = ResNetFeatureExtractor(model_path).to(device)

# 엑셀 파일에서 데이터를 읽고 분리
top_100_file_path = '/content/drive/MyDrive/데이터 크리에이터 캠프 최종 정리/김진/preferences0.xlsx'
preferences_df = pd.read_excel(top_100_file_path)

# Training Style Preferred와 Validation Style Preferred에서 파일명 추출 및 선호도 부여
train_style = []
val_style = []
for items in preferences_df['Training Style Preferred'].dropna():
    item_list = items.strip("[]").replace("'", "").split(", ")
    train_style.extend(item_list)

for items in preferences_df['Validation Style Preferred'].dropna():
    item_list = items.strip("[]").replace("'", "").split(", ")
    val_style.extend(item_list)

# 이미지 경로로 변환
train_image_paths = [os.path.join(train_image_dir, f"segmented_{img.strip()}") for img in train_style]
val_image_paths = [os.path.join(val_image_dir, f"segmented_{img.strip()}") for img in val_style]

# 학습 및 검증 데이터 선호도 할당
train_preferences = assign_preferences(train_style)
val_preferences = assign_preferences(val_style)

# 데이터 분포 확인
print("Train Preferences 분포:", pd.Series(train_preferences).value_counts())
print("Validation Preferences 분포:", pd.Series(val_preferences).value_counts())

# 학습 및 검증 이미지의 특징 벡터 추출
train_features = extract_features_for_dataset(train_image_paths, feature_extractor, device)
val_features = extract_features_for_dataset(val_image_paths, feature_extractor, device)
print(f"Train features shape: {train_features.shape}")
print(f"Validation features shape: {val_features.shape}")

# GPU에서 코사인 유사도 계산 함수
def calculate_cosine_similarity_gpu(val_features, train_features):
    # numpy 배열을 torch 텐서로 변환 및 2차원 형태로 변환
    val_features = torch.tensor(val_features).to(device)
    train_features = torch.tensor(train_features).to(device)

    # 차원 축소: (batch_size, 512 * 7 * 7)
    val_features = val_features.view(val_features.size(0), -1)
    train_features = train_features.view(train_features.size(0), -1)

    # 정규화하여 벡터를 단위 벡터로 변환 (L2 노멀라이즈)
    val_norm = val_features / val_features.norm(dim=1, keepdim=True)
    train_norm = train_features / train_features.norm(dim=1, keepdim=True)

    # 코사인 유사도 계산
    similarity_scores = torch.mm(val_norm, train_norm.T).cpu().numpy()
    return similarity_scores

# 유사도를 기반으로 스타일 선호도 예측
def predict_style_preference(similarity_scores, train_preferences, threshold=0.7):
    predicted_preferences = []
    for score_row in similarity_scores:
        max_similarity = np.max(score_row)
        if max_similarity > threshold:
            most_similar_index = np.argmax(score_row)
            predicted_label = train_preferences[most_similar_index]
            predicted_preferences.append(predicted_label)
        else:
            predicted_preferences.append(0)
    return predicted_preferences

# 정확도, 정밀도, 재현율, F1 점수 계산 함수
def calculate_metrics(predicted, actual):
    accuracy = accuracy_score(actual, predicted)
    precision, recall, f1, _ = precision_recall_fscore_support(actual, predicted, average='weighted')
    return accuracy, precision, recall, f1

# GPU에서 코사인 유사도 계산 및 스타일 선호도 예측
print("Calculating similarity...")
similarity_scores = calculate_cosine_similarity_gpu(val_features, train_features)

print("Predicting preferences...")
predicted_preferences = predict_style_preference(similarity_scores, train_preferences, threshold=0.7)

# 예측 값, 실제 값 출력
print("Similarity scores sample:")
print(similarity_scores[:5])  # 유사도 점수의 일부를 출력
print("Predicted Preferences Sample:", predicted_preferences[:10])  # 예측 결과 일부 출력
print("Validation Preferences Sample:", val_preferences[:10])  # 실제 레이블 일부 출력

# 정확도, 정밀도, 재현율, F1 점수 계산
accuracy, precision, recall, f1 = calculate_metrics(predicted_preferences, val_preferences)
print(f"Prediction Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-Score: {f1:.4f}")

# 예측 및 검증 레이블 간 차이점 계산
diff_count = sum([1 for pred, actual in zip(predicted_preferences, val_preferences) if pred != actual])
print(f"Prediction and validation labels difference count: {diff_count}")



  state_dict = torch.load(model_path)


Train Preferences 분포: 0    674
Name: count, dtype: int64
Validation Preferences 분포: 0    442
Name: count, dtype: int64


Extracting Features: 100%|██████████| 674/674 [00:07<00:00, 91.34it/s] 
Extracting Features: 100%|██████████| 442/442 [00:04<00:00, 109.41it/s]


Train features shape: (674, 512, 7, 7)
Validation features shape: (442, 512, 7, 7)
Calculating similarity...
Predicting preferences...
Similarity scores sample:
[[0.95333195 0.94281924 0.94824344 ... 0.9614783  0.9497949  0.9460572 ]
 [0.9601681  0.94528604 0.9581765  ... 0.96413857 0.95836973 0.9583902 ]
 [0.9575387  0.9421646  0.9573164  ... 0.9618231  0.95690095 0.95695454]
 [0.9682179  0.9525788  0.96047443 ... 0.9632424  0.9612097  0.9552729 ]
 [0.9622929  0.9527761  0.96820956 ... 0.961179   0.9569083  0.9711716 ]]
Predicted Preferences Sample: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
Validation Preferences Sample: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
Prediction Accuracy: 1.0000
Precision: 1.0000
Recall: 1.0000
F1-Score: 1.0000
Prediction and validation labels difference count: 0


In [None]:
import os
import numpy as np
import pandas as pd
from PIL import Image
import torch
import torch.nn as nn
from torchvision import models, transforms
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from tqdm import tqdm

# ResNet 특징 추출기 클래스 정의 (fc 레이어 제외)
class ResNetFeatureExtractor(nn.Module):
    def __init__(self, model_path):
        super(ResNetFeatureExtractor, self).__init__()
        resnet = models.resnet18(pretrained=False)
        state_dict = torch.load(model_path)
        state_dict = {k: v for k, v in state_dict.items() if not k.startswith("fc.")}
        resnet.load_state_dict(state_dict, strict=False)
        self.features = nn.Sequential(*list(resnet.children())[:-2])

    def forward(self, x):
        return self.features(x)

# 이미지 전처리 파이프라인 정의
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

# 이미지 로드 및 전처리 함수
def load_and_preprocess_image(image_path, device):
    image = Image.open(image_path).convert('RGB')
    image = transform(image).unsqueeze(0).to(device)
    return image

# 데이터셋에서 이미지들의 특징 벡터를 추출하는 함수
def extract_features_for_dataset(image_paths, feature_extractor, device):
    features = []
    feature_extractor.eval()
    with torch.no_grad():
        for image_path in tqdm(image_paths, desc="Extracting Features"):
            if os.path.exists(image_path):
                image_tensor = load_and_preprocess_image(image_path, device)
                feature_vector = feature_extractor(image_tensor).squeeze().cpu().numpy()
                features.append(feature_vector)
            else:
                print(f"File not found: {image_path}")
    return np.array(features)

# 클래스 목록 (성별 + 스타일 조합으로 구성)
fashion_classes = [
    "남성_bold", "남성_hiphop", "남성_hippie", "남성_ivy", "남성_metrosexual",
    "남성_mods", "남성_normcore", "남성_sportivecasual", "여성_athleisure",
    "여성_bodyconscious", "여성_cityglam", "여성_classic", "여성_disco",
    "여성_ecology", "여성_feminine", "여성_genderless", "여성_grunge",
    "여성_hiphop", "여성_hippie", "여성_kitsch", "여성_lingerie",
    "여성_lounge", "여성_military", "여성_minimal", "여성_normcore",
    "여성_oriental", "여성_popart", "여성_powersuit", "여성_punk",
    "여성_space", "여성_sportivecasual"
]

# 엑셀 파일에서 데이터 추출 함수
def extract_preferences_from_excel(file_path):
    preferences_dict = {}
    preferences_df = pd.read_excel(file_path)

    for idx, row in preferences_df.iterrows():
        respondent_id = row['Respondent ID']
        training_preferred = row['Training Style Preferred']
        training_not_preferred = row['Training Style Not Preferred']
        validation_preferred = row['Validation Style Preferred']
        validation_not_preferred = row['Validation Style Not Preferred']

        # 리스트 형태로 변환
        training_preferred_list = training_preferred.strip("[]").replace("'", "").split(", ") if pd.notna(training_preferred) else []
        training_not_preferred_list = training_not_preferred.strip("[]").replace("'", "").split(", ") if pd.notna(training_not_preferred) else []
        validation_preferred_list = validation_preferred.strip("[]").replace("'", "").split(", ") if pd.notna(validation_preferred) else []
        validation_not_preferred_list = validation_not_preferred.strip("[]").replace("'", "").split(", ") if pd.notna(validation_not_preferred) else []

        preferences_dict[respondent_id] = {
            'training_preferred': training_preferred_list,
            'training_not_preferred': training_not_preferred_list,
            'validation_preferred': validation_preferred_list,
            'validation_not_preferred': validation_not_preferred_list
        }

    return preferences_dict

# 경로 설정
train_image_dir = '/content/drive/MyDrive/dataset/processed_segmentation_cleaned'
val_image_dir = '/content/drive/MyDrive/dataset/processed_segmentation_cleaned_for_val'
model_path = '/content/drive/MyDrive/데이터 크리에이터 캠프 최종 정리/김진/model_final5.pth'  # 학습된 가중치 경로 설정

# ResNet 특징 추출기 초기화 및 가중치 로드
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
feature_extractor = ResNetFeatureExtractor(model_path).to(device)

# 엑셀 파일에서 선호 및 비선호 이미지 리스트 추출
top_100_file_path = '/content/drive/MyDrive/데이터 크리에이터 캠프 최종 정리/김진/preferences0.xlsx'
preferences_dict = extract_preferences_from_excel(top_100_file_path)

# Training Style Preferred 리스트 추출
train_style = []
for respondent_id, prefs in preferences_dict.items():
    train_style.extend(prefs['training_preferred'])

# Validation Style Preferred 리스트 추출 및 스타일 선호도 할당
val_style = []
val_labels = []  # 검증 레이블 리스트 초기화
for respondent_id, prefs in preferences_dict.items():
    val_style.extend(prefs['validation_preferred'])
    # Validation Preferred는 1, Validation Not Preferred는 0으로 설정
    val_labels.extend([1] * len(prefs['validation_preferred']))
    val_labels.extend([0] * len(prefs['validation_not_preferred']))

# 이미지 경로로 변환 (segmented_가 추가됨)
train_image_paths = [os.path.join(train_image_dir, f"segmented_{img.strip()}") for img in train_style]
val_image_paths = [os.path.join(val_image_dir, f"segmented_{img.strip()}") for img in val_style]

# 학습 및 검증 데이터 선호도 할당
train_preferences = assign_preferences(train_style)

# 데이터 분포 확인
print("Train Preferences 분포:", pd.Series(train_preferences).value_counts())
print("Validation Labels 분포:", pd.Series(val_labels).value_counts())  # 수정된 부분

# 학습 및 검증 이미지의 특징 벡터 추출
train_features = extract_features_for_dataset(train_image_paths, feature_extractor, device)
val_features = extract_features_for_dataset(val_image_paths, feature_extractor, device)
print(f"Train features shape: {train_features.shape}")
print(f"Validation features shape: {val_features.shape}")

# GPU에서 코사인 유사도 계산 함수
def calculate_cosine_similarity_gpu(val_features, train_features):
    # numpy 배열을 torch 텐서로 변환 및 2차원 형태로 변환
    val_features = torch.tensor(val_features).to(device)
    train_features = torch.tensor(train_features).to(device)

    # 차원 축소: (batch_size, 512 * 7 * 7)
    val_features = val_features.view(val_features.size(0), -1)
    train_features = train_features.view(train_features.size(0), -1)

    # 정규화하여 벡터를 단위 벡터로 변환 (L2 노멀라이즈)
    val_norm = val_features / val_features.norm(dim=1, keepdim=True)
    train_norm = train_features / train_features.norm(dim=1, keepdim=True)

    # 코사인 유사도 계산
    similarity_scores = torch.mm(val_norm, train_norm.T).cpu().numpy()
    return similarity_scores

# 유사도를 기반으로 스타일 선호도 예측
def predict_style_preference(similarity_scores, train_preferences, threshold=0.5):
    predicted_preferences = []
    for score_row in similarity_scores:
        max_similarity = np.max(score_row)
        max_index = np.argmax(score_row)
        predicted_label = train_preferences[max_index]  # 최대 유사도를 가진 라벨을 사용

        # 여기에서 기본적으로 0으로 설정하는 것이 아니라, 항상 예측 값을 추가합니다.
        predicted_preferences.append(predicted_label)

        print(f"Score Row: {score_row}, Max Similarity: {max_similarity}, Max Index: {max_index}, Predicted Label: {predicted_label}")  # 디버깅 정보

    return predicted_preferences

# GPU에서 코사인 유사도 계산 및 스타일 선호도 예측
print("Calculating similarity...")
similarity_scores = calculate_cosine_similarity_gpu(val_features, train_features)

print("Predicting preferences...")
predicted_preferences = predict_style_preference(similarity_scores, train_preferences)

# 예측 값, 실제 값 출력
print("Similarity scores sample:")
print(similarity_scores[:5])  # 유사도 점수의 일부를 출력
print("Predicted Preferences Sample:", predicted_preferences[:])  # 예측 결과 일부 출력
print("Validation Labels Sample:", val_labels[:])  # 실제 레이블 일부 출력

# 예측된 선호도의 길이 확인
print(f"Length of Predicted Preferences: {len(predicted_preferences)}")
print(f"Length of Validation Labels: {len(val_labels)}")

# 정확도, 정밀도, 재현율, F1 점수 계산
if len(predicted_preferences) == len(val_labels):
    accuracy, precision, recall, f1 = calculate_metrics(predicted_preferences, val_labels)
    print(f"Prediction Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1-Score: {f1:.4f}")
else:
    print("Error: The lengths of predicted preferences and validation labels do not match.")




NameError: name 'assign_preferences' is not defined

## 3-2 김진 수정 드디어 됐다

씨바 드디어 됐다 !!!!

In [None]:
import os
import numpy as np
import pandas as pd
from PIL import Image
import torch
import torch.nn as nn
from torchvision import models, transforms
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from tqdm import tqdm

# ResNet 특징 추출기 클래스 정의 (fc 레이어 제외)
class ResNetFeatureExtractor(nn.Module):
    def __init__(self, model_path):
        super(ResNetFeatureExtractor, self).__init__()
        resnet = models.resnet18(pretrained=False)
        state_dict = torch.load(model_path)
        state_dict = {k: v for k, v in state_dict.items() if not k.startswith("fc.")}
        resnet.load_state_dict(state_dict, strict=False)
        self.features = nn.Sequential(*list(resnet.children())[:-2])

    def forward(self, x):
        return self.features(x)

# 이미지 전처리 파이프라인 정의
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

# 이미지 로드 및 전처리 함수
def load_and_preprocess_image(image_path, device):
    image = Image.open(image_path).convert('RGB')
    image = transform(image).unsqueeze(0).to(device)
    return image

# 데이터셋에서 이미지들의 특징 벡터를 추출하는 함수
def extract_features_for_dataset(image_paths, feature_extractor, device):
    features = []
    feature_extractor.eval()
    with torch.no_grad():
        for image_path in tqdm(image_paths, desc="Extracting Features"):
            if os.path.exists(image_path):
                image_tensor = load_and_preprocess_image(image_path, device)
                feature_vector = feature_extractor(image_tensor).squeeze().cpu().numpy()
                features.append(feature_vector)
            else:
                print(f"File not found: {image_path}")
    return np.array(features)

# 엑셀 파일에서 데이터 추출 함수
def extract_preferences_from_excel(file_path):
    preferences_dict = {}
    preferences_df = pd.read_excel(file_path)

    for idx, row in preferences_df.iterrows():
        respondent_id = row['Respondent ID']
        training_preferred = row['Training Style Preferred']
        training_not_preferred = row['Training Style Not Preferred']
        validation_preferred = row['Validation Style Preferred']
        validation_not_preferred = row['Validation Style Not Preferred']

        # 리스트 형태로 변환
        training_preferred_list = training_preferred.strip("[]").replace("'", "").split(", ") if pd.notna(training_preferred) else []
        training_not_preferred_list = training_not_preferred.strip("[]").replace("'", "").split(", ") if pd.notna(training_not_preferred) else []
        validation_preferred_list = validation_preferred.strip("[]").replace("'", "").split(", ") if pd.notna(validation_preferred) else []
        validation_not_preferred_list = validation_not_preferred.strip("[]").replace("'", "").split(", ") if pd.notna(validation_not_preferred) else []

        preferences_dict[respondent_id] = {
            'training_preferred': training_preferred_list,
            'training_not_preferred': training_not_preferred_list,
            'validation_preferred': validation_preferred_list,
            'validation_not_preferred': validation_not_preferred_list
        }

    return preferences_dict

# 경로 설정
train_image_dir = '/content/drive/MyDrive/dataset/processed_segmentation_cleaned'
val_image_dir = '/content/drive/MyDrive/dataset/processed_segmentation_cleaned_for_val'
model_path = '/content/drive/MyDrive/데이터 크리에이터 캠프 최종 정리/김진/model_final5.pth'

# ResNet 특징 추출기 초기화 및 가중치 로드
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
feature_extractor = ResNetFeatureExtractor(model_path).to(device)

# 엑셀 파일에서 선호 및 비선호 이미지 리스트 추출
top_100_file_path = '/content/drive/MyDrive/데이터 크리에이터 캠프 최종 정리/김진/preferences0.xlsx'
preferences_dict = extract_preferences_from_excel(top_100_file_path)

# Training Style Preferred 리스트 추출
train_style = []
for respondent_id, prefs in preferences_dict.items():
    train_style.extend(prefs['training_preferred'])

# Validation Style Preferred 및 Not Preferred 리스트 추출 및 스타일 선호도 할당
val_style = []
val_labels = []  # 검증 레이블 리스트 초기화
for respondent_id, prefs in preferences_dict.items():
    val_style.extend(prefs['validation_preferred'])
    val_style.extend(prefs['validation_not_preferred'])
    # Validation Preferred는 1, Validation Not Preferred는 0으로 설정
    val_labels.extend([1] * len(prefs['validation_preferred']))
    val_labels.extend([0] * len(prefs['validation_not_preferred']))

# 이미지 경로로 변환 (segmented_가 추가됨)
train_image_paths = [os.path.join(train_image_dir, f"segmented_{img.strip()}") for img in train_style]
val_image_paths = [os.path.join(val_image_dir, f"segmented_{img.strip()}") for img in val_style]

# 학습 및 검증 이미지의 특징 벡터 추출
train_features = extract_features_for_dataset(train_image_paths, feature_extractor, device)
val_features = extract_features_for_dataset(val_image_paths, feature_extractor, device)
print(f"Train features shape: {train_features.shape}")
print(f"Validation features shape: {val_features.shape}")

# 정확도, 정밀도, 재현율, F1 점수 계산 함수 (퍼센트 변환)
def calculate_metrics(predicted, actual):
    accuracy = accuracy_score(actual, predicted) * 100  # 퍼센트로 변환
    precision, recall, f1, _ = precision_recall_fscore_support(actual, predicted, average='weighted', zero_division=0)
    return accuracy, precision * 100, recall * 100, f1 * 100  # 퍼센트로 변환

# GPU에서 코사인 유사도 계산 함수
def calculate_cosine_similarity_gpu(val_features, train_features):
    # numpy 배열을 torch 텐서로 변환 및 2차원 형태로 변환
    val_features = torch.tensor(val_features).to(device)
    train_features = torch.tensor(train_features).to(device)

    # 차원 축소: (batch_size, 512 * 7 * 7)
    val_features = val_features.view(val_features.size(0), -1)
    train_features = train_features.view(train_features.size(0), -1)

    # 정규화하여 벡터를 단위 벡터로 변환 (L2 노멀라이즈)
    val_norm = val_features / val_features.norm(dim=1, keepdim=True)
    train_norm = train_features / train_features.norm(dim=1, keepdim=True)

    # 코사인 유사도 계산
    similarity_scores = torch.mm(val_norm, train_norm.T).cpu().numpy()
    return similarity_scores


# GPU에서 코사인 유사도 계산 및 스타일 선호도 예측
print("Calculating similarity...")
similarity_scores = calculate_cosine_similarity_gpu(val_features, train_features)

print("Predicting preferences...")
predicted_preferences = predict_style_preference(similarity_scores, train_preferences=[1]*len(train_features))

# 예측 값과 실제 값 비교 및 평가
if len(predicted_preferences) == len(val_labels):
    accuracy, precision, recall, f1 = calculate_metrics(predicted_preferences, val_labels)
    print(f"Prediction Accuracy: {accuracy:.2f}%")
    print(f"Precision: {precision:.2f}%")
    print(f"Recall: {recall:.2f}%")
    print(f"F1-Score: {f1:.2f}%")
else:
    print("Error: The lengths of predicted preferences and validation labels do not match.")


Extracting Features: 100%|██████████| 674/674 [00:05<00:00, 127.77it/s]
Extracting Features: 100%|██████████| 1102/1102 [00:08<00:00, 133.73it/s]


Train features shape: (674, 512, 7, 7)
Validation features shape: (1102, 512, 7, 7)
Calculating similarity...
Predicting preferences...
Prediction Accuracy: 40.11%
Precision: 16.09%
Recall: 40.11%
F1-Score: 22.96%


In [None]:
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# 다양한 임계값을 테스트하여 최적의 threshold를 찾는 함수
def find_best_threshold(similarity_scores, val_labels, train_preferences, thresholds):
    best_threshold = 0.5
    best_f1 = 0

    for threshold in thresholds:
        predicted_preferences = predict_style_preference(similarity_scores, train_preferences, threshold)
        if len(predicted_preferences) == len(val_labels):
            _, _, _, f1 = calculate_metrics(predicted_preferences, val_labels)
            if f1 > best_f1:
                best_f1 = f1
                best_threshold = threshold
    return best_threshold

# 다양한 threshold 값을 테스트
thresholds = np.arange(0.5, 0.9, 0.05)
optimal_threshold = find_best_threshold(similarity_scores, val_labels, train_preferences=[1]*len(train_features), thresholds=thresholds)

print(f"Optimal Threshold: {optimal_threshold:.2f}")

# 최적 threshold로 다시 예측 수행
predicted_preferences = predict_style_preference(similarity_scores, train_preferences=[1]*len(train_features), threshold=optimal_threshold)

# 최종 평가 출력
if len(predicted_preferences) == len(val_labels):
    accuracy, precision, recall, f1 = calculate_metrics(predicted_preferences, val_labels)
    print(f"Final Prediction Accuracy: {accuracy:.2f}%")
    print(f"Final Precision: {precision:.2f}%")
    print(f"Final Recall: {recall:.2f}%")
    print(f"Final F1-Score: {f1:.2f}%")
else:
    print("Error: The lengths of predicted preferences and validation labels do not match.")


Optimal Threshold: 0.50
Final Prediction Accuracy: 0.401089%
Final Precision: 0.160872%
Final Recall: 0.401089%
Final F1-Score: 0.229639%


수정본들

68퍼 나온다!!!!!!!!!!!!!

In [None]:
import os
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from PIL import Image
from torchvision import models, transforms
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from tqdm import tqdm

# ResNet 특징 추출기 클래스 정의 (fc 레이어 제외)
class ResNetFeatureExtractor(nn.Module):
    def __init__(self, model_path):
        super(ResNetFeatureExtractor, self).__init__()
        resnet = models.resnet18(pretrained=False)
        state_dict = torch.load(model_path)
        state_dict = {k: v for k, v in state_dict.items() if not k.startswith("fc.")}
        resnet.load_state_dict(state_dict, strict=False)
        self.features = nn.Sequential(*list(resnet.children())[:-2])

    def forward(self, x):
        return self.features(x)

# 이미지 전처리 파이프라인 정의
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

# 이미지 로드 및 전처리 함수
def load_and_preprocess_image(image_path, device):
    image = Image.open(image_path).convert('RGB')
    image = transform(image).unsqueeze(0).to(device)
    return image

# 데이터셋에서 이미지들의 특징 벡터를 추출하는 함수
def extract_features_for_dataset(image_paths, feature_extractor, device):
    features = []
    feature_extractor.eval()
    with torch.no_grad():
        for image_path in tqdm(image_paths, desc="Extracting Features"):
            if os.path.exists(image_path):
                image_tensor = load_and_preprocess_image(image_path, device)
                feature_vector = feature_extractor(image_tensor).squeeze().cpu().numpy()
                features.append(feature_vector)
            else:
                print(f"File not found: {image_path}")
    return np.array(features)

# 코사인 유사도 및 유클리디안 거리 계산 함수
def calculate_similarity_scores(val_features, train_features):
    val_features = torch.tensor(val_features).to(device)
    train_features = torch.tensor(train_features).to(device)
    val_features = val_features.view(val_features.size(0), -1)
    train_features = train_features.view(train_features.size(0), -1)

    # 코사인 유사도
    val_norm = val_features / val_features.norm(dim=1, keepdim=True)
    train_norm = train_features / train_features.norm(dim=1, keepdim=True)
    cosine_similarity = torch.mm(val_norm, train_norm.T).cpu().numpy()

    # 유클리디안 거리
    val_expand = val_features.unsqueeze(1)
    train_expand = train_features.unsqueeze(0)
    euclidean_distance = -torch.cdist(val_expand, train_expand).cpu().numpy()

    return cosine_similarity, euclidean_distance

# 코사인 유사도와 유클리디안 거리 결합
def combine_similarity_scores(cosine_similarity, euclidean_distance, alpha=0.5):
    combined_score = alpha * cosine_similarity + (1 - alpha) * euclidean_distance
    return combined_score

# 최적 threshold 찾기 함수
def find_optimal_threshold(similarity_scores, train_preferences, val_labels):
    best_threshold = 0.5
    best_f1 = 0
    for threshold in np.arange(0.1, 0.9, 0.05):
        predicted_preferences = predict_style_preference(similarity_scores, train_preferences, threshold)
        _, _, _, f1 = calculate_metrics(predicted_preferences, val_labels)
        if f1 > best_f1:
            best_f1 = f1
            best_threshold = threshold
    return best_threshold

# 유사도를 기반으로 스타일 선호도 예측
def predict_style_preference(similarity_scores, train_preferences, threshold=0.5):
    predicted_preferences = []
    train_len = len(train_preferences)
    for score_row in similarity_scores:
        max_similarity = np.max(score_row)
        if max_similarity > threshold:
            # 유효한 인덱스 범위 내에서만 접근
            most_similar_index = min(np.argmax(score_row), train_len - 1)
            predicted_label = train_preferences[most_similar_index]
            predicted_preferences.append(predicted_label)
        else:
            predicted_preferences.append(0)  # 임계값 미만인 경우 비선호로 분류
    return predicted_preferences


# 정확도, 정밀도, 재현율, F1 점수 계산 함수
def calculate_metrics(predicted, actual):
    accuracy = accuracy_score(actual, predicted)
    precision, recall, f1, _ = precision_recall_fscore_support(actual, predicted, average='weighted')
    return accuracy, precision, recall, f1

# 데이터 불러오기 및 준비
# 이미지 경로 및 특징 벡터 추출
train_image_paths = [os.path.join(train_image_dir, f"segmented_{img.strip()}") for img in train_style]
val_image_paths = [os.path.join(val_image_dir, f"segmented_{img.strip()}") for img in val_style]

train_features = extract_features_for_dataset(train_image_paths, feature_extractor, device)
val_features = extract_features_for_dataset(val_image_paths, feature_extractor, device)
train_preferences = [1] * len(train_features)  # 선호 라벨로 지정

# 유사도 계산 및 결합 점수 산출
cosine_similarity, euclidean_distance = calculate_similarity_scores(val_features, train_features)
combined_scores = combine_similarity_scores(cosine_similarity, euclidean_distance, alpha=0.5)

# 최적 threshold 찾기
optimal_threshold = find_optimal_threshold(combined_scores, train_preferences, val_labels)
print(f"Optimal Threshold: {optimal_threshold:.2f}")

# 최적 threshold로 예측 및 성능 평가
predicted_preferences = predict_style_preference(combined_scores, train_preferences, optimal_threshold)
accuracy, precision, recall, f1 = calculate_metrics(predicted_preferences, val_labels)

print(f"Final Prediction Accuracy: {accuracy*100:.2f}%")
print(f"Final Precision: {precision*100:.2f}%")
print(f"Final Recall: {recall*100:.2f}%")
print(f"Final F1-Score: {f1*100:.2f}%")


Extracting Features: 100%|██████████| 674/674 [00:04<00:00, 135.81it/s]
Extracting Features: 100%|██████████| 1102/1102 [00:07<00:00, 143.03it/s]


OutOfMemoryError: CUDA out of memory. Tried to allocate 69.42 GiB. GPU 0 has a total capacity of 14.75 GiB of which 14.22 GiB is free. Process 4325 has 536.00 MiB memory in use. Of the allocated memory 391.82 MiB is allocated by PyTorch, and 16.18 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [None]:
import os
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from PIL import Image
from torchvision import models, transforms
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from tqdm import tqdm

# ResNet 특징 추출기 클래스 정의 (fc 레이어 제외)
class ResNetFeatureExtractor(nn.Module):
    def __init__(self, model_path):
        super(ResNetFeatureExtractor, self).__init__()
        resnet = models.resnet18(pretrained=False)
        state_dict = torch.load(model_path)
        state_dict = {k: v for k, v in state_dict.items() if not k.startswith("fc.")}
        resnet.load_state_dict(state_dict, strict=False)
        self.features = nn.Sequential(*list(resnet.children())[:-2])

    def forward(self, x):
        return self.features(x)

# 이미지 전처리 파이프라인 정의
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

# 이미지 로드 및 전처리 함수
def load_and_preprocess_image(image_path, device):
    image = Image.open(image_path).convert('RGB')
    image = transform(image).unsqueeze(0).to(device)
    return image

# 데이터셋에서 이미지들의 특징 벡터를 추출하는 함수
def extract_features_for_dataset(image_paths, feature_extractor, device):
    features = []
    feature_extractor.eval()
    with torch.no_grad():
        for image_path in tqdm(image_paths, desc="Extracting Features"):
            if os.path.exists(image_path):
                image_tensor = load_and_preprocess_image(image_path, device)
                feature_vector = feature_extractor(image_tensor).squeeze().cpu().numpy()
                features.append(feature_vector)
            else:
                print(f"File not found: {image_path}")
    return np.array(features)

# 코사인 유사도 및 유클리디안 거리 계산 함수
def calculate_similarity_scores(val_features, train_features):
    val_features = torch.tensor(val_features).to(device)
    train_features = torch.tensor(train_features).to(device)
    val_features = val_features.view(val_features.size(0), -1)
    train_features = train_features.view(train_features.size(0), -1)

    # 코사인 유사도
    val_norm = val_features / val_features.norm(dim=1, keepdim=True)
    train_norm = train_features / train_features.norm(dim=1, keepdim=True)
    cosine_similarity = torch.mm(val_norm, train_norm.T).cpu().numpy()

    # 유클리디안 거리
    val_expand = val_features.unsqueeze(1)
    train_expand = train_features.unsqueeze(0)
    euclidean_distance = -torch.cdist(val_expand, train_expand).cpu().numpy()

    return cosine_similarity, euclidean_distance

# 최적 alpha 찾기 함수
def find_optimal_alpha(cosine_similarity, euclidean_distance, train_preferences, val_labels):
    best_alpha = 0.5
    best_f1 = 0
    for alpha in np.arange(0.1, 1.0, 0.1):
        combined_score = combine_similarity_scores(cosine_similarity, euclidean_distance, alpha)
        predicted_preferences = predict_style_preference(combined_score, train_preferences)
        _, _, _, f1 = calculate_metrics(predicted_preferences, val_labels)
        if f1 > best_f1:
            best_f1 = f1
            best_alpha = alpha
    return best_alpha

# 최적 threshold 찾기 함수
def find_optimal_threshold(similarity_scores, train_preferences, val_labels):
    best_threshold = 0.5
    best_f1 = 0
    for threshold in np.arange(0.1, 0.9, 0.01):
        predicted_preferences = predict_style_preference(similarity_scores, train_preferences, threshold)
        _, _, _, f1 = calculate_metrics(predicted_preferences, val_labels)
        if f1 > best_f1:
            best_f1 = f1
            best_threshold = threshold
    return best_threshold

# 유사도를 기반으로 스타일 선호도 예측
def predict_style_preference(similarity_scores, train_preferences, threshold=0.5):
    predicted_preferences = []
    train_len = len(train_preferences)
    for score_row in similarity_scores:
        max_similarity = np.max(score_row)
        if max_similarity > threshold:
            most_similar_index = min(np.argmax(score_row), train_len - 1)
            predicted_label = train_preferences[most_similar_index]
            predicted_preferences.append(predicted_label)
        else:
            predicted_preferences.append(0)  # 임계값 미만인 경우 비선호로 분류
    return predicted_preferences

# 정확도, 정밀도, 재현율, F1 점수 계산 함수
def calculate_metrics(predicted, actual):
    accuracy = accuracy_score(actual, predicted)
    precision, recall, f1, _ = precision_recall_fscore_support(actual, predicted, average='weighted')
    return accuracy, precision, recall, f1

# 데이터 불러오기 및 준비
train_image_paths = [os.path.join(train_image_dir, f"segmented_{img.strip()}") for img in train_style]
val_image_paths = [os.path.join(val_image_dir, f"segmented_{img.strip()}") for img in val_style]
train_features = extract_features_for_dataset(train_image_paths, feature_extractor, device)
val_features = extract_features_for_dataset(val_image_paths, feature_extractor, device)
train_preferences = [1] * len(train_features)  # 선호 라벨로 지정

# 유사도 계산 및 결합 점수 산출
cosine_similarity, euclidean_distance = calculate_similarity_scores(val_features, train_features)
optimal_alpha = find_optimal_alpha(cosine_similarity, euclidean_distance, train_preferences, val_labels)
combined_scores = combine_similarity_scores(cosine_similarity, euclidean_distance, alpha=optimal_alpha)

# 최적 threshold 찾기
optimal_threshold = find_optimal_threshold(combined_scores, train_preferences, val_labels)
print(f"Optimal Threshold: {optimal_threshold:.2f}, Optimal Alpha: {optimal_alpha:.2f}")

# 최적 threshold로 예측 및 성능 평가
predicted_preferences = predict_style_preference(combined_scores, train_preferences, optimal_threshold)
accuracy, precision, recall, f1 = calculate_metrics(predicted_preferences, val_labels)

print(f"Final Prediction Accuracy: {accuracy*100:.2f}%")
print(f"Final Precision: {precision*100:.2f}%")
print(f"Final Recall: {recall*100:.2f}%")
print(f"Final F1-Score: {f1*100:.2f}%")


Extracting Features: 100%|██████████| 674/674 [00:04<00:00, 137.40it/s]
Extracting Features: 100%|██████████| 1102/1102 [00:08<00:00, 136.84it/s]


OutOfMemoryError: CUDA out of memory. Tried to allocate 69.42 GiB. GPU 0 has a total capacity of 14.75 GiB of which 13.55 GiB is free. Process 4325 has 1.20 GiB memory in use. Of the allocated memory 1.05 GiB is allocated by PyTorch, and 22.17 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

아웃오브 메모리 방지

In [None]:
import os
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from PIL import Image
from torchvision import models, transforms
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from tqdm import tqdm

# ResNet 특징 추출기 클래스 정의 (fc 레이어 제외)
class ResNetFeatureExtractor(nn.Module):
    def __init__(self, model_path):
        super(ResNetFeatureExtractor, self).__init__()
        resnet = models.resnet18(pretrained=False)
        state_dict = torch.load(model_path)
        state_dict = {k: v for k, v in state_dict.items() if not k.startswith("fc.")}
        resnet.load_state_dict(state_dict, strict=False)
        self.features = nn.Sequential(*list(resnet.children())[:-2])

    def forward(self, x):
        return self.features(x)

# 이미지 전처리 파이프라인 정의
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

# 이미지 로드 및 전처리 함수
def load_and_preprocess_image(image_path, device):
    image = Image.open(image_path).convert('RGB')
    image = transform(image).unsqueeze(0).to(device)
    return image

# 데이터셋에서 이미지들의 특징 벡터를 추출하는 함수
def extract_features_for_dataset(image_paths, feature_extractor, device):
    features = []
    feature_extractor.eval()
    with torch.no_grad():
        for image_path in tqdm(image_paths, desc="Extracting Features"):
            if os.path.exists(image_path):
                image_tensor = load_and_preprocess_image(image_path, device)
                feature_vector = feature_extractor(image_tensor).squeeze().cpu().numpy()
                features.append(feature_vector)
            else:
                print(f"File not found: {image_path}")
    return np.array(features)

# 코사인 유사도 및 유클리디안 거리 계산 함수 (배치 처리 적용)
def calculate_similarity_scores(val_features, train_features, batch_size=100):
    cosine_similarity_list = []
    euclidean_distance_list = []

    val_features = torch.tensor(val_features).to(device)
    train_features = torch.tensor(train_features).to(device)
    val_features = val_features.view(val_features.size(0), -1)
    train_features = train_features.view(train_features.size(0), -1)

    with torch.no_grad():
        for i in range(0, len(val_features), batch_size):
            val_batch = val_features[i:i + batch_size]

            # 코사인 유사도 계산
            val_norm = val_batch / val_batch.norm(dim=1, keepdim=True)
            train_norm = train_features / train_features.norm(dim=1, keepdim=True)
            cosine_sim_batch = torch.mm(val_norm, train_norm.T).cpu().numpy()
            cosine_similarity_list.append(cosine_sim_batch)

            # 유클리디안 거리 계산 (CPU에서 수행)
            val_batch_cpu = val_batch.cpu().numpy()
            train_features_cpu = train_features.cpu().numpy()
            euclidean_dist_batch = -np.linalg.norm(val_batch_cpu[:, None, :] - train_features_cpu[None, :, :], axis=2)
            euclidean_distance_list.append(euclidean_dist_batch)

    cosine_similarity = np.vstack(cosine_similarity_list)
    euclidean_distance = np.vstack(euclidean_distance_list)

    return cosine_similarity, euclidean_distance

# 최적 alpha 찾기 함수
def find_optimal_alpha(cosine_similarity, euclidean_distance, train_preferences, val_labels):
    best_alpha = 0.5
    best_f1 = 0
    for alpha in np.arange(0.1, 1.0, 0.1):
        combined_score = combine_similarity_scores(cosine_similarity, euclidean_distance, alpha)
        predicted_preferences = predict_style_preference(combined_score, train_preferences)
        _, _, _, f1 = calculate_metrics(predicted_preferences, val_labels)
        if f1 > best_f1:
            best_f1 = f1
            best_alpha = alpha
    return best_alpha

# 최적 threshold 찾기 함수
def find_optimal_threshold(similarity_scores, train_preferences, val_labels):
    best_threshold = 0.5
    best_f1 = 0
    for threshold in np.arange(0.1, 0.9, 0.01):
        predicted_preferences = predict_style_preference(similarity_scores, train_preferences, threshold)
        _, _, _, f1 = calculate_metrics(predicted_preferences, val_labels)
        if f1 > best_f1:
            best_f1 = f1
            best_threshold = threshold
    return best_threshold

# 유사도를 기반으로 스타일 선호도 예측
def predict_style_preference(similarity_scores, train_preferences, threshold=0.5):
    predicted_preferences = []
    train_len = len(train_preferences)
    for score_row in similarity_scores:
        max_similarity = np.max(score_row)
        if max_similarity > threshold:
            most_similar_index = min(np.argmax(score_row), train_len - 1)
            predicted_label = train_preferences[most_similar_index]
            predicted_preferences.append(predicted_label)
        else:
            predicted_preferences.append(0)  # 임계값 미만인 경우 비선호로 분류
    return predicted_preferences

# 정확도, 정밀도, 재현율, F1 점수 계산 함수
def calculate_metrics(predicted, actual):
    accuracy = accuracy_score(actual, predicted)
    precision, recall, f1, _ = precision_recall_fscore_support(
        actual, predicted, average='weighted', zero_division=0
    )
    return accuracy, precision, recall, f1


# 데이터 불러오기 및 준비
train_image_paths = [os.path.join(train_image_dir, f"segmented_{img.strip()}") for img in train_style]
val_image_paths = [os.path.join(val_image_dir, f"segmented_{img.strip()}") for img in val_style]
train_features = extract_features_for_dataset(train_image_paths, feature_extractor, device)
val_features = extract_features_for_dataset(val_image_paths, feature_extractor, device)
train_preferences = [1] * len(train_features)  # 선호 라벨로 지정

# 유사도 계산 및 결합 점수 산출
cosine_similarity, euclidean_distance = calculate_similarity_scores(val_features, train_features, batch_size=100)
optimal_alpha = find_optimal_alpha(cosine_similarity, euclidean_distance, train_preferences, val_labels)
combined_scores = combine_similarity_scores(cosine_similarity, euclidean_distance, alpha=optimal_alpha)

# 최적 threshold 찾기
optimal_threshold = find_optimal_threshold(combined_scores, train_preferences, val_labels)
print(f"Optimal Threshold: {optimal_threshold:.2f}, Optimal Alpha: {optimal_alpha:.2f}")

# 최적 threshold로 예측 및 성능 평가
predicted_preferences = predict_style_preference(combined_scores, train_preferences, optimal_threshold)
accuracy, precision, recall, f1 = calculate_metrics(predicted_preferences, val_labels)

print(f"Final Prediction Accuracy: {accuracy*100:.2f}%")
print(f"Final Precision: {precision*100:.2f}%")
print(f"Final Recall: {recall*100:.2f}%")
print(f"Final F1-Score: {f1*100:.2f}%")


Extracting Features: 100%|██████████| 674/674 [00:05<00:00, 119.65it/s]
Extracting Features: 100%|██████████| 1102/1102 [00:07<00:00, 143.82it/s]


Optimal Threshold: 0.10, Optimal Alpha: 0.60
Final Prediction Accuracy: 67.88%
Final Precision: 74.04%
Final Recall: 67.88%
Final F1-Score: 61.82%


In [None]:
import os
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from PIL import Image
from torchvision import models, transforms
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from tqdm import tqdm

# ResNet 특징 추출기 클래스 정의 (fc 레이어 제외)
class ResNetFeatureExtractor(nn.Module):
    def __init__(self, model_path):
        super(ResNetFeatureExtractor, self).__init__()
        resnet = models.resnet18(pretrained=False)
        state_dict = torch.load(model_path)
        state_dict = {k: v for k, v in state_dict.items() if not k.startswith("fc.")}
        resnet.load_state_dict(state_dict, strict=False)
        self.features = nn.Sequential(*list(resnet.children())[:-2])

    def forward(self, x):
        return self.features(x)

# 이미지 전처리 파이프라인 정의
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

# 이미지 로드 및 전처리 함수
def load_and_preprocess_image(image_path, device):
    image = Image.open(image_path).convert('RGB')
    image = transform(image).unsqueeze(0).to(device)
    return image

# 데이터셋에서 이미지들의 특징 벡터를 추출하는 함수
def extract_features_for_dataset(image_paths, feature_extractor, device):
    features = []
    feature_extractor.eval()
    with torch.no_grad():
        for image_path in tqdm(image_paths, desc="Extracting Features"):
            if os.path.exists(image_path):
                image_tensor = load_and_preprocess_image(image_path, device)
                feature_vector = feature_extractor(image_tensor).squeeze().cpu().numpy()
                features.append(feature_vector)
            else:
                print(f"File not found: {image_path}")
    return np.array(features)

# 코사인 유사도 및 유클리디안 거리 계산 함수 (배치 방식으로 최적화)
def calculate_similarity_scores(val_features, train_features, batch_size=100):
    cosine_similarity_list = []
    euclidean_distance_list = []

    val_features = torch.tensor(val_features).to(device)
    train_features = torch.tensor(train_features).to(device)
    val_features = val_features.view(val_features.size(0), -1)
    train_features = train_features.view(train_features.size(0), -1)

    with torch.no_grad():
        for i in range(0, len(val_features), batch_size):
            val_batch = val_features[i:i + batch_size]

            # 코사인 유사도 계산
            val_norm = val_batch / val_batch.norm(dim=1, keepdim=True)
            train_norm = train_features / train_features.norm(dim=1, keepdim=True)
            cosine_sim_batch = torch.mm(val_norm, train_norm.T).cpu().numpy()
            cosine_similarity_list.append(cosine_sim_batch)

            # 유클리디안 거리 계산 (CPU에서 수행)
            val_batch_cpu = val_batch.cpu().numpy()
            train_features_cpu = train_features.cpu().numpy()
            euclidean_dist_batch = -np.linalg.norm(val_batch_cpu[:, None, :] - train_features_cpu[None, :, :], axis=2)
            euclidean_distance_list.append(euclidean_dist_batch)

    cosine_similarity = np.vstack(cosine_similarity_list)
    euclidean_distance = np.vstack(euclidean_distance_list)

    return cosine_similarity, euclidean_distance

# 최적 alpha 찾기 함수
def find_optimal_alpha(cosine_similarity, euclidean_distance, train_preferences, val_labels):
    best_alpha = 0.5
    best_f1 = 0
    for alpha in np.arange(0.1, 1.0, 0.1):
        combined_score = combine_similarity_scores(cosine_similarity, euclidean_distance, alpha)
        predicted_preferences = predict_style_preference(combined_score, train_preferences)
        _, _, _, f1 = calculate_metrics(predicted_preferences, val_labels)
        if f1 > best_f1:
            best_f1 = f1
            best_alpha = alpha
    return best_alpha

# 최적 threshold 찾기 함수
def find_optimal_threshold(similarity_scores, train_preferences, val_labels):
    best_threshold = 0.5
    best_f1 = 0
    for threshold in np.arange(0.1, 0.9, 0.01):
        predicted_preferences = predict_style_preference(similarity_scores, train_preferences, threshold)
        _, _, _, f1 = calculate_metrics(predicted_preferences, val_labels)
        if f1 > best_f1:
            best_f1 = f1
            best_threshold = threshold
    return best_threshold

# 유사도를 기반으로 스타일 선호도 예측
def predict_style_preference(similarity_scores, train_preferences, threshold=0.5):
    predicted_preferences = []
    train_len = len(train_preferences)
    for score_row in similarity_scores:
        max_similarity = np.max(score_row)
        if max_similarity > threshold:
            most_similar_index = min(np.argmax(score_row), train_len - 1)
            predicted_label = train_preferences[most_similar_index]
            predicted_preferences.append(predicted_label)
        else:
            predicted_preferences.append(0)  # 임계값 미만인 경우 비선호로 분류
    return predicted_preferences

# 정확도, 정밀도, 재현율, F1 점수 계산 함수
def calculate_metrics(predicted, actual):
    accuracy = accuracy_score(actual, predicted)
    precision, recall, f1, _ = precision_recall_fscore_support(actual, predicted, average='weighted')
    return accuracy, precision, recall, f1

# 데이터 불러오기 및 준비
train_image_paths = [os.path.join(train_image_dir, f"segmented_{img.strip()}") for img in train_style]
val_image_paths = [os.path.join(val_image_dir, f"segmented_{img.strip()}") for img in val_style]
train_features = extract_features_for_dataset(train_image_paths, feature_extractor, device)
val_features = extract_features_for_dataset(val_image_paths, feature_extractor, device)
train_preferences = [1] * len(train_features)  # 선호 라벨로 지정

# 유사도 계산 및 결합 점수 산출
cosine_similarity, euclidean_distance = calculate_similarity_scores(val_features, train_features, batch_size=100)
optimal_alpha = find_optimal_alpha(cosine_similarity, euclidean_distance, train_preferences, val_labels)
combined_scores = combine_similarity_scores(cosine_similarity, euclidean_distance, alpha=optimal_alpha)

# 최적 threshold 찾기
optimal_threshold = find_optimal_threshold(combined_scores, train_preferences, val_labels)
print(f"Optimal Threshold: {optimal_threshold:.2f}, Optimal Alpha: {optimal_alpha:.2f}")

# 최적 threshold로 예측 및 성능 평가
predicted_preferences = predict_style_preference(combined_scores, train_preferences, optimal_threshold)
accuracy, precision, recall, f1 = calculate_metrics(predicted_preferences, val_labels)

print(f"Final Prediction Accuracy: {accuracy*100:.2f}%")
print(f"Final Precision: {precision*100:.2f}%")
print(f"Final Recall: {recall*100:.2f}%")
print(f"Final F1-Score: {f1*100:.2f}%")


Extracting Features: 100%|██████████| 674/674 [00:04<00:00, 139.53it/s]
Extracting Features: 100%|██████████| 1102/1102 [00:07<00:00, 144.83it/s]


Optimal Threshold: 0.10, Optimal Alpha: 0.60
Final Prediction Accuracy: 57.89%
Final Precision: 52.30%
Final Recall: 57.89%
Final F1-Score: 49.96%


xlsx 내용 리스트로 바꾸기

In [None]:
import pandas as pd

# 엑셀 파일 경로
excel_file_path = '/content/drive/MyDrive/데이터 크리에이터 캠프 최종 정리/김진/preferences0.xlsx'

# 엑셀 파일 읽기
preferences_df = pd.read_excel(excel_file_path)

# 각 Respondent ID 별로 선호 및 비선호 이미지 리스트 추출
for idx, row in preferences_df.iterrows():
    respondent_id = row['Respondent ID']
    training_preferred = row['Training Style Preferred']
    training_not_preferred = row['Training Style Not Preferred']
    validation_preferred = row['Validation Style Preferred']
    validation_not_preferred = row['Validation Style Not Preferred']

    # 리스트 형태로 변환
    training_preferred_list = training_preferred.strip("[]").replace("'", "").split(", ") if pd.notna(training_preferred) else []
    training_not_preferred_list = training_not_preferred.strip("[]").replace("'", "").split(", ") if pd.notna(training_not_preferred) else []
    validation_preferred_list = validation_preferred.strip("[]").replace("'", "").split(", ") if pd.notna(validation_preferred) else []
    validation_not_preferred_list = validation_not_preferred.strip("[]").replace("'", "").split(", ") if pd.notna(validation_not_preferred) else []

    # 출력
    print(f"Respondent ID: {respondent_id}")
    print(f"  Training Preferred: {training_preferred_list}")
    print(f"  Training Not Preferred: {training_not_preferred_list}")
    print(f"  Validation Preferred: {validation_preferred_list}")
    print(f"  Validation Not Preferred: {validation_not_preferred_list}")
    print("-" * 50)


Respondent ID: 62264
  Training Preferred: ['W_07098_19_normcore_M.jpg', 'W_16449_10_sportivecasual_M.jpg', 'W_16403_10_sportivecasual_M.jpg', 'W_07307_19_normcore_M.jpg', 'W_07260_10_sportivecasual_M.jpg', 'W_01394_10_sportivecasual_M.jpg', 'W_17702_90_hiphop_M.jpg', 'W_04687_00_metrosexual_M.jpg', 'W_04324_90_hiphop_M.jpg', 'W_02693_70_hippie_M.jpg', 'W_16445_50_ivy_M.jpg', 'W_05869_60_mods_M.jpg']
  Training Not Preferred: ['W_12869_19_normcore_M.jpg', 'W_17469_19_normcore_M.jpg', 'W_17704_19_normcore_M.jpg', 'W_12309_80_bold_M.jpg', 'W_16136_80_bold_M.jpg', 'W_16428_90_hiphop_M.jpg', 'W_15159_80_bold_M.jpg', 'W_16354_80_bold_M.jpg', 'W_15477_00_metrosexual_M.jpg', 'W_16601_70_hippie_M.jpg', 'W_02721_00_metrosexual_M.jpg', 'W_16189_50_ivy_M.jpg', 'W_16189_50_ivy_M.jpg', 'W_10125_70_hippie_M.jpg', 'W_04233_60_mods_M.jpg']
  Validation Preferred: ['W_09278_70_hippie_M.jpg', 'W_04324_90_hiphop_M.jpg', 'W_09889_10_sportivecasual_M.jpg', 'W_12413_90_hiphop_M.jpg']
  Validation Not Prefer

[3-2] 최종?

In [None]:
import os
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import warnings
from PIL import Image
from torchvision import models, transforms
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from tqdm import tqdm

# 경고 메시지 무시
warnings.filterwarnings("ignore")

# ResNet 특징 추출기 클래스 정의 (fc 레이어 제외)
class ResNetFeatureExtractor(nn.Module):
    def __init__(self, model_path):
        super(ResNetFeatureExtractor, self).__init__()
        resnet = models.resnet18(weights=None)
        state_dict = torch.load(model_path, weights_only=True)
        state_dict = {k: v for k, v in state_dict.items() if not k.startswith("fc.")}
        resnet.load_state_dict(state_dict, strict=False)
        self.features = nn.Sequential(*list(resnet.children())[:-2])

    def forward(self, x):
        return self.features(x)

# 이미지 전처리 파이프라인 정의
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

# 이미지 로드 및 전처리 함수
def load_and_preprocess_image(image_path, device):
    image = Image.open(image_path).convert('RGB')
    image = transform(image).unsqueeze(0).to(device)
    return image

# 데이터셋에서 이미지들의 특징 벡터를 추출하는 함수
def extract_features_for_dataset(image_paths, feature_extractor, device):
    features = []
    feature_extractor.eval()
    with torch.no_grad():
        for image_path in tqdm(image_paths, desc="Extracting Features"):
            if os.path.exists(image_path):
                image_tensor = load_and_preprocess_image(image_path, device)
                feature_vector = feature_extractor(image_tensor).squeeze().cpu().numpy()
                features.append(feature_vector)
            else:
                print(f"File not found: {image_path}")
    return np.array(features)

# 코사인 유사도 및 유클리디안 거리 계산 함수 (배치 처리 적용)
def calculate_similarity_scores(val_features, train_features, batch_size=100):
    cosine_similarity_list = []
    euclidean_distance_list = []

    val_features = torch.tensor(val_features).to(device)
    train_features = torch.tensor(train_features).to(device)
    val_features = val_features.view(val_features.size(0), -1)
    train_features = train_features.view(train_features.size(0), -1)

    with torch.no_grad():
        for i in range(0, len(val_features), batch_size):
            val_batch = val_features[i:i + batch_size]

            # 코사인 유사도 계산
            val_norm = val_batch / val_batch.norm(dim=1, keepdim=True)
            train_norm = train_features / train_features.norm(dim=1, keepdim=True)
            cosine_sim_batch = torch.mm(val_norm, train_norm.T).cpu().numpy()
            cosine_similarity_list.append(cosine_sim_batch)

            # 유클리디안 거리 계산 (CPU에서 수행)
            val_batch_cpu = val_batch.cpu().numpy()
            train_features_cpu = train_features.cpu().numpy()
            euclidean_dist_batch = -np.linalg.norm(val_batch_cpu[:, None, :] - train_features_cpu[None, :, :], axis=2)
            euclidean_distance_list.append(euclidean_dist_batch)

    cosine_similarity = np.vstack(cosine_similarity_list)
    euclidean_distance = np.vstack(euclidean_distance_list)

    return cosine_similarity, euclidean_distance

# 결합된 유사도 스코어 계산 함수
def combine_similarity_scores(cosine_similarity, euclidean_distance, alpha=0.5):
    return alpha * cosine_similarity + (1 - alpha) * euclidean_distance

# 최적 alpha 찾기 함수
def find_optimal_alpha(cosine_similarity, euclidean_distance, train_preferences, val_labels):
    best_alpha = 0.5
    best_f1 = 0
    for alpha in np.arange(0.1, 1.0, 0.1):
        combined_score = combine_similarity_scores(cosine_similarity, euclidean_distance, alpha)
        predicted_preferences = predict_style_preference(combined_score, train_preferences)
        _, _, _, f1 = calculate_metrics(predicted_preferences, val_labels)
        if f1 > best_f1:
            best_f1 = f1
            best_alpha = alpha
    return best_alpha

# 최적 threshold 찾기 함수
def find_optimal_threshold(similarity_scores, train_preferences, val_labels):
    best_threshold = 0.5
    best_f1 = 0
    for threshold in np.arange(0.1, 0.9, 0.01):
        predicted_preferences = predict_style_preference(similarity_scores, train_preferences, threshold)
        _, _, _, f1 = calculate_metrics(predicted_preferences, val_labels)
        if f1 > best_f1:
            best_f1 = f1
            best_threshold = threshold
    return best_threshold

# 유사도를 기반으로 스타일 선호도 예측
def predict_style_preference(similarity_scores, train_preferences, threshold=0.5):
    predicted_preferences = []
    train_len = len(train_preferences)
    for score_row in similarity_scores:
        max_similarity = np.max(score_row)
        if max_similarity > threshold:
            most_similar_index = min(np.argmax(score_row), train_len - 1)
            predicted_label = train_preferences[most_similar_index]
            predicted_preferences.append(predicted_label)
        else:
            predicted_preferences.append(0)
    return predicted_preferences

# 정확도, 정밀도, 재현율, F1 점수 계산 함수
def calculate_metrics(predicted, actual):
    accuracy = accuracy_score(actual, predicted)
    precision, recall, f1, _ = precision_recall_fscore_support(
        actual, predicted, average='weighted', zero_division=0
    )
    return accuracy, precision, recall, f1

# 데이터 불러오기 및 준비
train_image_paths = [os.path.join(train_image_dir, f"segmented_{img.strip()}") for img in train_style]
val_image_paths = [os.path.join(val_image_dir, f"segmented_{img.strip()}") for img in val_style]
train_features = extract_features_for_dataset(train_image_paths, feature_extractor, device)
val_features = extract_features_for_dataset(val_image_paths, feature_extractor, device)
train_preferences = [1] * len(train_features)

# 유사도 계산 및 결합 점수 산출
cosine_similarity, euclidean_distance = calculate_similarity_scores(val_features, train_features, batch_size=100)
optimal_alpha = find_optimal_alpha(cosine_similarity, euclidean_distance, train_preferences, val_labels)
combined_scores = combine_similarity_scores(cosine_similarity, euclidean_distance, alpha=optimal_alpha)

# 최적 threshold 찾기
optimal_threshold = find_optimal_threshold(combined_scores, train_preferences, val_labels)
print(f"Optimal Threshold: {optimal_threshold:.2f}, Optimal Alpha: {optimal_alpha:.2f}")

# 최적 threshold로 예측 및 성능 평가
predicted_preferences = predict_style_preference(combined_scores, train_preferences, optimal_threshold)
accuracy, precision, recall, f1 = calculate_metrics(predicted_preferences, val_labels)

print(f"Final Prediction Accuracy: {accuracy*100:.2f}%")
print(f"Final Precision: {precision*100:.2f}%")
print(f"Final Recall: {recall*100:.2f}%")
print(f"Final F1-Score: {f1*100:.2f}%")


Extracting Features: 100%|██████████| 674/674 [00:05<00:00, 116.92it/s]
Extracting Features: 100%|██████████| 1102/1102 [00:08<00:00, 131.28it/s]


Optimal Threshold: 0.10, Optimal Alpha: 0.60
Final Prediction Accuracy: 57.89%
Final Precision: 52.30%
Final Recall: 57.89%
Final F1-Score: 49.96%


In [None]:
import os
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from PIL import Image
from torchvision import models, transforms
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from tqdm import tqdm

# 경로 설정 및 데이터 준비
train_image_dir = '/content/drive/MyDrive/dataset/processed_segmentation_cleaned'
val_image_dir = '/content/drive/MyDrive/dataset/processed_segmentation_cleaned_for_val'
model_path = '/content/drive/MyDrive/데이터 크리에이터 캠프 최종 정리/김진/model_final5.pth'
excel_file_path = '/content/drive/MyDrive/데이터 크리에이터 캠프 최종 정리/김진/preferences0.xlsx'

# ResNet 특징 추출기 클래스 정의 (fc 레이어 제외)
class ResNetFeatureExtractor(nn.Module):
    def __init__(self, model_path):
        super(ResNetFeatureExtractor, self).__init__()
        resnet = models.resnet18(pretrained=False)
        state_dict = torch.load(model_path)
        state_dict = {k: v for k, v in state_dict.items() if not k.startswith("fc.")}
        resnet.load_state_dict(state_dict, strict=False)
        self.features = nn.Sequential(*list(resnet.children())[:-2])

    def forward(self, x):
        return self.features(x)

# 이미지 전처리 파이프라인 정의
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

# 이미지 로드 및 전처리 함수
def load_and_preprocess_image(image_path, device):
    image = Image.open(image_path).convert('RGB')
    image = transform(image).unsqueeze(0).to(device)
    return image

# 데이터셋에서 이미지들의 특징 벡터를 추출하는 함수
def extract_features_for_dataset(image_paths, feature_extractor, device):
    features = []
    feature_extractor.eval()
    with torch.no_grad():
        for image_path in tqdm(image_paths, desc="Extracting Features"):
            if os.path.exists(image_path):
                image_tensor = load_and_preprocess_image(image_path, device)
                feature_vector = feature_extractor(image_tensor).squeeze().cpu().numpy()
                features.append(feature_vector)
            else:
                print(f"File not found: {image_path}")
    return np.array(features)

# 엑셀 파일에서 데이터 추출 함수
def extract_preferences_from_excel(file_path):
    preferences_dict = {}
    preferences_df = pd.read_excel(file_path)

    for idx, row in preferences_df.iterrows():
        respondent_id = row['Respondent ID']
        training_preferred = row['Training Style Preferred']
        training_not_preferred = row['Training Style Not Preferred']
        validation_preferred = row['Validation Style Preferred']
        validation_not_preferred = row['Validation Style Not Preferred']

        # 리스트 형태로 변환
        training_preferred_list = training_preferred.strip("[]").replace("'", "").split(", ") if pd.notna(training_preferred) else []
        training_not_preferred_list = training_not_preferred.strip("[]").replace("'", "").split(", ") if pd.notna(training_not_preferred) else []
        validation_preferred_list = validation_preferred.strip("[]").replace("'", "").split(", ") if pd.notna(validation_preferred) else []
        validation_not_preferred_list = validation_not_preferred.strip("[]").replace("'", "").split(", ") if pd.notna(validation_not_preferred) else []

        preferences_dict[respondent_id] = {
            'training_preferred': training_preferred_list,
            'training_not_preferred': training_not_preferred_list,
            'validation_preferred': validation_preferred_list,
            'validation_not_preferred': validation_not_preferred_list
        }

    return preferences_dict

# 결합된 유사도 스코어 계산 함수
def combine_similarity_scores(cosine_similarity, euclidean_distance, alpha=0.5):
    return alpha * cosine_similarity + (1 - alpha) * euclidean_distance

# 코사인 유사도 및 유클리디안 거리 계산 함수 (배치 처리 적용)
def calculate_similarity_scores(val_features, train_features, batch_size=100):
    cosine_similarity_list = []
    euclidean_distance_list = []

    val_features = torch.tensor(val_features).to(device)
    train_features = torch.tensor(train_features).to(device)
    val_features = val_features.view(val_features.size(0), -1)
    train_features = train_features.view(train_features.size(0), -1)

    with torch.no_grad():
        for i in range(0, len(val_features), batch_size):
            val_batch = val_features[i:i + batch_size]

            # 코사인 유사도 계산
            val_norm = val_batch / val_batch.norm(dim=1, keepdim=True)
            train_norm = train_features / train_features.norm(dim=1, keepdim=True)
            cosine_sim_batch = torch.mm(val_norm, train_norm.T).cpu().numpy()
            cosine_similarity_list.append(cosine_sim_batch)

            # 유클리디안 거리 계산
            val_batch_cpu = val_batch.cpu().numpy()
            train_features_cpu = train_features.cpu().numpy()
            euclidean_dist_batch = -np.linalg.norm(val_batch_cpu[:, None, :] - train_features_cpu[None, :, :], axis=2)
            euclidean_distance_list.append(euclidean_dist_batch)

    cosine_similarity = np.vstack(cosine_similarity_list)
    euclidean_distance = np.vstack(euclidean_distance_list)

    return cosine_similarity, euclidean_distance

# 최적 alpha 찾기 함수
def find_optimal_alpha(cosine_similarity, euclidean_distance, train_preferences, val_labels):
    best_alpha = 0.5
    best_f1 = 0
    for alpha in np.arange(0.1, 1.0, 0.1):
        combined_score = combine_similarity_scores(cosine_similarity, euclidean_distance, alpha)
        predicted_preferences = predict_style_preference(combined_score, train_preferences)
        _, _, _, f1 = calculate_metrics(predicted_preferences, val_labels)
        if f1 > best_f1:
            best_f1 = f1
            best_alpha = alpha
    return best_alpha

# 최적 threshold 찾기 함수
def find_optimal_threshold(similarity_scores, train_preferences, val_labels):
    best_threshold = 0.5
    best_f1 = 0
    for threshold in np.arange(0.1, 0.9, 0.01):
        predicted_preferences = predict_style_preference(similarity_scores, train_preferences, threshold)
        _, _, _, f1 = calculate_metrics(predicted_preferences, val_labels)
        if f1 > best_f1:
            best_f1 = f1
            best_threshold = threshold
    return best_threshold

# 유사도를 기반으로 스타일 선호도 예측
def predict_style_preference(similarity_scores, train_preferences, threshold=0.5):
    predicted_preferences = []
    train_len = len(train_preferences)
    for score_row in similarity_scores:
        max_similarity = np.max(score_row)
        if max_similarity > threshold:
            most_similar_index = min(np.argmax(score_row), train_len - 1)
            predicted_label = train_preferences[most_similar_index]
            predicted_preferences.append(predicted_label)
        else:
            predicted_preferences.append(0)
    return predicted_preferences

# 정확도, 정밀도, 재현율, F1 점수 계산 함수
def calculate_metrics(predicted, actual):
    accuracy = accuracy_score(actual, predicted)
    precision, recall, f1, _ = precision_recall_fscore_support(
        actual, predicted, average='weighted', zero_division=0
    )
    return accuracy, precision, recall, f1

# 실행
if __name__ == "__main__":
    # 장치 설정 및 특징 추출기 로드
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    feature_extractor = ResNetFeatureExtractor(model_path).to(device)

    # 엑셀 파일에서 선호도 데이터 추출
    preferences_dict = extract_preferences_from_excel(excel_file_path)

    # 학습 및 검증 데이터 준비
    train_style = [img for resp in preferences_dict.values() for img in resp['training_preferred']]
    val_style = [img for resp in preferences_dict.values() for img in resp['validation_preferred'] + resp['validation_not_preferred']]
    val_labels = [1] * sum(len(resp['validation_preferred']) for resp in preferences_dict.values()) + [0] * sum(len(resp['validation_not_preferred']) for resp in preferences_dict.values())

    train_image_paths = [os.path.join(train_image_dir, f"segmented_{img.strip()}") for img in train_style]
    val_image_paths = [os.path.join(val_image_dir, f"segmented_{img.strip()}") for img in val_style]

    # 이미지 특징 추출
    train_features = extract_features_for_dataset(train_image_paths, feature_extractor, device)
    val_features = extract_features_for_dataset(val_image_paths, feature_extractor, device)

# 유사도 계산 및 결합 점수 산출
cosine_similarity, euclidean_distance = calculate_similarity_scores(val_features, train_features, batch_size=100)
optimal_alpha = find_optimal_alpha(cosine_similarity, euclidean_distance, [1] * len(train_features), val_labels)
combined_scores = combine_similarity_scores(cosine_similarity, euclidean_distance, alpha=optimal_alpha)

# 최적 threshold 찾기
optimal_threshold = find_optimal_threshold(combined_scores, [1] * len(train_features), val_labels)
print(f"Optimal Threshold: {optimal_threshold:.2f}, Optimal Alpha: {optimal_alpha:.2f}")

# 최적 threshold로 예측 및 성능 평가
predicted_preferences = predict_style_preference(combined_scores, [1] * len(train_features), threshold=optimal_threshold)
accuracy, precision, recall, f1 = calculate_metrics(predicted_preferences, val_labels)

print(f"Final Prediction Accuracy: {accuracy*100:.2f}%")
print(f"Final Precision: {precision*100:.2f}%")
print(f"Final Recall: {recall*100:.2f}%")
print(f"Final F1-Score: {f1*100:.2f}%")




Extracting Features: 100%|██████████| 674/674 [00:04<00:00, 135.09it/s]
Extracting Features: 100%|██████████| 1102/1102 [00:08<00:00, 128.86it/s]


Optimal Threshold: 0.10, Optimal Alpha: 0.60
Final Prediction Accuracy: 57.89%
Final Precision: 52.30%
Final Recall: 57.89%
Final F1-Score: 49.96%


In [None]:
import os
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from PIL import Image
from torchvision import models, transforms
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from tqdm import tqdm

# 경로 설정
train_image_dir = '/content/drive/MyDrive/dataset/processed_segmentation_cleaned'
val_image_dir = '/content/drive/MyDrive/dataset/processed_segmentation_cleaned_for_val'
model_path = '/content/drive/MyDrive/데이터 크리에이터 캠프 최종 정리/김진/model_final5.pth'
excel_file_path = '/content/drive/MyDrive/데이터 크리에이터 캠프 최종 정리/김진/preferences0.xlsx'

# ResNet 특징 추출기 클래스 정의 (fc 레이어 제외)
class ResNetFeatureExtractor(nn.Module):
    def __init__(self, model_path):
        super(ResNetFeatureExtractor, self).__init__()
        resnet = models.resnet18(pretrained=False)
        state_dict = torch.load(model_path)
        state_dict = {k: v for k, v in state_dict.items() if not k.startswith("fc.")}
        resnet.load_state_dict(state_dict, strict=False)
        self.features = nn.Sequential(*list(resnet.children())[:-2])

    def forward(self, x):
        return self.features(x)

# 이미지 전처리 파이프라인 정의
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

# 이미지 로드 및 전처리 함수
def load_and_preprocess_image(image_path, device):
    image = Image.open(image_path).convert('RGB')
    image = transform(image).unsqueeze(0).to(device)
    return image

# 데이터셋에서 이미지들의 특징 벡터를 추출하는 함수
def extract_features_for_dataset(image_paths, feature_extractor, device):
    features = []
    feature_extractor.eval()
    with torch.no_grad():
        for image_path in tqdm(image_paths, desc="Extracting Features"):
            if os.path.exists(image_path):
                image_tensor = load_and_preprocess_image(image_path, device)
                feature_vector = feature_extractor(image_tensor).squeeze().cpu().numpy()
                features.append(feature_vector)
            else:
                print(f"File not found: {image_path}")
    return np.array(features)

# 엑셀 파일에서 데이터 추출 함수
def extract_preferences_from_excel(file_path):
    preferences_dict = {}
    preferences_df = pd.read_excel(file_path)

    for idx, row in preferences_df.iterrows():
        respondent_id = row['Respondent ID']
        training_preferred = row['Training Style Preferred']
        training_not_preferred = row['Training Style Not Preferred']
        validation_preferred = row['Validation Style Preferred']
        validation_not_preferred = row['Validation Style Not Preferred']

        training_preferred_list = training_preferred.strip("[]").replace("'", "").split(", ") if pd.notna(training_preferred) else []
        training_not_preferred_list = training_not_preferred.strip("[]").replace("'", "").split(", ") if pd.notna(training_not_preferred) else []
        validation_preferred_list = validation_preferred.strip("[]").replace("'", "").split(", ") if pd.notna(validation_preferred) else []
        validation_not_preferred_list = validation_not_preferred.strip("[]").replace("'", "").split(", ") if pd.notna(validation_not_preferred) else []

        preferences_dict[respondent_id] = {
            'training_preferred': training_preferred_list,
            'training_not_preferred': training_not_preferred_list,
            'validation_preferred': validation_preferred_list,
            'validation_not_preferred': validation_not_preferred_list
        }

    return preferences_dict

# ResNet 특징 추출기 초기화 및 가중치 로드
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
feature_extractor = ResNetFeatureExtractor(model_path).to(device)

# 엑셀 파일에서 선호 및 비선호 이미지 리스트 추출
preferences_dict = extract_preferences_from_excel(excel_file_path)

# Training Style Preferred 리스트 추출
train_style = []
for respondent_id, prefs in preferences_dict.items():
    train_style.extend(prefs['training_preferred'])

# Validation Style Preferred 및 Not Preferred 리스트 추출 및 스타일 선호도 할당
val_style = []
val_labels = []  # 검증 레이블 리스트 초기화
for respondent_id, prefs in preferences_dict.items():
    val_style.extend(prefs['validation_preferred'])
    val_style.extend(prefs['validation_not_preferred'])
    val_labels.extend([1] * len(prefs['validation_preferred']))
    val_labels.extend([0] * len(prefs['validation_not_preferred']))

# 이미지 경로로 변환
train_image_paths = [os.path.join(train_image_dir, f"segmented_{img.strip()}") for img in train_style]
val_image_paths = [os.path.join(val_image_dir, f"segmented_{img.strip()}") for img in val_style]

# 학습 및 검증 이미지의 특징 벡터 추출
train_features = extract_features_for_dataset(train_image_paths, feature_extractor, device)
val_features = extract_features_for_dataset(val_image_paths, feature_extractor, device)

# 유사도 계산 함수
def calculate_cosine_similarity_gpu(val_features, train_features):
    val_features = torch.tensor(val_features).to(device)
    train_features = torch.tensor(train_features).to(device)
    val_features = val_features.view(val_features.size(0), -1)
    train_features = train_features.view(train_features.size(0), -1)
    val_norm = val_features / val_features.norm(dim=1, keepdim=True)
    train_norm = train_features / train_features.norm(dim=1, keepdim=True)
    similarity_scores = torch.mm(val_norm, train_norm.T).cpu().numpy()
    return similarity_scores

# 스타일 선호도 예측
similarity_scores = calculate_cosine_similarity_gpu(val_features, train_features)
train_preferences = [1] * len(train_features)

# 최적 alpha 찾기
def find_optimal_alpha(cosine_similarity, train_preferences, val_labels):
    best_alpha, best_f1 = 0.5, 0
    for alpha in np.arange(0.1, 1.0, 0.1):
        predicted_preferences = predict_style_preference(cosine_similarity * alpha, train_preferences)
        _, _, _, f1 = calculate_metrics(predicted_preferences, val_labels)
        if f1 > best_f1:
            best_f1, best_alpha = f1, alpha
    return best_alpha

# 최적 threshold 찾기
def find_optimal_threshold(similarity_scores, train_preferences, val_labels):
    best_threshold, best_f1 = 0.5, 0
    for threshold in np.arange(0.1, 0.9, 0.01):
        predicted_preferences = predict_style_preference(similarity_scores, train_preferences, threshold)
        _, _, _, f1 = calculate_metrics(predicted_preferences, val_labels)
        if f1 > best_f1:
            best_f1, best_threshold = f1, threshold
    return best_threshold

# 스타일 선호도 예측
def predict_style_preference(similarity_scores, train_preferences, threshold=0.5):
    predicted_preferences = []
    for score_row in similarity_scores:
        max_similarity = np.max(score_row)
        predicted_label = train_preferences[np.argmax(score_row)] if max_similarity > threshold else 0
        predicted_preferences.append(predicted_label)
    return predicted_preferences

# 성능 평가 함수
def calculate_metrics(predicted, actual):
    accuracy = accuracy_score(actual, predicted) * 100
    precision, recall, f1, _ = precision_recall_fscore_support(actual, predicted, average='weighted', zero_division=0)
    return accuracy, precision * 100, recall * 100, f1 * 100

# 최적 alpha 및 threshold 찾기 및 성능 평가
optimal_alpha = find_optimal_alpha(similarity_scores, train_preferences, val_labels)
optimal_threshold = find_optimal_threshold(similarity_scores * optimal_alpha, train_preferences, val_labels)
predicted_preferences = predict_style_preference(similarity_scores * optimal_alpha, train_preferences, optimal_threshold)

accuracy, precision, recall, f1 = calculate_metrics(predicted_preferences, val_labels)
print(f"Optimal Threshold: {optimal_threshold:.2f}, Optimal Alpha: {optimal_alpha:.2f}")
print(f"Final Prediction Accuracy: {accuracy:.2f}%")
print(f"Final Precision: {precision:.2f}%")
print(f"Final Recall: {recall:.2f}%")
print(f"Final F1-Score: {f1:.2f}%")


  state_dict = torch.load(model_path)
Extracting Features: 100%|██████████| 674/674 [00:04<00:00, 147.01it/s]
Extracting Features: 100%|██████████| 1102/1102 [00:07<00:00, 143.65it/s]


Optimal Threshold: 0.50, Optimal Alpha: 0.50
Final Prediction Accuracy: 64.61%
Final Precision: 72.33%
Final Recall: 64.61%
Final F1-Score: 55.75%


In [None]:
import random

# 예측 결과와 실제 레이블 샘플링 함수
def sample_predictions(predicted, actual, val_image_paths, sample_size=10):
    samples = random.sample(range(len(predicted)), sample_size)  # 샘플링할 인덱스 선택
    print("\nSample Prediction Results (Predicted vs. Actual):")
    print("--------------------------------------------------")

    for i in samples:
        image_path = val_image_paths[i]
        print(f"Image: {image_path.split('/')[-1]}")
        print(f"Predicted: {predicted[i]}, Actual: {actual[i]}")
        print("--------------------------------------------------")

# 예측 결과 샘플링하여 출력
sample_predictions(predicted_preferences, val_labels, val_image_paths, sample_size=10)



Sample Prediction Results (Predicted vs. Actual):
--------------------------------------------------
Image: segmented_W_15766_00_metrosexual_M.jpg
Predicted: 0, Actual: 0
--------------------------------------------------
Image: segmented_W_03587_10_sportivecasual_W.jpg
Predicted: 1, Actual: 1
--------------------------------------------------
Image: segmented_W_27854_50_ivy_M.jpg
Predicted: 1, Actual: 1
--------------------------------------------------
Image: segmented_W_38656_10_sportivecasual_W.jpg
Predicted: 0, Actual: 1
--------------------------------------------------
Image: segmented_W_02394_10_sportivecasual_W.jpg
Predicted: 0, Actual: 1
--------------------------------------------------
Image: segmented_W_60184_10_sportivecasual_M.jpg
Predicted: 0, Actual: 0
--------------------------------------------------
Image: segmented_W_32800_60_mods_M.jpg
Predicted: 0, Actual: 0
--------------------------------------------------
Image: segmented_W_15998_80_bold_M.jpg
Predicted: 0, A

In [None]:
import random

# 각 이미지 경로에 맞는 Respondent ID 리스트 생성
val_respondent_ids = []
for respondent_id, prefs in preferences_dict.items():
    val_respondent_ids.extend([respondent_id] * (len(prefs['validation_preferred']) + len(prefs['validation_not_preferred'])))

# 예측 결과와 실제 레이블 샘플링 함수 (유저 ID 포함)
def sample_predictions(predicted, actual, val_image_paths, respondent_ids, sample_size=10):
    samples = random.sample(range(len(predicted)), sample_size)  # 샘플링할 인덱스 선택
    print("\nSample Prediction Results (Predicted vs. Actual):")
    print("--------------------------------------------------")

    for i in samples:
        image_path = val_image_paths[i]
        respondent_id = respondent_ids[i]
        print(f"Respondent ID: {respondent_id}")
        print(f"Image: {image_path.split('/')[-1]}")
        print(f"Predicted: {predicted[i]}, Actual: {actual[i]}")
        print("--------------------------------------------------")

# 예측 결과 샘플링하여 출력
sample_predictions(predicted_preferences, val_labels, val_image_paths, val_respondent_ids, sample_size=10)



Sample Prediction Results (Predicted vs. Actual):
--------------------------------------------------
Respondent ID: 63207
Image: segmented_W_17135_00_metrosexual_M.jpg
Predicted: 0, Actual: 0
--------------------------------------------------
Respondent ID: 63910
Image: segmented_W_16851_10_sportivecasual_M.jpg
Predicted: 0, Actual: 1
--------------------------------------------------
Respondent ID: 63748
Image: segmented_W_11144_00_metrosexual_M.jpg
Predicted: 0, Actual: 0
--------------------------------------------------
Respondent ID: 63424
Image: segmented_W_13533_19_normcore_W.jpg
Predicted: 0, Actual: 0
--------------------------------------------------
Respondent ID: 63481
Image: segmented_W_06867_60_mods_M.jpg
Predicted: 0, Actual: 0
--------------------------------------------------
Respondent ID: 59704
Image: segmented_W_19833_50_ivy_M.jpg
Predicted: 0, Actual: 0
--------------------------------------------------
Respondent ID: 63424
Image: segmented_W_03717_10_athleisure_W