**** This notebook is written in Google Colab ****

# Settings

In [1]:
!pip install -U datasets
!pip install gensim

Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.2.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m9.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m6.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl (1

In [2]:
!pip install ace_tools

Collecting ace_tools
  Downloading ace_tools-0.0-py3-none-any.whl.metadata (300 bytes)
Downloading ace_tools-0.0-py3-none-any.whl (1.1 kB)
Installing collected packages: ace_tools
Successfully installed ace_tools-0.0


In [3]:
import pandas as pd
from sentence_transformers import SentenceTransformer, InputExample, losses, evaluation
from torch.utils.data import DataLoader
import random
from datasets import Dataset
import torch
from torch.optim import AdamW
import numpy as np
from gensim.models import FastText
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from scipy.spatial.distance import cosine
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator

import pickle

In [7]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Upload File& Prep

In [8]:
file_path = "/content/drive/My Drive/hateslop_final/res/prediction_train.csv"  # 파일 경로 지정
perfume_data = pd.read_csv(file_path)

In [12]:
# 각 행이 실제로 어떤 형태인지 출력
print(perfume_data['notes'].head(10))  # 상위 10개 데이터 확인

# 데이터 타입 확인
print(type(perfume_data['notes'].iloc[0]))  # 첫 번째 행의 타입 확인


0    ['bergamot', 'blackcurrant', 'incense', 'clay'...
1    ['incense', 'pink pepper', 'rose', 'myrrh', 'e...
2                  ['lychee', 'basil', 'rose', 'musk']
3    ['almond', 'coffee', 'tuberose', 'jasmine', 't...
4    ['green notes', 'pine', 'woody notes', 'orient...
5    ['currant bud', 'hyacinth', 'green notes', 'ha...
6    ['lemon', 'grapefruit', 'floral notes', 'peony...
7    ['grapefruit', 'bergamot', 'cardamom', 'black ...
8    ['ozonic notes', 'cardamom', 'lemon', 'geraniu...
9    ['coriander', 'caraway', 'pear', 'bergamot', '...
Name: notes, dtype: object
<class 'str'>


In [13]:
import pandas as pd
import re
import ast  # 문자열을 실제 리스트로 변환

# 문자열 리스트를 실제 리스트로 변환하는 함수
def convert_to_list(note_data):
    if isinstance(note_data, str):  # 문자열이라면 리스트로 변환 시도
        try:
            return ast.literal_eval(note_data)  # 문자열을 실제 리스트로 변환
        except (SyntaxError, ValueError):
            return []  # 변환 실패 시 빈 리스트 반환
    elif isinstance(note_data, list):  # 이미 리스트라면 그대로 반환
        return note_data
    else:
        return []  # 예외 처리

# 특수 문자 제거 함수
def clean_notes(note_list):
    if isinstance(note_list, list):  # 리스트인지 확인
        return [re.sub(r'[^\w\s]', '', note).strip() for note in note_list]  # 특수 문자 제거 및 공백 정리
    return []  # 리스트가 아니면 빈 리스트 반환

# 특수 문자 및 공백 정리 함수
def clean_description(text):
    if isinstance(text, str):  # 문자열인지 확인
        text = re.sub(r'<[^>]+>', '', text)  # HTML 태그 제거
        text = re.sub(r'[^\w\s]', '', text)  # 특수 문자 제거
        text = re.sub(r'\s+', ' ', text).strip()  # 공백 정리
        return text.lower()  # 소문자로 변환 (선택사항)
    return text  # NaN 또는 비문자열 값 그대로 반환

# 1단계: 문자열 리스트를 실제 리스트로 변환
perfume_data['notes'] = perfume_data['notes'].apply(convert_to_list)

# 2단계: 특수 문자 제거
perfume_data['notes'] = perfume_data['notes'].apply(clean_notes)
perfume_data['description'] = perfume_data['description'].apply(clean_description)

perfume_data

Unnamed: 0,description,notes
0,eau ditalie is an olfactory poem that transpor...,"[bergamot, blackcurrant, incense, clay, magnol..."
1,rosa gallica by brecourt is an exquisite unise...,"[incense, pink pepper, rose, myrrh, ebony, cas..."
2,rose blush cologne 2023 by jo malone london ro...,"[lychee, basil, rose, musk]"
3,qimmah for women by lattafa perfumes immerse y...,"[almond, coffee, tuberose, jasmine, tonka bean..."
4,alba di seoul by santa maria novella is a capt...,"[green notes, pine, woody notes, oriental notes]"
...,...,...
379,halfeti draws inspiration from the lavish good...,"[grapefruit, bergamot, green notes, armoise, l..."
380,created by perfumer julie pluchet working with...,"[lime, mimosa, tuberose, hyacinth, lavender, g..."
381,no flowers in this scent just a whole load of ...,"[bergamot, strawberry, dewberry, honey, jasmin..."
382,this very distinct sweet fragrance is one of t...,"[bergamot, herbal notes, lavender, peppermint,..."


In [14]:
# 향수 노트 전체 목록 생성 (중복 제거)
all_notes = list(set(note.strip() for notes in perfume_data["notes"] for note in notes))

In [15]:
with open("/content/drive/My Drive/hateslop_final/res/all_notes.pkl","wb") as f:
    pickle.dump(all_notes, f)

In [None]:
sorted(all_notes)

['absinthe',
 'acacia',
 'acai berry',
 'accord eudora',
 'agarwood',
 'akigalawood',
 'aldehydes',
 'allspice',
 'almond',
 'almond blossom',
 'almond cream',
 'amaryllis',
 'amber',
 'ambergris',
 'amberseed',
 'ambertonic',
 'amberwood',
 'ambranum',
 'ambrette',
 'ambrostar',
 'ambrox',
 'ambroxan',
 'amyris',
 'angelica',
 'animalic notes',
 'anise',
 'apple',
 'apple blossom',
 'apple tree',
 'apricot',
 'armoise',
 'artemisia',
 'baies rose',
 'balsam fir',
 'balsamic notes',
 'bamboo',
 'bamboo leaf',
 'banana',
 'basil',
 'bay leaf',
 'beeswax',
 'belambra tree',
 'bellflower',
 'bellini',
 'benzoin',
 'bergamot',
 'biovanilla',
 'birch',
 'bitter orange',
 'black amber',
 'black cardamom',
 'black currant',
 'black elder',
 'black pepper',
 'black tea',
 'black vanilla husk',
 'blackberry',
 'blackcurrant',
 'blackcurrant bud',
 'blonde woods',
 'blood mandarin',
 'blood orange',
 'blue hyacinth',
 'broom',
 'brown sugar',
 'cabreuva',
 'cacao',
 'cactus',
 'cade oil',
 'came

# Training Datasets

## v1 1to many, negative randomly p:n=(1:3)

In [None]:
import random

def create_finetuning_data_v1(df, all_notes):
    train_data = []

    for _, row in df.iterrows():
        description = row["description"]
        positive_notes = row["notes"]  # 해당 향수의 실제 노트들

        # ✅ 예외 처리: positive note가 없으면 스킵
        if not positive_notes:
            continue

        positive_notes_str = ", ".join(positive_notes)  # 긍정 노트 병합
        num_notes = len(positive_notes)  # 한 샘플 내부 노트 개수 유지

        # ✅ Negative Sampling (랜덤 방식)
        available_negative_notes = list(set(all_notes) - set(positive_notes))  # 긍정 노트 제외

        # 부정 샘플 2개 생성 (각 샘플에서 긍정 샘플과 같은 개수의 노트를 랜덤 선택)
        negative_samples = []
        for _ in range(3):  # 부정 샘플 3개 유지
            sampled_negatives = random.sample(available_negative_notes, min(len(available_negative_notes), num_notes))
            negative_samples.append(", ".join(sampled_negatives))

        # ✅ 최종 데이터 저장 (긍정 1개, 부정 3개)
        train_data.append((description, positive_notes_str, 1.0))  # 긍정 샘플
        for neg_str in negative_samples:
            train_data.append((description, neg_str, 0.0))  # 부정 샘플 3개

    return train_data



# 새로운 방식으로 데이터 구축
train_data_v1 = create_finetuning_data_v1(perfume_data, all_notes)



In [None]:
# 데이터프레임 변환 및 출력
train_df_v1 = pd.DataFrame(train_data_v1, columns=["description", "note", "label"])

# 데이터 개수 확인
train_df_v1.tail()

Unnamed: 0,description,note,label
1531,this very distinct sweet fragrance is one of t...,"shiso, neroli, papyrus, green mandarin, green ...",0.0
1532,inspired by the clean fresh scent of a soap fr...,"bergamot, neroli, petitgrain, orange flower, musk",1.0
1533,inspired by the clean fresh scent of a soap fr...,"black currant, milk, salt, mangosteen, lotus",0.0
1534,inspired by the clean fresh scent of a soap fr...,"lily, cedar leaf, mandarin, bay leaf, amaryllis",0.0
1535,inspired by the clean fresh scent of a soap fr...,"lavender, moss, fennel, brown sugar, orchid",0.0


In [None]:
# ✅ 바이너리 파일로 저장
train_df_v1.to_csv("/content/drive/My Drive/hateslop_final/res/training_pairs_v1.csv", index=False, header = True)

## v2 1to1, negative with fasttext p:n=(1:1)

In [None]:
# ✅ 1. Gensim FastText 모델 로드
fasttext_model = FastText.load("/content/drive/My Drive/hateslop_final/fasttext_perfume.model")

# ✅ 2. 모든 notes들의 임베딩을 계산하여 저장
note_embeddings = {note: fasttext_model.wv.get_vector(note) for note in all_notes if note in fasttext_model.wv}

# 논문 방식대로 파인튜닝 데이터 생성 코드

def create_finetuning_data_v2(df, all_notes, neg_count=20):
    train_data = []
    for _, row in df.iterrows():
        description = row["description"]
        positive_notes = row["notes"]  # 해당 향수의 실제 노트들

        # ✅ Positive Samples (q+)
        for note in positive_notes:
            train_data.append((description, note, 1.0))  # STS = 1 (긍정 샘플)
        # ✅ 4. FastText 기반 Negative Sampling
        positive_vectors = [note_embeddings[note] for note in positive_notes if note in note_embeddings]
        if not positive_vectors:
            continue  # 예외 처리: positive note의 벡터가 없다면 건너뛰기

        avg_positive_vector = np.mean(positive_vectors, axis=0).reshape(1, -1)  # 평균 벡터 계산

        # 모든 notes와의 코사인 유사도 계산
        note_similarities = {
            note: cosine_similarity(avg_positive_vector, note_embeddings[note].reshape(1, -1))[0][0]
            for note in all_notes if note not in positive_notes and note in note_embeddings
        }

        # 유사도가 가장 낮은 neg_count개 선택
        negative_notes = sorted(note_similarities, key=note_similarities.get)[:neg_count]

        sampled_negatives = random.sample(negative_notes, min(len(negative_notes), len(positive_notes)))  # Mi 개수와 동일하게 부정 샘플 선택

        for note in sampled_negatives:
            train_data.append((description, note, 0.0))  # STS = 0 (부정 샘플)

    return train_data

# 새로운 방식으로 데이터 구축
train_data_v2 = create_finetuning_data_v2(perfume_data, all_notes)



In [None]:
# 데이터프레임 변환 및 출력
train_df_v2 = pd.DataFrame(train_data_v2, columns=["description", "note", "label"])

# 데이터 개수 확인
train_df_v2.tail()

Unnamed: 0,description,note,label
7798,inspired by the clean fresh scent of a soap fr...,cypress,0.0
7799,inspired by the clean fresh scent of a soap fr...,citrus notes,0.0
7800,inspired by the clean fresh scent of a soap fr...,elemi,0.0
7801,inspired by the clean fresh scent of a soap fr...,earthy notes,0.0
7802,inspired by the clean fresh scent of a soap fr...,cream,0.0


In [None]:
# ✅ 바이너리 파일로 저장
train_df_v2.to_csv("/content/drive/My Drive/hateslop_final/res/training_pairs.csv", index=False, header = True)

## v3 1to many, negative with fasttext p:n=(1:3)

In [None]:
# ✅ 1. Gensim FastText 모델 로드
fasttext_model = FastText.load("/content/drive/My Drive/hateslop_final/fasttext_perfume.model")

# ✅ 2. 모든 notes들의 임베딩을 계산하여 저장
note_embeddings = {note: fasttext_model.wv.get_vector(note) for note in all_notes if note in fasttext_model.wv}

# 논문 방식대로 파인튜닝 데이터 생성 코드

def create_finetuning_data_v3(df, all_notes, neg_count=20):
    train_data = []

    for _, row in df.iterrows():
        description = row["description"]
        positive_notes = row["notes"]  # 해당 향수의 실제 노트들

        # ✅ Positive Samples (q+)
        if not positive_notes:  # 예외 처리: positive note가 없으면 스킵
            continue

        positive_notes_str = ", ".join(positive_notes)  # 긍정 노트들 병합
        num_notes = len(positive_notes)  # 한 샘플 내부 노트 개수 유지

        positive_vectors = [note_embeddings[note] for note in positive_notes if note in note_embeddings]

        if not positive_vectors:
            continue  # 예외 처리: positive note의 벡터가 없다면 건너뛰기

        avg_positive_vector = np.mean(positive_vectors, axis=0).reshape(1, -1)  # 평균 벡터 계산

        # ✅ Negative Sampling (FastText 기반)
        note_similarities = {
            note: cosine_similarity(avg_positive_vector, note_embeddings[note].reshape(1, -1))[0][0]
            for note in all_notes if note not in positive_notes and note in note_embeddings
        }

        # 유사도가 가장 낮은 neg_count개 선택
        negative_notes = sorted(note_similarities, key=note_similarities.get)[:neg_count*5]
        negative_samples = []
        for _ in range(3):  # 부정 샘플 3개 유지
            sampled_negatives = random.sample(negative_notes, min(len(negative_notes), num_notes))
            negative_samples.append(", ".join(sampled_negatives))

        # ✅ 최종 데이터 저장 (긍정 1개, 부정 3개)
        train_data.append((description, positive_notes_str, 1.0))  # 긍정 샘플
        for neg_str in negative_samples:
            train_data.append((description, neg_str, 0.0))  # 부정 샘플 2개



    return train_data

# 새로운 방식으로 데이터 구축
train_data_v3 = create_finetuning_data_v3(perfume_data, all_notes)



In [None]:
len(train_data_v3)

1536

In [None]:
# 데이터프레임 변환 및 출력
train_df_v3 = pd.DataFrame(train_data_v3, columns=["description", "note", "label"])

train_df_v3.to_csv("/content/drive/My Drive/hateslop_final/res/training_pairs_v3.csv", index=False, header = True)

## v4 1to1, negative without fasttext p:n=(1:many)

In [None]:
# ✅ 1. Gensim FastText 모델 로드
fasttext_model = FastText.load("/content/drive/My Drive/hateslop_final/fasttext_perfume.model")

# ✅ 2. 모든 notes들의 임베딩을 계산하여 저장
note_embeddings = {note: fasttext_model.wv.get_vector(note) for note in all_notes if note in fasttext_model.wv}

# 논문 방식대로 파인튜닝 데이터 생성 코드

def create_finetuning_data_v4(df, all_notes):
    train_data = []

    for _, row in df.iterrows():
        description = row["description"]
        positive_notes = set(row["notes"])  # 해당 향수의 실제 노트들 (집합으로 변환)

        # ✅ Positive Samples (q+)
        for note in positive_notes:
            train_data.append((description, note, 1.0))  # STS = 1 (긍정 샘플)

        # ✅ Negative Samples (q-)
        negative_notes = [note for note in all_notes if note not in positive_notes]  # 긍정 노트 제외
        for note in negative_notes:
            train_data.append((description, note, 0.0))  # STS = 0 (부정 샘플)

    return train_data

# 새로운 방식으로 데이터 구축
train_data_v4 = create_finetuning_data_v4(perfume_data, all_notes)



In [None]:
# 데이터프레임 변환 및 출력
train_df_v4 = pd.DataFrame(train_data_v4, columns=["description", "note", "label"])

# 데이터 개수 확인
train_df_v4.tail()

Unnamed: 0,description,note,label
145147,inspired by the clean fresh scent of a soap fr...,cacao,0.0
145148,inspired by the clean fresh scent of a soap fr...,orris root,0.0
145149,inspired by the clean fresh scent of a soap fr...,honey jasmine,0.0
145150,inspired by the clean fresh scent of a soap fr...,agarwood,0.0
145151,inspired by the clean fresh scent of a soap fr...,cardamom,0.0


In [None]:
# ✅ 바이너리 파일로 저장
train_df_v4.to_csv("/content/drive/My Drive/hateslop_final/res/training_pairs_v4.csv", index=False, header = True)

# Training

## v1 1to many, negative randomly p:n=(1:3)

In [6]:
file_path = "/content/drive/My Drive/hateslop_final/res/training_pairs_v1.csv"  # 파일 경로 지정
train_df_v1 = pd.read_csv(file_path)

FileNotFoundError: [Errno 2] No such file or directory: '/content/drive/My Drive/hateslop_final/res/training_pairs_v1.csv'

In [None]:
len(train_df_v1)

1536

In [None]:
# Train / Validation Split (95% Train, 5% Validation)
train_df, val_df_v1 = train_test_split(train_df_v1, test_size=0.05, random_state=42)

# Sentence-Transformer 학습을 위한 데이터 변환
train_examples = [
    InputExample(texts=[row["description"], row["note"]], label=row["label"])
    for _, row in train_df.iterrows()
]
val_examples_v1 = [
    InputExample(texts=[row["description"], row["note"]], label=row["label"])
    for _, row in val_df_v1.iterrows()
]

# 데이터로더 생성
train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=32)
val_dataloader = DataLoader(val_examples_v1, shuffle=False, batch_size=32)

In [None]:
model_v1 = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
train_loss = losses.CosineSimilarityLoss(model_v1)
optimizer = AdamW(model_v1.parameters(), lr=2e-5, eps=1e-8, betas=(0.9, 0.999))

# Validation 데이터셋을 위한 STS 기반 평가자 생성
val_evaluator_v1 = EmbeddingSimilarityEvaluator.from_input_examples(
    val_examples_v1
)

In [None]:
# 모델 학습 및 검증 (Validation 포함)
model_v1.fit(
    train_objectives=[(train_dataloader, train_loss)],
    evaluator=val_evaluator_v1,
    epochs=5,
    warmup_steps=100,
    evaluation_steps=50,  # 50 스텝마다 검증 수행
    optimizer_params={'lr': 1e-5, 'eps': 1e-8, 'betas': (0.9, 0.999)},
    weight_decay=0.01,
    show_progress_bar=True
)


Step,Training Loss,Validation Loss,Pearson Cosine,Spearman Cosine
46,No log,No log,0.630282,0.579903
50,No log,No log,0.650308,0.606143
92,No log,No log,0.778973,0.694046
100,No log,No log,0.772716,0.694046
138,No log,No log,0.814961,0.707166
150,No log,No log,0.830145,0.712414
184,No log,No log,0.8326,0.715038
200,No log,No log,0.819419,0.717662
230,No log,No log,0.842575,0.718974


In [None]:
# ✅ 학습된 모델 저장
save_path = "/content/drive/MyDrive/hateslop_final/prediction_model/minilm6_perfumerecommender_v1"
model_v1.save(save_path)

print(f"✅ 모델이 {save_path} 경로에 저장되었습니다!")

Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

✅ 모델이 /content/drive/MyDrive/hateslop_final/prediction_model/minilm6_perfumerecommender_v1 경로에 저장되었습니다!


In [None]:
# ✅ 저장된 모델 로드
embedding_model_tuned_v1 = SentenceTransformer("/content/drive/MyDrive/hateslop_final/prediction_model/minilm6_perfumerecommender_v1")

print("✅ 저장된 모델을 성공적으로 불러왔습니다!")

✅ 저장된 모델을 성공적으로 불러왔습니다!


In [None]:
from huggingface_hub import HfApi
api = HfApi()
repo_id = "minilm6_perfumerecommender_v1"
api.create_repo(repo_id=repo_id)

api.upload_folder(
    folder_path="/content/drive/MyDrive/hateslop_final/prediction_model/minilm6_perfumerecommender_v1",
    repo_id=f"dawn78/{repo_id}",
    repo_type="model",
)

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/dawn78/minilm6_perfumerecommender_v1/commit/2419b772416ac536d0fee5fd0c59a570bbfe6bf3', commit_message='Upload folder using huggingface_hub', commit_description='', oid='2419b772416ac536d0fee5fd0c59a570bbfe6bf3', pr_url=None, repo_url=RepoUrl('https://huggingface.co/dawn78/minilm6_perfumerecommender_v1', endpoint='https://huggingface.co', repo_type='model', repo_id='dawn78/minilm6_perfumerecommender_v1'), pr_revision=None, pr_num=None)

## v2 1to1, negative with fasttext p:n=(1:1)

In [None]:
file_path = "/content/drive/My Drive/hateslop_final/res/training_pairs.csv"  # 파일 경로 지정
train_df_v2 = pd.read_csv(file_path)

In [None]:
len(train_df_v2)

7803

In [None]:
# Train / Validation Split (90% Train, 10% Validation)
train_df, val_df_v2 = train_test_split(train_df_v2, test_size=0.1, random_state=42)

# Sentence-Transformer 학습을 위한 데이터 변환
train_examples = [
    InputExample(texts=[row["description"], row["note"]], label=row["label"])
    for _, row in train_df.iterrows()
]
val_examples_v2 = [
    InputExample(texts=[row["description"], row["note"]], label=row["label"])
    for _, row in val_df_v2.iterrows()
]

# 데이터로더 생성
train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=32)
val_dataloader = DataLoader(val_examples_v2, shuffle=False, batch_size=32)

In [None]:
model_v2 = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
train_loss = losses.CosineSimilarityLoss(model_v2)
optimizer = AdamW(model_v2.parameters(), lr=2e-5, eps=1e-8, betas=(0.9, 0.999))

# Validation 데이터셋을 위한 STS 기반 평가자 생성
val_evaluator_v2 = EmbeddingSimilarityEvaluator.from_input_examples(
    val_examples_v2
)

In [None]:
# 모델 학습 및 검증 (Validation 포함)
model_v2.fit(
    train_objectives=[(train_dataloader, train_loss)],
    evaluator=val_evaluator_v2,
    epochs=4,
    warmup_steps=100,
    evaluation_steps=50,  # 50 스텝마다 검증 수행
    optimizer_params={'lr': 1e-5, 'eps': 1e-8, 'betas': (0.9, 0.999)},
    weight_decay=0.01,
    show_progress_bar=True
)


Step,Training Loss,Validation Loss,Pearson Cosine,Spearman Cosine
50,No log,No log,0.274372,0.285552
100,No log,No log,0.671663,0.723715
150,No log,No log,0.813367,0.781369
200,No log,No log,0.877149,0.803708
220,No log,No log,0.883662,0.809582
250,No log,No log,0.882746,0.806741
300,No log,No log,0.895851,0.813048
350,No log,No log,0.902669,0.819479
400,No log,No log,0.909265,0.822069
440,No log,No log,0.91055,0.822172


In [None]:
# ✅ 학습된 모델 저장
save_path = "/content/drive/MyDrive/hateslop_final/prediction_model/minilm6_perfumerecommender_v2"
model_v2.save(save_path)

print(f"✅ 모델이 {save_path} 경로에 저장되었습니다!")

Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

✅ 모델이 /content/drive/MyDrive/hateslop_final/prediction_model/minilm6_perfumerecommender_v2 경로에 저장되었습니다!


In [None]:
# ✅ 저장된 모델 로드
embedding_model_tuned_v2 = SentenceTransformer("/content/drive/MyDrive/hateslop_final/prediction_model/minilm6_perfumerecommender_v2")

print("✅ 저장된 모델을 성공적으로 불러왔습니다!")

✅ 저장된 모델을 성공적으로 불러왔습니다!


In [None]:
from huggingface_hub import HfApi
api = HfApi()
repo_id = "minilm6_perfumerecommender_v2"
api.create_repo(repo_id=repo_id)

api.upload_folder(
    folder_path="/content/drive/MyDrive/hateslop_final/prediction_model/minilm6_perfumerecommender_v2",
    repo_id=f"dawn78/{repo_id}",
    repo_type="model",
)

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/dawn78/minilm6_perfumerecommender_v2/commit/972f9fa179e732799265523792a0638e42dac9d7', commit_message='Upload folder using huggingface_hub', commit_description='', oid='972f9fa179e732799265523792a0638e42dac9d7', pr_url=None, repo_url=RepoUrl('https://huggingface.co/dawn78/minilm6_perfumerecommender_v2', endpoint='https://huggingface.co', repo_type='model', repo_id='dawn78/minilm6_perfumerecommender_v2'), pr_revision=None, pr_num=None)

## v3 1to many, negative with fasttext p:n=(1:3)

In [None]:
file_path = "/content/drive/My Drive/hateslop_final/res/training_pairs_v3.csv"  # 파일 경로 지정
train_df_v3 = pd.read_csv(file_path)

In [None]:
train_df_v3.head()

Unnamed: 0,description,note,label
0,eau ditalie is an olfactory poem that transpor...,"bergamot, blackcurrant, incense, clay, magnoli...",1.0
1,eau ditalie is an olfactory poem that transpor...,"coumarin, wood notes, osmanthus, cassia, oud, ...",0.0
2,eau ditalie is an olfactory poem that transpor...,"fenugreek, cedar needle, frangipani, osmanthus...",0.0
3,eau ditalie is an olfactory poem that transpor...,"white tea, capsicum, cassis, blonde woods, fra...",0.0
4,rosa gallica by brecourt is an exquisite unise...,"incense, pink pepper, rose, myrrh, ebony, cash...",1.0


In [None]:
len(train_df_v3)

1536

In [None]:
# Train / Validation Split (95% Train, 5% Validation)
train_df, val_df_v3 = train_test_split(train_df_v3, test_size=0.05, random_state=42)

# Sentence-Transformer 학습을 위한 데이터 변환
train_examples = [
    InputExample(texts=[row["description"], row["note"]], label=row["label"])
    for _, row in train_df.iterrows()
]
val_examples_v3 = [
    InputExample(texts=[row["description"], row["note"]], label=row["label"])
    for _, row in val_df_v3.iterrows()
]

# 데이터로더 생성
train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=32)
val_dataloader = DataLoader(val_examples_v3, shuffle=False, batch_size=32)

In [None]:
model_v3 = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
train_loss = losses.CosineSimilarityLoss(model_v3)
optimizer = AdamW(model_v3.parameters(), lr=2e-5, eps=1e-8, betas=(0.9, 0.999))

# Validation 데이터셋을 위한 STS 기반 평가자 생성
val_evaluator = EmbeddingSimilarityEvaluator.from_input_examples(
    val_examples_v3
)

In [None]:
# 모델 학습 및 검증 (Validation 포함)
model_v3.fit(
    train_objectives=[(train_dataloader, train_loss)],
    evaluator=val_evaluator,
    epochs=5,
    warmup_steps=100,
    evaluation_steps=50,  # 50 스텝마다 검증 수행
    optimizer_params={'lr': 1e-5, 'eps': 1e-8, 'betas': (0.9, 0.999)},
    weight_decay=0.01,
    show_progress_bar=True
)


Step,Training Loss,Validation Loss,Pearson Cosine,Spearman Cosine
46,No log,No log,0.899793,0.738654
50,No log,No log,0.903031,0.738654
92,No log,No log,0.911038,0.72947
100,No log,No log,0.910493,0.725534
138,No log,No log,0.924562,0.724222
150,No log,No log,0.917001,0.725534
184,No log,No log,0.923709,0.712414
200,No log,No log,0.92523,0.721598
230,No log,No log,0.933954,0.733406


In [None]:
# ✅ 학습된 모델 저장
save_path = "/content/drive/MyDrive/hateslop_final/prediction_model/minilm6_perfumerecommender_v3"
model_v3.save(save_path)

print(f"✅ 모델이 {save_path} 경로에 저장되었습니다!")

Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

✅ 모델이 /content/drive/MyDrive/hateslop_final/prediction_model/minilm6_perfumerecommender_v3 경로에 저장되었습니다!


In [None]:
# ✅ 저장된 모델 로드
embedding_model_tuned_v3 = SentenceTransformer("/content/drive/MyDrive/hateslop_final/prediction_model/minilm6_perfumerecommender_v3")

print("✅ 저장된 모델을 성공적으로 불러왔습니다!")

✅ 저장된 모델을 성공적으로 불러왔습니다!


In [None]:
from huggingface_hub import HfApi
api = HfApi()
repo_id = "minilm6_perfumerecommender_v3"
api.create_repo(repo_id=repo_id)

api.upload_folder(
    folder_path="/content/drive/MyDrive/hateslop_final/prediction_model/minilm6_perfumerecommender_v3",
    repo_id=f"dawn78/{repo_id}",
    repo_type="model",
)

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/dawn78/minilm6_perfumerecommender_v3/commit/573b656f871bdfd7c25aed591694c05f16d57b58', commit_message='Upload folder using huggingface_hub', commit_description='', oid='573b656f871bdfd7c25aed591694c05f16d57b58', pr_url=None, repo_url=RepoUrl('https://huggingface.co/dawn78/minilm6_perfumerecommender_v3', endpoint='https://huggingface.co', repo_type='model', repo_id='dawn78/minilm6_perfumerecommender_v3'), pr_revision=None, pr_num=None)

## v4 1to1, negative without fasttext p:n=(1:many)

In [None]:
file_path = "/content/drive/My Drive/hateslop_final/res/training_pairs_v4.csv"  # 파일 경로 지정
train_df_v4 = pd.read_csv(file_path)

In [None]:
len(train_df_v4)

145152

In [None]:
# Train / Validation Split (90% Train, 10% Validation)
train_df, val_df_v4 = train_test_split(train_df_v4, test_size=0.2, random_state=42)

# Sentence-Transformer 학습을 위한 데이터 변환
train_examples = [
    InputExample(texts=[row["description"], row["note"]], label=row["label"])
    for _, row in train_df.iterrows()
]
val_examples_v4 = [
    InputExample(texts=[row["description"], row["note"]], label=row["label"])
    for _, row in val_df_v4.iterrows()
]

# 데이터로더 생성
train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=32)
val_dataloader = DataLoader(val_examples_v4, shuffle=False, batch_size=32)

In [None]:
model_v4 = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
train_loss = losses.CosineSimilarityLoss(model_v4)
optimizer = AdamW(model_v4.parameters(), lr=2e-5, eps=1e-8, betas=(0.9, 0.999))

# Validation 데이터셋을 위한 STS 기반 평가자 생성
val_evaluator_v4 = EmbeddingSimilarityEvaluator.from_input_examples(
    val_examples_v4
)

In [None]:
# 모델 학습 및 검증 (Validation 포함)
model_v4.fit(
    train_objectives=[(train_dataloader, train_loss)],
    evaluator=val_evaluator,
    epochs=1,
    warmup_steps=100,
    evaluation_steps=100,  # 50 스텝마다 검증 수행
    optimizer_params={'lr': 1e-5, 'eps': 1e-8, 'betas': (0.9, 0.999)},
    weight_decay=0.01,
    show_progress_bar=True
)


Step,Training Loss,Validation Loss,Pearson Cosine,Spearman Cosine
100,No log,No log,0.092455,0.072185
200,No log,No log,0.13264,0.107672
300,No log,No log,0.175501,0.131364
400,No log,No log,0.189053,0.135225
500,0.028500,No log,0.211073,0.143355
600,0.028500,No log,0.242472,0.160407
700,0.028500,No log,0.261297,0.167803
800,0.028500,No log,0.270077,0.169542
900,0.028500,No log,0.280589,0.170852
1000,0.025300,No log,0.281494,0.169007


In [None]:
# ✅ 학습된 모델 저장
save_path = "/content/drive/MyDrive/hateslop_final/prediction_model/minilm6_perfumerecommender_v4"
model_v4.save(save_path)

print(f"✅ 모델이 {save_path} 경로에 저장되었습니다!")

Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

✅ 모델이 /content/drive/MyDrive/hateslop_final/prediction_model/minilm6_perfumerecommender_v4 경로에 저장되었습니다!


In [None]:
from huggingface_hub import HfApi
api = HfApi()
repo_id = "minilm6_perfumerecommender_v4"
api.create_repo(repo_id=repo_id)

api.upload_folder(
    folder_path="/content/drive/MyDrive/hateslop_final/prediction_model/minilm6_perfumerecommender_v4",
    repo_id=f"dawn78/{repo_id}",
    repo_type="model",
)

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/dawn78/minilm6_perfumerecommender_v4/commit/9fcb51240a163209267e8677643f2370f74f2b8e', commit_message='Upload folder using huggingface_hub', commit_description='', oid='9fcb51240a163209267e8677643f2370f74f2b8e', pr_url=None, repo_url=RepoUrl('https://huggingface.co/dawn78/minilm6_perfumerecommender_v4', endpoint='https://huggingface.co', repo_type='model', repo_id='dawn78/minilm6_perfumerecommender_v4'), pr_revision=None, pr_num=None)

In [None]:
# ✅ 저장된 모델 로드
embedding_model_tuned_v4 = SentenceTransformer("/content/drive/MyDrive/hateslop_final/prediction_model/minilm6_perfumerecommender_v4")

print("✅ 저장된 모델을 성공적으로 불러왔습니다!")

✅ 저장된 모델을 성공적으로 불러왔습니다!


# Assessment

In [None]:
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

In [None]:
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
evaluator_o1 = EmbeddingSimilarityEvaluator.from_input_examples(
    val_examples_v1
)
evaluator_o2 = EmbeddingSimilarityEvaluator.from_input_examples(
    val_examples_v2
)
evaluator_1 = EmbeddingSimilarityEvaluator.from_input_examples(
    val_examples_v1
)
evaluator_2 = EmbeddingSimilarityEvaluator.from_input_examples(
    val_examples_v2
)
evaluator_3 = EmbeddingSimilarityEvaluator.from_input_examples(
    val_examples_v3
)
evaluator_4 = EmbeddingSimilarityEvaluator.from_input_examples(
    val_examples_v4
)
print("original: ", evaluator_o1(model))
print("v1: ", evaluator_1(embedding_model_tuned_v1))
print("v3: ", evaluator_3(embedding_model_tuned_v3))
print("original: ", evaluator_o2(model))
print("v2: ", evaluator_2(embedding_model_tuned_v2))
print("v4: ", evaluator_4(embedding_model_tuned_v4))

original:  {'pearson_cosine': 0.49687206624610303, 'spearman_cosine': 0.4539509857075771}
v1:  {'pearson_cosine': 0.8425746761744255, 'spearman_cosine': 0.718974393548417}
v3:  {'pearson_cosine': 0.9339541699697309, 'spearman_cosine': 0.733406361302126}
original:  {'pearson_cosine': 0.049094796815849724, 'spearman_cosine': 0.06253949726370187}
v2:  {'pearson_cosine': 0.9221809753640012, 'spearman_cosine': 0.8309208495457832}
v4:  {'pearson_cosine': 0.36641281050343105, 'spearman_cosine': 0.20018342620535076}


In [10]:
import pandas as pd
import numpy as np
from scipy.spatial.distance import cosine
from sentence_transformers import SentenceTransformer
from IPython.display import display

class PerfumeNotePredictor:
    def __init__(self, all_notes, models):
        """
        :param all_notes: 비교할 향수 노트 리스트
        :param models: {'v1': model1, 'v2': model2, 'v3': model3, 'v4': model4} 형태의 모델 딕셔너리
        """
        self.all_notes = all_notes
        self.models = models

    def predict_notes(self, test_sentence, model):
        """주어진 문장(test_sentence)과 모델을 사용하여 가장 유사한 향수 노트 8개를 예측"""
        test_embedding = model.encode(test_sentence)
        note_embeddings = {note: model.encode(note) for note in self.all_notes}
        similarities = {note: 1 - cosine(test_embedding, note_embedding) for note, note_embedding in note_embeddings.items()}
        most_likely_notes = sorted(similarities.items(), key=lambda x: x[1], reverse=True)[:8]
        return [note for note, _ in most_likely_notes]

    def compare_predictions(self, test_sentence, real_notes):
        """주어진 문장(test_sentence)에 대해 모든 모델(v1, v2, v3, v4)의 예측을 비교하고 지표를 추가"""
        real_notes_set = set(real_notes if isinstance(real_notes, list) else [real_notes])  # 집합으로 변환하여 빠른 비교
        predictions = {version: self.predict_notes(test_sentence, model) for version, model in self.models.items()}

        # 모델별 매칭된 노트 개수 및 정확도 계산
        match_counts = {version: len(set(pred) & real_notes_set) for version, pred in predictions.items()}
        accuracies = {version: round((match_counts[version] / len(real_notes_set)) * 100, 2) if real_notes_set else 0.0
                      for version in predictions}

        # 최대 길이에 맞춰 DataFrame 정리
        max_length = max(len(real_notes_set), *[len(pred) for pred in predictions.values()])
        real_notes_list = list(real_notes_set) + [""] * (max_length - len(real_notes_set))

        for version in predictions:
            predictions[version].extend([""] * (max_length - len(predictions[version])))

        # DataFrame 생성
        df = pd.DataFrame({
            'Real Notes': real_notes_list,
            **{f'Predicted Notes ({version})': predictions[version] for version in predictions}
        })

        # 지표 추가
        metrics_df = pd.DataFrame({
            'Model': list(predictions.keys()),
            'Matched Count': [match_counts[version] for version in predictions],
            'Accuracy (%)': [accuracies[version] for version in predictions]
        })

        # 데이터프레임 출력
        print("\n🔹 Model Predictions Comparison:")
        display(df)

        print("\n🔹 Model Performance Metrics:")
        display(metrics_df)


In [17]:
from scipy.spatial.distance import cosine
import pandas as pd

class PerfumeNotePredictor:
    def __init__(self, all_notes, models):
        """
        :param all_notes: 비교할 향수 노트 리스트
        :param models: {'v1': model1, 'v2': model2, 'v3': model3, 'v4': model4} 형태의 모델 딕셔너리
        """
        self.all_notes = all_notes
        self.models = models

        # 모델별 노트 임베딩 사전 저장
        self.note_embeddings = {
            version: {note: model.encode(note) for note in self.all_notes}
            for version, model in self.models.items()
        }

    def predict_notes(self, test_sentence, model, model_version):
        """주어진 문장(test_sentence)과 모델을 사용하여 가장 유사한 향수 노트 8개를 예측"""
        test_embedding = model.encode(test_sentence)

        # 모델별 미리 저장된 노트 임베딩 사용
        note_embeddings = self.note_embeddings[model_version]

        # 코사인 유사도 계산
        similarities = {
            note: 1 - cosine(test_embedding, note_embedding)
            for note, note_embedding in note_embeddings.items()
        }

        # 가장 유사한 노트 8개 선택
        most_likely_notes = sorted(similarities.items(), key=lambda x: x[1], reverse=True)[:8]
        return [note for note, _ in most_likely_notes]

    def compare_predictions(self, test_sentence, real_notes):
        """주어진 문장(test_sentence)에 대해 모든 모델(v1, v2, v3, v4)의 예측을 비교하고 지표를 추가"""
        real_notes_set = set(real_notes if isinstance(real_notes, list) else [real_notes])  # 집합으로 변환하여 빠른 비교
        predictions = {
            version: self.predict_notes(test_sentence, model, version)
            for version, model in self.models.items()
        }

        # 모델별 매칭된 노트 개수 및 정확도 계산
        match_counts = {version: len(set(pred) & real_notes_set) for version, pred in predictions.items()}
        accuracies = {
            version: round((match_counts[version] / len(real_notes_set)) * 100, 2) if real_notes_set else 0.0
            for version in predictions
        }

        # 최대 길이에 맞춰 DataFrame 정리
        max_length = max(len(real_notes_set), *[len(pred) for pred in predictions.values()])
        real_notes_list = list(real_notes_set) + [""] * (max_length - len(real_notes_set))

        for version in predictions:
            predictions[version].extend([""] * (max_length - len(predictions[version])))

        # DataFrame 생성
        df = pd.DataFrame({
            'Real Notes': real_notes_list,
            **{f'Predicted Notes ({version})': predictions[version] for version in predictions}
        })

        # 지표 추가
        metrics_df = pd.DataFrame({
            'Model': list(predictions.keys()),
            'Matched Count': [match_counts[version] for version in predictions],
            'Accuracy (%)': [accuracies[version] for version in predictions]
        })

        # 데이터프레임 출력
        print("\n🔹 Model Predictions Comparison:")
        display(df)

        print("\n🔹 Model Performance Metrics:")
        display(metrics_df)


In [4]:
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
embedding_model_tuned_v1 = SentenceTransformer("dawn78/minilm6_perfumerecommender_v1")
embedding_model_tuned_v2 = SentenceTransformer("dawn78/minilm6_perfumerecommender_v2")
embedding_model_tuned_v3 = SentenceTransformer("dawn78/minilm6_perfumerecommender_v3")
embedding_model_tuned_v4 = SentenceTransformer("dawn78/minilm6_perfumerecommender_v4")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/205 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/678 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.46k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/712k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/695 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/205 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/22.6k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/678 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.46k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/712k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/695 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/205 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/24.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/678 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.46k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/712k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/695 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/205 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/24.8k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/678 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.46k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/712k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/695 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

In [18]:
# 사용 예시
all_notes = list(set(perfume_data['notes'].explode()))  # 모든 향수 노트 리스트
models = {
    'vo': model,
    'v1': embedding_model_tuned_v1,
    'v2': embedding_model_tuned_v2,
    'v3': embedding_model_tuned_v3,
    'v4': embedding_model_tuned_v4

}

predictor = PerfumeNotePredictor(all_notes, models)

# 테스트 데이터
for i in range(5):
    i= random.randint(0, len(perfume_data))
    test_sentence = perfume_data['description'][i]
    real_notes = perfume_data['notes'][i]

    # 비교 실행
    predictor.compare_predictions(test_sentence, real_notes)
    print("\n")
#test_sentence = perfume_data['description'][120]
#real_notes = perfume_data['notes'][120]

# 비교 실행
#predictor.compare_predictions(test_sentence, real_notes)


🔹 Model Predictions Comparison:


Unnamed: 0,Real Notes,Predicted Notes (vo),Predicted Notes (v1),Predicted Notes (v2),Predicted Notes (v3),Predicted Notes (v4)
0,ylang ylang,silk tree blossom,lemon blossom,bergamot,lemon blossom,musk
1,sea notes,apple blossom,grapefruit blossom,blue hyacinth,pink rose,bergamot
2,white musk,orange blossom,pink grapefruit,lemon verbena,bergamot,amber
3,mandarin,lemon blossom,citrus notes,black vanilla husk,orange blossom,vanilla
4,passionfruit,night blooming cereus,citruses,balsam fir,lavender,sandalwood
5,grapefruit,cosmos flower,green mandarin,hyacinth,apple blossom,patchouli
6,green notes,pear blossom,lavender,honeysuckle,almond blossom,jasmine
7,orange blossom,grapefruit blossom,bergamot,labdanum,cardamom,rose
8,lemon,,,,,
9,bergamot,,,,,



🔹 Model Performance Metrics:


Unnamed: 0,Model,Matched Count,Accuracy (%)
0,vo,1,9.09
1,v1,1,9.09
2,v2,1,9.09
3,v3,2,18.18
4,v4,1,9.09





🔹 Model Predictions Comparison:


Unnamed: 0,Real Notes,Predicted Notes (vo),Predicted Notes (v1),Predicted Notes (v2),Predicted Notes (v3),Predicted Notes (v4)
0,caramel,lavender,bergamot,saffron,bergamot,musk
1,vetiver,muguet,lavender,biovanilla,lavender,bergamot
2,mandarin,baies rose,cardamom,bergamot,pink rose,amber
3,tonka bean,monoi oil,passionfruit,civet,cardamom,vanilla
4,clary sage,incense,black cardamom,labdanum,black cardamom,jasmine
5,,lady of the night flower,honey jasmine,black vanilla husk,rock rose,patchouli
6,,pink rose,vanilla,gurgum wood,night blooming jasmine,sandalwood
7,,tiare flower,floral notes,guaiac wood,pink pepper,rose



🔹 Model Performance Metrics:


Unnamed: 0,Model,Matched Count,Accuracy (%)
0,vo,0,0.0
1,v1,0,0.0
2,v2,0,0.0
3,v3,0,0.0
4,v4,0,0.0





🔹 Model Predictions Comparison:


Unnamed: 0,Real Notes,Predicted Notes (vo),Predicted Notes (v1),Predicted Notes (v2),Predicted Notes (v3),Predicted Notes (v4)
0,rose,lavender,lavender,black vanilla husk,pink rose,musk
1,amber,night blooming jasmine,bergamot,saffron,lavender,bergamot
2,vanilla,silk tree blossom,floral notes,bergamot,bergamot,amber
3,jasmine sambac,floral notes,vanilla,blue hyacinth,cardamom,vanilla
4,balsamic notes,night blooming cereus,honey jasmine,lilac,night blooming jasmine,jasmine
5,,lady of the night flower,lady of the night flower,biovanilla,rock rose,patchouli
6,,cosmos flower,night blooming jasmine,labdanum,lady of the night flower,sandalwood
7,,honey jasmine,cardamom,ambergris,black cardamom,rose



🔹 Model Performance Metrics:


Unnamed: 0,Model,Matched Count,Accuracy (%)
0,vo,0,0.0
1,v1,1,20.0
2,v2,0,0.0
3,v3,0,0.0
4,v4,3,60.0





🔹 Model Predictions Comparison:


Unnamed: 0,Real Notes,Predicted Notes (vo),Predicted Notes (v1),Predicted Notes (v2),Predicted Notes (v3),Predicted Notes (v4)
0,guaiac wood,leather,leather,coriander,lavender,musk
1,oud,wood,lavender,iron wood,bergamot,bergamot
2,leather,iron wood,bergamot,bergamot,pink rose,amber
3,sandalwood,white wood,iron wood,black vanilla husk,leather,vanilla
4,cedar,guaiac wood,blonde woods,suede,rock rose,leather
5,,aldehydes,aldehydes,gurgum wood,baies rose,sandalwood
6,,mahogany,mahogany,fir balsam,cardamom,jasmine
7,,lavender,black vanilla husk,dewberry,silk tree blossom,cedar



🔹 Model Performance Metrics:


Unnamed: 0,Model,Matched Count,Accuracy (%)
0,vo,2,40.0
1,v1,1,20.0
2,v2,0,0.0
3,v3,1,20.0
4,v4,3,60.0





🔹 Model Predictions Comparison:


Unnamed: 0,Real Notes,Predicted Notes (vo),Predicted Notes (v1),Predicted Notes (v2),Predicted Notes (v3),Predicted Notes (v4)
0,black cardamom,black vanilla husk,bergamot,saffron,bergamot,musk
1,balsam fir,black cardamom,lavender,bergamot,pink rose,bergamot
2,rosewood,black amber,spices,black vanilla husk,lavender,amber
3,patchouli,blackcurrant,black cardamom,honeysuckle,black cardamom,vanilla
4,praline,black pepper,lemon blossom,labdanum,cardamom,jasmine
5,cinnamon,black tea,black vanilla husk,musk,lemon blossom,patchouli
6,lemon,blackcurrant bud,honey jasmine,hyacinth,rock rose,sandalwood
7,black amber,clary sage,cardamom,ambergris,pink pepper,rose
8,sage,,,,,



🔹 Model Performance Metrics:


Unnamed: 0,Model,Matched Count,Accuracy (%)
0,vo,2,22.22
1,v1,1,11.11
2,v2,0,0.0
3,v3,1,11.11
4,v4,1,11.11






# Cross_encoder

In [None]:
from sentence_transformers.cross_encoder import CrossEncoder
cross_model = CrossEncoder('cross-encoder/stsb-roberta-large', num_labels=1)

In [None]:
file_path = "/content/drive/My Drive/hateslop_final/res/training_pairs_v4.csv"  # 파일 경로 지정
train_samples = pd.read_csv(file_path)

In [None]:
train_samples.rename(columns={'description': 'sentence1', 'note': 'sentence2', 'label': 'label'}, inplace=True)

In [None]:
train_samples_list = [
    InputExample(texts=[row['sentence1'], row['sentence2']], label=row['label'])
    for _, row in train_samples.iterrows()
]

In [None]:
train_batch_size = 32
num_epochs = 1
model_save_path = '/content/drive/My Drive/hateslop_final/cross_encoder_v4'

train_dataloader = DataLoader(train_samples_list, shuffle=True, batch_size=train_batch_size)

cross_model.fit(
    train_dataloader=train_dataloader,
    epochs=num_epochs,
    warmup_steps=100,
    output_path=model_save_path
)

Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/4536 [00:00<?, ?it/s]

In [None]:
!pip install transformers huggingface_hub



In [None]:
from huggingface_hub import notebook_login

notebook_login()


VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
from huggingface_hub import HfApi
api = HfApi()
repo_id = "cross_encoder_v4"
api.create_repo(repo_id=repo_id)

api.upload_folder(
    folder_path=model_save_path,
    repo_id=f"dawn78/{repo_id}",
    repo_type="model",
)

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/dawn78/cross_encoder_v4/commit/48daafdbe02c148bf898de02de94421dd16d1c3d', commit_message='Upload folder using huggingface_hub', commit_description='', oid='48daafdbe02c148bf898de02de94421dd16d1c3d', pr_url=None, repo_url=RepoUrl('https://huggingface.co/dawn78/cross_encoder_v4', endpoint='https://huggingface.co', repo_type='model', repo_id='dawn78/cross_encoder_v4'), pr_revision=None, pr_num=None)

In [None]:
from huggingface_hub import HfApi
api = HfApi()
repo_id = "cross_encoder_v3"
api.create_repo(repo_id=repo_id)

api.upload_folder(
    folder_path="/content/drive/MyDrive/hateslop_final/cross_encoder",
    repo_id=f"dawn78/{repo_id}",
    repo_type="model",
)

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/dawn78/cross_encoder_v3/commit/dd9f911bf86c7a4a1f541969d438bc29d94c8288', commit_message='Upload folder using huggingface_hub', commit_description='', oid='dd9f911bf86c7a4a1f541969d438bc29d94c8288', pr_url=None, repo_url=RepoUrl('https://huggingface.co/dawn78/cross_encoder_v3', endpoint='https://huggingface.co', repo_type='model', repo_id='dawn78/cross_encoder_v3'), pr_revision=None, pr_num=None)

In [None]:
# ✅ 학습된 모델 저장
cross_model.save(model_save_path)

print(f"✅ 모델이 {model_save_path} 경로에 저장되었습니다!")

✅ 모델이 /content/drive/My Drive/hateslop_final/cross_encoder_v4 경로에 저장되었습니다!


In [None]:
from sentence_transformers.cross_encoder.evaluation import CECorrelationEvaluator

cross_model_original = CrossEncoder('cross-encoder/stsb-roberta-large', num_labels=1)
ce_evaluator = CECorrelationEvaluator.from_input_examples(val_examples)
ce_evaluator(cross_model_original)

0.28732735800566295

In [None]:
ce_evaluator = CECorrelationEvaluator.from_input_examples(val_examples)
ce_evaluator(cross_model)
# 0.8650250798639563

0.7242223818224929