**This notebook was written in Google Colab.**

# Settings

In [5]:
!pip install gensim



In [6]:
import pandas as pd
from sklearn.model_selection import train_test_split
import pickle
import re
import numpy as np
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from gensim.models import FastText
import nltk
# Fetch the NLTK resources this notebook relies on: the punkt tokenizer data
# (`punkt` / `punkt_tab`, presumably what `word_tokenize` needs — confirm for the
# installed NLTK version) and the `stopwords` corpus used during preprocessing.
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [2]:
# Mount Google Drive at /content/drive; the corpus and model paths used in the
# cells below all live under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Train and Save Model

In [None]:
# Load the pickled corpus from the mounted Drive folder.
# NOTE(review): pickle.load can execute arbitrary code while deserializing —
# only point this at a file you produced yourself or otherwise trust.
with open("/content/drive/MyDrive/hateslop_final/res/corpus.pkl","rb") as f:
    corpus = pickle.load(f)

In [None]:
# Sample perfume-description data.
# NOTE(review): this self-assignment rebinds `corpus` to itself and has no
# effect; `corpus` must already exist (it is unpickled in the preceding cell).
corpus = corpus

# Preprocessing: lowercase, strip punctuation, tokenize, drop English stopwords
_ENGLISH_STOPWORDS = None  # filled in on first use by _english_stopwords()


def _english_stopwords():
    """Return the English stopwords as a frozenset, building it only once."""
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        _ENGLISH_STOPWORDS = frozenset(stopwords.words("english"))
    return _ENGLISH_STOPWORDS


def preprocess_text(text):
    """Turn one raw sentence into a list of cleaned tokens.

    Steps, in order: lowercase the text, delete every character that is
    neither a word character nor whitespace, tokenize with `word_tokenize`,
    and drop tokens that appear in the English stopword list.

    Args:
        text: The sentence to clean (a `str`).

    Returns:
        list[str]: The remaining tokens, in their original order.
    """
    text = text.lower()
    text = re.sub(r"[^\w\s]", "", text)
    # Fetch the stopword set once per call (and build it once overall) instead
    # of re-evaluating stopwords.words("english") for every single token.
    stop_words = _english_stopwords()
    return [word for word in word_tokenize(text) if word not in stop_words]

# Run the preprocessing over every sentence of the corpus
tokenized_corpus = list(map(preprocess_text, corpus))

# Fit the FastText model on the tokenized sentences
fasttext_model = FastText(
    sentences=tokenized_corpus,
    vector_size=100,
    window=5,
    min_count=1,
    sg=1,
    workers=4,
)

# Persist the trained model to Drive
fasttext_model.save("/content/drive/My Drive/hateslop_final/fasttext_perfume.model")


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


# Load Model

In [7]:
# Load the FastText model that the training section saved to Drive
fasttext_model = FastText.load("/content/drive/My Drive/hateslop_final/fasttext_perfume.model")

In [8]:
# Look up the 5 entries the model considers most similar to the query word.
# NOTE(review): the intent appears to be 'vanilla', but the query string is
# spelled "vanila" — confirm whether the misspelling is deliberate (e.g. to
# probe how the model handles an unseen spelling) or simply a typo.
similar_notes = fasttext_model.wv.most_similar("vanila", topn=5)

print(similar_notes)

[('interlaced', 0.9997525811195374), ('smoky', 0.9996361136436462), ('driftwood', 0.9993314146995544), ('interwoven', 0.9993111491203308), ('richer', 0.9991893172264099)]


In [9]:
# Notes supplied as input.
# NOTE(review): "ceder" and "patchuli" look like misspellings of cedar and
# patchouli — confirm whether that is intentional.
input_notes = {"bergamot", "vanilla", "ceder", "patchuli", "vetiver"}

# Seed the result with the input notes, then add each note's 3 nearest neighbours
expanded_notes = input_notes.copy()
for note in input_notes:
    try:
        similar_words = fasttext_model.wv.most_similar(note, topn=3)
    except KeyError:
        # The model could not resolve this note; report it and carry on
        print(f"{note} not in vocabulary")
    else:
        # Each result is indexed as (word, ...); only the word is kept
        expanded_notes |= {match[0] for match in similar_words}

print("확장된 노트 목록:", expanded_notes)


확장된 노트 목록: {'dry', 'taste', 'patchuli', 'empowered', 'hearts', 'bergamot', 'customs', 'passiflora', 'symphony', 'avenue', 'vanilla', 'muracciole', 'merge', 'enrich', 'halloween', 'ceder', 'foliage', 'vetiver', 'raspberry', 'smoky'}


In [None]:
# Convert a perfume's note string into a single embedding vector
def perfume_to_vector(notes):
    """Average the FastText vectors of the notes listed in `notes`.

    Args:
        notes: Comma-separated note names, e.g. "bergamot, vanilla". Whitespace
            around each name is ignored and empty entries are skipped.

    Returns:
        numpy.ndarray: The element-wise mean of the vectors of every note that
        `fasttext_model.wv` reports as present. If no note qualifies, a zero
        vector whose length is the model's `vector_size`.

    NOTE(review): whether `note in fasttext_model.wv` can ever be False depends
    on the model's subword configuration — confirm against the gensim docs if
    strictly in-vocabulary filtering is intended.
    """
    wv = fasttext_model.wv
    # Split on bare commas and strip, so "a,b", "a, b" and "a , b" all parse alike.
    note_list = [note.strip() for note in notes.split(",")]
    vectors = [wv[note] for note in note_list if note and note in wv]

    if vectors:
        return np.mean(vectors, axis=0)  # mean vector across the notes
    # Size the fallback from the model rather than a hard-coded 100, so it
    # stays correct if the model is retrained with a different dimensionality.
    return np.zeros(wv.vector_size)