In [1]:
import pandas as pd
import os

# Directory holding the input CSV files, and the files to read from it.
path = r"C:\Users\changjin\workspace\lab\pln\data_set\null_X"
files = [
    "attractions_fixed.csv",
    "restaurants_fixed.csv",
    "accommodations_fixed.csv",
]

# Pool every non-null "like" / "dislike" value from all files into one list.
texts = []
for fname in files:
    df = pd.read_csv(os.path.join(path, fname))
    for col in ("like", "dislike"):
        if col not in df.columns:
            continue  # this file does not have the column
        texts.extend(df[col].dropna().tolist())

print(f"총 텍스트 개수: {len(texts)}")

총 텍스트 개수: 2733


In [2]:
!pip install sentence-transformers



In [3]:
from sentence_transformers import SentenceTransformer

# Load the model
model = SentenceTransformer("sentence-transformers/all-mpnet-base-v2")

# Sample keywords
keywords = [
    "Scenic views", "Delicious food", "Comfortable bedding",
    "Historical temples", "Street food", "Pet-friendly",
]

# Convert the keywords to embeddings (returned as one tensor)
embeddings = model.encode(keywords, convert_to_tensor=True)

print(embeddings.shape)  # (6, 768)


  from .autonotebook import tqdm as notebook_tqdm
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


torch.Size([6, 768])


In [4]:
from sklearn.cluster import KMeans

# Example: group the sample keywords into 5 clusters.
num_clusters = 5
# n_init is passed explicitly: scikit-learn changed its default from 10 to
# "auto" in version 1.4, so omitting it makes the result depend on the
# installed version (and emits a FutureWarning on 1.2-1.3).
clustering_model = KMeans(n_clusters=num_clusters, random_state=42, n_init=10)
# scikit-learn works on NumPy arrays, so move the tensor to CPU and convert it.
clustering_model.fit(embeddings.cpu().numpy())

cluster_assignment = clustering_model.labels_

# Group the keywords by the cluster label they were assigned.
clusters = {}
for keyword, label in zip(keywords, cluster_assignment):
    # int() stores a plain Python int key instead of a NumPy integer scalar.
    clusters.setdefault(int(label), []).append(keyword)

# The loop variable is deliberately not called `cluster_keywords`: that name
# is also used for a function in this notebook and would be overwritten.
for label, members in clusters.items():
    print(f"Cluster {label}: {members}")


Cluster 2: ['Scenic views']
Cluster 1: ['Delicious food', 'Street food']
Cluster 0: ['Comfortable bedding']
Cluster 3: ['Historical temples']
Cluster 4: ['Pet-friendly']


In [7]:
import pandas as pd
import os
from tqdm import tqdm
from sentence_transformers import SentenceTransformer

# Input location
path = r"C:\Users\changjin\workspace\lab\pln\data_set\null_X"
files = [
    "attractions_fixed.csv",
    "restaurants_fixed.csv",
    "accommodations_fixed.csv",
]

# Accumulators for the raw "like" / "dislike" column values
like_keywords = []
dislike_keywords = []

# Read each file and append its non-null column values to the matching list
for fname in files:
    df = pd.read_csv(os.path.join(path, fname))
    for col, storage in (("like", like_keywords), ("dislike", dislike_keywords)):
        if col not in df.columns:
            continue  # this file does not have the column
        storage.extend(df[col].dropna().tolist())

# --- 1. Split on semicolons ---
def split_keywords(keyword_list):
    """Flatten semicolon-separated entries into a list of individual keywords.

    Each entry of ``keyword_list`` is split on ";"; every piece is stripped of
    surrounding whitespace and pieces that end up empty are dropped. The
    pieces are returned in their original order.
    """
    stripped_parts = (
        part.strip()
        for entry in keyword_list
        for part in entry.split(";")
    )
    return [part for part in stripped_parts if part]

# Break every raw entry into individual keywords.
clean_like, clean_dislike = map(split_keywords, (like_keywords, dislike_keywords))

print("✅ Cleaned Like:", clean_like[:10])
print("✅ Cleaned Dislike:", clean_dislike[:10])

# --- 2. Embedding ---
model = SentenceTransformer("sentence-transformers/all-mpnet-base-v2")

# Both keyword lists are encoded with the same settings.
like_embeddings = model.encode(
    clean_like,
    batch_size=64,
    convert_to_tensor=True,
    show_progress_bar=True,
)
dislike_embeddings = model.encode(
    clean_dislike,
    batch_size=64,
    convert_to_tensor=True,
    show_progress_bar=True,
)

print("Like embedding shape:", like_embeddings.shape)
print("Dislike embedding shape:", dislike_embeddings.shape)


✅ Cleaned Like: ['Scenic beauty', 'Quiet atmosphere', 'Cleanliness', 'Walking trails', 'Snow activities', 'Well-designed park', 'Walking paths', 'Clean environment', 'Attractive sculptures', 'Scenic beauty']
✅ Cleaned Dislike: ['Lack of microwave', 'Bring own towels', 'Disabled parking misuse', 'Closed restrooms', 'Limited attractions', 'Narrow paths', 'Challenging hike', 'Limited visibility', 'Lack of washbasins', 'Insects presence']


Batches: 100%|██████████| 122/122 [00:00<00:00, 126.34it/s]
Batches: 100%|██████████| 44/44 [00:00<00:00, 130.28it/s]

Like embedding shape: torch.Size([7776, 768])
Dislike embedding shape: torch.Size([2796, 768])





In [8]:
from sklearn.cluster import KMeans
import numpy as np
from collections import Counter

def cluster_keywords(keywords, embeddings, n_clusters=10, top_n=5, label="Like"):
    """
    Cluster keywords with KMeans and print representative keywords per cluster.

    Parameters
    ----------
    keywords : sequence of str
        Keywords to group. Duplicates are kept, since their frequency decides
        which keywords are printed as representatives.
    embeddings : array-like
        One row per keyword, in the same order as ``keywords``; passed
        unchanged to ``KMeans.fit_predict``.
    n_clusters : int
        Number of KMeans clusters.
    top_n : int
        Number of most frequent keywords printed for each cluster.
    label : str
        Name shown in the printed header.

    Returns
    -------
    dict
        Maps each cluster id (plain ``int``) to the list of keywords assigned
        to it, in input order.

    Raises
    ------
    ValueError
        If ``keywords`` and ``embeddings`` do not have the same number of rows.
    """
    # Fail early on misaligned inputs: otherwise extra embedding rows would be
    # ignored silently and missing rows would surface as an opaque IndexError.
    n_rows = embeddings.shape[0] if hasattr(embeddings, "shape") else len(embeddings)
    if n_rows != len(keywords):
        raise ValueError(
            f"keywords ({len(keywords)}) and embeddings ({n_rows}) must have the same length"
        )

    print(f"\n=== {label} Keywords Clustering ===")

    # KMeans clustering (fixed seed and n_init so runs are repeatable)
    kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
    labels = kmeans.fit_predict(embeddings)

    clusters = {}
    for kw, cluster_id in zip(keywords, labels):
        # int() keeps the dictionary keys as plain Python ints rather than
        # NumPy integer scalars.
        clusters.setdefault(int(cluster_id), []).append(kw)

    # Print the most frequent keywords of each cluster as its representatives;
    # most_common(top_n) already limits the result to top_n entries.
    for cluster_id, kws in clusters.items():
        top_keywords = [kw for kw, _ in Counter(kws).most_common(top_n)]
        print(f"Cluster {cluster_id}: {top_keywords}")

    return clusters

# 👉 Usage example (like / dislike clustered separately)
# Reuse the embeddings already computed for clean_like / clean_dislike rather
# than encoding every keyword a second time; .cpu().numpy() converts the
# tensors into NumPy arrays for scikit-learn.
like_clusters = cluster_keywords(clean_like, like_embeddings.cpu().numpy(),
                                 n_clusters=10, top_n=5, label="Like")

dislike_clusters = cluster_keywords(clean_dislike, dislike_embeddings.cpu().numpy(),
                                    n_clusters=8, top_n=5, label="Dislike")



=== Like Keywords Clustering ===
Cluster 7: ['Scenic view', 'Scenic views', 'Beautiful view', 'Ocean view', 'Scenic beauty']
Cluster 5: ['Pleasant atmosphere', 'Spacious room', 'Comfortable bedding', 'Quiet atmosphere', 'Cozy atmosphere']
Cluster 1: ['Cleanliness', 'Clean environment', 'Clean facilities', 'Clean interior', 'Clean room']
Cluster 4: ['Family-friendly', 'Convenient amenities', 'Good facilities', 'Well-maintained facilities', 'Well-equipped amenities']
Cluster 2: ['Friendly service', 'Friendly staff', 'Quick service', 'Attentive service', 'Excellent service']
Cluster 9: ['Variety of dishes', 'Flavorful broth', 'Diverse menu', 'Authentic taste', 'Spicy flavor']
Cluster 8: ['Fresh ingredients', 'Fresh seafood', 'High-quality ingredients', 'High-quality meat', 'Tender meat']
Cluster 6: ['Delicious food', 'Tasty food', 'Delicious taste', 'Delicious desserts', 'Delicious coffee']
Cluster 0: ['Good value', 'Affordable prices', 'Affordable price', 'Affordable pricing', 'Value fo