In [None]:
import requests
import pandas as pd

def fetch_comments(page_index=1, page_size=10, poi_id=81801, timeout=10):
    """Fetch one page of comments from the Ctrip comment-list endpoint.

    Args:
        page_index: Value sent as ``arg.pageIndex`` (defaults to 1).
        page_size: Value sent as ``arg.pageSize`` (defaults to 10).
        poi_id: Value sent as ``arg.poiId`` (defaults to 81801).
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A ``pandas.DataFrame`` with one row per comment and the columns
        "Comment ID", "User Nick", "User Image", "Comment Content",
        "Publish Time", "Useful Count", "Score" and "Images"; or ``None``
        when the request fails, the status code is not 200, or the body is
        not valid JSON (a message is printed in each case).
    """
    url = "https://m.ctrip.com/restapi/soa2/13444/json/getCommentCollapseList"
    headers = {
        "Content-Type": "application/json",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
    }

    payload = {
        "arg": {
            "channelType": 2,
            "collapseType": 0,
            "commentTagId": 0,
            "pageIndex": page_index,
            "pageSize": page_size,
            "poiId": poi_id,
            "sourceType": 1,
            "sortType": 3,
            "starType": 0
        },
        "head": {
            "cid": "09031058213352606128",
            "ctok": "",
            "cver": "1.0",
            "lang": "01",
            "sid": "8888",
            "syscode": "09",
            "auth": "",
            "xsid": "",
            "extension": []
        }
    }

    # Without a timeout, requests waits indefinitely on an unresponsive server.
    try:
        response = requests.post(url, headers=headers, json=payload, timeout=timeout)
    except requests.RequestException as exc:
        print(f"Failed to fetch data. Error: {exc}")
        return None

    if response.status_code != 200:
        print(f"Failed to fetch data. Status code: {response.status_code}")
        return None

    try:
        data = response.json()
    except ValueError as exc:
        print(f"Failed to fetch data. Invalid JSON: {exc}")
        return None

    # dict.get(key, default) only applies the default when the key is missing;
    # `or` also covers a key that is present with a null value.
    comments = (data.get("result") or {}).get("items") or []

    processed_data = []
    for comment in comments:
        user_info = comment.get("userInfo") or {}
        images = comment.get("images") or []
        processed_data.append({
            "Comment ID": comment.get("commentId"),
            "User Nick": user_info.get("userNick"),
            "User Image": user_info.get("userImage"),
            "Comment Content": comment.get("content"),
            "Publish Time": comment.get("publishTypeTag"),
            "Useful Count": comment.get("usefulCount"),
            "Score": comment.get("score"),
            "Images": [img.get("imageSrcUrl") for img in images]
        })

    return pd.DataFrame(processed_data)

# Run the single-page fetch; a None result means the request did not succeed.
dataframe = fetch_comments()
if dataframe is not None:
    # Plain-text dump of the fetched rows.
    print(dataframe)
    # Persist the page to disk without the positional index column.
    dataframe.to_csv(path_or_buf="comments.csv", index=False)


   Comment ID     User Nick  \
0   107267523        远行-遇见你   
1    92580773   132****2066   
2    71514243          阿断断断   
3    70847520   chri****_17   
4    69665032       1990有43   
5    69086037  littlehottie   
6    60015644     lmjazzman   
7    58875840           臭苗苗   
8    74205518         老力123   
9    87044839         行程的会员   

                                          User Image  \
0  https://dimg04.c-ctrip.com/images/0Z80g1200084...   
1  https://dimg04.c-ctrip.com/images/fd/headphoto...   
2  https://dimg04.c-ctrip.com/images/fd/headphoto...   
3  https://dimg04.c-ctrip.com/images/fd/headphoto...   
4  https://dimg04.c-ctrip.com/images/Z80j17000001...   
5  https://dimg04.c-ctrip.com/images/t1/headphoto...   
6  https://dimg04.c-ctrip.com/images/fd/headphoto...   
7  https://dimg04.c-ctrip.com/images/fd/headphoto...   
8  https://dimg04.c-ctrip.com/images/Z80o18000001...   
9  https://dimg04.c-ctrip.com/images/Z80618000001...   

                                     Comm

In [None]:
import requests
import pandas as pd

def fetch_comments(page_limit=60, page_size=10, poi_id=81801, timeout=10):
    """Fetch several pages of comments from the Ctrip comment-list endpoint.

    Pages 1..page_limit are requested in order. A page that fails (network
    error, non-200 status, or invalid JSON) is reported with ``print`` and
    skipped, so rows already collected from other pages are kept.

    Args:
        page_limit: Number of pages to request (defaults to 60).
        page_size: Value sent as ``arg.pageSize`` (defaults to 10).
        poi_id: Value sent as ``arg.poiId`` (defaults to 81801).
        timeout: Seconds to wait for each response before giving up.

    Returns:
        A ``pandas.DataFrame`` with one row per comment and the columns
        "Comment ID", "User Nick", "User Image", "Comment Content",
        "Publish Time", "Useful Count", "Score" and "Images". The frame is
        empty when nothing could be fetched; ``None`` is never returned.

    Note:
        Missing text fields are filled with the literal "N/A".
        ``pandas.read_csv`` treats that literal as missing by default, so
        these placeholders load back as NaN after a CSV round trip.
    """
    url = "https://m.ctrip.com/restapi/soa2/13444/json/getCommentCollapseList"
    headers = {
        "Content-Type": "application/json",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
    }

    all_comments = []

    # A single Session lets requests reuse the connection across pages.
    with requests.Session() as session:
        for page in range(1, page_limit + 1):
            payload = {
                "arg": {
                    "channelType": 2,
                    "collapseType": 0,
                    "commentTagId": 0,
                    "pageIndex": page,
                    "pageSize": page_size,
                    "poiId": poi_id,
                    "sourceType": 1,
                    "sortType": 3,
                    "starType": 0
                },
                "head": {
                    "cid": "09031058213352606128",
                    "ctok": "",
                    "cver": "1.0",
                    "lang": "01",
                    "sid": "8888",
                    "syscode": "09",
                    "auth": "",
                    "xsid": "",
                    "extension": []
                }
            }

            # Without a timeout a stalled server would hang the whole loop;
            # a failed page is skipped rather than aborting the scrape.
            try:
                response = session.post(url, headers=headers, json=payload, timeout=timeout)
            except requests.RequestException as exc:
                print(f"Failed to fetch data for page {page}. Error: {exc}")
                continue

            if response.status_code != 200:
                print(f"Failed to fetch data for page {page}. Status code: {response.status_code}")
                continue

            try:
                data = response.json()
            except ValueError as exc:
                print(f"Failed to fetch data for page {page}. Invalid JSON: {exc}")
                continue

            # `or` covers keys that are present with a null value, which
            # dict.get(key, default) does not.
            comments = (data.get("result") or {}).get("items") or []

            for comment in comments:
                user_info = comment.get("userInfo") or {}
                images = comment.get("images") or []
                all_comments.append({
                    "Comment ID": comment.get("commentId"),
                    "User Nick": user_info.get("userNick", "N/A"),
                    "User Image": user_info.get("userImage", "N/A"),
                    "Comment Content": comment.get("content", "N/A"),
                    "Publish Time": comment.get("publishTypeTag", "N/A"),
                    "Useful Count": comment.get("usefulCount", 0),
                    "Score": comment.get("score", 0.0),
                    "Images": [img.get("imageSrcUrl", "N/A") for img in images]
                })

    df = pd.DataFrame(all_comments)
    return df

# Fetch comments from 60 pages and save to a DataFrame
dataframe = fetch_comments(page_limit=60)
# The multi-page fetch_comments always returns a DataFrame (possibly empty), so
# a None check alone can never fail. Test for emptiness as well, otherwise an
# empty scrape writes a CSV that the later read_csv cell cannot parse.
if dataframe is not None and not dataframe.empty:
    # Display the DataFrame
    print(dataframe)
    # Save to a CSV file (without the positional index)
    dataframe.to_csv("comments_60_pages.csv", index=False)
else:
    print("No comments were fetched; comments_60_pages.csv was not written.")


     Comment ID            User Nick  \
0     107267523               远行-遇见你   
1      92580773          132****2066   
2      71514243                 阿断断断   
3      70847520          chri****_17   
4      69665032              1990有43   
..          ...                  ...   
587    89126117                 geer   
588    79451123  Little_monster_2018   
589   141595045          137****5124   
590    99461898                 AA杨素   
591   200329124                   친구   

                                            User Image  \
0    https://dimg04.c-ctrip.com/images/0Z80g1200084...   
1    https://dimg04.c-ctrip.com/images/fd/headphoto...   
2    https://dimg04.c-ctrip.com/images/fd/headphoto...   
3    https://dimg04.c-ctrip.com/images/fd/headphoto...   
4    https://dimg04.c-ctrip.com/images/Z80j17000001...   
..                                                 ...   
587  https://dimg04.c-ctrip.com/images/Z80i0w000000...   
588  https://dimg04.c-ctrip.com/images/Z8090o000000... 

In [None]:
# prompt: Load comments_60_pages.csv into a pandas DataFrame

import pandas as pd

df = pd.read_csv('comments_60_pages.csv')
df

Unnamed: 0,Comment ID,User Nick,User Image,Comment Content,Publish Time,Useful Count,Score,Images
0,107267523,远行-遇见你,https://dimg04.c-ctrip.com/images/0Z80g1200084...,登上汉拿山顶可以远眺整个济州市，从城板岳线路上然后观音寺线路下，一路上风景美不胜收，映入眼眸...,2017-05-18 发布点评,11,5.0,['https://dimg04.c-ctrip.com/images/100e0g0000...
1,92580773,132****2066,https://dimg04.c-ctrip.com/images/fd/headphoto...,在公车上看到一群老人穿着运动装，下车后跟着他们走就找到登山的路了，建议一定要穿专门的登山鞋，...,2016-12-01 发布点评,25,5.0,['https://dimg04.c-ctrip.com/images/100v0b0000...
2,71514243,阿断断断,https://dimg04.c-ctrip.com/images/fd/headphoto...,汉拿山，世遗，免费。问询处提供中文地图，一位中文很好的大姐把每段路用时多少公交信息都帮忙写在...,2016-03-13 发布点评,85,5.0,['https://dimg04.c-ctrip.com/images/fd/tg/g4/M...
3,70847520,chri****_17,https://dimg04.c-ctrip.com/images/fd/headphoto...,我们是2月份去的，山脚下就全是雪，可见山上的积雪有多厚。选择的是城板岳路线，单程9.6km，...,2016-02-14 发布点评,25,5.0,['https://dimg04.c-ctrip.com/images/fd/tg/g4/M...
4,69665032,1990有43,https://dimg04.c-ctrip.com/images/Z80j17000001...,极少有中国人，即便有1.不到顶2.装备不专业。所以如果爬，1.装备一定要专业（韩国人都巨专业...,2016-01-12 发布点评,18,5.0,['https://dimg04.c-ctrip.com/images/fd/tg/g3/M...
...,...,...,...,...,...,...,...,...
587,89126117,geer,https://dimg04.c-ctrip.com/images/Z80i0w000000...,一般吧，反正是座山，不算壮观。,2016-10-02 发布点评,0,3.0,[]
588,79451123,Little_monster_2018,https://dimg04.c-ctrip.com/images/Z8090o000000...,我们来到汉拿山公园的时候正好是阴天 山上雾蒙蒙的 也没有什么可看 能见度特别低,2016-07-01 发布点评,0,3.0,[]
589,141595045,137****5124,https://dimg04.c-ctrip.com/images/Z80o18000001...,小伙伴们，给大家发红包喽！人人可领，领完就能用。祝大家领取的红包金额大大大！#吱口令#长按复...,2017-11-06 发布点评,2,1.0,[]
590,99461898,AA杨素,https://dimg04.c-ctrip.com/images/Z80t14000000...,携程个垃圾还在销售韩国旅游产品呀？,2017-03-18 发布点评,3,1.0,[]


In [None]:
# prompt: Remove 发布点评 from the ["Publish Time"] column and show dates in a format like 2017-05-18

import pandas as pd

df = pd.read_csv('comments_60_pages.csv')

# Remove "发布点评" and format the date
def clean_publish_time(time_str):
    """Remove the "发布点评" marker from a publish-time value.

    Values that are not strings, and strings that do not contain the marker,
    are returned unchanged. Otherwise every occurrence of the marker is
    removed and leading/trailing whitespace is trimmed.
    """
    marker = "发布点评"
    if not isinstance(time_str, str) or marker not in time_str:
        return time_str
    return time_str.replace(marker, "").strip()

# Run the cleaner on every value so the marker text is dropped from the column.
df["Publish Time"] = df["Publish Time"].apply(clean_publish_time)


In [None]:
# Display the frame to inspect the cleaned "Publish Time" values.
df

Unnamed: 0,Comment ID,User Nick,User Image,Comment Content,Publish Time,Useful Count,Score,Images
0,107267523,远行-遇见你,https://dimg04.c-ctrip.com/images/0Z80g1200084...,登上汉拿山顶可以远眺整个济州市，从城板岳线路上然后观音寺线路下，一路上风景美不胜收，映入眼眸...,2017-05-18,11,5.0,['https://dimg04.c-ctrip.com/images/100e0g0000...
1,92580773,132****2066,https://dimg04.c-ctrip.com/images/fd/headphoto...,在公车上看到一群老人穿着运动装，下车后跟着他们走就找到登山的路了，建议一定要穿专门的登山鞋，...,2016-12-01,25,5.0,['https://dimg04.c-ctrip.com/images/100v0b0000...
2,71514243,阿断断断,https://dimg04.c-ctrip.com/images/fd/headphoto...,汉拿山，世遗，免费。问询处提供中文地图，一位中文很好的大姐把每段路用时多少公交信息都帮忙写在...,2016-03-13,85,5.0,['https://dimg04.c-ctrip.com/images/fd/tg/g4/M...
3,70847520,chri****_17,https://dimg04.c-ctrip.com/images/fd/headphoto...,我们是2月份去的，山脚下就全是雪，可见山上的积雪有多厚。选择的是城板岳路线，单程9.6km，...,2016-02-14,25,5.0,['https://dimg04.c-ctrip.com/images/fd/tg/g4/M...
4,69665032,1990有43,https://dimg04.c-ctrip.com/images/Z80j17000001...,极少有中国人，即便有1.不到顶2.装备不专业。所以如果爬，1.装备一定要专业（韩国人都巨专业...,2016-01-12,18,5.0,['https://dimg04.c-ctrip.com/images/fd/tg/g3/M...
...,...,...,...,...,...,...,...,...
587,89126117,geer,https://dimg04.c-ctrip.com/images/Z80i0w000000...,一般吧，反正是座山，不算壮观。,2016-10-02,0,3.0,[]
588,79451123,Little_monster_2018,https://dimg04.c-ctrip.com/images/Z8090o000000...,我们来到汉拿山公园的时候正好是阴天 山上雾蒙蒙的 也没有什么可看 能见度特别低,2016-07-01,0,3.0,[]
589,141595045,137****5124,https://dimg04.c-ctrip.com/images/Z80o18000001...,小伙伴们，给大家发红包喽！人人可领，领完就能用。祝大家领取的红包金额大大大！#吱口令#长按复...,2017-11-06,2,1.0,[]
590,99461898,AA杨素,https://dimg04.c-ctrip.com/images/Z80t14000000...,携程个垃圾还在销售韩国旅游产品呀？,2017-03-18,3,1.0,[]


In [None]:
# prompt: Drop the ["Comment ID"] column

# errors="ignore" keeps this cell re-runnable: df is reassigned here, so a
# second run would otherwise raise KeyError because the column is already gone.
df = df.drop(columns=["Comment ID"], errors="ignore")

In [None]:
# Display the frame after dropping "Comment ID".
# NOTE(review): the saved output of this cell also lacks "User Image" and
# "Images", which the preceding drop does not remove; the output looks stale,
# so re-run to confirm.
df

Unnamed: 0,User Nick,Comment Content,Publish Time,Useful Count,Score
0,远行-遇见你,登上汉拿山顶可以远眺整个济州市，从城板岳线路上然后观音寺线路下，一路上风景美不胜收，映入眼眸...,2017-05-18,11,5.0
1,132****2066,在公车上看到一群老人穿着运动装，下车后跟着他们走就找到登山的路了，建议一定要穿专门的登山鞋，...,2016-12-01,25,5.0
2,阿断断断,汉拿山，世遗，免费。问询处提供中文地图，一位中文很好的大姐把每段路用时多少公交信息都帮忙写在...,2016-03-13,85,5.0
3,chri****_17,我们是2月份去的，山脚下就全是雪，可见山上的积雪有多厚。选择的是城板岳路线，单程9.6km，...,2016-02-14,25,5.0
4,1990有43,极少有中国人，即便有1.不到顶2.装备不专业。所以如果爬，1.装备一定要专业（韩国人都巨专业...,2016-01-12,18,5.0
...,...,...,...,...,...
587,geer,一般吧，反正是座山，不算壮观。,2016-10-02,0,3.0
588,Little_monster_2018,我们来到汉拿山公园的时候正好是阴天 山上雾蒙蒙的 也没有什么可看 能见度特别低,2016-07-01,0,3.0
589,137****5124,小伙伴们，给大家发红包喽！人人可领，领完就能用。祝大家领取的红包金额大大大！#吱口令#长按复...,2017-11-06,2,1.0
590,AA杨素,携程个垃圾还在销售韩国旅游产品呀？,2017-03-18,3,1.0


In [None]:
# prompt: Convert ["Publish Time"] to a datetime dtype

import pandas as pd

# Reloading from disk rebinds df, discarding earlier in-memory edits to it.
df = pd.read_csv('comments_60_pages.csv')

# Remove "发布点评" and format the date
def clean_publish_time(time_str):
    """Return ``time_str`` without the "发布点评" marker.

    Only strings containing the marker are altered (marker removed, then
    surrounding whitespace stripped); any other value is passed through as-is.
    """
    has_marker = isinstance(time_str, str) and "发布点评" in time_str
    return time_str.replace("发布点评", "").strip() if has_marker else time_str

# Strip the marker text, then parse the result as datetimes in one step;
# values that cannot be parsed become NaT instead of raising.
df["Publish Time"] = pd.to_datetime(
    df["Publish Time"].apply(clean_publish_time),
    errors='coerce',
)

df

Unnamed: 0,Comment ID,User Nick,User Image,Comment Content,Publish Time,Useful Count,Score,Images
0,107267523,远行-遇见你,https://dimg04.c-ctrip.com/images/0Z80g1200084...,登上汉拿山顶可以远眺整个济州市，从城板岳线路上然后观音寺线路下，一路上风景美不胜收，映入眼眸...,2017-05-18,11,5.0,['https://dimg04.c-ctrip.com/images/100e0g0000...
1,92580773,132****2066,https://dimg04.c-ctrip.com/images/fd/headphoto...,在公车上看到一群老人穿着运动装，下车后跟着他们走就找到登山的路了，建议一定要穿专门的登山鞋，...,2016-12-01,25,5.0,['https://dimg04.c-ctrip.com/images/100v0b0000...
2,71514243,阿断断断,https://dimg04.c-ctrip.com/images/fd/headphoto...,汉拿山，世遗，免费。问询处提供中文地图，一位中文很好的大姐把每段路用时多少公交信息都帮忙写在...,2016-03-13,85,5.0,['https://dimg04.c-ctrip.com/images/fd/tg/g4/M...
3,70847520,chri****_17,https://dimg04.c-ctrip.com/images/fd/headphoto...,我们是2月份去的，山脚下就全是雪，可见山上的积雪有多厚。选择的是城板岳路线，单程9.6km，...,2016-02-14,25,5.0,['https://dimg04.c-ctrip.com/images/fd/tg/g4/M...
4,69665032,1990有43,https://dimg04.c-ctrip.com/images/Z80j17000001...,极少有中国人，即便有1.不到顶2.装备不专业。所以如果爬，1.装备一定要专业（韩国人都巨专业...,2016-01-12,18,5.0,['https://dimg04.c-ctrip.com/images/fd/tg/g3/M...
...,...,...,...,...,...,...,...,...
587,89126117,geer,https://dimg04.c-ctrip.com/images/Z80i0w000000...,一般吧，反正是座山，不算壮观。,2016-10-02,0,3.0,[]
588,79451123,Little_monster_2018,https://dimg04.c-ctrip.com/images/Z8090o000000...,我们来到汉拿山公园的时候正好是阴天 山上雾蒙蒙的 也没有什么可看 能见度特别低,2016-07-01,0,3.0,[]
589,141595045,137****5124,https://dimg04.c-ctrip.com/images/Z80o18000001...,小伙伴们，给大家发红包喽！人人可领，领完就能用。祝大家领取的红包金额大大大！#吱口令#长按复...,2017-11-06,2,1.0,[]
590,99461898,AA杨素,https://dimg04.c-ctrip.com/images/Z80t14000000...,携程个垃圾还在销售韩国旅游产品呀？,2017-03-18,3,1.0,[]


In [None]:
# Summarise column dtypes and non-null counts (checks the datetime conversion).
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 592 entries, 0 to 591
Data columns (total 8 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   Comment ID       592 non-null    int64         
 1   User Nick        588 non-null    object        
 2   User Image       588 non-null    object        
 3   Comment Content  592 non-null    object        
 4   Publish Time     592 non-null    datetime64[ns]
 5   Useful Count     592 non-null    int64         
 6   Score            592 non-null    float64       
 7   Images           592 non-null    object        
dtypes: datetime64[ns](1), float64(1), int64(2), object(4)
memory usage: 37.1+ KB


In [None]:
# Drop the columns not used in the analysis. `columns=` already identifies
# the axis, so the redundant `axis=1` is removed. Reassigning instead of using
# inplace=True, together with errors="ignore", lets the cell be re-run without
# a KeyError once the columns are gone.
df = df.drop(columns=["Comment ID", "User Image", "Images"], errors="ignore")


In [None]:
# Display the trimmed frame after the column drop.
df

Unnamed: 0,User Nick,Comment Content,Publish Time,Useful Count,Score
0,远行-遇见你,登上汉拿山顶可以远眺整个济州市，从城板岳线路上然后观音寺线路下，一路上风景美不胜收，映入眼眸...,2017-05-18,11,5.0
1,132****2066,在公车上看到一群老人穿着运动装，下车后跟着他们走就找到登山的路了，建议一定要穿专门的登山鞋，...,2016-12-01,25,5.0
2,阿断断断,汉拿山，世遗，免费。问询处提供中文地图，一位中文很好的大姐把每段路用时多少公交信息都帮忙写在...,2016-03-13,85,5.0
3,chri****_17,我们是2月份去的，山脚下就全是雪，可见山上的积雪有多厚。选择的是城板岳路线，单程9.6km，...,2016-02-14,25,5.0
4,1990有43,极少有中国人，即便有1.不到顶2.装备不专业。所以如果爬，1.装备一定要专业（韩国人都巨专业...,2016-01-12,18,5.0
...,...,...,...,...,...
587,geer,一般吧，反正是座山，不算壮观。,2016-10-02,0,3.0
588,Little_monster_2018,我们来到汉拿山公园的时候正好是阴天 山上雾蒙蒙的 也没有什么可看 能见度特别低,2016-07-01,0,3.0
589,137****5124,小伙伴们，给大家发红包喽！人人可领，领完就能用。祝大家领取的红包金额大大大！#吱口令#长按复...,2017-11-06,2,1.0
590,AA杨素,携程个垃圾还在销售韩国旅游产品呀？,2017-03-18,3,1.0


In [None]:
# prompt: Run topic modelling with 5 topics on the TF-IDF of df['Comment Content'] and extract the top 20 keywords with their weights for each topic

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation

# Turn each comment into a TF-IDF vector (vocabulary capped at 1000 terms;
# terms in fewer than 2 documents or in more than 95% of them are discarded).
# NOTE(review): the default token pattern only splits on non-word characters,
# so unsegmented Chinese text appears to yield whole phrases as "terms" (see
# the recorded topics), and the 'english' stop list would not filter them.
# Consider a Chinese tokenizer - TODO confirm.
vectorizer = TfidfVectorizer(
    stop_words='english',
    max_features=1000,
    min_df=2,
    max_df=0.95,
)
tfidf = vectorizer.fit_transform(df['Comment Content'].astype(str))

# Fit a 5-topic LDA model; the fixed random_state makes reruns repeatable.
lda = LatentDirichletAllocation(n_components=5, random_state=42).fit(tfidf)

feature_names = vectorizer.get_feature_names_out()

for topic_idx, topic in enumerate(lda.components_):
    # Indices of the 20 largest weights, highest first.
    top_keywords_indices = topic.argsort()[::-1][:20]
    top_keywords = [feature_names[i] for i in top_keywords_indices]
    top_weights = topic[top_keywords_indices]

    keyword_lines = [f"  - {kw}: {wt:.3f}" for kw, wt in zip(top_keywords, top_weights)]
    print(f"Topic {topic_idx + 1}:", *keyword_lines, "-" * 20, sep="\n")

Topic 1:
  - 风景如画: 4.200
  - 很大的公园: 2.890
  - 景色宜人: 2.682
  - 不错: 2.434
  - 景色不错: 2.206
  - 游客不多: 2.200
  - 无门票: 2.200
  - 7公里: 1.923
  - 白鹿潭: 1.922
  - 人也不多: 1.907
  - 很喜欢: 1.907
  - 韩国最高的山峰: 1.907
  - 冬天去的: 1.907
  - 汉拿山是韩国最高的山: 1.907
  - 汉拿山是韩国最高峰: 1.876
  - 雪很厚: 1.777
  - 风景非常漂亮: 1.630
  - 景色非常好: 1.627
  - 空气超好的: 1.614
  - 值得去的地方: 1.511
--------------------
Topic 2:
  - 950米: 4.882
  - 海拔高度为1: 3.939
  - 景色一般: 3.552
  - 很美: 3.473
  - 在济州岛任何地方都能看见: 3.470
  - 汉拿山意为: 3.430
  - 能拿下银河的高山: 3.430
  - 汉拿山是韩国三大名山之一: 3.314
  - 非常值得一去的地方: 3.041
  - 风景不错: 2.805
  - 是韩国最高的山: 2.699
  - 山顶上有约25: 2.605
  - 周围有360多个大小因火山爆发形成的小火山: 2.605
  - 是座神秘莫测的山: 2.557
  - 季节变化山色也变: 2.557
  - 看的角度不同山势就不同: 2.312
  - 一定要去: 2.200
  - 汉拿山是济州岛的名山: 2.200
  - 000年前因火山爆发而形成的直径500米的火山湖白鹿潭: 2.142
  - 韩国第一高峰: 1.913
--------------------
Topic 3:
  - 汉拿山: 8.761
  - 景色很美: 7.017
  - 空气清新: 5.211
  - 值得一去: 4.640
  - 海拔1950米: 3.992
  - 空气好: 3.603
  - 免费: 3.194
  - 不要门票: 2.992
  - 汉拿山是韩国第一高峰: 2.917
  - 哈哈: 2.783
  - 漂亮: 2.775
  - 人

In [None]:
# prompt: Cluster into 5 groups using df['combined'], store the cluster label in the last column, print the top 20 keywords with weights for each cluster, and also show how many rows are assigned to each cluster

from sklearn.cluster import KMeans

# NOTE(review): the prompt refers to df['combined'], but the features actually
# clustered below are the numeric 'Score' and 'Useful Count' columns - confirm
# which was intended.
kmeans = KMeans(n_clusters=5, random_state=42)  # 5 clusters, fixed seed
kmeans.fit(df[['Score', 'Useful Count']])
df['cluster'] = kmeans.labels_

# Number of rows per cluster label, largest cluster first.
cluster_sizes = df['cluster'].value_counts()

# Report the sizes followed by a separator line.
print("Cluster Sizes:", cluster_sizes, "-" * 20, sep="\n")

import numpy as np

# Get top keywords for each cluster.
# One TF-IDF vocabulary is fitted on all comments so that term weights are
# comparable across clusters.
vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, max_features=1000, stop_words='english')
tfidf = vectorizer.fit_transform(df['Comment Content'].astype(str))
feature_names = vectorizer.get_feature_names_out()

for cluster_num in range(5):
    cluster_data = df[df['cluster'] == cluster_num]['Comment Content'].astype(str)

    # Headers use the same 0-based label stored in df['cluster'] (and shown in
    # cluster_sizes) and include the number of comments in the cluster.
    print(f"Top Keywords for Cluster {cluster_num} ({len(cluster_data)} comments):")

    # An empty cluster has nothing to rank, and transform() rejects empty input.
    if cluster_data.empty:
        print("  (no comments in this cluster)")
        print("-" * 20)
        continue

    cluster_tfidf = vectorizer.transform(cluster_data)

    # Sum the TF-IDF weight of each term over all documents in the cluster.
    # The sparse column sum is 2-D (1 x n_terms); flatten it to 1-D so that
    # argsort ranks terms. On the 2-D result the slice selected rows instead,
    # which made the whole vocabulary print as a single "keyword".
    cluster_tfidf_sum = np.asarray(cluster_tfidf.sum(axis=0)).ravel()

    # Get top 20 keywords and their weights, highest weight first.
    top_keywords_indices = cluster_tfidf_sum.argsort()[::-1][:20]
    top_keywords = [feature_names[i] for i in top_keywords_indices]
    top_weights = cluster_tfidf_sum[top_keywords_indices]

    for keyword, weight in zip(top_keywords, top_weights):
        print(f"  - {keyword}: {weight:.3f}")
    print("-" * 20)

Cluster Sizes:
cluster
0    557
3     20
4     10
1      4
2      1
Name: count, dtype: int64
--------------------
Top Keywords for Cluster 1:
  - [['灵室下' '御乘生岳最富代表性' '5公里' '体力不好最好打车到车站' '灵室路线终点离公交车站有2' '且全是很陡的盘山下坡'
  '去之前在汉拿山博物院里看了关于汉拿山的纪录片' '旅行团也很喜欢去' '不然容易滑倒' '冬季登山一定得带上冰爪' '其中土赤岳'
  '周围的侧火山多达360个之多' '风化侵蚀等地形地貌' '有火山' '砂罗岳' '安排的时间有限我们只爬到半山腰处就急着往回走下山了'
  '由于是跟团的行程' '以后有机会在春天的时候再来爬爬汉拿山' '观音寺两条线路' '但能抵达顶峰白鹿潭的只有城板岳'
  '御里牧是几条线路中最容易爬的' '部分原始不平的石块路' '全程山路部分用木板铺路' '一路基本上都是高低不平大小不一的石头路'
  '走起来有点艰辛' '公交也非常方便' '就最原始的目的就是想爬汉拿山' '在北京经常爬山' '值得一来' '山真的是非常非常高'
  '累死我了' '发现自己低估山的难度' '但是风景极好' '但是来了以后' '所以眨眼就到了' '所以没把汉拿山当回事' '当初来济州岛'
  '路非常非常长' '这边的司机车都是开的飞快的' '一路风光很美' '回头从御里牧下山' '再向顿乃克方向走到主峰山脚下'
  '一心希望到顶能看到美景' '登顶拍照证明下山后可以用1000韩币办一个证书' '结果白茫茫一片雾' '我是从灵室上山到威势岳庇护所'
  '看到一个个中国人来到这里看美景' '可见汉拿山是韩国很经典的景点啦' '汉拿山就是韩国的一切' '建议早点上山'
  '所以大家一定要计划好时间' '所以很难爬' '如果中午12点前没有到这处' '特别是最后一段' '没有进行铺垫' '是山原来的石头'
  '此段汉拿山有一个休息区叫金达莱' '12点后是无法再往上爬了' '不像中国的山一样都有台阶' '冬季的最后登山时间是早上10点'
  '漂亮干净整洁' '对于没有看过太多雪的南方人来说冬日去看看雪是最好的' '秋天

In [None]:
# prompt: Visualise the frequency of each cluster with plotly and add data labels

import plotly.express as px

# Bar chart of rows per cluster; cluster_sizes is the Series of counts indexed
# by cluster label. update_traces returns the same figure, so the data labels
# (each bar's y value, drawn outside the bar) are chained onto construction.
fig = px.bar(
    cluster_sizes,
    x=cluster_sizes.index,
    y=cluster_sizes.values,
    title='Frequency of Each Cluster',
    labels={'x':'Cluster', 'y':'Frequency'},
).update_traces(texttemplate='%{y}', textposition='outside')

fig.show()