In [None]:
from pathlib import Path

import pandas as pd
from bertopic import BERTopic

# データダウンロード
[livedoor ニュースコーパス](https://www.rondhuit.com/download.html#ldcc)を利用する。アーカイブは `data/` ディレクトリにダウンロードし、その場で展開する。

In [None]:
# Directory that holds the corpus: the "data" folder next to the current
# working directory (i.e. a sibling of the directory the notebook runs in).
working_dir = Path().resolve()
data_root_path = working_dir.parent / "data"

In [None]:
# download data
!wget -P $data_root_path https://www.rondhuit.com/download/ldcc-20140209.tar.gz
!cd $data_root_path && tar zxvf ldcc-20140209.tar.gz

In [None]:
# Preprocess: collect the body text of every article under <data_root_path>/text.
livedoor_news_data_path = data_root_path.joinpath("text")

# Files that match "*.txt" but are skipped because they are not articles.
non_article_names = {"CHANGES.txt", "README.txt", "LICENSE.txt"}

news_data: list[str] = []
# sorted() fixes the document order so that row order downstream does not
# depend on the order in which the filesystem happens to list the files.
for data_path in sorted(livedoor_news_data_path.glob("**/*.txt")):
    if data_path.name in non_article_names:
        continue

    # Decode explicitly as UTF-8 instead of relying on the platform's locale
    # default, which would mis-decode or fail on the Japanese text elsewhere.
    with data_path.open("r", encoding="utf-8") as fin:
        # The first three lines of each file are dropped (presumably the
        # URL / timestamp / title header - confirm against the corpus README);
        # the remaining lines are concatenated with their newlines removed.
        news_data.append("".join(line.rstrip("\n") for line in fin.readlines()[3:]))

In [None]:
# Check the data size. Fail here with a clear message when nothing was loaded,
# rather than letting the empty list reach the model fitting step.
assert news_data, f"no articles found under {livedoor_news_data_path}"
len(news_data)

# BERTopic を使ってみる

In [None]:
# Load the model and cluster the articles.
# A multilingual embedding model is used so that the Japanese text can be handled.
# NOTE(review): topic numbers may differ between runs unless the pipeline is
# seeded - confirm against the BERTopic reproducibility notes before relying
# on specific topic ids further down.
embedding_model_name = "paraphrase-multilingual-MiniLM-L12-v2"
model = BERTopic(embedding_model=embedding_model_name)
topics, probs = model.fit_transform(news_data)

In [None]:
# Show the Intertopic Distance Map of the fitted model.
# The call is the cell's last expression, so its return value is what the
# notebook displays as the cell output.
model.visualize_topics()

In [None]:
# Show the Topic Word Scores bar chart of the fitted model.
# The call is the cell's last expression, so its return value is what the
# notebook displays as the cell output.
model.visualize_barchart()

## 結果を見る

In [None]:
# Build the analysis table: one row per article, combining the article text
# with the two outputs of model.fit_transform (topics -> "topic_no",
# probs -> "proba"), then preview the first ten rows.
result_columns = {
    "news_text": news_data,
    "topic_no": topics,
    "proba": probs,
}
result_df = pd.DataFrame(result_columns)
result_df.head(10)

In [None]:
# show topic 0
result_df[result_df.topic_no == 0].sort_values("proba", ascending=False).head(10)

In [None]:
result_df[result_df.topic_no == 2].sort_values("proba", ascending=False).head(10)

In [None]:
result_df[result_df.topic_no == 3].sort_values("proba", ascending=False).head(10)

In [None]:
result_df[result_df.topic_no == 4].sort_values("proba", ascending=False).head(10)