In [1]:
import torch
torch.cuda.is_available()

True

In [2]:
!pip install bertopic

Collecting bertopic
  Downloading bertopic-0.16.0-py2.py3-none-any.whl (154 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m154.1/154.1 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
Collecting hdbscan>=0.8.29 (from bertopic)
  Downloading hdbscan-0.8.33.tar.gz (5.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.2/5.2 MB[0m [31m65.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting umap-learn>=0.5.0 (from bertopic)
  Downloading umap-learn-0.5.5.tar.gz (90 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m90.9/90.9 kB[0m [31m12.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting sentence-transformers>=0.4.1 (from bertopic)
  Downloading sentence_transformers-2.5.1-py3-none-any.whl (156 kB)
[2K     [90m━━━━━

In [3]:
from bertopic import BERTopic
from hdbscan import HDBSCAN
from sklearn.feature_extraction.text import CountVectorizer
from bertopic.representation import PartOfSpeech
from bertopic.vectorizers import ClassTfidfTransformer
from bertopic.representation import MaximalMarginalRelevance
from sentence_transformers import SentenceTransformer
from umap import UMAP
import json
import spacy

In [4]:
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


In [5]:
with open("drive/MyDrive/data_ba/sorted_title_content_84-23_v6", "r") as fp:
        data_list = json.load(fp)

In [6]:
!python -m spacy download de_core_news_sm

Collecting de-core-news-sm==3.7.0
  Downloading https://github.com/explosion/spacy-models/releases/download/de_core_news_sm-3.7.0/de_core_news_sm-3.7.0-py3-none-any.whl (14.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m14.6/14.6 MB[0m [31m36.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: de-core-news-sm
Successfully installed de-core-news-sm-3.7.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('de_core_news_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [7]:
stopwords = []

with open("drive/MyDrive/data_ba/german_stopwords_plain.txt", "r") as file:

    for line in file:
        if not line.startswith(";"):
            stopwords.append(line[:-1])

In [8]:
def train_model(plain_texts, stopwords):

    sentence_model = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2')
    embeddings = sentence_model.encode(plain_texts, show_progress_bar=False)

    hdbscan_model = HDBSCAN(min_cluster_size=300, metric='euclidean', cluster_selection_method='eom', prediction_data=True)

    vectorizer_model = CountVectorizer(stop_words = stopwords, min_df=10)

    ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)

    pos_patterns = [
            [{'POS': 'ADJ'}],
            [{'POS': 'NOUN'}],
            [{'POS': 'PROPN'}]
    ]

    mmr = MaximalMarginalRelevance(diversity=1.0)
    pos = PartOfSpeech("de_core_news_sm", pos_patterns=pos_patterns)
    representation_models = [pos, mmr]

    topic_model = BERTopic(low_memory=True,
                            hdbscan_model=hdbscan_model,
                            ctfidf_model=ctfidf_model,
                            representation_model=representation_models,
                            vectorizer_model=vectorizer_model)

    topics, probs = topic_model.fit_transform(plain_texts, embeddings)

    table = topic_model.get_topic_info()
    table.to_csv('drive/MyDrive/data_ba/model_84-23_topics_v6.csv')
    print(topic_model.get_topic_info())

    return topic_model

In [9]:
def save_model(topic_model):

    # Method 1 - safetensors
    embedding_model = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
    topic_model.save("drive/MyDrive/data_ba/static_model_84-23_v6", serialization="safetensors", save_ctfidf=True, save_embedding_model=embedding_model)

In [10]:
data_list.pop()

topic_model = train_model(data_list, stopwords)
save_model(topic_model)

modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/4.12k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/645 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/471M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/480 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.08M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

    Topic  Count                                               Name  \
0      -1  65230                      -1_neue_neuen_letzten_grossen   
1       0  12022                           0_bmw_renault_neue_neuen   
2       1   9386  1_verletzte_entgegenkommenden_verletzten_fahre...   
3       2   8822                          2_neuen_neue_ersten_erste   
4       3   7604               3_airlines_boeing_crossair_deutschen   
5       4   4101                                   4_svp_sp_fdp_cvp   
6       5   2763   5_russischen_russische_amerikanischen_irakischen   
7       6   2744     6_angeklagte_angeklagten_verurteilte_bedingten   
8       7   2679  7_gestohlen_unbekannte_gestohlenen_verfolgungs...   
9       8   2290     8_unwetter_oberland_zahlreiche_schneebedeckten   
10      9   2154                           9_neue_neuen_alten_leben   
11     10   1804   10_brandstiftung_brennenden_brennende_unbekannte   
12     11   1676         11_erlaubten_kontrollierten_fuhren_letzten   
13    