In [1]:
# Load the BERTopic package from the local checkout under ./BERTopic and register
# it in sys.modules under the name "bertopic", so that the later
# `from bertopic import BERTopic` resolves to this copy.
import importlib.util  # `import importlib` alone does not guarantee the `util` submodule is loaded
import sys

MODULE_PATH = "./BERTopic/bertopic/__init__.py"
MODULE_NAME = "bertopic"

spec = importlib.util.spec_from_file_location(MODULE_NAME, MODULE_PATH)
if spec is None or spec.loader is None:
    raise ImportError(f"Cannot build an import spec for {MODULE_NAME!r} from {MODULE_PATH!r}")
module = importlib.util.module_from_spec(spec)
# The module has to be registered before execution so that the package's own
# internal imports can find it.
sys.modules[spec.name] = module
try:
    spec.loader.exec_module(module)
except Exception:
    # Do not leave a half-initialised package behind; otherwise a later
    # `import bertopic` would silently pick up the broken module object.
    sys.modules.pop(spec.name, None)
    raise

In [2]:
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import pandas as pd
import numpy as np
import umap
import pickle
import logging

In [3]:
# Directory prefix used for every saved model, pickle and CSV in this notebook.
# Alternative location on the external drive:
# model_path='/media/marcin/Dane/model/'
model_path = "model/"

In [4]:
# Snapshot of system memory, taken before the corpus and the pickled
# embeddings are loaded in the following cells.
import psutil
psutil.virtual_memory()

svmem(total=59098333184, available=55806693376, percent=5.6, used=2679197696, free=54487445504, active=2669527040, inactive=1355632640, buffers=85372928, cached=1846317056, shared=9109504, slab=209686528)

In [4]:
# Full parsed corpus. The first CSV column becomes the index, and
# na_filter=False keeps empty cells as empty strings instead of NaN.
df_all = pd.read_csv(
    'parsed/corpus/all.csv',
    index_col=0,
    na_filter=False,
)

In [6]:
# Restore a previously fitted topic model together with its saved artefacts
# (topic assignments and the topics-over-time table).
import ast

topic_model = BERTopic.load(f'{model_path}2015-default', embedding_model='xlm-r-distilroberta-base-paraphrase-v1')

# topics.txt is parsed as a Python literal (presumably the repr of the list of
# topic ids — TODO confirm). ast.literal_eval only accepts literals, so nothing
# in the file can be executed the way eval() would allow.
with open(f'{model_path}topics.txt') as f:
    content = f.read()
    topics = ast.literal_eval(content)

topics_over_time = pd.read_csv(f'{model_path}topics_over_time.csv', index_col=0)

# NOTE(review): `df` is only created further down (the 2015 subset of df_all),
# so on a fresh kernel this line raises NameError unless that cell ran first.
df['topic'] = topics

In [59]:
# Polish stop words, one per line. The encoding is given explicitly so the
# diacritics do not depend on the platform's default locale (assumes the file
# is UTF-8 — TODO confirm). Blank lines are skipped so that no empty string
# ends up in the stop-word list.
with open('polish_stopwords.txt', encoding='utf-8') as f:
    stop_words = [word for word in (line.strip() for line in f) if word]

# Bag-of-words model used for the topic representations: uni- to tri-grams
# that occur in at least 5 documents, with the stop words removed.
vectorizer_model = CountVectorizer(stop_words=stop_words, ngram_range=(1, 3), min_df=5)

# NOTE(review): sentence_model is instantiated but not handed to BERTopic below
# (the embedding_model argument is commented out); precomputed embeddings are
# passed to fit_transform instead.
sentence_model = SentenceTransformer("xlm-r-distilroberta-base-paraphrase-v1")

topic_model = BERTopic(
    # embedding_model=sentence_model,
    vectorizer_model=vectorizer_model,
    # min_topic_size=5,
    low_memory=True,
    verbose=True)

In [6]:
# Restore the pickled "lexrank-top1" embeddings for 2015.
# Only unpickle files produced by this project: pickle can run arbitrary code on load.
with open(f'{model_path}embeddings-lexrank-top1-2015.pkl', mode="rb") as pickled:
    embeddings_top1 = pickle.load(pickled)

In [7]:
# Restore the pickled "lexrank-min5-1.0" embeddings for 2015 (kept under the name embeddings_top5).
# Only unpickle files produced by this project: pickle can run arbitrary code on load.
with open(f'{model_path}embeddings-lexrank-min5-1.0-2015.pkl', mode="rb") as pickled:
    embeddings_top5 = pickle.load(pickled)

In [8]:
# Restore the pickled "lexrank-weighted" embeddings for 2015.
# Only unpickle files produced by this project: pickle can run arbitrary code on load.
with open(f'{model_path}embeddings-lexrank-weighted-2015.pkl', mode="rb") as pickled:
    embeddings_weighted = pickle.load(pickled)

In [9]:
# Restore the pickled embeddings for the whole corpus; they are attached to
# df_all row by row in a later cell.
# Only unpickle files produced by this project: pickle can run arbitrary code on load.
with open(f'{model_path}embeddings-all.pkl', mode="rb") as pickled:
    embeddings_all = pickle.load(pickled)

In [10]:
# TF-IDF representation of the 2015 documents, using the same n-gram range,
# min_df and stop words as the CountVectorizer.
tfidf_vectorizer = TfidfVectorizer(
    stop_words=stop_words,
    ngram_range=(1, 3),
    min_df=5,
)
embeddings_tfidf = tfidf_vectorizer.fit_transform(
    df_all.loc[df_all["rok"] == "2015", "text"].tolist()
)

In [11]:
# One embedding row per corpus row, stored as an object column so that later
# row filters on df_all carry the matching embeddings along.
df_all["embedding"] = list(embeddings_all)

In [12]:
# Working subset: documents whose `rok` value is "2015", copied so that later
# column assignments do not touch df_all.
# df = df_all[df_all.rok >= "1991"]
df = df_all.loc[df_all["rok"] == "2015"].copy()
docs = df["text"].tolist()
# Stack the per-row embeddings back into a single array.
embeddings = np.array(df["embedding"].tolist())

In [13]:
# Drop the full-corpus frame and its embeddings; only the 2015 subset is used from here on.
del df_all, embeddings_all

In [14]:
# Sanity check: number of documents next to the shape of the embedding array built from df.
len(docs), embeddings.shape

(41930, (41930, 768))

In [146]:
def _fit_umap_2d(data, metric='cosine', random_state=42):
    """Fit a 2-component UMAP model on ``data`` and return the fitted estimator.

    All mappers in this notebook share n_neighbors=15 and min_dist=0.0.
    ``random_state`` is fixed so that repeated runs give the same layout;
    without it every fit produces a different embedding and the experiment
    results cannot be reproduced.
    """
    return umap.UMAP(n_neighbors=15,
                     n_components=2,
                     min_dist=0.0,
                     metric=metric,
                     random_state=random_state).fit(data)


# One mapper per embedding variant of the 2015 documents.
mapper_top1 = _fit_umap_2d(embeddings_top1)
mapper_top5 = _fit_umap_2d(embeddings_top5)
mapper_weighted = _fit_umap_2d(embeddings_weighted)
mapper_base = _fit_umap_2d(embeddings)

In [147]:
# 2-D UMAP of the TF-IDF matrix, using the Hellinger metric instead of the
# cosine metric used for the sentence-embedding mappers.
# random_state is fixed so that repeated runs give the same layout.
mapper_tfidf = umap.UMAP(n_neighbors=15,
                    n_components=2,
                    min_dist=0.0,
                    metric='hellinger',
                    random_state=42).fit(embeddings_tfidf)

In [181]:
# Select the fitted UMAP model whose `embedding_` is passed to BERTopic's
# fit_transform in the next cell.
# The commented lines below are an experiment log of alternative mapper
# combinations (UMAP models combined with the +, - and * operators). The
# trailing numbers are presumably the sizes of the largest clusters, starting
# with the outlier topic -1, and the value in parentheses presumably the
# number of topics — TODO confirm.
# NOTE(review): mapper_tfidf5 and mapper_top5_5 are not defined in the visible cells.
umap_combined = mapper_weighted
# mapper_top5 * mapper_tfidf - mapper_weighted                  # 15318 961 484 481 418 (589)
# mapper_tfidf                                                  # 11003 1287 796 649 538 (526)
# mapper_tfidf - mapper_weighted                                # 12619 825 371 330 297 (578)
# mapper_tfidf * mapper_top5                                    # 14532 1417 1403 1123 1108 (406)
# mapper_top5 * mapper_weighted                                 # 14618 1754 1420 1323 1125 (385)
# mapper_tfidf * (mapper_top5 + mapper_weighted)                # 15027 1528 1212 1189 1015 (407)
# mapper_top5 * mapper_weighted - mapper_tfidf                  # 15564 1186 927 541 412 (634)
# mapper_tfidf * (mapper_weighted + mapper_base)                # 15805 1823 1045 1002 890 (403)
# mapper_tfidf * mapper_top5 * (mapper_weighted + mapper_base)  # 15920 1615 1500 1075 972 (422)
# mapper_tfidf * (mapper_top5 - mapper_weighted)                # 15943, 1708, 1366, 1118, 997 (372)
# mapper_tfidf * mapper_top5 - mapper_base                      # 15968 1447 879 785 587 (534)
# mapper_tfidf * mapper_weighted                                # 16039 1771 1559 783 645 (452)
# mapper_tfidf * (mapper_weighted - mapper_base)                # 16192 1803 1554 1355 1274 (395)
# mapper_top5 * mapper_tfidf5                                   # >16k
# mapper_top5 * mapper_weighted * mapper_tfidf5                 # >16k
# mapper_top5 * mapper_weighted * mapper_tfidf                  # >16k
# mapper_top5 * mapper_weighted + mapper_tfidf                  # >16k
# mapper_tfidf + mapper_top5                                    # >16k
# mapper_top5 + mapper_tfidf5                                   # >17k
# mapper_top5 * (mapper_weighted - mapper_tfidf)                # >17k
# mapper_tfidf * mapper_top5 * mapper_weighted                  # >17k
# mapper_tfidf * mapper_top5 * mapper_base                      # >17k
# mapper_tfidf * (mapper_top5 - mapper_base)                    # >17k
# mapper_top5_5 * mapper_tfidf5                                 # >19k
# mapper_top5_5 * mapper_tfidf5                                 # >19k

In [182]:
# Fit the topic model on the 2015 documents. The document embeddings and the
# already-reduced UMAP coordinates are both supplied; the second return value
# is discarded.
topics, _ = topic_model.fit_transform(
    docs,
    embeddings,
    umap_embeddings=umap_combined.embedding_,
)

2021-04-04 21:33:30,457 - BERTopic - Clustered UMAP embeddings with HDBSCAN


In [183]:
# Table of topic ids with their document counts.
topic_model.get_topic_freq()

Unnamed: 0,Topic,Count
0,-1,14984
1,69,1881
2,138,1571
3,40,1454
4,28,1282
...,...,...
333,221,10
332,318,10
330,129,10
329,250,10


In [44]:
# Persist the fitted model (without its embedding model) and pickle the
# topic assignments next to it.
topic_model.save(f'{model_path}2015-lexrank-top5-1.0', save_embedding_model=False)
with open(f'{model_path}topics-2015-lexrank-top5-1.0.pkl', mode="wb") as topics_file:
    pickle.dump(topics, topics_file)

In [22]:
# Number of topic assignments held in `topics`.
len(topics)

41930

In [31]:
# Store the topic assignments as a column of df; the list is assigned
# positionally, so `topics` has to be in the same order as df's rows.
df["topic"] = topics

In [184]:
# Build the topic figure, resize it, export it as stand-alone HTML
# (plotly.js loaded from the CDN) and display it inline.
plot_topics = topic_model.visualize_topics()
plot_topics.update_layout(width=1200, height=1200)
plot_topics.write_html('plots/topics2015_weighted_best.html', include_plotlyjs='cdn')
plot_topics

In [15]:
# Compute the topics-over-time table from the document dates, grouped into
# 48 bins, and save it as CSV in the model directory.
topics_over_time = topic_model.topics_over_time(
    df["text"].tolist(),
    topics,
    df["date"].tolist(),
    nr_bins=48,
)
topics_over_time.to_csv(f'{model_path}topics_over_time.csv')

46it [03:21,  4.39s/it]


In [16]:
# Topics-over-time figure built with top_n=20, resized, exported as stand-alone
# HTML (plotly.js loaded from the CDN) and displayed inline.
plot_topics_over_time = topic_model.visualize_topics_over_time(topics_over_time, top_n=20)
plot_topics_over_time.update_layout(width=1920, height=700)
plot_topics_over_time.write_html('plots/topics_over_time_cdn.html', include_plotlyjs='cdn')
plot_topics_over_time

In [39]:
# Documents assigned to topic -1, together with their texts and embeddings,
# to be clustered again on their own.
df_1 = df.loc[df["topic"] == -1]
docs_1 = df_1["text"].tolist()
embeddings_1 = np.array(df_1["embedding"].tolist())

In [37]:
# Second topic model for the topic -1 documents; it reuses the same vectorizer.
topic_model_1 = BERTopic(
    vectorizer_model=vectorizer_model,
    low_memory=True,
    verbose=True,
)

In [40]:
# Fit the second model on the topic -1 documents. No precomputed UMAP
# coordinates are passed here; the second return value is discarded.
topics_1, _ = topic_model_1.fit_transform(
    docs_1,
    embeddings_1,
)

2021-04-04 10:33:58,126 - BERTopic - Reduced dimensionality with UMAP
2021-04-04 10:34:00,273 - BERTopic - Clustered UMAP embeddings with HDBSCAN


In [43]:
# Display the topic figure of the second model (fitted on the topic -1 documents).
topic_model_1.visualize_topics()