# Data

In [1]:
import pandas as pd
import numpy as np
from data import load_embeddings, save_file, load_file, model_path, LEXRANK_TOP1,LEXRANK_TOP3,LEXRANK_WEIGHTED, TFIDF_MORF, LEXRANK_TOP1xTFIDF, LEXRANK_TOP3xTFIDF, LEXRANK_WEIGHTEDxTFIDF

In [5]:
# Load the corpus. Both columns are needed downstream: `text` feeds the topic
# model, and `date` is read by the topics-over-time cell (`df.date.to_list()`).
# Loading only `text` makes that later cell fail on a fresh kernel.
df = pd.read_csv(model_path + 'iii.csv', usecols=['text', 'date'])

In [14]:
# Precomputed LEXRANK_WEIGHTED embeddings and the raw documents they belong to.
# NOTE(review): assumes the embeddings are row-aligned with `df` — confirm.
embeddings = load_embeddings(LEXRANK_WEIGHTED)
docs = df['text'].tolist()

# BERTopic

In [5]:
# Register the local BERTopic checkout under the name "bertopic" so that the
# `from bertopic import BERTopic` below resolves to it rather than to any
# installed distribution.
#
# `importlib.util` is a submodule: a bare `import importlib` does not guarantee
# it is loaded, so it is imported explicitly here.
import importlib.util
import sys

MODULE_PATH = "./BERTopic/bertopic/__init__.py"
MODULE_NAME = "bertopic"

spec = importlib.util.spec_from_file_location(MODULE_NAME, MODULE_PATH)
if spec is None or spec.loader is None:
    raise ImportError(f"Cannot build an import spec for {MODULE_NAME!r} from {MODULE_PATH!r}")

module = importlib.util.module_from_spec(spec)
# The module must be in sys.modules before it is executed so that imports
# performed inside the package itself can find it.
sys.modules[spec.name] = module
spec.loader.exec_module(module)

In [6]:
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
import hdbscan

In [15]:
# CountVectorizer was used here without ever being imported, which raises a
# NameError on a fresh kernel.
from sklearn.feature_extraction.text import CountVectorizer

# Read one stop word per line. The encoding is explicit so Polish diacritics
# decode the same way on every platform, and blank lines are skipped so they
# do not become empty-string stop words.
# NOTE(review): assumes polish_stopwords.txt is UTF-8 encoded — confirm.
with open('polish_stopwords.txt', encoding='utf-8') as f:
    stop_words = [line.strip() for line in f if line.strip()]

# Vocabulary for topic representations: uni- to tri-grams that occur in at
# least 5 documents, with the stop words above removed.
vectorizer_model = CountVectorizer(stop_words=stop_words, ngram_range=(1, 3), min_df=5)

sentence_model = SentenceTransformer("xlm-r-distilroberta-base-paraphrase-v1")

In [16]:
# Clusterer handed to BERTopic: a cluster needs at least 100 members, and
# min_samples is set to 1.
hdbscan_model = hdbscan.HDBSCAN(
    min_samples=1,
    min_cluster_size=100,
)

# Assemble the topic model from the components built in the previous cells.
topic_model = BERTopic(
    verbose=True,
    low_memory=True,
    hdbscan_model=hdbscan_model,
    vectorizer_model=vectorizer_model,
    embedding_model=sentence_model,
)

In [17]:
# Fit the model and get one topic id per document.
# The second return value is given a real name instead of `_`: in IPython,
# assigning to `_` shadows the automatic "last output" variable for the rest
# of the session.
# NOTE(review): `umap_embeddings` is a keyword of the locally loaded BERTopic
# checkout; presumably the precomputed embeddings are clustered directly,
# which matches the recorded log line below — confirm against that checkout.
topics, probabilities = topic_model.fit_transform(docs, umap_embeddings=embeddings)

2021-04-23 15:26:39,287 - BERTopic - Clustered UMAP embeddings with HDBSCAN


In [18]:
# Display the number of documents assigned to each topic (Topic / Count table).
# NOTE(review): topic -1 is presumably BERTopic's outlier bucket; in the
# recorded output it is by far the largest group — confirm before interpreting.
topic_model.get_topic_freq()

Unnamed: 0,Topic,Count
0,-1,137494
1,210,13437
2,30,9596
3,17,9087
4,83,5739
...,...,...
250,22,103
251,204,102
252,232,101
253,199,101


In [19]:
# Build BERTopic's topic visualisation, resize it, export it as a standalone
# HTML file (plotly.js loaded from the CDN), then show it inline.
plot_topics = topic_model.visualize_topics()
# update_layout modifies the figure in place, so the export below sees the new size.
plot_topics.update_layout(width=1200, height=1200)
plot_topics.write_html('plots/weighted_100x1.html', include_plotlyjs='cdn')
plot_topics

In [21]:
# Persist the fitted topic model. The sentence-embedding model is left out of
# the file, so it has to be supplied again when the model is loaded.
topic_model.save(
    f'{model_path}lexrank-weighted/model.pkl',
    save_embedding_model=False,
)

NameError: name 'save_file' is not defined

In [23]:
# Store the per-document topic assignments.
save_file(topics, 'lexrank-weighted/topics.pkl')

# Build a short label for every topic: the first element of each of its first
# five get_topic() entries, joined with " | ".
topic_list = sorted(topic_model.topics.keys())
topic_words = {
    topic_id: " | ".join(entry[0] for entry in topic_model.get_topic(topic_id)[:5])
    for topic_id in topic_list
}
save_file(topic_words, 'lexrank-weighted/topic-words.pkl')

In [7]:
# Restore the saved topic model. It was saved without its embedding model, so
# a fresh SentenceTransformer instance is passed in here.
topic_model = BERTopic.load(
    f'{model_path}lexrank-weighted/model.pkl',
    embedding_model=SentenceTransformer("xlm-r-distilroberta-base-paraphrase-v1"),
)

# Restore the per-document topic assignments saved alongside the model.
topics = load_file('lexrank-weighted/topics.pkl')

# Disabled alternatives, kept for reference:
# topics_over_time = pd.read_csv(f'{model_path}topics_over_time.csv', index_col=0)

# df['topic'] = topics

In [13]:
# Compute topic representations over 56 time bins and save them as CSV.
# This requires `df` to contain a `date` column as well as `text`.
# The recorded run of this cell took roughly 4 h 55 min.
topics_over_time = topic_model.topics_over_time(
    df['text'].tolist(),
    topics,
    df['date'].tolist(),
    nr_bins=56,
)
topics_over_time.to_csv(f'{model_path}lexrank-weighted/topics_over_time.csv')

56it [4:54:35, 315.63s/it]


In [16]:
# Plot the 20 top topics over time, resize the figure, export it as a
# standalone HTML file (plotly.js loaded from the CDN), then show it inline.
plot_topics_over_time = topic_model.visualize_topics_over_time(topics_over_time, top_n=20)
# update_layout modifies the figure in place, so the export below sees the new size.
plot_topics_over_time.update_layout(width=2000, height=1280)
plot_topics_over_time.write_html('plots/lexrank-weighted_tot.html', include_plotlyjs='cdn')
plot_topics_over_time