<a href="https://colab.research.google.com/github/maciejskorski/anticipatio/blob/main/src/BERTopic.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import locale
def getpreferredencoding(do_setlocale: bool = True) -> str:
    """Stand-in for ``locale.getpreferredencoding`` that always reports UTF-8.

    Args:
        do_setlocale: Accepted only for signature compatibility with the
            function being replaced; it is not used.

    Returns:
        The constant string ``"UTF-8"``.
    """
    return "UTF-8"


# Install the stub on the locale module so that every later call to
# locale.getpreferredencoding() returns "UTF-8".
# NOTE(review): presumably a workaround for a runtime that reports a
# non-UTF-8 locale when shell commands are run from the notebook - confirm.
setattr(locale, "getpreferredencoding", getpreferredencoding)

In [24]:
!pip install itables --quiet
!pip install bertopic[spacy] --quiet

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m198.6/198.6 kB[0m [31m11.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m61.4 MB/s[0m eta [36m0:00:00[0m
[?25h

# Data

In [6]:
# The CSV files read in this cell come from a local checkout of the repository
# below, expected at ./anticipatio. The clone command is commented out, so on a
# fresh runtime uncomment the next line (once) before running the cell.
#!git clone https://github.com/maciejskorski/anticipatio.git

import pandas as pd
from pathlib import Path

def open_fn(f):
    """Read one CSV file into a DataFrame, tolerating unreadable files.

    Args:
        f: Path of the CSV file; passed straight to ``pd.read_csv``.

    Returns:
        The parsed DataFrame, or an empty DataFrame when the file cannot be
        read or parsed, so that one bad file does not abort the whole load.
    """
    import warnings  # local import keeps the function self-contained

    try:
        return pd.read_csv(f, engine='python')
    except Exception as err:
        # Catch Exception instead of using a bare except so KeyboardInterrupt
        # and SystemExit can still stop a long load, and report the skipped
        # file rather than dropping it silently.
        warnings.warn(f"Skipping {f}: {type(err).__name__}: {err}")
        return pd.DataFrame()

# Collect every CSV under the dataset folder. The list is sorted because rglob
# yields files in filesystem order, which would otherwise make the row order
# (and therefore any positional slice of `docs`) differ between machines.
files = sorted(Path('anticipatio/data/futurists_kol/data').rglob('*csv'))
if not files:
    # Fail with a clear message instead of pd.concat's "No objects to concatenate".
    raise FileNotFoundError(
        "No CSV files found under 'anticipatio/data/futurists_kol/data'; "
        "clone the anticipatio repository first."
    )

# One frame per file, stacked with a fresh 0..n-1 index.
tweets = pd.concat(map(open_fn, files), ignore_index=True)
tweets.columns = ['index','user','timestamp','url','txt']
print(tweets['user'].nunique(),len(tweets)) # last recorded run: 256 users, 1254243 tweets
tweets['txt'] = tweets['txt'].astype(str)
tweets['timestamp'] = pd.to_datetime(tweets['timestamp'])

# Text column handed to the topic model in the following cells.
docs = tweets['txt']

# Kept as the last expression so the preview is actually rendered.
tweets.head()

Cloning into 'anticipatio'...
remote: Enumerating objects: 850, done.[K
remote: Counting objects: 100% (12/12), done.[K
remote: Compressing objects: 100% (11/11), done.[K
remote: Total 850 (delta 3), reused 2 (delta 0), pack-reused 838[K
Receiving objects: 100% (850/850), 119.31 MiB | 15.78 MiB/s, done.
Resolving deltas: 100% (219/219), done.
Updating files: 100% (516/516), done.
256 1254243


In [7]:
import gc
# Drop the notebook's reference to the full tweets DataFrame and run a garbage
# collection pass. `docs` (the 'txt' column taken in the data cell) is still
# referenced and is what the following cells use.
# NOTE: re-running this cell without re-running the data cell raises NameError,
# because `tweets` no longer exists.
del tweets
# Last expression of the cell: its return value is shown as the cell output.
gc.collect()

100

# Model

In [8]:
from bertopic import BERTopic

# Topic model configuration, one option per line for easier tuning.
topic_model = BERTopic(
    min_topic_size=100,
    nr_topics="auto",
    low_memory=True,
    calculate_probabilities=False,
    verbose=True,
)

# Fit on the first 200000 documents only; `topics` and `probs` are the two
# values returned by fit_transform for that subset.
topics, probs = topic_model.fit_transform(docs[:200000])

Downloading (…)e9125/.gitattributes:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

Downloading (…)_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading (…)7e55de9125/README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

Downloading (…)55de9125/config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading (…)125/data_config.json:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading (…)e9125/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

Downloading (…)9125/train_script.py:   0%|          | 0.00/13.2k [00:00<?, ?B/s]

Downloading (…)7e55de9125/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)5de9125/modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

Batches:   0%|          | 0/6250 [00:00<?, ?it/s]

2023-06-03 13:05:56,419 - BERTopic - Transformed documents to Embeddings
2023-06-03 13:25:21,844 - BERTopic - Reduced dimensionality
2023-06-03 13:26:10,665 - BERTopic - Clustered reduced embeddings
2023-06-03 13:26:28,581 - BERTopic - Reduced number of topics from 169 to 118


In [None]:
# Optional, currently disabled: merge the fitted topics down to 30 and refresh
# `topics` from the model.
# NOTE(review): the model above was fitted on docs[:200000], whereas this call
# passes the full `docs`; presumably the same slice should be used - verify
# before enabling.
#topic_model.reduce_topics(docs, nr_topics=30)
#topics = topic_model.topics_

# Topics

In [9]:
# Build the model's topic visualization and render it explicitly; `fig` keeps
# the figure object available for later cells.
fig = topic_model.visualize_topics()
fig.show()

In [12]:
# Hierarchy view of the fitted topics; as the cell's last expression the
# returned object is displayed directly.
topic_model.visualize_hierarchy()

In [26]:
from itables import init_notebook_mode, show

# Fetch the per-topic summary table from the fitted model.
topic_info = topic_model.get_topic_info()

# Enable itables' interactive rendering, then show the summary table with it.
init_notebook_mode(all_interactive=True)
show(topic_info)

Topic,Count,Name,Representation,Representative_Docs
Loading... (need help?),,,,


# Validation

In [13]:
from gensim.models.coherencemodel import CoherenceModel
from gensim import corpora

# Reuse the fitted model's own vectorizer/analyzer so the documents are
# tokenised the same way as when the topic representations were built.
vectorizer = topic_model.vectorizer_model
analyzer = vectorizer.build_analyzer()

# Extract features for Topic Coherence evaluation.
# `words` is not consumed by the coherence computation below; it is kept as a
# notebook-level name for inspection.
words = vectorizer.get_feature_names_out()
# NOTE(review): tokens are built from all of `docs`, while the model was fitted
# on docs[:200000] - confirm that scoring against the full corpus is intended.
tokens = [analyzer(doc) for doc in docs]
dictionary = corpora.Dictionary(tokens)

# Top words of every real topic. Topic ids are taken from the assignments
# themselves and only the outlier id -1 is excluded, rather than assuming that
# -1 is always present (which would skip the last topic when it is not).
topic_ids = sorted(topic_id for topic_id in set(topics) if topic_id != -1)
topic_words = [[word for word, _ in topic_model.get_topic(topic_id)]
               for topic_id in topic_ids]

# Evaluate: c_v coherence of the topic word lists against the tokenised texts.
coherence_model = CoherenceModel(topics=topic_words,
                                 texts=tokens,
                                 dictionary=dictionary,
                                 coherence='c_v')
coherence = coherence_model.get_coherence()
print(coherence)

0.6858124681358374
