<a href="https://colab.research.google.com/github/maciejskorski/anticipatio/blob/main/src/BERTopic.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install bertopic[spacy] --quiet
!python -m spacy download en_core_web_md --quiet

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.4/143.4 kB[0m [31m13.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.2/5.2 MB[0m [31m111.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m88.2/88.2 kB[0m [31m12.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 kB[0m [31m10.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.1/7.1 MB[0m [31m106.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m81.2 MB/s[0m eta 

# Data

In [38]:
import locale
def getpreferredencoding(do_setlocale=True):
    """Stand-in for ``locale.getpreferredencoding`` that always reports UTF-8.

    ``do_setlocale`` is accepted so the replacement can be called the same way
    as the function it replaces; its value is ignored.
    """
    return "UTF-8"


# Install the stand-in on the locale module itself, so any later call to
# locale.getpreferredencoding(...) gets "UTF-8".
# NOTE(review): presumably a workaround for a locale error in the hosted
# runtime — confirm it is still needed.
locale.getpreferredencoding = getpreferredencoding

In [1]:
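# Prerequisite: the CSV files read below live in the anticipatio repository.
# Uncomment and run once per session if the folder is not present yet:
#!git clone https://github.com/maciejskorski/anticipatio.git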

import pandas as pd
from pathlib import Path

def open_fn(f):
    """Read one CSV file into a DataFrame, tolerating unreadable files.

    Args:
        f: Path of the CSV file (anything ``pd.read_csv`` accepts).

    Returns:
        The parsed DataFrame, or an empty DataFrame when the file cannot be
        read or parsed. A warning naming the file and the error is emitted in
        that case so skipped files do not go unnoticed.
    """
    import warnings

    try:
        return pd.read_csv(f, engine='python')
    except Exception as err:
        # Catch Exception rather than using a bare except, so that
        # KeyboardInterrupt / SystemExit still stop a long loading loop.
        warnings.warn(f"Skipping {f}: {type(err).__name__}: {err}")
        return pd.DataFrame()

# Collect the CSV paths in sorted order. rglob yields paths in filesystem
# order, which can differ between machines; sorting keeps the row order of
# `tweets` (and any later "first N rows" slice) stable across runs.
files = sorted(Path('anticipatio/data/futurists_kol/data').rglob('*csv'))
if not files:
    # Without this check pd.concat fails with an unhelpful
    # "No objects to concatenate" error when the data folder is missing.
    raise FileNotFoundError(
        "No CSV files found under 'anticipatio/data/futurists_kol/data'. "
        "Run `git clone https://github.com/maciejskorski/anticipatio.git` first."
    )

# Read every file and stack the frames with a fresh 0..n-1 row index.
tweets = pd.concat(map(open_fn, files), ignore_index=True)
tweets.columns = ['index', 'user', 'timestamp', 'url', 'txt']
print(tweets['user'].nunique(), len(tweets))  # recorded run printed: 256 1254243

# Normalise dtypes: text as str, timestamps parsed by pandas.
tweets['txt'] = tweets['txt'].astype(str)
tweets['timestamp'] = pd.to_datetime(tweets['timestamp'])

tweets.head()

256 1254243


Unnamed: 0,index,user,timestamp,url,txt
0,0,@davidu,2023-02-15 05:32:31+00:00,https://twitter.com/davidu/status/162572974996...,"@craigmod Wow, what a nice but unexpected feat..."
1,1,@davidu,2023-02-13 19:30:17+00:00,https://twitter.com/davidu/status/162521580385...,Remember last year when we prohibited rail wor...
2,2,@davidu,2023-02-13 03:43:12+00:00,https://twitter.com/davidu/status/162497746124...,It's amazing how little coverage the train der...
3,3,@davidu,2023-02-12 04:50:10+00:00,https://twitter.com/davidu/status/162463192798...,@MichelleVolz American kinetics.
4,4,@davidu,2023-02-12 04:26:22+00:00,https://twitter.com/davidu/status/162462593950...,@AlmostMedia I am not in the mood. https://t.c...


# Model

In [2]:
from bertopic import BERTopic
# Work on the first 100,000 rows of the `txt` column only, not the full corpus.
docs = tweets['txt'][:100000]

# Fit the topic model. The logs of the recorded run list three stages:
# documents -> embeddings, dimensionality reduction, clustering.
# NOTE(review): no seed is passed here, so results presumably differ between
# runs — confirm and fix a seed if reproducibility matters.
topic_model = BERTopic(min_topic_size=20, low_memory=True, verbose=True)
# fit_transform returns two values, kept as `topics` and `probs`;
# `topics` is overwritten later from `topic_model.topics_`.
topics, probs = topic_model.fit_transform(docs) 

Batches:   0%|          | 0/3125 [00:00<?, ?it/s]

2023-06-02 05:07:36,163 - BERTopic - Transformed documents to Embeddings
2023-06-02 05:09:59,690 - BERTopic - Reduced dimensionality
2023-06-02 05:10:13,098 - BERTopic - Clustered reduced embeddings


In [8]:
# Reduce the fitted model to 30 topics (the recorded run logged 620 -> 30).
topic_model.reduce_topics(docs, nr_topics=30)
# Re-read `topics` from the model after the reduction; presumably these are
# the per-document topic assignments — confirm against the BERTopic docs.
topics = topic_model.topics_

2023-06-02 05:48:42,744 - BERTopic - Reduced number of topics from 620 to 30


# Topics

In [9]:
# Build the topic overview figure from the fitted model, then display it.
fig = topic_model.visualize_topics()
fig.show()

In [14]:
# Display the topic hierarchy; top_n_topics=30 matches the nr_topics=30 used
# in the reduction step.
topic_model.visualize_hierarchy(top_n_topics=30)

In [40]:
from gensim.models.coherencemodel import CoherenceModel

# Extract vectorizer and analyzer from BERTopic
vectorizer = topic_model.vectorizer_model
analyzer = vectorizer.build_analyzer()

# Vocabulary of the vectorizer (kept for inspection; not used below)
words = vectorizer.get_feature_names_out()

# `tokens` and `dictionary` are not built in this cell: they come from the
# tokenisation cell (TweetTokenizer + gensim Dictionary). Fail early with a
# clear message when that cell has not been run in this kernel session.
_missing = [name for name in ('tokens', 'dictionary') if name not in globals()]
if _missing:
    raise NameError(
        f"Run the tokenisation cell first; undefined: {', '.join(_missing)}"
    )

# Top words of every topic except the outlier topic -1. The ids are taken from
# `topics` directly instead of assuming that -1 is present and that the
# remaining ids are exactly 0..k-1.
topic_ids = sorted(set(topics) - {-1})
topic_words = [[word for word, _ in topic_model.get_topic(topic_id)]
               for topic_id in topic_ids]

# Evaluate
# NOTE(review): topic words come from BERTopic's vectorizer while `tokens`
# come from a different tokenizer; confirm the two vocabularies line up well
# enough for the score to be meaningful.
coherence_model = CoherenceModel(topics=topic_words,
                                 texts=tokens,
                                 dictionary=dictionary,
                                 coherence='c_v')
coherence = coherence_model.get_coherence()
print(coherence)

0.7568683259458413


In [33]:
import multiprocessing as mp
from nltk.tokenize import TweetTokenizer

# Tweet tokenizer created with preserve_case=False (presumably lower-cases
# the tokens — confirm against the NLTK docs).
tokenizer = TweetTokenizer(preserve_case=False)

# Tokenise every document using a pool of 4 worker processes. pool.map returns
# the results as a list in the same order as `docs`.
with mp.Pool(4) as pool:
    tokens = pool.map(tokenizer.tokenize, docs)

from gensim.corpora import Dictionary
import nltk
from nltk.corpus import stopwords
# Fetch the NLTK stop-word list (the recorded run reported it as already
# up-to-date).
nltk.download('stopwords')

# Vocabulary over all tokenised documents.
dictionary = Dictionary(tokens)

# Punctuation tokens to drop from the vocabulary; every entry is a single
# character, including the apostrophe.
punct_words = ['.', ',', ':', '-', '“', "'", '(', ')', '…', ']', '[', '/']
skip_words = punct_words + stopwords.words('english')

# Look up ids only for skip-words that are actually in the vocabulary, so no
# None placeholders are handed to filter_tokens.
bad_ids = [dictionary.token2id[word]
           for word in skip_words
           if word in dictionary.token2id]
dictionary.filter_tokens(bad_ids=bad_ids)
# NOTE(review): no_below=50 presumably drops tokens seen in fewer than 50
# documents, with the other thresholds left at gensim's defaults — confirm.
dictionary.filter_extremes(no_below=50)

# Convert every token list to bag-of-words form using 4 worker processes.
with mp.Pool(4) as pool:
    corpus = pool.map(dictionary.doc2bow, tokens)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
