# Prerequisites

In [None]:
from pathlib import Path
import os
import pandas as pd
import numpy as np

# Root of the project checkout; data and model paths in this notebook are
# built relative to it. Set the ANTICIPATIO_REPO environment variable to run
# the notebook from a different location without editing this cell; when the
# variable is unset the original hard-coded location is used.
repo_path = Path(os.environ.get('ANTICIPATIO_REPO', '/home/krajda/anticipatio/'))

# Datasets

In [None]:
# Read the pickled tweets object from the data directory and extract its
# 'txt' entries as a plain Python list of documents.
tweets_file = repo_path / 'data/final.pkl'
tweets = pd.read_pickle(tweets_file)
txt_column = tweets['txt']
docs = txt_column.tolist()

# Number of documents (displayed as the cell output).
len(docs)

## Precompute Embeddings

In [None]:
import numpy as np
from sentence_transformers import SentenceTransformer

# Reuse the cached sentence embeddings when the cache file can be read;
# otherwise encode every document once and store the result for later runs.
embeds_path = repo_path / 'data/embeds.npy'

try:
    embeds = np.load(embeds_path)
except (OSError, ValueError, EOFError):
    # Only a missing, unreadable, or malformed cache file triggers re-encoding.
    # A bare `except:` would also catch KeyboardInterrupt and genuine bugs
    # (e.g. a NameError) and silently overwrite the cache.
    sentence_model = SentenceTransformer("all-MiniLM-L6-v2")
    embeds = sentence_model.encode(docs, show_progress_bar=True, convert_to_numpy=True)
    np.save(embeds_path, embeds)

# NOTE(review): a loaded cache is not checked against len(docs); if the
# dataset changes, delete data/embeds.npy so the embeddings are recomputed.
embeds.shape

# Topic Model

In [None]:
from bertopic import BERTopic
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans

# Hyper-parameters are named once so that the estimators and the saved
# model's filename cannot drift apart.
N_COMPONENTS = 10  # PCA output dimensions
N_CLUSTERS = 200   # number of KMeans clusters

# PCA is passed in BERTopic's `umap_model` slot and KMeans in its
# `hdbscan_model` slot (see the constructor call below).
umap_model = PCA(n_components=N_COMPONENTS)
cluster_model = KMeans(n_clusters=N_CLUSTERS)

# Bag-of-words vectorizer: English stop words removed, 1- to 3-grams,
# terms kept only if they occur in at least 10 documents.
vectorizer_model = CountVectorizer(stop_words="english", ngram_range=(1, 3), min_df=10)

# Build BERTopic
topic_model = BERTopic(
    umap_model=umap_model,
    hdbscan_model=cluster_model,
    vectorizer_model=vectorizer_model,
    low_memory=True,
    verbose=True,
    # NOTE(review): min_topic_size presumably only configures BERTopic's
    # default HDBSCAN clusterer; with a custom KMeans model it likely has no
    # effect -- confirm against the BERTopic documentation.
    min_topic_size=100,
    nr_topics=100,
)

# File name under which the fitted model is stored; encodes the configuration
# above and evaluates to 'pca10_kmeans200_cv.pkl' for the current values.
model_name = f'pca{N_COMPONENTS}_kmeans{N_CLUSTERS}_cv.pkl'

In [None]:
# Fit the topic model on the documents, supplying the embeddings prepared
# earlier through the `embeddings=` argument, and rebind `topic_model` to the
# value returned by `fit`.
topic_model = topic_model.fit(docs, embeddings=embeds)


In [None]:
# Persist the fitted model under <repo>/models/<model_name>. The directory is
# created first so that a missing `models/` folder does not make the save fail
# after the fit has already completed.
models_dir = repo_path / 'models'
models_dir.mkdir(parents=True, exist_ok=True)
topic_model.save(models_dir / model_name)