In [1]:
from sentence_transformers import SentenceTransformer
import sys

from bertopic import BERTopic
from cuml.manifold import UMAP
from cuml.cluster import HDBSCAN
import numpy as np
import pandas as pd

# Sentence encoder handed to BERTopic as its embedding backend.
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')

# cuML UMAP: reduce the sentence embeddings to 5 dimensions using cosine distance.
# Passing random_state makes cuML pick the brute-force KNN build (see the log
# line printed by this cell).
umap_model = UMAP(
    n_components=5,
    n_neighbors=50,
    random_state=42,
    metric="cosine",
    verbose=True,
)

# cuML HDBSCAN: clusters smaller than 1500 points are not kept as topics.
# prediction_data=True is requested so the fitted clusterer retains the data
# needed to score new points later.
hdbscan_model = HDBSCAN(
    min_samples=20,
    gen_min_span_tree=True,
    min_cluster_size=1500,
    verbose=True,
    prediction_data=True,
)

# Assemble the BERTopic pipeline from the three components above.
topic_model = BERTopic(
    embedding_model=embedding_model,
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    verbose=True,
    calculate_probabilities=True,
    nr_topics='auto',
)

# Training data; later cells read its 'text', 'sponsored' and 'videoID' columns.
training_df = pd.read_feather('train.feather')



[2025-03-30 18:26:31.269] [CUML] [info] build_algo set to brute_force_knn because random_state is given


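Create the rolling text window: each row's text is joined with the text of the following `n` rows, and the window's label is the mean of the `sponsored` values over the same rows.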

In [2]:
n = 5

# Row i is merged with the n rows that follow it, so a full window spans
# n + 1 rows; windows near the end of the frame are shorter because the
# slice simply runs off the edge.
# NOTE(review): windows are taken over the whole frame and are not split by
# 'videoID', so a window can mix rows from different videos - confirm this is
# intended. The cached embeddings.npy is aligned with these texts, so any
# change here requires recomputing the embeddings.
text_column = training_df['text']
sponsored_column = training_df['sponsored']

combined_rows = []
soft_labels = []
for start in range(len(training_df)):
    stop = start + n + 1
    combined_rows.append(' '.join(text_column.iloc[start:stop]))
    soft_labels.append(np.mean(sponsored_column.iloc[start:stop]))

training_df['combined_text'] = combined_rows
training_df['expected'] = soft_labels

# Plain Python lists are what the topic-model cells consume.
texts = training_df['combined_text'].tolist()
expected = training_df['expected'].tolist()

In [3]:
def text_rolling_window_generator(input_df, window_size, batch_size=1000):
    """Build per-video rolling-window text samples with soft labels.

    Despite its name this function is not a generator: it collects every
    window from every video and returns them all at once. Windows never span
    two videos, and a video with fewer than ``window_size`` rows contributes
    no samples.

    Parameters
    ----------
    input_df : pandas.DataFrame
        Frame with the columns 'videoID' (grouping key), 'text' (strings to
        join) and 'sponsored' (numeric/boolean label per row).
    window_size : int
        Number of consecutive rows per window. Must be at least 1.
    batch_size : int, optional
        Unused. Kept only so existing calls that pass it keep working.

    Returns
    -------
    tuple of (numpy.ndarray, numpy.ndarray)
        ``(X, y)`` where ``X`` holds the space-joined window texts and ``y``
        the mean of 'sponsored' over each window. Both arrays are empty
        (shape ``(0,)``) when no window fits, so the result can always be
        unpacked.

    Raises
    ------
    ValueError
        If ``window_size`` is smaller than 1.
    """
    if window_size < 1:
        raise ValueError(f"window_size must be >= 1, got {window_size}")

    batch_X, batch_y = [], []

    # groupby(sort=False) walks the videos in order of first appearance and
    # keeps the row order inside each video, without re-scanning the whole
    # frame once per video the way a boolean mask per ID would.
    for _, doc_sentences in input_df.groupby('videoID', sort=False):
        sentences = doc_sentences['text'].tolist()
        labels_array = doc_sentences['sponsored'].values

        for start in range(len(sentences) - window_size + 1):
            stop = start + window_size
            batch_X.append(' '.join(sentences[start:stop]))
            batch_y.append(np.mean(labels_array[start:stop]))  # Soft label

    # Explicit dtypes keep the empty case well-typed instead of returning None.
    return np.array(batch_X, dtype=str), np.array(batch_y, dtype=float)

In [4]:
# X_train, Y_train = text_rolling_window_generator(training_df, 5)

Uncomment the cell below to recompute the sentence embeddings and cache them in `embeddings.npy` (takes about 20 minutes).

In [5]:
# from sentence_transformers import SentenceTransformer
# 
# # Create embeddings
# emb_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
# embeddings = emb_model.encode(texts, show_progress_bar=True)
# 
# with open('embeddings.npy', 'wb') as f:
#     np.save(f, embeddings)

Topic Model Training

In [None]:
# pickle is only needed by the commented-out dumps in this notebook.
import pickle

# Load the cached sentence embeddings instead of re-encoding `texts`.
# The file is written by the commented-out embedding cell above; it has to be
# regenerated whenever `texts` changes so that rows stay aligned.
embeddings = np.load('embeddings.npy')

# Fit BERTopic on the windowed texts using the precomputed embeddings.
# `expected` (per-window mean of 'sponsored') is passed as the target `y`.
# NOTE(review): `expected` holds fractional means rather than class labels -
# confirm that this is what the `y` argument is meant to receive.
topic_model = topic_model.fit(texts, y=expected, embeddings=embeddings)
    
# NOTE(review): `topics` and `probs` are not defined in the visible cells
# (`fit` is used above, and its result is assigned to `topic_model`); these
# dumps would need those values to be produced first.
# with open('BERT_Topic_list.pkl', 'wb') as f:
#     pickle.dump(topics, f)

# with open('BERT_Topic_probs.pkl', 'wb') as f:
#     pickle.dump(probs, f)
#     
#     



2025-03-30 18:28:53,025 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm


[2025-03-30 18:41:14.141] [CUML] [debug] Performing categorical intersection
[2025-03-30 18:44:49.992] [CUML] [debug] Running transform
[2025-03-30 18:44:49.992] [CUML] [debug] Building KNN Graph
[2025-03-30 18:56:59.306] [CUML] [debug] Smoothing KNN distances
[2025-03-30 18:56:59.361] [CUML] [debug] Executing fuzzy simplicial set
[2025-03-30 18:56:59.889] [CUML] [debug] Performing L1 normalization
[2025-03-30 18:57:20.847] [CUML] [debug] n_epochs=30
[2025-03-30 18:57:43.705] [CUML] [debug] Computing # of epochs for training each sample


2025-03-30 19:03:16,058 - BERTopic - Dimensionality - Completed ✓
2025-03-30 19:03:16,187 - BERTopic - Cluster - Start clustering the reduced embeddings


[2025-03-30 18:57:44.168] [CUML] [debug] Performing optimization


2025-03-30 19:25:52,617 - BERTopic - Cluster - Completed ✓
2025-03-30 19:25:52,618 - BERTopic - Representation - Extracting topics using c-TF-IDF for topic reduction.


In [None]:
# Persist the fitted topic model under ./TopicModel_1500 using safetensors
# serialization, including the c-TF-IDF data. The embedding model is recorded
# by its identifier string rather than by passing the model object.
# NOTE(review): "1500" in the path presumably mirrors the HDBSCAN
# min_cluster_size used in the setup cell - keep them in sync if that changes.
topic_model.save(
    path='./TopicModel_1500',
    serialization="safetensors",
    save_ctfidf=True,
    save_embedding_model="sentence-transformers/all-MiniLM-L6-v2"
)

In [None]:
# with open('combined_training.pkl', 'wb') as f:
#     pickle.dump(texts, f)
#     
# with open('expected_training.pkl', 'wb') as f:
#     pickle.dump(expected, f)

In [None]:
# Display the fitted model's topic overview as this cell's output.
topic_model.get_topic_info()

In [None]:
# test = topic_model.transform(texts[:5])