NYT News Article Topic Modelling
==============
***With ember-v1 Embedding***

**Author:** *Qihang Tang*\
**Email:** *qt2087@nyu.edu*\
**Last Edit:** *Dec. 2023*

In [1]:
# Import Packages
import pandas as pd
import numpy as np
import collections
from tqdm import tqdm
from sklearn.feature_extraction.text import CountVectorizer
from sentence_transformers import SentenceTransformer
from cuml.manifold import UMAP
from cuml.cluster import HDBSCAN
from bertopic import BERTopic
import os
import requests
from transformers import AutoModel, AutoTokenizer
import collections
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from octis.evaluation_metrics.topic_significance_metrics import KL_uniform
from octis.evaluation_metrics.coherence_metrics import Coherence
from gensim import corpora
from gensim.models import CoherenceModel
import cohere
from bertopic.representation import MaximalMarginalRelevance
from keybert import KeyBERT
import nltk
from nltk.corpus import stopwords
from bertopic.vectorizers import OnlineCountVectorizer, ClassTfidfTransformer

  from .autonotebook import tqdm as notebook_tqdm


**Warning**: This version of BERTopic training uses cuML to speed up UMAP and HDBSCAN on the GPU. You need to run this notebook in a Linux environment.

## Data & Parameters Preparation

In [2]:
# Common stop words list for removal in tokenizer and vectorizer
# Credit to: https://gist.github.com/sebleier/554280
# The single-quoted entries at the end (weekdays, months, '_________', 'day')
# are additions specific to this notebook. Every entry must be free of
# surrounding whitespace, otherwise it can never match a token.
stop_words = ["0o", "0s", "3a", "3b", "3d", "6b", "6o", "a", "a1", "a2", "a3", "a4", "ab", "able", "about", "above", "abst", "ac", "accordance", "according", "accordingly", "across", "act", "actually", "ad", "added", "adj", "ae", "af", "affected", "affecting", "affects", "after", "afterwards", "ag", "again", "against", "ah", "ain", "ain't", "aj", "al", "all", "allow", "allows", "almost", "alone", "along", "already", "also", "although", "always", "am", "among", "amongst", "amoungst", "amount", "an", "and", "announce", "another", "any", "anybody", "anyhow", "anymore", "anyone", "anything", "anyway", "anyways", "anywhere", "ao", "ap", "apart", "apparently", "appear", "appreciate", "appropriate", "approximately", "ar", "are", "aren", "arent", "aren't", "arise", "around", "as", "a's", "aside", "ask", "asking", "associated", "at", "au", "auth", "av", "available", "aw", "away", "awfully", "ax", "ay", "az", "b", "b1", "b2", "b3", "ba", "back", "bc", "bd", "be", "became", "because", "become", "becomes", "becoming", "been", "before", "beforehand", "begin", "beginning", "beginnings", "begins", "behind", "being", "believe", "below", "beside", "besides", "best", "better", "between", "beyond", "bi", "bill", "biol", "bj", "bk", "bl", "bn", "both", "bottom", "bp", "br", "brief", "briefly", "bs", "bt", "bu", "but", "bx", "by", "c", "c1", "c2", "c3", "ca", "call", "came", "can", "cannot", "cant", "can't", "cause", "causes", "cc", "cd", "ce", "certain", "certainly", "cf", "cg", "ch", "changes", "ci", "cit", "cj", "cl", "clearly", "cm", "c'mon", "cn", "co", "com", "come", "comes", "con", "concerning", "consequently", "consider", "considering", "contain", "containing", "contains", "corresponding", "could", "couldn", "couldnt", "couldn't", "course", "cp", "cq", "cr", "cry", "cs", "c's", "ct", "cu", "currently", "cv", "cx", "cy", "cz", "d", "d2", "da", "date", "dc", "dd", "de", "definitely", "describe", "described", "despite", "detail", "df", "di", "did", "didn", "didn't", "different", 
"dj", "dk", "dl", "do", "does", "doesn", "doesn't", "doing", "don", "done", "don't", "down", "downwards", "dp", "dr", "ds", "dt", "du", "due", "during", "dx", "dy", "e", "e2", "e3", "ea", "each", "ec", "ed", "edu", "ee", "ef", "effect", "eg", "ei", "eight", "eighty", "either", "ej", "el", "eleven", "else", "elsewhere", "em", "empty", "en", "end", "ending", "enough", "entirely", "eo", "ep", "eq", "er", "es", "especially", "est", "et", "et-al", "etc", "eu", "ev", "even", "ever", "every", "everybody", "everyone", "everything", "everywhere", "ex", "exactly", "example", "except", "ey", "f", "f2", "fa", "far", "fc", "few", "ff", "fi", "fifteen", "fifth", "fify", "fill", "find", "fire", "first", "five", "fix", "fj", "fl", "fn", "fo", "followed", "following", "follows", "for", "former", "formerly", "forth", "forty", "found", "four", "fr", "from", "front", "fs", "ft", "fu", "full", "further", "furthermore", "fy", "g", "ga", "gave", "ge", "get", "gets", "getting", "gi", "give", "given", "gives", "giving", "gj", "gl", "go", "goes", "going", "gone", "got", "gotten", "gr", "greetings", "gs", "gy", "h", "h2", "h3", "had", "hadn", "hadn't", "happens", "hardly", "has", "hasn", "hasnt", "hasn't", "have", "haven", "haven't", "having", "he", "hed", "he'd", "he'll", "hello", "help", "hence", "her", "here", "hereafter", "hereby", "herein", "heres", "here's", "hereupon", "hers", "herself", "hes", "he's", "hh", "hi", "hid", "him", "himself", "his", "hither", "hj", "ho", "home", "hopefully", "how", "howbeit", "however", "how's", "hr", "hs", "http", "hu", "hundred", "hy", "i", "i2", "i3", "i4", "i6", "i7", "i8", "ia", "ib", "ibid", "ic", "id", "i'd", "ie", "if", "ig", "ignored", "ih", "ii", "ij", "il", "i'll", "im", "i'm", "immediate", "immediately", "importance", "important", "in", "inasmuch", "inc", "indeed", "index", "indicate", "indicated", "indicates", "information", "inner", "insofar", "instead", "interest", "into", "invention", "inward", "io", "ip", "iq", "ir", "is", "isn", "isn't", 
"it", "itd", "it'd", "it'll", "its", "it's", "itself", "iv", "i've", "ix", "iy", "iz", "j", "jj", "jr", "js", "jt", "ju", "just", "k", "ke", "keep", "keeps", "kept", "kg", "kj", "km", "know", "known", "knows", "ko", "l", "l2", "la", "largely", "last", "lately", "later", "latter", "latterly", "lb", "lc", "le", "least", "les", "less", "lest", "let", "lets", "let's", "lf", "like", "liked", "likely", "line", "little", "lj", "ll", "ll", "ln", "lo", "look", "looking", "looks", "los", "lr", "ls", "lt", "ltd", "m", "m2", "ma", "made", "mainly", "make", "makes", "many", "may", "maybe", "me", "mean", "means", "meantime", "meanwhile", "merely", "mg", "might", "mightn", "mightn't", "mill", "million", "mine", "miss", "ml", "mn", "mo", "more", "moreover", "most", "mostly", "move", "mr", "mrs", "ms", "mt", "mu", "much", "mug", "must", "mustn", "mustn't", "my", "myself", "n", "n2", "na", "name", "namely", "nay", "nc", "nd", "ne", "near", "nearly", "necessarily", "necessary", "need", "needn", "needn't", "needs", "neither", "never", "nevertheless", "new", "next", "ng", "ni", "nine", "ninety", "nj", "nl", "nn", "no", "nobody", "non", "none", "nonetheless", "noone", "nor", "normally", "nos", "not", "noted", "nothing", "novel", "now", "nowhere", "nr", "ns", "nt", "ny", "o", "oa", "ob", "obtain", "obtained", "obviously", "oc", "od", "of", "off", "often", "og", "oh", "oi", "oj", "ok", "okay", "ol", "old", "om", "omitted", "on", "once", "one", "ones", "only", "onto", "oo", "op", "oq", "or", "ord", "os", "ot", "other", "others", "otherwise", "ou", "ought", "our", "ours", "ourselves", "out", "outside", "over", "overall", "ow", "owing", "own", "ox", "oz", "p", "p1", "p2", "p3", "page", "pagecount", "pages", "par", "part", "particular", "particularly", "pas", "past", "pc", "pd", "pe", "per", "perhaps", "pf", "ph", "pi", "pj", "pk", "pl", "placed", "please", "plus", "pm", "pn", "po", "poorly", "possible", "possibly", "potentially", "pp", "pq", "pr", "predominantly", "present", "presumably", 
"previously", "primarily", "probably", "promptly", "proud", "provides", "ps", "pt", "pu", "put", "py", "q", "qj", "qu", "que", "quickly", "quite", "qv", "r", "r2", "ra", "ran", "rather", "rc", "rd", "re", "readily", "really", "reasonably", "recent", "recently", "ref", "refs", "regarding", "regardless", "regards", "related", "relatively", "research", "research-articl", "respectively", "resulted", "resulting", "results", "rf", "rh", "ri", "right", "rj", "rl", "rm", "rn", "ro", "rq", "rr", "rs", "rt", "ru", "run", "rv", "ry", "s", "s2", "sa", "said", "same", "saw", "say", "saying", "says", "sc", "sd", "se", "sec", "second", "secondly", "section", "see", "seeing", "seem", "seemed", "seeming", "seems", "seen", "self", "selves", "sensible", "sent", "serious", "seriously", "seven", "several", "sf", "shall", "shan", "shan't", "she", "shed", "she'd", "she'll", "shes", "she's", "should", "shouldn", "shouldn't", "should've", "show", "showed", "shown", "showns", "shows", "si", "side", "significant", "significantly", "similar", "similarly", "since", "sincere", "six", "sixty", "sj", "sl", "slightly", "sm", "sn", "so", "some", "somebody", "somehow", "someone", "somethan", "something", "sometime", "sometimes", "somewhat", "somewhere", "soon", "sorry", "sp", "specifically", "specified", "specify", "specifying", "sq", "sr", "ss", "st", "still", "stop", "strongly", "sub", "substantially", "successfully", "such", "sufficiently", "suggest", "sup", "sure", "sy", "system", "sz", "t", "t1", "t2", "t3", "take", "taken", "taking", "tb", "tc", "td", "te", "tell", "ten", "tends", "tf", "th", "than", "thank", "thanks", "thanx", "that", "that'll", "thats", "that's", "that've", "the", "their", "theirs", "them", "themselves", "then", "thence", "there", "thereafter", "thereby", "thered", "therefore", "therein", "there'll", "thereof", "therere", "theres", "there's", "thereto", "thereupon", "there've", "these", "they", "theyd", "they'd", "they'll", "theyre", "they're", "they've", "thickv", "thin", 
"think", "third", "this", "thorough", "thoroughly", "those", "thou", "though", "thoughh", "thousand", "three", "throug", "through", "throughout", "thru", "thus", "ti", "til", "tip", "tj", "tl", "tm", "tn", "to", "together", "too", "took", "top", "toward", "towards", "tp", "tq", "tr", "tried", "tries", "truly", "try", "trying", "ts", "t's", "tt", "tv", "twelve", "twenty", "twice", "two", "tx", "u", "u201d", "ue", "ui", "uj", "uk", "um", "un", "under", "unfortunately", "unless", "unlike", "unlikely", "until", "unto", "uo", "up", "upon", "ups", "ur", "us", "use", "used", "useful", "usefully", "usefulness", "uses", "using", "usually", "ut", "v", "va", "value", "various", "vd", "ve", "ve", "very", "via", "viz", "vj", "vo", "vol", "vols", "volumtype", "vq", "vs", "vt", "vu", "w", "wa", "want", "wants", "was", "wasn", "wasnt", "wasn't", "way", "we", "wed", "we'd", "welcome", "well", "we'll", "well-b", "went", "were", "we're", "weren", "werent", "weren't", "we've", "what", "whatever", "what'll", "whats", "what's", "when", "whence", "whenever", "when's", "where", "whereafter", "whereas", "whereby", "wherein", "wheres", "where's", "whereupon", "wherever", "whether", "which", "while", "whim", "whither", "who", "whod", "whoever", "whole", "who'll", "whom", "whomever", "whos", "who's", "whose", "why", "why's", "wi", "widely", "will", "willing", "wish", "with", "within", "without", "wo", "won", "wonder", "wont", "won't", "words", "world", "would", "wouldn", "wouldnt", "wouldn't", "www", "x", "x1", "x2", "x3", "xf", "xi", "xj", "xk", "xl", "xn", "xo", "xs", "xt", "xv", "xx", "y", "y2", "yes", "yet", "yj", "yl", "you", "youd", "you'd", "you'll", "your", "youre", "you're", "yours", "yourself", "yourselves", "you've", "yr", "ys", "yt", "z", "zero", "zi", "zz", 'monday', 'tuesday', 'wednesday', 'thursday', 'friday', 'saturday', 'sunday', 'january', 'february', 'march', 'april', 'may', 'june', 'july', 'august', 'september', 'october', 'november', 'december', '_________', 'day']

In [3]:
# Read in NYT articles (pre-processed version) and split them by publication year:
#   df_search - 2019 only (used for the parameter grid search)
#   df_base   - 2019-2022 (training period)
#   df_test   - 2023 (testing period)
df = pd.read_parquet('../data/NYT_2017_2023_cleaned_texts.parquet')
# The year is taken as the first four characters of the stringified publication date.
df['pub_year'] = df['pub_date'].astype(str).str[:4]
# .copy() turns each split into an independent frame; later cells add columns
# to df_base / df_test, which would otherwise trigger SettingWithCopyWarning.
df_search = df[df['pub_year'] == '2019'].copy()
df_base = df[df['pub_year'].isin(['2019','2020','2021','2022'])].copy()
df_test = df[df['pub_year'] == '2023'].copy()
# The full frame is no longer needed once the splits exist.
del df

In [4]:
# Load the ember-v1 embedding model from a local clone of its repository
# (loading it online failed because of connection issues) and place it on the GPU.
embedding_model = SentenceTransformer(
    model_name_or_path='../../../../ember-v1',
    device='cuda',
)

In [6]:
# Generate embeddings for the documents, re-using the cached arrays when present.
#   embeddings      - one row per document in df_base
#   embeddings_2019 - one row per document in df_search
base_embeddings_path = '../models/embeddings_ember_base_final.npy'
search_embeddings_path = '../models/embeddings_ember_search.npy'

# Both cache files are loaded below, so both must exist for the cache to be usable.
if os.path.isfile(base_embeddings_path) and os.path.isfile(search_embeddings_path):
    print('Embeddings exists')
    embeddings = np.load(base_embeddings_path)
    embeddings_2019 = np.load(search_embeddings_path)
else:
    embeddings = embedding_model.encode(df_base['text_cleaned'].values, show_progress_bar=True)
    embeddings_2019 = embedding_model.encode(df_search['text_cleaned'].values, show_progress_bar=True)
    # np.save opens and closes the target file itself when given a path.
    np.save(base_embeddings_path, embeddings)
    np.save(search_embeddings_path, embeddings_2019)
    print('Embeddings generated and saved')

Batches: 100%|██████████| 6078/6078 [13:14<00:00,  7.65it/s]
Batches: 100%|██████████| 1549/1549 [03:19<00:00,  7.76it/s]


Embeddings generated and saved


In [7]:
# Guard against stale cache files: each embedding matrix must have exactly one
# row per document of the frame it was computed from.
assert embeddings.shape[0] == len(df_base), "embeddings do not match df_base"
assert embeddings_2019.shape[0] == len(df_search), "embeddings_2019 do not match df_search"

In [8]:
# Use KeyBERT to generate the vocabulary list for the vectorizer in the following steps
kw_model = KeyBERT(model = embedding_model)
keywords = kw_model.extract_keywords(df_base['text_cleaned'].values, keyphrase_ngram_range=(1, 1), stop_words=stop_words, use_mmr=True, diversity=0.7)

# Create our vocabulary: the unique keywords over all documents.
# Each element of `keywords` is iterated as one document's keyword entries, and
# entry[0] is taken as the keyword string.
# Sorting gives a stable order; a plain set's iteration order for strings can
# differ from one kernel start to the next.
vocabulary = sorted({entry[0] for doc_keywords in keywords for entry in doc_keywords})

In [9]:
# Same keyword extraction restricted to the 2019 grid-search subset.
keywords_2019 = kw_model.extract_keywords(df_search['text_cleaned'].values, keyphrase_ngram_range=(1, 1), stop_words=stop_words, use_mmr=True, diversity=0.7)

# Create our vocabulary: the unique keywords over all 2019 documents.
# Sorting gives a stable order; a plain set's iteration order for strings can
# differ from one kernel start to the next.
vocabulary_2019 = sorted({entry[0] for doc_keywords in keywords_2019 for entry in doc_keywords})

In [10]:
# Parameters to try for UMAP and HDBSCAN
# UMAP grid: n_components varies slowest, n_neighbors fastest; min_dist is fixed.
umap_params = [
    {'n_neighbors': neighbors, 'n_components': components, 'min_dist': 0.01}
    for components in (5, 10, 15, 20)
    for neighbors in (5, 10, 15, 20)
]

# HDBSCAN grid: min_samples varies slowest, min_cluster_size fastest.
hdbscan_params = [
    {'min_cluster_size': cluster_size, 'min_samples': samples}
    for samples in (50, 100, 150, 200)
    for cluster_size in (100, 130, 160, 200)
]

# Every [umap, hdbscan] pairing, UMAP-major.
combinations = [
    [umap_cfg, hdbscan_cfg]
    for umap_cfg in umap_params
    for hdbscan_cfg in hdbscan_params
]

## Grid Search

In [11]:
# Grid search for the UMAP/HDBSCAN parameters that yield the highest c_v coherence score.
# Each entry appended to `scores` is [umap_param, hdbscan_param, coherence].
scores = []
# NOTE(review): scikit-learn documents min_df/max_df as ignored when a fixed
# `vocabulary` is supplied, so only vocabulary, ngram_range and stop_words
# shape the term matrix here -- confirm this is intended.
vectorizer_model = TfidfVectorizer(vocabulary=vocabulary_2019, min_df=0.05, max_df=0.6, ngram_range=(1,2), stop_words=stop_words)

# The evaluation sample (fixed random_state) and the gensim inputs derived from
# it are the same for every parameter combination, so they are built once.
df_search_eval = df_search.sample(frac=0.7, replace=False, random_state=1011)
doc_tokens = [text.split(" ") for text in df_search_eval['text_cleaned'].values]
id2word = corpora.Dictionary(doc_tokens)
texts = doc_tokens
corpus = [id2word.doc2bow(text) for text in texts]

for umap_param, hdbscan_param in tqdm(combinations):
    umap_model = UMAP(**umap_param, random_state=1011, metric="cosine", verbose=False)
    hdbscan_model = HDBSCAN(**hdbscan_param, gen_min_span_tree=True, prediction_data=True, verbose=False)
    topic_model_temp = BERTopic(
        embedding_model=embedding_model,
        umap_model=umap_model,
        hdbscan_model=hdbscan_model,
        vectorizer_model=vectorizer_model,
        nr_topics="auto",
        verbose=False
        )
    # Pre-computed embeddings are passed in, so the documents are not re-embedded.
    topics, _ = topic_model_temp.fit_transform(df_search['text_cleaned'].values, embeddings = embeddings_2019)

    # Score every topic except the outlier topic (-1). Selecting by id keeps
    # all real topics even when a fit produces no outlier topic.
    topic_ids = sorted(topic for topic in topic_model_temp.get_topics() if topic != -1)
    topic_words = [[t[0] for t in topic_model_temp.get_topic(topic)] for topic in topic_ids]

    coherence_model = CoherenceModel(topics=topic_words, texts=texts, corpus=corpus, dictionary=id2word, coherence='c_v')
    score = coherence_model.get_coherence()
    scores.append([umap_param, hdbscan_param, score])
        


  0%|          | 0/256 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling 

In [12]:
# Rank the grid-search results by their last element (the coherence score), best first.
scores2 = sorted(scores, key=lambda x: x[-1], reverse=True)

# Text file that receives one ranked result per line
file_path = "../data/2019_model_param_grid_search_result.txt"

# The `with` block closes the file on exit.
with open(file_path, 'w') as file:
    file.writelines("%s\n" % item for item in scores2)

In [13]:
# Drop the names of the objects that were only needed for the 2019 grid search.
del embeddings_2019, vocabulary_2019, df_search, df_search_eval

## Topic Modeling on training period

In [14]:
# Prepare sub-models using the first entry of scores2, i.e. the parameter
# combination with the highest grid-search coherence.
umap_param, hdbscan_param = scores2[0][0], scores2[0][1]

vectorizer_model = TfidfVectorizer(
    vocabulary=vocabulary,
    min_df=0.05,
    max_df=0.6,
    ngram_range=(1,2),
    stop_words=stop_words,
)
hdbscan_model = HDBSCAN(gen_min_span_tree=True, prediction_data=True, verbose=True, **hdbscan_param)
umap_model = UMAP(random_state=1011, metric="cosine", verbose=True, **umap_param)

# Assemble BERTopic from the sub-models, then fit it on the training period
# with the pre-computed embeddings. calculate_probabilities=True makes
# fit_transform return `probs` alongside the topic assignments.
topic_model_0 = BERTopic(
    embedding_model=embedding_model,
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    vectorizer_model=vectorizer_model,
    nr_topics="auto",
    calculate_probabilities=True,
    top_n_words=10,
    verbose=True,
)
topics, probs = topic_model_0.fit_transform(df_base['text_cleaned'].values, embeddings=embeddings)

2023-12-14 05:40:57,827 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm


[D] [05:41:01.934729] /opt/conda/conda-bld/work/cpp/src/umap/runner.cuh:108 n_neighbors=5
[D] [05:41:01.935891] /opt/conda/conda-bld/work/cpp/src/umap/runner.cuh:130 Calling knn graph run
[D] [05:41:08.215065] /opt/conda/conda-bld/work/cpp/src/umap/runner.cuh:136 Done. Calling fuzzy simplicial set
[D] [05:41:08.217261] /opt/conda/conda-bld/work/cpp/src/umap/fuzzy_simpl_set/naive.cuh:317 Smooth kNN Distances
[D] [05:41:08.217587] /opt/conda/conda-bld/work/cpp/src/umap/fuzzy_simpl_set/naive.cuh:319 sigmas = [ 0.643661, 0.663208, 0.622375, 0.034359, 0.513985, 0.477425, 0.544769, 0.396744, 0.0102158, 0.016504, 0.072566, 0.0924606, 0.00826383, 0.472679, 0.00546694, 0.0387754, 0.597122, 0.588959, 0.61853, 0.575676, 0.101841, 0.0509548, 0.0232587, 0.712555, 0.00444806 ]

[D] [05:41:08.218014] /opt/conda/conda-bld/work/cpp/src/umap/fuzzy_simpl_set/naive.cuh:321 rhos = [ 2.98023e-07, 1.78814e-07, 1.78814e-07, 0.337969, 2.38419e-07, 1.19209e-07, 5.36442e-07, 3.57628e-07, 0.237572, 0.222602, 0.18

2023-12-14 05:41:15,854 - BERTopic - Dimensionality - Completed ✓
2023-12-14 05:41:15,864 - BERTopic - Cluster - Start clustering the reduced embeddings


[D] [05:41:15.790990] /opt/conda/conda-bld/work/cpp/src/umap/runner.cuh:382 Smoothing KNN distances
[D] [05:41:15.791893] /opt/conda/conda-bld/work/cpp/src/umap/runner.cuh:414 Executing fuzzy simplicial set
[D] [05:41:15.793543] /opt/conda/conda-bld/work/cpp/src/umap/runner.cuh:443 Performing L1 normalization
[D] [05:41:15.795037] /opt/conda/conda-bld/work/cpp/src/umap/runner.cuh:479 n_epochs=30
[D] [05:41:15.796936] /opt/conda/conda-bld/work/cpp/src/umap/runner.cuh:502 Computing # of epochs for training each sample
[D] [05:41:15.797869] /opt/conda/conda-bld/work/cpp/src/umap/runner.cuh:509 Performing optimization


2023-12-14 05:41:38,358 - BERTopic - Cluster - Completed ✓
2023-12-14 05:41:38,359 - BERTopic - Representation - Extracting topics from clusters using representation models.
2023-12-14 05:41:54,182 - BERTopic - Representation - Completed ✓
2023-12-14 05:41:54,184 - BERTopic - Topic reduction - Reducing number of topics
2023-12-14 05:42:08,321 - BERTopic - Topic reduction - Reduced number of topics from 107 to 37


In [15]:
# Save topic keywords: one "Topic <id>: word, word, ..." line per topic.
# The loop variable is named `topic_terms` so that it does not overwrite the
# notebook-level `keywords` list produced by the KeyBERT cell.
with open('../data/bertopic_keywords_bigrams_final.txt', 'w', encoding='utf-8') as file:
    for topic_num, topic_terms in topic_model_0.get_topics().items():
        topic_keywords = ", ".join(word for word, _ in topic_terms)
        file.write(f"Topic {topic_num}: {topic_keywords}\n")

In [16]:
# Evaluate the actual performance of the topic modeling on the training period
# with gensim's c_v coherence, computed over space-split documents.
# The fitted model's vectorizer is deliberately left untouched here: the score
# only needs the stored topic words, and refitting would alter the model that
# is saved afterwards.
doc_tokens = [text.split(" ") for text in df_base['text_cleaned'].values]
id2word = corpora.Dictionary(doc_tokens)
texts = doc_tokens
corpus = [id2word.doc2bow(text) for text in texts]

# Score every topic except the outlier topic (-1). Selecting by id keeps all
# real topics even when the fit produced no outlier topic.
topic_ids = sorted(topic for topic in topic_model_0.get_topics() if topic != -1)
topic_words = [[t[0] for t in topic_model_0.get_topic(topic)] for topic in topic_ids]

coherence_model = CoherenceModel(topics=topic_words, texts=texts, corpus=corpus, dictionary=id2word, coherence='c_v')
coherence_score = coherence_model.get_coherence()
print("Coherence Score:", coherence_score)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

Coherence Score: 0.7866995321010798


In [17]:
# Persist the fitted topic model in safetensors format together with its
# c-TF-IDF data; the embedding model is recorded as the path of the local
# ember-v1 clone.
topic_model_0.save(
    '../models/model_ember_final',
    serialization="safetensors",
    save_embedding_model="../../../../ember-v1",
    save_ctfidf=True,
)

## Apply topic model to the temporal testing period

In [18]:
# For the training period: 2019-2022
# Attach the per-document topic probabilities and aggregate them into a daily
# mean per topic.
# NOTE(review): column i of `probs` is stored as 'topic {i-1}'. BERTopic's
# probability matrix presumably has no column for the outlier topic, in which
# case these labels are shifted by one -- confirm before relying on them.
df_base = df_base.assign(**{f'topic {i-1}': probs[:, i] for i in range(probs.shape[1])})

# Every column whose name starts with 'topic' is treated as a topic score.
topic_columns = [column for column in df_base.columns if str(column).startswith('topic')]

# The publication day is the first ten characters of the stringified date.
df_base['date'] = df_base['pub_date'].astype(str).str[:10]

# Select the topic columns explicitly before averaging, so that unrelated
# numeric columns (such as 'id') never enter the aggregate.
df_base_agg = df_base.groupby('date')[topic_columns].mean(numeric_only=True).reset_index()

df_base_agg.to_parquet('../data/topic_scores_2019_2022_Mark.parquet')

In [19]:
# For the testing period: 2023
# Infer topic probabilities for the unseen articles and aggregate them into a
# daily mean per topic.
topics2, probs2 = topic_model_0.transform(list(df_test['text_cleaned'].values))

# NOTE(review): column i of `probs2` is stored as 'topic {i-1}'. BERTopic's
# probability matrix presumably has no column for the outlier topic, in which
# case these labels are shifted by one -- confirm before relying on them.
df_test = df_test.assign(**{f'topic {i-1}': probs2[:, i] for i in range(probs2.shape[1])})

# Every column whose name starts with 'topic' is treated as a topic score.
topic_columns = [column for column in df_test.columns if str(column).startswith('topic')]

# The publication day is the first ten characters of the stringified date.
df_test['date'] = df_test['pub_date'].astype(str).str[:10]

# Select the topic columns explicitly before averaging, so that unrelated
# numeric columns (such as 'id') never enter the aggregate.
df_test_agg = df_test.groupby('date')[topic_columns].mean(numeric_only=True).reset_index()

df_test_agg.to_parquet('../data/topic_scores_2023_Mark.parquet')

Batches: 100%|██████████| 525/525 [00:57<00:00,  9.16it/s]
2023-12-14 05:43:37,607 - BERTopic - Dimensionality - Reducing dimensionality of input embeddings.
2023-12-14 05:43:38,250 - BERTopic - Dimensionality - Completed ✓
2023-12-14 05:43:38,251 - BERTopic - Clustering - Approximating new points with `hdbscan_model`


[D] [05:43:37.696155] /opt/conda/conda-bld/work/cpp/src/umap/runner.cuh:347 Running transform
[D] [05:43:37.696248] /opt/conda/conda-bld/work/cpp/src/umap/runner.cuh:349 Building KNN Graph
[D] [05:43:38.240172] /opt/conda/conda-bld/work/cpp/src/umap/runner.cuh:382 Smoothing KNN distances
[D] [05:43:38.240967] /opt/conda/conda-bld/work/cpp/src/umap/runner.cuh:414 Executing fuzzy simplicial set
[D] [05:43:38.241281] /opt/conda/conda-bld/work/cpp/src/umap/runner.cuh:443 Performing L1 normalization
[D] [05:43:38.241618] /opt/conda/conda-bld/work/cpp/src/umap/runner.cuh:479 n_epochs=30
[D] [05:43:38.241987] /opt/conda/conda-bld/work/cpp/src/umap/runner.cuh:502 Computing # of epochs for training each sample
[D] [05:43:38.242239] /opt/conda/conda-bld/work/cpp/src/umap/runner.cuh:509 Performing optimization


2023-12-14 05:43:39,269 - BERTopic - Probabilities - Start calculation of probabilities with HDBSCAN
2023-12-14 05:43:40,340 - BERTopic - Probabilities - Completed ✓
2023-12-14 05:43:40,341 - BERTopic - Cluster - Completed ✓
