# Load data

In [None]:
pip install BERTopic

Collecting BERTopic
  Downloading bertopic-0.16.0-py2.py3-none-any.whl (154 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m154.1/154.1 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
Collecting hdbscan>=0.8.29 (from BERTopic)
  Downloading hdbscan-0.8.33.tar.gz (5.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.2/5.2 MB[0m [31m17.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting umap-learn>=0.5.0 (from BERTopic)
  Downloading umap-learn-0.5.5.tar.gz (90 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m90.9/90.9 kB[0m [31m12.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting sentence-transformers>=0.4.1 (from BERTopic)
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[2K     [90m━━━━━━━━━━━━━━━━

In [None]:
# import library
from bertopic import BERTopic
from umap import UMAP
from hdbscan import HDBSCAN
import pandas as pd

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from bertopic.vectorizers import ClassTfidfTransformer
from gensim.models.coherencemodel import CoherenceModel
import gensim.corpora as corpora

## Connect to google drive

In [None]:
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

In [None]:
# Authenticate the Colab user, then hand the application-default credentials
# to PyDrive so `drive` can be used to access files on Google Drive.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

In [None]:
# Fetch the Drive file with this id and write its content to
# 'NYT_text_cleaned.parquet' in the current working directory.
file_id = '1Le_H_t1UVFVBBUjmxZtgfifSmRv6jPdS'
download = drive.CreateFile({'id': file_id})
download.GetContentFile('NYT_text_cleaned.parquet')

In [None]:
# Load the downloaded parquet file; `df` is the source frame for every yearly subset below.
df = pd.read_parquet("NYT_text_cleaned.parquet")

In [None]:
# Sanity check of the load: the frame should expose `id`, `text_cleaned` and `pub_date`.
df.head()

Unnamed: 0,id,text_cleaned,pub_date
0,0,alabama bull way past washington reach title g...,2017-01-01 00:29:45+00:00
1,1,allan williams first manager beatles 86. mr. w...,2017-01-01 01:29:34+00:00
2,2,quotation day quotation day sunday january thi...,2017-01-01 02:32:59+00:00
3,3,trump promise revelation hack mr. trump say kn...,2017-01-01 04:13:20+00:00
4,4,clemson pound ohio state set rematch alabama s...,2017-01-01 04:15:48+00:00


## Filter data

In [None]:
# Split the corpus by publication year: 2018, 2017+2018, 2019-2022 and 2023.
pub_year = df['pub_date'].dt.year
df_2018 = df[pub_year == 2018].reset_index()
df_2017_18 = df[pub_year.isin([2017, 2018])].reset_index()
df_2019_2022 = df[pub_year.between(2019, 2022)].reset_index()
df_2023 = df[pub_year == 2023].reset_index()

# Plain lists of cleaned article text, which is what BERTopic consumes.
data, train_data, predict_data = (
    subset['text_cleaned'].to_list()
    for subset in (df_2018, df_2019_2022, df_2023)
)

# Baseline Bert: cite https://medium.com/@cd_24/bertopic-fine-tune-parameters-76c3377016fb

In [None]:
# Baseline: BERTopic with no custom UMAP/HDBSCAN/vectorizer passed in, fitted on `data`.
# fit_transform returns the topic assignment per document (topics_B) and,
# because calculate_probabilities=True, the probabilities as well (probs_B).
model_B = BERTopic(language="english", calculate_probabilities=True, verbose=True)
topics_B, probs_B = model_B.fit_transform(data)

2023-11-27 23:18:37,466 - BERTopic - Embedding - Transforming documents to embeddings.


.gitattributes:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

data_config.json:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

train_script.py:   0%|          | 0.00/13.2k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

Batches:   0%|          | 0/1654 [00:00<?, ?it/s]

2023-11-27 23:19:33,179 - BERTopic - Embedding - Completed ✓
2023-11-27 23:19:33,181 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2023-11-27 23:20:48,885 - BERTopic - Dimensionality - Completed ✓
2023-11-27 23:20:48,891 - BERTopic - Cluster - Start clustering the reduced embeddings
2023-11-27 23:44:27,098 - BERTopic - Cluster - Completed ✓
2023-11-27 23:44:27,122 - BERTopic - Representation - Extracting topics from clusters using representation models.
2023-11-27 23:44:31,763 - BERTopic - Representation - Completed ✓


In [None]:
# Overview table of the baseline topics: one row per topic with its size, name and top words.
model_B.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,18550,-1_new_film_book_woman,"[new, film, book, woman, year, say, one, life,...",
1,0,1209,0_marry_officiate_couple_meet,"[marry, officiate, couple, meet, bride, groom,...",
2,1,1142,1_restaurant_recipe_chef_cook,"[restaurant, recipe, chef, cook, chicken, food...",
3,2,1012,2_briefing_signup_email_need,"[briefing, signup, email, need, want, start, k...",
4,3,742,3_yankee_mets_baseball_sox,"[yankee, mets, baseball, sox, inning, league, ...",
...,...,...,...,...,...
555,554,10,554_quart_alissa_lowrey_inequality,"[quart, alissa, lowrey, inequality, income, st...",
556,555,10,555_globe_metoo_golden_oscar,"[globe, metoo, golden, oscar, award, ceremony,...",
557,556,10,556_tennessee_bredesen_blackburn_marsha,"[tennessee, bredesen, blackburn, marsha, phil,...",
558,557,10,557_architect_koolhaas_pavilion_serpentine,"[architect, koolhaas, pavilion, serpentine, re...",


-1 refers to all outliers and should typically be ignored. Next, let's take a look at the most frequent topic that was generated, topic 0:

In [None]:
# Top words of topic 0 together with their weights.
model_B.get_topic(0)

[['marry', 0.03676227706889394],
 ['officiate', 0.03262946807415625],
 ['couple', 0.028706880938118767],
 ['meet', 0.01971698928431528],
 ['bride', 0.01932685355266095],
 ['groom', 0.01749264009533819],
 ['rev', 0.015662747100919094],
 ['ceremony', 0.015258849326913706],
 ['rabbi', 0.014807063191529534],
 ['universal', 0.013329199612947158]]

In [None]:
# Full mapping of topic id -> (word, weight) pairs. The output is very long;
# NOTE(review): consider inspecting a few topics with get_topic(i) instead of dumping all of them.
model_B.get_topics()

{-1: [['new', 0.001627123366229526],
  ['film', 0.0016115102758529967],
  ['book', 0.0016000632846107279],
  ['woman', 0.001584572984496359],
  ['year', 0.0015843840207209886],
  ['say', 0.0015633483595091078],
  ['one', 0.0015548451366268056],
  ['life', 0.0015377463724217482],
  ['make', 0.0015239971801174362],
  ['like', 0.0015186995603894708]],
 0: [['marry', 0.03676227706889394],
  ['officiate', 0.03262946807415625],
  ['couple', 0.028706880938118767],
  ['meet', 0.01971698928431528],
  ['bride', 0.01932685355266095],
  ['groom', 0.01749264009533819],
  ['rev', 0.015662747100919094],
  ['ceremony', 0.015258849326913706],
  ['rabbi', 0.014807063191529534],
  ['universal', 0.013329199612947158]],
 1: [['restaurant', 0.014505239926406866],
  ['recipe', 0.013760397015270664],
  ['chef', 0.013601239508052006],
  ['cook', 0.013454567312601803],
  ['chicken', 0.010260762562721572],
  ['food', 0.010192787833001711],
  ['dish', 0.009776541040694154],
  ['sauce', 0.0069716840137441175],
  [

The baseline model produces 558 topics (0-557), not counting the -1 outlier topic.

In [None]:
# Per-document view: the topic assigned to each entry of `data`, with the topic's name and top words.
model_B.get_document_info(data)

Unnamed: 0,Document,Topic,Name,Representation,Representative_Docs,Top_n_words,Probability,Representative_document
0,tom brokaw find entire world inside hospital a...,18,18_health_care_insurance_drug,"[health, care, insurance, drug, affordable, me...",[get health coverage obamacare period enrol in...,health - care - insurance - drug - affordable ...,0.081337,False
1,new york family among american kill costa rica...,65,65_crash_plane_flight_pilot,"[crash, plane, flight, pilot, airline, helicop...",[russian plane crash kill aboard saratov airli...,crash - plane - flight - pilot - airline - hel...,0.006912,False
2,wall love outside jail family turn side wareho...,14,14_apartment_rent_landlord_coop,"[apartment, rent, landlord, coop, building, ho...",[new york city sue landlord refuse government ...,apartment - rent - landlord - coop - building ...,0.024574,False
3,sentient-being diet make new year resolution h...,-1,-1_new_film_book_woman,"[new, film, book, woman, year, say, one, life,...",[executive behind facebook china charm campaig...,new - film - book - woman - year - say - one -...,0.090737,False
4,belittle egypt egypt ambassador assert country...,224,224_egypt_cairo_egyptian_abdel,"[egypt, cairo, egyptian, abdel, elsisi, fattah...",[egypt sentence death attack christian right g...,egypt - cairo - egyptian - abdel - elsisi - fa...,0.030297,False
...,...,...,...,...,...,...,...,...
52916,divided town seek common ground town dealt con...,-1,-1_new_film_book_woman,"[new, film, book, woman, year, say, one, life,...",[executive behind facebook china charm campaig...,new - film - book - woman - year - say - one -...,0.461055,False
52917,ukraine russia battle orthodoxy schism loom cl...,39,39_ukraine_putin_moscow_russia,"[ukraine, putin, moscow, russia, russian, vlad...",[russia-ukraine tie sour far moscow imposes sa...,ukraine - putin - moscow - russia - russian - ...,0.106224,False
52918,héctor timerman argentine ex-foreign minister ...,428,428_argentina_buenos_aire_argentine,"[argentina, buenos, aire, argentine, macri, ma...",[argentina take emergency step shore peso pres...,argentina - buenos - aire - argentine - macri ...,1.000000,False
52919,hope green new year democrat pas legislation y...,-1,-1_new_film_book_woman,"[new, film, book, woman, year, say, one, life,...",[executive behind facebook china charm campaig...,new - film - book - woman - year - say - one -...,0.886727,False


In [None]:
# Persist the baseline model under "Base_BERTopic_without_finetune" using safetensors serialization.
model_B.save("Base_BERTopic_without_finetune", serialization="safetensors")

## Load model_B

In [None]:
# Reload the baseline from the path it was saved to earlier in this notebook.
# The previous path 'model_B' is not written by any visible cell, so a fresh
# Restart & Run All would fail here.
model_B = BERTopic.load("Base_BERTopic_without_finetune")

In [None]:
# Interactive topic map for the baseline model.
model_B.visualize_topics()

In [None]:
# Bar charts of the top words per topic for the baseline model.
model_B.visualize_barchart()

In [None]:
# Keep the baseline topic dictionary around.
# NOTE(review): the name `topics` is reassigned by the fit_transform calls in the tuning cells below.
topics = model_B.get_topics()

# Finetune BERT only with 2018 data

In [None]:
# Evaluation cite: https://www.theanalyticslab.nl/topic-modeling-with-bertopic/
def _topic_coherence(model, topics, data, coherence):
  """Score a fitted BERTopic model with one of gensim's coherence measures.

  All documents assigned to the same topic are joined into one long document,
  tokenized with the model's own vectorizer analyzer, and handed to gensim's
  CoherenceModel together with each topic's top words.

  Args:
    model: fitted BERTopic model (its `vectorizer_model`, `_preprocess_text`
      and `get_topic` are used).
    topics: topic id per document, aligned with `data`.
    data: the documents the model was fitted on.
    coherence: name of the gensim coherence measure, e.g. 'c_v' or 'u_mass'.

  Returns:
    The coherence score as returned by `CoherenceModel.get_coherence()`.

  Raises:
    ValueError: if `topics` contains no topic other than the outlier label -1.
  """
  documents = pd.DataFrame({"Document": data, "Topic": topics})
  documents_per_topic = documents.groupby(['Topic'], as_index=False).agg({'Document': ' '.join})
  cleaned_docs = model._preprocess_text(documents_per_topic.Document.values)

  # Tokenize exactly the way the model's vectorizer does
  analyzer = model.vectorizer_model.build_analyzer()
  tokens = [analyzer(doc) for doc in cleaned_docs]
  dictionary = corpora.Dictionary(tokens)
  corpus = [dictionary.doc2bow(token) for token in tokens]

  # Score every real topic. Only the outlier label (-1) is skipped, so no topic
  # is lost when the clustering produced no outliers at all.
  topic_ids = sorted(set(topics) - {-1})
  if not topic_ids:
    raise ValueError("no non-outlier topics to evaluate")
  topic_words = [[word for word, _ in model.get_topic(topic_id)]
                 for topic_id in topic_ids]

  coherence_model = CoherenceModel(topics=topic_words,
                                  texts=tokens,
                                  corpus=corpus,
                                  dictionary=dictionary,
                                  coherence=coherence)
  return coherence_model.get_coherence()

def eval_cv(model, topics, data):
  """Return the 'c_v' coherence of `model` on `data` (higher is better)."""
  return _topic_coherence(model, topics, data, 'c_v')

def eval_umass(model, topics,data):
  """Return the 'u_mass' coherence of `model` on `data` (closer to zero is better)."""
  return _topic_coherence(model, topics, data, 'u_mass')

In [None]:
# Define a list of parameters to try for UMAP
umap_params = [
    {'n_neighbors': 15, 'n_components': 2, 'min_dist': 0.1},
    {'n_neighbors': 10, 'n_components': 2, 'min_dist': 0.01},
    {'n_neighbors': 3, 'n_components': 2, 'min_dist': 0.001}
]

# Define a list of parameters to try for HDBSCAN
hdbscan_params = [
    {'min_cluster_size': 100, 'min_samples': 100},
    {'min_cluster_size': 50, 'min_samples': 70},
    {'min_cluster_size': 5, 'min_samples': 50}
]

# Coherence scores, one entry per fitted model. UMAP settings form the outer loop,
# so entry k belongs to umap_params[k // 3] and hdbscan_params[k % 3].
c_v = []
u_mass = []

vectorizer_model = CountVectorizer(stop_words="english")
ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)

# Loop over the parameter combinations and fit BERTopic models
for umap_param in umap_params:
    for hdbscan_param in hdbscan_params:
        # Create UMAP and HDBSCAN models with the current parameter combination
        # NOTE(review): UMAP is created without random_state, so reruns will not
        # reproduce the exact same topics/scores.
        umap_model = UMAP(**umap_param)
        hdbscan_model = HDBSCAN(**hdbscan_param, gen_min_span_tree=True, prediction_data=True)

        # Fit a BERTopic model with the current parameter combination
        # NOTE(review): per the BERTopic docs, n_gram_range is not used when a custom
        # vectorizer_model is passed; set ngram_range on the CountVectorizer if
        # bigrams are actually wanted.
        model = BERTopic(
            umap_model=umap_model,
            hdbscan_model=hdbscan_model,
            top_n_words=10,
            language='english',
            calculate_probabilities=True,
            verbose=True,
            n_gram_range=(1, 2),
            vectorizer_model=vectorizer_model,
            ctfidf_model=ctfidf_model
        )
        topics, probs = model.fit_transform(data)

        # save model
        model.save(f'{umap_param}_{hdbscan_param}', serialization="safetensors")

        # Visualize Interactive graph and save the figure to an HTML file
        fig1 = model.visualize_topics()
        fig1.write_html(f'model_topicmap_{umap_param}_hdbscan_{hdbscan_param}.html')
        fig2 = model.visualize_barchart()
        fig2.write_html(f'model_barchart_{umap_param}_hdbscan_{hdbscan_param}.html')


        # Evaluation: eval_cv / eval_umass are defined as (model, topics, data),
        # so the documents must be passed along with the topic assignments.
        cohence_cv = eval_cv(model, topics, data)
        cohence_umass = eval_umass(model, topics, data)
        c_v.append(cohence_cv)
        u_mass.append(cohence_umass)


2023-11-28 23:22:35,056 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/1654 [00:00<?, ?it/s]

2023-11-28 23:23:19,627 - BERTopic - Embedding - Completed ✓
2023-11-28 23:23:19,633 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2023-11-28 23:24:06,892 - BERTopic - Dimensionality - Completed ✓
2023-11-28 23:24:06,895 - BERTopic - Cluster - Start clustering the reduced embeddings
2023-11-28 23:24:26,865 - BERTopic - Cluster - Completed ✓
2023-11-28 23:24:26,881 - BERTopic - Representation - Extracting topics from clusters using representation models.
2023-11-28 23:24:31,204 - BERTopic - Representation - Completed ✓
2023-11-28 23:26:02,906 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/1654 [00:00<?, ?it/s]

2023-11-28 23:26:46,173 - BERTopic - Embedding - Completed ✓
2023-11-28 23:26:46,175 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2023-11-28 23:27:34,883 - BERTopic - Dimensionality - Completed ✓
2023-11-28 23:27:34,885 - BERTopic - Cluster - Start clustering the reduced embeddings
2023-11-28 23:28:03,203 - BERTopic - Cluster - Completed ✓
2023-11-28 23:28:03,227 - BERTopic - Representation - Extracting topics from clusters using representation models.
2023-11-28 23:28:08,598 - BERTopic - Representation - Completed ✓
2023-11-28 23:29:45,920 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/1654 [00:00<?, ?it/s]

2023-11-28 23:30:30,885 - BERTopic - Embedding - Completed ✓
2023-11-28 23:30:30,887 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2023-11-28 23:31:19,345 - BERTopic - Dimensionality - Completed ✓
2023-11-28 23:31:19,347 - BERTopic - Cluster - Start clustering the reduced embeddings
2023-11-28 23:32:54,293 - BERTopic - Cluster - Completed ✓
2023-11-28 23:32:54,309 - BERTopic - Representation - Extracting topics from clusters using representation models.
2023-11-28 23:32:59,471 - BERTopic - Representation - Completed ✓
2023-11-28 23:34:57,742 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/1654 [00:00<?, ?it/s]

2023-11-28 23:35:42,512 - BERTopic - Embedding - Completed ✓
2023-11-28 23:35:42,513 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2023-11-28 23:36:21,163 - BERTopic - Dimensionality - Completed ✓
2023-11-28 23:36:21,170 - BERTopic - Cluster - Start clustering the reduced embeddings
2023-11-28 23:36:48,982 - BERTopic - Cluster - Completed ✓
2023-11-28 23:36:49,006 - BERTopic - Representation - Extracting topics from clusters using representation models.
2023-11-28 23:36:52,893 - BERTopic - Representation - Completed ✓
2023-11-28 23:38:21,092 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/1654 [00:00<?, ?it/s]

2023-11-28 23:39:06,163 - BERTopic - Embedding - Completed ✓
2023-11-28 23:39:06,168 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2023-11-28 23:39:45,136 - BERTopic - Dimensionality - Completed ✓
2023-11-28 23:39:45,138 - BERTopic - Cluster - Start clustering the reduced embeddings
2023-11-28 23:40:24,461 - BERTopic - Cluster - Completed ✓
2023-11-28 23:40:24,481 - BERTopic - Representation - Extracting topics from clusters using representation models.
2023-11-28 23:40:28,278 - BERTopic - Representation - Completed ✓
2023-11-28 23:42:27,337 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/1654 [00:00<?, ?it/s]

2023-11-28 23:43:12,822 - BERTopic - Embedding - Completed ✓
2023-11-28 23:43:12,825 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2023-11-28 23:43:52,915 - BERTopic - Dimensionality - Completed ✓
2023-11-28 23:43:52,917 - BERTopic - Cluster - Start clustering the reduced embeddings
2023-11-28 23:45:46,161 - BERTopic - Cluster - Completed ✓
2023-11-28 23:45:46,185 - BERTopic - Representation - Extracting topics from clusters using representation models.
2023-11-28 23:45:52,623 - BERTopic - Representation - Completed ✓
2023-11-28 23:48:12,657 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/1654 [00:00<?, ?it/s]

2023-11-28 23:48:57,160 - BERTopic - Embedding - Completed ✓
2023-11-28 23:48:57,164 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2023-11-28 23:49:25,639 - BERTopic - Dimensionality - Completed ✓
2023-11-28 23:49:25,641 - BERTopic - Cluster - Start clustering the reduced embeddings
2023-11-28 23:49:56,200 - BERTopic - Cluster - Completed ✓
2023-11-28 23:49:56,216 - BERTopic - Representation - Extracting topics from clusters using representation models.
2023-11-28 23:49:59,960 - BERTopic - Representation - Completed ✓
2023-11-28 23:51:34,541 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/1654 [00:00<?, ?it/s]

2023-11-28 23:52:19,439 - BERTopic - Embedding - Completed ✓
2023-11-28 23:52:19,440 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2023-11-28 23:52:46,700 - BERTopic - Dimensionality - Completed ✓
2023-11-28 23:52:46,704 - BERTopic - Cluster - Start clustering the reduced embeddings
2023-11-28 23:54:06,363 - BERTopic - Cluster - Completed ✓
2023-11-28 23:54:06,379 - BERTopic - Representation - Extracting topics from clusters using representation models.
2023-11-28 23:54:11,198 - BERTopic - Representation - Completed ✓
2023-11-28 23:56:02,109 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/1654 [00:00<?, ?it/s]

2023-11-28 23:56:45,960 - BERTopic - Embedding - Completed ✓
2023-11-28 23:56:45,963 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2023-11-28 23:57:14,491 - BERTopic - Dimensionality - Completed ✓
2023-11-28 23:57:14,493 - BERTopic - Cluster - Start clustering the reduced embeddings
2023-11-29 00:03:36,045 - BERTopic - Cluster - Completed ✓
2023-11-29 00:03:36,064 - BERTopic - Representation - Extracting topics from clusters using representation models.
2023-11-29 00:03:42,957 - BERTopic - Representation - Completed ✓


In [None]:
c_v

[0.8540061889774897,
 0.842063666426194,
 0.7668698105364173,
 0.8656594866071105,
 0.8401197809188442,
 0.7592871408997195,
 0.7830626425847975,
 0.7338149722681169,
 0.6217899002482835]

c_v:

[0.8540061889774897,
 0.842063666426194,
 0.7668698105364173,
 0.8656594866071105,
 0.8401197809188442,
 0.7592871408997195,
 0.7830626425847975,
 0.7338149722681169,
 0.6217899002482835]

In [None]:
u_mass

[-0.7525695776795867,
 -0.9117410521582159,
 -1.027985490306059,
 -0.7972076879064294,
 -0.9924612594986546,
 -1.1392225016626227,
 -0.8174459810419653,
 -1.093550261202448,
 -1.2175654633904427]

u_mass:

[-0.7525695776795867,
 -0.9117410521582159,
 -1.027985490306059,
 -0.7972076879064294,
 -0.9924612594986546,
 -1.1392225016626227,
 -0.8174459810419653,
 -1.093550261202448,
 -1.2175654633904427]

We want a high c_v score and a u_mass score as close to zero as possible (u_mass is negative, so less negative is better); in this case, we'll select

umap_params = [
    {'n_neighbors': 10, 'n_components': 2, 'min_dist': 0.01}
]

hdbscan_params = [
    {'min_cluster_size': 100, 'min_samples': 100}
]

This combination has c_v: 0.8656594866071105 (the highest of the nine runs) and u_mass: -0.7972076879064294 (the second-best of the nine runs).

# Finetune Bert with 2017&2018 data

In [None]:
# Rebind `data` to the combined 2017+2018 articles; everything below that reads `data` now uses this larger set.
data = df_2017_18['text_cleaned'].to_list()

In [None]:
# Same grid search as the previous tuning run; the cell defining eval_cv / eval_umass must be run first
# Define a list of parameters to try for UMAP
umap_params = [
    {'n_neighbors': 15, 'n_components': 2, 'min_dist': 0.1},
    {'n_neighbors': 10, 'n_components': 2, 'min_dist': 0.01},
    {'n_neighbors': 3, 'n_components': 2, 'min_dist': 0.001}
]

# Define a list of parameters to try for HDBSCAN
hdbscan_params = [
    {'min_cluster_size': 100, 'min_samples': 100},
    {'min_cluster_size': 50, 'min_samples': 70},
    {'min_cluster_size': 5, 'min_samples': 50}
]

# Coherence scores, one entry per fitted model. UMAP settings form the outer loop,
# so entry k belongs to umap_params[k // 3] and hdbscan_params[k % 3].
c_v = []
u_mass = []

vectorizer_model = CountVectorizer(stop_words="english")
ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)

# Loop over the parameter combinations and fit BERTopic models
for umap_param in umap_params:
    for hdbscan_param in hdbscan_params:
        # Create UMAP and HDBSCAN models with the current parameter combination
        # NOTE(review): UMAP is created without random_state, so reruns will not
        # reproduce the exact same topics/scores.
        umap_model = UMAP(**umap_param)
        hdbscan_model = HDBSCAN(**hdbscan_param, gen_min_span_tree=True, prediction_data=True)

        # Fit a BERTopic model with the current parameter combination
        # NOTE(review): per the BERTopic docs, n_gram_range is not used when a custom
        # vectorizer_model is passed; set ngram_range on the CountVectorizer if
        # bigrams are actually wanted.
        model = BERTopic(
            umap_model=umap_model,
            hdbscan_model=hdbscan_model,
            top_n_words=10,
            language='english',
            calculate_probabilities=True,
            verbose=True,
            n_gram_range=(1, 2),
            vectorizer_model=vectorizer_model,
            ctfidf_model=ctfidf_model
        )
        topics, probs = model.fit_transform(data)

        # save model
        #model.save(f'{umap_param}_{hdbscan_param}', serialization="safetensors")

        # Visualize Interactive graph and save the figure to an HTML file
        # NOTE(review): these file names are identical to the ones written by the
        # 2018-only tuning run, so those HTML files get overwritten here.
        fig1 = model.visualize_topics()
        fig1.write_html(f'model_topicmap_{umap_param}_hdbscan_{hdbscan_param}.html')
        fig2 = model.visualize_barchart()
        fig2.write_html(f'model_barchart_{umap_param}_hdbscan_{hdbscan_param}.html')


        # Evaluation: eval_cv / eval_umass are defined as (model, topics, data),
        # so the documents must be passed along with the topic assignments.
        cohence_cv = eval_cv(model, topics, data)
        cohence_umass = eval_umass(model, topics, data)
        c_v.append(cohence_cv)
        u_mass.append(cohence_umass)

2023-11-29 00:22:50,066 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/3352 [00:00<?, ?it/s]

2023-11-29 00:24:25,918 - BERTopic - Embedding - Completed ✓
2023-11-29 00:24:25,920 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2023-11-29 00:26:15,496 - BERTopic - Dimensionality - Completed ✓
2023-11-29 00:26:15,499 - BERTopic - Cluster - Start clustering the reduced embeddings
2023-11-29 00:28:08,773 - BERTopic - Cluster - Completed ✓
2023-11-29 00:28:08,801 - BERTopic - Representation - Extracting topics from clusters using representation models.
2023-11-29 00:28:16,157 - BERTopic - Representation - Completed ✓
2023-11-29 00:31:53,706 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/3352 [00:00<?, ?it/s]

2023-11-29 00:33:24,130 - BERTopic - Embedding - Completed ✓
2023-11-29 00:33:24,132 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2023-11-29 00:35:12,175 - BERTopic - Dimensionality - Completed ✓
2023-11-29 00:35:12,182 - BERTopic - Cluster - Start clustering the reduced embeddings
2023-11-29 00:38:50,509 - BERTopic - Cluster - Completed ✓
2023-11-29 00:38:50,537 - BERTopic - Representation - Extracting topics from clusters using representation models.
2023-11-29 00:38:58,738 - BERTopic - Representation - Completed ✓
2023-11-29 00:43:16,839 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/3352 [00:00<?, ?it/s]

2023-11-29 00:44:47,168 - BERTopic - Embedding - Completed ✓
2023-11-29 00:44:47,169 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2023-11-29 00:46:37,722 - BERTopic - Dimensionality - Completed ✓
2023-11-29 00:46:37,728 - BERTopic - Cluster - Start clustering the reduced embeddings
2023-11-29 00:57:52,569 - BERTopic - Cluster - Completed ✓
2023-11-29 00:57:52,598 - BERTopic - Representation - Extracting topics from clusters using representation models.
2023-11-29 00:58:00,278 - BERTopic - Representation - Completed ✓
2023-11-29 01:03:22,644 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/3352 [00:00<?, ?it/s]

2023-11-29 01:04:52,448 - BERTopic - Embedding - Completed ✓
2023-11-29 01:04:52,450 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2023-11-29 01:06:23,656 - BERTopic - Dimensionality - Completed ✓
2023-11-29 01:06:23,659 - BERTopic - Cluster - Start clustering the reduced embeddings
2023-11-29 01:08:53,670 - BERTopic - Cluster - Completed ✓
2023-11-29 01:08:53,698 - BERTopic - Representation - Extracting topics from clusters using representation models.
2023-11-29 01:09:02,243 - BERTopic - Representation - Completed ✓
2023-11-29 01:13:29,451 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/3352 [00:00<?, ?it/s]

2023-11-29 01:15:00,878 - BERTopic - Embedding - Completed ✓
2023-11-29 01:15:00,879 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2023-11-29 01:16:31,587 - BERTopic - Dimensionality - Completed ✓
2023-11-29 01:16:31,589 - BERTopic - Cluster - Start clustering the reduced embeddings
2023-11-29 01:21:39,733 - BERTopic - Cluster - Completed ✓
2023-11-29 01:21:39,775 - BERTopic - Representation - Extracting topics from clusters using representation models.
2023-11-29 01:21:47,442 - BERTopic - Representation - Completed ✓
2023-11-29 01:26:46,591 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/3352 [00:00<?, ?it/s]

2023-11-29 01:28:19,126 - BERTopic - Embedding - Completed ✓
2023-11-29 01:28:19,127 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2023-11-29 01:29:49,954 - BERTopic - Dimensionality - Completed ✓
2023-11-29 01:29:49,956 - BERTopic - Cluster - Start clustering the reduced embeddings
2023-11-29 01:43:06,541 - BERTopic - Cluster - Completed ✓
2023-11-29 01:43:06,573 - BERTopic - Representation - Extracting topics from clusters using representation models.
2023-11-29 01:43:15,838 - BERTopic - Representation - Completed ✓
2023-11-29 01:49:42,180 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/3352 [00:00<?, ?it/s]

2023-11-29 01:51:15,179 - BERTopic - Embedding - Completed ✓
2023-11-29 01:51:15,181 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2023-11-29 01:52:26,363 - BERTopic - Dimensionality - Completed ✓
2023-11-29 01:52:26,366 - BERTopic - Cluster - Start clustering the reduced embeddings
2023-11-29 01:56:04,069 - BERTopic - Cluster - Completed ✓
2023-11-29 01:56:04,111 - BERTopic - Representation - Extracting topics from clusters using representation models.
2023-11-29 01:56:13,664 - BERTopic - Representation - Completed ✓
2023-11-29 02:00:54,579 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/3352 [00:00<?, ?it/s]

2023-11-29 02:02:22,593 - BERTopic - Embedding - Completed ✓
2023-11-29 02:02:22,595 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2023-11-29 02:03:30,969 - BERTopic - Dimensionality - Completed ✓
2023-11-29 02:03:30,972 - BERTopic - Cluster - Start clustering the reduced embeddings
2023-11-29 02:16:38,964 - BERTopic - Cluster - Completed ✓
2023-11-29 02:16:38,994 - BERTopic - Representation - Extracting topics from clusters using representation models.
2023-11-29 02:16:49,245 - BERTopic - Representation - Completed ✓
2023-11-29 02:22:14,700 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/3352 [00:00<?, ?it/s]

2023-11-29 02:23:43,123 - BERTopic - Embedding - Completed ✓
2023-11-29 02:23:43,125 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2023-11-29 02:24:53,722 - BERTopic - Dimensionality - Completed ✓
2023-11-29 02:24:53,724 - BERTopic - Cluster - Start clustering the reduced embeddings
2023-11-29 03:31:01,615 - BERTopic - Cluster - Completed ✓
2023-11-29 03:31:01,651 - BERTopic - Representation - Extracting topics from clusters using representation models.
2023-11-29 03:31:09,542 - BERTopic - Representation - Completed ✓


In [None]:
c_v

[0.8323685257192155,
 0.8137693076961446,
 0.7454709797252583,
 0.8360822648621891,
 0.802255021683912,
 0.7426428404826131,
 0.7545055142247848,
 0.6639422838733615,
 0.5929901520381302]

c_v

[0.8323685257192155,
 0.8137693076961446,
 0.7454709797252583,
 0.8360822648621891,
 0.802255021683912,
 0.7426428404826131,
 0.7545055142247848,
 0.6639422838733615,
 0.5929901520381302]

In [None]:
u_mass

[-0.9115246890303869,
 -1.0469220514031312,
 -1.1622055995530238,
 -0.9714059087230608,
 -1.1998388724864886,
 -1.2095449517266632,
 -1.0610220390916911,
 -1.3304712019945484,
 -1.3417840651696404]

u_mass

[-0.9115246890303869,
 -1.0469220514031312,
 -1.1622055995530238,
 -0.9714059087230608,
 -1.1998388724864886,
 -1.2095449517266632,
 -1.0610220390916911,
 -1.3304712019945484,
 -1.3417840651696404]

umap_params = [
    {'n_neighbors': 10, 'n_components': 2, 'min_dist': 0.01}
]

hdbscan_params = [
    {'min_cluster_size': 100, 'min_samples': 100}
]

This combination has the best coherence score (highest c_v: 0.8360822648621891).

# Train Topic Model Using 2019-22 data

In [None]:
# UMAP / HDBSCAN settings of the combination with the highest c_v in both tuning runs above.
umap_param = {'n_neighbors': 10, 'n_components': 2, 'min_dist': 0.01}

hdbscan_param = {'min_cluster_size': 100, 'min_samples': 100}

vectorizer_model = CountVectorizer(stop_words="english")
ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True) # TODO: try a more flexible setting for down-weighting frequent words

umap_model = UMAP(**umap_param)
hdbscan_model = HDBSCAN(**hdbscan_param, gen_min_span_tree=True, prediction_data=True)

# Fit a BERTopic model with the selected parameter combination.
# nr_topics=30 makes BERTopic reduce the discovered topics to 30 after fitting
# (the log below reports the reduction from 236 to 30).
# NOTE(review): per the BERTopic docs, n_gram_range is not used when a custom
# vectorizer_model is passed.
model = BERTopic(
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    top_n_words=10,
    language='english',
    calculate_probabilities=True,
    verbose=True,
    n_gram_range=(1, 2),
    vectorizer_model=vectorizer_model,
    ctfidf_model=ctfidf_model,
    nr_topics=30
)
topics, probs = model.fit_transform(train_data)

# save model
model.save('BERTopic_train', serialization="safetensors")

2023-12-04 05:48:16,100 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/6078 [00:00<?, ?it/s]

2023-12-04 05:51:03,853 - BERTopic - Embedding - Completed ✓
2023-12-04 05:51:03,855 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2023-12-04 05:57:19,283 - BERTopic - Dimensionality - Completed ✓
2023-12-04 05:57:19,288 - BERTopic - Cluster - Start clustering the reduced embeddings
2023-12-04 06:09:17,280 - BERTopic - Cluster - Completed ✓
2023-12-04 06:09:17,287 - BERTopic - Representation - Extracting topics from clusters using representation models.
2023-12-04 06:09:35,660 - BERTopic - Representation - Completed ✓
2023-12-04 06:09:35,675 - BERTopic - Topic reduction - Reducing number of topics
2023-12-04 06:09:55,535 - BERTopic - Topic reduction - Reduced number of topics from 236 to 30


In [None]:
# Topic reduction already happens at fit time (nr_topics=30 on the model),
# so no additional model.reduce_topics(...) call is made here.

def _param_tag(params):
    """Turn a parameter dict into a filename-safe tag, e.g. 'n_neighbors-10_min_dist-0.01'."""
    return "_".join(f"{key}-{value}" for key, value in params.items())


# The raw dict repr contains quotes, colons, braces and spaces, which makes
# awkward file names (and invalid ones on Windows), so build plain tags instead.
umap_tag = _param_tag(umap_param)
hdbscan_tag = _param_tag(hdbscan_param)

# Visualize interactive graphs and save each figure to an HTML file
fig1 = model.visualize_topics()
fig1.write_html(f'model_topicmap_{umap_tag}_hdbscan_{hdbscan_tag}.html')
fig2 = model.visualize_barchart()
fig2.write_html(f'model_barchart_{umap_tag}_hdbscan_{hdbscan_tag}.html')

In [None]:
# Show every topic id together with its list of (word, weight) pairs.
model.get_topics()

{-1: [('say', 0.15067193493181222),
  ('trump', 0.15055994736576367),
  ('people', 0.1485213336614429),
  ('new', 0.1480022519621078),
  ('state', 0.14545593899827805),
  ('year', 0.14458925139629955),
  ('president', 0.14356551128573444),
  ('company', 0.14244149989103314),
  ('time', 0.14230117793566635),
  ('york', 0.1410165054415159)],
 0: [('climate', 0.25804781473974686),
  ('inflation', 0.23989263099237065),
  ('economy', 0.2270476808852037),
  ('company', 0.2235242601017766),
  ('stock', 0.21935593201193598),
  ('federal', 0.21819160832550655),
  ('reserve', 0.21730187088394223),
  ('tax', 0.21066858704852864),
  ('market', 0.2106574023419396),
  ('musk', 0.2083583735713185)],
 1: [('music', 0.30869272584382557),
  ('theater', 0.28174263499660485),
  ('art', 0.27780558996146043),
  ('artist', 0.27300304600237774),
  ('album', 0.26314732921165335),
  ('dance', 0.2594774062720424),
  ('broadway', 0.25678826145610695),
  ('song', 0.25049617439336774),
  ('fashion', 0.2470174801158

# Load Model

In [None]:
# Load a previously saved BERTopic model from the "BERT_Model" path.
# NOTE(review): the training cell above saves to 'BERTopic_train' — confirm
# that "BERT_Model" is the intended artifact.
loaded_model = BERTopic.load("BERT_Model")

In [None]:
# Topic assignments stored on the loaded model.
# NOTE: `topics` is reassigned by loaded_model.transform(train_data) a few cells below.
topics = loaded_model.topics_

In [None]:
# Rebuild the bag-of-words vectorizer (same settings as in the training cell)
# so it can be attached to the loaded model below. Presumably needed because
# the saved model does not carry its vectorizer — TODO confirm.
vectorizer_model = CountVectorizer(stop_words="english")
# Left disabled — no c-TF-IDF transformer is re-created for the loaded model:
# ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)

In [None]:
# Learn the vocabulary from the training documents
# (assumes train_data is an iterable of document strings — TODO confirm).
vectorizer_model.fit(train_data)

In [None]:
# Plug the vectorizer fitted above into the loaded model.
loaded_model.vectorizer_model = vectorizer_model

In [None]:
# Assign a topic to every training document and get the per-topic score
# matrix. The log emitted by this call reports that assignments come from the
# cosine similarity between topic and document embeddings.
topics, probability = loaded_model.transform(train_data)

Batches:   0%|          | 0/6078 [00:00<?, ?it/s]

2023-12-04 22:31:45,900 - BERTopic - Predicting topic assignments through cosine similarity of topic and document embeddings.


In [None]:
# Score the loaded model with eval_cv (helper defined elsewhere in this
# notebook; presumably c_v topic coherence — confirm).
eval_cv(loaded_model, topics, train_data)

0.798572281500311

In [None]:
# Sanity check: one row per document, one column per topic.
probability.shape

(194471, 30)

In [None]:
# One column label per topic id, in the order get_topics() yields them.
# Assumes the columns of the probability matrix follow that same topic
# order — TODO confirm.
topic_col = [f'topic {topic_id}' for topic_id in loaded_model.get_topics()]

In [None]:
# Attach the per-topic scores as new columns, one per topic label.
# Assumes the rows of `probability` are in the same order as the rows of
# df_2019_2022 (i.e. train_data was built from this frame without
# reordering) — TODO confirm.
df_2019_2022[topic_col] = probability

In [None]:
# Confirm the topic columns were added (column count grows by len(topic_col)).
df_2019_2022.shape

(194471, 34)

In [None]:
# Inspect the frame with the new per-topic score columns.
df_2019_2022

Unnamed: 0,index,id,text_cleaned,pub_date,topic -1,topic 0,topic 1,topic 2,topic 3,topic 4,...,topic 19,topic 20,topic 21,topic 22,topic 23,topic 24,topic 25,topic 26,topic 27,topic 28
0,107255,107257,year crack-up treaty versailles prohibition ev...,2019-01-01 00:00:03+00:00,0.354325,0.314764,0.250457,0.318881,0.214246,0.219374,...,0.220084,0.200345,0.199709,0.196043,0.432287,0.210731,-0.001162,0.072058,0.146397,0.174210
1,107256,107258,search lose screen time imagine could money ho...,2019-01-01 00:00:07+00:00,0.348518,0.464572,0.156505,0.249597,0.211858,0.208072,...,0.124935,0.219131,0.267778,0.112582,0.366089,0.080646,0.014166,0.133316,0.242355,0.224051
2,107257,107259,warren well star receiver derailed career die ...,2019-01-01 00:08:04+00:00,0.296516,0.203885,0.245134,0.239344,0.228584,0.173901,...,0.039790,0.072611,0.183357,0.164622,0.127779,0.153292,0.003852,0.120821,0.086371,0.199546
3,107258,107260,year wolf constitution withstand partisan will...,2019-01-01 00:15:31+00:00,0.249828,0.153049,0.131532,0.358102,0.106882,0.189306,...,0.079092,0.170516,0.091143,0.174190,0.176080,0.121878,0.088516,0.119023,0.092137,0.078322
4,107259,107261,trump reign king cyrus christian right like pr...,2019-01-01 00:29:12+00:00,0.436082,0.317064,0.330554,0.334250,0.235414,0.450342,...,0.303308,0.284624,0.218822,0.223722,0.245404,0.352572,0.091932,0.087191,0.205380,0.128664
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
194466,301721,301724,jean franco pioneer scholar latin american lit...,2022-12-31 19:10:36+00:00,0.185726,0.059569,0.301387,0.230078,0.008236,0.168426,...,0.044770,0.034577,-0.020470,0.109698,0.147953,0.104999,-0.043533,0.059461,0.196887,0.009714
194467,301722,301725,extreme weather california cause flood landsli...,2022-12-31 19:21:09+00:00,0.296459,0.325269,0.123091,0.213622,0.243968,0.233263,...,0.112820,0.141387,0.265391,0.304063,0.153533,0.091941,0.158345,0.063430,0.020153,0.076382
194468,301723,301726,dave attell bid heartfelt hilarious farewell c...,2022-12-31 20:18:43+00:00,0.304054,0.202446,0.345855,0.363315,0.120994,0.186242,...,0.215621,0.147801,0.296762,0.212131,0.108151,0.101391,0.087343,0.032028,0.252327,0.081648
194469,301724,301727,extremely rare snowy owl sight transfixes cali...,2022-12-31 20:22:56+00:00,0.316582,0.254633,0.202880,0.337073,0.245594,0.147358,...,0.040865,0.123956,0.232003,0.176169,0.189301,0.136557,0.022683,0.043157,0.106051,0.152013


In [None]:
# Average every topic score per calendar day of publication; reset_index()
# turns the grouping date back into a regular `pub_date` column.
topic_scores_2019_2022 = (
    df_2019_2022
    .groupby(df_2019_2022['pub_date'].dt.date)[topic_col]
    .mean()
    .reset_index()
)

In [None]:
# Inspect the daily mean topic scores.
topic_scores_2019_2022

Unnamed: 0,pub_date,topic -1,topic 0,topic 1,topic 2,topic 3,topic 4,topic 5,topic 6,topic 7,...,topic 19,topic 20,topic 21,topic 22,topic 23,topic 24,topic 25,topic 26,topic 27,topic 28
0,2019-01-01,0.330996,0.291438,0.249273,0.282799,0.194138,0.243160,0.243896,0.261627,0.191600,...,0.182909,0.209010,0.234181,0.184076,0.187595,0.167010,0.046883,0.139876,0.150303,0.133329
1,2019-01-02,0.326431,0.291077,0.234797,0.277161,0.194508,0.257241,0.229349,0.243458,0.179258,...,0.188678,0.208940,0.203907,0.172983,0.192182,0.167471,0.056221,0.124204,0.147891,0.125518
2,2019-01-03,0.325839,0.280783,0.262094,0.289386,0.181936,0.236182,0.240263,0.266779,0.173454,...,0.184726,0.216876,0.223833,0.176398,0.201078,0.165566,0.052949,0.122689,0.181733,0.137385
3,2019-01-04,0.325406,0.291218,0.238703,0.274981,0.186463,0.256959,0.245280,0.243821,0.172484,...,0.196213,0.212281,0.201511,0.184761,0.198511,0.171425,0.059795,0.124202,0.163967,0.134776
4,2019-01-05,0.355251,0.318491,0.226998,0.279491,0.216683,0.290617,0.281724,0.236730,0.214193,...,0.210384,0.221585,0.212575,0.225516,0.196948,0.191537,0.053220,0.146899,0.146065,0.125548
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,2022-12-27,0.340012,0.299646,0.266241,0.299216,0.215671,0.236863,0.235321,0.249568,0.214401,...,0.201695,0.212453,0.230711,0.185030,0.215746,0.177093,0.039616,0.128825,0.176002,0.137581
1456,2022-12-28,0.336124,0.297847,0.243168,0.287664,0.231581,0.240242,0.237490,0.243640,0.191142,...,0.204514,0.216608,0.227884,0.189648,0.200392,0.177002,0.044465,0.132499,0.165232,0.116257
1457,2022-12-29,0.329127,0.292642,0.248591,0.295747,0.212935,0.221455,0.222914,0.243861,0.191341,...,0.187817,0.214190,0.238287,0.186716,0.206333,0.157840,0.041900,0.133400,0.161765,0.132305
1458,2022-12-30,0.334534,0.289416,0.266247,0.304964,0.191744,0.239372,0.242449,0.256239,0.206543,...,0.192152,0.213438,0.227416,0.196319,0.215207,0.173598,0.032431,0.143696,0.161747,0.130150


In [None]:
# Persist the daily topic scores; index=False leaves the DataFrame index out of the file.
topic_scores_2019_2022.to_parquet("topic_scores_2019_2022.parquet", index=False)

## Predict 2023

In [None]:
# Assign topics to predict_data with the loaded model; `prob` holds the
# per-topic scores that become the topic columns of df_2023 below.
# NOTE(review): the name says 2013 but this section predicts 2023 — likely a
# typo; left unchanged in case later cells reference it.
topics2013, prob = loaded_model.transform(predict_data)

Batches:   0%|          | 0/525 [00:00<?, ?it/s]

2023-12-04 23:16:12,299 - BERTopic - Predicting topic assignments through cosine similarity of topic and document embeddings.


In [None]:
# Attach the per-topic scores to the 2023 frame, reusing the topic labels
# built for the 2019-2022 data. Assumes the rows of `prob` are in the same
# order as the rows of df_2023 — TODO confirm.
df_2023[topic_col] = prob

In [None]:
# Average every topic score per calendar day of publication.
# NOTE(review): the tail of the resulting table shows 2023-07-01 … 2023-10-01
# at rows 141–144, exactly one month apart — this looks like day and month
# being swapped when pub_date was parsed for df_2023; confirm upstream.
topic_scores_2023 = (
    df_2023
    .groupby(df_2023['pub_date'].dt.date)[topic_col]
    .mean()
    .reset_index()
)

In [None]:
# Persist the daily 2023 topic scores; index=False leaves the DataFrame index out of the file.
topic_scores_2023.to_parquet("topic_scores_2023.parquet", index=False)

In [None]:
# Inspect the daily mean topic scores for 2023.
topic_scores_2023

Unnamed: 0,pub_date,topic -1,topic 0,topic 1,topic 2,topic 3,topic 4,topic 5,topic 6,topic 7,...,topic 19,topic 20,topic 21,topic 22,topic 23,topic 24,topic 25,topic 26,topic 27,topic 28
0,2023-01-01,0.313714,0.273774,0.216220,0.268573,0.200243,0.245046,0.237776,0.227120,0.202037,...,0.182961,0.205092,0.227626,0.194298,0.194272,0.175338,0.050810,0.121558,0.145419,0.111443
1,2023-01-02,0.337780,0.291474,0.260387,0.299110,0.208479,0.243746,0.226300,0.259550,0.217208,...,0.219761,0.222564,0.233563,0.200722,0.204298,0.180523,0.031152,0.143341,0.174615,0.108187
2,2023-01-03,0.325721,0.292290,0.226566,0.265821,0.205515,0.258534,0.247847,0.232627,0.213091,...,0.187684,0.210991,0.214150,0.194497,0.205607,0.177029,0.066132,0.130668,0.153305,0.123480
3,2023-01-04,0.322850,0.287490,0.230236,0.266891,0.209268,0.244553,0.243694,0.238478,0.204291,...,0.190926,0.209531,0.204592,0.178404,0.198670,0.174849,0.050358,0.138373,0.159792,0.133052
4,2023-01-05,0.319439,0.267287,0.241772,0.270206,0.183980,0.258672,0.260276,0.249154,0.192265,...,0.179330,0.201967,0.200311,0.184754,0.182454,0.182693,0.075774,0.126028,0.158577,0.132268
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
141,2023-07-01,0.337306,0.294266,0.245895,0.289306,0.194417,0.260003,0.252252,0.238538,0.202129,...,0.221367,0.205518,0.217805,0.191761,0.228244,0.179326,0.046066,0.139777,0.177037,0.118654
142,2023-08-01,0.328871,0.277770,0.247789,0.293901,0.189777,0.245729,0.242606,0.255090,0.186948,...,0.171062,0.191552,0.202266,0.180249,0.202702,0.170270,0.061091,0.127335,0.166412,0.127690
143,2023-09-01,0.344476,0.294112,0.268026,0.318108,0.203109,0.238203,0.238840,0.272418,0.191865,...,0.202843,0.207638,0.230823,0.186828,0.228212,0.172592,0.043484,0.123942,0.188812,0.122126
144,2023-10-01,0.368427,0.323951,0.239346,0.298597,0.214627,0.306356,0.293779,0.254855,0.241968,...,0.229980,0.229131,0.225912,0.230952,0.227497,0.198625,0.074077,0.143441,0.182657,0.128520
