# News Headlines Dataset in BERTopic

This is a template for using the News Headlines dataset in BERTopic.  Please attempt to create a notebook yourself before using this!  And note that this is a very basic template; you will likely want to clean the data, change model parameters, visualize your results, and iterate before you decide on a final model to share.

In [None]:
# imports
import pandas as pd
from bertopic import BERTopic
from bertopic.representation import KeyBERTInspired
from hdbscan import HDBSCAN
from scipy.cluster import hierarchy as sch
from sentence_transformers import SentenceTransformer
from umap import UMAP

In [None]:
# Load the headlines CSV from the Hugging Face Hub (hf:// path) into a DataFrame
df = pd.read_csv(
    "hf://datasets/valurank/News_headlines/final_headline_train_12000.csv"
)

# sanity checks: which columns are available and how many rows were read
print(df.columns)
print(df.shape[0])

# the topic model below is fitted on a plain Python list, so convert the
# 'headline' column and peek at the first few entries
docs = df["headline"].tolist()
print(docs[:5])

In [None]:
# --- sub-models that make up the BERTopic pipeline ---

# sentence-embedding model used to embed each headline
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

# dimensionality reduction: project the embeddings to 2 components using
# cosine distance; random_state is pinned to a fixed seed
dimension_reduction_model = UMAP(
    n_neighbors=15,
    n_components=2,
    metric="cosine",
    random_state=54382,
)

# density-based clustering of the reduced embeddings
clustering_model = HDBSCAN(
    min_cluster_size=15,
    min_samples=1,
    cluster_selection_epsilon=0.165,
)

# representation model used to describe each topic
representation_model = KeyBERTInspired()

# --- assemble the BERTopic pipeline from the pieces above ---
topic_model = BERTopic(
    embedding_model=embedding_model,
    umap_model=dimension_reduction_model,
    hdbscan_model=clustering_model,
    representation_model=representation_model,
)

# fit the pipeline on the headlines (as the cell's last expression, the
# returned object is also displayed)
topic_model.fit(docs)

In [None]:
# summary table of the fitted topics; leaving it as the last expression
# lets the notebook render it as the cell output
topic_info = topic_model.get_topic_info()
topic_info

In [None]:
# This could be a good dataset for exploring the hierarchical topic modeling bonus material.
# The trailing `?` is IPython/Jupyter help syntax (not plain Python): it shows the
# documentation for `topic_model.hierarchical_topics` instead of calling the method.
topic_model.hierarchical_topics?

In [None]:
# Define the linkage function and compute the topic hierarchy.
# `sch` (scipy.cluster.hierarchy) comes from the imports cell at the top of the notebook.
def linkage_function(x):
    """Build a single-linkage hierarchy from `x` with optimally ordered leaves.

    `x` is whatever BERTopic hands to its linkage callback -- presumably the
    distances between topics; confirm against the BERTopic documentation.
    Returns the linkage matrix produced by `scipy.cluster.hierarchy.linkage`.
    """
    return sch.linkage(x, "single", optimal_ordering=True)


# run the hierarchical topic model on the same documents the model was fitted on
hierarchical_topics = topic_model.hierarchical_topics(docs, linkage_function=linkage_function)

In [None]:
# build the hierarchy figure from the computed hierarchical topics and
# display it as the cell output
hierarchy_figure = topic_model.visualize_hierarchy(hierarchical_topics=hierarchical_topics)
hierarchy_figure

In [None]:
# render the topic hierarchy as a text tree and print it
tree = topic_model.get_topic_tree(hier_topics=hierarchical_topics)
print(tree)