# Laws Dataset in BERTopic

This is a template for using the Laws dataset with BERTopic. Please try to build a notebook yourself before relying on this one! Note that this is a very basic template: you will likely want to change the model parameters, visualize your results, and iterate before you decide on a final model to share.

In [None]:
# imports
import pandas as pd
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
from umap import UMAP
from hdbscan import HDBSCAN
from bertopic.representation import KeyBERTInspired

In [None]:
# load the laws CSV, report its columns and row count, then pull the
# 'Title' column out as the list of documents to model
laws = pd.read_csv("../data/us_federal_laws.csv")
print(laws.columns)
print(laws.shape[0])
title_column = laws["Title"]
docs = title_column.tolist()
print(docs[0:5])

In [None]:
# define the starting-point model for each pipeline stage
# (these values are meant to be tuned for your own data)
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

# seed UMAP via random_state so reruns of this cell are repeatable
dimension_reduction_model = UMAP(
    n_components=2,
    n_neighbors=15,
    metric="cosine",
    random_state=54382,
)

# prediction_data=True makes HDBSCAN keep what it needs to assign clusters to
# documents it was not fitted on; without it, a later topic_model.transform()
# on new documents cannot use HDBSCAN's approximate prediction
clustering_model = HDBSCAN(
    min_cluster_size=15,
    min_samples=1,
    cluster_selection_epsilon=0.165,
    prediction_data=True,
)

representation_model = KeyBERTInspired()

# define topic model pipeline with BERTopic, wiring in each stage explicitly
topic_model = BERTopic(
    embedding_model=embedding_model,
    umap_model=dimension_reduction_model,
    hdbscan_model=clustering_model,
    representation_model=representation_model,
)

# fit the model to the docs
topic_model.fit(docs)

In [None]:
# display the topic overview returned by the fitted model
topic_info = topic_model.get_topic_info()
topic_info

In [None]:
# this could be a good dataset to explore the dynamic topic modeling bonus material...
# show the documentation for topics_over_time; help() is used instead of the
# IPython-only trailing "?" so this line is also valid in plain Python
help(topic_model.topics_over_time)

In [None]:
# collect each law's passage date (from the original dataset) and compute
# the topic representations over time, grouped into 20 bins
dates = laws["date_of_passage"].tolist()
topics_over_time = topic_model.topics_over_time(
    docs,
    dates,
    nr_bins=20,
    datetime_format="%Y-%m-%d",
)

In [None]:
# build the topics-over-time figure, limited to the top 20 topics, and display it
over_time_figure = topic_model.visualize_topics_over_time(
    topics_over_time,
    top_n_topics=20,
)
over_time_figure