# BERTopic Cluster Analysis Method

# Cell 1: Import Libraries

In [None]:
import os
from datetime import datetime
import pandas as pd
from bertopic import BERTopic


# Cell 2: Set Environment Variables

In [None]:
# Directory that holds the cluster CSV. This is a machine-specific absolute
# path, so it must be adjusted when the notebook is run elsewhere.
FILESPATH = "/home/tulipan16372/storage_NAS/Misc/Dani_Amaya/sentence-transformers/"
# File name of the CSV; it is joined onto FILESPATH and read with pandas below.
CLUSTERS_DATAFRAME_NAME = "20240820_Matt_df_cluster.csv"


# Cell 3: Load the Cluster DataFrame Containing the Documents and Cluster Assignments

In [None]:
# Build the full CSV path from the configured directory and file name
df_cluster_path = os.path.join(FILESPATH, CLUSTERS_DATAFRAME_NAME)
# Read the CSV into a DataFrame; later cells read its 'cluster' and
# 'documents' columns
df_cluster = pd.read_csv(df_cluster_path)

# Display the first few rows of the DataFrame to understand its structure
# (the preview is rendered because this is the last expression of the cell)
df_cluster.head()


# Cell 4: Initialize BERTopic Model

In [None]:
# Initialize the BERTopic model with its default settings (no arguments are
# passed). This single instance is reused by every fit_transform call in the
# cells below.
topic_model = BERTopic()


# Cell 5: Fit BERTopic Model on Documents for a Specific Cluster

In [None]:
# Cluster to analyze in this cell; change the value to inspect another cluster
cluster_number = 0  # Example: Cluster 0

# Select the 'documents' entries of the rows assigned to that cluster,
# discard missing values, and convert the result to a plain list
documents_in_cluster = (
    df_cluster.loc[df_cluster['cluster'] == cluster_number, 'documents']
    .dropna()
    .tolist()
)

# Fit the shared BERTopic model on this cluster's documents only
topics, probs = topic_model.fit_transform(documents_in_cluster)

# Retrieve and print the summary of the topics found for this cluster
topic_info = topic_model.get_topic_info()
print(f"Topic Information for Cluster {cluster_number}:\n", topic_info)


# Cell 6: Fit BERTopic Model on All Clusters and Display Results

In [None]:
# Fit the BERTopic model on each cluster's documents and display the topics
# found for that cluster.
# NOTE: the shared `topic_model` is refit on every iteration, so after the loop
# it reflects only the last cluster that was processed.
# Missing cluster labels are dropped first: a NaN label never compares equal to
# any row, so it would otherwise produce an empty document list.
for cluster in df_cluster['cluster'].dropna().unique():
    print(f"\nCluster {cluster}:")

    # Get the documents belonging to this cluster, and remove NaN values
    documents_in_cluster = (
        df_cluster.loc[df_cluster['cluster'] == cluster, 'documents']
        .dropna()
        .tolist()
    )

    # Nothing to model when every document in the cluster was NaN
    if not documents_in_cluster:
        print("No documents available for this cluster; skipping.")
        continue

    # Fit BERTopic model on the documents for this cluster
    topics, probs = topic_model.fit_transform(documents_in_cluster)

    # Display the topics generated for the cluster
    topic_info = topic_model.get_topic_info()
    print("Topic Information:\n", topic_info)
