# Topic Modeling Method (Cluster Summary Based on Keywords)

# Cell 1: Import Libraries

In [None]:
import os
import pandas as pd
from datetime import datetime
from sklearn.feature_extraction.text import TfidfVectorizer
from collections import Counter


# Cell 2: Set Environment Variables

In [None]:
# Settings are read from the process environment, with placeholder defaults.
# FILESPATH: presumably the folder that holds the input files (confirm).
FILESPATH = os.getenv("FILESPATH", "/path/to/your/test/folder/")
# CLUSTERS_DATAFRAME_NAME: file name of the cluster CSV.
CLUSTERS_DATAFRAME_NAME = os.getenv("CLUSTERS_DATAFRAME_NAME", "df_cluster.csv")


# Cell 3: Load the Cluster DataFrame Containing the Documents and Cluster Assignments

In [None]:
# Build the CSV location from the configured folder and file name
# (FILESPATH / CLUSTERS_DATAFRAME_NAME) rather than a machine-specific
# absolute path, so the notebook runs wherever those settings point.
df_cluster_path = os.path.join(FILESPATH, CLUSTERS_DATAFRAME_NAME)

# Load the cluster DataFrame containing the documents and cluster assignments
df_cluster = pd.read_csv(df_cluster_path)

# Display the first few rows of the DataFrame to understand its structure
df_cluster.head()


# Cell 4: Function to Extract the Top Keywords from Documents in a Given Cluster Using TF-IDF

In [None]:
def extract_keywords(documents, top_n=10):
    """Return the terms that appear in the most documents of a cluster.

    The documents are vectorized with TF-IDF (English stop words removed,
    vocabulary capped at 10,000 features). Each term is then ranked by the
    number of documents in which it has a non-zero weight; the TF-IDF weights
    themselves are not used for the ranking.

    Args:
        documents: Iterable of document strings belonging to one cluster.
        top_n: Maximum number of (term, count) pairs to return.

    Returns:
        A list of ``(term, document_count)`` tuples, most frequent first.
        The list is empty when there are no documents, or when no term
        survives stop-word removal.

    Raises:
        ValueError: Propagated from the vectorizer for invalid input other
            than an empty vocabulary.
    """
    # A bare string is passed through untouched so the vectorizer can reject
    # it exactly as before; any other iterable is materialized so that an
    # empty one (including an exhausted generator) can be detected up front.
    if not isinstance(documents, str):
        documents = list(documents)
        if not documents:
            return []

    # Initialize TF-IDF Vectorizer
    vectorizer = TfidfVectorizer(stop_words='english', max_features=10000)
    try:
        X = vectorizer.fit_transform(documents)
    except ValueError as err:
        # scikit-learn reports "empty vocabulary" when every document consists
        # only of stop words; that simply means there are no keywords. Any
        # other ValueError is a genuine input problem and must surface.
        if "empty vocabulary" not in str(err):
            raise
        return []
    feature_names = vectorizer.get_feature_names_out()

    # Count, for every term, the number of documents in which it occurs
    word_counts = Counter()
    for doc_vector in X:
        # nonzero()[1] holds the column (term) indices present in this row
        word_counts.update(feature_names[i] for i in doc_vector.nonzero()[1])

    # Return the most common words in the cluster
    return word_counts.most_common(top_n)


# Cell 5: Analyze and Display the Top 10 Keywords for a Specific Cluster

In [None]:
# Cluster to inspect; change this value to look at a different cluster
cluster_number = 0  # Example: Cluster 0

# Select the rows of that cluster and keep only the non-missing documents
in_selected_cluster = df_cluster['cluster'] == cluster_number
documents_in_cluster = (
    df_cluster.loc[in_selected_cluster, 'documents']
    .dropna()
    .tolist()
)

# Rank the cluster's terms and keep the ten most frequent
top_keywords = extract_keywords(documents_in_cluster, top_n=10)

# Show the result
print(f"Top 10 Keywords in Cluster {cluster_number}: {top_keywords}")


# Cell 6: Analyze and Display the Top Keywords for All Clusters

In [None]:
# Analyze and display the top keywords for all clusters
for cluster in df_cluster['cluster'].unique():
    print(f"\nCluster {cluster}:")

    # Get the documents belonging to this cluster, and remove NaN values
    documents_in_cluster = df_cluster[df_cluster['cluster'] == cluster]['documents'].dropna().tolist()

    # A label can end up with no usable documents (e.g. a NaN label never
    # matches the equality filter, or every document of the cluster is NaN).
    # Skip it so one empty cluster does not abort the whole loop.
    if not documents_in_cluster:
        print("No documents available for this cluster; skipping.")
        continue

    # Extract the top 10 keywords in this cluster
    top_keywords = extract_keywords(documents_in_cluster, top_n=10)

    # Display the top keywords
    print(f"Top 10 Keywords: {top_keywords}")
