# Summarize Cluster Documents Method

# Cell 1: Import Libraries

In [None]:
import os
from datetime import datetime
import pandas as pd
from transformers import pipeline


# Cell 2: Set Environment Variables

In [None]:
# Folder that holds the clustering output and the name of the CSV inside it.
# Both values can be overridden with an environment variable of the same name,
# so the notebook can run on another machine without editing this cell; the
# defaults are the original hard-coded locations.
FILESPATH = os.environ.get(
    "FILESPATH",
    "/home/tulipan16372/storage_NAS/Misc/Dani_Amaya/sentence-transformers/",
)
CLUSTERS_DATAFRAME_NAME = os.environ.get(
    "CLUSTERS_DATAFRAME_NAME",
    "20240820_Matt_df_cluster.csv",
)


# Cell 3: Load the Cluster DataFrame Containing the Documents and Cluster Assignments

In [None]:
# Build the CSV location from the configured folder and file name, then read it
df_cluster_path = os.path.join(FILESPATH, CLUSTERS_DATAFRAME_NAME)
df_cluster = pd.read_csv(filepath_or_buffer=df_cluster_path)

# Preview the top five rows to check which columns the file provides
df_cluster.head(n=5)


# Cell 4: Initialize the Summarization Model from Hugging Face Transformers

In [None]:
# Build the Hugging Face pipeline for the "summarization" task.
# No model name is passed, so the library selects the model itself.
summarizer = pipeline(task="summarization")


# Cell 5: Define Function to Summarize Documents for a Specific Cluster

In [None]:
def summarize_cluster(documents, max_length=130, chunk_size=512):
    """Summarize a cluster's documents chunk by chunk and join the results.

    The documents are joined into one text, the text is split into pieces of
    at most ``chunk_size`` characters, each piece is passed to the
    module-level ``summarizer`` pipeline, and the per-piece summaries are
    joined with single spaces.

    Args:
        documents: Iterable of document texts. ``None`` entries are skipped
            and other non-string entries are converted with ``str``.
        max_length: ``max_length`` value handed to the summarizer for every
            chunk.
        chunk_size: Maximum chunk length in characters (not tokens).

    Returns:
        The chunk summaries joined by spaces, or an empty string when the
        documents contain no non-whitespace text.

    Raises:
        ValueError: If ``chunk_size`` is smaller than 1.
    """
    import textwrap

    if chunk_size < 1:
        raise ValueError("chunk_size must be a positive number of characters")

    # Combine all documents in the cluster into a single text
    combined_text = ' '.join(str(doc) for doc in documents if doc is not None)

    # Split on whitespace so words are not cut in half at chunk borders;
    # only a single word longer than chunk_size is broken. Whitespace-only
    # text yields no chunks, so nothing empty is sent to the model.
    chunks = textwrap.wrap(combined_text, width=chunk_size, break_on_hyphens=False)

    # Keep min_length from exceeding a caller-supplied max_length
    min_length = min(30, max_length)

    # Summarize each chunk individually. truncation=True asks the pipeline to
    # cut inputs that exceed the model's input limit, which matters when a
    # large chunk_size is passed.
    summaries = [
        summarizer(
            chunk,
            max_length=max_length,
            min_length=min_length,
            do_sample=False,
            truncation=True,
        )[0]['summary_text']
        for chunk in chunks
    ]

    # Combine all chunk summaries into a final summary
    return ' '.join(summaries)


# Cell 6: Summarize and Display for a Specific Cluster

In [None]:
# Cluster id to inspect
cluster_number = 0  # Example: Cluster 0

# Select the 'documents' values of the rows assigned to that cluster, dropping NaN entries
in_cluster = df_cluster['cluster'] == cluster_number
documents_in_cluster = df_cluster.loc[in_cluster, 'documents'].dropna().tolist()

# Run the chunked summarization over the selected documents
summary = summarize_cluster(documents_in_cluster)

# Show the result
print(f"Summary for Cluster {cluster_number}: {summary}")


# Cell 7: Summarize and Display for First Few Clusters

In [None]:
# # Summarize and display for the first few clusters (e.g., the first 5 clusters)
# num_clusters_to_summarize = 5  # You can adjust this number to process more or fewer clusters

# for i, cluster in enumerate(df_cluster['cluster'].unique()):
#     if i >= num_clusters_to_summarize:
#         break
    
#     print(f"\nCluster {cluster}:")
    
#     # Get the documents belonging to the specified cluster, and remove NaN values
#     documents_in_cluster = df_cluster[df_cluster['cluster'] == cluster]['documents'].dropna().tolist()
    
#     # Summarize the documents in this cluster using the updated chunking method
#     summary = summarize_cluster(documents_in_cluster)
    
#     # Display the summary
#     print(f"Summary: {summary}")
