# Named Entity Recognition Method (NER)

# Cell 1: Import Libraries

In [None]:
import os
from datetime import datetime
import pandas as pd
from transformers import pipeline
from collections import Counter


# Cell 2: Set Environment Variables

In [None]:
# Directory that holds the input files. Set the NER_FILESPATH environment
# variable to run the notebook on another machine without editing this cell;
# the original location is kept as the default.
FILESPATH = os.environ.get(
    "NER_FILESPATH",
    "/home/tulipan16372/storage_NAS/Misc/Dani_Amaya/sentence-transformers/",
)
# File name of the cluster CSV inside FILESPATH. Set
# NER_CLUSTERS_DATAFRAME_NAME to point at a different file.
CLUSTERS_DATAFRAME_NAME = os.environ.get(
    "NER_CLUSTERS_DATAFRAME_NAME",
    "20240820_Matt_df_cluster.csv",
)


# Cell 3: Load the Cluster DataFrame Containing the Documents and Cluster Assignments

In [None]:
# Build the full location of the cluster CSV from the configured directory
# and file name, then read it into a DataFrame.
df_cluster_path = os.path.join(FILESPATH, CLUSTERS_DATAFRAME_NAME)
df_cluster = pd.read_csv(filepath_or_buffer=df_cluster_path)

# Preview the first five rows to check which columns are available
df_cluster.head(n=5)


# Cell 4: Initialize the Named Entity Recognition Model from Hugging Face Transformers

In [None]:
# Load the named entity recognition pipeline from Hugging Face.
# aggregation_strategy="simple" merges consecutive word-piece tokens that
# belong to the same entity, so each result's 'word' is a whole entity span
# instead of a fragment such as "##son". Results still carry the 'word' and
# 'score' keys that the rest of the notebook reads.
ner_pipeline = pipeline("ner", aggregation_strategy="simple")


# Cell 5: Define Function to Extract Entities for a Specific Cluster

In [None]:
def extract_entities(documents, min_score=0.9, top_n=10, ner=None):
    """Return the most frequent high-confidence named entities in *documents*.

    Args:
        documents: Iterable of document texts. Entries that are not strings,
            or that contain only whitespace, are skipped.
        min_score: Confidence threshold; a prediction is kept only when its
            ``'score'`` is strictly greater than this value.
        top_n: How many of the most frequent entities to return. ``None``
            returns every counted entity.
        ner: Callable that takes one text and returns a list of dicts with
            ``'word'`` and ``'score'`` keys. Defaults to the module-level
            ``ner_pipeline``.

    Returns:
        A list of ``(word, count)`` tuples ordered from most to least
        frequent.
    """
    if ner is None:
        # Resolved at call time so the pipeline cell may be re-run freely.
        ner = ner_pipeline

    counts = Counter()
    for doc in documents:
        # Guard against stray non-text cells (e.g. numbers) and empty rows,
        # which carry no entities and are not valid model input.
        if not isinstance(doc, str) or not doc.strip():
            continue
        # Filter for high-confidence entities and tally them as we go.
        counts.update(
            entity['word'] for entity in ner(doc) if entity['score'] > min_score
        )

    # Get the most common entities
    return counts.most_common(top_n)


# Cell 6: Extract and Display Entities for a Specific Cluster

In [None]:
# Choose the cluster to inspect
cluster_number = 0  # Example: Cluster 0

# Select the rows of that cluster, then pull out their non-missing documents
in_selected_cluster = df_cluster['cluster'] == cluster_number
documents_in_cluster = (
    df_cluster.loc[in_selected_cluster, 'documents']
    .dropna()
    .tolist()
)

# Run entity extraction over the selected documents
top_entities = extract_entities(documents_in_cluster)

# Report the result
print(f"Top Entities in Cluster {cluster_number}: {top_entities}")


# Cell 7: Extract and Display Entities for All Clusters

In [None]:
# Report the top entities of every cluster, in order of first appearance
for cluster in df_cluster['cluster'].unique():
    print(f"\nCluster {cluster}:")

    # Select the rows of this cluster and keep their non-missing documents
    in_this_cluster = df_cluster['cluster'] == cluster
    documents_in_cluster = (
        df_cluster.loc[in_this_cluster, 'documents']
        .dropna()
        .tolist()
    )

    # Run entity extraction over this cluster's documents
    top_entities = extract_entities(documents_in_cluster)

    # Report the result
    print(f"Top Entities: {top_entities}")
