In [34]:
import pandas as pd

In [38]:
# Topics to load, in processing order. Entries that are commented out are not
# part of the list at runtime -- presumably topics handled in an earlier run;
# confirm before re-enabling any of them.
subtopics = [
    # "Passage Retrieval",
    # "Chunking",
    # "Text Error Correction",
    # "Named Entity Recognition",
    # "Text Normalization",
    # "Dialogue Systems & Conversational Agents",
    # "Psycholinguistics",
    # "Machine Translation",
    # "Relation Extraction",
    # "Captioning",
    # "Emotion Analysis",
    # "Opinion Mining",
    # "Knowledge Representation",
    # "Language Models",
    # "Text Complexity",
    # "Open Information Extraction",
    # "Semantic Search",
    # "Cross-Lingual Transfer",
    # "Linguistic Theories",
    # "Tagging",
    # "Code Generation",
    # "Fact & Claim Verification",
    # "Commonsense Reasoning",
    # "Aspect-based Sentiment Analysis",
    # "Speech Recognition",
    # "Coreference Resolution",
    # "Speech & Audio in NLP",
    # "Low-Resource NLP",
    # "Machine Reading Comprehension",
    # "Question Generation",
    # "Term Extraction",
    # "Event Extraction",
    # "Text Classification",
    # "Question Answering",
    # "Cognitive Modeling",
    # "Stylistic Analysis",
    # "Discourse & Pragmatics",
    # "Code-Switching",
    # "Document Retrieval",
    # "Data-to-Text Generation",
    # "Programming Languages in NLP",
    # "Semantic Similarity",
    # "Word Sense Disambiguation",
    # "Dialogue Response Generation",
    # "Ethical NLP",
    # "Text Segmentation",
    # "Typology",
    # "Argument Mining",
    # "Morphology",
    # "Textual Inference",
    "Responsible & Trustworthy NLP",
    "Text Clustering",
    "Knowledge Graph Reasoning",
    "Representation Learning",
    "Structured Data in NLP",
    "Intent Recognition",
    "Summarization",
    "Paraphrasing",
    "Green & Sustainable NLP",
    "Visual Data in NLP",
    "Explainability & Interpretability in NLP",
    "Numerical Reasoning",
    "Semantic Parsing",
    "Robustness in NLP",
    "Indexing",
    "Phonology",
    "Phonetics",
    "Syntactic Parsing",
    "Topic Modeling",
    "Polarity Analysis",
    "Text Style Transfer",
]

In [41]:
import os

from neo4j import GraphDatabase
import pandas as pd

class Neo4jApp:
    """Load a hierarchical clustering of publications into Neo4j.

    An instance owns one driver. Call :meth:`close` when finished, or use the
    instance as a context manager (``with Neo4jApp() as app: ...``).
    """

    def __init__(self):
        """Create the driver from the ``uri``, ``user`` and ``password`` environment variables."""
        # NOTE(review): the fallbacks embed a password in the notebook; prefer
        # setting the environment variables over relying on these defaults.
        uri = os.getenv("uri", "neo4j://0.0.0.0:7687")
        user = os.getenv("user", "neo4j")
        password = os.getenv("password", "neo4j-connect")
        self.driver = GraphDatabase.driver(uri, auth=(user, password))

    def close(self):
        """Close the underlying driver."""
        self.driver.close()

    def __enter__(self):
        """Return the instance so it can be used in a ``with`` statement."""
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Close the driver on leaving the ``with`` block; exceptions propagate."""
        self.close()

    def insert_clusters(self, topic_name, df):
        """Write the cluster hierarchy held in ``df`` for one topic.

        Args:
            topic_name: Label of the ``FieldOfStudy`` node the clusters belong to.
            df: DataFrame with an ``id`` column plus, for every level ``k`` from
                0 up to the deepest one, ``cluster_level_k`` and
                ``cluster_tag_level_k`` columns.

        Raises:
            ValueError: If ``df`` has no ``cluster_level_<n>`` column.
        """
        print(f"Inserting clusters for topic {topic_name}...")

        # Derive the deepest level from the column names. This is done before
        # opening a session so that a malformed frame does not leave a
        # half-written topic behind.
        levels = [
            int(col.split('_')[-1])
            for col in df.columns
            if col.startswith('cluster_level_')
        ]
        if not levels:
            raise ValueError("df has no 'cluster_level_<n>' columns")
        highest_level = max(levels)

        with self.driver.session() as session:
            session.execute_write(self._confirm_topic, topic_name)
            # Clusters, their hierarchy and the publication links are written
            # in a single transaction.
            session.execute_write(self._insert_clusters, topic_name, df, highest_level)

    @staticmethod
    def _confirm_topic(tx, topic_name):
        """Make sure a ``FieldOfStudy`` node labelled ``topic_name`` exists."""
        topic_query = """
        MERGE (topic:FieldOfStudy {label: $topic_name})
        """
        tx.run(topic_query, topic_name=topic_name)

    @staticmethod
    def _cluster_id(topic_name, value):
        """Build the ``Cluster.id`` string for one cell of a ``cluster_level_*`` column.

        Strings and every non-iterable scalar (Python or numpy integers,
        floats) are appended as-is; any other iterable (tuple, list) has its
        elements joined with ``_``.
        """
        if isinstance(value, str) or not hasattr(value, "__iter__"):
            return f"{topic_name}_{value}"
        return f"{topic_name}_" + '_'.join(map(str, value))

    @staticmethod
    def _is_missing(value):
        """Return True when a cell holds no cluster (``None`` or a NaN-like scalar)."""
        if value is None:
            return True
        if hasattr(value, "__iter__"):
            # Strings, tuples and lists are real values; pd.isna would return
            # an array for sequences, so they are excluded up front.
            return False
        return bool(pd.isna(value))

    @staticmethod
    def _insert_clusters(tx, topic_name, df, highest_level):
        """Transaction function: write cluster nodes, hierarchy and publication links.

        Args:
            tx: Transaction object exposing ``run(query, **params)``.
            topic_name: Label of the already existing ``FieldOfStudy`` node.
            df: Same frame that was passed to :meth:`insert_clusters`.
            highest_level: Deepest cluster level present in ``df``.

        Rows whose cluster value is missing at a level are skipped for the
        relationship and publication phases, mirroring the ``dropna()`` used
        when the cluster nodes themselves are created.
        """
        make_id = Neo4jApp._cluster_id
        is_missing = Neo4jApp._is_missing

        # The root-only link is appended in Python instead of filtering with
        # `WHERE $level = 0` inside the query: that filter removed every row
        # for deeper levels, so HAS_FIELD_OF_STUDY was never created for them.
        node_query = """
        MERGE (cluster:Cluster {id: $cluster_id})
        SET cluster.level = $level, cluster.tag = $cluster_tag

        // Every cluster points back at its field of study
        WITH cluster
        MATCH (topic:FieldOfStudy {label: $topic_name})
        MERGE (cluster)-[:HAS_FIELD_OF_STUDY]->(topic)
        """
        root_link = """
        // Root-level clusters are additionally reachable from the topic
        MERGE (topic)-[:HAS_CLUSTER]->(cluster)
        """

        # Phase 1: one Cluster node per distinct (cluster, tag) pair and level.
        for level in range(highest_level + 1):
            cluster_col = f'cluster_level_{level}'
            cluster_tag_col = f'cluster_tag_level_{level}'
            clusters = df[[cluster_col, cluster_tag_col]].drop_duplicates().dropna()
            query = node_query + root_link if level == 0 else node_query
            for _, row in clusters.iterrows():
                tx.run(
                    query,
                    cluster_id=make_id(topic_name, row[cluster_col]),
                    level=level,
                    cluster_tag=row[cluster_tag_col],
                    topic_name=topic_name,
                )

        # Phase 2: parent/child links between consecutive levels.
        hierarchy_query = (
            "MATCH (parent:Cluster {id: $parent_cluster_id}), "
            "(child:Cluster {id: $child_cluster_id}) "
            "MERGE (parent)-[:HAS_SUBCLUSTER]->(child) "
            "MERGE (child)-[:HAS_PARENT_CLUSTER]->(parent)"
        )
        for level in range(highest_level):
            parent_col = f'cluster_level_{level}'
            child_col = f'cluster_level_{level + 1}'

            # Many publications share the same pair; a dict keeps first-seen
            # order while sending each pair to the database only once.
            pairs = {}
            for parent, child in zip(df[parent_col], df[child_col]):
                if is_missing(parent) or is_missing(child):
                    continue
                pairs[(make_id(topic_name, parent), make_id(topic_name, child))] = None

            for parent_cluster_id, child_cluster_id in pairs:
                tx.run(
                    hierarchy_query,
                    parent_cluster_id=parent_cluster_id,
                    child_cluster_id=child_cluster_id,
                )

        # Phase 3: attach every publication to its deepest-level cluster.
        # The `id` column is matched against Neo4j's internal node id.
        publication_query = (
            "MATCH (p:Publication), (cluster:Cluster {id: $cluster_id}) "
            "WHERE ID(p) = $publication_id "
            "MERGE (p)-[:BELONGS_TO]->(cluster) "
            "MERGE (cluster)-[:HAS_PUBLICATION]->(p)"
        )
        lowest_cluster_col = f'cluster_level_{highest_level}'
        for raw_id, cluster_value in zip(df['id'], df[lowest_cluster_col]):
            if is_missing(cluster_value):
                continue
            tx.run(
                publication_query,
                publication_id=int(raw_id),
                cluster_id=make_id(topic_name, cluster_value),
            )

In [None]:
# Folder expected to hold one `clustered_<topic>.csv` file per subtopic.
directory = './output'

# A single driver serves every topic; try/finally guarantees it is closed even
# when reading a CSV or writing to the database fails part-way through.
app = Neo4jApp()
try:
    for topic in subtopics:
        # File names use the lower-cased topic with spaces replaced by underscores.
        formatted_topic = topic.replace(' ', '_').lower()
        df_loaded = pd.read_csv(os.path.join(directory, 'clustered_'+formatted_topic+'.csv'))
        app.insert_clusters(topic, df_loaded)
finally:
    app.close()