In [None]:
import pandas as pd

In [None]:
# Flat list of NLP taxonomy topic labels.
# This name is not referenced by any other cell visible in this notebook.
# NOTE(review): 'Paraphrassing' (double "s") sits right before 'Paraphrasing' and
# looks like a typo duplicate — confirm whether the misspelled label is needed to
# match stored data before removing it.
topics = ['Speech & Audio in NLP', 'Multimodality', 'Visual Data in NLP',
                  'Structured Data in NLP', 'Programming Languages in NLP',
                  'Natural Language Interfaces', 'Question Answering',
                  'Dialogue Systems & Conversational Agents', 'Semantic Text Processing',
                  'Discourse & Pragmatics', 'Representation Learning', 'Knowledge Representation',
                  'Text Complexity', 'Semantic Search', 'Word Sense Disambiguation',
                  'Semantic Parsing', 'Language Models', 'Semantic Similarity',
                  'Sentiment Analysis', 'Opinion Mining', 'Stylistic Analysis',
                  'Intent Recognition', 'Emotion Analysis', 'Aspect-based Sentiment Analysis',
                  'Polarity Analysis', 'Syntactic Text Processing', 'Tagging', 'Morphology',
                  'Chunking', 'Phonology', 'Text Error Correction', 'Text Segmentation',
                  'Typology', 'Syntactic Parsing', 'Phonetics', 'Text Normalization',
                  'Linguistics & Cognitive NLP', 'Linguistic Theories', 'Cognitive Modeling',
                  'Psycholinguistics', 'Responsible & Trustworthy NLP', 'Responsible NLP',
                  'Ethical NLP', 'Low-Resource NLP', 'Robustness in NLP', 'Green & Sustainable NLP',
                  'Explainability & Interpretability in NLP', 'Reasoning', 'Textual Inference',
                  'Commonsense Reasoning', 'Numerical Reasoning', 'Knowledge Graph Reasoning',
                  'Machine Reading Comprehension', 'Fact & Claim Verification', 'Argument Mining',
                  'Multilinguality', 'Cross-Lingual Transfer', 'Machine Translation',
                  'Code-Switching', 'Information Retrieval', 'Indexing', 'Document Retrieval',
                  'Text Classification', 'Passage Retrieval', 'Information Extraction & Text Mining',
                  'Coreference Resolution', 'Text Clustering', 'Named Entity Recognition',
                  'Event Extraction', 'Open Information Extraction', 'Term Extraction',
                  'Topic Modeling', 'Relation Extraction', 'Text Generation',
                  'Data-to-Text Generation', 'Question Generation', 'Dialogue Response Generation',
                  'Captioning', 'Paraphrassing', 'Paraphrasing', 'Text Style Transfer',
                  'Code Generation', 'Summarization', 'Speech Recognition']

In [None]:
# Topic labels processed by the export loop in the last cell of this notebook:
# each label is passed to Neo4jApp.get_articles_in_topic and also determines the
# name of the CSV file written for it.
# NOTE(review): the entries are in no obvious order — presumably a dump of a set;
# the order only affects the order in which topics are processed.
subtopics = ['Passage Retrieval',
 'Chunking',
 'Text Error Correction',
 'Named Entity Recognition',
 'Text Normalization',
 'Dialogue Systems & Conversational Agents',
 'Psycholinguistics',
 'Machine Translation',
 'Relation Extraction',
 'Captioning',
 'Emotion Analysis',
 'Opinion Mining',
 'Knowledge Representation',
 'Language Models',
 'Text Complexity',
 'Open Information Extraction',
 'Semantic Search',
 'Cross-Lingual Transfer',
 'Linguistic Theories',
 'Tagging',
 'Code Generation',
 'Fact & Claim Verification',
 'Commonsense Reasoning',
 'Aspect-based Sentiment Analysis',
 'Speech Recognition',
 'Coreference Resolution',
 'Speech & Audio in NLP',
 'Low-Resource NLP',
 'Machine Reading Comprehension',
 'Question Generation',
 'Term Extraction',
 'Event Extraction',
 'Text Classification',
 'Question Answering',
 'Cognitive Modeling',
 'Stylistic Analysis',
 'Discourse & Pragmatics',
 'Code-Switching',
 'Document Retrieval',
 'Data-to-Text Generation',
 'Programming Languages in NLP',
 'Semantic Similarity',
 'Word Sense Disambiguation',
 'Dialogue Response Generation',
 'Ethical NLP',
 'Text Segmentation',
 'Typology',
 'Argument Mining',
 'Morphology',
 'Textual Inference',
 'Responsible & Trustworthy NLP',
 'Text Clustering',
 'Knowledge Graph Reasoning',
 'Representation Learning',
 'Structured Data in NLP',
 'Intent Recognition',
 'Summarization',
 'Paraphrasing',
 'Green & Sustainable NLP',
 'Visual Data in NLP',
 'Explainability & Interpretability in NLP',
 'Numerical Reasoning',
 'Semantic Parsing',
 'Robustness in NLP',
 'Indexing',
 'Phonology',
 'Phonetics',
 'Syntactic Parsing',
 'Topic Modeling',
 'Polarity Analysis',
 'Text Style Transfer']

In [None]:
import os

from neo4j import GraphDatabase
import pandas as pd

class Neo4jApp:
    """Read-only helper around a Neo4j driver for the NLP taxonomy graph.

    The queries below touch two node labels: ``FieldOfStudy`` (properties
    ``label``, ``description``, ``level``) and ``Publication`` (properties
    ``publicationTitle``, ``publicationAbstract``, ``embedding``, ``tldr``).

    Every public ``get_*`` method opens a session, runs the matching private
    ``_get_*`` transaction function through ``session.execute_read`` and returns
    whatever that function produced (a message string, an int, or a DataFrame).

    The instance can be used as a context manager so the driver is closed
    automatically::

        with Neo4jApp() as app:
            articles = app.get_all_articles()
    """

    # Column order shared by every publication DataFrame returned by this class.
    _ARTICLE_COLUMNS = ['id', 'title', 'embedding', 'abstract', 'tldr']

    # Message returned when a topic label does not match any FieldOfStudy node.
    _UNKNOWN_TOPIC_MESSAGE = "I don't know this topic. Try to search for topics related to Natural Language Processing. You could also ask for the taxonomy."

    def __init__(self):
        # Connection settings are read from the environment variables
        # "uri", "user" and "password"; the literals are the fallbacks.
        # NOTE(review): a fallback password committed to source is a credential
        # hygiene risk — prefer requiring the environment variable.
        uri = os.getenv("uri", "neo4j://0.0.0.0:7687")
        user = os.getenv("user", "neo4j")
        password = os.getenv("password", "neo4j-connect")
        self.driver = GraphDatabase.driver(uri, auth=(user, password))

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()

    def close(self):
        """Close the underlying driver."""
        self.driver.close()

    def _read(self, work, *args):
        """Run transaction function ``work(tx, *args)`` in a read transaction."""
        with self.driver.session() as session:
            return session.execute_read(work, *args)

    def get_children_nodes(self, topic_name):
        """Return a message listing the subtopics of ``topic_name``."""
        return self._read(self._get_children_nodes, topic_name)

    def get_parent_nodes(self, topic_name):
        """Return a message listing the parent topics of ``topic_name``."""
        return self._read(self._get_parent_nodes, topic_name)

    def get_topic_definition(self, topic_name):
        """Return a message with the stored description of ``topic_name``."""
        return self._read(self._get_topic_definition, topic_name)

    # Just for back up if we want to be more dynamic
    def get_nlp_taxonomy(self):
        """Return the first two taxonomy levels as a bulleted string."""
        return self._read(self._get_nlp_taxonomy)

    def get_total_nodes(self):
        """Return the number of nodes in the graph."""
        return self._read(self._get_total_nodes)

    def get_articles_in_topic(self, topic_name):
        """Return a DataFrame of the publications linked to ``topic_name``."""
        return self._read(self._get_articles_in_topic, topic_name)

    def get_all_articles(self):
        """Return a DataFrame of every publication."""
        return self._read(self._get_all_articles)

    def get_all_topics(self):
        """Return a one-column DataFrame (``topic_name``) of all topic labels."""
        return self._read(self._get_all_topics)

    @staticmethod
    def _records_to_frame(result, columns):
        """Materialise query records into a DataFrame with a fixed column order."""
        return pd.DataFrame([dict(record) for record in result], columns=columns)

    @staticmethod
    def _topic_exists(tx, topic_name):
        """Return True when at least one FieldOfStudy node carries this label."""
        query = (
            "MATCH (n:FieldOfStudy {label: $topic_name }) "
            "RETURN count(n) AS matches "
        )
        return tx.run(query, topic_name=topic_name).single()[0] > 0

    @staticmethod
    def _describe_related(tx, topic_name, labels, heading, noun):
        """Format the answer for a children/parents lookup.

        ``labels`` are the related topic labels, ``heading`` is the capitalised
        word used in the positive answer and ``noun`` the word used in the
        "doesn't have ..." answer.
        """
        if labels:
            bullets = ('\n  - ').join(labels)
            return f"{heading} of {topic_name} are \n- {bullets}. \nYou can ask me for the definitions of these terms."
        # No related nodes: tell an unknown topic apart from a known topic
        # without relatives. An empty result alone cannot distinguish the two,
        # so the existence of the topic is checked explicitly.
        if not Neo4jApp._topic_exists(tx, topic_name):
            return Neo4jApp._UNKNOWN_TOPIC_MESSAGE
        return f"{topic_name} apparently doesn't have {noun}."

    @staticmethod
    def _get_total_nodes(tx):
        query = (
            "MATCH (n) "
            "RETURN count(n) as total_nodes "
        )
        return tx.run(query).single()[0]

    @staticmethod
    def _get_topic_definition(tx, topic_name: str):
        query = (
            "MATCH (n:FieldOfStudy {label: $topic_name }) "
            "RETURN n.description as description "
            "LIMIT 100 "
        )
        rows = tx.run(query, topic_name=topic_name)
        # Use the stored text verbatim. Slicing the repr of the list instead
        # would drop the last character, turn a null description into "o" and
        # leak escape sequences into the answer.
        descriptions = [str(row[0]).strip() for row in rows if row[0] is not None]
        descriptions = [text for text in descriptions if text]
        if not descriptions:
            return f"We don't have an available definition for {topic_name}. Try to search for topics related to Natural Language Processing. You could also ask for the taxonomy."
        topic_definition = "\n".join(descriptions)
        return f"The available definition of {topic_name} is: \n{topic_definition}"

    @staticmethod
    def _get_children_nodes(tx, topic_name):
        query = (
            "MATCH (n:FieldOfStudy {label: $topic_name }) -[]->(m:FieldOfStudy) "
            "WHERE n.level < m.level "
            "RETURN m.label "
            "LIMIT 100 "
        )
        # Null/empty labels are skipped; they cannot be joined into the answer.
        labels = [row[0] for row in tx.run(query, topic_name=topic_name) if row[0]]
        return Neo4jApp._describe_related(tx, topic_name, labels, "Subtopics", "subtopics")

    @staticmethod
    def _get_parent_nodes(tx, topic_name):
        query = (
            "MATCH (n:FieldOfStudy {label: $topic_name }) -[]->(m:FieldOfStudy) "
            "WHERE n.level > m.level "
            "RETURN m.label "
            "LIMIT 100 "
        )
        # Null/empty labels are skipped; they cannot be joined into the answer.
        labels = [row[0] for row in tx.run(query, topic_name=topic_name) if row[0]]
        return Neo4jApp._describe_related(tx, topic_name, labels, "Parents", "parents")

    @staticmethod
    def _get_articles_in_topic(tx, topic_name):
        query = (
            "MATCH (n:FieldOfStudy {label: $topic_name }) -[:IS_STUDIED_IN]->(p:Publication) "
            "RETURN ID(p) AS id, p.publicationTitle AS title, p.embedding AS embedding, p.publicationAbstract AS abstract, p.tldr AS tldr "
        )
        results = tx.run(query, topic_name=topic_name)
        return Neo4jApp._records_to_frame(results, Neo4jApp._ARTICLE_COLUMNS)

    @staticmethod
    def _get_all_topics(tx):
        query = (
            "MATCH (n:FieldOfStudy) "
            "RETURN n.label AS topic_name "
        )
        return Neo4jApp._records_to_frame(tx.run(query), ['topic_name'])

    @staticmethod
    def _get_all_articles(tx):
        # The id is returned explicitly; without it the "id" column of the
        # resulting DataFrame would contain only missing values.
        query = (
            "MATCH (p:Publication) "
            "RETURN ID(p) AS id, p.publicationTitle AS title, p.embedding AS embedding, p.publicationAbstract AS abstract, p.tldr AS tldr "
        )
        return Neo4jApp._records_to_frame(tx.run(query), Neo4jApp._ARTICLE_COLUMNS)

    # Just for back up if we want to be more dynamic
    @staticmethod
    def _get_nlp_taxonomy(tx):
        bullet_point0 = '\n   -- '
        bullet_point1 = '\n-- '
        level0 = 'Natural Language Processing'
        level1 = tx.run("match (n:FieldOfStudy {label: 'Natural Language Processing'})-[]->(m:FieldOfStudy) where m.level = [1] return m.label limit 100")
        # Consume the first result completely before issuing further queries
        # on the same transaction.
        level1 = [row[0] for row in level1]
        for i, topic in enumerate(level1):
            subtopics_level_2 = tx.run("match (n:FieldOfStudy {label: $topic})-[]->(m:FieldOfStudy) where m.level = [2] return m.label limit 100", topic=topic)
            level1[i] += (bullet_point0 + '\n   -- '.join([row[0] for row in subtopics_level_2]))
        taxonomy = ('- ' + level0 + '\n '+ bullet_point1 + ('\n-- ').join(level1))
        # The answer is capped at 1000 characters.
        return taxonomy[0:1000]

In [None]:
def fix_embedding(df):
    """Convert the ``embedding`` column to lists of floats, in place.

    String values such as ``"[0.1, 0.2]"`` are parsed; ``"[]"`` (or a blank
    string) becomes an empty list. Values that are already sequences of
    numbers are converted element-wise, so calling the function on an already
    fixed frame is harmless.

    Args:
        df: DataFrame with an ``embedding`` column. The column is overwritten.

    Returns:
        The same DataFrame object, for call chaining.

    Raises:
        ValueError: if a string component cannot be parsed as a float.
        TypeError: if a value is neither a string nor iterable (e.g. NaN).
    """
    def parse(value):
        if isinstance(value, str):
            # Drop the surrounding brackets, then skip blank pieces so that
            # "[]" or a trailing comma does not reach float('').
            pieces = value.strip().strip('[]').split(',')
            return [float(piece) for piece in pieces if piece.strip()]
        return [float(item) for item in value]

    df['embedding'] = df['embedding'].apply(parse)
    return df

In [None]:
import sklearn.cluster as cl
import sklearn.metrics as met
import numpy as np
import pandas as pd

def embed(df, distance_threshold=50, max_cluster_size=10, decay_rate=0.40, base=0.5):
    """Hierarchically cluster the rows of ``df`` by their ``embedding`` vectors.

    Level 0 is an agglomerative (Ward) clustering cut at ``distance_threshold``.
    Every further level ``k`` re-clusters each cluster holding more than
    ``max_cluster_size`` rows with a smaller threshold and stores the result in
    ``cluster_level_{k}`` as the tuple ``(parent_cluster_id, sub_label)``;
    clusters that are small enough get sub label 0. Levels are added until one
    of them produces no split at all.

    Args:
        df: DataFrame with an ``embedding`` column of equal-length numeric
            sequences. It is not modified; a copy is returned.
        distance_threshold: linkage distance used for level 0.
        max_cluster_size: clusters with at most this many rows are not split.
        decay_rate: level 1 uses ``distance_threshold * decay_rate``; from then
            on the threshold is multiplied by ``decay_rate ** (base ** level)``.
        base: see ``decay_rate``.

    Returns:
        A copy of ``df`` with the added ``cluster_level_*`` columns.
    """
    def fit_cluster(X, threshold):
        # Agglomerative clustering needs at least two samples; a topic with
        # zero or one article simply forms a single cluster.
        if len(X) < 2:
            return np.zeros(len(X), dtype=int)
        hier = cl.AgglomerativeClustering(n_clusters=None, distance_threshold=threshold, linkage='ward')
        hier.fit(X)
        return hier.labels_

    def subcluster(df, cluster_col, level, max_cluster_size, current_distance):
        new_cluster_col = f'cluster_level_{level}'
        # Labels are written by row position, so the result does not depend on
        # the frame having a unique index.
        new_labels = np.empty(len(df), dtype=object)

        for cluster_id in df[cluster_col].unique():
            positions = np.flatnonzero((df[cluster_col] == cluster_id).to_numpy())
            if len(positions) <= max_cluster_size:
                sub_labels = [0] * len(positions)
            else:
                X_sub = np.array(df['embedding'].iloc[positions].tolist())
                sub_labels = fit_cluster(X_sub, current_distance)
            for position, sub_label in zip(positions, sub_labels):
                new_labels[position] = (cluster_id, sub_label)

        df[new_cluster_col] = new_labels

        # Shrink the distance threshold for the next level
        next_distance = current_distance * (decay_rate ** (base ** level))

        # Recurse only while this level actually split some cluster.
        if df[new_cluster_col].apply(lambda x: x[1]).nunique() > 1:
            df = subcluster(df, new_cluster_col, level + 1, max_cluster_size, next_distance)
        return df

    # Work on a copy so the caller's frame is left untouched.
    df = df.copy()

    # Start with initial clustering
    X = np.array(df['embedding'].tolist())
    df['cluster_level_0'] = fit_cluster(X, distance_threshold)

    # Kick off the recursive subclustering with adjusted initial distance
    initial_distance_adjusted = distance_threshold * decay_rate
    return subcluster(df, 'cluster_level_0', 1, max_cluster_size, initial_distance_adjusted)


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

class CustomTfidfVectorizer(TfidfVectorizer):
    """TfidfVectorizer whose learned idf weights can be overridden per term.

    Args:
        key_terms: optional mapping ``term -> idf value``. After fitting, the
            idf of every listed term that is part of the learned vocabulary is
            replaced by the given value; terms outside the vocabulary are
            ignored.
        stop_words, use_idf, min_df, max_df, ngram_range: forwarded unchanged
            to ``TfidfVectorizer``.
    """

    def __init__(self, key_terms=None, stop_words='english', use_idf=True, min_df=.001, max_df=0.8, ngram_range=(2,5)):
        super().__init__(stop_words=stop_words, use_idf=use_idf, min_df=min_df, max_df=max_df, ngram_range=ngram_range)
        self.key_terms = key_terms if key_terms is not None else {}

    def fit(self, raw_documents, y=None):
        """Learn vocabulary and idf, then apply the ``key_terms`` overrides."""
        super().fit(raw_documents, y=y)
        self._apply_key_term_idf()
        return self

    def fit_transform(self, raw_documents, y=None):
        """Fit (including the overrides) and return the tf-idf matrix.

        Routed through ``fit`` so that the overrides are also honoured when
        callers use ``fit_transform`` instead of ``fit`` + ``transform``.
        """
        return self.fit(raw_documents, y=y).transform(raw_documents)

    def _apply_key_term_idf(self):
        """Write the ``key_terms`` values into the fitted idf vector."""
        # Without idf weighting there is no idf vector to adjust.
        if not self.use_idf or not self.key_terms:
            return
        # Edit a copy and assign it back through the public ``idf_`` setter.
        # Depending on the scikit-learn version, ``idf_`` may be computed on
        # access, in which case an in-place item assignment would be lost.
        idf = np.array(self.idf_, dtype=float)
        for term, value in self.key_terms.items():
            index = self.vocabulary_.get(term)
            if index is not None:
                idf[index] = value  # Manually adjust the idf
        self.idf_ = idf

In [None]:
from pycorenlp import StanfordCoreNLP
import pandas as pd
#from sklearn.feature_extraction.text import TfidfVectorizer
from collections import Counter
import json

# Module-level client for a Stanford CoreNLP server addressed at localhost:9000;
# lemmatize_and_tag below sends its annotation requests through it.
nlp = StanfordCoreNLP('http://localhost:9000')

def lemmatize_and_tag(text):
    """Lemmatize and POS-tag ``text`` with the module-level CoreNLP client.

    Args:
        text: the text to annotate. Missing values (as judged by ``pd.isna``)
            are not sent to the server.

    Returns:
        A list of ``(lemma, pos)`` tuples, one per token over all sentences.
        For a missing value the placeholder ``[("empty", "empty")]`` is returned.

    Raises:
        Whatever the client or the response decoding raises. For assertion,
        JSON/value, key and type errors the failing text and the raw result
        are printed first, then the exception is re-raised.
    """
    if pd.isna(text):
        return [("empty", "empty")]

    # Bound before the try so the diagnostics below never fail with an
    # unbound-variable error when ``annotate`` itself raises.
    result = None
    try:
        result = nlp.annotate(text, properties={
            'annotators': 'lemma, pos',
            'outputFormat': 'json'
        })

        # When the client hands back a string it is decoded as JSON here.
        if isinstance(result, str):
            result = json.loads(result)

        lemmas_pos = [(token['lemma'], token['pos']) for sentence in result['sentences'] for token in sentence['tokens']]
    except (AssertionError, ValueError, KeyError, TypeError) as e:
        print('Error:', e)
        print('Failed text:', text)
        print('Failed result:', result)
        raise
    return lemmas_pos


In [None]:
import numpy as np
import pandas as pd
# Assuming CustomTfidfVectorizer is defined elsewhere and works similarly to sklearn's TfidfVectorizer

def calculate_tf_idf_for_subclusters(df, idf_name, tf_name, filter_list=None, key_terms=None):
    """Tag every cluster of every ``cluster_level_<n>`` column with a tf-idf term.

    A ``CustomTfidfVectorizer`` is fitted on the words of column ``idf_name``.
    For each cluster, the nouns (POS tag starting with ``NN``) found in column
    ``tf_name`` are joined into one document, scored with the fitted
    vectorizer, and the best-ranked term not yet used on that level becomes
    the cluster tag. If a term is already taken, the term combined with the
    next-ranked term is tried before moving on.

    Args:
        df: DataFrame with the token columns and ``cluster_level_<n>`` columns.
            A ``cluster_tag_level_<n>`` column is added per level, in place.
        idf_name: column holding lists of ``(word, pos)`` pairs used for fitting.
        tf_name: column holding lists of ``(word, pos)`` pairs used for scoring.
        filter_list: words that must not take part in the scoring document.
        key_terms: optional idf overrides passed to the vectorizer.

    Returns:
        ``df`` with the tag columns added.
    """
    excluded_words = filter_list if filter_list is not None else ()

    custom_vectorizer = CustomTfidfVectorizer(
        stop_words="english",
        use_idf=True,
        min_df=.01,
        max_df=0.8,
        ngram_range=(2,5),  # word n-grams from bigrams up to 5-grams
        key_terms=key_terms
    )
    # Fit the vectorizer to the idf_name column
    custom_vectorizer.fit([" ".join([word for word, pos in lst]) for lst in df[idf_name]])
    # The vocabulary is fixed after fitting; look the names up once.
    feature_names = custom_vectorizer.get_feature_names_out()

    cluster_labels = {}  # Map from cluster_id to label

    cluster_level_cols = df.columns[df.columns.str.startswith('cluster_level_')]

    # Extract the numeric part of these column names and convert to int
    cluster_levels = cluster_level_cols.str.extract(r'cluster_level_(\d+)')[0].astype(int)
    levels = sorted(set(cluster_levels))

    for level in levels:
        cluster_col = f'cluster_level_{level}'
        tag_col = f'cluster_tag_level_{level}'
        df[tag_col] = ""

        # Tags only need to be unique within one level.
        used_labels = set()

        for cluster_id in df[cluster_col].unique():
            in_cluster = df[cluster_col] == cluster_id
            # dict.fromkeys removes duplicate (word, pos) pairs per document
            # while keeping their order; a set would make the word order, and
            # with it the n-grams and the chosen tag, vary between runs.
            subcluster_nouns = [
                word
                for lst in df.loc[in_cluster, tf_name]
                for word, pos in dict.fromkeys(lst)
                if pos.startswith('NN') and word not in excluded_words
            ]
            if not subcluster_nouns:
                continue

            tfidf_matrix = custom_vectorizer.transform([' '.join(subcluster_nouns)])
            sorted_indices = np.argsort(tfidf_matrix.toarray()).flatten()[::-1]  # Sort scores descending
            # NOTE(review): the ranking covers the whole vocabulary, so terms
            # with a zero score can still be picked once better ones are taken.

            for rank, index in enumerate(sorted_indices):
                term = feature_names[index]
                candidates = [str(term)]
                # Fallback when the term itself is taken: term + next-ranked term.
                if rank + 1 < len(sorted_indices):
                    next_best_term = feature_names[sorted_indices[rank + 1]]
                    candidates.append(f"{term} {next_best_term}")

                chosen = next((label for label in candidates if label not in used_labels), None)
                if chosen is not None:
                    used_labels.add(chosen)
                    cluster_labels[cluster_id] = chosen
                    break

            df.loc[in_cluster, tag_col] = cluster_labels.get(cluster_id, "")

    return df

In [None]:
def tfidf(df_sorted, topic_names):
    """Tag the clusters of ``df_sorted`` using the lemmatized article titles.

    The ``title`` column is lemmatized and POS-tagged into the new column
    ``lemmatized_and_tagged_title`` (written onto ``df_sorted`` itself). That
    column then serves both as the idf corpus and as the tf source of
    ``calculate_tf_idf_for_subclusters``; ``topic_names`` is handed over as the
    list of words to leave out.

    Returns:
        The frame returned by ``calculate_tf_idf_for_subclusters``.
    """
    tagged_column = 'lemmatized_and_tagged_title'
    df_sorted[tagged_column] = df_sorted['title'].apply(lemmatize_and_tag)
    return calculate_tf_idf_for_subclusters(df_sorted, tagged_column, tagged_column, topic_names)

In [None]:
import os

# Export one clustered and tagged CSV per entry of `subtopics`.
# NOTE(review): the driver is left open on purpose because later cells may
# still use `n4j`; call n4j.close() once the notebook is done with it.
n4j = Neo4jApp()

# Directory path
directory = './output'

# Create the output directory once; exist_ok avoids a separate existence check.
os.makedirs(directory, exist_ok=True)

for topic in subtopics:
    formatted_topic = topic.replace(' ', '_').lower()
    file_path = os.path.join(directory, 'clustered_' + formatted_topic + '.csv')

    # Topics that were exported by an earlier run are skipped.
    if os.path.exists(file_path):
        continue

    try:
        topic_df = n4j.get_articles_in_topic(topic)

        topic_df = embed(topic_df)
        # The words of the topic name itself are kept out of the cluster tags.
        word_list = topic.split()
        topic_df = tfidf(topic_df, word_list)

        topic_df.to_csv(file_path, index=False)
        print("Done", topic)
    except Exception as e:
        # Best-effort batch: report the failing topic and carry on with the next.
        print(e)
        print("error", topic)