In [47]:
from collections import defaultdict

import re
from openai import OpenAI
from sklearn.decomposition import LatentDirichletAllocation, NMF
from sklearn.feature_extraction.text import CountVectorizer
from typing import Dict, Tuple, Optional

API_KEY = "sk-proj-..."
ABSTRACTS_PATH = "shared/xai-interactive-abstracts.txt"
GPT_MODEL = "gpt-4"  # "gpt-4o-mini"  #

In [48]:
class ClusterInfo:
    def __init__(self, cluster_id: int):
        self.cluster_id = cluster_id
        self.cluster_title: Optional[str] = None
        self.cluster_stats: Dict[str, float] = {}


class AbstractInfo:
    def __init__(self, label: str, text: str):
        self.label = label
        self.text = text
        self.analysis_type = None
        self.cluster_info: Optional[ClusterInfo] = None


class AbstractTopicsAnalyzer:
    def __init__(self, api_key: str, data_path: str, n_topics: int = 5):
        # openai.api_key = api_key
        self.client = OpenAI(api_key=api_key)
        self.data_path = data_path
        self.n_topics = n_topics
        self.abstracts = self._load_abstracts()
        self.clusters = {}
        self.vectorizer = CountVectorizer(stop_words='english')
        self.analysis_type = None

    def _load_abstracts(self) -> Dict[str, AbstractInfo]:
        """Load abstracts and IDs from text file, cleaning and merging lines."""
        abstracts = {}
        i = 0
        with open(self.data_path, "r") as file:
            content = file.read().split("\n\n")
            for abstract in content:
                lines = abstract.strip().split("\n")
                if lines:
                    label = lines[0].strip()
                    print(f"[{i}]: {label}")
                    text = " ".join(lines[1:]).replace("ZWSP", "")  # Join lines and remove "ZWSP"
                    text = re.sub(r"[^a-zA-Z0-9 ,.\-:]", "", text)  # Keep only specified characters
                    # Remove phrases like "INDEX TERMS", "Keywords" in various capitalizations
                    text = re.sub(
                        r"\b(?:[Ii][Nn][Dd][Ee][Xx]\s?[Tt][Ee][Rr][Mm][Ss]|[Kk][Ee][Yy][Ww][Oo][Rr][Dd][Ss]?)(:)?\b",
                        "", text)
                    abstracts[label] = AbstractInfo(label, text.strip())
                    i += 1
        return abstracts

    def preview_abstracts(self, n: int = 5) -> Dict[str, AbstractInfo]:
        """Return a preview of a specified number of abstracts"""
        return dict(list(self.abstracts.items())[:n])

    def perform_lda(self) -> None:
        """Run LDA on abstracts and assign topics"""
        self.analysis_type = 'LDA'
        data = [info.text for info in self.abstracts.values()]
        data_vectorized = self.vectorizer.fit_transform(data)

        lda = LatentDirichletAllocation(n_components=self.n_topics, random_state=42)
        lda.fit(data_vectorized)
        topics = lda.transform(data_vectorized)

        for i, (label, info) in enumerate(self.abstracts.items()):
            cluster_id = topics[i].argmax()
            if cluster_id not in self.clusters:
                self.clusters[cluster_id] = ClusterInfo(cluster_id)
            info.cluster_info = self.clusters[cluster_id]
            info.analysis_type = self.analysis_type

    def perform_nmf(self) -> None:
        """Run NMF on abstracts and assign topics"""
        self.analysis_type = 'NMF'
        data = [info.text for info in self.abstracts.values()]
        data_vectorized = self.vectorizer.fit_transform(data)

        nmf = NMF(n_components=self.n_topics, random_state=42)
        nmf.fit(data_vectorized)
        topics = nmf.transform(data_vectorized)

        for i, (label, info) in enumerate(self.abstracts.items()):
            cluster_id = topics[i].argmax()
            if cluster_id not in self.clusters:
                self.clusters[cluster_id] = ClusterInfo(cluster_id)
            info.cluster_info = self.clusters[cluster_id]
            info.analysis_type = self.analysis_type

    def analysis_statistics(self) -> Dict[int, Dict[str, float]]:
        """Generate statistics on the analysis results and assign to each cluster"""
        topic_counts = defaultdict(int)
        topic_probabilities = defaultdict(list)

        for info in self.abstracts.values():
            cluster_id = info.cluster_info.cluster_id
            topic_counts[cluster_id] += 1
            # Collect dummy probabilities for the example; replace with actual
            topic_probabilities[cluster_id].append(1)

        statistics = {}
        for topic, count in topic_counts.items():
            avg_prob = sum(topic_probabilities[topic]) / len(topic_probabilities[topic])
            statistics[topic] = {"count": count, "avg_probability": avg_prob}
            self.clusters[topic].cluster_stats = statistics[topic]

        return statistics

    def add_topics_to_clusters(self) -> None:
        """Assign GPT-generated titles to each cluster"""
        clusters_text = defaultdict(list)
        for info in self.abstracts.values():
            clusters_text[info.cluster_info.cluster_id].append(info.text)

        for cluster_id, abstracts in clusters_text.items():
            prompt = f"Generate a title for the following cluster:\n" + "\n".join(abstracts)
            completion = self.client.chat.completions.create(
                model="gpt-4",
                messages=[
                    {"role": "system", "content": "You are a helpful assistant analyzing topics for LDA."},
                    {"role": "user", "content": prompt}
                ]
            )
            title = completion.choices[0].message.content.strip()
            self.clusters[cluster_id].cluster_title = title

    def result(self) -> Dict[str, Tuple[int, str]]:
        """Return the results with cluster ID and title for each abstract"""
        return {label: (info.cluster_info.cluster_id, info.cluster_info.cluster_title) for label, info in
                self.abstracts.items()}



In [49]:
analyzer = AbstractTopicsAnalyzer(api_key=API_KEY, data_path=ABSTRACTS_PATH, n_topics=4)

[0]: booshehri2024computational​
[1]: kim2024xai
[2]: huang2022analysis​
[3]: chromik2021review​
[4]: nazar2021systematic​
[5]: shneiderman2022human​
[6]: baniecki2020grammar​
[7]: feustel2024enhancing​
[8]: rago2023interactive​
[9]: conati2023personalized​
[10]: guo2024explainability​
[11]: mozolewski2022explain​
[12]: mindlin2024measuring​
[13]: singh2024actionability​
[14]: alaqsam2024systematic​
[15]: Liao2022human​
[16]: jacovi2023trends​


In [50]:
analyzer.preview_abstracts()

{'booshehri2024computational\u200b': <__main__.AbstractInfo at 0x7fcd06da37a0>,
 'kim2024xai': <__main__.AbstractInfo at 0x7fcd06da2390>,
 'huang2022analysis\u200b': <__main__.AbstractInfo at 0x7fcd06da1730>,
 'chromik2021review\u200b': <__main__.AbstractInfo at 0x7fcd06da2570>,
 'nazar2021systematic\u200b': <__main__.AbstractInfo at 0x7fcd06da14c0>}

In [56]:
# analyzer.perform_lda()
analyzer.perform_nmf()
analyzer.analysis_statistics()

{2: {'count': 7, 'avg_probability': 1.0},
 3: {'count': 4, 'avg_probability': 1.0},
 0: {'count': 5, 'avg_probability': 1.0},
 1: {'count': 1, 'avg_probability': 1.0}}

In [57]:
analyzer.add_topics_to_clusters()

In [58]:
print(analyzer.result())

{'booshehri2024computational\u200b': (2, '"Interactive and Personalized Approaches in Explainable Artificial Intelligence (XAI): Advancements and Challenges in Explanation Generation, User Interface Design, and Model Understanding"'), 'kim2024xai': (3, '"Advancements and Challenges in Explainable Artificial Intelligence (XAI): An Investigation into Trustworthiness, Transparency, and Algorithmic Recourse in Clinical Decision Support Systems and Beyond"'), 'huang2022analysis\u200b': (2, '"Interactive and Personalized Approaches in Explainable Artificial Intelligence (XAI): Advancements and Challenges in Explanation Generation, User Interface Design, and Model Understanding"'), 'chromik2021review\u200b': (2, '"Interactive and Personalized Approaches in Explainable Artificial Intelligence (XAI): Advancements and Challenges in Explanation Generation, User Interface Design, and Model Understanding"'), 'nazar2021systematic\u200b': (0, '"Intersecting Fields of Artificial Intelligence and Human

In [59]:
for cluster_id, cluster_info in analyzer.clusters.items():
    print(f"Cluster ID: {cluster_id}")
    print(f"Cluster Title: {cluster_info.cluster_title}")

    # Find articles belonging to the current cluster
    articles_in_cluster = [
        label for label, abstract_info in analyzer.abstracts.items()
        if abstract_info.cluster_info == cluster_info
    ]

    print("Articles in Cluster:")
    for label in articles_in_cluster:
        print(f" - {label}")
    print("-" * 50)

Cluster ID: 2
Cluster Title: "Interactive and Personalized Approaches in Explainable Artificial Intelligence (XAI): Advancements and Challenges in Explanation Generation, User Interface Design, and Model Understanding"
Articles in Cluster:
 - booshehri2024computational​
 - huang2022analysis​
 - chromik2021review​
 - rago2023interactive​
 - conati2023personalized​
 - mozolewski2022explain​
 - mindlin2024measuring​
--------------------------------------------------
Cluster ID: 3
Cluster Title: "Advancements and Challenges in Explainable Artificial Intelligence (XAI): An Investigation into Trustworthiness, Transparency, and Algorithmic Recourse in Clinical Decision Support Systems and Beyond"
Articles in Cluster:
 - kim2024xai
 - shneiderman2022human​
 - feustel2024enhancing​
 - singh2024actionability​
--------------------------------------------------
Cluster ID: 0
Cluster Title: "Intersecting Fields of Artificial Intelligence and Human-Computer Interaction: A Comprehensive Study into Ex

In [None]:
# LDA + "gpt-4"
# Cluster ID: 2
# Cluster Title: "Co-construction and Human-centered Approach in Explainable AI: From Clustering Methods to Actionability Assessment"
# Articles in Cluster:
#  - booshehri2024computational​
#  - shneiderman2022human​
#  - mozolewski2022explain​
#  - singh2024actionability​
# --------------------------------------------------
# Cluster ID: 1
# Cluster Title: "Advancements and Challenges in Explainable AI (XAI): Exploring Tools, Clinical Decision Support Systems, and Interactive Explanations"
# Articles in Cluster:
#  - kim2024xai
#  - rago2023interactive​
#  - mindlin2024measuring​
#  - alaqsam2024systematic​
# --------------------------------------------------
# Cluster ID: 0
# Cluster Title: "Exploring the Intersections of Explainable Artificial Intelligence and Human-Centering: Advancements in Techniques, Applications, and Interactivity"
# Articles in Cluster:
#  - huang2022analysis​
#  - chromik2021review​
#  - nazar2021systematic​
#  - baniecki2020grammar​
#  - feustel2024enhancing​
#  - guo2024explainability​
#  - Liao2022human​
#  - jacovi2023trends​
# --------------------------------------------------
# Cluster ID: 3
# Cluster Title: "Personalized Explainable Artificial Intelligence (XAI) in Intelligent Tutoring Systems (ITS): Impact on Student Learning and Perceptions"
# Articles in Cluster:
#  - conati2023personalized​
# --------------------------------------------------


In [None]:
# NMF + "gpt-4"
# Cluster ID: 2
# Cluster Title: "Interactive and Personalized Approaches in Explainable Artificial Intelligence (XAI): Advancements and Challenges in Explanation Generation, User Interface Design, and Model Understanding"
# Articles in Cluster:
#  - booshehri2024computational​
#  - huang2022analysis​
#  - chromik2021review​
#  - rago2023interactive​
#  - conati2023personalized​
#  - mozolewski2022explain​
#  - mindlin2024measuring​
# --------------------------------------------------
# Cluster ID: 3
# Cluster Title: "Advancements and Challenges in Explainable Artificial Intelligence (XAI): An Investigation into Trustworthiness, Transparency, and Algorithmic Recourse in Clinical Decision Support Systems and Beyond"
# Articles in Cluster:
#  - kim2024xai
#  - shneiderman2022human​
#  - feustel2024enhancing​
#  - singh2024actionability​
# --------------------------------------------------
# Cluster ID: 0
# Cluster Title: "Intersecting Fields of Artificial Intelligence and Human-Computer Interaction: A Comprehensive Study into Explainable AI Tools, Challenges, and Prospects in Healthcare and Other Domains"
# Articles in Cluster:
#  - nazar2021systematic​
#  - guo2024explainability​
#  - alaqsam2024systematic​
#  - Liao2022human​
#  - jacovi2023trends​
# --------------------------------------------------
# Cluster ID: 1
# Cluster Title: "Interactive Explanatory Model Analysis: Enhancing Explainable AI Through Sequential Approach and Human-Centered Design"
# Articles in Cluster:
#  - baniecki2020grammar​
# --------------------------------------------------
