# Cluster LLM Papers
Using BERTopic.

In [1]:
import os

import pandas as pd
import plotly.express as px
import plotly.io as pio
from umap import UMAP
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer
from bertopic.representation import OpenAI, MaximalMarginalRelevance
from hdbscan import HDBSCAN
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import nltk
import re
import openai
from dotenv import load_dotenv

# Download necessary NLTK data
nltk.download('wordnet')
nltk.download('stopwords')
lemmatizer = WordNetLemmatizer()
stop_words = list(set(stopwords.words('english')))

pio.renderers.default = "browser"
load_dotenv()

  @numba.jit()
  @numba.jit()
  @numba.jit()
  @numba.jit()
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/manuelrueda/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/manuelrueda/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [330]:
papers_df = pd.read_pickle('papers_df.pkl')

content_cols = ["Summary", "main_contribution", "takeaways"]
papers_df = papers_df.dropna(subset=["Title"] + content_cols)
papers_df.drop_duplicates(subset=["Title"], inplace=True)

papers_dict = papers_df.set_index("Title")[content_cols].apply(lambda x: '\n'.join(x.astype(str)), axis=1).to_dict()
papers_df.head()

Unnamed: 0,Published,Title,Authors,Summary,main_contribution,takeaways,category,novelty_analysis,novelty_score,technical_analysis,technical_score,enjoyable_analysis,enjoyable_score
2023_07_27_simon_001.json,2023-07-27,ThoughtSource: A central hub for large languag...,"Simon Ott, Konstantin Hebenstreit, Valentin Li...",Large language models (LLMs) such as GPT-4 hav...,{'headline': 'ThoughtSource: A Meta-Dataset an...,{'headline': 'ThoughtSource Enhances LLM Reaso...,TRAINING,ThoughtSource represents a novel contribution ...,3,"The paper is somewhat technical, as it discuss...",2,"The paper is well-written and organized, prese...",3
2023_07_26_fabian_001.json,2023-07-26,Educational data augmentation in physics educa...,"Fabian Kieser, Peter Wulff, Jochen Kuhn, Stefa...",Generative AI technologies such as large langu...,{'headline': 'ChatGPT can generate synthetic d...,{'headline': 'ChatGPT can be used to generate ...,USE CASES,The paper presents a novel application of larg...,3,The paper is not overly technical. It focuses ...,1,The paper is well-written and presents a novel...,3
2023_07_25_sungmin_001.json,2023-07-25,Large Language Models are Few-shot Testers: Ex...,"Sungmin Kang, Juyeon Yoon, Shin Yoo",Many automated test generation techniques have...,{'headline': 'LIBRO: A Framework for Automatin...,{'headline': 'LIBRO Framework Enhances Develop...,USE CASES,The paper presents a novel approach to automat...,3,"The paper is somewhat technical, as it delves ...",2,The paper is well-organized and presents a nov...,3
2023_07_24_jindong_001.json,2023-07-24,A Systematic Survey of Prompt Engineering on V...,"Jindong Gu, Zhen Han, Shuo Chen, Ahmad Beirami...",Prompt engineering is a technique that involve...,{'headline': 'Comprehensive Survey of Prompt E...,{'headline': 'Prompt Engineering: A Versatile ...,PROMPTING,The paper provides a systematic survey of prom...,2,The paper is somewhat technical as it requires...,2,The paper is well-structured and provides a co...,2
2023_07_19_jiayu_001.json,2023-07-19,"LongNet: Scaling Transformers to 1,000,000,000...","Jiayu Ding, Shuming Ma, Li Dong, Xingxing Zhan...",Scaling sequence length has become a critical ...,{'headline': 'LongNet: A Transformer variant f...,{'headline': 'LongNet enables efficient modeli...,ARCHITECTURES,The introduction of LongNet and the concept of...,3,"The paper is highly technical, delving into th...",3,The paper is well-structured and presents a no...,2


In [331]:
## Pre-calculate embeddings.
titles = papers_df['Title'].tolist()
assert titles == list(papers_dict.keys())

all_content = list(papers_dict.values())
print(len(all_content), len(titles))
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
embeddings = embedding_model.encode(all_content, show_progress_bar=True)

141 141


Batches:   0%|          | 0/5 [00:00<?, ?it/s]

In [332]:
def process_text(text):
    # Lowercase the text
    text = text.lower()
    text = re.sub(r'\W', ' ', text)
    text = ' '.join([lemmatizer.lemmatize(word) for word in text.split() if word not in stop_words])
    return text

In [333]:
## Utility objkts.
umap_model = UMAP(n_neighbors=5, n_components=7, min_dist=0.0, metric='cosine', random_state=42)
hdbscan_model = HDBSCAN(min_cluster_size=5, metric='euclidean', cluster_selection_method='eom', prediction_data=True)
vectorizer_model = CountVectorizer(stop_words=stop_words, ngram_range=(2, 3), min_df=1, max_df=0.8, preprocessor=process_text)
mmr_model = MaximalMarginalRelevance(diversity=.5)

In [342]:
##OAI representation.
openai.api_key = os.environ["OPENAI_API_KEY"]
prompt = """
I have a topic that contains the following Large Language Model related documents:
[DOCUMENTS]
The topic is described by the following keywords: [KEYWORDS]

Based on the information above, extract a short but highly descriptive topic label of at most 5 words. Do not use "Large Language Model" in your description. Make sure it is in the following format:
topic: <topic label>
"""
openai_model = OpenAI(model="gpt-4", exponential_backoff=True, chat=True, prompt=prompt, nr_docs=8)

In [343]:
topic_model = BERTopic(
  embedding_model=embedding_model,
  umap_model=umap_model,
  hdbscan_model=hdbscan_model,
  vectorizer_model=vectorizer_model,
  representation_model=[mmr_model, openai_model],
  top_n_words=10,
  verbose=True
)

topics, probs = topic_model.fit_transform(all_content, embeddings)
reduced_embeddings = UMAP(n_neighbors=10, n_components=2, min_dist=0.0, metric='cosine', random_state=42).fit_transform(embeddings)
reduced_embeddings = (reduced_embeddings - reduced_embeddings.mean(axis=0)) / reduced_embeddings.std(axis=0)

2023-07-29 21:26:58,615 - BERTopic - Reduced dimensionality
2023-07-29 21:26:58,619 - BERTopic - Clustered reduced embeddings


In [344]:
## Extract topics and save on DF.
topic_names = topic_model.get_topic_info().set_index("Topic")["Name"]
topic_names[-1] = "Miscellaneous"
clean_topic_names = [topic_names[t].split("_")[-1].replace('"', "").strip() for t in topics]

papers_df['topic'] = clean_topic_names
papers_df["dim1"] = reduced_embeddings[:,0]
papers_df["dim2"] = reduced_embeddings[:,1]

papers_df.to_pickle('papers_df.pkl')

topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,29,-1_Advanced AI Applications and Techniques,[Advanced AI Applications and Techniques],[A central notion in practical and theoretical...
1,0,27,0_CoT in Enhancing Complex Reasoning,[CoT in Enhancing Complex Reasoning],[Recent studies have discovered that Chain-of-...
2,1,26,1_Multimodal Learning and Fine-Tuning Techniques,[Multimodal Learning and Fine-Tuning Techniques],"[We give simpler, sparser, and faster algorith..."
3,2,14,2_AI-Assisted Reinforcement Learning Exploration,[AI-Assisted Reinforcement Learning Exploration],"[Recently, there has been increasing interest ..."
4,3,12,3_Human-like Reasoning in AI,[Human-like Reasoning in AI],[Abstract reasoning is a key ability for an in...
5,4,10,4_ChatGPT Applications and Text Detection,[ChatGPT Applications and Text Detection],[We present the first study to investigate Lar...
6,5,9,5_Code Synthesis and Automated Testing,[Code Synthesis and Automated Testing],[Program synthesis strives to generate a compu...
7,6,8,6_Enhancing Reliability and Consistency in LLMs,[Enhancing Reliability and Consistency in LLMs],[Numerous works are proposed to improve or eva...
8,7,6,7_Advanced NLP Applications & Challenges,[Advanced NLP Applications & Challenges],[Large language models (LLMs) are competitive ...


In [345]:
reduced_titles = []
for t in titles:
    if len(t) > 50:
        t = t[:50] + "..."
    reduced_titles.append(t)

plot_df = pd.DataFrame({
    'UMAP Dim1': reduced_embeddings[:,0],
    'UMAP Dim2': reduced_embeddings[:,1],
    'topics': clean_topic_names,
    'titles': reduced_titles
})
plot_df['topics'] = plot_df['topics'].astype(str)

fig = px.scatter(plot_df, x='UMAP Dim1', y='UMAP Dim2', color='topics',
                 opacity=0.7, hover_name='titles')

# Update layout
fig.update_layout(
    autosize=False,
    width=1200,
    height=500,
    font=dict(
        size=16,
    ),
    legend=dict(
        title=None,
        font=dict(
            size=14,
        ),
    )
)
fig.update_traces(marker=dict(line=dict(width=1, color='DarkSlateGrey'), size=10))

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


## Optional: Add arxiv links

In [2]:
import arxiv
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


In [3]:
papers_df = pd.read_pickle('papers_df.pkl')

In [4]:
def preprocess(text):
    """ Clean and simplify text string. """
    text = ''.join(c.lower() if c.isalnum() else ' ' for c in text)
    return text

def tfidf_similarity(title1, title2):
    """ Compute cosine similarity of TF-IDF representation between 2 strings. """
    title1 = preprocess(title1)
    title2 = preprocess(title2)

    vectorizer = TfidfVectorizer().fit_transform([title1, title2])
    vectors = vectorizer.toarray()

    return cosine_similarity(vectors[0:1], vectors[1:2])[0][0]

In [35]:
## Add link from arxiv.
def get_arxiv_link(title):
    """ Search article in Arxiv by name and retrieve their URL. """
    search = arxiv.Search(
        query=preprocess(title),
        max_results=20,
        sort_by=arxiv.SortCriterion.Relevance
    )
    res = list(search.results())
    if len(res) > 0:
        ## Sort by title similarity.
        res = sorted(res, key=lambda x: tfidf_similarity(title, x.title), reverse=True)
        new_title = res[0].title
        title_sim = tfidf_similarity(title, new_title)
        if title_sim > 0.7:
            return res[0].entry_id
        else:
            return None
    return None

## Execute.
null_idx = papers_df['arxiv_link'].isnull()
papers_df.loc[null_idx, 'arxiv_link'] = papers_df.loc[null_idx, 'Title'].apply(get_arxiv_link)

In [50]:
## Store results.
papers_df.to_pickle('papers_df.pkl')

In [51]:
search = arxiv.Search(
    query=preprocess("Early experiments with GPT-4"),
    max_results=3,
    sort_by=arxiv.SortCriterion.Relevance
)
res = list(search.results())
[r.title for r in res]

In [49]:
papers_df.loc[papers_df["arxiv_link"].isnull()]

Unnamed: 0,Published,Title,Authors,Summary,main_contribution,takeaways,category,novelty_analysis,novelty_score,technical_analysis,technical_score,enjoyable_analysis,enjoyable_score,topic,dim1,dim2,arxiv_link


## Optional: Rename files to Arxiv code

In [53]:
import os
import shutil

In [68]:
papers_df = pd.read_pickle('papers_df.pkl')

In [69]:
def get_arxiv_code(link):
    """ Extract Arxiv code from URL. """
    if link is None:
        return None
    return link.split("/")[-1]

def rename_file(fname, arxiv_code):
    """ Rename file to Arxiv code. """
    if arxiv_code is None:
        return None
    old_path = os.path.join("summaries", fname)
    new_path = os.path.join("summaries", arxiv_code + ".json")
    if os.path.exists(old_path):
        shutil.move(old_path, new_path)
        return new_path
    return None

In [70]:
## Code collection.
papers_df['arxiv_code'] = papers_df['arxiv_link'].apply(get_arxiv_code)
papers_df.head()

Unnamed: 0,Published,Title,Authors,Summary,main_contribution,takeaways,category,novelty_analysis,novelty_score,technical_analysis,technical_score,enjoyable_analysis,enjoyable_score,topic,dim1,dim2,arxiv_link,arxiv_code
2023_07_27_simon_001.json,2023-07-27,ThoughtSource: A central hub for large languag...,"Simon Ott, Konstantin Hebenstreit, Valentin Li...",Large language models (LLMs) such as GPT-4 hav...,{'headline': 'ThoughtSource: A Meta-Dataset an...,{'headline': 'ThoughtSource Enhances LLM Reaso...,TRAINING,ThoughtSource represents a novel contribution ...,3,"The paper is somewhat technical, as it discuss...",2,"The paper is well-written and organized, prese...",3,CoT in Enhancing Complex Reasoning,0.427124,0.360614,http://arxiv.org/abs/2301.11596v5,2301.11596v5
2023_07_26_fabian_001.json,2023-07-26,Educational data augmentation in physics educa...,"Fabian Kieser, Peter Wulff, Jochen Kuhn, Stefa...",Generative AI technologies such as large langu...,{'headline': 'ChatGPT can generate synthetic d...,{'headline': 'ChatGPT can be used to generate ...,USE CASES,The paper presents a novel application of larg...,3,The paper is not overly technical. It focuses ...,1,The paper is well-written and presents a novel...,3,ChatGPT Applications and Text Detection,-1.034149,0.792786,http://arxiv.org/abs/2307.14475v1,2307.14475v1
2023_07_25_sungmin_001.json,2023-07-25,Large Language Models are Few-shot Testers: Ex...,"Sungmin Kang, Juyeon Yoon, Shin Yoo",Many automated test generation techniques have...,{'headline': 'LIBRO: A Framework for Automatin...,{'headline': 'LIBRO Framework Enhances Develop...,USE CASES,The paper presents a novel approach to automat...,3,"The paper is somewhat technical, as it delves ...",2,The paper is well-organized and presents a nov...,3,Code Synthesis and Automated Testing,0.573592,-1.789426,http://arxiv.org/abs/2209.11515v3,2209.11515v3
2023_07_24_jindong_001.json,2023-07-24,A Systematic Survey of Prompt Engineering on V...,"Jindong Gu, Zhen Han, Shuo Chen, Ahmad Beirami...",Prompt engineering is a technique that involve...,{'headline': 'Comprehensive Survey of Prompt E...,{'headline': 'Prompt Engineering: A Versatile ...,PROMPTING,The paper provides a systematic survey of prom...,2,The paper is somewhat technical as it requires...,2,The paper is well-structured and provides a co...,2,Multimodal Learning and Fine-Tuning Techniques,1.256921,-0.998975,http://arxiv.org/abs/2307.12980v1,2307.12980v1
2023_07_19_jiayu_001.json,2023-07-19,"LongNet: Scaling Transformers to 1,000,000,000...","Jiayu Ding, Shuming Ma, Li Dong, Xingxing Zhan...",Scaling sequence length has become a critical ...,{'headline': 'LongNet: A Transformer variant f...,{'headline': 'LongNet enables efficient modeli...,ARCHITECTURES,The introduction of LongNet and the concept of...,3,"The paper is highly technical, delving into th...",3,The paper is well-structured and presents a no...,2,Multimodal Learning and Fine-Tuning Techniques,-0.923451,-1.427588,http://arxiv.org/abs/2307.02486v2,2307.02486v2


In [71]:
## Rename files.
papers_df['new_path'] = papers_df.reset_index().apply(lambda x: rename_file(x['index'], x['arxiv_code']), axis=1)
papers_df.head()

Unnamed: 0,Published,Title,Authors,Summary,main_contribution,takeaways,category,novelty_analysis,novelty_score,technical_analysis,technical_score,enjoyable_analysis,enjoyable_score,topic,dim1,dim2,arxiv_link,arxiv_code,new_path
2023_07_27_simon_001.json,2023-07-27,ThoughtSource: A central hub for large languag...,"Simon Ott, Konstantin Hebenstreit, Valentin Li...",Large language models (LLMs) such as GPT-4 hav...,{'headline': 'ThoughtSource: A Meta-Dataset an...,{'headline': 'ThoughtSource Enhances LLM Reaso...,TRAINING,ThoughtSource represents a novel contribution ...,3,"The paper is somewhat technical, as it discuss...",2,"The paper is well-written and organized, prese...",3,CoT in Enhancing Complex Reasoning,0.427124,0.360614,http://arxiv.org/abs/2301.11596v5,2301.11596v5,
2023_07_26_fabian_001.json,2023-07-26,Educational data augmentation in physics educa...,"Fabian Kieser, Peter Wulff, Jochen Kuhn, Stefa...",Generative AI technologies such as large langu...,{'headline': 'ChatGPT can generate synthetic d...,{'headline': 'ChatGPT can be used to generate ...,USE CASES,The paper presents a novel application of larg...,3,The paper is not overly technical. It focuses ...,1,The paper is well-written and presents a novel...,3,ChatGPT Applications and Text Detection,-1.034149,0.792786,http://arxiv.org/abs/2307.14475v1,2307.14475v1,
2023_07_25_sungmin_001.json,2023-07-25,Large Language Models are Few-shot Testers: Ex...,"Sungmin Kang, Juyeon Yoon, Shin Yoo",Many automated test generation techniques have...,{'headline': 'LIBRO: A Framework for Automatin...,{'headline': 'LIBRO Framework Enhances Develop...,USE CASES,The paper presents a novel approach to automat...,3,"The paper is somewhat technical, as it delves ...",2,The paper is well-organized and presents a nov...,3,Code Synthesis and Automated Testing,0.573592,-1.789426,http://arxiv.org/abs/2209.11515v3,2209.11515v3,
2023_07_24_jindong_001.json,2023-07-24,A Systematic Survey of Prompt Engineering on V...,"Jindong Gu, Zhen Han, Shuo Chen, Ahmad Beirami...",Prompt engineering is a technique that involve...,{'headline': 'Comprehensive Survey of Prompt E...,{'headline': 'Prompt Engineering: A Versatile ...,PROMPTING,The paper provides a systematic survey of prom...,2,The paper is somewhat technical as it requires...,2,The paper is well-structured and provides a co...,2,Multimodal Learning and Fine-Tuning Techniques,1.256921,-0.998975,http://arxiv.org/abs/2307.12980v1,2307.12980v1,
2023_07_19_jiayu_001.json,2023-07-19,"LongNet: Scaling Transformers to 1,000,000,000...","Jiayu Ding, Shuming Ma, Li Dong, Xingxing Zhan...",Scaling sequence length has become a critical ...,{'headline': 'LongNet: A Transformer variant f...,{'headline': 'LongNet enables efficient modeli...,ARCHITECTURES,The introduction of LongNet and the concept of...,3,"The paper is highly technical, delving into th...",3,The paper is well-structured and presents a no...,2,Multimodal Learning and Fine-Tuning Techniques,-0.923451,-1.427588,http://arxiv.org/abs/2307.02486v2,2307.02486v2,
