In [1]:
# Enable IPython's autoreload extension; mode 2 re-imports all modules before
# each cell execution so edits to local .py files are picked up without a restart.
%load_ext autoreload
%autoreload 2

# Summarizer

In [2]:
from langchain.chat_models import ChatOpenAI, AzureChatOpenAI
from langchain.prompts import PromptTemplate
from langchain.document_loaders import ArxivLoader
from langchain.chains import LLMChain
from langchain.callbacks import get_openai_callback
import tiktoken

import os
import re
import json
import arxiv
import pandas as pd
from tqdm import tqdm
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

from dotenv import load_dotenv

# Read key/value pairs from a local .env file into the process environment.
# NOTE(review): presumably this is where the Azure OpenAI credentials come from — confirm.
load_dotenv()

True

## Helper Functions

In [3]:
def preprocess(text):
    """ Normalize a string: lowercase alphanumerics, turn anything else into a space. """
    cleaned_chars = []
    for ch in text:
        if ch.isalnum():
            cleaned_chars.append(ch.lower())
        else:
            # Punctuation, whitespace and symbols all collapse to a single blank.
            cleaned_chars.append(' ')
    return ''.join(cleaned_chars)

def reformat_text(doc_content):
    """ Undo hard line wrapping in extracted text while keeping paragraph breaks. """
    ## Re-join words that were hyphenated across a line break.
    text = ''.join(doc_content.split('-\n'))
    ## A lone newline is a wrap, not a paragraph break: turn it into a space.
    text = re.compile(r'(?<!\n)\n(?!\n)').sub(' ', text)
    ## Squeeze runs of spaces down to one.
    return re.compile(' +').sub(' ', text)

def tfidf_similarity(title1, title2):
    """ Compute cosine similarity of TF-IDF representation between 2 strings.

    Both strings are cleaned with `preprocess` and embedded as character
    2- and 3-gram term-frequency vectors (IDF weighting is disabled).

    Returns the cosine similarity of the two vectors, or 0.0 when neither
    string produces a single n-gram (e.g. both are empty or one character long).
    """
    texts = [preprocess(title1), preprocess(title2)]
    vectorizer = TfidfVectorizer(use_idf=False, analyzer='char', ngram_range=(2, 3))
    try:
        vectors = vectorizer.fit_transform(texts).toarray()
    except ValueError:
        ## scikit-learn refuses to fit an empty vocabulary; with nothing to
        ## compare, report "no similarity" instead of crashing the caller.
        return 0.0
    return cosine_similarity(vectors[0:1], vectors[1:2])[0][0]


def get_arxiv_info(title, min_similarity=0.7):
    """ Search article in Arxiv by name and retrieve meta-data.

    Queries Arxiv with the cleaned title (up to 20 hits, relevance-sorted) and
    picks the hit whose title is most similar to `title` under `tfidf_similarity`.

    Args:
        title: Paper title to look up.
        min_similarity: The best hit is returned only if its title similarity
            is strictly greater than this value.

    Returns:
        The best-matching search result, or None when there are no hits or the
        best hit is not similar enough.

    NOTE(review): `Search.results()` is kept as-is; newer `arxiv` releases
    favour `arxiv.Client().results(search)` — confirm against the installed version.
    """
    search = arxiv.Search(
        query=preprocess(title),
        max_results=20,
        sort_by=arxiv.SortCriterion.Relevance
    )
    candidates = list(search.results())
    if not candidates:
        return None

    ## Score each hit once; max() keeps the earliest hit on ties, which matches
    ## what a stable descending sort would put first.
    scores = [tfidf_similarity(title, c.title) for c in candidates]
    best_idx = max(range(len(candidates)), key=scores.__getitem__)
    if scores[best_idx] > min_similarity:
        return candidates[best_idx]
    return None


## LLM Chain Setup

In [4]:
## Underlying LLM.
## Chat model served through Azure OpenAI under the "gpt-4-32k" deployment name;
## a low temperature (0.1) keeps the generated summaries close to deterministic.
## NOTE(review): credentials/endpoint are presumably read from the environment — confirm.
llm = AzureChatOpenAI(deployment_name="gpt-4-32k", temperature=0.1)

## Create prompt.
## The template takes two variables: `content` (full paper text) and
## `prev_summary` (the paper's abstract). It asks six questions and shows four
## worked JSON examples of the expected answer. Literal JSON braces are doubled
## ({{ }}) so the template formatter does not mistake them for placeholders.
## The key names used in the examples ("main_contribution", "takeaways", ...)
## are what ends up in the stored result, so the questions must name them exactly.
prompt = PromptTemplate(
    input_variables=["content", "prev_summary"],
    template="""
As an applied AI researcher specialized in the field of Large Language Models (LLMs), you are currently conducting a survey of the literature, building a catalogue of the main contributions and innovations of each paper, determining how they can be applied to build systems or create new products. This catalogue will be published by a prestigious organization and will serve as the foundation for all applied LLM knowledge going forward. Now, carefully read the following paper:

{content}

========================

SUMMARY

{prev_summary}

Upon completion, answer the following questions:

1. What is the `main_contribution` of this paper? (1 line headline + 8-12 sentences)
    - If a new algorithm or technique is introduced, describe its workings clearly and comprehensively.
    - Do not assume that the reader knows terminology not part of the common AI/ML knowledge base.
    - Ensure that your answer provides practical insights that offer a solid understanding of the paper.
    - Detail the benefits or advantages of what has been presented, along with the practical implications for an LLM practitioner.
    - Do not include anything already discussed in the summary or abstract.

2. What are the main `takeaways`? (1 line headline + 8-12 sentences)
    - Focusing on the paper's contributions, explain how they can be used to create an interesting LLM application, improve current workflows, or increase efficiency when working with LLMs.
    - If different models were evaluated and their performance recorded, please note this and its practical implications (in detailed manner, i.e.: which model is best for what).
    - Be very precise, practical and specific as possible. Eliminate any irrelevant content from the paper's applied perspective.
    - If possible, provide a minimal code example or at least sketch the application.

3. Which category best describes this paper's primary focus? Choose one from the following options, with "OTHER" being the least desirable choice.
    a. "TRAINING": Discussions on LLM training methods, technical stack improvements, alternative training routines, etc.
    b. "FINE-TUNING": Discussions on fine-tuning, re-training, and specialization of LLMs.
    c. "ARCHITECTURES": Discussions on new LLM architectures, neural network components, etc., excluding prompting or computational systems to manage LLMs.
    d. "PROMPTING": Discussions on prompting methods, agent architectures, etc.
    e. "USE CASES": Discussions on LLM use in specific tasks, such as summarization, question answering, stock prediction, etc.
    f. "BEHAVIOR": Discussions on LLM behavior, including probing, interpretability, risks, biases, emerging abilities, etc.
    g. "OTHER": None of the above.

4. On a scale from 1 to 3, how novel is this paper? (1: not novel, 2: incrementally novel, 3: very novel)
    - Compare the paper's findings and contributions with what is presented in previous and related work. How unique and significant are the findings?
    - Be strict and rigorous; few papers should receive a high score.
    - Pay close attention to the comparison with prior work and the degree of difference in the author's contributions.

5. On a scale from 1 to 3, how technical is this paper? (1: not technical, 2: somewhat technical, 3: very technical)
    a) A very technical paper is difficult for a non-expert to understand, requires considerable technical knowledge, is filled with equations and jargon, and demands advanced mathematical knowledge.
    b) A somewhat technical paper may be challenging for a layman but can be understood reasonably well by someone with a computer science background. These papers, while not overly complex, explain processes in great detail and are practical and applicable (can be replicated).
    c) A non-technical paper is understandable for anyone with a college degree. These papers often discuss generalities, and the takeaways are more conceptual than technical.

6. On a scale from 1 to 3, how enjoyable is this paper? (1: hard to read, 2: ok, 3: a delight)
    a) A very enjoyable paper is well-written, organized, presents a novel and intriguing contribution, and is easy to read.
    b) An 'ok' paper is primarily plain and unexciting but is easy to read and contains some interesting parts. Most papers fall into this category.
    c) A non-enjoyable paper is difficult to read, poorly written, and lacks meaningful, practical, and insightful content.

When assigning numerical ratings consider these guidelines:
- Rating 3/3: Only about 20% of papers reach this standard.
- Rating 2/3: Most papers (50%) fall into this category.
- Rating 1/3: Around 30% of papers belong to this category.

Do not repeat the same comments across different answers.

Use the JSON format as in the following examples to respond.

EXAMPLE 1
==========
```
{{
    "main_contribution": {{
        "headline": "Chain-of-Thought (CoT) boosts LLM accuracy in financial sentiment analysis",
    "description": "The paper introduces the Chain-of-Thought (CoT) prompting technique for Large Language Models (LLMs) specifically targeting financial sentiment analysis. The core of CoT lies in its deviation from direct predictions. Instead, it guides the model to build a sequence of interconnected thoughts leading to an accurate sentiment score. In a comparative study, LLMs equipped with CoT achieved a 94% accuracy, surpassing the established FinBERT's 88% and the naive prompting model's 81%."
    }},
    "takeaways": {{
        "headline": "CoT opens new, efficient avenues for LLMs in financial analysis",
        "description": "Using the CoT prompting technique, LLMs can achieve enhanced accuracy in financial news sentiment analysis, ultimately refining stock market predictions. This method not only improves prediction accuracy but also renders the model's thought process transparent. When pitted against FinBERT, the LLM with CoT demonstrated superior performance, signaling its potential dominance in financial analysis tasks.",
        "example": "When processing a news snippet like 'Company X has strong Q3 earnings', an LLM with CoT could generate: 'Strong Q3 earnings -> Likely effective management -> Expected investor trust growth -> Potential bullish market -> Possible stock price ascent.' This layered output simplifies decision-making for market analysts."
    }},
    "category": "USE CASES",
    "novelty_analysis": "The paper extends the boundaries of current research by applying LLMs to financial news sentiment analysis. The introduction of the CoT prompting technique, tailored specifically for this application, represents an incremental advancement in the field.",
    "novelty_score": 2,
    "technical_analysis": "While the paper discusses a computational framework for managing LLM inputs and outputs, it does not delve into complex mathematical theories or algorithms, making it accessible to a wider audience.",
    "technical_score": 1,
    "enjoyable_analysis": "The engaging narrative style, coupled with practical insights, makes the paper an enjoyable read. It balances technical details with easily digestible information and an interesting practical application.",
    "enjoyable_score": 3
}}
```

EXAMPLE 2
==========
```
{{
    "main_contribution": {{
        "headline": "Zero-shot Prompting Technique for GPT-4 Code Interpreter",
        "description": "This paper proposes a zero-shot prompting technique for GPT-4 Code Interpreter that explicitly encourages the use of code for self-verification, which further boosts performance on math reasoning problems. They report a positive correlation between the better performance of GPT4-Code and the higher Code Usage Frequency. Initial experiments show that GPT4-Code achieved a zero-shot accuracy of 69.7% on the MATH dataset which is an improvement of 27.5% over GPT-4’s performance (42.2%)."
    }},
    "takeaways": {{
        "headline": "Leveraging Self-verification and Code Execution in LLMs",
        "description": "Self-verification is already a powerful approach to enhance the performance of LLMs on many tasks but this approach leverages the evaluation of code execution which could make it interesting to solve other kinds of problems. This work highlights the importance of code understanding and generation capabilities in LLMs.",
        "example": "Some of the ideas presented in this paper (specifically, the code-based self-verification and verification-guided weighted majority voting technique) can lead to building high-quality datasets that could potentially help improve the mathematical capabilities in open-source LLMs like Llama 2."
    }},
    "category": "PROMPTING",
    "novelty_analysis": "The research innovatively combines LLMs with code-based self-verification, achieving a 20% boost over state-of-the-art coding task accuracies. This method's practicality is evident, with tests showing a 30% reduction in coding errors, redefining efficiency in LLM-driven code generation.",
    "novelty_score": 3,
    "technical_analysis": "The paper delve into advanced algorithms, such as the Hypothetical Code-Integration Algorithm (HCIA), making it a dense read for those unfamiliar with theoretical computer science. While the introduction of a novel concept is enlightening, the paper's reliance on complex algorithms, logical proofs and symbolic reasoning makes it a technically advanced read.",
    "technical_score": 3,
    "enjoyable_analysis": "For those deeply engrossed in the LLM landscape, this paper promises an engaging journey. While its technical nuances can be challenging, the clearly presented transformative results, such as the significant performance leap in the MATH dataset, ensure a gripping narrative.",
    "enjoyable_score": 2
}}
```

EXAMPLE 3
==========
```
{{
    "main_contribution": {{
        "headline": "LLMManager: LLM-Driven Database Maintenance Knowledge Acquisition",
        "description": "LLMManager leverages a retriever system paired with a LLM to extract database maintenance knowledge from diverse textual sources. It incorporates a hybrid mechanism that combines transformer-based models with traditional relational database algorithms. The framework's ability to parse vast amounts of text and convert them into actionable database maintenance tasks has led to notable metrics: a 47% increase in real-time database issue detection and a 32% improvement in automated problem resolution compared to existing SotA systems."
    }},
    "takeaways": {{
        "headline": "Leveraging 'Tree of Thought' Reasoning for Enhanced Maintenance",
        "description": "LLMManager integration of the 'tree of thought' reasoning not only enhances root cause analysis but also creates a dynamic learning environment. Over time, LLMManager ability to revert to prior steps during anomalies becomes more refined, ensuring adaptive and evolving responses to complex database issues. Furthermore, its modular design allows for seamless integration with other LLMs, magnifying the collaborative aspect of the framework.",
        "example": "Automating database maintenance with LLMManager can lead to significant reductions in downtime and costs. Developers could design LLM systems that proactively address issues even before they escalate, unlocking more efficient and streamlined database operations."
    }},
    "category": "USE CASES",
    "novelty_analysis": "LLMManager's utilization of the 'tree of thought' reasoning in database maintenance is novel, although a targeted application inspired by similar work on other engineering areas.",
    "novelty_score": 2,
    "technical_analysis": "The paper delves into Entity-Relationship Diagrams and database management algorithms essential to LLMManager's operations. However, it manages to remain accessible, avoiding overly complex jargon and ensuring a broader audience comprehension.",
    "technical_score": 2,
    "enjoyable_analysis": "The work provides a balanced blend of technical details and real-world applications, giving insights into LLMManager's functions and potential impacts.",
    "enjoyable_score": 2
}}
```

EXAMPLE 4
==========
```
{{
    "main_contribution": {{
        "headline": "Performance Analysis of LLMs in Entity Recognition",
        "description": "The paper undertakes a systematic comparison of four Large Language Models (LLMs) - GPT-4, Claude, GPT-3.5, and Prodisol-001 - with a focus on entity recognition. Each model was subjected to a consistent dataset, and their entity extraction capabilities were assessed based on precision, recall, and F1 score. Results highlighted that GPT-4 outperformed the other models, with Claude closely following, and GPT-3.5 and Prodisol-001 trailing behind. This comparative study offers insights into the current capabilities of prominent LLMs in the domain of entity recognition."
    }},
    "takeaways": {{
        "headline": "Entity Recognition Capabilities Vary Across LLMs",
        "description": "The paper underscores variations in the performance of different LLMs when tasked with entity recognition. The presented findings provide a benchmark for professionals and researchers aiming to choose an LLM for entity recognition tasks. The nuanced comparison suggests that while GPT-4 exhibits top-tier performance in this domain, other models like Claude also present strong capabilities.",
        "example": "When parsing a complex news article about the merger between two tech giants, it becomes crucial to accurately recognize and categorize entities such as company names, CEOs, financial figures, and locations. An LLM with superior entity recognition, in such a context, aids in extracting critical data points efficiently, enabling a more thorough analysis of the situation."
    }},
    "category": "USE CASES",
    "novelty_analysis": "The study contributes to existing literature by offering a contemporary comparison of the latest LLMs in entity recognition. While the task itself isn't novel, the inclusion of GPT-4 and Claude in the comparison introduces an incremental advancement to the current body of research.",
    "novelty_score": 2,
    "technical_analysis": "The paper balances technical depth with accessibility, providing a detailed outline of evaluation metrics and methodologies. This ensures insights are communicated comprehensively, catering to both technical and non-technical readers.",
    "technical_score": 2,
    "enjoyable_analysis": "Through its well-structured approach and clear visualizations, the paper facilitates an engaging read. The methodical presentation of results aids in drawing comparisons and understanding the landscape of LLMs in entity recognition.",
    "enjoyable_score": 2
}}
```

YOUR TURN
==========
"""
)
## Bind the prompt to the model; `chain.run({...})` fills both variables and returns the raw reply.
chain = LLMChain(llm=llm, prompt=prompt)

## Iterate Papers

In [5]:
## Work queue: titles of the papers to look up on Arxiv and summarise.
## Only uncommented entries are part of the list; the commented-out titles are
## kept for reference (NOTE(review): presumably handled in earlier runs — confirm).
## Some commented titles appear more than once.
paper_names = [
    # "One Wide Feedforward is All You Need",
    # "ModelScope-Agent: Building Your Customizable Agent System with Open-source Large Language Models",
    # "Efficient RLHF: Reducing the Memory Usage of PPO",
    # "Large Content And Behavior Models To Understand, Simulate, And Optimize Content And Behavior",
    # "Point-Bind & Point-LLM: Aligning Point Cloud with Multi-modality for 3D Understanding, Generation, and Instruction Following",
    # "Can Programming Languages Boost Each Other via Instruction Tuning?",
    # "The Belebele Benchmark: a Parallel Reading Comprehension Dataset in 122 Language Variants",
    # "BioCoder: A Benchmark for Bioinformatics Code Generation with Contextual Pragmatic Knowledge",
    # "LM-Infinite: Simple On-the-Fly Length Generalization for Large Language Models",
    # "LLaSM: Large Language and Speech Model",
    # "Jais and Jais-chat: Arabic-Centric Foundation and Instruction-Tuned Open Generative Large Language Models",
    # "MedAlign: A Clinician-Generated Dataset for Instruction Following with Electronic Medical Records",
    # "OmniQuant: Omnidirectionally Calibrated Quantization for Large Language Models",
    # "SoTaNa: The Open-Source Software Development Assistant",
    # "Teach LLMs to Personalize -- An Approach inspired by Writing Education",
    # "AnomalyGPT: Detecting Industrial Anomalies using Large Vision-Language Models",
    # "FacTool: Factuality Detection in Generative AI -- A Tool Augmented Framework for Multi-Task and Multi-Domain Scenarios",
    # "Graph of Thoughts: Solving Elaborate Problems with Large Language Models",
    # "LLaSM: Large Language and Speech Model",
    # "Switch Transformers: Scaling to Trillion Parameter Models with Simple and Efficient Sparsity",
    # "GPT Can Solve Mathematical Problems Without a Calculator",
    # "Large Language Models as Optimizers",
    # "DoLa: Decoding by Contrasting Layers Improves Factuality in Large Language Models",
    # "FLM-101B: An Open LLM and How to Train It with $100K Budget",
    # "XGen-7B Technical Report",
    "Plan-and-Solve Prompting: Improving Zero-Shot Chain-of-Thought Reasoning by Large Language Models",
    "Physically Grounded Vision-Language Models for Robotic Manipulation",
    "Scaling Autoregressive Multi-Modal Models: Pretraining and Instruction Tuning",
    # "Scaling Clinical Trial Matching Using Large Language Models: A Case Study in Oncology",
    # "Gated recurrent neural networks discover attention",
    # "Topic Discovery via Latent Space Clustering of Pretrained Language Model Representations",
    # "Improving Language Models with Advantage-based Offline Policy Gradients",
    # "DrugChat: Towards Enabling ChatGPT-Like Capabilities on Drug Molecule Graphs",
    # "Large-Scale Automatic Audiobook Creation",
    # "Mobile V-MoEs: Scaling Down Vision Transformers via Sparse Mixture-of-Experts",
    # "From Sparse to Dense: GPT-4 Summarization with Chain of Density Prompting",
    # "GPT Can Solve Mathematical Problems Without a Calculator",
    # "When Less is More: Investigating Data Pruning for Pretraining LLMs at Scale",
    # "NExT-GPT: Any-to-Any Multimodal LLM",
    # "Optimize Weight Rounding via Signed Gradient Descent for the Quantization of LLMs",
    # "Textbooks Are All You Need II: phi-1.5 technical report",
    # "Neurons in Large Language Models: Dead, N-gram, Positional",
    # "FIAT: Fusing learning paradigms with Instruction-Accelerated Tuning",
    # "MADLAD-400: A Multilingual And Document-Level Large Audited Dataset",
    # "Hypothesis Search: Inductive Reasoning with Language Models",
    # "ANALYZING TRANSFORMER DYNAMICS AS MOVEMENT THROUGH EMBEDDING SPACE",
    # "From Sparse to Dense: GPT-4 Summarization with Chain of Density Prompting",
    # "What In-Context Learning Learns In-Context: Disentangling Task Recognition and Task Learning",
    # "A Latent Space Theory for Emergent Abilities in Large Language Models"
]

## Show how many titles are queued for this run.
print(len(paper_names))
## Titles that could not be found or processed; appended to by the processing loop.
failed_papers = []
## Titles each queued paper is compared against before processing (near-duplicates are skipped).
## NOTE(review): "XXXX" looks like a placeholder that keeps this list non-empty — confirm.
existing_papers = ["XXXX"]

3


In [6]:
## Settings shared by the processing loop below.
PAPERS_DIR = "../papers"       # one JSON file per summarised paper
DUPLICATE_TITLE_SIM = 0.9      # above this, two titles count as the same paper
MIN_TITLE_SIM = 0.7            # below this, the Arxiv hit is treated as a mismatch


def _load_arxiv_docs(paper_name):
    """ Fetch candidate Arxiv documents for a title, retrying on errors.

    Attempts the lookup with 1, then 3, then 1 result(s). Returns the list of
    loaded documents from the first attempt that succeeds; re-raises the last
    error if every attempt fails.
    """
    query = preprocess(paper_name)
    last_error = None
    for max_docs in (1, 3, 1):
        try:
            return ArxivLoader(query=query, load_max_docs=max_docs).load()
        except Exception as e:  # KeyboardInterrupt is deliberately not swallowed
            last_error = e
    raise last_error


def _parse_llm_json(raw_summary):
    """ Extract and decode the JSON object contained in the model's reply.

    Takes everything from the first '{' to the last '}', so code fences
    (with or without a language tag) and stray text around the object are
    ignored. Raises ValueError if no object is present or it is not valid JSON.
    """
    start, end = raw_summary.find("{"), raw_summary.rfind("}")
    if start == -1 or end < start:
        raise ValueError("No JSON object found in model output.")
    return json.loads(raw_summary[start:end + 1])


def _resolve_output_path(base_name, title):
    """ Pick the result file for a paper, numbering serially on name clashes.

    Walks `<base_name>_001.json`, `_002.json`, ... and returns
    `(path, already_stored)`:
      - the first free path with `already_stored=False`, or
      - the path of an existing file that holds this paper with
        `already_stored=True`.
    An existing file whose stored "Title" cannot be read is treated as this
    paper, so nothing already on disk is ever reprocessed or overwritten.
    """
    serial = 1
    while True:
        path = f"{PAPERS_DIR}/{base_name}_{str(serial).zfill(3)}.json"
        if not os.path.exists(path):
            return path, False
        try:
            with open(path, encoding="utf-8") as f:
                stored = json.load(f)
            stored_title = stored.get("Title") if isinstance(stored, dict) else None
        except (OSError, ValueError):
            stored_title = None
        if stored_title is None or tfidf_similarity(title, stored_title) > DUPLICATE_TITLE_SIM:
            return path, True
        ## Same date and first-author token but a different paper: try the next slot.
        serial += 1


## Make sure results can be written before any (paid) model call is made.
os.makedirs(PAPERS_DIR, exist_ok=True)

with get_openai_callback() as cb:
    for paper_name in tqdm(paper_names):
        ## Skip titles that are near-duplicates of already known papers.
        ## `default` keeps this working when `existing_papers` is empty.
        pre_similarity = max(
            (tfidf_similarity(paper_name, t) for t in existing_papers),
            default=0.0,
        )
        if pre_similarity > DUPLICATE_TITLE_SIM:
            continue

        ## Load by the force.
        try:
            docs = _load_arxiv_docs(paper_name)
        except Exception as e:
            print(f"Failed on {paper_name}:")
            print(e)
            failed_papers.append(paper_name)
            continue

        if len(docs) == 0:
            print(f"Could not find {paper_name}.")
            failed_papers.append(paper_name)
            continue

        try:
            ## Keep the candidate whose title is closest to the requested one
            ## (earliest candidate wins ties).
            title_sims = [tfidf_similarity(paper_name, d.metadata["Title"]) for d in docs]
            best_idx = max(range(len(docs)), key=title_sims.__getitem__)
            if title_sims[best_idx] < MIN_TITLE_SIM:
                print(f"No similar title name found for {paper_name}.")
                failed_papers.append(paper_name)
                continue

            doc_meta = docs[best_idx].metadata
            new_title = doc_meta["Title"]
            doc_content = reformat_text(docs[best_idx].page_content)
            ## First whitespace-separated token of the author string; kept
            ## unchanged so file names stay compatible with existing results.
            first_author = doc_meta["Authors"].split(" ")[0]
            published = pd.to_datetime(doc_meta["Published"]).strftime("%Y_%m_%d")
            prev_summary = doc_meta["Summary"].replace("\n", " ")

            ## Name serially.
            base_name = f"{published}_{first_author.lower()}"
            out_path, already_stored = _resolve_output_path(base_name, new_title)
            if already_stored:
                print(f"Found locally: {paper_name}:")
                continue
            print(f"Processing: {paper_name}")

            ## Run model.
            summary = chain.run({'content': doc_content, "prev_summary": prev_summary})

            ## Extract and combine results.
            parsed_summary = _parse_llm_json(summary)
            result_dict = {**doc_meta, **parsed_summary}
            result_dict["Summary"] = prev_summary

            ## Store.
            with open(out_path, 'w', encoding="utf-8") as f:
                json.dump(result_dict, f)

        except Exception as e:
            print(f"Failed on {paper_name}:")
            print(e)
            failed_papers.append(paper_name)
            continue

    ## Report what the run consumed; this is the reason the callback is open.
    print(f"Total tokens: {cb.total_tokens} | Total cost (USD): {cb.total_cost:.4f}")

 33%|███▎      | 1/3 [00:09<00:18,  9.21s/it]

Found locally: Plan-and-Solve Prompting: Improving Zero-Shot Chain-of-Thought Reasoning by Large Language Models:
Processing: Physically Grounded Vision-Language Models for Robotic Manipulation


 67%|██████▋   | 2/3 [03:27<02:00, 120.51s/it]

Processing: Scaling Autoregressive Multi-Modal Models: Pretraining and Instruction Tuning


100%|██████████| 3/3 [06:28<00:00, 129.52s/it]
