In [98]:
from datasets import Dataset
from transformers import CodeLlamaTokenizerFast
from tqdm.notebook import tqdm
from dotenv import load_dotenv
import pandas as pd
import json
import os

# Load environment variables from a .env file, if one is found, so the
# os.getenv() lookups for ARXIV_PATH / SUMMARY_PATH further down can resolve.
load_dotenv()

True

In [38]:
## Tokenizer checkpoint; the tokenizer is used further down to count tokens.
TOKENIZER_CHECKPOINT = "codellama/CodeLlama-13b-hf"
tokenizer = CodeLlamaTokenizerFast.from_pretrained(TOKENIZER_CHECKPOINT)

In [17]:
## Input paths.
arxiv_path = os.getenv("ARXIV_PATH")
summaries_path = os.getenv("SUMMARY_PATH")

# os.listdir(None) silently lists the current working directory, so an unset
# variable would otherwise produce a bogus file list instead of an error.
if not arxiv_path or not summaries_path:
    raise RuntimeError(
        "ARXIV_PATH and SUMMARY_PATH must both be set (e.g. in the .env file)."
    )

## Load.
# Keep only files carrying the expected extension and strip exactly that
# suffix; stray files (e.g. ".DS_Store") are ignored rather than passed through.
arxiv_fnames = [
    fname[: -len(".txt")]
    for fname in os.listdir(arxiv_path)
    if fname.endswith(".txt")
]
summaries_fnames = [
    fname[: -len(".json")]
    for fname in os.listdir(summaries_path)
    if fname.endswith(".json")
]

## Combine.
# Only papers that have both a full text and a summary are usable.
all_fnames = set(arxiv_fnames).intersection(summaries_fnames)
print(len(all_fnames))

472


In [33]:
## Fields of each summary JSON that are kept in the target output, in order.
keep_keys = [
    "main_contribution",
    "takeaways",
    "category",
    "novelty_analysis",
    "novelty_score",
    "technical_analysis",
    "technical_score",
    "enjoyable_analysis",
    "enjoyable_score",
]

In [84]:
## Create list of input-output pairs.
# Character budget per paper: 25000 * 3 characters (the original comments
# assume roughly 3 characters per token).
MAX_INPUT_CHARS = 25000 * 3

input_output_pairs = []
token_lengths = []
string_lengths = []

# Iterate in sorted order so the resulting dataset is reproducible; plain set
# iteration order can differ from one kernel session to the next.
for fname in tqdm(sorted(all_fnames)):
    ## Loaders.
    # Context managers close the handles promptly; explicit UTF-8 avoids
    # depending on the platform's default encoding.
    with open(f"{arxiv_path}/{fname}.txt", "r", encoding="utf-8") as paper_file:
        input_text = paper_file.read()[:MAX_INPUT_CHARS]
    with open(f"{summaries_path}/{fname}.json", "r", encoding="utf-8") as summary_file:
        output_json = json.load(summary_file)

    # "Summary" is kept separately; only the keep_keys fields form the target output.
    summary_text = output_json["Summary"]
    output_json = {k: output_json[k] for k in keep_keys}
    output_text = json.dumps(output_json, indent=4)

    ## Token counts (measured after truncation).
    input_tokens = len(tokenizer.tokenize(input_text))
    output_tokens = len(tokenizer.tokenize(output_text))

    ## Store.
    token_lengths.append((input_tokens, output_tokens))
    string_lengths.append((len(input_text), len(output_text)))
    input_output_pairs.append((input_text, output_text, summary_text))

  0%|          | 0/472 [00:00<?, ?it/s]

In [89]:
## Tabulate the (input, output, summary) tuples, one row per paper.
df = pd.DataFrame(
    data=input_output_pairs,
    columns=["input", "output", "summary"],
)
df.head()

Unnamed: 0,input,output,summary
0,Preprint.\nALPAGASUS\n:\nTRAINING A BETTER ALP...,"{\n ""main_contribution"": {\n ""headli...",Large language models~(LLMs) obtain instructio...
1,"October 10, 2022\nMIND’S EYE: GROUNDED LANGUAG...","{\n ""main_contribution"": {\n ""headli...",Successful and effective communication between...
2,Exploring the Intersection of Large Language M...,"{\n ""main_contribution"": ""{'headline': 'Exp...",The final frontier for simulation is the accur...
3,\n1 \n \n \nMachine Psychology: Investigating...,"{\n ""main_contribution"": {\n ""headli...",Large language models (LLMs) are currently at ...
4,Interactive Fashion Content Generation Using L...,"{\n ""main_contribution"": {\n ""headli...",Fashionable image generation aims to synthesiz...


In [92]:
def generate_prompt(row):
    """Format one paper's input, summary and output into a single prompt string.

    Args:
        row: Mapping-like object (e.g. a DataFrame row or dict) providing the
            keys "input" (paper text), "summary" and "output" (target JSON text).

    Returns:
        The full prompt: instructions, the paper text, its summary, four
        fenced JSON examples, and finally the target output under "YOUR TURN".
    """
    content = row["input"]
    summary = row["summary"]
    response = row["output"]
    # NOTE(review): item 6b below ends with a truncated sentence ("Most papers");
    # left unchanged because the intended ending cannot be determined from here.
    # Literal braces in the JSON examples are doubled ({{ }}) because this is an f-string.
    prompt = f"""
As an applied AI researcher specialized in the field of Large Language Models (LLMs), you are currently conducting a survey of the literature, building a catalogue of the main contributions and innovations of each paper, determining how they can be applied to build systems or create new products. This catalogue will be published by a prestigious organization and will serve as the foundation for all applied LLM knowledge going forward. Now, carefully read the following paper:

{content}

========================

SUMMARY

{summary}

Upon completion, answer the following questions:

1. What is the `main_contribution` of this paper? (1 line headline + 8-12 sentences)
    - If a new algorithm or technique is introduced, describe its workings clearly and comprehensively.
    - Do not assume that the reader knows terminology not part of the common AI/ML knowledge base.
    - Ensure that your answer provides practical insights that offer a solid understanding of the paper.
    - Detail the benefits or advantages of what has been presented, along with the practical implications for an LLM practitioner.
    - Do not include anything already discussed in the summary or abstract.

2. What is the main `takeaway`? (1 line headline + 8-12 sentences)
    - Focusing on the paper's contributions, explain how they can be used to create an interesting LLM application, improve current workflows, or increase efficiency when working with LLMs.
    - If different models were evaluated and their performance recorded, please note this and its practical implications (in detailed manner, i.e.: which model is best for what).
    - Be very precise, practical and specific as possible. Eliminate any irrelevant content from the paper's applied perspective.
    - If possible, provide a minimal code example or at least sketch the application.

3. Which category best describes this paper's primary focus? Choose one from the following options, with "OTHER" being the least desirable choice.
    a. "TRAINING": Discussions on LLM training methods, technical stack improvements, alternative training routines, etc.
    b. "FINE-TUNING": Discussions on fine-tuning, re-training, and specialization of LLMs.
    c. "ARCHITECTURES": Discussions on new LLM architectures, neural network components, etc., excluding prompting or computational systems to manage LLMs.
    d. "PROMPTING": Discussions on prompting methods, agent architectures, etc.
    e. "USE CASES": Discussions on LLM use in specific tasks, such as summarization, question answering, stock prediction, etc.
    f. "BEHAVIOR": Discussions on LLM behavior, including probing, interpretability, risks, biases, emerging abilities, etc.
    g. "OTHER": None of the above.

4. On a scale from 1 to 3, how novel is this paper? (1: not novel, 2: incrementally novel, 3: very novel)
    - Compare the paper's findings and contributions with what is presented in previous and related work. How unique and significant are the findings?
    - Be strict and rigorous; few papers should receive a high score.
    - Pay close attention to the comparison with prior work and the degree of difference in the author's contributions.

5. On a scale from 1 to 3, how technical is this paper? (1: not technical, 2: somewhat technical, 3: very technical)
    a) A very technical paper is difficult for a non-expert to understand, requires considerable technical knowledge, is filled with equations and jargon, and demands advanced mathematical knowledge.
    b) A somewhat technical paper may be challenging for a layman but can be understood reasonably well by someone with a computer science background. These papers, while not overly complex, explain processes in great detail and are practical and applicable (can be replicated).
    c) A non-technical paper is understandable for anyone with a college degree. These papers often discuss generalities, and the takeaways are more conceptual than technical.

6. On a scale from 1 to 3, how enjoyable is this paper? (1: hard to read, 2: ok, 3: a delight)
    a) A very enjoyable paper is well-written, organized, presents a novel and intriguing contribution, and is easy to read.
    b) An 'ok' paper is primarily plain and unexciting but is easy to read and contains some interesting parts. Most papers
    c) A non-enjoyable paper is difficult to read, poorly written, and lacks meaningful, practical, and insightful content.

When assigning numerical ratings consider these guidelines:
- Rating 3/3: Only about 20% of papers reach this standard.
- Rating 2/3: Most papers (50%) fall into this category.
- Rating 1/3: Around 30% of papers belong to this category.

Do not repeat the same comments across different answers.

Use the JSON format as in the following examples to respond.

EXAMPLE 1
==========
```
{{
    "main_contribution": {{
        "headline": "Chain-of-Thought (CoT) boosts LLM accuracy in financial sentiment analysis",
    "description": "The paper introduces the Chain-of-Thought (CoT) prompting technique for Large Language Models (LLMs) specifically targeting financial sentiment analysis. The core of CoT lies in its deviation from direct predictions. Instead, it guides the model to build a sequence of interconnected thoughts leading to an accurate sentiment score. In a comparative study, LLMs equipped with CoT achieved a 94% accuracy, surpassing the established FinBERT's 88% and the naive prompting model's 81%."
    }},
    "takeaways": {{
        "headline": "CoT opens new, efficient avenues for LLMs in financial analysis",
        "description": "Using the CoT prompting technique, LLMs can achieve enhanced accuracy in financial news sentiment analysis, ultimately refining stock market predictions. This method not only improves prediction accuracy but also renders the model's thought process transparent. When pitted against FinBERT, the LLM with CoT demonstrated superior performance, signaling its potential dominance in financial analysis tasks.",
        "example": "When processing a news snippet like 'Company X has strong Q3 earnings', an LLM with CoT could generate: 'Strong Q3 earnings -> Likely effective management -> Expected investor trust growth -> Potential bullish market -> Possible stock price ascent.' This layered output simplifies decision-making for market analysts."
    }},
    "category": "USE CASES",
    "novelty_analysis": "The paper extends the boundaries of current research by applying LLMs to financial news sentiment analysis. The introduction of the CoT prompting technique, tailored specifically for this application, represents an incremental advancement in the field.",
    "novelty_score": 2,
    "technical_analysis": "While the paper discusses a computational framework for managing LLM inputs and outputs, it does not delve into complex mathematical theories or algorithms, making it accessible to a wider audience.",
    "technical_score": 1,
    "enjoyable_analysis": "The engaging narrative style, coupled with practical insights, makes the paper an enjoyable read. It balances technical details with easily digestible information and an interesting practical application.",
    "enjoyable_score": 3
}}
```

EXAMPLE 2
==========
```
{{
    "main_contribution": {{
        "headline": "Zero-shot Prompting Technique for GPT-4 Code Interpreter",
        "description": "This paper proposes a zero-shot prompting technique for GPT-4 Code Interpreter that explicitly encourages the use of code for self-verification, which further boosts performance on math reasoning problems. They report a positive correlation between the better performance of GPT4-Code and the higher Code Usage Frequency. Initial experiments show that GPT4-Code achieved a zero-shot accuracy of 69.7% on the MATH dataset which is an improvement of 27.5% over GPT-4’s performance (42.2%)."
    }},
    "takeaways": {{
        "headline": "Leveraging Self-verification and Code Execution in LLMs",
        "description": "Self-verification is already a powerful approach to enhance the performance of LLMs on many tasks but this approach leverages the evaluation of code execution which could make it interesting to solve other kinds of problems. This work highlights the importance of code understanding and generation capabilities in LLMs.",
        "example": "Some of the ideas presented in this paper (specifically, the code-based self-verification and verification-guided weighted majority voting technique) can lead to building high-quality datasets that could potentially help improve the mathematical capabilities in open-source LLMs like Llama 2."
    }},
    "category": "PROMPTING",
    "novelty_analysis": "The research innovatively combines LLMs with code-based self-verification, achieving a 20% boost over state-of-the-art coding task accuracies. This method's practicality is evident, with tests showing a 30% reduction in coding errors, redefining efficiency in LLM-driven code generation.",
    "novelty_score": 3,
    "technical_analysis": "The paper delve into advanced algorithms, such as the Hypothetical Code-Integration Algorithm (HCIA), making it a dense read for those unfamiliar with theoretical computer science. While the introduction of a novel concept is enlightening, the paper's reliance on complex algorithms, logical proofs and symbolic reasoning makes it a technically advanced read.",
    "technical_score": 3,
    "enjoyable_analysis": "For those deeply engrossed in the LLM landscape, this paper promises an engaging journey. While its technical nuances can be challenging, the clearly presented transformative results, such as the significant performance leap in the MATH dataset, ensure a gripping narrative.",
    "enjoyable_score": 2
}}
```

EXAMPLE 3
==========
```
{{
    "main_contribution": {{
        "headline": "LLMManager: LLM-Driven Database Maintenance Knowledge Acquisition",
        "description": "LLMManager leverages a retriever system paired with a LLM to extract database maintenance knowledge from diverse textual sources. It incorporates a hybrid mechanism that combines transformer-based models with traditional relational database algorithms. The framework's ability to parse vast amounts of text and convert them into actionable database maintenance tasks has led to notable metrics: a 47% increase in real-time database issue detection and a 32% improvement in automated problem resolution compared to existing SotA systems."
    }},
    "takeaways": {{
        "headline": "Leveraging 'Tree of Thought' Reasoning for Enhanced Maintenance",
        "description": "LLMManager integration of the 'tree of thought' reasoning not only enhances root cause analysis but also creates a dynamic learning environment. Over time, LLMManager ability to revert to prior steps during anomalies becomes more refined, ensuring adaptive and evolving responses to complex database issues. Furthermore, its modular design allows for seamless integration with other LLMs, magnifying the collaborative aspect of the framework.",
        "example": "Automating database maintenance with D-Bot can lead to significant reductions in downtime and costs. Developers could design LLM systems that proactively address issues even before they escalate, unlocking more efficient and streamlined database operations."
    }},
    "category": "USE CASES",
    "novelty_analysis": "D-Bot's utilization of the 'tree of thought' reasoning in database maintenance is novel, although a targeted application inspired by similar work on other engineering areas.",
    "novelty_score": 2,
    "technical_analysis": "The paper delves into Entity-Relationship Diagrams and database management algorithms essential to LLMManagers's operations. However, it manages to remain accessible, avoiding overly complex jargon and ensuring a broader audience comprehension.",
    "technical_score": 2,
    "enjoyable_analysis": "The work provides a balanced blend of technical details and real-world applications, giving insights into LLMManager's functions and potential impacts.",
    "enjoyable_score": 2
}}
```

EXAMPLE 4
==========
```
{{
    "main_contribution": {{
        "headline": "Performance Analysis of LLMs in Entity Recognition",
        "description": "The paper undertakes a systematic comparison of four Large Language Models (LLMs) - GPT-4, Claude, GPT-3.5, and Prodisol-001 - with a focus on entity recognition. Each model was subjected to a consistent dataset, and their entity extraction capabilities were assessed based on precision, recall, and F1 score. Results highlighted that GPT-4 outperformed the other models, with Claude closely following, and GPT-3.5 and Prodisol-001 trailing behind. This comparative study offers insights into the current capabilities of prominent LLMs in the domain of entity recognition."
    }},
    "takeaways": {{
        "headline": "Entity Recognition Capabilities Vary Across LLMs",
        "description": "The paper underscores variations in the performance of different LLMs when tasked with entity recognition. The presented findings provide a benchmark for professionals and researchers aiming to choose an LLM for entity recognition tasks. The nuanced comparison suggests that while GPT-4 exhibits top-tier performance in this domain, other models like Claude also present strong capabilities.",
        "example": "When parsing a complex news article about the merger between two tech giants, it becomes crucial to accurately recognize and categorize entities such as company names, CEOs, financial figures, and locations. An LLM with superior entity recognition, in such a context, aids in extracting critical data points efficiently, enabling a more thorough analysis of the situation."
    }},
    "category": "USE CASES",
    "novelty_analysis": "The study contributes to existing literature by offering a contemporary comparison of the latest LLMs in entity recognition. While the task itself isn't novel, the inclusion of GPT-4 and Claude in the comparison introduces an incremental advancement to the current body of research.",
    "novelty_score": 2,
    "technical_analysis": "The paper balances technical depth with accessibility, providing a detailed outline of evaluation metrics and methodologies. This ensures insights are communicated comprehensively, catering to both technical and non-technical readers.",
    "technical_score": 2,
    "enjoyable_analysis": "Through its well-structured approach and clear visualizations, the paper facilitates an engaging read. The methodical presentation of results aids in drawing comparisons and understanding the landscape of LLMs in entity recognition.",
    "enjoyable_score": 2
}}
```

YOUR TURN
==========

{response}
"""
    return prompt

In [100]:
def gen_dataset():
    """Yield one {"text": prompt} record for every row of the global `df`."""
    rows = (row for _, row in df.iterrows())
    yield from ({"text": generate_prompt(row)} for row in rows)


## Materialize the generator as a Dataset and persist it to disk.
ds = Dataset.from_generator(gen_dataset)
ds.save_to_disk("data/arxiv_summary_prompts")

Generating train split: 0 examples [00:00, ? examples/s]