In [None]:
from datasets import Dataset
from transformers import CodeLlamaTokenizerFast
from tqdm.notebook import tqdm
from dotenv import load_dotenv
import pandas as pd
import json
import os
import re

# Read a local .env file (python-dotenv) so that ARXIV_PATH and SUMMARY_PATH
# are available to the os.getenv lookups further down.
load_dotenv()

In [None]:
## Tokenizer used later in the notebook to count input / output tokens.
tokenizer_checkpoint = "codellama/CodeLlama-13b-hf"
tokenizer = CodeLlamaTokenizerFast.from_pretrained(tokenizer_checkpoint)

In [None]:
## Input paths.
arxiv_path = os.getenv("ARXIV_PATH")
summaries_path = os.getenv("SUMMARY_PATH")

# Fail fast on a missing setting: os.getenv returns None for an unset variable,
# and os.listdir(None) would quietly list the current working directory instead.
if not arxiv_path or not summaries_path:
    raise EnvironmentError(
        "ARXIV_PATH and SUMMARY_PATH must both be set (e.g. in the .env file)."
    )

## Load.
# Keep only files that really carry the expected extension and strip exactly
# that suffix. str.replace would also delete ".txt" / ".json" appearing in the
# middle of a name and would let unrelated directory entries through.
arxiv_fnames = [
    fname[: -len(".txt")]
    for fname in os.listdir(arxiv_path)
    if fname.endswith(".txt")
]
summaries_fnames = [
    fname[: -len(".json")]
    for fname in os.listdir(summaries_path)
    if fname.endswith(".json")
]

## Combine.
# Papers that have both a raw text file and a summary file.
all_fnames = set(arxiv_fnames).intersection(summaries_fnames)
print(len(all_fnames))

In [None]:
## Keys copied from each summary JSON into the target output, in this order:
## three free-form fields, then an (analysis, score) pair per rated aspect.
keep_keys = ["main_contribution", "takeaways", "category"] + [
    f"{aspect}_{suffix}"
    for aspect in ("novelty", "technical", "enjoyable")
    for suffix in ("analysis", "score")
]

In [None]:
## Create list of input-output pairs.
input_output_pairs = []
token_lengths = []
string_lengths = []

## Character cap applied to each paper after cleanup (value unchanged: 25000 * 3).
## NOTE(review): presumably ~25k tokens at ~3 characters per token -- confirm.
max_input_chars = 25000 * 3

## Compiled once instead of on every iteration. Removes everything from the
## first line that starts with "References" / "Bibliography" (case-insensitive)
## through the end of the text.
## NOTE(review): any body line beginning with those words also triggers the cut.
biblio_pattern = re.compile(r'(\nReferences.{0,5}|\nBibliography.{0,5})\s*[\s\S]*', re.IGNORECASE)

## Sorted so the examples come out in the same order on every run; iterating
## a set of strings directly is not order-stable across interpreter sessions.
for fname in tqdm(sorted(all_fnames)):
    ## Loaders.
    # Context managers close the handles promptly; the encoding is explicit so
    # the result does not depend on the platform's default locale.
    with open(f"{arxiv_path}/{fname}.txt", "r", encoding="utf-8") as paper_file:
        input_text = paper_file.read()
    # Collapse runs of three or more newlines to a single blank line.
    input_text = re.sub(r'\n{3,}', '\n\n', input_text)
    input_text = biblio_pattern.sub('', input_text)
    input_text = input_text[:max_input_chars]

    with open(f"{summaries_path}/{fname}.json", "r", encoding="utf-8") as summary_file:
        output_json = json.load(summary_file)
    # "Summary" is not one of keep_keys, so grab it before the dict is narrowed.
    summary_text = output_json["Summary"]
    output_json = {k: output_json[k] for k in keep_keys}
    output_text = json.dumps(output_json, indent=4)

    ## Token count and limit.
    input_tokens = len(tokenizer.tokenize(input_text))
    output_tokens = len(tokenizer.tokenize(output_text))

    ## Store.
    token_lengths.append((input_tokens, output_tokens))
    string_lengths.append((len(input_text), len(output_text)))
    input_output_pairs.append((input_text, output_text, summary_text))

In [None]:
## Tabulate the collected (input, output, summary) tuples, one row per paper.
pair_columns = ["input", "output", "summary"]
df = pd.DataFrame(input_output_pairs, columns=pair_columns)
df.head()

In [None]:
def generate_prompt(row):
    """Render one example as a single prompt string.

    Args:
        row: Any object supporting ``row[key]`` for the keys ``"input"``
            (paper text), ``"summary"`` and ``"output"`` (target answer).

    Returns:
        The prompt text: an INPUT section with the summary and the paper, a
        fixed INSTRUCTIONS section, and a RESPONSE section holding the output.
    """
    paper_text, paper_summary, target_answer = row["input"], row["summary"], row["output"]
    # NOTE(review): option 6b below ends with a dangling "Most papers" fragment.
    # It is part of the emitted prompt text, so it is reproduced unchanged here.
    # Doubled braces in the JSON example are f-string escapes for literal { and }.
    return f"""
### INPUT
========================
#### Summary:
{paper_summary}

#### Whitepaper:
{paper_text}


### INSTRUCTIONS
========================
Based on the whitepaper presented above, answer the following questions:

1. What is the `main_contribution` of this paper? (1 line headline + 8-12 sentences)
    - If a new algorithm or technique is introduced, describe its workings clearly and comprehensively.
    - Do not assume that the reader knows terminology not part of the common AI/ML knowledge base.
    - Ensure that your answer provides practical insights that offer a solid understanding of the paper.
    - Detail the benefits or advantages of what has been presented, along with the practical implications for an LLM practitioner.
    - Do not include anything already discussed in the summary or abstract.

2. What is the main `takeaway`? (1 line headline + 8-12 sentences)
    - Focusing on the paper's contributions, explain how they can be used to create an interesting LLM application, improve current workflows, or increase efficiency when working with LLMs.
    - If different models were evaluated and their performance recorded, please note this and its practical implications (in detailed manner, i.e.: which model is best for what).
    - Be very precise, practical and specific as possible. Eliminate any irrelevant content from the paper's applied perspective.
    - If possible, provide a minimal code example or at least sketch the application.

3. Which category best describes this paper's primary focus? Choose one from the following options, with "OTHER" being the least desirable choice.
    a. "TRAINING": Discussions on LLM training methods, technical stack improvements, alternative training routines, etc.
    b. "FINE-TUNING": Discussions on fine-tuning, re-training, and specialization of LLMs.
    c. "ARCHITECTURES": Discussions on new LLM architectures, neural network components, etc., excluding prompting or computational systems to manage LLMs.
    d. "PROMPTING": Discussions on prompting methods, agent architectures, etc.
    e. "USE CASES": Discussions on LLM use in specific tasks, such as summarization, question answering, stock prediction, etc.
    f. "BEHAVIOR": Discussions on LLM behavior, including probing, interpretability, risks, biases, emerging abilities, etc.
    g. "OTHER": None of the above.

4. On a scale from 1 to 3, how novel is this paper? (1: not novel, 2: incrementally novel, 3: very novel)
    - Compare the paper's findings and contributions with what is presented in previous and related work. How unique and significant are the findings?
    - Be strict and rigorous; few papers should receive a high score.
    - Pay close attention to the comparison with prior work and the degree of difference in the author's contributions.

5. On a scale from 1 to 3, how technical is this paper? (1: not technical, 2: somewhat technical, 3: very technical)
    a) A very technical paper is difficult for a non-expert to understand, requires considerable technical knowledge, is filled with equations and jargon, and demands advanced mathematical knowledge.
    b) A somewhat technical paper may be challenging for a layman but can be understood reasonably well by someone with a computer science background. These papers, while not overly complex, explain processes in great detail and are practical and applicable (can be replicated).
    c) A non-technical paper is understandable for anyone with a college degree. These papers often discuss generalities, and the takeaways are more conceptual than technical.

6. On a scale from 1 to 3, how enjoyable is this paper? (1: hard to read, 2: ok, 3: a delight)
    a) A very enjoyable paper is well-written, organized, presents a novel and intriguing contribution, and is easy to read.
    b) An 'ok' paper is primarily plain and unexciting but is easy to read and contains some interesting parts. Most papers
    c) A non-enjoyable paper is difficult to read, poorly written, and lacks meaningful, practical, and insightful content.

When assigning numerical ratings consider these guidelines:
- Rating 3/3: Only about 20% of papers reach this standard.
- Rating 2/3: Most papers (50%) fall into this category.
- Rating 1/3: Around 30% of papers belong to this category.

Do not repeat the same comments across different answers.

Use the JSON format as in the following examples to respond.

{{
    "main_contribution": {{
        "headline": "<<main_headline>>",
        "description": "<<main_description>>"
    }},
    "takeaways": {{
        "headline": "<<takeaways_headline>>",
        "description": "<<takeaways_description>>",
        "example": "<<takeaways_example>>"
    }},
    "category": "<<category>>",
    "novelty_analysis": "<<novelty_analysis_text>>",
    "novelty_score": <<novelty_score_number>>,
    "technical_analysis": "<<technical_analysis_text>>",
    "technical_score": <<technical_score_number>>,
    "enjoyable_analysis": "<<enjoyable_analysis_text>>",
    "enjoyable_score": <<enjoyable_score_number>>
}}


### RESPONSE
========================
{target_answer}
"""

In [None]:
def gen_dataset():
    """Yield one ``{"text": prompt}`` record per row of ``df``, in row order."""
    yield from ({"text": generate_prompt(record)} for _, record in df.iterrows())


## Build the dataset from the generator and write it to disk.
ds = Dataset.from_generator(gen_dataset)
ds.save_to_disk("data/arxiv_summary_prompts")