In [6]:
from datasets import Dataset
from transformers import CodeLlamaTokenizerFast
from tqdm.notebook import tqdm
from dotenv import load_dotenv
import pandas as pd
import json
import os
import re

## Load variables from a .env file. Presumably this supplies the ARXIV_PATH and
## SUMMARY_PATH values that are read with os.getenv further down -- TODO confirm.
load_dotenv()

True

In [7]:
## Tokenizer checkpoint; the tokenizer is used below to count tokens.
tokenizer_checkpoint = "codellama/CodeLlama-13b-hf"
tokenizer = CodeLlamaTokenizerFast.from_pretrained(tokenizer_checkpoint)

In [8]:
## Input paths.
def _require_env_dir(env_var):
    """Return the directory path stored in environment variable `env_var`.

    Raises:
        RuntimeError: if the variable is unset or empty.
    """
    path = os.getenv(env_var)
    # os.listdir(None) falls back to the current working directory, so an
    # unset variable would silently build the dataset from the wrong folder.
    if not path:
        raise RuntimeError(f"Environment variable {env_var} is not set.")
    return path


def _stems_with_extension(directory, extension):
    """List entries of `directory` ending in `extension`, with it stripped.

    Only a trailing extension is removed (str.replace would also delete the
    substring from the middle of a name), and entries with any other
    extension are skipped.
    """
    return [
        fname[: -len(extension)]
        for fname in os.listdir(directory)
        if fname.endswith(extension)
    ]


arxiv_path = _require_env_dir("ARXIV_PATH")
summaries_path = _require_env_dir("SUMMARY_PATH")

## Load.
arxiv_fnames = _stems_with_extension(arxiv_path, ".txt")
summaries_fnames = _stems_with_extension(summaries_path, ".json")

## Combine.
## Sorted so the row order downstream is the same on every kernel restart
## (plain set iteration order for strings varies between interpreter runs).
all_fnames = sorted(set(arxiv_fnames).intersection(summaries_fnames))
print(len(all_fnames))

472


In [9]:
## Keys copied from each summary JSON into the output text.
keep_keys = [
    "main_contribution",
    "takeaways",
    "category",
    "novelty_analysis",
    "novelty_score",
    "technical_analysis",
    "technical_score",
    "enjoyable_analysis",
    "enjoyable_score",
]

In [10]:
## Create list of input-output pairs.
input_output_pairs = []   # (input_text, output_text, summary_text) per paper
token_lengths = []        # (input_tokens, output_tokens) per paper
string_lengths = []       # (len(input_text), len(output_text)) per paper

## Character cap applied to every paper before tokenizing.
## NOTE(review): 25000 * 3 looks like a ~3-characters-per-token budget for
## 25k tokens -- confirm the intended limit.
MAX_INPUT_CHARS = 25000 * 3

## Patterns are loop-invariant, so compile them once.
blank_lines_pattern = re.compile(r'\n{3,}')
## NOTE(review): this drops everything from the first line that starts with
## "references"/"bibliography" in any letter case, which could also match
## ordinary body text that happens to begin a line with those words.
biblio_pattern = re.compile(r'(\nReferences.{0,5}|\nBibliography.{0,5})\s*[\s\S]*', re.IGNORECASE)

for fname in tqdm(all_fnames):
    ## Loaders.
    ## Context managers close each file promptly instead of leaking handles.
    with open(f"{arxiv_path}/{fname}.txt", "r") as paper_file:
        input_text = paper_file.read()
    input_text = blank_lines_pattern.sub('\n\n', input_text)
    input_text = biblio_pattern.sub('', input_text)
    input_text = input_text[:MAX_INPUT_CHARS]

    with open(f"{summaries_path}/{fname}.json", "r") as summary_file:
        output_json = json.load(summary_file)
    summary_text = output_json["Summary"]
    output_json = {k: output_json[k] for k in keep_keys}
    output_text = json.dumps(output_json, indent=4)

    ## Token counts (recorded only; no token limit is enforced here).
    input_tokens = len(tokenizer.tokenize(input_text))
    output_tokens = len(tokenizer.tokenize(output_text))

    ## Store.
    token_lengths.append((input_tokens, output_tokens))
    string_lengths.append((len(input_text), len(output_text)))
    input_output_pairs.append((input_text, output_text, summary_text))

  0%|          | 0/472 [00:00<?, ?it/s]

In [11]:
## Collect the (input, output, summary) tuples into a DataFrame.
df = pd.DataFrame(
    data=input_output_pairs,
    columns=["input", "output", "summary"],
)
df.head()

Unnamed: 0,input,output,summary
0,LLM-Rec: Personalized Recommendation via\nProm...,"{\n ""main_contribution"": {\n ""headli...",We investigate various prompting strategies fo...
1,Causal-Discovery Performance of ChatGPT in the...,"{\n ""main_contribution"": {\n ""headli...",ChatGPT has demonstrated exceptional proficien...
2,KoLA: Carefully Benchmarking World Knowledge\n...,"{\n ""main_contribution"": {\n ""headli...",The unprecedented performance of large languag...
3,Connecting Neural Response measurements &\nCom...,"{\n ""main_contribution"": {\n ""headli...",Understanding the neural basis of language com...
4,Code Prompting: a Neural Symbolic Method for\n...,"{\n ""main_contribution"": {\n ""headli...",Large language models (LLMs) have scaled up to...


In [12]:
def generate_prompt(row):
    """Render one training prompt from a row.

    Args:
        row: mapping-like object with "input", "summary" and "output" keys.

    Returns:
        A string with the summary and whitepaper under INPUT, a fixed JSON
        template under INSTRUCTIONS, and the row's output under RESPONSE,
        terminated by an END marker.
    """
    # Plain template for str.format: doubled braces are literal braces.
    template = """
### INPUT
========================
#### Summary:
{summary}

#### Whitepaper:
{content}


### INSTRUCTIONS
========================
Use the following JSON template to respond.

{{
    "main_contribution": {{
        "headline": "<<main_headline>>",
        "description": "<<main_description>>"
    }},
    "takeaways": {{
        "headline": "<<takeaways_headline>>",
        "description": "<<takeaways_description>>",
        "example": "<<takeaways_example>>"
    }},
    "category": "<<category>>",
    "novelty_analysis": "<<novelty_analysis_text>>",
    "novelty_score": <<novelty_score_number>>,
    "technical_analysis": "<<technical_analysis_text>>",
    "technical_score": <<technical_score_number>>,
    "enjoyable_analysis": "<<enjoyable_analysis_text>>",
    "enjoyable_score": <<enjoyable_score_number>>
}}


### RESPONSE
========================
{response}


### END
"""
    return template.format(
        content=row["input"],
        summary=row["summary"],
        response=row["output"],
    )

In [13]:
def gen_dataset():
    """Yield one {"text": <prompt>} record for every row of `df`."""
    for _index, paper_row in df.iterrows():
        record = {"text": generate_prompt(paper_row)}
        yield record


## Materialise the generator as a Dataset and write it to disk.
ds = Dataset.from_generator(gen_dataset)
ds.save_to_disk("data/arxiv_summary_prompts")

Generating train split: 0 examples [00:00, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/472 [00:00<?, ? examples/s]

In [14]:
## Render the prompt for every row and attach it as a new column.
prompt_column = df.apply(generate_prompt, axis=1)
df["prompt"] = prompt_column
df.head()

Unnamed: 0,input,output,summary,prompt
0,LLM-Rec: Personalized Recommendation via\nProm...,"{\n ""main_contribution"": {\n ""headli...",We investigate various prompting strategies fo...,\n### INPUT\n========================\n#### Su...
1,Causal-Discovery Performance of ChatGPT in the...,"{\n ""main_contribution"": {\n ""headli...",ChatGPT has demonstrated exceptional proficien...,\n### INPUT\n========================\n#### Su...
2,KoLA: Carefully Benchmarking World Knowledge\n...,"{\n ""main_contribution"": {\n ""headli...",The unprecedented performance of large languag...,\n### INPUT\n========================\n#### Su...
3,Connecting Neural Response measurements &\nCom...,"{\n ""main_contribution"": {\n ""headli...",Understanding the neural basis of language com...,\n### INPUT\n========================\n#### Su...
4,Code Prompting: a Neural Symbolic Method for\n...,"{\n ""main_contribution"": {\n ""headli...",Large language models (LLMs) have scaled up to...,\n### INPUT\n========================\n#### Su...


In [15]:
## Inspect one rendered prompt (row position 20).
print(df["prompt"].iloc[20])


### INPUT
#### Summary:
Recent artificial intelligence (AI) systems have reached milestones in "grand challenges" ranging from Go to protein-folding. The capability to retrieve medical knowledge, reason over it, and answer medical questions comparably to physicians has long been viewed as one such grand challenge.   Large language models (LLMs) have catalyzed significant progress in medical question answering; Med-PaLM was the first model to exceed a "passing" score in US Medical Licensing Examination (USMLE) style questions with a score of 67.2% on the MedQA dataset. However, this and other prior work suggested significant room for improvement, especially when models' answers were compared to clinicians' answers. Here we present Med-PaLM 2, which bridges these gaps by leveraging a combination of base LLM improvements (PaLM 2), medical domain finetuning, and prompting strategies including a novel ensemble refinement approach.   Med-PaLM 2 scored up to 86.5% on the MedQA dataset, impro