# Description

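This notebook uses an LLM as a judge to run pairwise comparisons between the original and the modified version of each paragraph of a manuscript. Each comparison is repeated several times, and the winner and rationale of every run are saved for downstream analysis.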

# Modules

In [1]:
from IPython.display import display
import pandas as pd
from langchain.globals import set_llm_cache
from langchain.cache import SQLiteCache

from proj import conf
from proj.utils import llm_pairwise

# Settings/paths

In [2]:
# Repository identifier of the manuscript under evaluation
REPO = "pivlab/manubot-ai-editor-code-test-biochatter-manuscript"

# Model acting as judge. Alternatives that can be swapped in:
#   "openai:gpt-4-turbo-preview", "openai:gpt-4", "openai:gpt-3.5-turbo"
LLM_JUDGE = "mistral:7b-instruct-fp16"

# Number of times the whole set of pairwise comparisons is repeated
N_REPS = 10

# Model parameters: sampling temperature, token limit and the base seed
# (the seed is offset by the repetition index in the main loop)
TEMPERATURE, MAX_TOKENS, SEED_INIT = 0.5, 2000, 0

In [3]:
# The manuscript code is whatever follows the "-test-" marker in the
# repository name; it is used below to name the input/output files.
_repo_parts = REPO.split("-test-")
# Fail with a readable message instead of a bare IndexError when the
# repository name does not follow the expected "<prefix>-test-<code>" pattern.
assert len(_repo_parts) >= 2, f"REPO does not contain '-test-': {REPO!r}"
MANUSCRIPT_CODE = _repo_parts[1]
display(MANUSCRIPT_CODE)

'biochatter-manuscript'

In [4]:
# Input pickle for this manuscript, read from the "paragraph_match" results
# folder; the notebook cannot proceed if it has not been generated yet.
INPUT_FILE = conf.common.RESULTS_DIR.joinpath(
    "paragraph_match", MANUSCRIPT_CODE
).with_suffix(".pkl")
assert INPUT_FILE.exists()
display(INPUT_FILE)

PosixPath('/home/miltondp/projects/others/manubot/manubot-ai-editor-code/base/results/paragraph_match/biochatter-manuscript.pkl')

In [5]:
# Output pickle for this manuscript, written to the "llm_pairwise" results
# folder (created here if it does not exist yet).
OUTPUT_FILE = conf.common.RESULTS_DIR.joinpath(
    "llm_pairwise", MANUSCRIPT_CODE
).with_suffix(".pkl")
OUTPUT_FILE.parent.mkdir(parents=True, exist_ok=True)
display(OUTPUT_FILE)

PosixPath('/home/miltondp/projects/others/manubot/manubot-ai-editor-code/base/results/llm_pairwise/biochatter-manuscript.pkl')

In [6]:
# Folder holding the LangChain SQLite cache files (one per repetition)
BASE_LANGCHAIN_CACHE_DIR = conf.common.RESULTS_DIR.joinpath("llm_cache")
BASE_LANGCHAIN_CACHE_DIR.mkdir(parents=True, exist_ok=True)
display(BASE_LANGCHAIN_CACHE_DIR)

PosixPath('/home/miltondp/projects/others/manubot/manubot-ai-editor-code/base/results/llm_cache')

# Load paragraphs

In [7]:
# Load the paragraphs to compare (columns: section, original, modified).
# NOTE(review): pickle files can execute arbitrary code on load; only read
# files produced by this project.
df = pd.read_pickle(INPUT_FILE)

In [8]:
# (number of paragraph pairs, number of columns)
df.shape

(37, 3)

In [9]:
# Preview the first paragraph pairs
df.head()

Unnamed: 0,section,original,modified
0,abstract,Current-generation Large Language Models (LLMs...,Large Language Models (LLMs) have generated si...
1,introduction,"Despite technological advances, understanding ...","Despite technological advances, understanding ..."
2,introduction,Large Language Models (LLMs) of the current ge...,The latest generation of Large Language Models...
3,introduction,Computational biomedicine involves many tasks ...,Computational biomedicine encompasses various ...
4,results,The framework is designed to be modular: any o...,"The framework is designed to be modular, allow..."


In [10]:
# Original text of the first paragraph
df.iloc[0]["original"]

'Current-generation Large Language Models (LLMs) have stirred enormous interest in recent months, yielding great potential for accessibility and automation, while simultaneously posing significant challenges and risk of misuse. To facilitate interfacing with LLMs in the biomedical space, while at the same time safeguarding their functionalities through sensible constraints, we propose a dedicated, open-source framework: BioChatter. Based on open-source software packages, we synergise the many functionalities that are currently developing around LLMs, such as knowledge integration / retrieval-augmented generation, model chaining, and benchmarking, resulting in an easy-to-use and inclusive framework for application in many use cases of biomedicine. We focus on robust and user-friendly implementation, including ways to deploy privacy-preserving local open-source LLMs. We demonstrate use cases via two multi-purpose web apps ([https://chat.biocypher.org](https://chat.biocypher.org)), and pr

In [11]:
# Modified text of the first paragraph
df.iloc[0]["modified"]

'Large Language Models (LLMs) have generated significant interest due to their potential for accessibility and automation in various fields, including biomedicine. However, they also present challenges and risks of misuse. In this paper, we address the need for a framework to interface with LLMs in the biomedical domain while ensuring their safe and effective use. To meet this need, we introduce BioChatter, an open-source framework that integrates various functionalities of LLMs, such as knowledge integration, retrieval-augmented generation, model chaining, and benchmarking. By leveraging open-source software packages, we have developed a user-friendly and versatile platform that can be applied across a range of biomedicine use cases. Our focus is on implementing robust and privacy-preserving local open-source LLMs. We showcase the utility of BioChatter through two multi-purpose web apps available at [https://chat.biocypher.org](https://chat.biocypher.org) and provide comprehensive doc

# Test run

In [12]:
# Smoke test: judge the first paragraph pair once with the base seed, in
# verbose mode so the formatted prompt is printed.
t_json = llm_pairwise(
    # positional arguments, in order: original text, modified text, section
    *df.iloc[0][["original", "modified", "section"]],
    model_name=LLM_JUDGE,
    model_params=dict(
        temperature=TEMPERATURE,
        max_tokens=MAX_TOKENS,
        model_kwargs=dict(seed=SEED_INIT),
    ),
    verbose=True,
)



[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3mSystem: You are an expert copyeditor with ample experience in scientific writing. You are assessing the quality of two versions of the Abstract of a scientific article.
Human: Read the following paragraph and write a list with your comments on the following areas: 1) has a clear sentence structure, 2) is easy to follow, 3) is correct in grammar, 4) has no spelling errors, 5) potential areas of improvement.

Paragraph A: Current-generation Large Language Models (LLMs) have stirred enormous interest in recent months, yielding great potential for accessibility and automation, while simultaneously posing significant challenges and risk of misuse. To facilitate interfacing with LLMs in the biomedical space, while at the same time safeguarding their functionalities through sensible constraints, we propose a dedicated, open-source framework: BioChatter. Based on open-source software packages, we synergise the man

In [13]:
# Inspect the judge's answer for the test run ("best" and "rationale" keys)
t_json

{'best': 'Paragraph 1',
 'rationale': "Paragraph 1 has a clear sentence structure, is easy to follow, is correct in grammar, and has no spelling errors. It also provides more specific details about BioChatter's functionality and how it differs from other LLM frameworks in the biomedical space. Additionally, it showcases the utility of BioChatter through two web apps available at <https://chat.biocypher.org> and provides comprehensive documentation, support, and a vibrant open community."}

In [14]:
# Confirm the answer is already parsed into a Python object
type(t_json)

dict

# Run

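Since models are stochastic, we run the pairwise comparison `N_REPS` times, using a different seed in each repetition.

We use a separate cache for each repetition to avoid hitting an external API again for requests that were already answered.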

In [15]:
# Accumulator: one record (dict) per repetition and paragraph
results = []

In [16]:
# Start from an empty accumulator so that re-running this cell (e.g. after an
# interruption) never appends duplicate rows to `results`. Requests that were
# already answered are served from the LangChain cache, so a re-run is cheap.
results = []

for rep_idx in range(N_REPS):
    # we cache prompt/results by repetition: each repetition has its own
    # SQLite database file
    output_cache_file = BASE_LANGCHAIN_CACHE_DIR / f"rep{rep_idx}.db"
    set_llm_cache(SQLiteCache(database_path=str(output_cache_file)))

    # progress line: "<rep> (<cache file>): " followed by one dot per paragraph
    print(f"{rep_idx:02d} ({output_cache_file.name}): ", end="", flush=True)

    for par_idx, par in df.iterrows():
        print(".", end="", flush=True)

        # the seed changes with the repetition so that repetitions differ
        res = llm_pairwise(
            par["original"],
            par["modified"],
            par["section"],
            model_name=LLM_JUDGE,
            model_params={
                "temperature": TEMPERATURE,
                "max_tokens": MAX_TOKENS,
                "model_kwargs": {
                    "seed": SEED_INIT + rep_idx,
                },
            },
            verbose=False,
        )

        # one record per (repetition, paragraph) with the judge's verdict
        results.append(
            {
                "rep_index": rep_idx,
                "paragraph_index": par_idx,
                "paragraph_section": par["section"],
                "winner": res["best"],
                "rationale": res["rationale"],
            }
        )

    # end the progress line of this repetition
    print(flush=True)

00 (rep0.db): .

Exception ignored in: <bound method IPythonKernel._clean_thread_parent_frames of <ipykernel.ipkernel.IPythonKernel object at 0x7093f9e65ad0>>
Traceback (most recent call last):
  File "/home/miltondp/software/miniconda3/envs/manubot-ai-editor-code/lib/python3.11/site-packages/ipykernel/ipkernel.py", line 770, in _clean_thread_parent_frames
    def _clean_thread_parent_frames(

KeyboardInterrupt: 

KeyboardInterrupt



# Process results

In [None]:
# Score (kept as a string, converted to float later) assigned to each label
# expected in the judge's "best" field.
winner_matchings = dict(
    [
        ("Paragraph A", "-1"),  # Original
        ("Paragraph 1", "1"),  # Modified
        ("tie", "0"),
    ]
)

In [None]:
# One row per (repetition, paragraph) comparison
df_results = pd.DataFrame(results)

# Validate the judge's labels before scoring: a label missing from
# `winner_matchings` would otherwise fail with an opaque float() conversion
# error, or be silently accepted if it happens to parse as a number.
_unknown_winners = (
    df_results.loc[~df_results["winner"].isin(list(winner_matchings)), "winner"]
    .unique()
    .tolist()
)
if _unknown_winners:
    raise ValueError(
        f"Unexpected winner labels returned by the judge: {_unknown_winners}"
    )

# -1.0 = original preferred, 1.0 = modified preferred, 0.0 = tie
df_results["winner_score"] = df_results["winner"].map(winner_matchings).astype(float)

In [None]:
# (number of comparisons, number of columns)
df_results.shape

In [None]:
# Preview the first comparison records
df_results.head()

In [None]:
# Check column types (winner_score should be float)
df_results.dtypes

In [None]:
# Mean score per manuscript section; per `winner_matchings`, values above 0
# mean the modified paragraphs won more often, below 0 the original ones.
df_results.groupby("paragraph_section")["winner_score"].mean()

# Save

In [None]:
# Persist all comparison records (including scores) for downstream analysis
df_results.to_pickle(OUTPUT_FILE)