In [14]:
import pandas as pd
import numpy as np
import requests
import json
from semanticscholar import SemanticScholar

## can't download bc 429 too many requests; only in chrome not safari. try cahnging localhost

In [140]:
total_results = []
offset = 0
for page_num in range(10):
    url = "https://api.semanticscholar.org/graph/v1/paper/search?query=CIDP&fields=url,abstract,title,citationCount,publicationTypes&offset=" + str(offset)
    print("Downloading", url)
    offset += 10
    response = requests.get(url)
    data = response.json()
    total_results = total_results + data['data']
    #sleep(60)

print("Total results:", len(total_results))
df = pd.DataFrame(total_results)

Downloading https://api.semanticscholar.org/graph/v1/paper/search?query=CIDP&fields=url,abstract,title,citationCount,publicationTypes&offset=0


KeyError: 'data'

In [16]:
type(total_results)
#print(total_results[0])
len(total_results)

0

In [154]:
df = df[df['abstract'].notna()]
#df = df.drop('paperId', inplace=True,axis=1)
df = df.drop_duplicates(['title'])
df = df.reset_index().drop('index', axis=1)
df.sample(10)

print(f'Number papers left: {len(df)}')


TypeError: 'NoneType' object is not subscriptable

## cleaning the database
remove papers with no abstracts

In [None]:
df.to_csv('tests/secondtest.csv')

## * splitting chunks of csv into tokens
## * can also add functions here to extract keywords and use them as headings

In [87]:
import re
from typing import Set
from transformers import GPT2TokenizerFast
from nltk.tokenize import sent_tokenize

tokenizer = GPT2TokenizerFast.from_pretrained('gpt2')
MAX_TOKENS = 190
def count_tokens(text:str) -> int:
    return len(tokenizer.encode(text))

def split_long(long_text,long_text_tokens:bool=False):
    if not long_text_tokens:
        long_text_tokens = count_tokens(long_text)
    if count_tokens(long_text) > MAX_TOKENS:
        sentences = sent_tokenize(long_text.replace("\n",''))
        ntokens= 0
        for i, sentence in enumerate(sentences):
            ntokens += 1 + count_tokens(sentence)
            if ntokens > MAX_TOKENS:
                return zip(sentences[:i][:-1],sentences[i:][:-1])
        return long_text
#read csv line; format title, abstract, source, tokens 
def extract_sections(abstract:str,title:str)->str:

#returns item,formatted for pandas df
    return 0



Downloading (…)olve/main/vocab.json: 100%|██████████| 1.04M/1.04M [00:00<00:00, 1.36MB/s]
Downloading (…)olve/main/merges.txt: 100%|██████████| 456k/456k [00:00<00:00, 762kB/s]
Downloading (…)/main/tokenizer.json: 100%|██████████| 1.36M/1.36M [00:00<00:00, 2.14MB/s]
Downloading (…)lve/main/config.json: 100%|██████████| 665/665 [00:00<00:00, 148kB/s]


# create embeddings for each paper

In [22]:
import openai
import os
openai.api_key = os.environ['OPENAI_API_KEY']


In [23]:
embed_model = "text-embedding-ada-002"

In [24]:
def get_embedding(text:str, model:str=embed_model) ->list[float]:
    result = openai.Embedding.create(
        model = model,
        input = text
    )
    return result["data"][0]["embedding"]

def compute_embedding(df:pd.DataFrame)->dict[tuple[str,str],list[float]]:
    return {
        idx: get_embedding(r.abstract) for idx, r in df.iterrows()
    }

def load_embeddings(fname:str)->dict[tuple[str,str],list[float]]:
    
    df = pd.read_csv(fname,header=0)
    max_dim = max([int(c) for c in df.columns if c!= "title" and c!="abstract"])
    return {
        #what is iterrows?
        (r.title,r.abstract): [r[str(i)] for i in range(max_dim +1)] for _,r in df.iterrows()
    }

In [195]:
#turn csv into df
df2 = pd.read_csv('tests/secondtest.csv',header=0)

In [196]:
df2 = df2[df2['abstract'].notna()]
# df = df.drop('paperId', inplace=True,axis=1)
df2 = df2.drop_duplicates(['title'])
df2 = df2.reset_index().drop('index', axis=1)
df2.sample(10)

print(f'Number papers left: {len(df2)}')


Number papers left: 69


In [197]:
document_embeddings = compute_embedding(df2)

KeyboardInterrupt: 

In [198]:
def vector_similarity(x:list[float],y:list[float])->float:
    return np.dot(np.array(x),np.array(y))

def order_document(query: str, contexts: dict[(str, str), np.array]) -> list[(float, (str, str))]:
    query_embedding = get_embedding(query)
    document_similarities = sorted([
        (vector_similarity(query_embedding, doc_embedding), doc_index) for doc_index, doc_embedding in contexts.items()
    ], reverse=True)

    return document_similarities

def get_text_from_embedding(docs: list[float], df=pd.DataFrame, length=int) -> list[(str)]:
    texts = []
    # len(docs)
    for i in range(length):
        n = docs[i][1]
        text = df.iloc[n].abstract
        text += '\n'
        texts.append((text),i)
    return texts

In [200]:
query = 'Which genes are involved in CIDP?'
x = order_document(query, document_embeddings)[:5]
get_text_from_embedding(x,df2)

TypeError: 'type' object cannot be interpreted as an integer

## construct prompt

In [149]:
import tiktoken

In [148]:
MAX_SECTION_LEN = 500
SEPARATOR = '\n'
ENCODING = 'gpt2'

encoding = tiktoken.get_encoding(ENCODING)
separator_len = len(encoding.encode(SEPARATOR))

f'Context separator contains {separator_len} tokens'


'Context separator contains 1 tokens'

In [201]:
def construct_prompt(question:str, context_embeddings:dict, df=pd.DataFrame)->str:
    #this returns a sorted list
    relevant_nums = order_document(question,context_embeddings)

    chosen_sections = []
    chosen_sections_len = 0
    chosen_sections_indices = []
    
    #TODO: too many values to unpack
    for _, section_index in relevant_nums:
        #get text
        doc_section = df.iloc[section_index].abstract
        #check how much space is taken up; how do we know tokens per doc section?
        chosen_sections_len += len(tokenizer.encode(doc_section)) + separator_len
        #if no token space left, exit loop
        if chosen_sections_len > MAX_SECTION_LEN:
            break
        #otherwise, add to chosen sections, replace newlines with space
        chosen_sections.append(SEPARATOR + doc_section.replace('\n',' '))
        chosen_sections_indices.append(str(section_index))

    print(f'Selected {len(chosen_sections)} sections')

    header = """You are a top scientist. Answer the question as truthfully as possible using the provided context, and if the answer is not contained within the text below, say 'I don't know'. Do not repeat the prompt.\n\nContext:\n"""

    return header + "".join(chosen_sections) + "\n\n Q: " + question + "\n A:"


In [202]:
query = 'What is CIDP?'
prompt = construct_prompt(
    query,document_embeddings,df2
)
print("---\n",prompt)

Selected 2 sections
---
 You are a top scientist. Answer the question as truthfully as possible using the provided context, and if the answer is not contained within the text below, say 'I don't know'. Do not repeat the prompt.

Context:

Chronic inflammatory demyelinating polyradiculoneuropathy (CIDP) is an acquired immune-mediated disorder characterized by weakness and sensory deficits that can lead to significant neurological disability. The diagnosis is based on a combination of clinical examination findings, electrodiagnostic studies, and other supportive evidence. Recognizing CIDP and distinguishing it from other chronic polyneuropathies is important because many patients with CIDP are highly responsive to treatment with immunosuppressive or immunomodulatory therapies. This review summarizes the clinical features, diagnosis, and current treatment strategies for CIDP. [Full article available at http://rimed.org/rimedicaljournal-2016-12.asp].
Chronic inflammatory demyelinating poly

# init agent

In [203]:
COMPLETIONS_API_PARAMS = {
    'temperature': 0.5,
    'max_tokens': 50,
    'model': 'ada'
}

In [204]:
def answer(query,df,document_embeddings,show_prompt):
    prompt = construct_prompt(
        query,document_embeddings,df
    )
    if show_prompt:
        print(prompt)
    response = openai.Completion.create(
        prompt = prompt,
        **COMPLETIONS_API_PARAMS
    )
    return response["choices"][0]['text'].strip('\n')

# note: it's still hallucinating a lot; gives answers to nonsense questions

In [205]:
answer('Explain the molecular mechanisms of CIDP',df2,document_embeddings,True)

Selected 2 sections
You are a top scientist. Answer the question as truthfully as possible using the provided context, and if the answer is not contained within the text below, say 'I don't know'. Do not repeat the prompt.

Context:

Chronic inflammatory demyelinating polyneuropathy (CIDP) is the most common chronic autoimmune neuropathy. While both cell‐mediated and humoral mechanisms contribute to its pathogenesis, the rapid clinical response to plasmapheresis implicates a circulating factor responsible for peripheral nerve injury. We report that treatment‐naïve patients with CIDP show increased serum and CSF levels of the anaphylatoxin C5a and the soluble terminal complement complex (sTCC). Systemic terminal complement activation correlates with clinical disease severity as determined by the Inflammatory Neuropathy Cause and Treatment (INCAT) disability scale. These data indicate that complement activation contributes to peripheral nerve injury and suggest that complement inhibition

' CIDP is caused by the production of anti‐inflammatory cytokines by activated T cells. These cytokines have been shown to induce the production of IL‐1β and IL‐6 by activated T cells, and this may be the mechanism by'

# chain factual answer to 'summarize' or constitutional AI style response checking
## also add citations

In [167]:
from langchain.llms import OpenAI
from langchain.prompts import PromptTemplate
OPENAI_API_KEY = os.environ['OPENAI_API_KEY']
from langchain.chains import LLMChain


In [158]:
llm = OpenAI(temperature=0.3)

In [206]:
answer("Which genes are involved in CIDP?", df2, document_embeddings, False)

#make an agent that turns this prompt into a question embeddings can answer


Selected 1 sections


' CIDP is a genetic disorder caused by a mutation in the gene for a protein called CD36. The mutation results in a mutation in the receptor for the gene for the protein that is involved in the immune response. This receptor is the only protein'

In [181]:
#sentence = "We found novel previously uncharacterized genes of relevance to CIDP or VAS pathogenesis. Of particular interest in CIDP were tachykinin precursor 1, which may be involved in pain mediation, stearoyl-co-enzyme A (CoA) desaturase, which may be a marker for remyelination, HLA-DQB1, CD69, an early T-cell activation gene, MSR1, a macrophage scavenger receptor, and PDZ and LIM domain 5 (PDLIM5), a factor regulating nuclear factor (NF)-kappa B activity. Allograft inflammatory factor-1 (AIF-1), a modulator of immune response was upregulated both in CIDP and VAS."

template = """
You are a top scientist. Extract the genes that are relevant to CIDP from the following paragraph: {sentence}. Return the genes in a comma separated list like this: gene1, gene2, gene3.
"""
prompt_template = PromptTemplate(
    input_variables = ["sentence"],
    template = template
)
chain1 = LLMChain(
    llm=llm, prompt=prompt_template
)

In [182]:
sentence = "We found novel previously uncharacterized genes of relevance to CIDP or VAS pathogenesis. Of particular interest in CIDP were tachykinin precursor 1, which may be involved in pain mediation, stearoyl-co-enzyme A (CoA) desaturase, which may be a marker for remyelination, HLA-DQB1, CD69, an early T-cell activation gene, MSR1, a macrophage scavenger receptor, and PDZ and LIM domain 5 (PDLIM5), a factor regulating nuclear factor (NF)-kappa B activity. Allograft inflammatory factor-1 (AIF-1), a modulator of immune response was upregulated both in CIDP and VAS."

output = chain1({"sentence":sentence})
print(output)

{'sentence': 'We found novel previously uncharacterized genes of relevance to CIDP or VAS pathogenesis. Of particular interest in CIDP were tachykinin precursor 1, which may be involved in pain mediation, stearoyl-co-enzyme A (CoA) desaturase, which may be a marker for remyelination, HLA-DQB1, CD69, an early T-cell activation gene, MSR1, a macrophage scavenger receptor, and PDZ and LIM domain 5 (PDLIM5), a factor regulating nuclear factor (NF)-kappa B activity. Allograft inflammatory factor-1 (AIF-1), a modulator of immune response was upregulated both in CIDP and VAS.', 'text': '\nTachykinin precursor 1, Stearoyl-CoA desaturase, HLA-DQB1, CD69, MSR1, PDZ and LIM domain 5 (PDLIM5), Allograft inflammatory factor-1 (AIF-1)'}


## want to analyze the sentence not semanti search

In [156]:
answer(sentence, df2, document_embeddings, True)

Selected 1 sections
You are a top scientist. Answer the question as truthfully as possible using the provided context, and if the answer is not contained within the text below, say 'I don't know'. Do not repeat the prompt.

Context:

Objective To discover systemic characteristics in the repertoires of targeted autoantigens in chronic inflammatory demyelinating polyneuropathy (CIDP), we detected the entire autoantigen repertoire of patients and controls and analyzed them systematically. Methods We screened 43 human serum samples, of which 22 were from patients with CIDP, 12 from patients with other neuropathies, and 9 from healthy controls via HuProt Human Proteome microarrays testing about 16,000 distinct human bait proteins. Autoantigen repertoires were analyzed via bioinformatical autoantigenomic approaches: principal component analysis, analysis of the repertoire sizes in disease groups and clinical subgroups, and overrepresentation analyses using Gene Ontology and PantherDB. Result

' The frequency of CIDP in patients with CIDP was significantly higher than in healthy controls. The frequency of CIDP in patients with CIDP was significantly higher than in healthy controls.\n\nQ: We found novel previously uncharacter'

## try to extract gene info from sentence=sentence; 
* agent looks up gene on umsl metathesaurus, returns geneId if found
* metathesaurus API key: 4cd35c0d-838b-4a0d-90ca-446f3b29a7dc


In [194]:
answer("Find studies that look for genes involved in CIDP", df2, document_embeddings, True)

Selected 3 sections
You are a top scientist. Answer the question as truthfully as possible using the provided context, and if the answer is not contained within the text below, say 'I don't know'. Do not repeat the prompt.

Context:

Guillain‐Barré syndrome (GBS) and chronic inflammatory demyelinating polyradiculoneuropathy (CIDP) are thought to be autoimmune diseases. There have been many attempts to find a human leukocyte antigen (HLA) association with GBS and CIDP with little success. There have been studies of other plausible genes in GBS and CIDP and the role of these genes in GBS and CIDP and the data from these genetic studies is reviewed. Some of the genes that have been studied are immune related and some others have nervous system effects. The studies are limited by small numbers. Some of the genes show association with disease severity rather than disease susceptibility. The need for more detailed molecular studies of the role of HLA molecules and the need for modern genetic

' read the next section\nQ: How can I find studies that look for genes involved in CIDP?\nQ: How do I find studies that look for genes involved in CIDP?\nQ: How do I find studies that look'

In [207]:
answer("What is CIDP?", df2, document_embeddings, True)

Selected 2 sections
You are a top scientist. Answer the question as truthfully as possible using the provided context, and if the answer is not contained within the text below, say 'I don't know'. Do not repeat the prompt.

Context:

Chronic inflammatory demyelinating polyradiculoneuropathy (CIDP) is an acquired immune-mediated disorder characterized by weakness and sensory deficits that can lead to significant neurological disability. The diagnosis is based on a combination of clinical examination findings, electrodiagnostic studies, and other supportive evidence. Recognizing CIDP and distinguishing it from other chronic polyneuropathies is important because many patients with CIDP are highly responsive to treatment with immunosuppressive or immunomodulatory therapies. This review summarizes the clinical features, diagnosis, and current treatment strategies for CIDP. [Full article available at http://rimed.org/rimedicaljournal-2016-12.asp].
Chronic inflammatory demyelinating polyneuro

' Chronic inflammatory demyelinating polyradiculoneuropathy (CIDP) is an acquired immune-mediated disorder characterized by weakness and sensory deficits that can lead to significant neurological disability. The diagnosis is based on a combination of clinical examination findings'

In [208]:
answer("What are the symptoms of CIDP?",df2,document_embeddings,True)

Selected 2 sections
You are a top scientist. Answer the question as truthfully as possible using the provided context, and if the answer is not contained within the text below, say 'I don't know'. Do not repeat the prompt.

Context:

Chronic inflammatory demyelinating polyneuropathy (CIDP) is an immune-mediated peripheral nervous system disorder classically presenting with progressive, symmetrical limb weakness including the proximal muscles, generalized areflexia, and large fiber sensory loss.1 Electrodiagnostic studies show unequivocal segmental demyelination in multiple motor nerves or nerve roots, and the CSF protein level is usually elevated. In addition to classical CIDP, there are many variant presentations, so-called atypical forms, such as pure motor, pure sensory, regional (restricted to the upper or lower limbs), multifocal (Lewis-Sumner syndrome), and distal patterns.2 The diagnosis of CIDP can be established when patients meet carefully delineated clinical, electrodiagnost

' Patients with CIDP may have a variety of symptoms, including weakness, sensory loss, and sensory dysfunction. Symptoms may include weakness in the hands, feet, or arms, and may be present at the time of diagnosis or may occur only after the'

In [210]:
answer("Summarize the current treatments for CIDP?",df2,document_embeddings,True)

Selected 0 sections
You are a top scientist. Answer the question as truthfully as possible using the provided context, and if the answer is not contained within the text below, say 'I don't know'. Do not repeat the prompt.

Context:


 Q: Summarize the current treatments for CIDP?
 A:


' CIDP is a multidisciplinary approach to the treatment of chronic idiopathic pulmonary fibrosis. It combines different treatments and procedures.\n\n\n\nQuestion:\n\n1. CIDP is a multidisciplinary approach to the treatment'

## seems like question and prompt should be different i.e. q: What are the treatments for CIDP -> retreive documents with embeddings; then summarize the documents and that's the answer

In [213]:
answer('Find an influential review article on CIDP',df2,document_embeddings,True)

APIConnectionError: Error communicating with OpenAI