# Notebook for running literature search and validation 

In [1]:
import os
import pandas as pd
import json 
from Bio import Entrez
import openai
from utils.reference_checker import get_references_for_paragraphs
import pickle

In [2]:
# Run mode. Values checked elsewhere in this notebook:
#   "initial", "rerun", "test" -> the 'References' column is reset before the search
#   "additional"               -> read the previously saved output and only fill
#                                 rows whose 'References' entry is still empty
runVersion = "additional"

In [3]:
# When False, the marked paragraphs and the final table are written to disk
# once the reference-search loop has finished.
runOnlyExamples = bool(0)

In [4]:
# Which data set to process; the path-selection cell handles "Omics" and "GO_sets".
dataType = 'Omics'

In [5]:
# Choose the input table, the output table and the identifier column for the
# selected data set.
if dataType == "Omics":
    LLM_analysisFilePath = 'data/omics_LLM_Enrichr_simVals_DF.tsv'
    toSaveFilePath = 'data/omics_LLM_Enrichr_simVals_refs_DF.tsv'
    nameCol = 'GeneSetName'

elif dataType == "GO_sets":
    LLM_analysisFilePath = 'data/GO_term_analysis/simrank_LLM_processed_selected_1000_go_terms.tsv'
    toSaveFilePath = 'data/GO_term_analysis/simrank_LLM_processed_selected_1000_go_terms_refs.tsv'
    # NOTE: jsonFilePath is reassigned unconditionally by a later cell, so this
    # value only matters if that cell is skipped.
    jsonFilePath = 'reference_checking_task1.json'
    examplesTORun = ["GO:0019433"]
    nameCol = 'GO'

else:
    # Fail fast: merely printing here left the path variables undefined, so the
    # notebook crashed several cells later with an unrelated-looking NameError.
    raise NotImplementedError(f"Not implemented for dataType: {dataType!r}")
    

In [6]:
# An "additional" run continues from the previously saved results, so the
# output file of the earlier run becomes the input of this one.
if runVersion == "additional":
    LLM_analysisFilePath = toSaveFilePath

In [7]:
# Display the path of the table that will be read as input for this run.
LLM_analysisFilePath

'data/omics_LLM_Enrichr_simVals_refs_DF.tsv'

In [8]:
# Configuration file for this run; it is parsed with json.load in the next cell.
jsonFilePath = "jsonFiles/reference_checking_revision.json"

In [9]:
# Load the run configuration. Only the 'EMAIL' entry is read here; the whole
# config dict is also handed to get_references_for_paragraphs later on.
with open(jsonFilePath) as json_file:
    config = json.load(json_file)

email = config['EMAIL']  # presumably the contact address for NCBI Entrez - confirm

# The OpenAI key is taken from the environment (never hard-coded); a missing
# OPENAI_API_KEY raises KeyError here. It was previously assigned twice in this
# cell; a single assignment is sufficient.
openai.api_key = os.environ["OPENAI_API_KEY"]

# Name of the JSON file that stores each paragraph with its keywords and
# references. This is the default; change it to your own file name
# (no need to add '.json').
savejsonPath = 'paragraph_ref_data_revision_240109'

In [10]:
# Shared accumulator passed to get_references_for_paragraphs on every call and
# written to disk after the loop.
MarkedParagraphs = list()

In [11]:
## Read in the LLM analysis file
# Only the literal text 'NaN' is parsed as missing (the default NA markers are
# disabled); cells holding the string 'None' are then replaced with None.
df = (
    pd.read_csv(
        LLM_analysisFilePath,
        sep='\t',
        keep_default_na=False,
        na_values=['NaN'],
    )
    .replace({'None': None})
)

In [12]:
# Start from an empty 'References' column for every mode that redoes the search.
if runVersion in ("initial", "rerun", "test"):
    df['References'] = None

In [13]:
# Look up supporting references for each row's LLM analysis, store them in the
# 'References' column, and checkpoint the table to disk along the way.
#
# Rows that already hold a result are skipped:
#   runVersion == "initial"    -> skip when 'References' is not None
#   runVersion == "additional" -> skip when 'References' is anything but ''
#
# No per-row example filter is applied; runOnlyExamples only decides whether the
# final outputs are written after the loop.
#
# NOTE(review): to_csv writes the index as an extra unnamed column, and an
# "additional" run reads this same file back without index_col - confirm whether
# the accumulating 'Unnamed: 0' columns are wanted before changing the format.
for i, row in df.iterrows():
    # `row` is the row labelled `i`; using it avoids mixing the index label
    # with the positional df.iloc lookup.
    existing_refs = row['References']
    if runVersion == "initial" and existing_refs is not None:
        continue
    if runVersion == "additional" and existing_refs != '':
        continue  # skip this row because already done

    print('=========================================')
    print(['dataframe row', i])

    # Split the LLM analysis into lines and keep only those with more than
    # five whitespace-separated words.
    example_analysis = row['LLM Analysis']
    paragraphs = [p for p in example_analysis.split("\n") if len(p.split()) > 5]

    try:
        references = get_references_for_paragraphs(
            paragraphs,
            email=email,
            config=config,
            n=3,
            verbose=True,
            MarkedParagraphs=MarkedParagraphs,
            saveto=savejsonPath,
        )
    except Exception as err:
        # Best effort: report the failure (this message used to be printed on
        # success instead) and leave the row empty so an "additional" run
        # retries it. KeyboardInterrupt is deliberately not caught, so the long
        # loop can still be stopped by hand.
        print(['Cannot get references for row', i, repr(err)])
        references = ''

    # str.replace returns a new string; the result must be assigned for the
    # newlines to actually be removed.
    references = references.replace('\n', '')

    df.loc[i, 'References'] = references
    if i % 5 == 0:
        # Periodic checkpoint so an interrupted run loses at most a few rows.
        df.to_csv(toSaveFilePath, sep='\t')

if not runOnlyExamples:
    if MarkedParagraphs:
        with open('data/marked_paragraph_reference.json', 'w') as fp:
            json.dump(MarkedParagraphs, fp)
    df.to_csv(toSaveFilePath, sep='\t')

['dataframe row', 26]
Extracting keywords from paragraph
Paragraph:
1. ZMPSTE24 is a metalloproteinase involved in the post-translational processing of lamin A, a component of the nuclear lamina. Mutations in ZMPSTE24 can lead to diseases affecting connective tissue, including the basement membrane of the glomerulus.
197
Query:
 
I have a paragraph
Paragraph:
1. ZMPSTE24 is a metalloproteinase involved in the post-translational processing of lamin A, a component of the nuclear lamina. Mutations in ZMPSTE24 can lead to diseases affecting connective tissue, including the basement membrane of the glomerulus.

I would like to search PubMed to find supporting evidence for the statements in this paragraph. Give me a list of gene symbols from the paragraph. Please only include genes. Return the genes as a comma separated list without spacing, if there are no genes in the statements, please return "Unknown" 
Result:
ZMPSTE24,lamin A
307
Query:

I would like to search PubMed to find supporting 

In [14]:
# save MarkedParagraphs
# Binary mode is required by pickle; the file is closed when the block exits.
with open('data/MarkedParagraphs.pickle', 'wb') as pickle_file:
    pickle.dump(MarkedParagraphs, pickle_file)

In [15]:
# Write the table, including the 'References' column, to the output TSV.
df.to_csv(toSaveFilePath, sep='\t')